chiark / gitweb /
exec: introduce PrivateNetwork= process option to turn off network access to specific...
[elogind.git] / src / nspawn.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4   This file is part of systemd.
5
6   Copyright 2010 Lennart Poettering
7
8   systemd is free software; you can redistribute it and/or modify it
9   under the terms of the GNU General Public License as published by
10   the Free Software Foundation; either version 2 of the License, or
11   (at your option) any later version.
12
13   systemd is distributed in the hope that it will be useful, but
14   WITHOUT ANY WARRANTY; without even the implied warranty of
15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16   General Public License for more details.
17
18   You should have received a copy of the GNU General Public License
19   along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <signal.h>
23 #include <sched.h>
24 #include <unistd.h>
25 #include <sys/types.h>
26 #include <sys/syscall.h>
27 #include <sys/mount.h>
28 #include <sys/wait.h>
29 #include <stdlib.h>
30 #include <string.h>
31 #include <stdio.h>
32 #include <errno.h>
33 #include <sys/prctl.h>
34 #include <sys/capability.h>
35 #include <getopt.h>
36 #include <sys/epoll.h>
37 #include <termios.h>
38 #include <sys/signalfd.h>
39 #include <grp.h>
40
41 #include "log.h"
42 #include "util.h"
43 #include "missing.h"
44 #include "cgroup-util.h"
45 #include "sd-daemon.h"
46 #include "strv.h"
47 #include "loopback-setup.h"
48
/* Container root directory, set by -D/--directory in parse_argv(); main()
 * falls back to the current working directory when it is still NULL. */
static char *arg_directory = NULL;
/* User name or uid given with -u/--user; while NULL, main() skips the
 * credential switch and exports USER/LOGNAME as "root". */
static char *arg_user = NULL;
/* Set by --private-network; main() then adds CLONE_NEWNET to the clone flags. */
static bool arg_private_network = false;
52
/* Writes the command line synopsis and option summary to stdout.
 * Always returns 0. */
static int help(void) {
        const char *name = program_invocation_short_name;

        /* Synopsis line first, then the fixed option table. */
        printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n", name);

        fputs("Spawn a minimal namespace container for debugging, testing and building.\n\n"
              "  -h --help            Show this help\n"
              "  -D --directory=NAME  Root directory for the container\n"
              "  -u --user=USER       Run the command under specified user or uid\n"
              "     --private-network Disable network in container\n",
              stdout);

        return 0;
}
65
66 static int parse_argv(int argc, char *argv[]) {
67
68         enum {
69                 ARG_PRIVATE_NETWORK = 0x100
70         };
71
72         static const struct option options[] = {
73                 { "help",            no_argument,       NULL, 'h'                 },
74                 { "directory",       required_argument, NULL, 'D'                 },
75                 { "user",            required_argument, NULL, 'u'                 },
76                 { "private-network", no_argument,       NULL, ARG_PRIVATE_NETWORK },
77                 { NULL,              0,                 NULL, 0                   }
78         };
79
80         int c;
81
82         assert(argc >= 0);
83         assert(argv);
84
85         while ((c = getopt_long(argc, argv, "+hD:u:", options, NULL)) >= 0) {
86
87                 switch (c) {
88
89                 case 'h':
90                         help();
91                         return 0;
92
93                 case 'D':
94                         free(arg_directory);
95                         if (!(arg_directory = strdup(optarg))) {
96                                 log_error("Failed to duplicate root directory.");
97                                 return -ENOMEM;
98                         }
99
100                         break;
101
102                 case 'u':
103                         free(arg_user);
104                         if (!(arg_user = strdup(optarg))) {
105                                 log_error("Failed to duplicate user name.");
106                                 return -ENOMEM;
107                         }
108
109                         break;
110
111                 case ARG_PRIVATE_NETWORK:
112                         arg_private_network = true;
113                         break;
114
115                 case '?':
116                         return -EINVAL;
117
118                 default:
119                         log_error("Unknown option code %c", c);
120                         return -EINVAL;
121                 }
122         }
123
124         return 1;
125 }
126
/* Mounts the API file systems listed in the table below underneath 'dest'
 * and then tries to bind mount the host's /etc/localtime read-only.
 *
 * Every table entry is attempted even after an earlier one failed, except
 * that an allocation failure ends the loop. Returns 0 on success, otherwise
 * the first negative errno-style error seen. */
static int mount_all(const char *dest) {

        typedef struct MountPoint {
                const char *what;
                const char *where;
                const char *type;
                const char *options;
                unsigned long flags;
                bool fatal;
        } MountPoint;

        static const MountPoint mount_table[] = {
                { "proc",      "/proc",     "proc",  NULL,       MS_NOSUID|MS_NOEXEC|MS_NODEV, true  },
                { "/proc/sys", "/proc/sys", "bind",  NULL,       MS_BIND,                      true  },   /* Bind mount first */
                { "/proc/sys", "/proc/sys", "bind",  NULL,       MS_BIND|MS_RDONLY|MS_REMOUNT, true  },   /* Then, make it r/o */
                { "/sys",      "/sys",      "bind",  NULL,       MS_BIND,                      true  },   /* Bind mount first */
                { "/sys",      "/sys",      "bind",  NULL,       MS_BIND|MS_RDONLY|MS_REMOUNT, true  },   /* Then, make it r/o */
                { "tmpfs",     "/dev",      "tmpfs", "mode=755", MS_NOSUID,                    true  },
                { "/dev/pts",  "/dev/pts",  "bind",  NULL,       MS_BIND,                      true  },
                { "tmpfs",     "/run",      "tmpfs", "mode=755", MS_NOSUID|MS_NODEV,           true  },
#ifdef HAVE_SELINUX
                { "/sys/fs/selinux", "/sys/fs/selinux", "bind", NULL, MS_BIND,                      false },  /* Bind mount first */
                { "/sys/fs/selinux", "/sys/fs/selinux", "bind", NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, false },  /* Then, make it r/o */
#endif
        };

        int first_error = 0;
        char *localtime_path;
        unsigned idx;

        for (idx = 0; idx < ELEMENTSOF(mount_table); idx++) {
                const MountPoint *entry = mount_table + idx;
                char *target;
                int probe;

                if (asprintf(&target, "%s/%s", dest, entry->where) < 0) {
                        log_error("Out of memory");

                        if (first_error == 0)
                                first_error = -ENOMEM;

                        break;
                }

                probe = path_is_mount_point(target);
                if (probe < 0) {
                        log_error("Failed to detect whether %s is a mount point: %s", target, strerror(-probe));
                        free(target);

                        if (first_error == 0)
                                first_error = probe;

                        continue;
                }

                /* Best effort: a missing directory shows up as a mount() error below. */
                mkdir_p(target, 0755);

                /* Failures of entries not marked fatal are ignored silently. */
                if (mount(entry->what, target, entry->type, entry->flags, entry->options) < 0 &&
                    entry->fatal) {

                        log_error("mount(%s) failed: %m", target);

                        if (first_error == 0)
                                first_error = -errno;
                }

                free(target);
        }

        /* Fix the timezone, if possible. Neither an allocation failure nor
         * a mount failure is reported here. */
        if (asprintf(&localtime_path, "%s/%s", dest, "/etc/localtime") >= 0) {
                mount("/etc/localtime", localtime_path, "bind", MS_BIND, NULL);
                mount("/etc/localtime", localtime_path, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);
                free(localtime_path);
        }

        return first_error;
}
206
207 static int copy_devnodes(const char *dest, const char *console) {
208
209         static const char devnodes[] =
210                 "null\0"
211                 "zero\0"
212                 "full\0"
213                 "random\0"
214                 "urandom\0"
215                 "tty\0"
216                 "ptmx\0"
217                 "kmsg\0"
218                 "rtc0\0";
219
220         const char *d;
221         int r = 0, k;
222         mode_t u;
223         struct stat st;
224         char *from = NULL, *to = NULL;
225
226         assert(dest);
227         assert(console);
228
229         u = umask(0000);
230
231         NULSTR_FOREACH(d, devnodes) {
232                 from = to = NULL;
233
234                 asprintf(&from, "/dev/%s", d);
235                 asprintf(&to, "%s/dev/%s", dest, d);
236
237                 if (!from || !to) {
238                         log_error("Failed to allocate devnode path");
239
240                         free(from);
241                         free(to);
242
243                         from = to = NULL;
244
245                         if (r == 0)
246                                 r = -ENOMEM;
247
248                         break;
249                 }
250
251                 if (stat(from, &st) < 0) {
252
253                         if (errno != ENOENT) {
254                                 log_error("Failed to stat %s: %m", from);
255                                 if (r == 0)
256                                         r = -errno;
257                         }
258
259                 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
260
261                         log_error("%s is not a char or block device, cannot copy.", from);
262                         if (r == 0)
263                                 r = -EIO;
264
265                 } else if (mknod(to, st.st_mode, st.st_rdev) < 0) {
266
267                         log_error("mknod(%s) failed: %m", dest);
268                         if (r == 0)
269                                 r = -errno;
270                 }
271
272                 free(from);
273                 free(to);
274         }
275
276         if (stat(console, &st) < 0) {
277
278                 log_error("Failed to stat %s: %m", console);
279                 if (r == 0)
280                         r = -errno;
281
282                 goto finish;
283
284         } else if (!S_ISCHR(st.st_mode)) {
285
286                 log_error("/dev/console is not a char device.");
287                 if (r == 0)
288                         r = -EIO;
289
290                 goto finish;
291         }
292
293         if (asprintf(&to, "%s/dev/console", dest) < 0) {
294
295                 log_error("Out of memory");
296                 if (r == 0)
297                         r = -ENOMEM;
298
299                  goto finish;
300         }
301
302         /* We need to bind mount the right tty to /dev/console since
303          * ptys can only exist on pts file systems. To have something
304          * to bind mount things on we create a device node first, that
305          * has the right major/minor (note that the major minor
306          * doesn't actually matter here, since we mount it over
307          * anyway). */
308
309         if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0)
310                 log_error("mknod for /dev/console failed: %m");
311
312         if (mount(console, to, "bind", MS_BIND, NULL) < 0) {
313                 log_error("bind mount for /dev/console failed: %m");
314
315                 if (r == 0)
316                         r = -errno;
317         }
318
319         free(to);
320
321         if ((k = chmod_and_chown(console, 0600, 0, 0)) < 0) {
322                 log_error("Failed to correct access mode for TTY: %s", strerror(-k));
323
324                 if (r == 0)
325                         r = k;
326         }
327
328 finish:
329         umask(u);
330
331         return r;
332 }
333
/* Removes every capability that is not on the retain list below from the
 * capability bounding set of the calling process.
 *
 * Returns 0 on success, or a negative errno value when PR_CAPBSET_DROP
 * fails with anything other than EINVAL. */
static int drop_capabilities(void) {
        static const unsigned long retain[] = {
                CAP_CHOWN,
                CAP_DAC_OVERRIDE,
                CAP_DAC_READ_SEARCH,
                CAP_FOWNER,
                CAP_FSETID,
                CAP_IPC_OWNER,
                CAP_KILL,
                CAP_LEASE,
                CAP_LINUX_IMMUTABLE,
                CAP_NET_BIND_SERVICE,
                CAP_NET_BROADCAST,
                CAP_NET_RAW,
                CAP_SETGID,
                CAP_SETFCAP,
                CAP_SETPCAP,
                CAP_SETUID,
                CAP_SYS_ADMIN,
                CAP_SYS_CHROOT,
                CAP_SYS_NICE,
                CAP_SYS_PTRACE,
                CAP_SYS_TTY_CONFIG
        };

        /* Probe at least up to 63, even if the headers know fewer capabilities. */
        const unsigned long highest = MAX(63LU, (unsigned long) CAP_LAST_CAP);
        unsigned long cap;

        for (cap = 0; cap <= highest; cap++) {
                bool keep = false;
                unsigned j;

                for (j = 0; j < ELEMENTSOF(retain) && !keep; j++)
                        keep = retain[j] == cap;

                if (keep)
                        continue;

                if (prctl(PR_CAPBSET_DROP, cap) >= 0)
                        continue;

                /* EINVAL means the capability is not known; stop
                 * probing at the first such number. */
                if (errno == EINVAL)
                        break;

                log_error("PR_CAPBSET_DROP failed: %m");
                return -errno;
        }

        return 0;
}
385
/* Checks whether 'path' looks like the root of an OS installation, using
 * the presence of <path>/bin/sh as the marker.
 *
 * Returns 1 if the marker exists, 0 if it cannot be accessed, and -ENOMEM
 * if the probe path cannot be allocated. */
static int is_os_tree(const char *path) {
        char *shell = NULL;
        int found;

        if (asprintf(&shell, "%s/bin/sh", path) < 0)
                return -ENOMEM;

        /* Any access() failure, whatever the reason, counts as "not an OS tree". */
        found = access(shell, F_OK) >= 0;
        free(shell);

        return found;
}
399
/* Size of each of the two forwarding buffers used by process_pty(). */
#define BUFFER_SIZE 1024

/* Forwards bytes between our stdin/stdout and the pty 'master' until a
 * signal arrives.
 *
 * master: file descriptor of the pty master
 * mask:   set of signals to receive through a signalfd
 *
 * SIGWINCH is handled by copying the window size of stdin to the master;
 * any other signal from 'mask' ends the loop with return value 0. On errors
 * a negative errno-style value is returned. stdin, stdout and 'master' are
 * switched to non-blocking mode and are left that way. */
static int process_pty(int master, sigset_t *mask) {

        char in_buffer[BUFFER_SIZE], out_buffer[BUFFER_SIZE];
        size_t in_buffer_full = 0, out_buffer_full = 0;
        struct epoll_event stdin_ev, stdout_ev, master_ev, signal_ev;
        bool stdin_readable = false, stdout_writable = false, master_readable = false, master_writable = false;
        int ep = -1, signal_fd = -1, r;

        fd_nonblock(STDIN_FILENO, 1);
        fd_nonblock(STDOUT_FILENO, 1);
        fd_nonblock(master, 1);

        if ((signal_fd = signalfd(-1, mask, SFD_NONBLOCK|SFD_CLOEXEC)) < 0) {
                log_error("signalfd(): %m");
                r = -errno;
                goto finish;
        }

        if ((ep = epoll_create1(EPOLL_CLOEXEC)) < 0) {
                log_error("Failed to create epoll: %m");
                r = -errno;
                goto finish;
        }

        /* The data fds are registered edge-triggered: readiness is kept in
         * the *_readable/*_writable flags and only cleared again when an
         * actual read()/write() reports that nothing more can be done. */
        zero(stdin_ev);
        stdin_ev.events = EPOLLIN|EPOLLET;
        stdin_ev.data.fd = STDIN_FILENO;

        zero(stdout_ev);
        stdout_ev.events = EPOLLOUT|EPOLLET;
        stdout_ev.data.fd = STDOUT_FILENO;

        zero(master_ev);
        master_ev.events = EPOLLIN|EPOLLOUT|EPOLLET;
        master_ev.data.fd = master;

        zero(signal_ev);
        signal_ev.events = EPOLLIN;
        signal_ev.data.fd = signal_fd;

        if (epoll_ctl(ep, EPOLL_CTL_ADD, STDIN_FILENO, &stdin_ev) < 0 ||
            epoll_ctl(ep, EPOLL_CTL_ADD, STDOUT_FILENO, &stdout_ev) < 0 ||
            epoll_ctl(ep, EPOLL_CTL_ADD, master, &master_ev) < 0 ||
            epoll_ctl(ep, EPOLL_CTL_ADD, signal_fd, &signal_ev) < 0) {
                log_error("Failed to regiser fds in epoll: %m");
                r = -errno;
                goto finish;
        }

        for (;;) {
                struct epoll_event ev[16];
                ssize_t k;
                int i, nfds;

                if ((nfds = epoll_wait(ep, ev, ELEMENTSOF(ev), -1)) < 0) {

                        if (errno == EINTR || errno == EAGAIN)
                                continue;

                        log_error("epoll_wait(): %m");
                        r = -errno;
                        goto finish;
                }

                assert(nfds >= 1);

                /* First pass: record which fds became ready and handle signals. */
                for (i = 0; i < nfds; i++) {
                        if (ev[i].data.fd == STDIN_FILENO) {

                                if (ev[i].events & (EPOLLIN|EPOLLHUP))
                                        stdin_readable = true;

                        } else if (ev[i].data.fd == STDOUT_FILENO) {

                                if (ev[i].events & (EPOLLOUT|EPOLLHUP))
                                        stdout_writable = true;

                        } else if (ev[i].data.fd == master) {

                                if (ev[i].events & (EPOLLIN|EPOLLHUP))
                                        master_readable = true;

                                if (ev[i].events & (EPOLLOUT|EPOLLHUP))
                                        master_writable = true;

                        } else if (ev[i].data.fd == signal_fd) {
                                struct signalfd_siginfo sfsi;
                                ssize_t n;

                                if ((n = read(signal_fd, &sfsi, sizeof(sfsi))) != sizeof(sfsi)) {

                                        if (n >= 0) {
                                                log_error("Failed to read from signalfd: invalid block size");
                                                r = -EIO;
                                                goto finish;
                                        }

                                        if (errno != EINTR && errno != EAGAIN) {
                                                log_error("Failed to read from signalfd: %m");
                                                r = -errno;
                                                goto finish;
                                        }
                                } else {

                                        if (sfsi.ssi_signo == SIGWINCH) {
                                                struct winsize ws;

                                                /* The window size changed, let's forward that. */
                                                if (ioctl(STDIN_FILENO, TIOCGWINSZ, &ws) >= 0)
                                                        ioctl(master, TIOCSWINSZ, &ws);
                                        } else {
                                                /* Any other signal from the mask ends the forwarding. */
                                                r = 0;
                                                goto finish;
                                        }
                                }
                        }
                }

                /* Second pass: move data for as long as any direction can make progress. */
                while ((stdin_readable && in_buffer_full <= 0) ||
                       (master_writable && in_buffer_full > 0) ||
                       (master_readable && out_buffer_full <= 0) ||
                       (stdout_writable && out_buffer_full > 0)) {

                        if (stdin_readable && in_buffer_full < BUFFER_SIZE) {

                                k = read(STDIN_FILENO, in_buffer + in_buffer_full, BUFFER_SIZE - in_buffer_full);
                                if (k < 0) {

                                        if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
                                                stdin_readable = false;
                                        else {
                                                log_error("read(): %m");
                                                r = -errno;
                                                goto finish;
                                        }
                                } else if (k == 0)
                                        /* End of file: without clearing the flag the
                                         * loop condition stays true with an empty
                                         * buffer and we would spin forever. */
                                        stdin_readable = false;
                                else
                                        in_buffer_full += (size_t) k;
                        }

                        if (master_writable && in_buffer_full > 0) {

                                if ((k = write(master, in_buffer, in_buffer_full)) < 0) {

                                        if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
                                                master_writable = false;
                                        else {
                                                log_error("write(): %m");
                                                r = -errno;
                                                goto finish;
                                        }

                                } else {
                                        /* Drop what was written, keep the unwritten tail at the front. */
                                        assert(in_buffer_full >= (size_t) k);
                                        memmove(in_buffer, in_buffer + k, in_buffer_full - k);
                                        in_buffer_full -= k;
                                }
                        }

                        if (master_readable && out_buffer_full < BUFFER_SIZE) {

                                k = read(master, out_buffer + out_buffer_full, BUFFER_SIZE - out_buffer_full);
                                if (k < 0) {

                                        if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
                                                master_readable = false;
                                        else {
                                                log_error("read(): %m");
                                                r = -errno;
                                                goto finish;
                                        }
                                } else if (k == 0)
                                        /* End of file, see the stdin case above. */
                                        master_readable = false;
                                else
                                        out_buffer_full += (size_t) k;
                        }

                        if (stdout_writable && out_buffer_full > 0) {

                                if ((k = write(STDOUT_FILENO, out_buffer, out_buffer_full)) < 0) {

                                        if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
                                                stdout_writable = false;
                                        else {
                                                log_error("write(): %m");
                                                r = -errno;
                                                goto finish;
                                        }

                                } else {
                                        /* Drop what was written, keep the unwritten tail at the front. */
                                        assert(out_buffer_full >= (size_t) k);
                                        memmove(out_buffer, out_buffer + k, out_buffer_full - k);
                                        out_buffer_full -= k;
                                }
                        }
                }
        }

finish:
        if (ep >= 0)
                close_nointr_nofail(ep);

        if (signal_fd >= 0)
                close_nointr_nofail(signal_fd);

        return r;
}
604
605 int main(int argc, char *argv[]) {
606         pid_t pid = 0;
607         int r = EXIT_FAILURE, k;
608         char *oldcg = NULL, *newcg = NULL;
609         int master = -1;
610         const char *console = NULL;
611         struct termios saved_attr, raw_attr;
612         sigset_t mask;
613         bool saved_attr_valid = false;
614         struct winsize ws;
615
616         log_parse_environment();
617         log_open();
618
619         if ((r = parse_argv(argc, argv)) <= 0)
620                 goto finish;
621
622         if (arg_directory) {
623                 char *p;
624
625                 p = path_make_absolute_cwd(arg_directory);
626                 free(arg_directory);
627                 arg_directory = p;
628         } else
629                 arg_directory = get_current_dir_name();
630
631         if (!arg_directory) {
632                 log_error("Failed to determine path");
633                 goto finish;
634         }
635
636         path_kill_slashes(arg_directory);
637
638         if (geteuid() != 0) {
639                 log_error("Need to be root.");
640                 goto finish;
641         }
642
643         if (sd_booted() <= 0) {
644                 log_error("Not running on a systemd system.");
645                 goto finish;
646         }
647
648         if (path_equal(arg_directory, "/")) {
649                 log_error("Spawning container on root directory not supported.");
650                 goto finish;
651         }
652
653         if (is_os_tree(arg_directory) <= 0) {
654                 log_error("Directory %s doesn't look like an OS root directory. Refusing.", arg_directory);
655                 goto finish;
656         }
657
658         if ((k = cg_get_by_pid(SYSTEMD_CGROUP_CONTROLLER, 0, &oldcg)) < 0) {
659                 log_error("Failed to determine current cgroup: %s", strerror(-k));
660                 goto finish;
661         }
662
663         if (asprintf(&newcg, "%s/nspawn-%lu", oldcg, (unsigned long) getpid()) < 0) {
664                 log_error("Failed to allocate cgroup path.");
665                 goto finish;
666         }
667
668         if ((k = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, newcg, 0)) < 0)  {
669                 log_error("Failed to create cgroup: %s", strerror(-k));
670                 goto finish;
671         }
672
673         if ((master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY)) < 0) {
674                 log_error("Failed to acquire pseudo tty: %m");
675                 goto finish;
676         }
677
678         if (!(console = ptsname(master))) {
679                 log_error("Failed to determine tty name: %m");
680                 goto finish;
681         }
682
683         log_info("Spawning namespace container on %s (console is %s).", arg_directory, console);
684
685         if (ioctl(STDIN_FILENO, TIOCGWINSZ, &ws) >= 0)
686                 ioctl(master, TIOCSWINSZ, &ws);
687
688         if (unlockpt(master) < 0) {
689                 log_error("Failed to unlock tty: %m");
690                 goto finish;
691         }
692
693         if (tcgetattr(STDIN_FILENO, &saved_attr) < 0) {
694                 log_error("Failed to get terminal attributes: %m");
695                 goto finish;
696         }
697
698         saved_attr_valid = true;
699
700         raw_attr = saved_attr;
701         cfmakeraw(&raw_attr);
702         raw_attr.c_lflag &= ~ECHO;
703
704         if (tcsetattr(STDIN_FILENO, TCSANOW, &raw_attr) < 0) {
705                 log_error("Failed to set terminal attributes: %m");
706                 goto finish;
707         }
708
709         assert_se(sigemptyset(&mask) == 0);
710         sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
711         assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
712
713         if ((pid = syscall(__NR_clone, SIGCHLD|CLONE_NEWIPC|CLONE_NEWNS|CLONE_NEWPID|CLONE_NEWUTS|(arg_private_network ? CLONE_NEWNET : 0), NULL)) < 0) {
714                 log_error("clone() failed: %m");
715                 goto finish;
716         }
717
718         if (pid == 0) {
719                 /* child */
720
721                 const char *hn;
722                 const char *home = NULL;
723                 uid_t uid = (uid_t) -1;
724                 gid_t gid = (gid_t) -1;
725                 const char *envp[] = {
726                         "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
727                         NULL, /* TERM */
728                         NULL, /* HOME */
729                         NULL, /* USER */
730                         NULL, /* LOGNAME */
731                         NULL
732                 };
733
734                 envp[1] = strv_find_prefix(environ, "TERM=");
735
736                 close_nointr_nofail(master);
737
738                 close_nointr(STDIN_FILENO);
739                 close_nointr(STDOUT_FILENO);
740                 close_nointr(STDERR_FILENO);
741
742                 close_all_fds(NULL, 0);
743
744                 reset_all_signal_handlers();
745
746                 assert_se(sigemptyset(&mask) == 0);
747                 assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
748
749                 if (setsid() < 0)
750                         goto child_fail;
751
752                 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0)
753                         goto child_fail;
754
755                 /* Mark / as private, in case somebody marked it shared */
756                 if (mount(NULL, "/", NULL, MS_PRIVATE|MS_REC, NULL) < 0)
757                         goto child_fail;
758
759                 if (mount_all(arg_directory) < 0)
760                         goto child_fail;
761
762                 if (copy_devnodes(arg_directory, console) < 0)
763                         goto child_fail;
764
765                 if (chdir(arg_directory) < 0) {
766                         log_error("chdir(%s) failed: %m", arg_directory);
767                         goto child_fail;
768                 }
769
770                 if (open_terminal("dev/console", O_RDWR) != STDIN_FILENO ||
771                     dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
772                     dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO)
773                         goto child_fail;
774
775                 if (mount(arg_directory, "/", "bind", MS_BIND|MS_MOVE, NULL) < 0) {
776                         log_error("mount(MS_MOVE) failed: %m");
777                         goto child_fail;
778                 }
779
780                 if (chroot(".") < 0) {
781                         log_error("chroot() failed: %m");
782                         goto child_fail;
783                 }
784
785                 if (chdir("/") < 0) {
786                         log_error("chdir() failed: %m");
787                         goto child_fail;
788                 }
789
790                 umask(0022);
791
792                 loopback_setup();
793
794                 if (drop_capabilities() < 0)
795                         goto child_fail;
796
797                 if (arg_user) {
798
799                         if (get_user_creds((const char**)&arg_user, &uid, &gid, &home) < 0) {
800                                 log_error("get_user_creds() failed: %m");
801                                 goto child_fail;
802                         }
803
804                         if (mkdir_parents(home, 0775) < 0) {
805                                 log_error("mkdir_parents() failed: %m");
806                                 goto child_fail;
807                         }
808
809                         if (safe_mkdir(home, 0775, uid, gid) < 0) {
810                                 log_error("safe_mkdir() failed: %m");
811                                 goto child_fail;
812                         }
813
814                         if (initgroups((const char*)arg_user, gid) < 0) {
815                                 log_error("initgroups() failed: %m");
816                                 goto child_fail;
817                         }
818
819                         if (setresgid(gid, gid, gid) < 0) {
820                                 log_error("setregid() failed: %m");
821                                 goto child_fail;
822                         }
823
824                         if (setresuid(uid, uid, uid) < 0) {
825                                 log_error("setreuid() failed: %m");
826                                 goto child_fail;
827                         }
828                 }
829
830                 if ((asprintf((char**)(envp + 2), "HOME=%s", home? home: "/root") < 0) ||
831                     (asprintf((char**)(envp + 3), "USER=%s", arg_user? arg_user : "root") < 0) ||
832                     (asprintf((char**)(envp + 4), "LOGNAME=%s", arg_user? arg_user : "root") < 0)) {
833                     log_error("Out of memory");
834                     goto child_fail;
835                 }
836
837                 if ((hn = file_name_from_path(arg_directory)))
838                         sethostname(hn, strlen(hn));
839
840                 if (argc > optind)
841                         execvpe(argv[optind], argv + optind, (char**) envp);
842                 else {
843                         chdir(home ? home : "/root");
844                         execle("/bin/bash", "-bash", NULL, (char**) envp);
845                 }
846
847                 log_error("execv() failed: %m");
848
849         child_fail:
850                 _exit(EXIT_FAILURE);
851         }
852
853         if (process_pty(master, &mask) < 0)
854                 goto finish;
855
856         if (saved_attr_valid) {
857                 tcsetattr(STDIN_FILENO, TCSANOW, &saved_attr);
858                 saved_attr_valid = false;
859         }
860
861         r = wait_for_terminate_and_warn(argc > optind ? argv[optind] : "bash", pid);
862
863         if (r < 0)
864                 r = EXIT_FAILURE;
865
866 finish:
867         if (saved_attr_valid)
868                 tcsetattr(STDIN_FILENO, TCSANOW, &saved_attr);
869
870         if (master >= 0)
871                 close_nointr_nofail(master);
872
873         if (oldcg)
874                 cg_attach(SYSTEMD_CGROUP_CONTROLLER, oldcg, 0);
875
876         if (newcg)
877                 cg_kill_recursive_and_wait(SYSTEMD_CGROUP_CONTROLLER, newcg, true);
878
879         free(arg_directory);
880         free(oldcg);
881         free(newcg);
882
883         return r;
884 }