chiark / gitweb /
nspawn: be more careful when initializing the hostname from the directory name
[elogind.git] / src / nspawn / nspawn.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4   This file is part of systemd.
5
6   Copyright 2010 Lennart Poettering
7
8   systemd is free software; you can redistribute it and/or modify it
9   under the terms of the GNU Lesser General Public License as published by
10   the Free Software Foundation; either version 2.1 of the License, or
11   (at your option) any later version.
12
13   systemd is distributed in the hope that it will be useful, but
14   WITHOUT ANY WARRANTY; without even the implied warranty of
15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16   Lesser General Public License for more details.
17
18   You should have received a copy of the GNU Lesser General Public License
19   along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <signal.h>
23 #include <sched.h>
24 #include <unistd.h>
25 #include <sys/types.h>
26 #include <sys/syscall.h>
27 #include <sys/mount.h>
28 #include <sys/wait.h>
29 #include <stdlib.h>
30 #include <string.h>
31 #include <stdio.h>
32 #include <errno.h>
33 #include <sys/prctl.h>
34 #include <sys/capability.h>
35 #include <getopt.h>
36 #include <sys/epoll.h>
37 #include <termios.h>
38 #include <sys/signalfd.h>
39 #include <grp.h>
40 #include <linux/fs.h>
41 #include <sys/un.h>
42 #include <sys/socket.h>
43
44 #include <systemd/sd-daemon.h>
45
46 #include "log.h"
47 #include "util.h"
48 #include "mkdir.h"
49 #include "audit.h"
50 #include "missing.h"
51 #include "cgroup-util.h"
52 #include "strv.h"
53 #include "loopback-setup.h"
54
/* Settings filled in by parse_argv() from the command line. */

/* Root directory of the container (-D/--directory). */
static char *arg_directory = NULL;

/* User name or uid to run the command as (-u/--user). */
static char *arg_user = NULL;

/* Additional cgroup hierarchies to put the container in (-C/--controllers). */
static char **arg_controllers = NULL;

/* Whether networking is disabled in the container (--private-network). */
static bool arg_private_network = false;
59
/* Writes the usage summary to stdout. Always returns 0. */
static int help(void) {

        /* Synopsis line, the only part that needs formatting. */
        printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n",
               program_invocation_short_name);

        /* Description and option list are constant text. */
        fputs("Spawn a minimal namespace container for debugging, testing and building.\n\n"
              "  -h --help             Show this help\n"
              "  -D --directory=NAME   Root directory for the container\n"
              "  -u --user=USER        Run the command under specified user or uid\n"
              "  -C --controllers=LIST Put the container in specified comma-separated cgroup hierarchies\n"
              "     --private-network  Disable network in container\n",
              stdout);

        return 0;
}
73
74 static int parse_argv(int argc, char *argv[]) {
75
76         enum {
77                 ARG_PRIVATE_NETWORK = 0x100
78         };
79
80         static const struct option options[] = {
81                 { "help",            no_argument,       NULL, 'h'                 },
82                 { "directory",       required_argument, NULL, 'D'                 },
83                 { "user",            required_argument, NULL, 'u'                 },
84                 { "controllers",     required_argument, NULL, 'C'                 },
85                 { "private-network", no_argument,       NULL, ARG_PRIVATE_NETWORK },
86                 { NULL,              0,                 NULL, 0                   }
87         };
88
89         int c;
90
91         assert(argc >= 0);
92         assert(argv);
93
94         while ((c = getopt_long(argc, argv, "+hD:u:C:", options, NULL)) >= 0) {
95
96                 switch (c) {
97
98                 case 'h':
99                         help();
100                         return 0;
101
102                 case 'D':
103                         free(arg_directory);
104                         arg_directory = canonicalize_file_name(optarg);
105                         if (!arg_directory) {
106                                 log_error("Failed to canonicalize root directory.");
107                                 return -ENOMEM;
108                         }
109
110                         break;
111
112                 case 'u':
113                         free(arg_user);
114                         if (!(arg_user = strdup(optarg))) {
115                                 log_error("Failed to duplicate user name.");
116                                 return -ENOMEM;
117                         }
118
119                         break;
120
121                 case 'C':
122                         strv_free(arg_controllers);
123                         arg_controllers = strv_split(optarg, ",");
124                         if (!arg_controllers) {
125                                 log_error("Failed to split controllers list.");
126                                 return -ENOMEM;
127                         }
128                         strv_uniq(arg_controllers);
129
130                         break;
131
132                 case ARG_PRIVATE_NETWORK:
133                         arg_private_network = true;
134                         break;
135
136                 case '?':
137                         return -EINVAL;
138
139                 default:
140                         log_error("Unknown option code %c", c);
141                         return -EINVAL;
142                 }
143         }
144
145         return 1;
146 }
147
/* Mounts the API file systems listed in the table below underneath the
 * container root "dest".
 *
 * All entries are attempted even if an earlier one fails (only running
 * out of memory aborts the loop). The return value is 0 on success,
 * otherwise the negative errno-style code of the first failure seen.
 * A failing mount() is only logged and counted as a failure if the
 * entry is flagged fatal. */
static int mount_all(const char *dest) {

        typedef struct MountPoint {
                const char *what;
                const char *where;
                const char *type;
                const char *options;
                unsigned long flags;
                bool fatal;
        } MountPoint;

        static const MountPoint mount_table[] = {
                { "proc",      "/proc",     "proc",  NULL,       MS_NOSUID|MS_NOEXEC|MS_NODEV, true  },
                { "/proc/sys", "/proc/sys", "bind",  NULL,       MS_BIND,                      true  },   /* Bind mount first */
                { "/proc/sys", "/proc/sys", "bind",  NULL,       MS_BIND|MS_RDONLY|MS_REMOUNT, true  },   /* Then, make it r/o */
                { "/sys",      "/sys",      "bind",  NULL,       MS_BIND,                      true  },   /* Bind mount first */
                { "/sys",      "/sys",      "bind",  NULL,       MS_BIND|MS_RDONLY|MS_REMOUNT, true  },   /* Then, make it r/o */
                { "tmpfs",     "/dev",      "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME,     true  },
                { "/dev/pts",  "/dev/pts",  "bind",  NULL,       MS_BIND,                      true  },
                { "tmpfs",     "/run",      "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true  },
#ifdef HAVE_SELINUX
                { "/sys/fs/selinux", "/sys/fs/selinux", "bind", NULL, MS_BIND,                      false },  /* Bind mount first */
                { "/sys/fs/selinux", "/sys/fs/selinux", "bind", NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, false },  /* Then, make it r/o */
#endif
        };

        int result = 0;
        unsigned idx;

        for (idx = 0; idx < ELEMENTSOF(mount_table); idx++) {
                const MountPoint *entry = mount_table + idx;
                char *target;
                int is_mp;
                bool mount_failed;

                if (asprintf(&target, "%s/%s", dest, entry->where) < 0) {
                        log_error("Out of memory");

                        if (result == 0)
                                result = -ENOMEM;

                        /* No point in trying the remaining entries */
                        break;
                }

                is_mp = path_is_mount_point(target, false);
                if (is_mp < 0) {
                        log_error("Failed to detect whether %s is a mount point: %s", target, strerror(-is_mp));
                        free(target);

                        if (result == 0)
                                result = is_mp;

                        continue;
                }

                /* Make sure there is a directory to mount onto */
                mkdir_p(target, 0755);

                mount_failed = mount(entry->what,
                                     target,
                                     entry->type,
                                     entry->flags,
                                     entry->options) < 0;

                if (mount_failed && entry->fatal) {
                        log_error("mount(%s) failed: %m", target);

                        if (result == 0)
                                result = -errno;
                }

                free(target);
        }

        return result;
}
221
/* Bind mounts the host file "what" onto "where" and, if that worked,
 * remounts it read-only. Failures are deliberately ignored. */
static void bind_mount_host_file_ro(const char *what, const char *where) {
        if (mount(what, where, "bind", MS_BIND, NULL) < 0)
                return;

        mount(what, where, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);
}

/* Makes the host's time zone configuration visible in the container
 * rooted at "dest" by bind mounting /etc/localtime and /etc/timezone
 * read-only, on a best-effort basis.
 *
 * Returns 0, or -ENOMEM if a path could not be allocated. */
static int setup_timezone(const char *dest) {
        char *target;

        assert(dest);

        /* Fix the timezone, if possible */
        if (asprintf(&target, "%s/etc/localtime", dest) < 0) {
                log_error("Out of memory");
                return -ENOMEM;
        }

        bind_mount_host_file_ro("/etc/localtime", target);
        free(target);

        if (asprintf(&target, "%s/etc/timezone", dest) < 0) {
                log_error("Out of memory");
                return -ENOMEM;
        }

        bind_mount_host_file_ro("/etc/timezone", target);
        free(target);

        return 0;
}
250
251 static int copy_devnodes(const char *dest) {
252
253         static const char devnodes[] =
254                 "null\0"
255                 "zero\0"
256                 "full\0"
257                 "random\0"
258                 "urandom\0"
259                 "tty\0"
260                 "ptmx\0"
261                 "rtc0\0";
262
263         const char *d;
264         int r = 0;
265         mode_t u;
266
267         assert(dest);
268
269         u = umask(0000);
270
271         NULSTR_FOREACH(d, devnodes) {
272                 struct stat st;
273                 char *from = NULL, *to = NULL;
274
275                 asprintf(&from, "/dev/%s", d);
276                 asprintf(&to, "%s/dev/%s", dest, d);
277
278                 if (!from || !to) {
279                         log_error("Failed to allocate devnode path");
280
281                         free(from);
282                         free(to);
283
284                         from = to = NULL;
285
286                         if (r == 0)
287                                 r = -ENOMEM;
288
289                         break;
290                 }
291
292                 if (stat(from, &st) < 0) {
293
294                         if (errno != ENOENT) {
295                                 log_error("Failed to stat %s: %m", from);
296                                 if (r == 0)
297                                         r = -errno;
298                         }
299
300                 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
301
302                         log_error("%s is not a char or block device, cannot copy.", from);
303                         if (r == 0)
304                                 r = -EIO;
305
306                 } else if (mknod(to, st.st_mode, st.st_rdev) < 0) {
307
308                         log_error("mknod(%s) failed: %m", dest);
309                         if (r == 0)
310                                 r = -errno;
311                 }
312
313                 free(from);
314                 free(to);
315         }
316
317         umask(u);
318
319         return r;
320 }
321
/* Makes the TTY "console" available as /dev/console inside the
 * container rooted at "dest".
 *
 * The TTY's access mode and ownership are corrected first, then a
 * placeholder device node is created and the TTY is bind mounted over
 * it. Returns a negative errno-style value on failure; otherwise the
 * (non-negative) result of chmod_and_chown() is passed through. */
static int setup_dev_console(const char *dest, const char *console) {
        struct stat console_st;
        char *node = NULL;
        mode_t saved_umask;
        int r;

        assert(dest);
        assert(console);

        saved_umask = umask(0000);

        if (stat(console, &console_st) < 0) {
                log_error("Failed to stat %s: %m", console);
                r = -errno;
                goto finish;
        }

        if (!S_ISCHR(console_st.st_mode)) {
                log_error("/dev/console is not a char device.");
                r = -EIO;
                goto finish;
        }

        r = chmod_and_chown(console, 0600, 0, 0);
        if (r < 0) {
                log_error("Failed to correct access mode for TTY: %s", strerror(-r));
                goto finish;
        }

        if (asprintf(&node, "%s/dev/console", dest) < 0) {
                log_error("Out of memory");
                r = -ENOMEM;
                goto finish;
        }

        /* A pty can only live on a pts file system, so the right tty
         * has to be bind mounted onto /dev/console. The bind mount
         * needs something to sit on, hence create a device node first.
         * Its major/minor is taken from the tty, but does not really
         * matter since the node is mounted over right away. */

        if (mknod(node, (console_st.st_mode & ~07777) | 0600, console_st.st_rdev) < 0) {
                log_error("mknod() for /dev/console failed: %m");
                r = -errno;
                goto finish;
        }

        if (mount(console, node, "bind", MS_BIND, NULL) < 0) {
                log_error("Bind mount for /dev/console failed: %m");
                r = -errno;
        }

finish:
        free(node);
        umask(saved_umask);

        return r;
}
381
/* Provides a fake /proc/kmsg inside the container rooted at "dest".
 *
 * A FIFO is created as <dest>/dev/kmsg, bind mounted onto
 * <dest>/proc/kmsg, opened, and the open file descriptor is queued on
 * "kmsg_socket" as an SCM_RIGHTS message. Finally the FIFO is unlinked
 * from /dev again.
 *
 * Returns a negative errno-style value on failure; otherwise the
 * (non-negative) result of chmod_and_chown() is passed through. */
static int setup_kmsg(const char *dest, int kmsg_socket) {
        char *fifo_path = NULL, *proc_path = NULL;
        int r, fifo_fd, sent;
        mode_t saved_umask;
        union {
                struct cmsghdr cmsghdr;
                uint8_t buf[CMSG_SPACE(sizeof(int))];
        } control;
        struct msghdr mh;
        struct cmsghdr *cmsg;

        assert(dest);
        assert(kmsg_socket >= 0);

        saved_umask = umask(0000);

        /* The FIFO is created under the name /dev/kmsg, but removed
         * again as soon as it has been bind mounted to /proc/kmsg.
         * Reading from a FIFO is very similar to reading /proc/kmsg,
         * but writing is not like writing to /dev/kmsg: a FIFO write
         * blocks while nobody is reading. To keep containers from
         * deadlocking on that, /dev/kmsg is simply not made available
         * to them. */
        if (asprintf(&fifo_path, "%s/dev/kmsg", dest) < 0) {
                log_error("Out of memory");
                r = -ENOMEM;
                goto finish;
        }

        if (asprintf(&proc_path, "%s/proc/kmsg", dest) < 0) {
                log_error("Out of memory");
                r = -ENOMEM;
                goto finish;
        }

        if (mkfifo(fifo_path, 0600) < 0) {
                log_error("mkfifo() for /dev/kmsg failed: %m");
                r = -errno;
                goto finish;
        }

        r = chmod_and_chown(fifo_path, 0600, 0, 0);
        if (r < 0) {
                log_error("Failed to correct access mode for /dev/kmsg: %s", strerror(-r));
                goto finish;
        }

        if (mount(fifo_path, proc_path, "bind", MS_BIND, NULL) < 0) {
                log_error("Bind mount for /proc/kmsg failed: %m");
                r = -errno;
                goto finish;
        }

        fifo_fd = open(fifo_path, O_RDWR|O_NDELAY|O_CLOEXEC);
        if (fifo_fd < 0) {
                log_error("Failed to open fifo: %m");
                r = -errno;
                goto finish;
        }

        /* Build a message that carries nothing but the fd as
         * ancillary data. */
        zero(control);
        zero(mh);

        mh.msg_control = &control;
        mh.msg_controllen = sizeof(control);

        cmsg = CMSG_FIRSTHDR(&mh);
        cmsg->cmsg_level = SOL_SOCKET;
        cmsg->cmsg_type = SCM_RIGHTS;
        cmsg->cmsg_len = CMSG_LEN(sizeof(int));
        memcpy(CMSG_DATA(cmsg), &fifo_fd, sizeof(int));

        mh.msg_controllen = cmsg->cmsg_len;

        /* Park the fd in the socket, which keeps it open for as long
         * as the child runs. Our own copy is no longer needed. */
        sent = sendmsg(kmsg_socket, &mh, MSG_DONTWAIT|MSG_NOSIGNAL);
        close_nointr_nofail(fifo_fd);

        if (sent < 0) {
                log_error("Failed to send FIFO fd: %m");
                r = -errno;
                goto finish;
        }

        /* Finally, hide the FIFO from /dev/kmsg... */
        unlink(fifo_path);

finish:
        free(fifo_path);
        free(proc_path);
        umask(saved_umask);

        return r;
}
477
478 static int setup_hostname(void) {
479         char *hn;
480         int r = 0;
481
482         hn = file_name_from_path(arg_directory);
483         if (hn) {
484                 hn = strdup(hn);
485                 if (!hn)
486                         return -ENOMEM;
487
488                 hostname_cleanup(hn);
489
490                 if (!isempty(hn))
491                         if (sethostname(hn, strlen(hn)) < 0)
492                                 r = -errno;
493
494                 free(hn);
495         }
496
497         return r;
498 }
499
500 static int drop_capabilities(void) {
501         static const unsigned long retain[] = {
502                 CAP_CHOWN,
503                 CAP_DAC_OVERRIDE,
504                 CAP_DAC_READ_SEARCH,
505                 CAP_FOWNER,
506                 CAP_FSETID,
507                 CAP_IPC_OWNER,
508                 CAP_KILL,
509                 CAP_LEASE,
510                 CAP_LINUX_IMMUTABLE,
511                 CAP_NET_BIND_SERVICE,
512                 CAP_NET_BROADCAST,
513                 CAP_NET_RAW,
514                 CAP_SETGID,
515                 CAP_SETFCAP,
516                 CAP_SETPCAP,
517                 CAP_SETUID,
518                 CAP_SYS_ADMIN,
519                 CAP_SYS_CHROOT,
520                 CAP_SYS_NICE,
521                 CAP_SYS_PTRACE,
522                 CAP_SYS_TTY_CONFIG
523         };
524
525         unsigned long l;
526
527         for (l = 0; l <= cap_last_cap(); l++) {
528                 unsigned i;
529
530                 for (i = 0; i < ELEMENTSOF(retain); i++)
531                         if (retain[i] == l)
532                                 break;
533
534                 if (i < ELEMENTSOF(retain))
535                         continue;
536
537                 if (prctl(PR_CAPBSET_DROP, l) < 0) {
538                         log_error("PR_CAPBSET_DROP failed: %m");
539                         return -errno;
540                 }
541         }
542
543         return 0;
544 }
545
/* Checks whether "path" looks like the root directory of an OS
 * installation. The existence of <path>/bin/sh is used as the
 * indicator.
 *
 * Returns 1 if the file exists, 0 if it cannot be accessed, and
 * -ENOMEM if the path to test could not be allocated. */
static int is_os_tree(const char *path) {
        char *shell;
        int exists;

        if (asprintf(&shell, "%s/bin/sh", path) < 0)
                return -ENOMEM;

        exists = access(shell, F_OK) >= 0;
        free(shell);

        return exists ? 1 : 0;
}
559
/* Shovels data between our stdin/stdout and the pty "master" until a
 * signal from "mask" other than SIGWINCH arrives or an error occurs.
 *
 * stdin is forwarded to the master and the master's output to stdout,
 * each through a LINE_MAX sized buffer. All three fds are switched to
 * non-blocking mode and watched edge-triggered via epoll; the
 * "readable"/"writable" flags remember that an fd is ready until a
 * read()/write() on it says otherwise. Signals in "mask" are received
 * through a signalfd: SIGWINCH copies the window size from stdin to
 * the master, any other signal ends the loop.
 *
 * Returns 0 when terminated by a signal, a negative errno-style value
 * on failure. */
static int process_pty(int master, sigset_t *mask) {

        char in_buffer[LINE_MAX], out_buffer[LINE_MAX];
        size_t in_buffer_full = 0, out_buffer_full = 0;
        struct epoll_event stdin_ev, stdout_ev, master_ev, signal_ev;
        bool stdin_readable = false, stdout_writable = false, master_readable = false, master_writable = false;
        int ep = -1, signal_fd = -1, r;

        fd_nonblock(STDIN_FILENO, 1);
        fd_nonblock(STDOUT_FILENO, 1);
        fd_nonblock(master, 1);

        signal_fd = signalfd(-1, mask, SFD_NONBLOCK|SFD_CLOEXEC);
        if (signal_fd < 0) {
                log_error("signalfd(): %m");
                r = -errno;
                goto finish;
        }

        ep = epoll_create1(EPOLL_CLOEXEC);
        if (ep < 0) {
                log_error("Failed to create epoll: %m");
                r = -errno;
                goto finish;
        }

        zero(stdin_ev);
        stdin_ev.events = EPOLLIN|EPOLLET;
        stdin_ev.data.fd = STDIN_FILENO;

        zero(stdout_ev);
        stdout_ev.events = EPOLLOUT|EPOLLET;
        stdout_ev.data.fd = STDOUT_FILENO;

        zero(master_ev);
        master_ev.events = EPOLLIN|EPOLLOUT|EPOLLET;
        master_ev.data.fd = master;

        /* The signal fd is watched level-triggered */
        zero(signal_ev);
        signal_ev.events = EPOLLIN;
        signal_ev.data.fd = signal_fd;

        if (epoll_ctl(ep, EPOLL_CTL_ADD, STDIN_FILENO, &stdin_ev) < 0 ||
            epoll_ctl(ep, EPOLL_CTL_ADD, STDOUT_FILENO, &stdout_ev) < 0 ||
            epoll_ctl(ep, EPOLL_CTL_ADD, master, &master_ev) < 0 ||
            epoll_ctl(ep, EPOLL_CTL_ADD, signal_fd, &signal_ev) < 0) {
                log_error("Failed to register fds in epoll: %m");
                r = -errno;
                goto finish;
        }

        for (;;) {
                struct epoll_event ev[16];
                ssize_t k;
                int i, nfds;

                nfds = epoll_wait(ep, ev, ELEMENTSOF(ev), -1);
                if (nfds < 0) {

                        if (errno == EINTR || errno == EAGAIN)
                                continue;

                        log_error("epoll_wait(): %m");
                        r = -errno;
                        goto finish;
                }

                assert(nfds >= 1);

                /* First, record which fds became ready and handle signals */
                for (i = 0; i < nfds; i++) {
                        if (ev[i].data.fd == STDIN_FILENO) {

                                if (ev[i].events & (EPOLLIN|EPOLLHUP))
                                        stdin_readable = true;

                        } else if (ev[i].data.fd == STDOUT_FILENO) {

                                if (ev[i].events & (EPOLLOUT|EPOLLHUP))
                                        stdout_writable = true;

                        } else if (ev[i].data.fd == master) {

                                if (ev[i].events & (EPOLLIN|EPOLLHUP))
                                        master_readable = true;

                                if (ev[i].events & (EPOLLOUT|EPOLLHUP))
                                        master_writable = true;

                        } else if (ev[i].data.fd == signal_fd) {
                                struct signalfd_siginfo sfsi;
                                ssize_t n;

                                n = read(signal_fd, &sfsi, sizeof(sfsi));
                                if (n != sizeof(sfsi)) {

                                        if (n >= 0) {
                                                log_error("Failed to read from signalfd: invalid block size");
                                                r = -EIO;
                                                goto finish;
                                        }

                                        if (errno != EINTR && errno != EAGAIN) {
                                                log_error("Failed to read from signalfd: %m");
                                                r = -errno;
                                                goto finish;
                                        }
                                } else {

                                        if (sfsi.ssi_signo == SIGWINCH) {
                                                struct winsize ws;

                                                /* The window size changed, let's forward that. */
                                                if (ioctl(STDIN_FILENO, TIOCGWINSZ, &ws) >= 0)
                                                        ioctl(master, TIOCSWINSZ, &ws);
                                        } else {
                                                /* Any other signal asks us to stop */
                                                r = 0;
                                                goto finish;
                                        }
                                }
                        }
                }

                /* Then, move data for as long as some transfer can make progress */
                while ((stdin_readable && in_buffer_full <= 0) ||
                       (master_writable && in_buffer_full > 0) ||
                       (master_readable && out_buffer_full <= 0) ||
                       (stdout_writable && out_buffer_full > 0)) {

                        if (stdin_readable && in_buffer_full < LINE_MAX) {

                                k = read(STDIN_FILENO, in_buffer + in_buffer_full, LINE_MAX - in_buffer_full);
                                if (k < 0) {

                                        if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
                                                stdin_readable = false;
                                        else {
                                                log_error("read(): %m");
                                                r = -errno;
                                                goto finish;
                                        }
                                } else if (k == 0)
                                        /* EOF: nothing more will arrive. Without this the
                                         * flag would stay set with an empty buffer and
                                         * this loop would spin forever. */
                                        stdin_readable = false;
                                else
                                        in_buffer_full += (size_t) k;
                        }

                        if (master_writable && in_buffer_full > 0) {

                                k = write(master, in_buffer, in_buffer_full);
                                if (k < 0) {

                                        if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
                                                master_writable = false;
                                        else {
                                                log_error("write(): %m");
                                                r = -errno;
                                                goto finish;
                                        }

                                } else {
                                        /* Drop what was written, keep the rest at the front */
                                        assert(in_buffer_full >= (size_t) k);
                                        memmove(in_buffer, in_buffer + k, in_buffer_full - k);
                                        in_buffer_full -= k;
                                }
                        }

                        if (master_readable && out_buffer_full < LINE_MAX) {

                                k = read(master, out_buffer + out_buffer_full, LINE_MAX - out_buffer_full);
                                if (k < 0) {

                                        if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
                                                master_readable = false;
                                        else {
                                                log_error("read(): %m");
                                                r = -errno;
                                                goto finish;
                                        }
                                } else if (k == 0)
                                        /* EOF, see above */
                                        master_readable = false;
                                else
                                        out_buffer_full += (size_t) k;
                        }

                        if (stdout_writable && out_buffer_full > 0) {

                                k = write(STDOUT_FILENO, out_buffer, out_buffer_full);
                                if (k < 0) {

                                        if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
                                                stdout_writable = false;
                                        else {
                                                log_error("write(): %m");
                                                r = -errno;
                                                goto finish;
                                        }

                                } else {
                                        assert(out_buffer_full >= (size_t) k);
                                        memmove(out_buffer, out_buffer + k, out_buffer_full - k);
                                        out_buffer_full -= k;
                                }
                        }
                }
        }

finish:
        if (ep >= 0)
                close_nointr_nofail(ep);

        if (signal_fd >= 0)
                close_nointr_nofail(signal_fd);

        return r;
}
762
763 int main(int argc, char *argv[]) {
764         pid_t pid = 0;
765         int r = EXIT_FAILURE, k;
766         char *oldcg = NULL, *newcg = NULL;
767         char **controller = NULL;
768         int master = -1;
769         const char *console = NULL;
770         struct termios saved_attr, raw_attr;
771         sigset_t mask;
772         bool saved_attr_valid = false;
773         struct winsize ws;
774         int kmsg_socket_pair[2] = { -1, -1 };
775
776         log_parse_environment();
777         log_open();
778
779         if ((r = parse_argv(argc, argv)) <= 0)
780                 goto finish;
781
782         if (arg_directory) {
783                 char *p;
784
785                 p = path_make_absolute_cwd(arg_directory);
786                 free(arg_directory);
787                 arg_directory = p;
788         } else
789                 arg_directory = get_current_dir_name();
790
791         if (!arg_directory) {
792                 log_error("Failed to determine path");
793                 goto finish;
794         }
795
796         path_kill_slashes(arg_directory);
797
798         if (geteuid() != 0) {
799                 log_error("Need to be root.");
800                 goto finish;
801         }
802
803         if (sd_booted() <= 0) {
804                 log_error("Not running on a systemd system.");
805                 goto finish;
806         }
807
808         if (path_equal(arg_directory, "/")) {
809                 log_error("Spawning container on root directory not supported.");
810                 goto finish;
811         }
812
813         if (is_os_tree(arg_directory) <= 0) {
814                 log_error("Directory %s doesn't look like an OS root directory. Refusing.", arg_directory);
815                 goto finish;
816         }
817
818         if ((k = cg_get_by_pid(SYSTEMD_CGROUP_CONTROLLER, 0, &oldcg)) < 0) {
819                 log_error("Failed to determine current cgroup: %s", strerror(-k));
820                 goto finish;
821         }
822
823         if (asprintf(&newcg, "%s/nspawn-%lu", oldcg, (unsigned long) getpid()) < 0) {
824                 log_error("Failed to allocate cgroup path.");
825                 goto finish;
826         }
827
828         k = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, newcg, 0);
829         if (k < 0)  {
830                 log_error("Failed to create cgroup: %s", strerror(-k));
831                 goto finish;
832         }
833
834         STRV_FOREACH(controller,arg_controllers) {
835                 k = cg_create_and_attach(*controller, newcg, 0);
836                 if (k < 0)
837                         log_warning("Failed to create cgroup in controller %s: %s", *controller, strerror(-k));
838         }
839
840         if ((master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY)) < 0) {
841                 log_error("Failed to acquire pseudo tty: %m");
842                 goto finish;
843         }
844
845         if (!(console = ptsname(master))) {
846                 log_error("Failed to determine tty name: %m");
847                 goto finish;
848         }
849
850         log_info("Spawning namespace container on %s (console is %s).", arg_directory, console);
851
852         if (ioctl(STDIN_FILENO, TIOCGWINSZ, &ws) >= 0)
853                 ioctl(master, TIOCSWINSZ, &ws);
854
855         if (unlockpt(master) < 0) {
856                 log_error("Failed to unlock tty: %m");
857                 goto finish;
858         }
859
860         if (tcgetattr(STDIN_FILENO, &saved_attr) < 0) {
861                 log_error("Failed to get terminal attributes: %m");
862                 goto finish;
863         }
864
865         saved_attr_valid = true;
866
867         raw_attr = saved_attr;
868         cfmakeraw(&raw_attr);
869         raw_attr.c_lflag &= ~ECHO;
870
871         if (tcsetattr(STDIN_FILENO, TCSANOW, &raw_attr) < 0) {
872                 log_error("Failed to set terminal attributes: %m");
873                 goto finish;
874         }
875
876         if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_NONBLOCK|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
877                 log_error("Failed to create kmsg socket pair");
878                 goto finish;
879         }
880
881         assert_se(sigemptyset(&mask) == 0);
882         sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
883         assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
884
885         pid = syscall(__NR_clone, SIGCHLD|CLONE_NEWIPC|CLONE_NEWNS|CLONE_NEWPID|CLONE_NEWUTS|(arg_private_network ? CLONE_NEWNET : 0), NULL);
886         if (pid < 0) {
887                 if (errno == EINVAL)
888                         log_error("clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
889                 else
890                         log_error("clone() failed: %m");
891
892                 goto finish;
893         }
894
895         if (pid == 0) {
896                 /* child */
897
898                 const char *home = NULL;
899                 uid_t uid = (uid_t) -1;
900                 gid_t gid = (gid_t) -1;
901                 const char *envp[] = {
902                         "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
903                         "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
904                         NULL, /* TERM */
905                         NULL, /* HOME */
906                         NULL, /* USER */
907                         NULL, /* LOGNAME */
908                         NULL
909                 };
910
911                 envp[2] = strv_find_prefix(environ, "TERM=");
912
913                 close_nointr_nofail(master);
914
915                 close_nointr(STDIN_FILENO);
916                 close_nointr(STDOUT_FILENO);
917                 close_nointr(STDERR_FILENO);
918
919                 close_all_fds(&kmsg_socket_pair[1], 1);
920
921                 reset_all_signal_handlers();
922
923                 assert_se(sigemptyset(&mask) == 0);
924                 assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
925
926                 if (setsid() < 0)
927                         goto child_fail;
928
929                 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0)
930                         goto child_fail;
931
932                 /* Mark / as private, in case somebody marked it shared */
933                 if (mount(NULL, "/", NULL, MS_PRIVATE|MS_REC, NULL) < 0)
934                         goto child_fail;
935
936                 if (mount_all(arg_directory) < 0)
937                         goto child_fail;
938
939                 if (copy_devnodes(arg_directory) < 0)
940                         goto child_fail;
941
942                 if (setup_dev_console(arg_directory, console) < 0)
943                         goto child_fail;
944
945                 if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
946                         goto child_fail;
947
948                 close_nointr_nofail(kmsg_socket_pair[1]);
949
950                 if (setup_timezone(arg_directory) < 0)
951                         goto child_fail;
952
953                 if (chdir(arg_directory) < 0) {
954                         log_error("chdir(%s) failed: %m", arg_directory);
955                         goto child_fail;
956                 }
957
958                 if (open_terminal("dev/console", O_RDWR) != STDIN_FILENO ||
959                     dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
960                     dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO)
961                         goto child_fail;
962
963                 if (mount(arg_directory, "/", "bind", MS_BIND, NULL) < 0) {
964                         log_error("mount(MS_MOVE) failed: %m");
965                         goto child_fail;
966                 }
967
968                 if (chroot(".") < 0) {
969                         log_error("chroot() failed: %m");
970                         goto child_fail;
971                 }
972
973                 if (chdir("/") < 0) {
974                         log_error("chdir() failed: %m");
975                         goto child_fail;
976                 }
977
978                 umask(0022);
979
980                 loopback_setup();
981
982                 if (drop_capabilities() < 0)
983                         goto child_fail;
984
985                 if (arg_user) {
986
987                         if (get_user_creds((const char**)&arg_user, &uid, &gid, &home) < 0) {
988                                 log_error("get_user_creds() failed: %m");
989                                 goto child_fail;
990                         }
991
992                         if (mkdir_parents(home, 0775) < 0) {
993                                 log_error("mkdir_parents() failed: %m");
994                                 goto child_fail;
995                         }
996
997                         if (safe_mkdir(home, 0775, uid, gid) < 0) {
998                                 log_error("safe_mkdir() failed: %m");
999                                 goto child_fail;
1000                         }
1001
1002                         if (initgroups((const char*)arg_user, gid) < 0) {
1003                                 log_error("initgroups() failed: %m");
1004                                 goto child_fail;
1005                         }
1006
1007                         if (setresgid(gid, gid, gid) < 0) {
1008                                 log_error("setregid() failed: %m");
1009                                 goto child_fail;
1010                         }
1011
1012                         if (setresuid(uid, uid, uid) < 0) {
1013                                 log_error("setreuid() failed: %m");
1014                                 goto child_fail;
1015                         }
1016                 }
1017
1018                 if ((asprintf((char**)(envp + 3), "HOME=%s", home? home: "/root") < 0) ||
1019                     (asprintf((char**)(envp + 4), "USER=%s", arg_user? arg_user : "root") < 0) ||
1020                     (asprintf((char**)(envp + 5), "LOGNAME=%s", arg_user? arg_user : "root") < 0)) {
1021                     log_error("Out of memory");
1022                     goto child_fail;
1023                 }
1024
1025                 setup_hostname();
1026
1027                 if (argc > optind)
1028                         execvpe(argv[optind], argv + optind, (char**) envp);
1029                 else {
1030                         chdir(home ? home : "/root");
1031                         execle("/bin/bash", "-bash", NULL, (char**) envp);
1032                 }
1033
1034                 log_error("execv() failed: %m");
1035
1036         child_fail:
1037                 _exit(EXIT_FAILURE);
1038         }
1039
1040         if (process_pty(master, &mask) < 0)
1041                 goto finish;
1042
1043         if (saved_attr_valid) {
1044                 tcsetattr(STDIN_FILENO, TCSANOW, &saved_attr);
1045                 saved_attr_valid = false;
1046         }
1047
1048         r = wait_for_terminate_and_warn(argc > optind ? argv[optind] : "bash", pid);
1049
1050         if (r < 0)
1051                 r = EXIT_FAILURE;
1052
1053 finish:
1054         if (saved_attr_valid)
1055                 tcsetattr(STDIN_FILENO, TCSANOW, &saved_attr);
1056
1057         if (master >= 0)
1058                 close_nointr_nofail(master);
1059
1060         close_pipe(kmsg_socket_pair);
1061
1062         if (oldcg)
1063                 cg_attach(SYSTEMD_CGROUP_CONTROLLER, oldcg, 0);
1064
1065         if (newcg)
1066                 cg_kill_recursive_and_wait(SYSTEMD_CGROUP_CONTROLLER, newcg, true);
1067
1068         free(arg_directory);
1069         strv_free(arg_controllers);
1070         free(oldcg);
1071         free(newcg);
1072
1073         return r;
1074 }