chiark / gitweb /
nspawn: bind mount /etc/resolv.conf from the host by default
[elogind.git] / src / nspawn / nspawn.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4   This file is part of systemd.
5
6   Copyright 2010 Lennart Poettering
7
8   systemd is free software; you can redistribute it and/or modify it
9   under the terms of the GNU Lesser General Public License as published by
10   the Free Software Foundation; either version 2.1 of the License, or
11   (at your option) any later version.
12
13   systemd is distributed in the hope that it will be useful, but
14   WITHOUT ANY WARRANTY; without even the implied warranty of
15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16   Lesser General Public License for more details.
17
18   You should have received a copy of the GNU Lesser General Public License
19   along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <signal.h>
23 #include <sched.h>
24 #include <unistd.h>
25 #include <sys/types.h>
26 #include <sys/syscall.h>
27 #include <sys/mount.h>
28 #include <sys/wait.h>
29 #include <stdlib.h>
30 #include <string.h>
31 #include <stdio.h>
32 #include <errno.h>
33 #include <sys/prctl.h>
34 #include <sys/capability.h>
35 #include <getopt.h>
36 #include <sys/epoll.h>
37 #include <termios.h>
38 #include <sys/signalfd.h>
39 #include <grp.h>
40 #include <linux/fs.h>
41 #include <sys/un.h>
42 #include <sys/socket.h>
43
44 #include <systemd/sd-daemon.h>
45
46 #include "log.h"
47 #include "util.h"
48 #include "mkdir.h"
49 #include "audit.h"
50 #include "missing.h"
51 #include "cgroup-util.h"
52 #include "strv.h"
53 #include "loopback-setup.h"
54
/* Command-line settings; parse_argv() below fills these in. */
55 static char *arg_directory = NULL;          /* -D/--directory: canonicalized container root (heap-allocated) */
56 static char *arg_user = NULL;               /* -u/--user: strdup()ed copy of the option argument */
57 static char **arg_controllers = NULL;       /* -C/--controllers: comma-split, de-duplicated list */
58 static char *arg_uuid = NULL;               /* --uuid: points straight at optarg, not a private copy */
59 static bool arg_private_network = false;    /* --private-network was given */
60 static bool arg_boot = false;               /* -b/--boot was given */
61
/* Print the usage summary for this program to stdout. Always returns 0. */
static int help(void) {
        static const char usage[] =
                "%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
                "Spawn a minimal namespace container for debugging, testing and building.\n\n"
                "  -h --help             Show this help\n"
                "  -D --directory=NAME   Root directory for the container\n"
                "  -b --boot             Boot up full system (i.e. invoke init)\n"
                "  -u --user=USER        Run the command under specified user or uid\n"
                "  -C --controllers=LIST Put the container in specified comma-separated cgroup hierarchies\n"
                "     --uuid=UUID        Set a specific machine UUID for the container\n"
                "     --private-network  Disable network in container\n";

        /* The only substitution is the name we were invoked under */
        printf(usage, program_invocation_short_name);

        return 0;
}
77
78 static int parse_argv(int argc, char *argv[]) {
79
80         enum {
81                 ARG_PRIVATE_NETWORK = 0x100,
82                 ARG_UUID
83         };
84
85         static const struct option options[] = {
86                 { "help",            no_argument,       NULL, 'h'                 },
87                 { "directory",       required_argument, NULL, 'D'                 },
88                 { "user",            required_argument, NULL, 'u'                 },
89                 { "controllers",     required_argument, NULL, 'C'                 },
90                 { "private-network", no_argument,       NULL, ARG_PRIVATE_NETWORK },
91                 { "boot",            no_argument,       NULL, 'b'                 },
92                 { "uuid",            required_argument, NULL, ARG_UUID            },
93                 { NULL,              0,                 NULL, 0                   }
94         };
95
96         int c;
97
98         assert(argc >= 0);
99         assert(argv);
100
101         while ((c = getopt_long(argc, argv, "+hD:u:C:b", options, NULL)) >= 0) {
102
103                 switch (c) {
104
105                 case 'h':
106                         help();
107                         return 0;
108
109                 case 'D':
110                         free(arg_directory);
111                         arg_directory = canonicalize_file_name(optarg);
112                         if (!arg_directory) {
113                                 log_error("Failed to canonicalize root directory.");
114                                 return -ENOMEM;
115                         }
116
117                         break;
118
119                 case 'u':
120                         free(arg_user);
121                         if (!(arg_user = strdup(optarg))) {
122                                 log_error("Failed to duplicate user name.");
123                                 return -ENOMEM;
124                         }
125
126                         break;
127
128                 case 'C':
129                         strv_free(arg_controllers);
130                         arg_controllers = strv_split(optarg, ",");
131                         if (!arg_controllers) {
132                                 log_error("Failed to split controllers list.");
133                                 return -ENOMEM;
134                         }
135                         strv_uniq(arg_controllers);
136
137                         break;
138
139                 case ARG_PRIVATE_NETWORK:
140                         arg_private_network = true;
141                         break;
142
143                 case 'b':
144                         arg_boot = true;
145                         break;
146
147                 case ARG_UUID:
148                         arg_uuid = optarg;
149                         break;
150
151                 case '?':
152                         return -EINVAL;
153
154                 default:
155                         log_error("Unknown option code %c", c);
156                         return -EINVAL;
157                 }
158         }
159
160         return 1;
161 }
162
/*
 * Mount every entry of the table below underneath the directory dest.
 *
 * The loop keeps going after a failed entry so that as much as possible gets
 * set up; only running out of memory stops it early. The return value is 0 if
 * nothing went wrong, otherwise the negative errno-style code of the first
 * problem seen. A failing mount() for an entry that is not marked fatal is
 * ignored.
 */
static int mount_all(const char *dest) {

        typedef struct MountPoint {
                const char *what;
                const char *where;
                const char *type;
                const char *options;
                unsigned long flags;
                bool fatal;
        } MountPoint;

        static const MountPoint mount_table[] = {
                { "proc",      "/proc",     "proc",  NULL,       MS_NOSUID|MS_NOEXEC|MS_NODEV, true  },
                { "/proc/sys", "/proc/sys", "bind",  NULL,       MS_BIND, true                       },   /* Bind mount first */
                { "/proc/sys", "/proc/sys", "bind",  NULL,       MS_BIND|MS_RDONLY|MS_REMOUNT, true  },   /* Then, make it r/o */
                { "/sys",      "/sys",      "bind",  NULL,       MS_BIND,                      true  },   /* Bind mount first */
                { "/sys",      "/sys",      "bind",  NULL,       MS_BIND|MS_RDONLY|MS_REMOUNT, true  },   /* Then, make it r/o */
                { "tmpfs",     "/dev",      "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME,     true  },
                { "/dev/pts",  "/dev/pts",  "bind",  NULL,       MS_BIND,                      true  },
                { "tmpfs",     "/run",      "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true  },
#ifdef HAVE_SELINUX
                { "/sys/fs/selinux", "/sys/fs/selinux", "bind", NULL, MS_BIND,                      false },  /* Bind mount first */
                { "/sys/fs/selinux", "/sys/fs/selinux", "bind", NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, false },  /* Then, make it r/o */
#endif
        };

        unsigned i;
        int r = 0;

        for (i = 0; i < ELEMENTSOF(mount_table); i++) {
                const MountPoint *m = mount_table + i;
                char *target;
                int q;

                /* Build the mount point path inside the container tree */
                if (asprintf(&target, "%s/%s", dest, m->where) < 0) {
                        log_error("Out of memory");

                        if (r == 0)
                                r = -ENOMEM;

                        break;
                }

                /* Only the failure case of this check matters here */
                q = path_is_mount_point(target, false);
                if (q < 0) {
                        log_error("Failed to detect whether %s is a mount point: %s", target, strerror(-q));
                        free(target);

                        if (r == 0)
                                r = q;

                        continue;
                }

                /* Make sure there is a directory to mount onto */
                mkdir_p(target, 0755);

                if (mount(m->what, target, m->type, m->flags, m->options) < 0 &&
                    m->fatal) {

                        log_error("mount(%s) failed: %m", target);

                        if (r == 0)
                                r = -errno;
                }

                free(target);
        }

        return r;
}
236
/* Bind mount the host file source onto target and, if that worked, try to
 * turn the new mount read-only. Failures are deliberately ignored. */
static void timezone_bind_ro(const char *source, const char *target) {
        if (mount(source, target, "bind", MS_BIND, NULL) < 0)
                return;

        mount(source, target, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);
}

/*
 * Make the host's time zone settings visible below dest, on a best-effort
 * basis: /etc/localtime and /etc/timezone are bind mounted over their
 * counterparts in the container tree.
 *
 * Returns 0, or -ENOMEM if a path could not be allocated. Mount failures are
 * not reported.
 */
static int setup_timezone(const char *dest) {
        char *target;

        assert(dest);

        if (asprintf(&target, "%s/etc/localtime", dest) < 0) {
                log_error("Out of memory");
                return -ENOMEM;
        }

        timezone_bind_ro("/etc/localtime", target);
        free(target);

        if (asprintf(&target, "%s/etc/timezone", dest) < 0) {
                log_error("Out of memory");
                return -ENOMEM;
        }

        timezone_bind_ro("/etc/timezone", target);
        free(target);

        return 0;
}
265
266 static int setup_resolv_conf(const char *dest) {
267         char *where;
268
269         assert(dest);
270
271         if (arg_private_network)
272                 return 0;
273
274         /* Fix resolv.conf, if possible */
275         if (asprintf(&where, "%s/etc/resolv.conf", dest) < 0) {
276                 log_error("Out of memory");
277                 return -ENOMEM;
278         }
279
280         if (mount("/etc/resolv.conf", where, "bind", MS_BIND, NULL) >= 0)
281                 mount("/etc/resolv.conf", where, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);
282
283         free(where);
284
285         return 0;
286 }
287
288 static int copy_devnodes(const char *dest) {
289
290         static const char devnodes[] =
291                 "null\0"
292                 "zero\0"
293                 "full\0"
294                 "random\0"
295                 "urandom\0"
296                 "tty\0"
297                 "ptmx\0"
298                 "rtc0\0";
299
300         const char *d;
301         int r = 0;
302         mode_t u;
303
304         assert(dest);
305
306         u = umask(0000);
307
308         NULSTR_FOREACH(d, devnodes) {
309                 struct stat st;
310                 char *from = NULL, *to = NULL;
311
312                 asprintf(&from, "/dev/%s", d);
313                 asprintf(&to, "%s/dev/%s", dest, d);
314
315                 if (!from || !to) {
316                         log_error("Failed to allocate devnode path");
317
318                         free(from);
319                         free(to);
320
321                         from = to = NULL;
322
323                         if (r == 0)
324                                 r = -ENOMEM;
325
326                         break;
327                 }
328
329                 if (stat(from, &st) < 0) {
330
331                         if (errno != ENOENT) {
332                                 log_error("Failed to stat %s: %m", from);
333                                 if (r == 0)
334                                         r = -errno;
335                         }
336
337                 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
338
339                         log_error("%s is not a char or block device, cannot copy.", from);
340                         if (r == 0)
341                                 r = -EIO;
342
343                 } else if (mknod(to, st.st_mode, st.st_rdev) < 0) {
344
345                         log_error("mknod(%s) failed: %m", dest);
346                         if (r == 0)
347                                 r = -errno;
348                 }
349
350                 free(from);
351                 free(to);
352         }
353
354         umask(u);
355
356         return r;
357 }
358
/*
 * Provide <dest>/dev/console by bind mounting the TTY named by console on
 * top of a freshly created device node.
 *
 * The TTY must be a character device. Its access mode and ownership are set
 * to 0600, uid 0, gid 0 before a placeholder node is created and the TTY is
 * mounted over it.
 *
 * Returns a negative errno-style code on failure; on success the result of
 * the chmod_and_chown() step is passed through.
 */
static int setup_dev_console(const char *dest, const char *console) {
        struct stat st;
        char *node = NULL;
        mode_t saved_umask;
        int r;

        assert(dest);
        assert(console);

        saved_umask = umask(0000);

        if (stat(console, &st) < 0) {
                log_error("Failed to stat %s: %m", console);
                r = -errno;
                goto finish;
        }

        if (!S_ISCHR(st.st_mode)) {
                log_error("/dev/console is not a char device.");
                r = -EIO;
                goto finish;
        }

        r = chmod_and_chown(console, 0600, 0, 0);
        if (r < 0) {
                log_error("Failed to correct access mode for TTY: %s", strerror(-r));
                goto finish;
        }

        if (asprintf(&node, "%s/dev/console", dest) < 0) {
                log_error("Out of memory");
                r = -ENOMEM;
                goto finish;
        }

        /* A pty can only live on a pts file system, hence the right tty
         * has to be bind mounted to /dev/console. The mount needs
         * something to sit on, so a device node is created first. Its
         * major/minor is taken from the tty, although that hardly
         * matters as the node gets mounted over right away. */

        if (mknod(node, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0) {
                log_error("mknod() for /dev/console failed: %m");
                r = -errno;
                goto finish;
        }

        if (mount(console, node, "bind", MS_BIND, NULL) < 0) {
                log_error("Bind mount for /dev/console failed: %m");
                r = -errno;
        }

finish:
        free(node);
        umask(saved_umask);

        return r;
}
418
419 static int setup_kmsg(const char *dest, int kmsg_socket) {
420         char *from = NULL, *to = NULL;
421         int r, fd, k;
422         mode_t u;
423         union {
424                 struct cmsghdr cmsghdr;
425                 uint8_t buf[CMSG_SPACE(sizeof(int))];
426         } control;
427         struct msghdr mh;
428         struct cmsghdr *cmsg;
429
430         assert(dest);
431         assert(kmsg_socket >= 0);
432
433         u = umask(0000);
434
435         /* We create the kmsg FIFO as /dev/kmsg, but immediately
436          * delete it after bind mounting it to /proc/kmsg. While FIFOs
437          * on the reading side behave very similar to /proc/kmsg,
438          * their writing side behaves differently from /dev/kmsg in
439          * that writing blocks when nothing is reading. In order to
440          * avoid any problems with containers deadlocking due to this
441          * we simply make /dev/kmsg unavailable to the container. */
442         if (asprintf(&from, "%s/dev/kmsg", dest) < 0) {
443                 log_error("Out of memory");
444                 r = -ENOMEM;
445                 goto finish;
446         }
447
448         if (asprintf(&to, "%s/proc/kmsg", dest) < 0) {
449                 log_error("Out of memory");
450                 r = -ENOMEM;
451                 goto finish;
452         }
453
454         if (mkfifo(from, 0600) < 0) {
455                 log_error("mkfifo() for /dev/kmsg failed: %m");
456                 r = -errno;
457                 goto finish;
458         }
459
460         r = chmod_and_chown(from, 0600, 0, 0);
461         if (r < 0) {
462                 log_error("Failed to correct access mode for /dev/kmsg: %s", strerror(-r));
463                 goto finish;
464         }
465
466         if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
467                 log_error("Bind mount for /proc/kmsg failed: %m");
468                 r = -errno;
469                 goto finish;
470         }
471
472         fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
473         if (fd < 0) {
474                 log_error("Failed to open fifo: %m");
475                 r = -errno;
476                 goto finish;
477         }
478
479         zero(mh);
480         zero(control);
481
482         mh.msg_control = &control;
483         mh.msg_controllen = sizeof(control);
484
485         cmsg = CMSG_FIRSTHDR(&mh);
486         cmsg->cmsg_level = SOL_SOCKET;
487         cmsg->cmsg_type = SCM_RIGHTS;
488         cmsg->cmsg_len = CMSG_LEN(sizeof(int));
489         memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
490
491         mh.msg_controllen = cmsg->cmsg_len;
492
493         /* Store away the fd in the socket, so that it stays open as
494          * long as we run the child */
495         k = sendmsg(kmsg_socket, &mh, MSG_DONTWAIT|MSG_NOSIGNAL);
496         close_nointr_nofail(fd);
497
498         if (k < 0) {
499                 log_error("Failed to send FIFO fd: %m");
500                 r = -errno;
501                 goto finish;
502         }
503
504         /* And now make the FIFO unavailable as /dev/kmsg... */
505         unlink(from);
506
507 finish:
508         free(from);
509         free(to);
510         umask(u);
511
512         return r;
513 }
514
515 static int setup_hostname(void) {
516         char *hn;
517         int r = 0;
518
519         hn = file_name_from_path(arg_directory);
520         if (hn) {
521                 hn = strdup(hn);
522                 if (!hn)
523                         return -ENOMEM;
524
525                 hostname_cleanup(hn);
526
527                 if (!isempty(hn))
528                         if (sethostname(hn, strlen(hn)) < 0)
529                                 r = -errno;
530
531                 free(hn);
532         }
533
534         return r;
535 }
536
/*
 * Remove from the capability bounding set every capability, from 0 up to
 * cap_last_cap(), that does not appear in the retain list below.
 *
 * Returns 0 on success or the negative errno of the first failing
 * prctl(PR_CAPBSET_DROP).
 */
static int drop_capabilities(void) {
        static const unsigned long retain[] = {
                CAP_CHOWN,
                CAP_DAC_OVERRIDE,
                CAP_DAC_READ_SEARCH,
                CAP_FOWNER,
                CAP_FSETID,
                CAP_IPC_OWNER,
                CAP_KILL,
                CAP_LEASE,
                CAP_LINUX_IMMUTABLE,
                CAP_NET_BIND_SERVICE,
                CAP_NET_BROADCAST,
                CAP_NET_RAW,
                CAP_SETGID,
                CAP_SETFCAP,
                CAP_SETPCAP,
                CAP_SETUID,
                CAP_SYS_ADMIN,
                CAP_SYS_CHROOT,
                CAP_SYS_NICE,
                CAP_SYS_PTRACE,
                CAP_SYS_TTY_CONFIG
        };

        unsigned long cap;

        for (cap = 0; cap <= cap_last_cap(); cap++) {
                unsigned i = 0;

                /* Look the capability up in the keep list */
                while (i < ELEMENTSOF(retain) && retain[i] != cap)
                        i++;

                /* Found before the end of the list: keep it */
                if (i < ELEMENTSOF(retain))
                        continue;

                if (prctl(PR_CAPBSET_DROP, cap) < 0) {
                        log_error("PR_CAPBSET_DROP failed: %m");
                        return -errno;
                }
        }

        return 0;
}
582
/*
 * Check whether path looks like the root of an OS installation. The presence
 * of <path>/bin/sh is taken as the sign.
 *
 * Returns 1 if the file is there, 0 if access() fails for it, and -ENOMEM if
 * the path to probe could not be allocated.
 */
static int is_os_tree(const char *path) {
        char *probe;
        int found;

        if (asprintf(&probe, "%s/bin/sh", path) < 0)
                return -ENOMEM;

        /* Mere existence is all that is asked for */
        found = access(probe, F_OK) >= 0;
        free(probe);

        if (found)
                return 1;

        return 0;
}
596
/*
 * Shovel data between our stdin/stdout and the pty master until a signal
 * arrives.
 *
 * Bytes read from stdin are written to master, and bytes read from master
 * are written to stdout, each direction through its own LINE_MAX sized
 * buffer. The signals in mask are received through a signalfd: SIGWINCH
 * copies the window size of stdin to master, any other signal ends the loop.
 *
 * master: file descriptor of the pty master
 * mask:   signal set handed to signalfd()
 *
 * Returns 0 when stopped by a signal, or a negative errno-style code on
 * failure.
 */
static int process_pty(int master, sigset_t *mask) {

        char in_buffer[LINE_MAX], out_buffer[LINE_MAX];
        size_t in_buffer_full = 0, out_buffer_full = 0;
        struct epoll_event stdin_ev, stdout_ev, master_ev, signal_ev;
        bool stdin_readable = false, stdout_writable = false, master_readable = false, master_writable = false;
        int ep = -1, signal_fd = -1, r;

        fd_nonblock(STDIN_FILENO, 1);
        fd_nonblock(STDOUT_FILENO, 1);
        fd_nonblock(master, 1);

        signal_fd = signalfd(-1, mask, SFD_NONBLOCK|SFD_CLOEXEC);
        if (signal_fd < 0) {
                log_error("signalfd(): %m");
                r = -errno;
                goto finish;
        }

        ep = epoll_create1(EPOLL_CLOEXEC);
        if (ep < 0) {
                log_error("Failed to create epoll: %m");
                r = -errno;
                goto finish;
        }

        /* The data fds are watched edge-triggered: readiness is remembered in
         * the *_readable/*_writable flags and only cleared once a read or
         * write reports that nothing more can be done right now. */
        zero(stdin_ev);
        stdin_ev.events = EPOLLIN|EPOLLET;
        stdin_ev.data.fd = STDIN_FILENO;

        zero(stdout_ev);
        stdout_ev.events = EPOLLOUT|EPOLLET;
        stdout_ev.data.fd = STDOUT_FILENO;

        zero(master_ev);
        master_ev.events = EPOLLIN|EPOLLOUT|EPOLLET;
        master_ev.data.fd = master;

        zero(signal_ev);
        signal_ev.events = EPOLLIN;
        signal_ev.data.fd = signal_fd;

        if (epoll_ctl(ep, EPOLL_CTL_ADD, STDIN_FILENO, &stdin_ev) < 0 ||
            epoll_ctl(ep, EPOLL_CTL_ADD, STDOUT_FILENO, &stdout_ev) < 0 ||
            epoll_ctl(ep, EPOLL_CTL_ADD, master, &master_ev) < 0 ||
            epoll_ctl(ep, EPOLL_CTL_ADD, signal_fd, &signal_ev) < 0) {
                log_error("Failed to regiser fds in epoll: %m");
                r = -errno;
                goto finish;
        }

        for (;;) {
                struct epoll_event ev[16];
                ssize_t k;
                int i, nfds;

                nfds = epoll_wait(ep, ev, ELEMENTSOF(ev), -1);
                if (nfds < 0) {

                        if (errno == EINTR || errno == EAGAIN)
                                continue;

                        log_error("epoll_wait(): %m");
                        r = -errno;
                        goto finish;
                }

                assert(nfds >= 1);

                /* First pass: record what became ready and handle signals */
                for (i = 0; i < nfds; i++) {
                        if (ev[i].data.fd == STDIN_FILENO) {

                                if (ev[i].events & (EPOLLIN|EPOLLHUP))
                                        stdin_readable = true;

                        } else if (ev[i].data.fd == STDOUT_FILENO) {

                                if (ev[i].events & (EPOLLOUT|EPOLLHUP))
                                        stdout_writable = true;

                        } else if (ev[i].data.fd == master) {

                                if (ev[i].events & (EPOLLIN|EPOLLHUP))
                                        master_readable = true;

                                if (ev[i].events & (EPOLLOUT|EPOLLHUP))
                                        master_writable = true;

                        } else if (ev[i].data.fd == signal_fd) {
                                struct signalfd_siginfo sfsi;
                                ssize_t n;

                                n = read(signal_fd, &sfsi, sizeof(sfsi));
                                if (n != sizeof(sfsi)) {

                                        if (n >= 0) {
                                                log_error("Failed to read from signalfd: invalid block size");
                                                r = -EIO;
                                                goto finish;
                                        }

                                        if (errno != EINTR && errno != EAGAIN) {
                                                log_error("Failed to read from signalfd: %m");
                                                r = -errno;
                                                goto finish;
                                        }
                                } else {

                                        if (sfsi.ssi_signo == SIGWINCH) {
                                                struct winsize ws;

                                                /* The window size changed, let's forward that. */
                                                if (ioctl(STDIN_FILENO, TIOCGWINSZ, &ws) >= 0)
                                                        ioctl(master, TIOCSWINSZ, &ws);
                                        } else {
                                                /* Any other signal in the mask ends the forwarding */
                                                r = 0;
                                                goto finish;
                                        }
                                }
                        }
                }

                /* Second pass: move data for as long as some step can make
                 * progress, i.e. an empty buffer can be filled or a non-empty
                 * one can be drained. */
                while ((stdin_readable && in_buffer_full <= 0) ||
                       (master_writable && in_buffer_full > 0) ||
                       (master_readable && out_buffer_full <= 0) ||
                       (stdout_writable && out_buffer_full > 0)) {

                        if (stdin_readable && in_buffer_full < LINE_MAX) {

                                k = read(STDIN_FILENO, in_buffer + in_buffer_full, LINE_MAX - in_buffer_full);
                                if (k < 0) {

                                        if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
                                                stdin_readable = false;
                                        else {
                                                log_error("read(): %m");
                                                r = -errno;
                                                goto finish;
                                        }
                                } else if (k == 0)
                                        /* End of file. Without clearing the
                                         * flag the loop condition would stay
                                         * true with an empty buffer and we
                                         * would spin here forever. */
                                        stdin_readable = false;
                                else
                                        in_buffer_full += (size_t) k;
                        }

                        if (master_writable && in_buffer_full > 0) {

                                k = write(master, in_buffer, in_buffer_full);
                                if (k < 0) {

                                        if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
                                                master_writable = false;
                                        else {
                                                log_error("write(): %m");
                                                r = -errno;
                                                goto finish;
                                        }

                                } else {
                                        /* Drop what was written, keep the rest at the front */
                                        assert(in_buffer_full >= (size_t) k);
                                        memmove(in_buffer, in_buffer + k, in_buffer_full - k);
                                        in_buffer_full -= k;
                                }
                        }

                        if (master_readable && out_buffer_full < LINE_MAX) {

                                k = read(master, out_buffer + out_buffer_full, LINE_MAX - out_buffer_full);
                                if (k < 0) {

                                        if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
                                                master_readable = false;
                                        else {
                                                log_error("read(): %m");
                                                r = -errno;
                                                goto finish;
                                        }
                                } else if (k == 0)
                                        /* Same end-of-file handling as for stdin */
                                        master_readable = false;
                                else
                                        out_buffer_full += (size_t) k;
                        }

                        if (stdout_writable && out_buffer_full > 0) {

                                k = write(STDOUT_FILENO, out_buffer, out_buffer_full);
                                if (k < 0) {

                                        if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
                                                stdout_writable = false;
                                        else {
                                                log_error("write(): %m");
                                                r = -errno;
                                                goto finish;
                                        }

                                } else {
                                        assert(out_buffer_full >= (size_t) k);
                                        memmove(out_buffer, out_buffer + k, out_buffer_full - k);
                                        out_buffer_full -= k;
                                }
                        }
                }
        }

finish:
        if (ep >= 0)
                close_nointr_nofail(ep);

        if (signal_fd >= 0)
                close_nointr_nofail(signal_fd);

        return r;
}
799
800 int main(int argc, char *argv[]) {
801         pid_t pid = 0;
802         int r = EXIT_FAILURE, k;
803         char *oldcg = NULL, *newcg = NULL;
804         char **controller = NULL;
805         int master = -1;
806         const char *console = NULL;
807         struct termios saved_attr, raw_attr;
808         sigset_t mask;
809         bool saved_attr_valid = false;
810         struct winsize ws;
811         int kmsg_socket_pair[2] = { -1, -1 };
812
813         log_parse_environment();
814         log_open();
815
816         if ((r = parse_argv(argc, argv)) <= 0)
817                 goto finish;
818
819         if (arg_directory) {
820                 char *p;
821
822                 p = path_make_absolute_cwd(arg_directory);
823                 free(arg_directory);
824                 arg_directory = p;
825         } else
826                 arg_directory = get_current_dir_name();
827
828         if (!arg_directory) {
829                 log_error("Failed to determine path");
830                 goto finish;
831         }
832
833         path_kill_slashes(arg_directory);
834
835         if (geteuid() != 0) {
836                 log_error("Need to be root.");
837                 goto finish;
838         }
839
840         if (sd_booted() <= 0) {
841                 log_error("Not running on a systemd system.");
842                 goto finish;
843         }
844
845         if (path_equal(arg_directory, "/")) {
846                 log_error("Spawning container on root directory not supported.");
847                 goto finish;
848         }
849
850         if (is_os_tree(arg_directory) <= 0) {
851                 log_error("Directory %s doesn't look like an OS root directory. Refusing.", arg_directory);
852                 goto finish;
853         }
854
855         if ((k = cg_get_by_pid(SYSTEMD_CGROUP_CONTROLLER, 0, &oldcg)) < 0) {
856                 log_error("Failed to determine current cgroup: %s", strerror(-k));
857                 goto finish;
858         }
859
860         if (asprintf(&newcg, "%s/nspawn-%lu", oldcg, (unsigned long) getpid()) < 0) {
861                 log_error("Failed to allocate cgroup path.");
862                 goto finish;
863         }
864
865         k = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, newcg, 0);
866         if (k < 0)  {
867                 log_error("Failed to create cgroup: %s", strerror(-k));
868                 goto finish;
869         }
870
871         STRV_FOREACH(controller,arg_controllers) {
872                 k = cg_create_and_attach(*controller, newcg, 0);
873                 if (k < 0)
874                         log_warning("Failed to create cgroup in controller %s: %s", *controller, strerror(-k));
875         }
876
877         if ((master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY)) < 0) {
878                 log_error("Failed to acquire pseudo tty: %m");
879                 goto finish;
880         }
881
882         if (!(console = ptsname(master))) {
883                 log_error("Failed to determine tty name: %m");
884                 goto finish;
885         }
886
887         log_info("Spawning namespace container on %s (console is %s).", arg_directory, console);
888
889         if (ioctl(STDIN_FILENO, TIOCGWINSZ, &ws) >= 0)
890                 ioctl(master, TIOCSWINSZ, &ws);
891
892         if (unlockpt(master) < 0) {
893                 log_error("Failed to unlock tty: %m");
894                 goto finish;
895         }
896
897         if (tcgetattr(STDIN_FILENO, &saved_attr) < 0) {
898                 log_error("Failed to get terminal attributes: %m");
899                 goto finish;
900         }
901
902         saved_attr_valid = true;
903
904         raw_attr = saved_attr;
905         cfmakeraw(&raw_attr);
906         raw_attr.c_lflag &= ~ECHO;
907
908         if (tcsetattr(STDIN_FILENO, TCSANOW, &raw_attr) < 0) {
909                 log_error("Failed to set terminal attributes: %m");
910                 goto finish;
911         }
912
913         if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_NONBLOCK|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
914                 log_error("Failed to create kmsg socket pair");
915                 goto finish;
916         }
917
918         assert_se(sigemptyset(&mask) == 0);
919         sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
920         assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
921
922         pid = syscall(__NR_clone, SIGCHLD|CLONE_NEWIPC|CLONE_NEWNS|CLONE_NEWPID|CLONE_NEWUTS|(arg_private_network ? CLONE_NEWNET : 0), NULL);
923         if (pid < 0) {
924                 if (errno == EINVAL)
925                         log_error("clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
926                 else
927                         log_error("clone() failed: %m");
928
929                 goto finish;
930         }
931
932         if (pid == 0) {
933                 /* child */
934
935                 const char *home = NULL;
936                 uid_t uid = (uid_t) -1;
937                 gid_t gid = (gid_t) -1;
938                 const char *envp[] = {
939                         "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
940                         "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
941                         NULL, /* TERM */
942                         NULL, /* HOME */
943                         NULL, /* USER */
944                         NULL, /* LOGNAME */
945                         NULL, /* container_uuid */
946                         NULL
947                 };
948
949                 envp[2] = strv_find_prefix(environ, "TERM=");
950
951                 close_nointr_nofail(master);
952
953                 close_nointr(STDIN_FILENO);
954                 close_nointr(STDOUT_FILENO);
955                 close_nointr(STDERR_FILENO);
956
957                 close_all_fds(&kmsg_socket_pair[1], 1);
958
959                 reset_all_signal_handlers();
960
961                 assert_se(sigemptyset(&mask) == 0);
962                 assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
963
964                 if (setsid() < 0)
965                         goto child_fail;
966
967                 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0)
968                         goto child_fail;
969
970                 /* Mark / as private, in case somebody marked it shared */
971                 if (mount(NULL, "/", NULL, MS_PRIVATE|MS_REC, NULL) < 0)
972                         goto child_fail;
973
974                 if (mount_all(arg_directory) < 0)
975                         goto child_fail;
976
977                 if (copy_devnodes(arg_directory) < 0)
978                         goto child_fail;
979
980                 if (setup_dev_console(arg_directory, console) < 0)
981                         goto child_fail;
982
983                 if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
984                         goto child_fail;
985
986                 close_nointr_nofail(kmsg_socket_pair[1]);
987
988                 if (setup_timezone(arg_directory) < 0)
989                         goto child_fail;
990
991                 if (setup_resolv_conf(arg_directory) < 0)
992                         goto child_fail;
993
994                 if (chdir(arg_directory) < 0) {
995                         log_error("chdir(%s) failed: %m", arg_directory);
996                         goto child_fail;
997                 }
998
999                 if (open_terminal("dev/console", O_RDWR) != STDIN_FILENO ||
1000                     dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
1001                     dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO)
1002                         goto child_fail;
1003
1004                 if (mount(arg_directory, "/", "bind", MS_BIND, NULL) < 0) {
1005                         log_error("mount(MS_MOVE) failed: %m");
1006                         goto child_fail;
1007                 }
1008
1009                 if (chroot(".") < 0) {
1010                         log_error("chroot() failed: %m");
1011                         goto child_fail;
1012                 }
1013
1014                 if (chdir("/") < 0) {
1015                         log_error("chdir() failed: %m");
1016                         goto child_fail;
1017                 }
1018
1019                 umask(0022);
1020
1021                 loopback_setup();
1022
1023                 if (drop_capabilities() < 0)
1024                         goto child_fail;
1025
1026                 if (arg_user) {
1027
1028                         if (get_user_creds((const char**)&arg_user, &uid, &gid, &home) < 0) {
1029                                 log_error("get_user_creds() failed: %m");
1030                                 goto child_fail;
1031                         }
1032
1033                         if (mkdir_parents(home, 0775) < 0) {
1034                                 log_error("mkdir_parents() failed: %m");
1035                                 goto child_fail;
1036                         }
1037
1038                         if (safe_mkdir(home, 0775, uid, gid) < 0) {
1039                                 log_error("safe_mkdir() failed: %m");
1040                                 goto child_fail;
1041                         }
1042
1043                         if (initgroups((const char*)arg_user, gid) < 0) {
1044                                 log_error("initgroups() failed: %m");
1045                                 goto child_fail;
1046                         }
1047
1048                         if (setresgid(gid, gid, gid) < 0) {
1049                                 log_error("setregid() failed: %m");
1050                                 goto child_fail;
1051                         }
1052
1053                         if (setresuid(uid, uid, uid) < 0) {
1054                                 log_error("setreuid() failed: %m");
1055                                 goto child_fail;
1056                         }
1057                 }
1058
1059                 if ((asprintf((char**)(envp + 3), "HOME=%s", home ? home: "/root") < 0) ||
1060                     (asprintf((char**)(envp + 4), "USER=%s", arg_user ? arg_user : "root") < 0) ||
1061                     (asprintf((char**)(envp + 5), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
1062                     log_error("Out of memory");
1063                     goto child_fail;
1064                 }
1065
1066                 if (arg_uuid) {
1067                         if (asprintf((char**)(envp + 6), "container_uuid=%s", arg_uuid) < 0) {
1068                                 log_error("Out of memory");
1069                                 goto child_fail;
1070                         }
1071                 }
1072
1073                 setup_hostname();
1074
1075                 if (arg_boot) {
1076                         char **a;
1077                         size_t l;
1078
1079                         /* Automatically search for the init system */
1080
1081                         l = 1 + argc - optind;
1082                         a = newa(char*, l + 1);
1083                         memcpy(a + 1, argv + optind, l * sizeof(char*));
1084
1085                         a[0] = (char*) "/usr/lib/systemd/systemd";
1086                         execve(a[0], a, (char**) envp);
1087
1088                         a[0] = (char*) "/lib/systemd/systemd";
1089                         execve(a[0], a, (char**) envp);
1090
1091                         a[0] = (char*) "/sbin/init";
1092                         execve(a[0], a, (char**) envp);
1093                 } else if (argc > optind)
1094                         execvpe(argv[optind], argv + optind, (char**) envp);
1095                 else {
1096                         chdir(home ? home : "/root");
1097                         execle("/bin/bash", "-bash", NULL, (char**) envp);
1098                 }
1099
1100                 log_error("execv() failed: %m");
1101
1102         child_fail:
1103                 _exit(EXIT_FAILURE);
1104         }
1105
1106         if (process_pty(master, &mask) < 0)
1107                 goto finish;
1108
1109         if (saved_attr_valid) {
1110                 tcsetattr(STDIN_FILENO, TCSANOW, &saved_attr);
1111                 saved_attr_valid = false;
1112         }
1113
1114         r = wait_for_terminate_and_warn(argc > optind ? argv[optind] : "bash", pid);
1115
1116         if (r < 0)
1117                 r = EXIT_FAILURE;
1118
1119 finish:
1120         if (saved_attr_valid)
1121                 tcsetattr(STDIN_FILENO, TCSANOW, &saved_attr);
1122
1123         if (master >= 0)
1124                 close_nointr_nofail(master);
1125
1126         close_pipe(kmsg_socket_pair);
1127
1128         if (oldcg)
1129                 cg_attach(SYSTEMD_CGROUP_CONTROLLER, oldcg, 0);
1130
1131         if (newcg)
1132                 cg_kill_recursive_and_wait(SYSTEMD_CGROUP_CONTROLLER, newcg, true);
1133
1134         free(arg_directory);
1135         strv_free(arg_controllers);
1136         free(oldcg);
1137         free(newcg);
1138
1139         return r;
1140 }