chiark / gitweb /
job: the status messages are proper sentences, hence end them with a full stop
[elogind.git] / src / nspawn / nspawn.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4   This file is part of systemd.
5
6   Copyright 2010 Lennart Poettering
7
8   systemd is free software; you can redistribute it and/or modify it
9   under the terms of the GNU Lesser General Public License as published by
10   the Free Software Foundation; either version 2.1 of the License, or
11   (at your option) any later version.
12
13   systemd is distributed in the hope that it will be useful, but
14   WITHOUT ANY WARRANTY; without even the implied warranty of
15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16   Lesser General Public License for more details.
17
18   You should have received a copy of the GNU Lesser General Public License
19   along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <signal.h>
23 #include <sched.h>
24 #include <unistd.h>
25 #include <sys/types.h>
26 #include <sys/syscall.h>
27 #include <sys/mount.h>
28 #include <sys/wait.h>
29 #include <stdlib.h>
30 #include <string.h>
31 #include <stdio.h>
32 #include <errno.h>
33 #include <sys/prctl.h>
34 #include <sys/capability.h>
35 #include <getopt.h>
36 #include <sys/epoll.h>
37 #include <termios.h>
38 #include <sys/signalfd.h>
39 #include <grp.h>
40 #include <linux/fs.h>
41 #include <sys/un.h>
42 #include <sys/socket.h>
43
44 #include <systemd/sd-daemon.h>
45
46 #include "log.h"
47 #include "util.h"
48 #include "mkdir.h"
49 #include "audit.h"
50 #include "missing.h"
51 #include "cgroup-util.h"
52 #include "strv.h"
53 #include "loopback-setup.h"
54
/* Command line settings, filled in by parse_argv(). */

/* Container root directory (-D/--directory); heap-allocated, canonicalized path. */
static char *arg_directory = NULL;
/* User name or uid given with -u/--user; heap-allocated copy of the argument. */
static char *arg_user = NULL;
/* Cgroup hierarchies given with -C/--controllers, split on "," and de-duplicated. */
static char **arg_controllers = NULL;
/* Machine UUID given with --uuid; points at the option argument and is not owned. */
static char *arg_uuid = NULL;
/* Set by --private-network. */
static bool arg_private_network = false;
/* Set by -b/--boot. */
static bool arg_boot = false;
61
/* Writes the usage summary to stdout. Always returns 0. */
static int help(void) {

        /* Only the synopsis line needs formatting; the option list is fixed text. */
        printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n",
               program_invocation_short_name);

        fputs("Spawn a minimal namespace container for debugging, testing and building.\n\n"
              "  -h --help             Show this help\n"
              "  -D --directory=NAME   Root directory for the container\n"
              "  -b --boot             Boot up full system (i.e. invoke init)\n"
              "  -u --user=USER        Run the command under specified user or uid\n"
              "  -C --controllers=LIST Put the container in specified comma-separated cgroup hierarchies\n"
              "     --uuid=UUID        Set a specific machine UUID for the container\n"
              "     --private-network  Disable network in container\n",
              stdout);

        return 0;
}
77
78 static int parse_argv(int argc, char *argv[]) {
79
80         enum {
81                 ARG_PRIVATE_NETWORK = 0x100,
82                 ARG_UUID
83         };
84
85         static const struct option options[] = {
86                 { "help",            no_argument,       NULL, 'h'                 },
87                 { "directory",       required_argument, NULL, 'D'                 },
88                 { "user",            required_argument, NULL, 'u'                 },
89                 { "controllers",     required_argument, NULL, 'C'                 },
90                 { "private-network", no_argument,       NULL, ARG_PRIVATE_NETWORK },
91                 { "boot",            no_argument,       NULL, 'b'                 },
92                 { "uuid",            required_argument, NULL, ARG_UUID            },
93                 { NULL,              0,                 NULL, 0                   }
94         };
95
96         int c;
97
98         assert(argc >= 0);
99         assert(argv);
100
101         while ((c = getopt_long(argc, argv, "+hD:u:C:b", options, NULL)) >= 0) {
102
103                 switch (c) {
104
105                 case 'h':
106                         help();
107                         return 0;
108
109                 case 'D':
110                         free(arg_directory);
111                         arg_directory = canonicalize_file_name(optarg);
112                         if (!arg_directory) {
113                                 log_error("Failed to canonicalize root directory.");
114                                 return -ENOMEM;
115                         }
116
117                         break;
118
119                 case 'u':
120                         free(arg_user);
121                         if (!(arg_user = strdup(optarg))) {
122                                 log_error("Failed to duplicate user name.");
123                                 return -ENOMEM;
124                         }
125
126                         break;
127
128                 case 'C':
129                         strv_free(arg_controllers);
130                         arg_controllers = strv_split(optarg, ",");
131                         if (!arg_controllers) {
132                                 log_error("Failed to split controllers list.");
133                                 return -ENOMEM;
134                         }
135                         strv_uniq(arg_controllers);
136
137                         break;
138
139                 case ARG_PRIVATE_NETWORK:
140                         arg_private_network = true;
141                         break;
142
143                 case 'b':
144                         arg_boot = true;
145                         break;
146
147                 case ARG_UUID:
148                         arg_uuid = optarg;
149                         break;
150
151                 case '?':
152                         return -EINVAL;
153
154                 default:
155                         log_error("Unknown option code %c", c);
156                         return -EINVAL;
157                 }
158         }
159
160         return 1;
161 }
162
/* Mounts the table of API file systems below the directory "dest".
 *
 * Every table entry is attempted. Mount failures are only logged and
 * recorded for entries flagged as fatal. Returns 0 if nothing was
 * recorded, otherwise the first negative errno-style error seen. */
static int mount_all(const char *dest) {

        typedef struct MountPoint {
                const char *what;
                const char *where;
                const char *type;
                const char *options;
                unsigned long flags;
                bool fatal;
        } MountPoint;

        static const MountPoint mount_table[] = {
                { "proc",      "/proc",     "proc",  NULL,       MS_NOSUID|MS_NOEXEC|MS_NODEV, true  },
                { "/proc/sys", "/proc/sys", "bind",  NULL,       MS_BIND, true                       },   /* Bind mount first */
                { "/proc/sys", "/proc/sys", "bind",  NULL,       MS_BIND|MS_RDONLY|MS_REMOUNT, true  },   /* Then, make it r/o */
                { "/sys",      "/sys",      "bind",  NULL,       MS_BIND,                      true  },   /* Bind mount first */
                { "/sys",      "/sys",      "bind",  NULL,       MS_BIND|MS_RDONLY|MS_REMOUNT, true  },   /* Then, make it r/o */
                { "tmpfs",     "/dev",      "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME,     true  },
                { "/dev/pts",  "/dev/pts",  "bind",  NULL,       MS_BIND,                      true  },
                { "tmpfs",     "/run",      "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true  },
#ifdef HAVE_SELINUX
                { "/sys/fs/selinux", "/sys/fs/selinux", "bind", NULL, MS_BIND,                      false },  /* Bind mount first */
                { "/sys/fs/selinux", "/sys/fs/selinux", "bind", NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, false },  /* Then, make it r/o */
#endif
        };

        unsigned idx;
        int result = 0;
        char *target;

        for (idx = 0; idx < ELEMENTSOF(mount_table); idx++) {
                const MountPoint *entry = mount_table + idx;
                int probe;

                if (asprintf(&target, "%s/%s", dest, entry->where) < 0) {
                        log_error("Out of memory");

                        if (result == 0)
                                result = -ENOMEM;

                        /* No point in trying the remaining entries without memory. */
                        break;
                }

                probe = path_is_mount_point(target, false);
                if (probe < 0) {
                        log_error("Failed to detect whether %s is a mount point: %s", target, strerror(-probe));
                        free(target);

                        if (result == 0)
                                result = probe;

                        continue;
                }

                /* Best effort: a missing directory surfaces as a mount() failure below. */
                mkdir_p(target, 0755);

                if (mount(entry->what, target, entry->type, entry->flags, entry->options) >= 0 ||
                    !entry->fatal) {
                        free(target);
                        continue;
                }

                log_error("mount(%s) failed: %m", target);

                if (result == 0)
                        result = -errno;

                free(target);
        }

        return result;
}
236
/* Bind mounts "what" onto "where" and, if that worked, remounts the bind
 * mount read-only. Failures are deliberately ignored. */
static void bind_mount_read_only(const char *what, const char *where) {
        if (mount(what, where, "bind", MS_BIND, NULL) < 0)
                return;

        mount(what, where, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);
}

/* Makes the host's timezone files visible, read-only, below "dest".
 *
 * The mounts themselves are best effort. Only an allocation failure is
 * reported, as -ENOMEM; otherwise 0 is returned. */
static int setup_timezone(const char *dest) {
        char *target;

        assert(dest);

        /* Fix the timezone, if possible */
        if (asprintf(&target, "%s/etc/localtime", dest) < 0) {
                log_error("Out of memory");
                return -ENOMEM;
        }

        bind_mount_read_only("/etc/localtime", target);
        free(target);

        if (asprintf(&target, "%s/etc/timezone", dest) < 0) {
                log_error("Out of memory");
                return -ENOMEM;
        }

        bind_mount_read_only("/etc/timezone", target);
        free(target);

        return 0;
}
265
266 static int copy_devnodes(const char *dest) {
267
268         static const char devnodes[] =
269                 "null\0"
270                 "zero\0"
271                 "full\0"
272                 "random\0"
273                 "urandom\0"
274                 "tty\0"
275                 "ptmx\0"
276                 "rtc0\0";
277
278         const char *d;
279         int r = 0;
280         mode_t u;
281
282         assert(dest);
283
284         u = umask(0000);
285
286         NULSTR_FOREACH(d, devnodes) {
287                 struct stat st;
288                 char *from = NULL, *to = NULL;
289
290                 asprintf(&from, "/dev/%s", d);
291                 asprintf(&to, "%s/dev/%s", dest, d);
292
293                 if (!from || !to) {
294                         log_error("Failed to allocate devnode path");
295
296                         free(from);
297                         free(to);
298
299                         from = to = NULL;
300
301                         if (r == 0)
302                                 r = -ENOMEM;
303
304                         break;
305                 }
306
307                 if (stat(from, &st) < 0) {
308
309                         if (errno != ENOENT) {
310                                 log_error("Failed to stat %s: %m", from);
311                                 if (r == 0)
312                                         r = -errno;
313                         }
314
315                 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
316
317                         log_error("%s is not a char or block device, cannot copy.", from);
318                         if (r == 0)
319                                 r = -EIO;
320
321                 } else if (mknod(to, st.st_mode, st.st_rdev) < 0) {
322
323                         log_error("mknod(%s) failed: %m", dest);
324                         if (r == 0)
325                                 r = -errno;
326                 }
327
328                 free(from);
329                 free(to);
330         }
331
332         umask(u);
333
334         return r;
335 }
336
/* Makes the TTY "console" available as "<dest>/dev/console".
 *
 * The TTY's access mode and ownership are reset to 0600 root:root, a
 * device node is created as a mount target, and the TTY is bind mounted
 * over it. Returns a negative errno-style value on failure; on success
 * the return value is whatever chmod_and_chown() returned. */
static int setup_dev_console(const char *dest, const char *console) {
        struct stat st;
        char *node = NULL;
        int r;
        mode_t saved_umask;

        assert(dest);
        assert(console);

        saved_umask = umask(0000);

        if (stat(console, &st) < 0) {
                log_error("Failed to stat %s: %m", console);
                r = -errno;
                goto finish;
        }

        if (!S_ISCHR(st.st_mode)) {
                log_error("/dev/console is not a char device.");
                r = -EIO;
                goto finish;
        }

        r = chmod_and_chown(console, 0600, 0, 0);
        if (r < 0) {
                log_error("Failed to correct access mode for TTY: %s", strerror(-r));
                goto finish;
        }

        if (asprintf(&node, "%s/dev/console", dest) < 0) {
                log_error("Out of memory");
                r = -ENOMEM;
                goto finish;
        }

        /* ptys can only exist on pts file systems, hence the right tty
         * has to be bind mounted to /dev/console. The device node made
         * here merely serves as something to mount on; its major/minor
         * is irrelevant since it is mounted over right away. */

        if (mknod(node, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0) {
                log_error("mknod() for /dev/console failed: %m");
                r = -errno;
                goto finish;
        }

        if (mount(console, node, "bind", MS_BIND, NULL) < 0) {
                log_error("Bind mount for /dev/console failed: %m");
                r = -errno;
        }

finish:
        free(node);
        umask(saved_umask);

        return r;
}
396
/* Provides a FIFO as "<dest>/proc/kmsg".
 *
 * A FIFO is created at "<dest>/dev/kmsg" and bind mounted onto
 * "<dest>/proc/kmsg". An open fd for it is queued on "kmsg_socket" via
 * SCM_RIGHTS, and the /dev/kmsg name is then unlinked. Returns a
 * negative errno-style value on failure; on success the return value is
 * whatever chmod_and_chown() returned. */
static int setup_kmsg(const char *dest, int kmsg_socket) {
        char *fifo_path = NULL, *proc_path = NULL;
        int r, fifo_fd, sent;
        mode_t saved_umask;
        union {
                struct cmsghdr cmsghdr;
                uint8_t buf[CMSG_SPACE(sizeof(int))];
        } control;
        struct msghdr mh;
        struct cmsghdr *cmsg;

        assert(dest);
        assert(kmsg_socket >= 0);

        saved_umask = umask(0000);

        /* The kmsg FIFO is created as /dev/kmsg and deleted again right
         * after it has been bind mounted to /proc/kmsg. The reading
         * side of a FIFO behaves much like /proc/kmsg, but the writing
         * side differs from /dev/kmsg: writes block while nothing is
         * reading. To keep containers from deadlocking on that,
         * /dev/kmsg is simply made unavailable to the container. */
        if (asprintf(&fifo_path, "%s/dev/kmsg", dest) < 0) {
                log_error("Out of memory");
                r = -ENOMEM;
                goto finish;
        }

        if (asprintf(&proc_path, "%s/proc/kmsg", dest) < 0) {
                log_error("Out of memory");
                r = -ENOMEM;
                goto finish;
        }

        if (mkfifo(fifo_path, 0600) < 0) {
                log_error("mkfifo() for /dev/kmsg failed: %m");
                r = -errno;
                goto finish;
        }

        r = chmod_and_chown(fifo_path, 0600, 0, 0);
        if (r < 0) {
                log_error("Failed to correct access mode for /dev/kmsg: %s", strerror(-r));
                goto finish;
        }

        if (mount(fifo_path, proc_path, "bind", MS_BIND, NULL) < 0) {
                log_error("Bind mount for /proc/kmsg failed: %m");
                r = -errno;
                goto finish;
        }

        fifo_fd = open(fifo_path, O_RDWR|O_NDELAY|O_CLOEXEC);
        if (fifo_fd < 0) {
                log_error("Failed to open fifo: %m");
                r = -errno;
                goto finish;
        }

        /* Build a message that carries nothing but the fd as ancillary data. */
        zero(control);
        zero(mh);

        mh.msg_control = &control;
        mh.msg_controllen = sizeof(control);

        cmsg = CMSG_FIRSTHDR(&mh);
        cmsg->cmsg_level = SOL_SOCKET;
        cmsg->cmsg_type = SCM_RIGHTS;
        cmsg->cmsg_len = CMSG_LEN(sizeof(int));
        memcpy(CMSG_DATA(cmsg), &fifo_fd, sizeof(int));

        mh.msg_controllen = cmsg->cmsg_len;

        /* Park the fd in the socket, so that it stays open for as long
         * as the child runs */
        sent = sendmsg(kmsg_socket, &mh, MSG_DONTWAIT|MSG_NOSIGNAL);
        close_nointr_nofail(fifo_fd);

        if (sent < 0) {
                log_error("Failed to send FIFO fd: %m");
                r = -errno;
                goto finish;
        }

        /* Finally, take the FIFO away from /dev/kmsg... */
        unlink(fifo_path);

finish:
        free(fifo_path);
        free(proc_path);
        umask(saved_umask);

        return r;
}
492
493 static int setup_hostname(void) {
494         char *hn;
495         int r = 0;
496
497         hn = file_name_from_path(arg_directory);
498         if (hn) {
499                 hn = strdup(hn);
500                 if (!hn)
501                         return -ENOMEM;
502
503                 hostname_cleanup(hn);
504
505                 if (!isempty(hn))
506                         if (sethostname(hn, strlen(hn)) < 0)
507                                 r = -errno;
508
509                 free(hn);
510         }
511
512         return r;
513 }
514
/* Drops every capability not on the retain list from the bounding set.
 *
 * Walks all capability numbers from 0 through cap_last_cap() and issues
 * PR_CAPBSET_DROP for each one that is not listed. Returns 0 on
 * success, or -errno as soon as one drop fails. */
static int drop_capabilities(void) {
        static const unsigned long retain[] = {
                CAP_CHOWN,
                CAP_DAC_OVERRIDE,
                CAP_DAC_READ_SEARCH,
                CAP_FOWNER,
                CAP_FSETID,
                CAP_IPC_OWNER,
                CAP_KILL,
                CAP_LEASE,
                CAP_LINUX_IMMUTABLE,
                CAP_NET_BIND_SERVICE,
                CAP_NET_BROADCAST,
                CAP_NET_RAW,
                CAP_SETGID,
                CAP_SETFCAP,
                CAP_SETPCAP,
                CAP_SETUID,
                CAP_SYS_ADMIN,
                CAP_SYS_CHROOT,
                CAP_SYS_NICE,
                CAP_SYS_PTRACE,
                CAP_SYS_TTY_CONFIG
        };

        unsigned long cap;

        for (cap = 0; cap <= cap_last_cap(); cap++) {
                bool keep = false;
                unsigned n;

                for (n = 0; n < ELEMENTSOF(retain) && !keep; n++)
                        keep = retain[n] == cap;

                if (keep)
                        continue;

                if (prctl(PR_CAPBSET_DROP, cap) < 0) {
                        log_error("PR_CAPBSET_DROP failed: %m");
                        return -errno;
                }
        }

        return 0;
}
560
/* Tells whether "path" looks like the root of an OS tree, using the
 * existence of "<path>/bin/sh" as the marker.
 *
 * Returns 1 if the marker exists, 0 if it does not (or cannot be
 * accessed), and -ENOMEM on allocation failure. */
static int is_os_tree(const char *path) {
        char *marker;
        int found;

        if (asprintf(&marker, "%s/bin/sh", path) < 0)
                return -ENOMEM;

        found = access(marker, F_OK) >= 0;
        free(marker);

        return found;
}
574
/* Shovels data between our stdin/stdout and the pty "master" until a
 * signal arrives.
 *
 * All three fds are switched to non-blocking mode and watched
 * edge-triggered through epoll, together with a signalfd created for
 * "mask". SIGWINCH is handled by copying the window size of stdin to
 * the master; any other signal read from the signalfd ends the loop
 * with a return value of 0. Errors are logged and returned as negative
 * errno-style values. */
static int process_pty(int master, sigset_t *mask) {

        char in_buffer[LINE_MAX], out_buffer[LINE_MAX];
        size_t in_buffer_full = 0, out_buffer_full = 0;
        struct epoll_event stdin_ev, stdout_ev, master_ev, signal_ev;
        bool stdin_readable = false, stdout_writable = false, master_readable = false, master_writable = false;
        int ep = -1, signal_fd = -1, r;

        fd_nonblock(STDIN_FILENO, 1);
        fd_nonblock(STDOUT_FILENO, 1);
        fd_nonblock(master, 1);

        if ((signal_fd = signalfd(-1, mask, SFD_NONBLOCK|SFD_CLOEXEC)) < 0) {
                log_error("signalfd(): %m");
                r = -errno;
                goto finish;
        }

        if ((ep = epoll_create1(EPOLL_CLOEXEC)) < 0) {
                log_error("Failed to create epoll: %m");
                r = -errno;
                goto finish;
        }

        zero(stdin_ev);
        stdin_ev.events = EPOLLIN|EPOLLET;
        stdin_ev.data.fd = STDIN_FILENO;

        zero(stdout_ev);
        stdout_ev.events = EPOLLOUT|EPOLLET;
        stdout_ev.data.fd = STDOUT_FILENO;

        zero(master_ev);
        master_ev.events = EPOLLIN|EPOLLOUT|EPOLLET;
        master_ev.data.fd = master;

        /* The signalfd is the only level-triggered source. */
        zero(signal_ev);
        signal_ev.events = EPOLLIN;
        signal_ev.data.fd = signal_fd;

        if (epoll_ctl(ep, EPOLL_CTL_ADD, STDIN_FILENO, &stdin_ev) < 0 ||
            epoll_ctl(ep, EPOLL_CTL_ADD, STDOUT_FILENO, &stdout_ev) < 0 ||
            epoll_ctl(ep, EPOLL_CTL_ADD, master, &master_ev) < 0 ||
            epoll_ctl(ep, EPOLL_CTL_ADD, signal_fd, &signal_ev) < 0) {
                log_error("Failed to regiser fds in epoll: %m");
                r = -errno;
                goto finish;
        }

        for (;;) {
                struct epoll_event ev[16];
                ssize_t k;
                int i, nfds;

                if ((nfds = epoll_wait(ep, ev, ELEMENTSOF(ev), -1)) < 0) {

                        if (errno == EINTR || errno == EAGAIN)
                                continue;

                        log_error("epoll_wait(): %m");
                        r = -errno;
                        goto finish;
                }

                assert(nfds >= 1);

                /* First pass: record which directions became ready and
                 * handle signals. The readiness flags stay set until an
                 * actual read()/write() shows otherwise, as the
                 * edge-triggered events are not repeated. */
                for (i = 0; i < nfds; i++) {
                        if (ev[i].data.fd == STDIN_FILENO) {

                                if (ev[i].events & (EPOLLIN|EPOLLHUP))
                                        stdin_readable = true;

                        } else if (ev[i].data.fd == STDOUT_FILENO) {

                                if (ev[i].events & (EPOLLOUT|EPOLLHUP))
                                        stdout_writable = true;

                        } else if (ev[i].data.fd == master) {

                                if (ev[i].events & (EPOLLIN|EPOLLHUP))
                                        master_readable = true;

                                if (ev[i].events & (EPOLLOUT|EPOLLHUP))
                                        master_writable = true;

                        } else if (ev[i].data.fd == signal_fd) {
                                struct signalfd_siginfo sfsi;
                                ssize_t n;

                                if ((n = read(signal_fd, &sfsi, sizeof(sfsi))) != sizeof(sfsi)) {

                                        if (n >= 0) {
                                                log_error("Failed to read from signalfd: invalid block size");
                                                r = -EIO;
                                                goto finish;
                                        }

                                        if (errno != EINTR && errno != EAGAIN) {
                                                log_error("Failed to read from signalfd: %m");
                                                r = -errno;
                                                goto finish;
                                        }
                                } else {

                                        if (sfsi.ssi_signo == SIGWINCH) {
                                                struct winsize ws;

                                                /* The window size changed, let's forward that. */
                                                if (ioctl(STDIN_FILENO, TIOCGWINSZ, &ws) >= 0)
                                                        ioctl(master, TIOCSWINSZ, &ws);
                                        } else {
                                                r = 0;
                                                goto finish;
                                        }
                                }
                        }
                }

                /* Second pass: move data for as long as any direction can
                 * make progress. */
                while ((stdin_readable && in_buffer_full <= 0) ||
                       (master_writable && in_buffer_full > 0) ||
                       (master_readable && out_buffer_full <= 0) ||
                       (stdout_writable && out_buffer_full > 0)) {

                        if (stdin_readable && in_buffer_full < LINE_MAX) {

                                if ((k = read(STDIN_FILENO, in_buffer + in_buffer_full, LINE_MAX - in_buffer_full)) < 0) {

                                        if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
                                                stdin_readable = false;
                                        else {
                                                log_error("read(): %m");
                                                r = -errno;
                                                goto finish;
                                        }
                                } else if (k == 0)
                                        /* EOF: stop reading. If the flag stayed
                                         * set with an empty buffer this loop
                                         * would spin forever and never get back
                                         * to epoll_wait(). */
                                        stdin_readable = false;
                                else
                                        in_buffer_full += (size_t) k;
                        }

                        if (master_writable && in_buffer_full > 0) {

                                if ((k = write(master, in_buffer, in_buffer_full)) < 0) {

                                        if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
                                                master_writable = false;
                                        else {
                                                log_error("write(): %m");
                                                r = -errno;
                                                goto finish;
                                        }

                                } else {
                                        /* Drop what was written, keep the rest at the front. */
                                        assert(in_buffer_full >= (size_t) k);
                                        memmove(in_buffer, in_buffer + k, in_buffer_full - k);
                                        in_buffer_full -= k;
                                }
                        }

                        if (master_readable && out_buffer_full < LINE_MAX) {

                                if ((k = read(master, out_buffer + out_buffer_full, LINE_MAX - out_buffer_full)) < 0) {

                                        if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
                                                master_readable = false;
                                        else {
                                                log_error("read(): %m");
                                                r = -errno;
                                                goto finish;
                                        }
                                } else if (k == 0)
                                        /* EOF, same reasoning as for stdin above. */
                                        master_readable = false;
                                else
                                        out_buffer_full += (size_t) k;
                        }

                        if (stdout_writable && out_buffer_full > 0) {

                                if ((k = write(STDOUT_FILENO, out_buffer, out_buffer_full)) < 0) {

                                        if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
                                                stdout_writable = false;
                                        else {
                                                log_error("write(): %m");
                                                r = -errno;
                                                goto finish;
                                        }

                                } else {
                                        assert(out_buffer_full >= (size_t) k);
                                        memmove(out_buffer, out_buffer + k, out_buffer_full - k);
                                        out_buffer_full -= k;
                                }
                        }
                }
        }

finish:
        if (ep >= 0)
                close_nointr_nofail(ep);

        if (signal_fd >= 0)
                close_nointr_nofail(signal_fd);

        return r;
}
777
778 int main(int argc, char *argv[]) {
779         pid_t pid = 0;
780         int r = EXIT_FAILURE, k;
781         char *oldcg = NULL, *newcg = NULL;
782         char **controller = NULL;
783         int master = -1;
784         const char *console = NULL;
785         struct termios saved_attr, raw_attr;
786         sigset_t mask;
787         bool saved_attr_valid = false;
788         struct winsize ws;
789         int kmsg_socket_pair[2] = { -1, -1 };
790
791         log_parse_environment();
792         log_open();
793
794         if ((r = parse_argv(argc, argv)) <= 0)
795                 goto finish;
796
797         if (arg_directory) {
798                 char *p;
799
800                 p = path_make_absolute_cwd(arg_directory);
801                 free(arg_directory);
802                 arg_directory = p;
803         } else
804                 arg_directory = get_current_dir_name();
805
806         if (!arg_directory) {
807                 log_error("Failed to determine path");
808                 goto finish;
809         }
810
811         path_kill_slashes(arg_directory);
812
813         if (geteuid() != 0) {
814                 log_error("Need to be root.");
815                 goto finish;
816         }
817
818         if (sd_booted() <= 0) {
819                 log_error("Not running on a systemd system.");
820                 goto finish;
821         }
822
823         if (path_equal(arg_directory, "/")) {
824                 log_error("Spawning container on root directory not supported.");
825                 goto finish;
826         }
827
828         if (is_os_tree(arg_directory) <= 0) {
829                 log_error("Directory %s doesn't look like an OS root directory. Refusing.", arg_directory);
830                 goto finish;
831         }
832
833         if ((k = cg_get_by_pid(SYSTEMD_CGROUP_CONTROLLER, 0, &oldcg)) < 0) {
834                 log_error("Failed to determine current cgroup: %s", strerror(-k));
835                 goto finish;
836         }
837
838         if (asprintf(&newcg, "%s/nspawn-%lu", oldcg, (unsigned long) getpid()) < 0) {
839                 log_error("Failed to allocate cgroup path.");
840                 goto finish;
841         }
842
843         k = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, newcg, 0);
844         if (k < 0)  {
845                 log_error("Failed to create cgroup: %s", strerror(-k));
846                 goto finish;
847         }
848
849         STRV_FOREACH(controller,arg_controllers) {
850                 k = cg_create_and_attach(*controller, newcg, 0);
851                 if (k < 0)
852                         log_warning("Failed to create cgroup in controller %s: %s", *controller, strerror(-k));
853         }
854
855         if ((master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY)) < 0) {
856                 log_error("Failed to acquire pseudo tty: %m");
857                 goto finish;
858         }
859
860         if (!(console = ptsname(master))) {
861                 log_error("Failed to determine tty name: %m");
862                 goto finish;
863         }
864
865         log_info("Spawning namespace container on %s (console is %s).", arg_directory, console);
866
867         if (ioctl(STDIN_FILENO, TIOCGWINSZ, &ws) >= 0)
868                 ioctl(master, TIOCSWINSZ, &ws);
869
870         if (unlockpt(master) < 0) {
871                 log_error("Failed to unlock tty: %m");
872                 goto finish;
873         }
874
875         if (tcgetattr(STDIN_FILENO, &saved_attr) < 0) {
876                 log_error("Failed to get terminal attributes: %m");
877                 goto finish;
878         }
879
880         saved_attr_valid = true;
881
882         raw_attr = saved_attr;
883         cfmakeraw(&raw_attr);
884         raw_attr.c_lflag &= ~ECHO;
885
886         if (tcsetattr(STDIN_FILENO, TCSANOW, &raw_attr) < 0) {
887                 log_error("Failed to set terminal attributes: %m");
888                 goto finish;
889         }
890
891         if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_NONBLOCK|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
892                 log_error("Failed to create kmsg socket pair");
893                 goto finish;
894         }
895
896         assert_se(sigemptyset(&mask) == 0);
897         sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
898         assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
899
900         pid = syscall(__NR_clone, SIGCHLD|CLONE_NEWIPC|CLONE_NEWNS|CLONE_NEWPID|CLONE_NEWUTS|(arg_private_network ? CLONE_NEWNET : 0), NULL);
901         if (pid < 0) {
902                 if (errno == EINVAL)
903                         log_error("clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
904                 else
905                         log_error("clone() failed: %m");
906
907                 goto finish;
908         }
909
910         if (pid == 0) {
911                 /* child */
912
913                 const char *home = NULL;
914                 uid_t uid = (uid_t) -1;
915                 gid_t gid = (gid_t) -1;
916                 const char *envp[] = {
917                         "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
918                         "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
919                         NULL, /* TERM */
920                         NULL, /* HOME */
921                         NULL, /* USER */
922                         NULL, /* LOGNAME */
923                         NULL, /* container_uuid */
924                         NULL
925                 };
926
927                 envp[2] = strv_find_prefix(environ, "TERM=");
928
929                 close_nointr_nofail(master);
930
931                 close_nointr(STDIN_FILENO);
932                 close_nointr(STDOUT_FILENO);
933                 close_nointr(STDERR_FILENO);
934
935                 close_all_fds(&kmsg_socket_pair[1], 1);
936
937                 reset_all_signal_handlers();
938
939                 assert_se(sigemptyset(&mask) == 0);
940                 assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
941
942                 if (setsid() < 0)
943                         goto child_fail;
944
945                 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0)
946                         goto child_fail;
947
948                 /* Mark / as private, in case somebody marked it shared */
949                 if (mount(NULL, "/", NULL, MS_PRIVATE|MS_REC, NULL) < 0)
950                         goto child_fail;
951
952                 if (mount_all(arg_directory) < 0)
953                         goto child_fail;
954
955                 if (copy_devnodes(arg_directory) < 0)
956                         goto child_fail;
957
958                 if (setup_dev_console(arg_directory, console) < 0)
959                         goto child_fail;
960
961                 if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
962                         goto child_fail;
963
964                 close_nointr_nofail(kmsg_socket_pair[1]);
965
966                 if (setup_timezone(arg_directory) < 0)
967                         goto child_fail;
968
969                 if (chdir(arg_directory) < 0) {
970                         log_error("chdir(%s) failed: %m", arg_directory);
971                         goto child_fail;
972                 }
973
974                 if (open_terminal("dev/console", O_RDWR) != STDIN_FILENO ||
975                     dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
976                     dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO)
977                         goto child_fail;
978
979                 if (mount(arg_directory, "/", "bind", MS_BIND, NULL) < 0) {
980                         log_error("mount(MS_MOVE) failed: %m");
981                         goto child_fail;
982                 }
983
984                 if (chroot(".") < 0) {
985                         log_error("chroot() failed: %m");
986                         goto child_fail;
987                 }
988
989                 if (chdir("/") < 0) {
990                         log_error("chdir() failed: %m");
991                         goto child_fail;
992                 }
993
994                 umask(0022);
995
996                 loopback_setup();
997
998                 if (drop_capabilities() < 0)
999                         goto child_fail;
1000
1001                 if (arg_user) {
1002
1003                         if (get_user_creds((const char**)&arg_user, &uid, &gid, &home) < 0) {
1004                                 log_error("get_user_creds() failed: %m");
1005                                 goto child_fail;
1006                         }
1007
1008                         if (mkdir_parents(home, 0775) < 0) {
1009                                 log_error("mkdir_parents() failed: %m");
1010                                 goto child_fail;
1011                         }
1012
1013                         if (safe_mkdir(home, 0775, uid, gid) < 0) {
1014                                 log_error("safe_mkdir() failed: %m");
1015                                 goto child_fail;
1016                         }
1017
1018                         if (initgroups((const char*)arg_user, gid) < 0) {
1019                                 log_error("initgroups() failed: %m");
1020                                 goto child_fail;
1021                         }
1022
1023                         if (setresgid(gid, gid, gid) < 0) {
1024                                 log_error("setregid() failed: %m");
1025                                 goto child_fail;
1026                         }
1027
1028                         if (setresuid(uid, uid, uid) < 0) {
1029                                 log_error("setreuid() failed: %m");
1030                                 goto child_fail;
1031                         }
1032                 }
1033
1034                 if ((asprintf((char**)(envp + 3), "HOME=%s", home ? home: "/root") < 0) ||
1035                     (asprintf((char**)(envp + 4), "USER=%s", arg_user ? arg_user : "root") < 0) ||
1036                     (asprintf((char**)(envp + 5), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
1037                     log_error("Out of memory");
1038                     goto child_fail;
1039                 }
1040
1041                 if (arg_uuid) {
1042                         if (asprintf((char**)(envp + 6), "container_uuid=%s", arg_uuid) < 0) {
1043                                 log_error("Out of memory");
1044                                 goto child_fail;
1045                         }
1046                 }
1047
1048                 setup_hostname();
1049
1050                 if (arg_boot) {
1051                         char **a;
1052                         size_t l;
1053
1054                         /* Automatically search for the init system */
1055
1056                         l = 1 + argc - optind;
1057                         a = newa(char*, l + 1);
1058                         memcpy(a + 1, argv + optind, l * sizeof(char*));
1059
1060                         a[0] = (char*) "/usr/lib/systemd/systemd";
1061                         execve(a[0], a, (char**) envp);
1062
1063                         a[0] = (char*) "/lib/systemd/systemd";
1064                         execve(a[0], a, (char**) envp);
1065
1066                         a[0] = (char*) "/sbin/init";
1067                         execve(a[0], a, (char**) envp);
1068                 } else if (argc > optind)
1069                         execvpe(argv[optind], argv + optind, (char**) envp);
1070                 else {
1071                         chdir(home ? home : "/root");
1072                         execle("/bin/bash", "-bash", NULL, (char**) envp);
1073                 }
1074
1075                 log_error("execv() failed: %m");
1076
1077         child_fail:
1078                 _exit(EXIT_FAILURE);
1079         }
1080
1081         if (process_pty(master, &mask) < 0)
1082                 goto finish;
1083
1084         if (saved_attr_valid) {
1085                 tcsetattr(STDIN_FILENO, TCSANOW, &saved_attr);
1086                 saved_attr_valid = false;
1087         }
1088
1089         r = wait_for_terminate_and_warn(argc > optind ? argv[optind] : "bash", pid);
1090
1091         if (r < 0)
1092                 r = EXIT_FAILURE;
1093
1094 finish:
1095         if (saved_attr_valid)
1096                 tcsetattr(STDIN_FILENO, TCSANOW, &saved_attr);
1097
1098         if (master >= 0)
1099                 close_nointr_nofail(master);
1100
1101         close_pipe(kmsg_socket_pair);
1102
1103         if (oldcg)
1104                 cg_attach(SYSTEMD_CGROUP_CONTROLLER, oldcg, 0);
1105
1106         if (newcg)
1107                 cg_kill_recursive_and_wait(SYSTEMD_CGROUP_CONTROLLER, newcg, true);
1108
1109         free(arg_directory);
1110         strv_free(arg_controllers);
1111         free(oldcg);
1112         free(newcg);
1113
1114         return r;
1115 }