chiark / gitweb /
nspawn: skip mounts if already mounted
[elogind.git] / src / nspawn / nspawn.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4   This file is part of systemd.
5
6   Copyright 2010 Lennart Poettering
7
8   systemd is free software; you can redistribute it and/or modify it
9   under the terms of the GNU Lesser General Public License as published by
10   the Free Software Foundation; either version 2.1 of the License, or
11   (at your option) any later version.
12
13   systemd is distributed in the hope that it will be useful, but
14   WITHOUT ANY WARRANTY; without even the implied warranty of
15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16   Lesser General Public License for more details.
17
18   You should have received a copy of the GNU Lesser General Public License
19   along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <signal.h>
23 #include <sched.h>
24 #include <unistd.h>
25 #include <sys/types.h>
26 #include <sys/syscall.h>
27 #include <sys/mount.h>
28 #include <sys/wait.h>
29 #include <stdlib.h>
30 #include <string.h>
31 #include <stdio.h>
32 #include <errno.h>
33 #include <sys/prctl.h>
34 #include <sys/capability.h>
35 #include <getopt.h>
36 #include <sys/epoll.h>
37 #include <termios.h>
38 #include <sys/signalfd.h>
39 #include <grp.h>
40 #include <linux/fs.h>
41 #include <sys/un.h>
42 #include <sys/socket.h>
43
44 #include <systemd/sd-daemon.h>
45
46 #include "log.h"
47 #include "util.h"
48 #include "mkdir.h"
49 #include "audit.h"
50 #include "missing.h"
51 #include "cgroup-util.h"
52 #include "strv.h"
53 #include "path-util.h"
54 #include "loopback-setup.h"
55 #include "sd-id128.h"
56 #include "dev-setup.h"
57
/* Selects how setup_journal() connects the journal directory of the
 * container with the one of the host. */
typedef enum LinkJournal {
        LINK_NO    = 0,  /* leave the journals alone */
        LINK_AUTO  = 1,  /* link only if it works out, silently skip otherwise */
        LINK_HOST  = 2,  /* journal lives on the host and is bind mounted into the guest */
        LINK_GUEST = 3   /* journal lives in the guest, the host gets a symlink to it */
} LinkJournal;
64
65 static char *arg_directory = NULL;
66 static char *arg_user = NULL;
67 static char **arg_controllers = NULL;
68 static char *arg_uuid = NULL;
69 static bool arg_private_network = false;
70 static bool arg_read_only = false;
71 static bool arg_boot = false;
72 static LinkJournal arg_link_journal = LINK_AUTO;
73 static uint64_t arg_retain =
74         (1ULL << CAP_CHOWN) |
75         (1ULL << CAP_DAC_OVERRIDE) |
76         (1ULL << CAP_DAC_READ_SEARCH) |
77         (1ULL << CAP_FOWNER) |
78         (1ULL << CAP_FSETID) |
79         (1ULL << CAP_IPC_OWNER) |
80         (1ULL << CAP_KILL) |
81         (1ULL << CAP_LEASE) |
82         (1ULL << CAP_LINUX_IMMUTABLE) |
83         (1ULL << CAP_NET_BIND_SERVICE) |
84         (1ULL << CAP_NET_BROADCAST) |
85         (1ULL << CAP_NET_RAW) |
86         (1ULL << CAP_SETGID) |
87         (1ULL << CAP_SETFCAP) |
88         (1ULL << CAP_SETPCAP) |
89         (1ULL << CAP_SETUID) |
90         (1ULL << CAP_SYS_ADMIN) |
91         (1ULL << CAP_SYS_CHROOT) |
92         (1ULL << CAP_SYS_NICE) |
93         (1ULL << CAP_SYS_PTRACE) |
94         (1ULL << CAP_SYS_TTY_CONFIG) |
95         (1ULL << CAP_SYS_RESOURCE);
96
97 static int help(void) {
98
99         printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
100                "Spawn a minimal namespace container for debugging, testing and building.\n\n"
101                "  -h --help               Show this help\n"
102                "  -D --directory=NAME     Root directory for the container\n"
103                "  -b --boot               Boot up full system (i.e. invoke init)\n"
104                "  -u --user=USER          Run the command under specified user or uid\n"
105                "  -C --controllers=LIST   Put the container in specified comma-separated cgroup hierarchies\n"
106                "     --uuid=UUID          Set a specific machine UUID for the container\n"
107                "     --private-network    Disable network in container\n"
108                "     --read-only          Mount the root directory read-only\n"
109                "     --capability=CAP     In addition to the default, retain specified capability\n"
110                "     --link-journal=MODE  Link up guest journal, one of no, auto, guest, host\n"
111                "  -j                      Equivalent to --link-journal=host\n",
112                program_invocation_short_name);
113
114         return 0;
115 }
116
117 static int parse_argv(int argc, char *argv[]) {
118
119         enum {
120                 ARG_PRIVATE_NETWORK = 0x100,
121                 ARG_UUID,
122                 ARG_READ_ONLY,
123                 ARG_CAPABILITY,
124                 ARG_LINK_JOURNAL
125         };
126
127         static const struct option options[] = {
128                 { "help",            no_argument,       NULL, 'h'                 },
129                 { "directory",       required_argument, NULL, 'D'                 },
130                 { "user",            required_argument, NULL, 'u'                 },
131                 { "controllers",     required_argument, NULL, 'C'                 },
132                 { "private-network", no_argument,       NULL, ARG_PRIVATE_NETWORK },
133                 { "boot",            no_argument,       NULL, 'b'                 },
134                 { "uuid",            required_argument, NULL, ARG_UUID            },
135                 { "read-only",       no_argument,       NULL, ARG_READ_ONLY       },
136                 { "capability",      required_argument, NULL, ARG_CAPABILITY      },
137                 { "link-journal",    required_argument, NULL, ARG_LINK_JOURNAL    },
138                 { NULL,              0,                 NULL, 0                   }
139         };
140
141         int c;
142
143         assert(argc >= 0);
144         assert(argv);
145
146         while ((c = getopt_long(argc, argv, "+hD:u:C:bj", options, NULL)) >= 0) {
147
148                 switch (c) {
149
150                 case 'h':
151                         help();
152                         return 0;
153
154                 case 'D':
155                         free(arg_directory);
156                         arg_directory = canonicalize_file_name(optarg);
157                         if (!arg_directory) {
158                                 log_error("Failed to canonicalize root directory.");
159                                 return -ENOMEM;
160                         }
161
162                         break;
163
164                 case 'u':
165                         free(arg_user);
166                         if (!(arg_user = strdup(optarg))) {
167                                 log_error("Failed to duplicate user name.");
168                                 return -ENOMEM;
169                         }
170
171                         break;
172
173                 case 'C':
174                         strv_free(arg_controllers);
175                         arg_controllers = strv_split(optarg, ",");
176                         if (!arg_controllers) {
177                                 log_error("Failed to split controllers list.");
178                                 return -ENOMEM;
179                         }
180                         strv_uniq(arg_controllers);
181
182                         break;
183
184                 case ARG_PRIVATE_NETWORK:
185                         arg_private_network = true;
186                         break;
187
188                 case 'b':
189                         arg_boot = true;
190                         break;
191
192                 case ARG_UUID:
193                         arg_uuid = optarg;
194                         break;
195
196                 case ARG_READ_ONLY:
197                         arg_read_only = true;
198                         break;
199
200                 case ARG_CAPABILITY: {
201                         char *state, *word;
202                         size_t length;
203
204                         FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
205                                 cap_value_t cap;
206                                 char *t;
207
208                                 t = strndup(word, length);
209                                 if (!t)
210                                         return log_oom();
211
212                                 if (cap_from_name(t, &cap) < 0) {
213                                         log_error("Failed to parse capability %s.", t);
214                                         free(t);
215                                         return -EINVAL;
216                                 }
217
218                                 free(t);
219                                 arg_retain |= 1ULL << (uint64_t) cap;
220                         }
221
222                         break;
223                 }
224
225                 case 'j':
226                         arg_link_journal = LINK_GUEST;
227                         break;
228
229                 case ARG_LINK_JOURNAL:
230                         if (streq(optarg, "auto"))
231                                 arg_link_journal = LINK_AUTO;
232                         else if (streq(optarg, "no"))
233                                 arg_link_journal = LINK_NO;
234                         else if (streq(optarg, "guest"))
235                                 arg_link_journal = LINK_GUEST;
236                         else if (streq(optarg, "host"))
237                                 arg_link_journal = LINK_HOST;
238                         else {
239                                 log_error("Failed to parse link journal mode %s", optarg);
240                                 return -EINVAL;
241                         }
242
243                         break;
244
245                 case '?':
246                         return -EINVAL;
247
248                 default:
249                         log_error("Unknown option code %c", c);
250                         return -EINVAL;
251                 }
252         }
253
254         return 1;
255 }
256
/* Sets up the API file systems (/proc, /sys, /dev, /dev/pts, /run, ...)
 * below the container root "dest".
 *
 * Entries with a source ("what") are skipped if their target already is
 * a mount point. Entries without a source are remounts that adjust the
 * flags of the mount established by the entry right before them; those
 * must always be applied, since their target necessarily is a mount
 * point by the time we get to them.
 *
 * All entries are tried even if one fails. Returns 0 on success, or the
 * negative errno-style code of the first failure (failing mounts that
 * are not marked fatal are ignored). */
static int mount_all(const char *dest) {

        typedef struct MountPoint {
                const char *what;
                const char *where;
                const char *type;
                const char *options;
                unsigned long flags;
                bool fatal;
        } MountPoint;

        static const MountPoint mount_table[] = {
                { "proc",      "/proc",     "proc",  NULL,       MS_NOSUID|MS_NOEXEC|MS_NODEV, true  },
                { "/proc/sys", "/proc/sys", NULL,    NULL,       MS_BIND, true                       },   /* Bind mount first */
                { NULL,        "/proc/sys", NULL,    NULL,       MS_BIND|MS_RDONLY|MS_REMOUNT, true  },   /* Then, make it r/o */
                { "sysfs",     "/sys",      "sysfs", NULL,       MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true  },
                { "tmpfs",     "/dev",      "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME,     true  },
                { "/dev/pts",  "/dev/pts",  NULL,    NULL,       MS_BIND,                      true  },
                { "tmpfs",     "/run",      "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true  },
#ifdef HAVE_SELINUX
                { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND,                      false },  /* Bind mount first */
                { NULL,              "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, false },  /* Then, make it r/o */
#endif
        };

        unsigned k;
        int r = 0;

        for (k = 0; k < ELEMENTSOF(mount_table); k++) {
                char *where = NULL;
                int t;

                if (asprintf(&where, "%s/%s", dest, mount_table[k].where) < 0) {
                        log_oom();

                        if (r == 0)
                                r = -ENOMEM;

                        break;
                }

                t = path_is_mount_point(where, true);
                if (t < 0) {
                        log_error("Failed to detect whether %s is a mount point: %s", where, strerror(-t));

                        if (r == 0)
                                r = t;

                } else if (t > 0 && mount_table[k].what) {

                        /* Something is mounted here already and this is
                         * not a remount entry: leave it alone. */

                } else {
                        mkdir_p_label(where, 0755);

                        if (mount(mount_table[k].what,
                                  where,
                                  mount_table[k].type,
                                  mount_table[k].flags,
                                  mount_table[k].options) < 0 &&
                            mount_table[k].fatal) {

                                log_error("mount(%s) failed: %m", where);

                                if (r == 0)
                                        r = -errno;
                        }
                }

                /* Single release point, so that no path of the loop body
                 * can leak the string */
                free(where);
        }

        return r;
}
332
/* Makes the time zone configuration of the host visible in the container
 * by bind mounting /etc/localtime and /etc/timezone read-only over the
 * respective files below "dest". This is best effort: failing mounts are
 * ignored. Returns 0, or the result of log_oom() if a path cannot be
 * allocated. */
static int setup_timezone(const char *dest) {
        char *target;

        assert(dest);

        /* Fix the timezone, if possible */
        if (asprintf(&target, "%s/etc/localtime", dest) < 0)
                return log_oom();

        /* A bind mount cannot be made read-only in one go, hence remount */
        if (mount("/etc/localtime", target, "bind", MS_BIND, NULL) >= 0)
                mount("/etc/localtime", target, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);

        free(target);

        /* Same procedure for the textual time zone name */
        if (asprintf(&target, "%s/etc/timezone", dest) < 0)
                return log_oom();

        if (mount("/etc/timezone", target, "bind", MS_BIND, NULL) >= 0)
                mount("/etc/timezone", target, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);

        free(target);

        return 0;
}
357
358 static int setup_resolv_conf(const char *dest) {
359         char *where;
360
361         assert(dest);
362
363         if (arg_private_network)
364                 return 0;
365
366         /* Fix resolv.conf, if possible */
367         if (asprintf(&where, "%s/etc/resolv.conf", dest) < 0) {
368                 return log_oom();
369         }
370
371         if (mount("/etc/resolv.conf", where, "bind", MS_BIND, NULL) >= 0)
372                 mount("/etc/resolv.conf", where, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);
373
374         free(where);
375
376         return 0;
377 }
378
379 static int copy_devnodes(const char *dest) {
380
381         static const char devnodes[] =
382                 "null\0"
383                 "zero\0"
384                 "full\0"
385                 "random\0"
386                 "urandom\0"
387                 "tty\0"
388                 "ptmx\0"
389                 "rtc0\0";
390
391         const char *d;
392         int r = 0;
393         mode_t u;
394
395         assert(dest);
396
397         u = umask(0000);
398
399         NULSTR_FOREACH(d, devnodes) {
400                 struct stat st;
401                 char *from = NULL, *to = NULL;
402
403                 asprintf(&from, "/dev/%s", d);
404                 asprintf(&to, "%s/dev/%s", dest, d);
405
406                 if (!from || !to) {
407                         log_error("Failed to allocate devnode path");
408
409                         free(from);
410                         free(to);
411
412                         from = to = NULL;
413
414                         if (r == 0)
415                                 r = -ENOMEM;
416
417                         break;
418                 }
419
420                 if (stat(from, &st) < 0) {
421
422                         if (errno != ENOENT) {
423                                 log_error("Failed to stat %s: %m", from);
424                                 if (r == 0)
425                                         r = -errno;
426                         }
427
428                 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
429
430                         log_error("%s is not a char or block device, cannot copy.", from);
431                         if (r == 0)
432                                 r = -EIO;
433
434                 } else if (mknod(to, st.st_mode, st.st_rdev) < 0) {
435
436                         log_error("mknod(%s) failed: %m", dest);
437                         if (r == 0)
438                                 r = -errno;
439                 }
440
441                 free(from);
442                 free(to);
443         }
444
445         umask(u);
446
447         return r;
448 }
449
/* Makes the TTY device "console" available as /dev/console below "dest".
 *
 * The access mode and ownership of the TTY are fixed up first, then a
 * placeholder device node is created and the TTY is bind mounted over
 * it. The umask is cleared for the duration of the call and restored on
 * all paths.
 *
 * Returns the (non-negative) result of chmod_and_chown() on success, or
 * a negative errno-style code on failure. */
static int setup_dev_console(const char *dest, const char *console) {
        struct stat console_st;
        char *node = NULL;
        mode_t saved_umask;
        int r;

        assert(dest);
        assert(console);

        saved_umask = umask(0000);

        if (stat(console, &console_st) < 0) {
                log_error("Failed to stat %s: %m", console);
                r = -errno;
                goto finish;
        }

        if (!S_ISCHR(console_st.st_mode)) {
                log_error("/dev/console is not a char device.");
                r = -EIO;
                goto finish;
        }

        r = chmod_and_chown(console, 0600, 0, 0);
        if (r < 0) {
                log_error("Failed to correct access mode for TTY: %s", strerror(-r));
                goto finish;
        }

        if (asprintf(&node, "%s/dev/console", dest) < 0) {
                r = log_oom();
                goto finish;
        }

        /* ptys can only exist on pts file systems, hence the right tty
         * has to be bind mounted to /dev/console. A bind mount needs
         * something to be mounted on, so we create a device node first.
         * Its major/minor do not really matter, as the node is mounted
         * over anyway. */

        if (mknod(node, (console_st.st_mode & ~07777) | 0600, console_st.st_rdev) < 0) {
                log_error("mknod() for /dev/console failed: %m");
                r = -errno;
                goto finish;
        }

        if (mount(console, node, "bind", MS_BIND, NULL) < 0) {
                log_error("Bind mount for /dev/console failed: %m");
                r = -errno;
        }

finish:
        free(node);
        umask(saved_umask);

        return r;
}
508
509 static int setup_kmsg(const char *dest, int kmsg_socket) {
510         char *from = NULL, *to = NULL;
511         int r, fd, k;
512         mode_t u;
513         union {
514                 struct cmsghdr cmsghdr;
515                 uint8_t buf[CMSG_SPACE(sizeof(int))];
516         } control;
517         struct msghdr mh;
518         struct cmsghdr *cmsg;
519
520         assert(dest);
521         assert(kmsg_socket >= 0);
522
523         u = umask(0000);
524
525         /* We create the kmsg FIFO as /dev/kmsg, but immediately
526          * delete it after bind mounting it to /proc/kmsg. While FIFOs
527          * on the reading side behave very similar to /proc/kmsg,
528          * their writing side behaves differently from /dev/kmsg in
529          * that writing blocks when nothing is reading. In order to
530          * avoid any problems with containers deadlocking due to this
531          * we simply make /dev/kmsg unavailable to the container. */
532         if (asprintf(&from, "%s/dev/kmsg", dest) < 0) {
533                 r = log_oom();
534                 goto finish;
535         }
536
537         if (asprintf(&to, "%s/proc/kmsg", dest) < 0) {
538                 r = log_oom();
539                 goto finish;
540         }
541
542         if (mkfifo(from, 0600) < 0) {
543                 log_error("mkfifo() for /dev/kmsg failed: %m");
544                 r = -errno;
545                 goto finish;
546         }
547
548         r = chmod_and_chown(from, 0600, 0, 0);
549         if (r < 0) {
550                 log_error("Failed to correct access mode for /dev/kmsg: %s", strerror(-r));
551                 goto finish;
552         }
553
554         if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
555                 log_error("Bind mount for /proc/kmsg failed: %m");
556                 r = -errno;
557                 goto finish;
558         }
559
560         fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
561         if (fd < 0) {
562                 log_error("Failed to open fifo: %m");
563                 r = -errno;
564                 goto finish;
565         }
566
567         zero(mh);
568         zero(control);
569
570         mh.msg_control = &control;
571         mh.msg_controllen = sizeof(control);
572
573         cmsg = CMSG_FIRSTHDR(&mh);
574         cmsg->cmsg_level = SOL_SOCKET;
575         cmsg->cmsg_type = SCM_RIGHTS;
576         cmsg->cmsg_len = CMSG_LEN(sizeof(int));
577         memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
578
579         mh.msg_controllen = cmsg->cmsg_len;
580
581         /* Store away the fd in the socket, so that it stays open as
582          * long as we run the child */
583         k = sendmsg(kmsg_socket, &mh, MSG_DONTWAIT|MSG_NOSIGNAL);
584         close_nointr_nofail(fd);
585
586         if (k < 0) {
587                 log_error("Failed to send FIFO fd: %m");
588                 r = -errno;
589                 goto finish;
590         }
591
592         /* And now make the FIFO unavailable as /dev/kmsg... */
593         unlink(from);
594
595 finish:
596         free(from);
597         free(to);
598         umask(u);
599
600         return r;
601 }
602
603 static int setup_hostname(void) {
604         char *hn;
605         int r = 0;
606
607         hn = path_get_file_name(arg_directory);
608         if (hn) {
609                 hn = strdup(hn);
610                 if (!hn)
611                         return -ENOMEM;
612
613                 hostname_cleanup(hn);
614
615                 if (!isempty(hn))
616                         if (sethostname(hn, strlen(hn)) < 0)
617                                 r = -errno;
618
619                 free(hn);
620         }
621
622         return r;
623 }
624
625 static int setup_journal(const char *directory) {
626         sd_id128_t machine_id;
627         char *p = NULL, *b = NULL, *l, *q = NULL, *d = NULL;
628         int r;
629
630         if (arg_link_journal == LINK_NO)
631                 return 0;
632
633         p = strappend(directory, "/etc/machine-id");
634         if (!p) {
635                 r = log_oom();
636                 goto finish;
637         }
638
639         r = read_one_line_file(p, &b);
640         if (r == -ENOENT && arg_link_journal == LINK_AUTO) {
641                 r = 0;
642                 goto finish;
643         } else if (r < 0) {
644                 log_error("Failed to read machine ID: %s", strerror(-r));
645                 return r;
646         }
647
648         l = strstrip(b);
649         if (isempty(l) && arg_link_journal == LINK_AUTO) {
650                 r = 0;
651                 goto finish;
652         }
653
654         /* Verify validaty */
655         r = sd_id128_from_string(l, &machine_id);
656         if (r < 0) {
657                 log_error("Failed to parse machine ID: %s", strerror(-r));
658                 goto finish;
659         }
660
661         free(p);
662         p = strappend("/var/log/journal/", l);
663         q = strjoin(directory, "/var/log/journal/", l, NULL);
664         if (!p || !q) {
665                 r = log_oom();
666                 goto finish;
667         }
668
669         if (path_is_mount_point(p, false) > 0 ||
670             path_is_mount_point(q, false) > 0) {
671                 if (arg_link_journal != LINK_AUTO) {
672                         log_error("Journal already a mount point, refusing.");
673                         r = -EEXIST;
674                         goto finish;
675                 }
676
677                 r = 0;
678                 goto finish;
679         }
680
681         r = readlink_and_make_absolute(p, &d);
682         if (r >= 0) {
683                 if ((arg_link_journal == LINK_GUEST ||
684                      arg_link_journal == LINK_AUTO) &&
685                     path_equal(d, q)) {
686
687                         mkdir_p(q, 0755);
688
689                         r = 0;
690                         goto finish;
691                 }
692
693                 if (unlink(p) < 0) {
694                         log_error("Failed to remove symlink %s: %m", p);
695                         r = -errno;
696                         goto finish;
697                 }
698         } else if (r == -EINVAL) {
699
700                 if (arg_link_journal == LINK_GUEST &&
701                     rmdir(p) < 0) {
702
703                         if (errno == ENOTDIR)
704                                 log_error("%s already exists and is neither symlink nor directory.", p);
705                         else {
706                                 log_error("Failed to remove %s: %m", p);
707                                 r = -errno;
708                         }
709
710                         goto finish;
711                 }
712         } else if (r != -ENOENT) {
713                 log_error("readlink(%s) failed: %m", p);
714                 goto finish;
715         }
716
717         if (arg_link_journal == LINK_GUEST) {
718
719                 if (symlink(q, p) < 0) {
720                         log_error("Failed to symlink %s to %s: %m", q, p);
721                         r = -errno;
722                         goto finish;
723                 }
724
725                 mkdir_p(q, 0755);
726
727                 r = 0;
728                 goto finish;
729         }
730
731         if (arg_link_journal == LINK_HOST) {
732                 r = mkdir_p(p, 0755);
733                 if (r < 0) {
734                         log_error("Failed to create %s: %m", p);
735                         goto finish;
736                 }
737
738         } else if (access(p, F_OK) < 0) {
739                 r = 0;
740                 goto finish;
741         }
742
743         if (dir_is_empty(q) == 0) {
744                 log_error("%s not empty.", q);
745                 r = -ENOTEMPTY;
746                 goto finish;
747         }
748
749         r = mkdir_p(q, 0755);
750         if (r < 0) {
751                 log_error("Failed to create %s: %m", q);
752                 goto finish;
753         }
754
755         if (mount(p, q, "bind", MS_BIND, NULL) < 0) {
756                 log_error("Failed to bind mount journal from host into guest: %m");
757                 r = -errno;
758                 goto finish;
759         }
760
761         r = 0;
762
763 finish:
764         free(p);
765         free(q);
766         free(d);
767         free(b);
768         return r;
769
770 }
771
772 static int drop_capabilities(void) {
773         return capability_bounding_set_drop(~arg_retain, false);
774 }
775
/* Checks whether "path" looks like the root of an OS tree, taking the
 * existence of bin/sh below it as the indicator.
 *
 * Returns 1 if it does, 0 if it does not (or the check fails for any
 * other reason), and -ENOMEM if the path to probe cannot be allocated. */
static int is_os_tree(const char *path) {
        char *shell = NULL;
        int found;

        /* We use /bin/sh as flag file if something is an OS */
        if (asprintf(&shell, "%s/bin/sh", path) < 0)
                return -ENOMEM;

        found = access(shell, F_OK) >= 0;
        free(shell);

        return found;
}
789
790 static int process_pty(int master, sigset_t *mask) {
791
792         char in_buffer[LINE_MAX], out_buffer[LINE_MAX];
793         size_t in_buffer_full = 0, out_buffer_full = 0;
794         struct epoll_event stdin_ev, stdout_ev, master_ev, signal_ev;
795         bool stdin_readable = false, stdout_writable = false, master_readable = false, master_writable = false;
796         int ep = -1, signal_fd = -1, r;
797
798         fd_nonblock(STDIN_FILENO, 1);
799         fd_nonblock(STDOUT_FILENO, 1);
800         fd_nonblock(master, 1);
801
802         signal_fd = signalfd(-1, mask, SFD_NONBLOCK|SFD_CLOEXEC);
803         if (signal_fd < 0) {
804                 log_error("signalfd(): %m");
805                 r = -errno;
806                 goto finish;
807         }
808
809         ep = epoll_create1(EPOLL_CLOEXEC);
810         if (ep < 0) {
811                 log_error("Failed to create epoll: %m");
812                 r = -errno;
813                 goto finish;
814         }
815
816         zero(stdin_ev);
817         stdin_ev.events = EPOLLIN|EPOLLET;
818         stdin_ev.data.fd = STDIN_FILENO;
819
820         zero(stdout_ev);
821         stdout_ev.events = EPOLLOUT|EPOLLET;
822         stdout_ev.data.fd = STDOUT_FILENO;
823
824         zero(master_ev);
825         master_ev.events = EPOLLIN|EPOLLOUT|EPOLLET;
826         master_ev.data.fd = master;
827
828         zero(signal_ev);
829         signal_ev.events = EPOLLIN;
830         signal_ev.data.fd = signal_fd;
831
832         if (epoll_ctl(ep, EPOLL_CTL_ADD, STDIN_FILENO, &stdin_ev) < 0 ||
833             epoll_ctl(ep, EPOLL_CTL_ADD, STDOUT_FILENO, &stdout_ev) < 0 ||
834             epoll_ctl(ep, EPOLL_CTL_ADD, master, &master_ev) < 0 ||
835             epoll_ctl(ep, EPOLL_CTL_ADD, signal_fd, &signal_ev) < 0) {
836                 log_error("Failed to regiser fds in epoll: %m");
837                 r = -errno;
838                 goto finish;
839         }
840
841         for (;;) {
842                 struct epoll_event ev[16];
843                 ssize_t k;
844                 int i, nfds;
845
846                 nfds = epoll_wait(ep, ev, ELEMENTSOF(ev), -1);
847                 if (nfds < 0) {
848
849                         if (errno == EINTR || errno == EAGAIN)
850                                 continue;
851
852                         log_error("epoll_wait(): %m");
853                         r = -errno;
854                         goto finish;
855                 }
856
857                 assert(nfds >= 1);
858
859                 for (i = 0; i < nfds; i++) {
860                         if (ev[i].data.fd == STDIN_FILENO) {
861
862                                 if (ev[i].events & (EPOLLIN|EPOLLHUP))
863                                         stdin_readable = true;
864
865                         } else if (ev[i].data.fd == STDOUT_FILENO) {
866
867                                 if (ev[i].events & (EPOLLOUT|EPOLLHUP))
868                                         stdout_writable = true;
869
870                         } else if (ev[i].data.fd == master) {
871
872                                 if (ev[i].events & (EPOLLIN|EPOLLHUP))
873                                         master_readable = true;
874
875                                 if (ev[i].events & (EPOLLOUT|EPOLLHUP))
876                                         master_writable = true;
877
878                         } else if (ev[i].data.fd == signal_fd) {
879                                 struct signalfd_siginfo sfsi;
880                                 ssize_t n;
881
882                                 n = read(signal_fd, &sfsi, sizeof(sfsi));
883                                 if (n != sizeof(sfsi)) {
884
885                                         if (n >= 0) {
886                                                 log_error("Failed to read from signalfd: invalid block size");
887                                                 r = -EIO;
888                                                 goto finish;
889                                         }
890
891                                         if (errno != EINTR && errno != EAGAIN) {
892                                                 log_error("Failed to read from signalfd: %m");
893                                                 r = -errno;
894                                                 goto finish;
895                                         }
896                                 } else {
897
898                                         if (sfsi.ssi_signo == SIGWINCH) {
899                                                 struct winsize ws;
900
901                                                 /* The window size changed, let's forward that. */
902                                                 if (ioctl(STDIN_FILENO, TIOCGWINSZ, &ws) >= 0)
903                                                         ioctl(master, TIOCSWINSZ, &ws);
904                                         } else {
905                                                 r = 0;
906                                                 goto finish;
907                                         }
908                                 }
909                         }
910                 }
911
912                 while ((stdin_readable && in_buffer_full <= 0) ||
913                        (master_writable && in_buffer_full > 0) ||
914                        (master_readable && out_buffer_full <= 0) ||
915                        (stdout_writable && out_buffer_full > 0)) {
916
917                         if (stdin_readable && in_buffer_full < LINE_MAX) {
918
919                                 k = read(STDIN_FILENO, in_buffer + in_buffer_full, LINE_MAX - in_buffer_full);
920                                 if (k < 0) {
921
922                                         if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
923                                                 stdin_readable = false;
924                                         else {
925                                                 log_error("read(): %m");
926                                                 r = -errno;
927                                                 goto finish;
928                                         }
929                                 } else
930                                         in_buffer_full += (size_t) k;
931                         }
932
933                         if (master_writable && in_buffer_full > 0) {
934
935                                 k = write(master, in_buffer, in_buffer_full);
936                                 if (k < 0) {
937
938                                         if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
939                                                 master_writable = false;
940                                         else {
941                                                 log_error("write(): %m");
942                                                 r = -errno;
943                                                 goto finish;
944                                         }
945
946                                 } else {
947                                         assert(in_buffer_full >= (size_t) k);
948                                         memmove(in_buffer, in_buffer + k, in_buffer_full - k);
949                                         in_buffer_full -= k;
950                                 }
951                         }
952
953                         if (master_readable && out_buffer_full < LINE_MAX) {
954
955                                 k = read(master, out_buffer + out_buffer_full, LINE_MAX - out_buffer_full);
956                                 if (k < 0) {
957
958                                         if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
959                                                 master_readable = false;
960                                         else {
961                                                 log_error("read(): %m");
962                                                 r = -errno;
963                                                 goto finish;
964                                         }
965                                 }  else
966                                         out_buffer_full += (size_t) k;
967                         }
968
969                         if (stdout_writable && out_buffer_full > 0) {
970
971                                 k = write(STDOUT_FILENO, out_buffer, out_buffer_full);
972                                 if (k < 0) {
973
974                                         if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
975                                                 stdout_writable = false;
976                                         else {
977                                                 log_error("write(): %m");
978                                                 r = -errno;
979                                                 goto finish;
980                                         }
981
982                                 } else {
983                                         assert(out_buffer_full >= (size_t) k);
984                                         memmove(out_buffer, out_buffer + k, out_buffer_full - k);
985                                         out_buffer_full -= k;
986                                 }
987                         }
988                 }
989         }
990
991 finish:
992         if (ep >= 0)
993                 close_nointr_nofail(ep);
994
995         if (signal_fd >= 0)
996                 close_nointr_nofail(signal_fd);
997
998         return r;
999 }
1000
1001 int main(int argc, char *argv[]) {
1002         pid_t pid = 0;
1003         int r = EXIT_FAILURE, k;
1004         char *oldcg = NULL, *newcg = NULL;
1005         char **controller = NULL;
1006         int master = -1;
1007         const char *console = NULL;
1008         struct termios saved_attr, raw_attr;
1009         sigset_t mask;
1010         bool saved_attr_valid = false;
1011         struct winsize ws;
1012         int kmsg_socket_pair[2] = { -1, -1 };
1013
1014         log_parse_environment();
1015         log_open();
1016
1017         r = parse_argv(argc, argv);
1018         if (r <= 0)
1019                 goto finish;
1020
1021         if (arg_directory) {
1022                 char *p;
1023
1024                 p = path_make_absolute_cwd(arg_directory);
1025                 free(arg_directory);
1026                 arg_directory = p;
1027         } else
1028                 arg_directory = get_current_dir_name();
1029
1030         if (!arg_directory) {
1031                 log_error("Failed to determine path");
1032                 goto finish;
1033         }
1034
1035         path_kill_slashes(arg_directory);
1036
1037         if (geteuid() != 0) {
1038                 log_error("Need to be root.");
1039                 goto finish;
1040         }
1041
1042         if (sd_booted() <= 0) {
1043                 log_error("Not running on a systemd system.");
1044                 goto finish;
1045         }
1046
1047         if (path_equal(arg_directory, "/")) {
1048                 log_error("Spawning container on root directory not supported.");
1049                 goto finish;
1050         }
1051
1052         if (is_os_tree(arg_directory) <= 0) {
1053                 log_error("Directory %s doesn't look like an OS root directory. Refusing.", arg_directory);
1054                 goto finish;
1055         }
1056
1057         k = cg_get_by_pid(SYSTEMD_CGROUP_CONTROLLER, 0, &oldcg);
1058         if (k < 0) {
1059                 log_error("Failed to determine current cgroup: %s", strerror(-k));
1060                 goto finish;
1061         }
1062
1063         if (asprintf(&newcg, "%s/nspawn-%lu", oldcg, (unsigned long) getpid()) < 0) {
1064                 log_error("Failed to allocate cgroup path.");
1065                 goto finish;
1066         }
1067
1068         k = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, newcg, 0);
1069         if (k < 0)  {
1070                 log_error("Failed to create cgroup: %s", strerror(-k));
1071                 goto finish;
1072         }
1073
1074         STRV_FOREACH(controller, arg_controllers) {
1075                 k = cg_create_and_attach(*controller, newcg, 0);
1076                 if (k < 0)
1077                         log_warning("Failed to create cgroup in controller %s: %s", *controller, strerror(-k));
1078         }
1079
1080         master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
1081         if (master < 0) {
1082                 log_error("Failed to acquire pseudo tty: %m");
1083                 goto finish;
1084         }
1085
1086         console = ptsname(master);
1087         if (!console) {
1088                 log_error("Failed to determine tty name: %m");
1089                 goto finish;
1090         }
1091
1092         log_info("Spawning namespace container on %s (console is %s).", arg_directory, console);
1093
1094         if (ioctl(STDIN_FILENO, TIOCGWINSZ, &ws) >= 0)
1095                 ioctl(master, TIOCSWINSZ, &ws);
1096
1097         if (unlockpt(master) < 0) {
1098                 log_error("Failed to unlock tty: %m");
1099                 goto finish;
1100         }
1101
1102         if (tcgetattr(STDIN_FILENO, &saved_attr) < 0) {
1103                 log_error("Failed to get terminal attributes: %m");
1104                 goto finish;
1105         }
1106
1107         saved_attr_valid = true;
1108
1109         raw_attr = saved_attr;
1110         cfmakeraw(&raw_attr);
1111         raw_attr.c_lflag &= ~ECHO;
1112
1113         if (tcsetattr(STDIN_FILENO, TCSANOW, &raw_attr) < 0) {
1114                 log_error("Failed to set terminal attributes: %m");
1115                 goto finish;
1116         }
1117
1118         if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_NONBLOCK|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
1119                 log_error("Failed to create kmsg socket pair");
1120                 goto finish;
1121         }
1122
1123         assert_se(sigemptyset(&mask) == 0);
1124         sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
1125         assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
1126
1127         pid = syscall(__NR_clone, SIGCHLD|CLONE_NEWIPC|CLONE_NEWNS|CLONE_NEWPID|CLONE_NEWUTS|(arg_private_network ? CLONE_NEWNET : 0), NULL);
1128         if (pid < 0) {
1129                 if (errno == EINVAL)
1130                         log_error("clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
1131                 else
1132                         log_error("clone() failed: %m");
1133
1134                 goto finish;
1135         }
1136
1137         if (pid == 0) {
1138                 /* child */
1139
1140                 const char *home = NULL;
1141                 uid_t uid = (uid_t) -1;
1142                 gid_t gid = (gid_t) -1;
1143                 const char *envp[] = {
1144                         "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
1145                         "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
1146                         NULL, /* TERM */
1147                         NULL, /* HOME */
1148                         NULL, /* USER */
1149                         NULL, /* LOGNAME */
1150                         NULL, /* container_uuid */
1151                         NULL
1152                 };
1153
1154                 envp[2] = strv_find_prefix(environ, "TERM=");
1155
1156                 close_nointr_nofail(master);
1157
1158                 close_nointr(STDIN_FILENO);
1159                 close_nointr(STDOUT_FILENO);
1160                 close_nointr(STDERR_FILENO);
1161
1162                 close_all_fds(&kmsg_socket_pair[1], 1);
1163
1164                 reset_all_signal_handlers();
1165
1166                 assert_se(sigemptyset(&mask) == 0);
1167                 assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
1168
1169                 if (open_terminal(console, O_RDWR) != STDIN_FILENO ||
1170                     dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
1171                     dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO)
1172                         goto child_fail;
1173
1174                 if (setsid() < 0) {
1175                         log_error("setsid() failed: %m");
1176                         goto child_fail;
1177                 }
1178
1179                 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
1180                         log_error("PR_SET_PDEATHSIG failed: %m");
1181                         goto child_fail;
1182                 }
1183
1184                 /* Mark everything as slave, so that we still
1185                  * receive mounts from the real root, but don't
1186                  * propagate mounts to the real root. */
1187                 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
1188                         log_error("MS_SLAVE|MS_REC failed: %m");
1189                         goto child_fail;
1190                 }
1191
1192                 /* Turn directory into bind mount */
1193                 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REC, NULL) < 0) {
1194                         log_error("Failed to make bind mount.");
1195                         goto child_fail;
1196                 }
1197
1198                 if (arg_read_only)
1199                         if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY|MS_REC, NULL) < 0) {
1200                                 log_error("Failed to make read-only.");
1201                                 goto child_fail;
1202                         }
1203
1204                 if (mount_all(arg_directory) < 0)
1205                         goto child_fail;
1206
1207                 if (copy_devnodes(arg_directory) < 0)
1208                         goto child_fail;
1209
1210                 dev_setup(arg_directory);
1211
1212                 if (setup_dev_console(arg_directory, console) < 0)
1213                         goto child_fail;
1214
1215                 if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
1216                         goto child_fail;
1217
1218                 close_nointr_nofail(kmsg_socket_pair[1]);
1219
1220                 if (setup_timezone(arg_directory) < 0)
1221                         goto child_fail;
1222
1223                 if (setup_resolv_conf(arg_directory) < 0)
1224                         goto child_fail;
1225
1226                 if (setup_journal(arg_directory) < 0)
1227                         goto child_fail;
1228
1229                 if (chdir(arg_directory) < 0) {
1230                         log_error("chdir(%s) failed: %m", arg_directory);
1231                         goto child_fail;
1232                 }
1233
1234                 if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) {
1235                         log_error("mount(MS_MOVE) failed: %m");
1236                         goto child_fail;
1237                 }
1238
1239                 if (chroot(".") < 0) {
1240                         log_error("chroot() failed: %m");
1241                         goto child_fail;
1242                 }
1243
1244                 if (chdir("/") < 0) {
1245                         log_error("chdir() failed: %m");
1246                         goto child_fail;
1247                 }
1248
1249                 umask(0022);
1250
1251                 loopback_setup();
1252
1253                 if (drop_capabilities() < 0) {
1254                         log_error("drop_capabilities() failed: %m");
1255                         goto child_fail;
1256                 }
1257
1258                 if (arg_user) {
1259
1260                         if (get_user_creds((const char**)&arg_user, &uid, &gid, &home, NULL) < 0) {
1261                                 log_error("get_user_creds() failed: %m");
1262                                 goto child_fail;
1263                         }
1264
1265                         if (mkdir_parents_label(home, 0775) < 0) {
1266                                 log_error("mkdir_parents_label() failed: %m");
1267                                 goto child_fail;
1268                         }
1269
1270                         if (mkdir_safe_label(home, 0775, uid, gid) < 0) {
1271                                 log_error("mkdir_safe_label() failed: %m");
1272                                 goto child_fail;
1273                         }
1274
1275                         if (initgroups((const char*)arg_user, gid) < 0) {
1276                                 log_error("initgroups() failed: %m");
1277                                 goto child_fail;
1278                         }
1279
1280                         if (setresgid(gid, gid, gid) < 0) {
1281                                 log_error("setregid() failed: %m");
1282                                 goto child_fail;
1283                         }
1284
1285                         if (setresuid(uid, uid, uid) < 0) {
1286                                 log_error("setreuid() failed: %m");
1287                                 goto child_fail;
1288                         }
1289                 }
1290
1291                 if ((asprintf((char**)(envp + 3), "HOME=%s", home ? home: "/root") < 0) ||
1292                     (asprintf((char**)(envp + 4), "USER=%s", arg_user ? arg_user : "root") < 0) ||
1293                     (asprintf((char**)(envp + 5), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
1294                     log_oom();
1295                     goto child_fail;
1296                 }
1297
1298                 if (arg_uuid) {
1299                         if (asprintf((char**)(envp + 6), "container_uuid=%s", arg_uuid) < 0) {
1300                                 log_oom();
1301                                 goto child_fail;
1302                         }
1303                 }
1304
1305                 setup_hostname();
1306
1307                 if (arg_boot) {
1308                         char **a;
1309                         size_t l;
1310
1311                         /* Automatically search for the init system */
1312
1313                         l = 1 + argc - optind;
1314                         a = newa(char*, l + 1);
1315                         memcpy(a + 1, argv + optind, l * sizeof(char*));
1316
1317                         a[0] = (char*) "/usr/lib/systemd/systemd";
1318                         execve(a[0], a, (char**) envp);
1319
1320                         a[0] = (char*) "/lib/systemd/systemd";
1321                         execve(a[0], a, (char**) envp);
1322
1323                         a[0] = (char*) "/sbin/init";
1324                         execve(a[0], a, (char**) envp);
1325                 } else if (argc > optind)
1326                         execvpe(argv[optind], argv + optind, (char**) envp);
1327                 else {
1328                         chdir(home ? home : "/root");
1329                         execle("/bin/bash", "-bash", NULL, (char**) envp);
1330                 }
1331
1332                 log_error("execv() failed: %m");
1333
1334         child_fail:
1335                 _exit(EXIT_FAILURE);
1336         }
1337
1338         if (process_pty(master, &mask) < 0)
1339                 goto finish;
1340
1341         if (saved_attr_valid) {
1342                 tcsetattr(STDIN_FILENO, TCSANOW, &saved_attr);
1343                 saved_attr_valid = false;
1344         }
1345
1346         r = wait_for_terminate_and_warn(argc > optind ? argv[optind] : "bash", pid);
1347
1348         if (r < 0)
1349                 r = EXIT_FAILURE;
1350
1351 finish:
1352         if (saved_attr_valid)
1353                 tcsetattr(STDIN_FILENO, TCSANOW, &saved_attr);
1354
1355         if (master >= 0)
1356                 close_nointr_nofail(master);
1357
1358         close_pipe(kmsg_socket_pair);
1359
1360         if (oldcg)
1361                 cg_attach(SYSTEMD_CGROUP_CONTROLLER, oldcg, 0);
1362
1363         if (newcg)
1364                 cg_kill_recursive_and_wait(SYSTEMD_CGROUP_CONTROLLER, newcg, true);
1365
1366         free(arg_directory);
1367         strv_free(arg_controllers);
1368         free(oldcg);
1369         free(newcg);
1370
1371         return r;
1372 }