chiark / gitweb /
nspawn: if a file system comes pre-mounted, still do the read-only remounts
[elogind.git] / src / nspawn / nspawn.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4   This file is part of systemd.
5
6   Copyright 2010 Lennart Poettering
7
8   systemd is free software; you can redistribute it and/or modify it
9   under the terms of the GNU Lesser General Public License as published by
10   the Free Software Foundation; either version 2.1 of the License, or
11   (at your option) any later version.
12
13   systemd is distributed in the hope that it will be useful, but
14   WITHOUT ANY WARRANTY; without even the implied warranty of
15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16   Lesser General Public License for more details.
17
18   You should have received a copy of the GNU Lesser General Public License
19   along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <signal.h>
23 #include <sched.h>
24 #include <unistd.h>
25 #include <sys/types.h>
26 #include <sys/syscall.h>
27 #include <sys/mount.h>
28 #include <sys/wait.h>
29 #include <stdlib.h>
30 #include <string.h>
31 #include <stdio.h>
32 #include <errno.h>
33 #include <sys/prctl.h>
34 #include <sys/capability.h>
35 #include <getopt.h>
36 #include <sys/epoll.h>
37 #include <termios.h>
38 #include <sys/signalfd.h>
39 #include <grp.h>
40 #include <linux/fs.h>
41 #include <sys/un.h>
42 #include <sys/socket.h>
43
44 #include <systemd/sd-daemon.h>
45
46 #include "log.h"
47 #include "util.h"
48 #include "mkdir.h"
49 #include "audit.h"
50 #include "missing.h"
51 #include "cgroup-util.h"
52 #include "strv.h"
53 #include "path-util.h"
54 #include "loopback-setup.h"
55 #include "sd-id128.h"
56 #include "dev-setup.h"
57
/* How setup_journal() connects the container's journal directory
 * (/var/log/journal/<machine-id>) with the host's. Selected with
 * --link-journal= or -j. */
typedef enum LinkJournal {
        /* Do nothing; setup_journal() returns immediately. */
        LINK_NO,
        /* Default. Bind mount the host's directory into the guest if it
         * exists; quietly give up when the machine ID is missing or empty,
         * or when a journal directory is already a mount point. */
        LINK_AUTO,
        /* Create the host's directory if needed and bind mount it into
         * the guest. */
        LINK_HOST,
        /* Make the host's directory a symlink to the guest's directory. */
        LINK_GUEST
} LinkJournal;
64
/* Command line state, filled in by parse_argv(). */

/* -D/--directory=: canonicalized root directory (allocated by canonicalize_file_name()) */
static char *arg_directory = NULL;
/* -u/--user=: heap copy of the given user name or uid string */
static char *arg_user = NULL;
/* -C/--controllers=: comma-split, de-duplicated string vector */
static char **arg_controllers = NULL;
/* --uuid=: points directly at the option argument, not a copy */
static char *arg_uuid = NULL;
/* --private-network */
static bool arg_private_network = false;
/* --read-only */
static bool arg_read_only = false;
/* -b/--boot */
static bool arg_boot = false;
/* --link-journal=MODE or -j */
static LinkJournal arg_link_journal = LINK_AUTO;
/* Bit mask of capabilities to keep, one bit per CAP_* number.
 * --capability= ORs further bits into this default set, and
 * drop_capabilities() passes the complement of the mask to
 * capability_bounding_set_drop(). */
static uint64_t arg_retain =
        (1ULL << CAP_CHOWN) |
        (1ULL << CAP_DAC_OVERRIDE) |
        (1ULL << CAP_DAC_READ_SEARCH) |
        (1ULL << CAP_FOWNER) |
        (1ULL << CAP_FSETID) |
        (1ULL << CAP_IPC_OWNER) |
        (1ULL << CAP_KILL) |
        (1ULL << CAP_LEASE) |
        (1ULL << CAP_LINUX_IMMUTABLE) |
        (1ULL << CAP_NET_BIND_SERVICE) |
        (1ULL << CAP_NET_BROADCAST) |
        (1ULL << CAP_NET_RAW) |
        (1ULL << CAP_SETGID) |
        (1ULL << CAP_SETFCAP) |
        (1ULL << CAP_SETPCAP) |
        (1ULL << CAP_SETUID) |
        (1ULL << CAP_SYS_ADMIN) |
        (1ULL << CAP_SYS_CHROOT) |
        (1ULL << CAP_SYS_NICE) |
        (1ULL << CAP_SYS_PTRACE) |
        (1ULL << CAP_SYS_TTY_CONFIG) |
        (1ULL << CAP_SYS_RESOURCE);
96
97 static int help(void) {
98
99         printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
100                "Spawn a minimal namespace container for debugging, testing and building.\n\n"
101                "  -h --help               Show this help\n"
102                "  -D --directory=NAME     Root directory for the container\n"
103                "  -b --boot               Boot up full system (i.e. invoke init)\n"
104                "  -u --user=USER          Run the command under specified user or uid\n"
105                "  -C --controllers=LIST   Put the container in specified comma-separated cgroup hierarchies\n"
106                "     --uuid=UUID          Set a specific machine UUID for the container\n"
107                "     --private-network    Disable network in container\n"
108                "     --read-only          Mount the root directory read-only\n"
109                "     --capability=CAP     In addition to the default, retain specified capability\n"
110                "     --link-journal=MODE  Link up guest journal, one of no, auto, guest, host\n"
111                "  -j                      Equivalent to --link-journal=host\n",
112                program_invocation_short_name);
113
114         return 0;
115 }
116
117 static int parse_argv(int argc, char *argv[]) {
118
119         enum {
120                 ARG_PRIVATE_NETWORK = 0x100,
121                 ARG_UUID,
122                 ARG_READ_ONLY,
123                 ARG_CAPABILITY,
124                 ARG_LINK_JOURNAL
125         };
126
127         static const struct option options[] = {
128                 { "help",            no_argument,       NULL, 'h'                 },
129                 { "directory",       required_argument, NULL, 'D'                 },
130                 { "user",            required_argument, NULL, 'u'                 },
131                 { "controllers",     required_argument, NULL, 'C'                 },
132                 { "private-network", no_argument,       NULL, ARG_PRIVATE_NETWORK },
133                 { "boot",            no_argument,       NULL, 'b'                 },
134                 { "uuid",            required_argument, NULL, ARG_UUID            },
135                 { "read-only",       no_argument,       NULL, ARG_READ_ONLY       },
136                 { "capability",      required_argument, NULL, ARG_CAPABILITY      },
137                 { "link-journal",    required_argument, NULL, ARG_LINK_JOURNAL    },
138                 { NULL,              0,                 NULL, 0                   }
139         };
140
141         int c;
142
143         assert(argc >= 0);
144         assert(argv);
145
146         while ((c = getopt_long(argc, argv, "+hD:u:C:bj", options, NULL)) >= 0) {
147
148                 switch (c) {
149
150                 case 'h':
151                         help();
152                         return 0;
153
154                 case 'D':
155                         free(arg_directory);
156                         arg_directory = canonicalize_file_name(optarg);
157                         if (!arg_directory) {
158                                 log_error("Failed to canonicalize root directory.");
159                                 return -ENOMEM;
160                         }
161
162                         break;
163
164                 case 'u':
165                         free(arg_user);
166                         if (!(arg_user = strdup(optarg))) {
167                                 log_error("Failed to duplicate user name.");
168                                 return -ENOMEM;
169                         }
170
171                         break;
172
173                 case 'C':
174                         strv_free(arg_controllers);
175                         arg_controllers = strv_split(optarg, ",");
176                         if (!arg_controllers) {
177                                 log_error("Failed to split controllers list.");
178                                 return -ENOMEM;
179                         }
180                         strv_uniq(arg_controllers);
181
182                         break;
183
184                 case ARG_PRIVATE_NETWORK:
185                         arg_private_network = true;
186                         break;
187
188                 case 'b':
189                         arg_boot = true;
190                         break;
191
192                 case ARG_UUID:
193                         arg_uuid = optarg;
194                         break;
195
196                 case ARG_READ_ONLY:
197                         arg_read_only = true;
198                         break;
199
200                 case ARG_CAPABILITY: {
201                         char *state, *word;
202                         size_t length;
203
204                         FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
205                                 cap_value_t cap;
206                                 char *t;
207
208                                 t = strndup(word, length);
209                                 if (!t)
210                                         return log_oom();
211
212                                 if (cap_from_name(t, &cap) < 0) {
213                                         log_error("Failed to parse capability %s.", t);
214                                         free(t);
215                                         return -EINVAL;
216                                 }
217
218                                 free(t);
219                                 arg_retain |= 1ULL << (uint64_t) cap;
220                         }
221
222                         break;
223                 }
224
225                 case 'j':
226                         arg_link_journal = LINK_GUEST;
227                         break;
228
229                 case ARG_LINK_JOURNAL:
230                         if (streq(optarg, "auto"))
231                                 arg_link_journal = LINK_AUTO;
232                         else if (streq(optarg, "no"))
233                                 arg_link_journal = LINK_NO;
234                         else if (streq(optarg, "guest"))
235                                 arg_link_journal = LINK_GUEST;
236                         else if (streq(optarg, "host"))
237                                 arg_link_journal = LINK_HOST;
238                         else {
239                                 log_error("Failed to parse link journal mode %s", optarg);
240                                 return -EINVAL;
241                         }
242
243                         break;
244
245                 case '?':
246                         return -EINVAL;
247
248                 default:
249                         log_error("Unknown option code %c", c);
250                         return -EINVAL;
251                 }
252         }
253
254         return 1;
255 }
256
/* Mount the file systems listed in mount_table below the directory dest.
 *
 * Entries with a source ("what") are skipped when the target already is a
 * mount point; entries without a source are remounts and are always
 * applied, so that a pre-mounted file system still gets its read-only
 * remount. Processing continues after a failure; only an allocation
 * failure stops the loop.
 *
 * Returns 0 on success, otherwise the negative errno-style code of the
 * first failure. Mount failures of entries not marked fatal are ignored. */
static int mount_all(const char *dest) {

        typedef struct MountPoint {
                const char *what;
                const char *where;
                const char *type;
                const char *options;
                unsigned long flags;
                bool fatal;
        } MountPoint;

        static const MountPoint mount_table[] = {
                { "proc",      "/proc",     "proc",  NULL,       MS_NOSUID|MS_NOEXEC|MS_NODEV, true  },
                { "/proc/sys", "/proc/sys", NULL,    NULL,       MS_BIND, true                       },   /* Bind mount first */
                { NULL,        "/proc/sys", NULL,    NULL,       MS_BIND|MS_RDONLY|MS_REMOUNT, true  },   /* Then, make it r/o */
                { "sysfs",     "/sys",      "sysfs", NULL,       MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true  },
                { "tmpfs",     "/dev",      "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME,     true  },
                { "/dev/pts",  "/dev/pts",  NULL,    NULL,       MS_BIND,                      true  },
                { "tmpfs",     "/run",      "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true  },
#ifdef HAVE_SELINUX
                { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND,                      false },  /* Bind mount first */
                { NULL,              "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, false },  /* Then, make it r/o */
#endif
        };

        unsigned k;
        int r = 0;

        for (k = 0; k < ELEMENTSOF(mount_table); k++) {
                char *where;
                int t;

                if (asprintf(&where, "%s/%s", dest, mount_table[k].where) < 0) {
                        log_oom();

                        if (r == 0)
                                r = -ENOMEM;

                        break;
                }

                t = path_is_mount_point(where, true);
                if (t < 0) {
                        log_error("Failed to detect whether %s is a mount point: %s", where, strerror(-t));
                        free(where);

                        if (r == 0)
                                r = t;

                        continue;
                }

                /* Skip this entry if it is not a remount. The path must
                 * still be released, otherwise it leaks on every
                 * pre-mounted entry. */
                if (mount_table[k].what && t > 0) {
                        free(where);
                        continue;
                }

                /* Best effort: if this fails, mount() below reports it. */
                mkdir_p_label(where, 0755);

                if (mount(mount_table[k].what,
                          where,
                          mount_table[k].type,
                          mount_table[k].flags,
                          mount_table[k].options) < 0 &&
                    mount_table[k].fatal) {

                        /* Remember errno before logging can touch it. */
                        int saved_errno = errno;

                        log_error("mount(%s) failed: %m", where);

                        if (r == 0)
                                r = -saved_errno;
                }

                free(where);
        }

        return r;
}
333
/* Make the host's time zone configuration visible inside the container
 * tree at dest by bind mounting /etc/localtime and /etc/timezone over
 * the container's files and then remounting them read-only.
 *
 * Mount failures are ignored (best effort). Returns 0, or the value of
 * log_oom() if a path could not be allocated. */
static int setup_timezone(const char *dest) {
        char *target;
        int n;

        assert(dest);

        /* /etc/localtime */
        n = asprintf(&target, "%s/etc/localtime", dest);
        if (n < 0)
                return log_oom();

        /* Only attempt the read-only remount if the bind mount worked. */
        if (!(mount("/etc/localtime", target, "bind", MS_BIND, NULL) < 0))
                mount("/etc/localtime", target, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);

        free(target);

        /* /etc/timezone */
        n = asprintf(&target, "%s/etc/timezone", dest);
        if (n < 0)
                return log_oom();

        if (!(mount("/etc/timezone", target, "bind", MS_BIND, NULL) < 0))
                mount("/etc/timezone", target, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);

        free(target);

        return 0;
}
358
359 static int setup_resolv_conf(const char *dest) {
360         char *where;
361
362         assert(dest);
363
364         if (arg_private_network)
365                 return 0;
366
367         /* Fix resolv.conf, if possible */
368         if (asprintf(&where, "%s/etc/resolv.conf", dest) < 0) {
369                 return log_oom();
370         }
371
372         if (mount("/etc/resolv.conf", where, "bind", MS_BIND, NULL) >= 0)
373                 mount("/etc/resolv.conf", where, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);
374
375         free(where);
376
377         return 0;
378 }
379
380 static int copy_devnodes(const char *dest) {
381
382         static const char devnodes[] =
383                 "null\0"
384                 "zero\0"
385                 "full\0"
386                 "random\0"
387                 "urandom\0"
388                 "tty\0"
389                 "ptmx\0"
390                 "rtc0\0";
391
392         const char *d;
393         int r = 0;
394         mode_t u;
395
396         assert(dest);
397
398         u = umask(0000);
399
400         NULSTR_FOREACH(d, devnodes) {
401                 struct stat st;
402                 char *from = NULL, *to = NULL;
403
404                 asprintf(&from, "/dev/%s", d);
405                 asprintf(&to, "%s/dev/%s", dest, d);
406
407                 if (!from || !to) {
408                         log_error("Failed to allocate devnode path");
409
410                         free(from);
411                         free(to);
412
413                         from = to = NULL;
414
415                         if (r == 0)
416                                 r = -ENOMEM;
417
418                         break;
419                 }
420
421                 if (stat(from, &st) < 0) {
422
423                         if (errno != ENOENT) {
424                                 log_error("Failed to stat %s: %m", from);
425                                 if (r == 0)
426                                         r = -errno;
427                         }
428
429                 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
430
431                         log_error("%s is not a char or block device, cannot copy.", from);
432                         if (r == 0)
433                                 r = -EIO;
434
435                 } else if (mknod(to, st.st_mode, st.st_rdev) < 0) {
436
437                         log_error("mknod(%s) failed: %m", dest);
438                         if (r == 0)
439                                 r = -errno;
440                 }
441
442                 free(from);
443                 free(to);
444         }
445
446         umask(u);
447
448         return r;
449 }
450
/* Provide /dev/console inside the container tree at dest by bind mounting
 * the TTY at path console over a freshly created device node.
 *
 * The TTY is first set to mode 0600, owner 0:0. Returns the result of
 * chmod_and_chown() on success, or a negative errno-style code after
 * logging the failure. */
static int setup_dev_console(const char *dest, const char *console) {
        struct stat tty_st;
        char *node = NULL;
        mode_t saved_umask;
        int ret;

        assert(dest);
        assert(console);

        /* mknod() below must get exactly the mode we ask for. */
        saved_umask = umask(0000);

        if (stat(console, &tty_st) < 0) {
                log_error("Failed to stat %s: %m", console);
                ret = -errno;
                goto finish;
        }

        if (!S_ISCHR(tty_st.st_mode)) {
                log_error("/dev/console is not a char device.");
                ret = -EIO;
                goto finish;
        }

        ret = chmod_and_chown(console, 0600, 0, 0);
        if (ret < 0) {
                log_error("Failed to correct access mode for TTY: %s", strerror(-ret));
                goto finish;
        }

        if (asprintf(&node, "%s/dev/console", dest) < 0) {
                ret = log_oom();
                goto finish;
        }

        /* A pty can only live on a pts file system, hence the real TTY
         * has to be bind mounted onto /dev/console. The bind mount needs
         * something to sit on, so create a device node first. Its
         * major/minor is taken from the TTY but is irrelevant in
         * practice, as the node is mounted over right away. */
        if (mknod(node, (tty_st.st_mode & ~07777) | 0600, tty_st.st_rdev) < 0) {
                log_error("mknod() for /dev/console failed: %m");
                ret = -errno;
                goto finish;
        }

        if (mount(console, node, "bind", MS_BIND, NULL) < 0) {
                log_error("Bind mount for /dev/console failed: %m");
                ret = -errno;
        }

finish:
        free(node);
        umask(saved_umask);

        return ret;
}
509
510 static int setup_kmsg(const char *dest, int kmsg_socket) {
511         char *from = NULL, *to = NULL;
512         int r, fd, k;
513         mode_t u;
514         union {
515                 struct cmsghdr cmsghdr;
516                 uint8_t buf[CMSG_SPACE(sizeof(int))];
517         } control;
518         struct msghdr mh;
519         struct cmsghdr *cmsg;
520
521         assert(dest);
522         assert(kmsg_socket >= 0);
523
524         u = umask(0000);
525
526         /* We create the kmsg FIFO as /dev/kmsg, but immediately
527          * delete it after bind mounting it to /proc/kmsg. While FIFOs
528          * on the reading side behave very similar to /proc/kmsg,
529          * their writing side behaves differently from /dev/kmsg in
530          * that writing blocks when nothing is reading. In order to
531          * avoid any problems with containers deadlocking due to this
532          * we simply make /dev/kmsg unavailable to the container. */
533         if (asprintf(&from, "%s/dev/kmsg", dest) < 0) {
534                 r = log_oom();
535                 goto finish;
536         }
537
538         if (asprintf(&to, "%s/proc/kmsg", dest) < 0) {
539                 r = log_oom();
540                 goto finish;
541         }
542
543         if (mkfifo(from, 0600) < 0) {
544                 log_error("mkfifo() for /dev/kmsg failed: %m");
545                 r = -errno;
546                 goto finish;
547         }
548
549         r = chmod_and_chown(from, 0600, 0, 0);
550         if (r < 0) {
551                 log_error("Failed to correct access mode for /dev/kmsg: %s", strerror(-r));
552                 goto finish;
553         }
554
555         if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
556                 log_error("Bind mount for /proc/kmsg failed: %m");
557                 r = -errno;
558                 goto finish;
559         }
560
561         fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
562         if (fd < 0) {
563                 log_error("Failed to open fifo: %m");
564                 r = -errno;
565                 goto finish;
566         }
567
568         zero(mh);
569         zero(control);
570
571         mh.msg_control = &control;
572         mh.msg_controllen = sizeof(control);
573
574         cmsg = CMSG_FIRSTHDR(&mh);
575         cmsg->cmsg_level = SOL_SOCKET;
576         cmsg->cmsg_type = SCM_RIGHTS;
577         cmsg->cmsg_len = CMSG_LEN(sizeof(int));
578         memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
579
580         mh.msg_controllen = cmsg->cmsg_len;
581
582         /* Store away the fd in the socket, so that it stays open as
583          * long as we run the child */
584         k = sendmsg(kmsg_socket, &mh, MSG_DONTWAIT|MSG_NOSIGNAL);
585         close_nointr_nofail(fd);
586
587         if (k < 0) {
588                 log_error("Failed to send FIFO fd: %m");
589                 r = -errno;
590                 goto finish;
591         }
592
593         /* And now make the FIFO unavailable as /dev/kmsg... */
594         unlink(from);
595
596 finish:
597         free(from);
598         free(to);
599         umask(u);
600
601         return r;
602 }
603
604 static int setup_hostname(void) {
605         char *hn;
606         int r = 0;
607
608         hn = path_get_file_name(arg_directory);
609         if (hn) {
610                 hn = strdup(hn);
611                 if (!hn)
612                         return -ENOMEM;
613
614                 hostname_cleanup(hn);
615
616                 if (!isempty(hn))
617                         if (sethostname(hn, strlen(hn)) < 0)
618                                 r = -errno;
619
620                 free(hn);
621         }
622
623         return r;
624 }
625
626 static int setup_journal(const char *directory) {
627         sd_id128_t machine_id;
628         char *p = NULL, *b = NULL, *l, *q = NULL, *d = NULL;
629         int r;
630
631         if (arg_link_journal == LINK_NO)
632                 return 0;
633
634         p = strappend(directory, "/etc/machine-id");
635         if (!p) {
636                 r = log_oom();
637                 goto finish;
638         }
639
640         r = read_one_line_file(p, &b);
641         if (r == -ENOENT && arg_link_journal == LINK_AUTO) {
642                 r = 0;
643                 goto finish;
644         } else if (r < 0) {
645                 log_error("Failed to read machine ID: %s", strerror(-r));
646                 return r;
647         }
648
649         l = strstrip(b);
650         if (isempty(l) && arg_link_journal == LINK_AUTO) {
651                 r = 0;
652                 goto finish;
653         }
654
655         /* Verify validaty */
656         r = sd_id128_from_string(l, &machine_id);
657         if (r < 0) {
658                 log_error("Failed to parse machine ID: %s", strerror(-r));
659                 goto finish;
660         }
661
662         free(p);
663         p = strappend("/var/log/journal/", l);
664         q = strjoin(directory, "/var/log/journal/", l, NULL);
665         if (!p || !q) {
666                 r = log_oom();
667                 goto finish;
668         }
669
670         if (path_is_mount_point(p, false) > 0 ||
671             path_is_mount_point(q, false) > 0) {
672                 if (arg_link_journal != LINK_AUTO) {
673                         log_error("Journal already a mount point, refusing.");
674                         r = -EEXIST;
675                         goto finish;
676                 }
677
678                 r = 0;
679                 goto finish;
680         }
681
682         r = readlink_and_make_absolute(p, &d);
683         if (r >= 0) {
684                 if ((arg_link_journal == LINK_GUEST ||
685                      arg_link_journal == LINK_AUTO) &&
686                     path_equal(d, q)) {
687
688                         mkdir_p(q, 0755);
689
690                         r = 0;
691                         goto finish;
692                 }
693
694                 if (unlink(p) < 0) {
695                         log_error("Failed to remove symlink %s: %m", p);
696                         r = -errno;
697                         goto finish;
698                 }
699         } else if (r == -EINVAL) {
700
701                 if (arg_link_journal == LINK_GUEST &&
702                     rmdir(p) < 0) {
703
704                         if (errno == ENOTDIR)
705                                 log_error("%s already exists and is neither symlink nor directory.", p);
706                         else {
707                                 log_error("Failed to remove %s: %m", p);
708                                 r = -errno;
709                         }
710
711                         goto finish;
712                 }
713         } else if (r != -ENOENT) {
714                 log_error("readlink(%s) failed: %m", p);
715                 goto finish;
716         }
717
718         if (arg_link_journal == LINK_GUEST) {
719
720                 if (symlink(q, p) < 0) {
721                         log_error("Failed to symlink %s to %s: %m", q, p);
722                         r = -errno;
723                         goto finish;
724                 }
725
726                 mkdir_p(q, 0755);
727
728                 r = 0;
729                 goto finish;
730         }
731
732         if (arg_link_journal == LINK_HOST) {
733                 r = mkdir_p(p, 0755);
734                 if (r < 0) {
735                         log_error("Failed to create %s: %m", p);
736                         goto finish;
737                 }
738
739         } else if (access(p, F_OK) < 0) {
740                 r = 0;
741                 goto finish;
742         }
743
744         if (dir_is_empty(q) == 0) {
745                 log_error("%s not empty.", q);
746                 r = -ENOTEMPTY;
747                 goto finish;
748         }
749
750         r = mkdir_p(q, 0755);
751         if (r < 0) {
752                 log_error("Failed to create %s: %m", q);
753                 goto finish;
754         }
755
756         if (mount(p, q, "bind", MS_BIND, NULL) < 0) {
757                 log_error("Failed to bind mount journal from host into guest: %m");
758                 r = -errno;
759                 goto finish;
760         }
761
762         r = 0;
763
764 finish:
765         free(p);
766         free(q);
767         free(d);
768         free(b);
769         return r;
770
771 }
772
773 static int drop_capabilities(void) {
774         return capability_bounding_set_drop(~arg_retain, false);
775 }
776
/* Check whether path looks like the root of an OS tree. The presence of
 * <path>/bin/sh is used as the marker.
 *
 * Returns 1 if the marker exists, 0 if it cannot be accessed, and -ENOMEM
 * if the path to test could not be allocated. */
static int is_os_tree(const char *path) {
        char *marker;
        int found;

        if (asprintf(&marker, "%s/bin/sh", path) < 0)
                return -ENOMEM;

        found = access(marker, F_OK) >= 0;
        free(marker);

        return found ? 1 : 0;
}
790
791 static int process_pty(int master, sigset_t *mask) {
792
793         char in_buffer[LINE_MAX], out_buffer[LINE_MAX];
794         size_t in_buffer_full = 0, out_buffer_full = 0;
795         struct epoll_event stdin_ev, stdout_ev, master_ev, signal_ev;
796         bool stdin_readable = false, stdout_writable = false, master_readable = false, master_writable = false;
797         int ep = -1, signal_fd = -1, r;
798
799         fd_nonblock(STDIN_FILENO, 1);
800         fd_nonblock(STDOUT_FILENO, 1);
801         fd_nonblock(master, 1);
802
803         signal_fd = signalfd(-1, mask, SFD_NONBLOCK|SFD_CLOEXEC);
804         if (signal_fd < 0) {
805                 log_error("signalfd(): %m");
806                 r = -errno;
807                 goto finish;
808         }
809
810         ep = epoll_create1(EPOLL_CLOEXEC);
811         if (ep < 0) {
812                 log_error("Failed to create epoll: %m");
813                 r = -errno;
814                 goto finish;
815         }
816
817         zero(stdin_ev);
818         stdin_ev.events = EPOLLIN|EPOLLET;
819         stdin_ev.data.fd = STDIN_FILENO;
820
821         zero(stdout_ev);
822         stdout_ev.events = EPOLLOUT|EPOLLET;
823         stdout_ev.data.fd = STDOUT_FILENO;
824
825         zero(master_ev);
826         master_ev.events = EPOLLIN|EPOLLOUT|EPOLLET;
827         master_ev.data.fd = master;
828
829         zero(signal_ev);
830         signal_ev.events = EPOLLIN;
831         signal_ev.data.fd = signal_fd;
832
833         if (epoll_ctl(ep, EPOLL_CTL_ADD, STDIN_FILENO, &stdin_ev) < 0 ||
834             epoll_ctl(ep, EPOLL_CTL_ADD, STDOUT_FILENO, &stdout_ev) < 0 ||
835             epoll_ctl(ep, EPOLL_CTL_ADD, master, &master_ev) < 0 ||
836             epoll_ctl(ep, EPOLL_CTL_ADD, signal_fd, &signal_ev) < 0) {
837                 log_error("Failed to regiser fds in epoll: %m");
838                 r = -errno;
839                 goto finish;
840         }
841
842         for (;;) {
843                 struct epoll_event ev[16];
844                 ssize_t k;
845                 int i, nfds;
846
847                 nfds = epoll_wait(ep, ev, ELEMENTSOF(ev), -1);
848                 if (nfds < 0) {
849
850                         if (errno == EINTR || errno == EAGAIN)
851                                 continue;
852
853                         log_error("epoll_wait(): %m");
854                         r = -errno;
855                         goto finish;
856                 }
857
858                 assert(nfds >= 1);
859
860                 for (i = 0; i < nfds; i++) {
861                         if (ev[i].data.fd == STDIN_FILENO) {
862
863                                 if (ev[i].events & (EPOLLIN|EPOLLHUP))
864                                         stdin_readable = true;
865
866                         } else if (ev[i].data.fd == STDOUT_FILENO) {
867
868                                 if (ev[i].events & (EPOLLOUT|EPOLLHUP))
869                                         stdout_writable = true;
870
871                         } else if (ev[i].data.fd == master) {
872
873                                 if (ev[i].events & (EPOLLIN|EPOLLHUP))
874                                         master_readable = true;
875
876                                 if (ev[i].events & (EPOLLOUT|EPOLLHUP))
877                                         master_writable = true;
878
879                         } else if (ev[i].data.fd == signal_fd) {
880                                 struct signalfd_siginfo sfsi;
881                                 ssize_t n;
882
883                                 n = read(signal_fd, &sfsi, sizeof(sfsi));
884                                 if (n != sizeof(sfsi)) {
885
886                                         if (n >= 0) {
887                                                 log_error("Failed to read from signalfd: invalid block size");
888                                                 r = -EIO;
889                                                 goto finish;
890                                         }
891
892                                         if (errno != EINTR && errno != EAGAIN) {
893                                                 log_error("Failed to read from signalfd: %m");
894                                                 r = -errno;
895                                                 goto finish;
896                                         }
897                                 } else {
898
899                                         if (sfsi.ssi_signo == SIGWINCH) {
900                                                 struct winsize ws;
901
902                                                 /* The window size changed, let's forward that. */
903                                                 if (ioctl(STDIN_FILENO, TIOCGWINSZ, &ws) >= 0)
904                                                         ioctl(master, TIOCSWINSZ, &ws);
905                                         } else {
906                                                 r = 0;
907                                                 goto finish;
908                                         }
909                                 }
910                         }
911                 }
912
913                 while ((stdin_readable && in_buffer_full <= 0) ||
914                        (master_writable && in_buffer_full > 0) ||
915                        (master_readable && out_buffer_full <= 0) ||
916                        (stdout_writable && out_buffer_full > 0)) {
917
918                         if (stdin_readable && in_buffer_full < LINE_MAX) {
919
920                                 k = read(STDIN_FILENO, in_buffer + in_buffer_full, LINE_MAX - in_buffer_full);
921                                 if (k < 0) {
922
923                                         if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
924                                                 stdin_readable = false;
925                                         else {
926                                                 log_error("read(): %m");
927                                                 r = -errno;
928                                                 goto finish;
929                                         }
930                                 } else
931                                         in_buffer_full += (size_t) k;
932                         }
933
934                         if (master_writable && in_buffer_full > 0) {
935
936                                 k = write(master, in_buffer, in_buffer_full);
937                                 if (k < 0) {
938
939                                         if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
940                                                 master_writable = false;
941                                         else {
942                                                 log_error("write(): %m");
943                                                 r = -errno;
944                                                 goto finish;
945                                         }
946
947                                 } else {
948                                         assert(in_buffer_full >= (size_t) k);
949                                         memmove(in_buffer, in_buffer + k, in_buffer_full - k);
950                                         in_buffer_full -= k;
951                                 }
952                         }
953
954                         if (master_readable && out_buffer_full < LINE_MAX) {
955
956                                 k = read(master, out_buffer + out_buffer_full, LINE_MAX - out_buffer_full);
957                                 if (k < 0) {
958
959                                         if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
960                                                 master_readable = false;
961                                         else {
962                                                 log_error("read(): %m");
963                                                 r = -errno;
964                                                 goto finish;
965                                         }
966                                 }  else
967                                         out_buffer_full += (size_t) k;
968                         }
969
970                         if (stdout_writable && out_buffer_full > 0) {
971
972                                 k = write(STDOUT_FILENO, out_buffer, out_buffer_full);
973                                 if (k < 0) {
974
975                                         if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
976                                                 stdout_writable = false;
977                                         else {
978                                                 log_error("write(): %m");
979                                                 r = -errno;
980                                                 goto finish;
981                                         }
982
983                                 } else {
984                                         assert(out_buffer_full >= (size_t) k);
985                                         memmove(out_buffer, out_buffer + k, out_buffer_full - k);
986                                         out_buffer_full -= k;
987                                 }
988                         }
989                 }
990         }
991
992 finish:
993         if (ep >= 0)
994                 close_nointr_nofail(ep);
995
996         if (signal_fd >= 0)
997                 close_nointr_nofail(signal_fd);
998
999         return r;
1000 }
1001
1002 int main(int argc, char *argv[]) {
1003         pid_t pid = 0;
1004         int r = EXIT_FAILURE, k;
1005         char *oldcg = NULL, *newcg = NULL;
1006         char **controller = NULL;
1007         int master = -1;
1008         const char *console = NULL;
1009         struct termios saved_attr, raw_attr;
1010         sigset_t mask;
1011         bool saved_attr_valid = false;
1012         struct winsize ws;
1013         int kmsg_socket_pair[2] = { -1, -1 };
1014
1015         log_parse_environment();
1016         log_open();
1017
1018         r = parse_argv(argc, argv);
1019         if (r <= 0)
1020                 goto finish;
1021
1022         if (arg_directory) {
1023                 char *p;
1024
1025                 p = path_make_absolute_cwd(arg_directory);
1026                 free(arg_directory);
1027                 arg_directory = p;
1028         } else
1029                 arg_directory = get_current_dir_name();
1030
1031         if (!arg_directory) {
1032                 log_error("Failed to determine path");
1033                 goto finish;
1034         }
1035
1036         path_kill_slashes(arg_directory);
1037
1038         if (geteuid() != 0) {
1039                 log_error("Need to be root.");
1040                 goto finish;
1041         }
1042
1043         if (sd_booted() <= 0) {
1044                 log_error("Not running on a systemd system.");
1045                 goto finish;
1046         }
1047
1048         if (path_equal(arg_directory, "/")) {
1049                 log_error("Spawning container on root directory not supported.");
1050                 goto finish;
1051         }
1052
1053         if (is_os_tree(arg_directory) <= 0) {
1054                 log_error("Directory %s doesn't look like an OS root directory. Refusing.", arg_directory);
1055                 goto finish;
1056         }
1057
1058         k = cg_get_by_pid(SYSTEMD_CGROUP_CONTROLLER, 0, &oldcg);
1059         if (k < 0) {
1060                 log_error("Failed to determine current cgroup: %s", strerror(-k));
1061                 goto finish;
1062         }
1063
1064         if (asprintf(&newcg, "%s/nspawn-%lu", oldcg, (unsigned long) getpid()) < 0) {
1065                 log_error("Failed to allocate cgroup path.");
1066                 goto finish;
1067         }
1068
1069         k = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, newcg, 0);
1070         if (k < 0)  {
1071                 log_error("Failed to create cgroup: %s", strerror(-k));
1072                 goto finish;
1073         }
1074
1075         STRV_FOREACH(controller, arg_controllers) {
1076                 k = cg_create_and_attach(*controller, newcg, 0);
1077                 if (k < 0)
1078                         log_warning("Failed to create cgroup in controller %s: %s", *controller, strerror(-k));
1079         }
1080
1081         master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
1082         if (master < 0) {
1083                 log_error("Failed to acquire pseudo tty: %m");
1084                 goto finish;
1085         }
1086
1087         console = ptsname(master);
1088         if (!console) {
1089                 log_error("Failed to determine tty name: %m");
1090                 goto finish;
1091         }
1092
1093         log_info("Spawning namespace container on %s (console is %s).", arg_directory, console);
1094
1095         if (ioctl(STDIN_FILENO, TIOCGWINSZ, &ws) >= 0)
1096                 ioctl(master, TIOCSWINSZ, &ws);
1097
1098         if (unlockpt(master) < 0) {
1099                 log_error("Failed to unlock tty: %m");
1100                 goto finish;
1101         }
1102
1103         if (tcgetattr(STDIN_FILENO, &saved_attr) < 0) {
1104                 log_error("Failed to get terminal attributes: %m");
1105                 goto finish;
1106         }
1107
1108         saved_attr_valid = true;
1109
1110         raw_attr = saved_attr;
1111         cfmakeraw(&raw_attr);
1112         raw_attr.c_lflag &= ~ECHO;
1113
1114         if (tcsetattr(STDIN_FILENO, TCSANOW, &raw_attr) < 0) {
1115                 log_error("Failed to set terminal attributes: %m");
1116                 goto finish;
1117         }
1118
1119         if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_NONBLOCK|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
1120                 log_error("Failed to create kmsg socket pair");
1121                 goto finish;
1122         }
1123
1124         assert_se(sigemptyset(&mask) == 0);
1125         sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
1126         assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
1127
1128         pid = syscall(__NR_clone, SIGCHLD|CLONE_NEWIPC|CLONE_NEWNS|CLONE_NEWPID|CLONE_NEWUTS|(arg_private_network ? CLONE_NEWNET : 0), NULL);
1129         if (pid < 0) {
1130                 if (errno == EINVAL)
1131                         log_error("clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
1132                 else
1133                         log_error("clone() failed: %m");
1134
1135                 goto finish;
1136         }
1137
1138         if (pid == 0) {
1139                 /* child */
1140
1141                 const char *home = NULL;
1142                 uid_t uid = (uid_t) -1;
1143                 gid_t gid = (gid_t) -1;
1144                 const char *envp[] = {
1145                         "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
1146                         "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
1147                         NULL, /* TERM */
1148                         NULL, /* HOME */
1149                         NULL, /* USER */
1150                         NULL, /* LOGNAME */
1151                         NULL, /* container_uuid */
1152                         NULL
1153                 };
1154
1155                 envp[2] = strv_find_prefix(environ, "TERM=");
1156
1157                 close_nointr_nofail(master);
1158
1159                 close_nointr(STDIN_FILENO);
1160                 close_nointr(STDOUT_FILENO);
1161                 close_nointr(STDERR_FILENO);
1162
1163                 close_all_fds(&kmsg_socket_pair[1], 1);
1164
1165                 reset_all_signal_handlers();
1166
1167                 assert_se(sigemptyset(&mask) == 0);
1168                 assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
1169
1170                 if (open_terminal(console, O_RDWR) != STDIN_FILENO ||
1171                     dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
1172                     dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO)
1173                         goto child_fail;
1174
1175                 if (setsid() < 0) {
1176                         log_error("setsid() failed: %m");
1177                         goto child_fail;
1178                 }
1179
1180                 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
1181                         log_error("PR_SET_PDEATHSIG failed: %m");
1182                         goto child_fail;
1183                 }
1184
1185                 /* Mark everything as slave, so that we still
1186                  * receive mounts from the real root, but don't
1187                  * propagate mounts to the real root. */
1188                 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
1189                         log_error("MS_SLAVE|MS_REC failed: %m");
1190                         goto child_fail;
1191                 }
1192
1193                 /* Turn directory into bind mount */
1194                 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REC, NULL) < 0) {
1195                         log_error("Failed to make bind mount.");
1196                         goto child_fail;
1197                 }
1198
1199                 if (arg_read_only)
1200                         if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY|MS_REC, NULL) < 0) {
1201                                 log_error("Failed to make read-only.");
1202                                 goto child_fail;
1203                         }
1204
1205                 if (mount_all(arg_directory) < 0)
1206                         goto child_fail;
1207
1208                 if (copy_devnodes(arg_directory) < 0)
1209                         goto child_fail;
1210
1211                 dev_setup(arg_directory);
1212
1213                 if (setup_dev_console(arg_directory, console) < 0)
1214                         goto child_fail;
1215
1216                 if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
1217                         goto child_fail;
1218
1219                 close_nointr_nofail(kmsg_socket_pair[1]);
1220
1221                 if (setup_timezone(arg_directory) < 0)
1222                         goto child_fail;
1223
1224                 if (setup_resolv_conf(arg_directory) < 0)
1225                         goto child_fail;
1226
1227                 if (setup_journal(arg_directory) < 0)
1228                         goto child_fail;
1229
1230                 if (chdir(arg_directory) < 0) {
1231                         log_error("chdir(%s) failed: %m", arg_directory);
1232                         goto child_fail;
1233                 }
1234
1235                 if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) {
1236                         log_error("mount(MS_MOVE) failed: %m");
1237                         goto child_fail;
1238                 }
1239
1240                 if (chroot(".") < 0) {
1241                         log_error("chroot() failed: %m");
1242                         goto child_fail;
1243                 }
1244
1245                 if (chdir("/") < 0) {
1246                         log_error("chdir() failed: %m");
1247                         goto child_fail;
1248                 }
1249
1250                 umask(0022);
1251
1252                 loopback_setup();
1253
1254                 if (drop_capabilities() < 0) {
1255                         log_error("drop_capabilities() failed: %m");
1256                         goto child_fail;
1257                 }
1258
1259                 if (arg_user) {
1260
1261                         if (get_user_creds((const char**)&arg_user, &uid, &gid, &home, NULL) < 0) {
1262                                 log_error("get_user_creds() failed: %m");
1263                                 goto child_fail;
1264                         }
1265
1266                         if (mkdir_parents_label(home, 0775) < 0) {
1267                                 log_error("mkdir_parents_label() failed: %m");
1268                                 goto child_fail;
1269                         }
1270
1271                         if (mkdir_safe_label(home, 0775, uid, gid) < 0) {
1272                                 log_error("mkdir_safe_label() failed: %m");
1273                                 goto child_fail;
1274                         }
1275
1276                         if (initgroups((const char*)arg_user, gid) < 0) {
1277                                 log_error("initgroups() failed: %m");
1278                                 goto child_fail;
1279                         }
1280
1281                         if (setresgid(gid, gid, gid) < 0) {
1282                                 log_error("setregid() failed: %m");
1283                                 goto child_fail;
1284                         }
1285
1286                         if (setresuid(uid, uid, uid) < 0) {
1287                                 log_error("setreuid() failed: %m");
1288                                 goto child_fail;
1289                         }
1290                 }
1291
1292                 if ((asprintf((char**)(envp + 3), "HOME=%s", home ? home: "/root") < 0) ||
1293                     (asprintf((char**)(envp + 4), "USER=%s", arg_user ? arg_user : "root") < 0) ||
1294                     (asprintf((char**)(envp + 5), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
1295                     log_oom();
1296                     goto child_fail;
1297                 }
1298
1299                 if (arg_uuid) {
1300                         if (asprintf((char**)(envp + 6), "container_uuid=%s", arg_uuid) < 0) {
1301                                 log_oom();
1302                                 goto child_fail;
1303                         }
1304                 }
1305
1306                 setup_hostname();
1307
1308                 if (arg_boot) {
1309                         char **a;
1310                         size_t l;
1311
1312                         /* Automatically search for the init system */
1313
1314                         l = 1 + argc - optind;
1315                         a = newa(char*, l + 1);
1316                         memcpy(a + 1, argv + optind, l * sizeof(char*));
1317
1318                         a[0] = (char*) "/usr/lib/systemd/systemd";
1319                         execve(a[0], a, (char**) envp);
1320
1321                         a[0] = (char*) "/lib/systemd/systemd";
1322                         execve(a[0], a, (char**) envp);
1323
1324                         a[0] = (char*) "/sbin/init";
1325                         execve(a[0], a, (char**) envp);
1326                 } else if (argc > optind)
1327                         execvpe(argv[optind], argv + optind, (char**) envp);
1328                 else {
1329                         chdir(home ? home : "/root");
1330                         execle("/bin/bash", "-bash", NULL, (char**) envp);
1331                 }
1332
1333                 log_error("execv() failed: %m");
1334
1335         child_fail:
1336                 _exit(EXIT_FAILURE);
1337         }
1338
1339         if (process_pty(master, &mask) < 0)
1340                 goto finish;
1341
1342         if (saved_attr_valid) {
1343                 tcsetattr(STDIN_FILENO, TCSANOW, &saved_attr);
1344                 saved_attr_valid = false;
1345         }
1346
1347         r = wait_for_terminate_and_warn(argc > optind ? argv[optind] : "bash", pid);
1348
1349         if (r < 0)
1350                 r = EXIT_FAILURE;
1351
1352 finish:
1353         if (saved_attr_valid)
1354                 tcsetattr(STDIN_FILENO, TCSANOW, &saved_attr);
1355
1356         if (master >= 0)
1357                 close_nointr_nofail(master);
1358
1359         close_pipe(kmsg_socket_pair);
1360
1361         if (oldcg)
1362                 cg_attach(SYSTEMD_CGROUP_CONTROLLER, oldcg, 0);
1363
1364         if (newcg)
1365                 cg_kill_recursive_and_wait(SYSTEMD_CGROUP_CONTROLLER, newcg, true);
1366
1367         free(arg_directory);
1368         strv_free(arg_controllers);
1369         free(oldcg);
1370         free(newcg);
1371
1372         return r;
1373 }