chiark / gitweb /
nspawn: inherit mounts from real root, don't propagate mounts to real root
[elogind.git] / src / nspawn / nspawn.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4   This file is part of systemd.
5
6   Copyright 2010 Lennart Poettering
7
8   systemd is free software; you can redistribute it and/or modify it
9   under the terms of the GNU Lesser General Public License as published by
10   the Free Software Foundation; either version 2.1 of the License, or
11   (at your option) any later version.
12
13   systemd is distributed in the hope that it will be useful, but
14   WITHOUT ANY WARRANTY; without even the implied warranty of
15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16   Lesser General Public License for more details.
17
18   You should have received a copy of the GNU Lesser General Public License
19   along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <signal.h>
23 #include <sched.h>
24 #include <unistd.h>
25 #include <sys/types.h>
26 #include <sys/syscall.h>
27 #include <sys/mount.h>
28 #include <sys/wait.h>
29 #include <stdlib.h>
30 #include <string.h>
31 #include <stdio.h>
32 #include <errno.h>
33 #include <sys/prctl.h>
34 #include <sys/capability.h>
35 #include <getopt.h>
36 #include <sys/epoll.h>
37 #include <termios.h>
38 #include <sys/signalfd.h>
39 #include <grp.h>
40 #include <linux/fs.h>
41 #include <sys/un.h>
42 #include <sys/socket.h>
43
44 #include <systemd/sd-daemon.h>
45
46 #include "log.h"
47 #include "util.h"
48 #include "mkdir.h"
49 #include "audit.h"
50 #include "missing.h"
51 #include "cgroup-util.h"
52 #include "strv.h"
53 #include "path-util.h"
54 #include "loopback-setup.h"
55 #include "sd-id128.h"
56
/* Selects how setup_journal() connects the host's and the container's
 * /var/log/journal/<machine-id> directories. */
typedef enum LinkJournal {
        LINK_NO    = 0,   /* leave the journal alone */
        LINK_AUTO  = 1,   /* link up if possible, silently skip otherwise */
        LINK_HOST  = 2,   /* journal lives on the host, bind mounted into the guest */
        LINK_GUEST = 3    /* journal lives in the guest, symlinked from the host */
} LinkJournal;
63
64 static char *arg_directory = NULL;
65 static char *arg_user = NULL;
66 static char **arg_controllers = NULL;
67 static char *arg_uuid = NULL;
68 static bool arg_private_network = false;
69 static bool arg_read_only = false;
70 static bool arg_boot = false;
71 static LinkJournal arg_link_journal = LINK_AUTO;
72 static uint64_t arg_retain =
73         (1ULL << CAP_CHOWN) |
74         (1ULL << CAP_DAC_OVERRIDE) |
75         (1ULL << CAP_DAC_READ_SEARCH) |
76         (1ULL << CAP_FOWNER) |
77         (1ULL << CAP_FSETID) |
78         (1ULL << CAP_IPC_OWNER) |
79         (1ULL << CAP_KILL) |
80         (1ULL << CAP_LEASE) |
81         (1ULL << CAP_LINUX_IMMUTABLE) |
82         (1ULL << CAP_NET_BIND_SERVICE) |
83         (1ULL << CAP_NET_BROADCAST) |
84         (1ULL << CAP_NET_RAW) |
85         (1ULL << CAP_SETGID) |
86         (1ULL << CAP_SETFCAP) |
87         (1ULL << CAP_SETPCAP) |
88         (1ULL << CAP_SETUID) |
89         (1ULL << CAP_SYS_ADMIN) |
90         (1ULL << CAP_SYS_CHROOT) |
91         (1ULL << CAP_SYS_NICE) |
92         (1ULL << CAP_SYS_PTRACE) |
93         (1ULL << CAP_SYS_TTY_CONFIG) |
94         (1ULL << CAP_SYS_RESOURCE);
95
/* Print the command line usage summary to stdout.
 *
 * Always returns 0. */
static int help(void) {

        /* Note: the -j line must stay in sync with parse_argv(), where
         * -j selects LINK_GUEST, i.e. the same as --link-journal=guest. */
        printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
               "Spawn a minimal namespace container for debugging, testing and building.\n\n"
               "  -h --help               Show this help\n"
               "  -D --directory=NAME     Root directory for the container\n"
               "  -b --boot               Boot up full system (i.e. invoke init)\n"
               "  -u --user=USER          Run the command under specified user or uid\n"
               "  -C --controllers=LIST   Put the container in specified comma-separated cgroup hierarchies\n"
               "     --uuid=UUID          Set a specific machine UUID for the container\n"
               "     --private-network    Disable network in container\n"
               "     --read-only          Mount the root directory read-only\n"
               "     --capability=CAP     In addition to the default, retain specified capability\n"
               "     --link-journal=MODE  Link up guest journal, one of no, auto, guest, host\n"
               "  -j                      Equivalent to --link-journal=guest\n",
               program_invocation_short_name);

        return 0;
}
115
116 static int parse_argv(int argc, char *argv[]) {
117
118         enum {
119                 ARG_PRIVATE_NETWORK = 0x100,
120                 ARG_UUID,
121                 ARG_READ_ONLY,
122                 ARG_CAPABILITY,
123                 ARG_LINK_JOURNAL
124         };
125
126         static const struct option options[] = {
127                 { "help",            no_argument,       NULL, 'h'                 },
128                 { "directory",       required_argument, NULL, 'D'                 },
129                 { "user",            required_argument, NULL, 'u'                 },
130                 { "controllers",     required_argument, NULL, 'C'                 },
131                 { "private-network", no_argument,       NULL, ARG_PRIVATE_NETWORK },
132                 { "boot",            no_argument,       NULL, 'b'                 },
133                 { "uuid",            required_argument, NULL, ARG_UUID            },
134                 { "read-only",       no_argument,       NULL, ARG_READ_ONLY       },
135                 { "capability",      required_argument, NULL, ARG_CAPABILITY      },
136                 { "link-journal",    required_argument, NULL, ARG_LINK_JOURNAL    },
137                 { NULL,              0,                 NULL, 0                   }
138         };
139
140         int c;
141
142         assert(argc >= 0);
143         assert(argv);
144
145         while ((c = getopt_long(argc, argv, "+hD:u:C:bj", options, NULL)) >= 0) {
146
147                 switch (c) {
148
149                 case 'h':
150                         help();
151                         return 0;
152
153                 case 'D':
154                         free(arg_directory);
155                         arg_directory = canonicalize_file_name(optarg);
156                         if (!arg_directory) {
157                                 log_error("Failed to canonicalize root directory.");
158                                 return -ENOMEM;
159                         }
160
161                         break;
162
163                 case 'u':
164                         free(arg_user);
165                         if (!(arg_user = strdup(optarg))) {
166                                 log_error("Failed to duplicate user name.");
167                                 return -ENOMEM;
168                         }
169
170                         break;
171
172                 case 'C':
173                         strv_free(arg_controllers);
174                         arg_controllers = strv_split(optarg, ",");
175                         if (!arg_controllers) {
176                                 log_error("Failed to split controllers list.");
177                                 return -ENOMEM;
178                         }
179                         strv_uniq(arg_controllers);
180
181                         break;
182
183                 case ARG_PRIVATE_NETWORK:
184                         arg_private_network = true;
185                         break;
186
187                 case 'b':
188                         arg_boot = true;
189                         break;
190
191                 case ARG_UUID:
192                         arg_uuid = optarg;
193                         break;
194
195                 case ARG_READ_ONLY:
196                         arg_read_only = true;
197                         break;
198
199                 case ARG_CAPABILITY: {
200                         char *state, *word;
201                         size_t length;
202
203                         FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
204                                 cap_value_t cap;
205                                 char *t;
206
207                                 t = strndup(word, length);
208                                 if (!t)
209                                         return log_oom();
210
211                                 if (cap_from_name(t, &cap) < 0) {
212                                         log_error("Failed to parse capability %s.", t);
213                                         free(t);
214                                         return -EINVAL;
215                                 }
216
217                                 free(t);
218                                 arg_retain |= 1ULL << (uint64_t) cap;
219                         }
220
221                         break;
222                 }
223
224                 case 'j':
225                         arg_link_journal = LINK_GUEST;
226                         break;
227
228                 case ARG_LINK_JOURNAL:
229                         if (streq(optarg, "auto"))
230                                 arg_link_journal = LINK_AUTO;
231                         else if (streq(optarg, "no"))
232                                 arg_link_journal = LINK_NO;
233                         else if (streq(optarg, "guest"))
234                                 arg_link_journal = LINK_GUEST;
235                         else if (streq(optarg, "host"))
236                                 arg_link_journal = LINK_HOST;
237                         else {
238                                 log_error("Failed to parse link journal mode %s", optarg);
239                                 return -EINVAL;
240                         }
241
242                         break;
243
244                 case '?':
245                         return -EINVAL;
246
247                 default:
248                         log_error("Unknown option code %c", c);
249                         return -EINVAL;
250                 }
251         }
252
253         return 1;
254 }
255
/* Mount the API file systems listed in the table below underneath the
 * container root "dest".
 *
 * All table entries are attempted, except that an allocation failure
 * stops the loop. Returns 0 on success, otherwise the first error
 * encountered as negative errno-style value. Failing mounts of entries
 * not marked fatal are ignored. */
static int mount_all(const char *dest) {

        typedef struct MountPoint {
                const char *what;
                const char *where;
                const char *type;
                const char *options;
                unsigned long flags;
                bool fatal;
        } MountPoint;

        static const MountPoint mount_table[] = {
                { "proc",      "/proc",     "proc",  NULL,       MS_NOSUID|MS_NOEXEC|MS_NODEV, true  },
                { "/proc/sys", "/proc/sys", "bind",  NULL,       MS_BIND, true                       },   /* Bind mount first */
                { "/proc/sys", "/proc/sys", "bind",  NULL,       MS_BIND|MS_RDONLY|MS_REMOUNT, true  },   /* Then, make it r/o */
                { "/sys",      "/sys",      "bind",  NULL,       MS_BIND,                      true  },   /* Bind mount first */
                { "/sys",      "/sys",      "bind",  NULL,       MS_BIND|MS_RDONLY|MS_REMOUNT, true  },   /* Then, make it r/o */
                { "tmpfs",     "/dev",      "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME,     true  },
                { "/dev/pts",  "/dev/pts",  "bind",  NULL,       MS_BIND,                      true  },
                { "tmpfs",     "/run",      "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true  },
#ifdef HAVE_SELINUX
                { "/sys/fs/selinux", "/sys/fs/selinux", "bind", NULL, MS_BIND,                      false },  /* Bind mount first */
                { "/sys/fs/selinux", "/sys/fs/selinux", "bind", NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, false },  /* Then, make it r/o */
#endif
        };

        int result = 0;
        unsigned idx;

        for (idx = 0; idx < ELEMENTSOF(mount_table); idx++) {
                const MountPoint *entry = mount_table + idx;
                char *target;
                int probe;

                /* Out of memory: no point in trying the remaining entries */
                if (asprintf(&target, "%s/%s", dest, entry->where) < 0) {
                        log_oom();

                        if (result == 0)
                                result = -ENOMEM;

                        break;
                }

                probe = path_is_mount_point(target, false);
                if (probe < 0) {
                        log_error("Failed to detect whether %s is a mount point: %s", target, strerror(-probe));
                        free(target);

                        if (result == 0)
                                result = probe;

                        continue;
                }

                /* Best effort, a missing directory shows up as mount() failure below */
                mkdir_p_label(target, 0755);

                if (mount(entry->what,
                          target,
                          entry->type,
                          entry->flags,
                          entry->options) < 0 &&
                    entry->fatal) {

                        log_error("mount(%s) failed: %m", target);

                        if (result == 0)
                                result = -errno;
                }

                free(target);
        }

        return result;
}
329
/* Make the host's timezone configuration visible in the container by
 * bind mounting /etc/localtime and /etc/timezone read-only over the
 * files of the same name below "dest".
 *
 * Mount failures are ignored. Returns 0, or the result of log_oom() if
 * a path could not be allocated. */
static int setup_timezone(const char *dest) {
        static const char *const host_files[] = {
                "/etc/localtime",
                "/etc/timezone"
        };
        size_t i;

        assert(dest);

        /* Fix the timezone, if possible */
        for (i = 0; i < sizeof(host_files) / sizeof(host_files[0]); i++) {
                const char *source = host_files[i];
                char *target;

                if (asprintf(&target, "%s%s", dest, source) < 0)
                        return log_oom();

                /* Bind mount first, then make it r/o */
                if (mount(source, target, "bind", MS_BIND, NULL) >= 0)
                        mount(source, target, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);

                free(target);
        }

        return 0;
}
354
355 static int setup_resolv_conf(const char *dest) {
356         char *where;
357
358         assert(dest);
359
360         if (arg_private_network)
361                 return 0;
362
363         /* Fix resolv.conf, if possible */
364         if (asprintf(&where, "%s/etc/resolv.conf", dest) < 0) {
365                 return log_oom();
366         }
367
368         if (mount("/etc/resolv.conf", where, "bind", MS_BIND, NULL) >= 0)
369                 mount("/etc/resolv.conf", where, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);
370
371         free(where);
372
373         return 0;
374 }
375
376 static int copy_devnodes(const char *dest) {
377
378         static const char devnodes[] =
379                 "null\0"
380                 "zero\0"
381                 "full\0"
382                 "random\0"
383                 "urandom\0"
384                 "tty\0"
385                 "ptmx\0"
386                 "rtc0\0";
387
388         const char *d;
389         int r = 0;
390         mode_t u;
391
392         assert(dest);
393
394         u = umask(0000);
395
396         NULSTR_FOREACH(d, devnodes) {
397                 struct stat st;
398                 char *from = NULL, *to = NULL;
399
400                 asprintf(&from, "/dev/%s", d);
401                 asprintf(&to, "%s/dev/%s", dest, d);
402
403                 if (!from || !to) {
404                         log_error("Failed to allocate devnode path");
405
406                         free(from);
407                         free(to);
408
409                         from = to = NULL;
410
411                         if (r == 0)
412                                 r = -ENOMEM;
413
414                         break;
415                 }
416
417                 if (stat(from, &st) < 0) {
418
419                         if (errno != ENOENT) {
420                                 log_error("Failed to stat %s: %m", from);
421                                 if (r == 0)
422                                         r = -errno;
423                         }
424
425                 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
426
427                         log_error("%s is not a char or block device, cannot copy.", from);
428                         if (r == 0)
429                                 r = -EIO;
430
431                 } else if (mknod(to, st.st_mode, st.st_rdev) < 0) {
432
433                         log_error("mknod(%s) failed: %m", dest);
434                         if (r == 0)
435                                 r = -errno;
436                 }
437
438                 free(from);
439                 free(to);
440         }
441
442         umask(u);
443
444         return r;
445 }
446
/* Make the TTY "console" available as /dev/console below "dest".
 *
 * The TTY's access mode and ownership are corrected to 0600 root:root,
 * then a device node is created in the container and the TTY is bind
 * mounted over it.
 *
 * Returns a negative errno-style value on failure, otherwise the
 * (non-negative) result of chmod_and_chown(). */
static int setup_dev_console(const char *dest, const char *console) {
        char *node = NULL;
        struct stat sb;
        mode_t old_umask;
        int r;

        assert(dest);
        assert(console);

        old_umask = umask(0000);

        if (stat(console, &sb) < 0) {
                log_error("Failed to stat %s: %m", console);
                r = -errno;
                goto out;
        }

        if (!S_ISCHR(sb.st_mode)) {
                log_error("/dev/console is not a char device.");
                r = -EIO;
                goto out;
        }

        r = chmod_and_chown(console, 0600, 0, 0);
        if (r < 0) {
                log_error("Failed to correct access mode for TTY: %s", strerror(-r));
                goto out;
        }

        if (asprintf(&node, "%s/dev/console", dest) < 0) {
                r = log_oom();
                goto out;
        }

        /* ptys can only exist on pts file systems, hence the right tty
         * has to be bind mounted to /dev/console. The device node
         * created here merely serves as mount point; its major/minor
         * doesn't actually matter, since it is mounted over right
         * away. */

        if (mknod(node, (sb.st_mode & ~07777) | 0600, sb.st_rdev) < 0) {
                log_error("mknod() for /dev/console failed: %m");
                r = -errno;
                goto out;
        }

        if (mount(console, node, "bind", MS_BIND, NULL) < 0) {
                log_error("Bind mount for /dev/console failed: %m");
                r = -errno;
                goto out;
        }

out:
        free(node);
        umask(old_umask);

        return r;
}
505
/* Provide a fake /proc/kmsg in the container rooted at "dest".
 *
 * A FIFO is created as "dest"/dev/kmsg and bind mounted to
 * "dest"/proc/kmsg. An fd referring to the FIFO is then sent over
 * "kmsg_socket" as SCM_RIGHTS message, and finally the FIFO is
 * unlinked from /dev again.
 *
 * Returns a negative errno-style value on failure, otherwise the
 * (non-negative) result of chmod_and_chown(). */
static int setup_kmsg(const char *dest, int kmsg_socket) {
        char *fifo = NULL, *proc_kmsg = NULL;
        union {
                struct cmsghdr cmsghdr;
                uint8_t buf[CMSG_SPACE(sizeof(int))];
        } control;
        struct msghdr msg;
        struct cmsghdr *hdr;
        mode_t old_umask;
        int r, fd, sent;

        assert(dest);
        assert(kmsg_socket >= 0);

        old_umask = umask(0000);

        /* The kmsg FIFO is created as /dev/kmsg, but deleted again
         * right after it has been bind mounted to /proc/kmsg. On the
         * reading side FIFOs behave very similar to /proc/kmsg, but on
         * the writing side they differ from /dev/kmsg: writing blocks
         * when nothing is reading. To keep containers from
         * deadlocking on that, /dev/kmsg is simply made unavailable
         * to the container. */
        if (asprintf(&fifo, "%s/dev/kmsg", dest) < 0) {
                r = log_oom();
                goto out;
        }

        if (asprintf(&proc_kmsg, "%s/proc/kmsg", dest) < 0) {
                r = log_oom();
                goto out;
        }

        if (mkfifo(fifo, 0600) < 0) {
                log_error("mkfifo() for /dev/kmsg failed: %m");
                r = -errno;
                goto out;
        }

        r = chmod_and_chown(fifo, 0600, 0, 0);
        if (r < 0) {
                log_error("Failed to correct access mode for /dev/kmsg: %s", strerror(-r));
                goto out;
        }

        if (mount(fifo, proc_kmsg, "bind", MS_BIND, NULL) < 0) {
                log_error("Bind mount for /proc/kmsg failed: %m");
                r = -errno;
                goto out;
        }

        fd = open(fifo, O_RDWR|O_NDELAY|O_CLOEXEC);
        if (fd < 0) {
                log_error("Failed to open fifo: %m");
                r = -errno;
                goto out;
        }

        /* Build a message that carries nothing but the fd as
         * ancillary data */
        zero(msg);
        zero(control);

        msg.msg_control = &control;
        msg.msg_controllen = sizeof(control);

        hdr = CMSG_FIRSTHDR(&msg);
        hdr->cmsg_level = SOL_SOCKET;
        hdr->cmsg_type = SCM_RIGHTS;
        hdr->cmsg_len = CMSG_LEN(sizeof(int));
        memcpy(CMSG_DATA(hdr), &fd, sizeof(int));

        msg.msg_controllen = hdr->cmsg_len;

        /* Park the fd in the socket, so that it stays open as long as
         * the child runs; our own copy is not needed anymore then */
        sent = sendmsg(kmsg_socket, &msg, MSG_DONTWAIT|MSG_NOSIGNAL);
        close_nointr_nofail(fd);

        if (sent < 0) {
                log_error("Failed to send FIFO fd: %m");
                r = -errno;
                goto out;
        }

        /* And now make the FIFO unavailable as /dev/kmsg... */
        unlink(fifo);

out:
        free(fifo);
        free(proc_kmsg);
        umask(old_umask);

        return r;
}
599
600 static int setup_hostname(void) {
601         char *hn;
602         int r = 0;
603
604         hn = path_get_file_name(arg_directory);
605         if (hn) {
606                 hn = strdup(hn);
607                 if (!hn)
608                         return -ENOMEM;
609
610                 hostname_cleanup(hn);
611
612                 if (!isempty(hn))
613                         if (sethostname(hn, strlen(hn)) < 0)
614                                 r = -errno;
615
616                 free(hn);
617         }
618
619         return r;
620 }
621
622 static int setup_journal(const char *directory) {
623         sd_id128_t machine_id;
624         char *p = NULL, *b = NULL, *l, *q = NULL, *d = NULL;
625         int r;
626
627         if (arg_link_journal == LINK_NO)
628                 return 0;
629
630         p = strappend(directory, "/etc/machine-id");
631         if (!p) {
632                 r = log_oom();
633                 goto finish;
634         }
635
636         r = read_one_line_file(p, &b);
637         if (r == -ENOENT && arg_link_journal == LINK_AUTO) {
638                 r = 0;
639                 goto finish;
640         } else if (r < 0) {
641                 log_error("Failed to read machine ID: %s", strerror(-r));
642                 return r;
643         }
644
645         l = strstrip(b);
646         if (isempty(l) && arg_link_journal == LINK_AUTO) {
647                 r = 0;
648                 goto finish;
649         }
650
651         /* Verify validaty */
652         r = sd_id128_from_string(l, &machine_id);
653         if (r < 0) {
654                 log_error("Failed to parse machine ID: %s", strerror(-r));
655                 goto finish;
656         }
657
658         free(p);
659         p = strappend("/var/log/journal/", l);
660         q = strjoin(directory, "/var/log/journal/", l, NULL);
661         if (!p || !q) {
662                 r = log_oom();
663                 goto finish;
664         }
665
666         if (path_is_mount_point(p, false) > 0 ||
667             path_is_mount_point(q, false) > 0) {
668                 if (arg_link_journal != LINK_AUTO) {
669                         log_error("Journal already a mount point, refusing.");
670                         r = -EEXIST;
671                         goto finish;
672                 }
673
674                 r = 0;
675                 goto finish;
676         }
677
678         r = readlink_and_make_absolute(p, &d);
679         if (r >= 0) {
680                 if ((arg_link_journal == LINK_GUEST ||
681                      arg_link_journal == LINK_AUTO) &&
682                     path_equal(d, q)) {
683
684                         mkdir_p(q, 0755);
685
686                         r = 0;
687                         goto finish;
688                 }
689
690                 if (unlink(p) < 0) {
691                         log_error("Failed to remove symlink %s: %m", p);
692                         r = -errno;
693                         goto finish;
694                 }
695         } else if (r == -EINVAL) {
696
697                 if (arg_link_journal == LINK_GUEST &&
698                     rmdir(p) < 0) {
699
700                         if (errno == ENOTDIR)
701                                 log_error("%s already exists and is neither symlink nor directory.", p);
702                         else {
703                                 log_error("Failed to remove %s: %m", p);
704                                 r = -errno;
705                         }
706
707                         goto finish;
708                 }
709         } else if (r != -ENOENT) {
710                 log_error("readlink(%s) failed: %m", p);
711                 goto finish;
712         }
713
714         if (arg_link_journal == LINK_GUEST) {
715
716                 if (symlink(q, p) < 0) {
717                         log_error("Failed to symlink %s to %s: %m", q, p);
718                         r = -errno;
719                         goto finish;
720                 }
721
722                 mkdir_p(q, 0755);
723
724                 r = 0;
725                 goto finish;
726         }
727
728         if (arg_link_journal == LINK_HOST) {
729                 r = mkdir_p(p, 0755);
730                 if (r < 0) {
731                         log_error("Failed to create %s: %m", p);
732                         goto finish;
733                 }
734
735         } else if (access(p, F_OK) < 0) {
736                 r = 0;
737                 goto finish;
738         }
739
740         if (dir_is_empty(q) == 0) {
741                 log_error("%s not empty.", q);
742                 r = -ENOTEMPTY;
743                 goto finish;
744         }
745
746         r = mkdir_p(q, 0755);
747         if (r < 0) {
748                 log_error("Failed to create %s: %m", q);
749                 goto finish;
750         }
751
752         if (mount(p, q, "bind", MS_BIND, NULL) < 0) {
753                 log_error("Failed to bind mount journal from host into guest: %m");
754                 r = -errno;
755                 goto finish;
756         }
757
758         r = 0;
759
760 finish:
761         free(p);
762         free(q);
763         free(d);
764         free(b);
765         return r;
766
767 }
768
769 static int drop_capabilities(void) {
770         return capability_bounding_set_drop(~arg_retain, false);
771 }
772
/* Check whether "path" looks like the root of an OS tree, using the
 * existence of bin/sh below it as indicator.
 *
 * Returns 1 if it exists, 0 if not, and -ENOMEM if the path to check
 * could not be allocated. */
static int is_os_tree(const char *path) {
        char *shell;
        int found;

        /* We use /bin/sh as flag file if something is an OS */
        if (asprintf(&shell, "%s/bin/sh", path) < 0)
                return -ENOMEM;

        found = access(shell, F_OK) >= 0;
        free(shell);

        return found;
}
786
787 static int process_pty(int master, sigset_t *mask) {
788
789         char in_buffer[LINE_MAX], out_buffer[LINE_MAX];
790         size_t in_buffer_full = 0, out_buffer_full = 0;
791         struct epoll_event stdin_ev, stdout_ev, master_ev, signal_ev;
792         bool stdin_readable = false, stdout_writable = false, master_readable = false, master_writable = false;
793         int ep = -1, signal_fd = -1, r;
794
795         fd_nonblock(STDIN_FILENO, 1);
796         fd_nonblock(STDOUT_FILENO, 1);
797         fd_nonblock(master, 1);
798
799         signal_fd = signalfd(-1, mask, SFD_NONBLOCK|SFD_CLOEXEC);
800         if (signal_fd < 0) {
801                 log_error("signalfd(): %m");
802                 r = -errno;
803                 goto finish;
804         }
805
806         ep = epoll_create1(EPOLL_CLOEXEC);
807         if (ep < 0) {
808                 log_error("Failed to create epoll: %m");
809                 r = -errno;
810                 goto finish;
811         }
812
813         zero(stdin_ev);
814         stdin_ev.events = EPOLLIN|EPOLLET;
815         stdin_ev.data.fd = STDIN_FILENO;
816
817         zero(stdout_ev);
818         stdout_ev.events = EPOLLOUT|EPOLLET;
819         stdout_ev.data.fd = STDOUT_FILENO;
820
821         zero(master_ev);
822         master_ev.events = EPOLLIN|EPOLLOUT|EPOLLET;
823         master_ev.data.fd = master;
824
825         zero(signal_ev);
826         signal_ev.events = EPOLLIN;
827         signal_ev.data.fd = signal_fd;
828
829         if (epoll_ctl(ep, EPOLL_CTL_ADD, STDIN_FILENO, &stdin_ev) < 0 ||
830             epoll_ctl(ep, EPOLL_CTL_ADD, STDOUT_FILENO, &stdout_ev) < 0 ||
831             epoll_ctl(ep, EPOLL_CTL_ADD, master, &master_ev) < 0 ||
832             epoll_ctl(ep, EPOLL_CTL_ADD, signal_fd, &signal_ev) < 0) {
833                 log_error("Failed to regiser fds in epoll: %m");
834                 r = -errno;
835                 goto finish;
836         }
837
838         for (;;) {
839                 struct epoll_event ev[16];
840                 ssize_t k;
841                 int i, nfds;
842
843                 nfds = epoll_wait(ep, ev, ELEMENTSOF(ev), -1);
844                 if (nfds < 0) {
845
846                         if (errno == EINTR || errno == EAGAIN)
847                                 continue;
848
849                         log_error("epoll_wait(): %m");
850                         r = -errno;
851                         goto finish;
852                 }
853
854                 assert(nfds >= 1);
855
856                 for (i = 0; i < nfds; i++) {
857                         if (ev[i].data.fd == STDIN_FILENO) {
858
859                                 if (ev[i].events & (EPOLLIN|EPOLLHUP))
860                                         stdin_readable = true;
861
862                         } else if (ev[i].data.fd == STDOUT_FILENO) {
863
864                                 if (ev[i].events & (EPOLLOUT|EPOLLHUP))
865                                         stdout_writable = true;
866
867                         } else if (ev[i].data.fd == master) {
868
869                                 if (ev[i].events & (EPOLLIN|EPOLLHUP))
870                                         master_readable = true;
871
872                                 if (ev[i].events & (EPOLLOUT|EPOLLHUP))
873                                         master_writable = true;
874
875                         } else if (ev[i].data.fd == signal_fd) {
876                                 struct signalfd_siginfo sfsi;
877                                 ssize_t n;
878
879                                 n = read(signal_fd, &sfsi, sizeof(sfsi));
880                                 if (n != sizeof(sfsi)) {
881
882                                         if (n >= 0) {
883                                                 log_error("Failed to read from signalfd: invalid block size");
884                                                 r = -EIO;
885                                                 goto finish;
886                                         }
887
888                                         if (errno != EINTR && errno != EAGAIN) {
889                                                 log_error("Failed to read from signalfd: %m");
890                                                 r = -errno;
891                                                 goto finish;
892                                         }
893                                 } else {
894
895                                         if (sfsi.ssi_signo == SIGWINCH) {
896                                                 struct winsize ws;
897
898                                                 /* The window size changed, let's forward that. */
899                                                 if (ioctl(STDIN_FILENO, TIOCGWINSZ, &ws) >= 0)
900                                                         ioctl(master, TIOCSWINSZ, &ws);
901                                         } else {
902                                                 r = 0;
903                                                 goto finish;
904                                         }
905                                 }
906                         }
907                 }
908
909                 while ((stdin_readable && in_buffer_full <= 0) ||
910                        (master_writable && in_buffer_full > 0) ||
911                        (master_readable && out_buffer_full <= 0) ||
912                        (stdout_writable && out_buffer_full > 0)) {
913
914                         if (stdin_readable && in_buffer_full < LINE_MAX) {
915
916                                 k = read(STDIN_FILENO, in_buffer + in_buffer_full, LINE_MAX - in_buffer_full);
917                                 if (k < 0) {
918
919                                         if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
920                                                 stdin_readable = false;
921                                         else {
922                                                 log_error("read(): %m");
923                                                 r = -errno;
924                                                 goto finish;
925                                         }
926                                 } else
927                                         in_buffer_full += (size_t) k;
928                         }
929
930                         if (master_writable && in_buffer_full > 0) {
931
932                                 k = write(master, in_buffer, in_buffer_full);
933                                 if (k < 0) {
934
935                                         if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
936                                                 master_writable = false;
937                                         else {
938                                                 log_error("write(): %m");
939                                                 r = -errno;
940                                                 goto finish;
941                                         }
942
943                                 } else {
944                                         assert(in_buffer_full >= (size_t) k);
945                                         memmove(in_buffer, in_buffer + k, in_buffer_full - k);
946                                         in_buffer_full -= k;
947                                 }
948                         }
949
950                         if (master_readable && out_buffer_full < LINE_MAX) {
951
952                                 k = read(master, out_buffer + out_buffer_full, LINE_MAX - out_buffer_full);
953                                 if (k < 0) {
954
955                                         if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
956                                                 master_readable = false;
957                                         else {
958                                                 log_error("read(): %m");
959                                                 r = -errno;
960                                                 goto finish;
961                                         }
962                                 }  else
963                                         out_buffer_full += (size_t) k;
964                         }
965
966                         if (stdout_writable && out_buffer_full > 0) {
967
968                                 k = write(STDOUT_FILENO, out_buffer, out_buffer_full);
969                                 if (k < 0) {
970
971                                         if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
972                                                 stdout_writable = false;
973                                         else {
974                                                 log_error("write(): %m");
975                                                 r = -errno;
976                                                 goto finish;
977                                         }
978
979                                 } else {
980                                         assert(out_buffer_full >= (size_t) k);
981                                         memmove(out_buffer, out_buffer + k, out_buffer_full - k);
982                                         out_buffer_full -= k;
983                                 }
984                         }
985                 }
986         }
987
988 finish:
989         if (ep >= 0)
990                 close_nointr_nofail(ep);
991
992         if (signal_fd >= 0)
993                 close_nointr_nofail(signal_fd);
994
995         return r;
996 }
997
998 int main(int argc, char *argv[]) {
999         pid_t pid = 0;
1000         int r = EXIT_FAILURE, k;
1001         char *oldcg = NULL, *newcg = NULL;
1002         char **controller = NULL;
1003         int master = -1;
1004         const char *console = NULL;
1005         struct termios saved_attr, raw_attr;
1006         sigset_t mask;
1007         bool saved_attr_valid = false;
1008         struct winsize ws;
1009         int kmsg_socket_pair[2] = { -1, -1 };
1010
1011         log_parse_environment();
1012         log_open();
1013
1014         r = parse_argv(argc, argv);
1015         if (r <= 0)
1016                 goto finish;
1017
1018         if (arg_directory) {
1019                 char *p;
1020
1021                 p = path_make_absolute_cwd(arg_directory);
1022                 free(arg_directory);
1023                 arg_directory = p;
1024         } else
1025                 arg_directory = get_current_dir_name();
1026
1027         if (!arg_directory) {
1028                 log_error("Failed to determine path");
1029                 goto finish;
1030         }
1031
1032         path_kill_slashes(arg_directory);
1033
1034         if (geteuid() != 0) {
1035                 log_error("Need to be root.");
1036                 goto finish;
1037         }
1038
1039         if (sd_booted() <= 0) {
1040                 log_error("Not running on a systemd system.");
1041                 goto finish;
1042         }
1043
1044         if (path_equal(arg_directory, "/")) {
1045                 log_error("Spawning container on root directory not supported.");
1046                 goto finish;
1047         }
1048
1049         if (is_os_tree(arg_directory) <= 0) {
1050                 log_error("Directory %s doesn't look like an OS root directory. Refusing.", arg_directory);
1051                 goto finish;
1052         }
1053
1054         k = cg_get_by_pid(SYSTEMD_CGROUP_CONTROLLER, 0, &oldcg);
1055         if (k < 0) {
1056                 log_error("Failed to determine current cgroup: %s", strerror(-k));
1057                 goto finish;
1058         }
1059
1060         if (asprintf(&newcg, "%s/nspawn-%lu", oldcg, (unsigned long) getpid()) < 0) {
1061                 log_error("Failed to allocate cgroup path.");
1062                 goto finish;
1063         }
1064
1065         k = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, newcg, 0);
1066         if (k < 0)  {
1067                 log_error("Failed to create cgroup: %s", strerror(-k));
1068                 goto finish;
1069         }
1070
1071         STRV_FOREACH(controller, arg_controllers) {
1072                 k = cg_create_and_attach(*controller, newcg, 0);
1073                 if (k < 0)
1074                         log_warning("Failed to create cgroup in controller %s: %s", *controller, strerror(-k));
1075         }
1076
1077         master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
1078         if (master < 0) {
1079                 log_error("Failed to acquire pseudo tty: %m");
1080                 goto finish;
1081         }
1082
1083         console = ptsname(master);
1084         if (!console) {
1085                 log_error("Failed to determine tty name: %m");
1086                 goto finish;
1087         }
1088
1089         log_info("Spawning namespace container on %s (console is %s).", arg_directory, console);
1090
1091         if (ioctl(STDIN_FILENO, TIOCGWINSZ, &ws) >= 0)
1092                 ioctl(master, TIOCSWINSZ, &ws);
1093
1094         if (unlockpt(master) < 0) {
1095                 log_error("Failed to unlock tty: %m");
1096                 goto finish;
1097         }
1098
1099         if (tcgetattr(STDIN_FILENO, &saved_attr) < 0) {
1100                 log_error("Failed to get terminal attributes: %m");
1101                 goto finish;
1102         }
1103
1104         saved_attr_valid = true;
1105
1106         raw_attr = saved_attr;
1107         cfmakeraw(&raw_attr);
1108         raw_attr.c_lflag &= ~ECHO;
1109
1110         if (tcsetattr(STDIN_FILENO, TCSANOW, &raw_attr) < 0) {
1111                 log_error("Failed to set terminal attributes: %m");
1112                 goto finish;
1113         }
1114
1115         if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_NONBLOCK|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
1116                 log_error("Failed to create kmsg socket pair");
1117                 goto finish;
1118         }
1119
1120         assert_se(sigemptyset(&mask) == 0);
1121         sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
1122         assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
1123
1124         pid = syscall(__NR_clone, SIGCHLD|CLONE_NEWIPC|CLONE_NEWNS|CLONE_NEWPID|CLONE_NEWUTS|(arg_private_network ? CLONE_NEWNET : 0), NULL);
1125         if (pid < 0) {
1126                 if (errno == EINVAL)
1127                         log_error("clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
1128                 else
1129                         log_error("clone() failed: %m");
1130
1131                 goto finish;
1132         }
1133
1134         if (pid == 0) {
1135                 /* child */
1136
1137                 const char *home = NULL;
1138                 uid_t uid = (uid_t) -1;
1139                 gid_t gid = (gid_t) -1;
1140                 const char *envp[] = {
1141                         "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
1142                         "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
1143                         NULL, /* TERM */
1144                         NULL, /* HOME */
1145                         NULL, /* USER */
1146                         NULL, /* LOGNAME */
1147                         NULL, /* container_uuid */
1148                         NULL
1149                 };
1150
1151                 envp[2] = strv_find_prefix(environ, "TERM=");
1152
1153                 close_nointr_nofail(master);
1154
1155                 close_nointr(STDIN_FILENO);
1156                 close_nointr(STDOUT_FILENO);
1157                 close_nointr(STDERR_FILENO);
1158
1159                 close_all_fds(&kmsg_socket_pair[1], 1);
1160
1161                 reset_all_signal_handlers();
1162
1163                 assert_se(sigemptyset(&mask) == 0);
1164                 assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
1165
1166                 if (open_terminal(console, O_RDWR) != STDIN_FILENO ||
1167                     dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
1168                     dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO)
1169                         goto child_fail;
1170
1171                 if (setsid() < 0) {
1172                         log_error("setsid() failed: %m");
1173                         goto child_fail;
1174                 }
1175
1176                 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
1177                         log_error("PR_SET_PDEATHSIG failed: %m");
1178                         goto child_fail;
1179                 }
1180
1181                 /* Mark everything as slave, so that we still
1182                  * receive mounts from the real root, but don't
1183                  * propagate mounts to the real root. */
1184                 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
1185                         log_error("MS_SLAVE|MS_REC failed: %m");
1186                         goto child_fail;
1187                 }
1188
1189                 /* Turn directory into bind mount */
1190                 if (mount(arg_directory, arg_directory, "bind", MS_BIND, NULL) < 0) {
1191                         log_error("Failed to make bind mount.");
1192                         goto child_fail;
1193                 }
1194
1195                 if (arg_read_only)
1196                         if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL) < 0) {
1197                                 log_error("Failed to make read-only.");
1198                                 goto child_fail;
1199                         }
1200
1201                 if (mount_all(arg_directory) < 0)
1202                         goto child_fail;
1203
1204                 if (copy_devnodes(arg_directory) < 0)
1205                         goto child_fail;
1206
1207                 if (setup_dev_console(arg_directory, console) < 0)
1208                         goto child_fail;
1209
1210                 if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
1211                         goto child_fail;
1212
1213                 close_nointr_nofail(kmsg_socket_pair[1]);
1214
1215                 if (setup_timezone(arg_directory) < 0)
1216                         goto child_fail;
1217
1218                 if (setup_resolv_conf(arg_directory) < 0)
1219                         goto child_fail;
1220
1221                 if (setup_journal(arg_directory) < 0)
1222                         goto child_fail;
1223
1224                 if (chdir(arg_directory) < 0) {
1225                         log_error("chdir(%s) failed: %m", arg_directory);
1226                         goto child_fail;
1227                 }
1228
1229                 if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) {
1230                         log_error("mount(MS_MOVE) failed: %m");
1231                         goto child_fail;
1232                 }
1233
1234                 if (chroot(".") < 0) {
1235                         log_error("chroot() failed: %m");
1236                         goto child_fail;
1237                 }
1238
1239                 if (chdir("/") < 0) {
1240                         log_error("chdir() failed: %m");
1241                         goto child_fail;
1242                 }
1243
1244                 umask(0022);
1245
1246                 loopback_setup();
1247
1248                 if (drop_capabilities() < 0) {
1249                         log_error("drop_capabilities() failed: %m");
1250                         goto child_fail;
1251                 }
1252
1253                 if (arg_user) {
1254
1255                         if (get_user_creds((const char**)&arg_user, &uid, &gid, &home, NULL) < 0) {
1256                                 log_error("get_user_creds() failed: %m");
1257                                 goto child_fail;
1258                         }
1259
1260                         if (mkdir_parents_label(home, 0775) < 0) {
1261                                 log_error("mkdir_parents_label() failed: %m");
1262                                 goto child_fail;
1263                         }
1264
1265                         if (mkdir_safe_label(home, 0775, uid, gid) < 0) {
1266                                 log_error("mkdir_safe_label() failed: %m");
1267                                 goto child_fail;
1268                         }
1269
1270                         if (initgroups((const char*)arg_user, gid) < 0) {
1271                                 log_error("initgroups() failed: %m");
1272                                 goto child_fail;
1273                         }
1274
1275                         if (setresgid(gid, gid, gid) < 0) {
1276                                 log_error("setregid() failed: %m");
1277                                 goto child_fail;
1278                         }
1279
1280                         if (setresuid(uid, uid, uid) < 0) {
1281                                 log_error("setreuid() failed: %m");
1282                                 goto child_fail;
1283                         }
1284                 }
1285
1286                 if ((asprintf((char**)(envp + 3), "HOME=%s", home ? home: "/root") < 0) ||
1287                     (asprintf((char**)(envp + 4), "USER=%s", arg_user ? arg_user : "root") < 0) ||
1288                     (asprintf((char**)(envp + 5), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
1289                     log_oom();
1290                     goto child_fail;
1291                 }
1292
1293                 if (arg_uuid) {
1294                         if (asprintf((char**)(envp + 6), "container_uuid=%s", arg_uuid) < 0) {
1295                                 log_oom();
1296                                 goto child_fail;
1297                         }
1298                 }
1299
1300                 setup_hostname();
1301
1302                 if (arg_boot) {
1303                         char **a;
1304                         size_t l;
1305
1306                         /* Automatically search for the init system */
1307
1308                         l = 1 + argc - optind;
1309                         a = newa(char*, l + 1);
1310                         memcpy(a + 1, argv + optind, l * sizeof(char*));
1311
1312                         a[0] = (char*) "/usr/lib/systemd/systemd";
1313                         execve(a[0], a, (char**) envp);
1314
1315                         a[0] = (char*) "/lib/systemd/systemd";
1316                         execve(a[0], a, (char**) envp);
1317
1318                         a[0] = (char*) "/sbin/init";
1319                         execve(a[0], a, (char**) envp);
1320                 } else if (argc > optind)
1321                         execvpe(argv[optind], argv + optind, (char**) envp);
1322                 else {
1323                         chdir(home ? home : "/root");
1324                         execle("/bin/bash", "-bash", NULL, (char**) envp);
1325                 }
1326
1327                 log_error("execv() failed: %m");
1328
1329         child_fail:
1330                 _exit(EXIT_FAILURE);
1331         }
1332
1333         if (process_pty(master, &mask) < 0)
1334                 goto finish;
1335
1336         if (saved_attr_valid) {
1337                 tcsetattr(STDIN_FILENO, TCSANOW, &saved_attr);
1338                 saved_attr_valid = false;
1339         }
1340
1341         r = wait_for_terminate_and_warn(argc > optind ? argv[optind] : "bash", pid);
1342
1343         if (r < 0)
1344                 r = EXIT_FAILURE;
1345
1346 finish:
1347         if (saved_attr_valid)
1348                 tcsetattr(STDIN_FILENO, TCSANOW, &saved_attr);
1349
1350         if (master >= 0)
1351                 close_nointr_nofail(master);
1352
1353         close_pipe(kmsg_socket_pair);
1354
1355         if (oldcg)
1356                 cg_attach(SYSTEMD_CGROUP_CONTROLLER, oldcg, 0);
1357
1358         if (newcg)
1359                 cg_kill_recursive_and_wait(SYSTEMD_CGROUP_CONTROLLER, newcg, true);
1360
1361         free(arg_directory);
1362         strv_free(arg_controllers);
1363         free(oldcg);
1364         free(newcg);
1365
1366         return r;
1367 }