chiark / gitweb /
14de7f8b43b135c4a000abf1eda8ef12d00c0081
[elogind.git] / src / nspawn / nspawn.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4   This file is part of systemd.
5
6   Copyright 2010 Lennart Poettering
7
8   systemd is free software; you can redistribute it and/or modify it
9   under the terms of the GNU Lesser General Public License as published by
10   the Free Software Foundation; either version 2.1 of the License, or
11   (at your option) any later version.
12
13   systemd is distributed in the hope that it will be useful, but
14   WITHOUT ANY WARRANTY; without even the implied warranty of
15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16   Lesser General Public License for more details.
17
18   You should have received a copy of the GNU Lesser General Public License
19   along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <signal.h>
23 #include <sched.h>
24 #include <unistd.h>
25 #include <sys/types.h>
26 #include <sys/syscall.h>
27 #include <sys/mount.h>
28 #include <sys/wait.h>
29 #include <stdlib.h>
30 #include <string.h>
31 #include <stdio.h>
32 #include <errno.h>
33 #include <sys/prctl.h>
34 #include <sys/capability.h>
35 #include <getopt.h>
36 #include <sys/epoll.h>
37 #include <termios.h>
38 #include <sys/signalfd.h>
39 #include <grp.h>
40 #include <linux/fs.h>
41 #include <sys/un.h>
42 #include <sys/socket.h>
43
44 #include <systemd/sd-daemon.h>
45
46 #include "log.h"
47 #include "util.h"
48 #include "mkdir.h"
49 #include "audit.h"
50 #include "missing.h"
51 #include "cgroup-util.h"
52 #include "strv.h"
53 #include "path-util.h"
54 #include "loopback-setup.h"
55 #include "sd-id128.h"
56
/* Selects how setup_journal() ties the container's journal directory
 * (<root>/var/log/journal/<machine-id>) to the host's
 * /var/log/journal/<machine-id>. */
typedef enum LinkJournal {
        /* Do not touch the journal at all. */
        LINK_NO,
        /* Default. Problems such as a missing machine ID are skipped
         * quietly; the host directory is bind mounted into the guest only
         * if it already exists. */
        LINK_AUTO,
        /* Create the host directory if necessary and bind mount it into
         * the guest. */
        LINK_HOST,
        /* Turn the host path into a symlink to the guest's directory. */
        LINK_GUEST
} LinkJournal;
63
64 static char *arg_directory = NULL;
65 static char *arg_user = NULL;
66 static char **arg_controllers = NULL;
67 static char *arg_uuid = NULL;
68 static bool arg_private_network = false;
69 static bool arg_read_only = false;
70 static bool arg_boot = false;
71 static LinkJournal arg_link_journal = LINK_AUTO;
72 static uint64_t arg_retain =
73         (1ULL << CAP_CHOWN) |
74         (1ULL << CAP_DAC_OVERRIDE) |
75         (1ULL << CAP_DAC_READ_SEARCH) |
76         (1ULL << CAP_FOWNER) |
77         (1ULL << CAP_FSETID) |
78         (1ULL << CAP_IPC_OWNER) |
79         (1ULL << CAP_KILL) |
80         (1ULL << CAP_LEASE) |
81         (1ULL << CAP_LINUX_IMMUTABLE) |
82         (1ULL << CAP_NET_BIND_SERVICE) |
83         (1ULL << CAP_NET_BROADCAST) |
84         (1ULL << CAP_NET_RAW) |
85         (1ULL << CAP_SETGID) |
86         (1ULL << CAP_SETFCAP) |
87         (1ULL << CAP_SETPCAP) |
88         (1ULL << CAP_SETUID) |
89         (1ULL << CAP_SYS_ADMIN) |
90         (1ULL << CAP_SYS_CHROOT) |
91         (1ULL << CAP_SYS_NICE) |
92         (1ULL << CAP_SYS_PTRACE) |
93         (1ULL << CAP_SYS_TTY_CONFIG) |
94         (1ULL << CAP_SYS_RESOURCE);
95
/* Prints the usage summary to stdout, prefixed with the name the program
 * was invoked as. Always returns 0.
 *
 * Keep the option descriptions in sync with parse_argv(): in particular -j
 * selects LINK_GUEST there, i.e. it is the short form of
 * --link-journal=guest. */
static int help(void) {

        printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
               "Spawn a minimal namespace container for debugging, testing and building.\n\n"
               "  -h --help               Show this help\n"
               "  -D --directory=NAME     Root directory for the container\n"
               "  -b --boot               Boot up full system (i.e. invoke init)\n"
               "  -u --user=USER          Run the command under specified user or uid\n"
               "  -C --controllers=LIST   Put the container in specified comma-separated cgroup hierarchies\n"
               "     --uuid=UUID          Set a specific machine UUID for the container\n"
               "     --private-network    Disable network in container\n"
               "     --read-only          Mount the root directory read-only\n"
               "     --capability=CAP     In addition to the default, retain specified capability\n"
               "     --link-journal=MODE  Link up guest journal, one of no, auto, guest, host\n"
               "  -j                      Equivalent to --link-journal=guest\n",
               program_invocation_short_name);

        return 0;
}
115
116 static int parse_argv(int argc, char *argv[]) {
117
118         enum {
119                 ARG_PRIVATE_NETWORK = 0x100,
120                 ARG_UUID,
121                 ARG_READ_ONLY,
122                 ARG_CAPABILITY,
123                 ARG_LINK_JOURNAL
124         };
125
126         static const struct option options[] = {
127                 { "help",            no_argument,       NULL, 'h'                 },
128                 { "directory",       required_argument, NULL, 'D'                 },
129                 { "user",            required_argument, NULL, 'u'                 },
130                 { "controllers",     required_argument, NULL, 'C'                 },
131                 { "private-network", no_argument,       NULL, ARG_PRIVATE_NETWORK },
132                 { "boot",            no_argument,       NULL, 'b'                 },
133                 { "uuid",            required_argument, NULL, ARG_UUID            },
134                 { "read-only",       no_argument,       NULL, ARG_READ_ONLY       },
135                 { "capability",      required_argument, NULL, ARG_CAPABILITY      },
136                 { "link-journal",    required_argument, NULL, ARG_LINK_JOURNAL    },
137                 { NULL,              0,                 NULL, 0                   }
138         };
139
140         int c;
141
142         assert(argc >= 0);
143         assert(argv);
144
145         while ((c = getopt_long(argc, argv, "+hD:u:C:bj", options, NULL)) >= 0) {
146
147                 switch (c) {
148
149                 case 'h':
150                         help();
151                         return 0;
152
153                 case 'D':
154                         free(arg_directory);
155                         arg_directory = canonicalize_file_name(optarg);
156                         if (!arg_directory) {
157                                 log_error("Failed to canonicalize root directory.");
158                                 return -ENOMEM;
159                         }
160
161                         break;
162
163                 case 'u':
164                         free(arg_user);
165                         if (!(arg_user = strdup(optarg))) {
166                                 log_error("Failed to duplicate user name.");
167                                 return -ENOMEM;
168                         }
169
170                         break;
171
172                 case 'C':
173                         strv_free(arg_controllers);
174                         arg_controllers = strv_split(optarg, ",");
175                         if (!arg_controllers) {
176                                 log_error("Failed to split controllers list.");
177                                 return -ENOMEM;
178                         }
179                         strv_uniq(arg_controllers);
180
181                         break;
182
183                 case ARG_PRIVATE_NETWORK:
184                         arg_private_network = true;
185                         break;
186
187                 case 'b':
188                         arg_boot = true;
189                         break;
190
191                 case ARG_UUID:
192                         arg_uuid = optarg;
193                         break;
194
195                 case ARG_READ_ONLY:
196                         arg_read_only = true;
197                         break;
198
199                 case ARG_CAPABILITY: {
200                         char *state, *word;
201                         size_t length;
202
203                         FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
204                                 cap_value_t cap;
205                                 char *t;
206
207                                 t = strndup(word, length);
208                                 if (!t) {
209                                         log_error("Out of memory.");
210                                         return -ENOMEM;
211                                 }
212
213                                 if (cap_from_name(t, &cap) < 0) {
214                                         log_error("Failed to parse capability %s.", t);
215                                         free(t);
216                                         return -EINVAL;
217                                 }
218
219                                 free(t);
220                                 arg_retain |= 1ULL << (uint64_t) cap;
221                         }
222
223                         break;
224                 }
225
226                 case 'j':
227                         arg_link_journal = LINK_GUEST;
228                         break;
229
230                 case ARG_LINK_JOURNAL:
231                         if (streq(optarg, "auto"))
232                                 arg_link_journal = LINK_AUTO;
233                         else if (streq(optarg, "no"))
234                                 arg_link_journal = LINK_NO;
235                         else if (streq(optarg, "guest"))
236                                 arg_link_journal = LINK_GUEST;
237                         else if (streq(optarg, "host"))
238                                 arg_link_journal = LINK_HOST;
239                         else {
240                                 log_error("Failed to parse link journal mode %s", optarg);
241                                 return -EINVAL;
242                         }
243
244                         break;
245
246                 case '?':
247                         return -EINVAL;
248
249                 default:
250                         log_error("Unknown option code %c", c);
251                         return -EINVAL;
252                 }
253         }
254
255         return 1;
256 }
257
/* Mounts the API file systems listed in the table below underneath 'dest'.
 *
 * Every entry is attempted even if an earlier one failed; only running out
 * of memory stops the loop. Mount failures of entries that are not marked
 * fatal are ignored. Returns 0 if nothing went wrong, otherwise the first
 * error that was recorded (negative errno style). */
static int mount_all(const char *dest) {

        typedef struct MountPoint {
                const char *what;
                const char *where;
                const char *type;
                const char *options;
                unsigned long flags;
                bool fatal;
        } MountPoint;

        static const MountPoint mount_table[] = {
                { "proc",      "/proc",     "proc",  NULL,       MS_NOSUID|MS_NOEXEC|MS_NODEV, true  },
                { "/proc/sys", "/proc/sys", "bind",  NULL,       MS_BIND,                      true  },   /* Bind mount first */
                { "/proc/sys", "/proc/sys", "bind",  NULL,       MS_BIND|MS_RDONLY|MS_REMOUNT, true  },   /* Then, make it r/o */
                { "/sys",      "/sys",      "bind",  NULL,       MS_BIND,                      true  },   /* Bind mount first */
                { "/sys",      "/sys",      "bind",  NULL,       MS_BIND|MS_RDONLY|MS_REMOUNT, true  },   /* Then, make it r/o */
                { "tmpfs",     "/dev",      "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME,     true  },
                { "/dev/pts",  "/dev/pts",  "bind",  NULL,       MS_BIND,                      true  },
                { "tmpfs",     "/run",      "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true  },
#ifdef HAVE_SELINUX
                { "/sys/fs/selinux", "/sys/fs/selinux", "bind", NULL, MS_BIND,                      false },  /* Bind mount first */
                { "/sys/fs/selinux", "/sys/fs/selinux", "bind", NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, false },  /* Then, make it r/o */
#endif
        };

        const MountPoint *entry;
        int first_error = 0;

        for (entry = mount_table; entry < mount_table + ELEMENTSOF(mount_table); entry++) {
                char *target;
                int is_mnt;

                if (asprintf(&target, "%s/%s", dest, entry->where) < 0) {
                        log_error("Out of memory");

                        if (first_error == 0)
                                first_error = -ENOMEM;

                        break;
                }

                is_mnt = path_is_mount_point(target, false);
                if (is_mnt < 0) {
                        log_error("Failed to detect whether %s is a mount point: %s", target, strerror(-is_mnt));

                        if (first_error == 0)
                                first_error = is_mnt;
                } else {
                        /* Best effort: if the directory cannot be created
                         * the mount() below reports the problem. */
                        mkdir_p_label(target, 0755);

                        if (mount(entry->what, target, entry->type, entry->flags, entry->options) < 0 &&
                            entry->fatal) {

                                log_error("mount(%s) failed: %m", target);

                                if (first_error == 0)
                                        first_error = -errno;
                        }
                }

                free(target);
        }

        return first_error;
}
331
/* Makes the host's time zone configuration visible in the container by bind
 * mounting /etc/localtime and /etc/timezone read-only over the files of the
 * same name below 'dest'. Mount failures are ignored ("if possible"); the
 * only error reported is -ENOMEM. */
static int setup_timezone(const char *dest) {
        static const char *const tz_files[] = {
                "/etc/localtime",
                "/etc/timezone"
        };
        size_t i;

        assert(dest);

        for (i = 0; i < sizeof(tz_files) / sizeof(tz_files[0]); i++) {
                const char *host_file = tz_files[i];
                char *guest_file;

                if (asprintf(&guest_file, "%s%s", dest, host_file) < 0) {
                        log_error("Out of memory");
                        return -ENOMEM;
                }

                /* A bind mount cannot be created read-only in one step, so
                 * remount it r/o once the plain bind mount worked. */
                if (mount(host_file, guest_file, "bind", MS_BIND, NULL) >= 0)
                        mount(host_file, guest_file, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);

                free(guest_file);
        }

        return 0;
}
360
361 static int setup_resolv_conf(const char *dest) {
362         char *where;
363
364         assert(dest);
365
366         if (arg_private_network)
367                 return 0;
368
369         /* Fix resolv.conf, if possible */
370         if (asprintf(&where, "%s/etc/resolv.conf", dest) < 0) {
371                 log_error("Out of memory");
372                 return -ENOMEM;
373         }
374
375         if (mount("/etc/resolv.conf", where, "bind", MS_BIND, NULL) >= 0)
376                 mount("/etc/resolv.conf", where, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);
377
378         free(where);
379
380         return 0;
381 }
382
383 static int copy_devnodes(const char *dest) {
384
385         static const char devnodes[] =
386                 "null\0"
387                 "zero\0"
388                 "full\0"
389                 "random\0"
390                 "urandom\0"
391                 "tty\0"
392                 "ptmx\0"
393                 "rtc0\0";
394
395         const char *d;
396         int r = 0;
397         mode_t u;
398
399         assert(dest);
400
401         u = umask(0000);
402
403         NULSTR_FOREACH(d, devnodes) {
404                 struct stat st;
405                 char *from = NULL, *to = NULL;
406
407                 asprintf(&from, "/dev/%s", d);
408                 asprintf(&to, "%s/dev/%s", dest, d);
409
410                 if (!from || !to) {
411                         log_error("Failed to allocate devnode path");
412
413                         free(from);
414                         free(to);
415
416                         from = to = NULL;
417
418                         if (r == 0)
419                                 r = -ENOMEM;
420
421                         break;
422                 }
423
424                 if (stat(from, &st) < 0) {
425
426                         if (errno != ENOENT) {
427                                 log_error("Failed to stat %s: %m", from);
428                                 if (r == 0)
429                                         r = -errno;
430                         }
431
432                 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
433
434                         log_error("%s is not a char or block device, cannot copy.", from);
435                         if (r == 0)
436                                 r = -EIO;
437
438                 } else if (mknod(to, st.st_mode, st.st_rdev) < 0) {
439
440                         log_error("mknod(%s) failed: %m", dest);
441                         if (r == 0)
442                                 r = -errno;
443                 }
444
445                 free(from);
446                 free(to);
447         }
448
449         umask(u);
450
451         return r;
452 }
453
/* Provides <dest>/dev/console by bind mounting the TTY 'console' over a
 * freshly created device node.
 *
 * The TTY is first set to mode 0600, owner 0:0. Returns the result of
 * chmod_and_chown() on success and a negative errno-style code on failure;
 * the umask is restored in either case. */
static int setup_dev_console(const char *dest, const char *console) {
        struct stat tty_st;
        char *node = NULL;
        mode_t saved_umask;
        int ret;

        assert(dest);
        assert(console);

        saved_umask = umask(0000);

        if (stat(console, &tty_st) < 0) {
                log_error("Failed to stat %s: %m", console);
                ret = -errno;
                goto out;
        }

        if (!S_ISCHR(tty_st.st_mode)) {
                log_error("/dev/console is not a char device.");
                ret = -EIO;
                goto out;
        }

        ret = chmod_and_chown(console, 0600, 0, 0);
        if (ret < 0) {
                log_error("Failed to correct access mode for TTY: %s", strerror(-ret));
                goto out;
        }

        if (asprintf(&node, "%s/dev/console", dest) < 0) {
                log_error("Out of memory");
                ret = -ENOMEM;
                goto out;
        }

        /* ptys can only live on a pts file system, hence the right tty has
         * to be bind mounted to /dev/console. The mount needs something to
         * sit on, so a device node is created first. Its major/minor is
         * taken from the tty, although that does not really matter as the
         * node is mounted over right away. */

        if (mknod(node, (tty_st.st_mode & ~07777) | 0600, tty_st.st_rdev) < 0) {
                log_error("mknod() for /dev/console failed: %m");
                ret = -errno;
                goto out;
        }

        if (mount(console, node, "bind", MS_BIND, NULL) < 0) {
                log_error("Bind mount for /dev/console failed: %m");
                ret = -errno;
                goto out;
        }

out:
        free(node);
        umask(saved_umask);

        return ret;
}
513
/* Gives the container a private /proc/kmsg backed by a FIFO.
 *
 * The FIFO is created as <dest>/dev/kmsg, bind mounted onto
 * <dest>/proc/kmsg, and an open descriptor of it is queued on 'kmsg_socket'
 * as an SCM_RIGHTS message. Afterwards the /dev/kmsg name is removed again.
 *
 * Returns the result of chmod_and_chown() on success and a negative
 * errno-style code on failure; the umask is restored in either case. */
static int setup_kmsg(const char *dest, int kmsg_socket) {
        char *fifo_path = NULL, *proc_path = NULL;
        int ret, fifo_fd, sent;
        mode_t saved_umask;
        union {
                struct cmsghdr cmsghdr;
                uint8_t buf[CMSG_SPACE(sizeof(int))];
        } control;
        struct msghdr msg;
        struct cmsghdr *hdr;

        assert(dest);
        assert(kmsg_socket >= 0);

        saved_umask = umask(0000);

        /* The FIFO is created under the name /dev/kmsg, bind mounted to
         * /proc/kmsg and then unlinked. For readers a FIFO is close enough
         * to /proc/kmsg. For writers it is not a good stand-in for
         * /dev/kmsg, because writing to a FIFO blocks while nobody reads.
         * To keep containers from deadlocking on that, /dev/kmsg is simply
         * not offered inside the container. */
        if (asprintf(&fifo_path, "%s/dev/kmsg", dest) < 0 ||
            asprintf(&proc_path, "%s/proc/kmsg", dest) < 0) {
                log_error("Out of memory");
                ret = -ENOMEM;
                goto out;
        }

        if (mkfifo(fifo_path, 0600) < 0) {
                log_error("mkfifo() for /dev/kmsg failed: %m");
                ret = -errno;
                goto out;
        }

        ret = chmod_and_chown(fifo_path, 0600, 0, 0);
        if (ret < 0) {
                log_error("Failed to correct access mode for /dev/kmsg: %s", strerror(-ret));
                goto out;
        }

        if (mount(fifo_path, proc_path, "bind", MS_BIND, NULL) < 0) {
                log_error("Bind mount for /proc/kmsg failed: %m");
                ret = -errno;
                goto out;
        }

        fifo_fd = open(fifo_path, O_RDWR|O_NDELAY|O_CLOEXEC);
        if (fifo_fd < 0) {
                log_error("Failed to open fifo: %m");
                ret = -errno;
                goto out;
        }

        /* Build a message that carries nothing but the descriptor. */
        zero(msg);
        zero(control);

        msg.msg_control = &control;
        msg.msg_controllen = sizeof(control);

        hdr = CMSG_FIRSTHDR(&msg);
        hdr->cmsg_level = SOL_SOCKET;
        hdr->cmsg_type = SCM_RIGHTS;
        hdr->cmsg_len = CMSG_LEN(sizeof(int));
        memcpy(CMSG_DATA(hdr), &fifo_fd, sizeof(int));

        msg.msg_controllen = hdr->cmsg_len;

        /* Park the fd in the socket: the copy queued there keeps the FIFO
         * open for as long as the child runs, so our own descriptor can be
         * closed right after sending. */
        sent = sendmsg(kmsg_socket, &msg, MSG_DONTWAIT|MSG_NOSIGNAL);
        close_nointr_nofail(fifo_fd);

        if (sent < 0) {
                log_error("Failed to send FIFO fd: %m");
                ret = -errno;
                goto out;
        }

        /* And now make the FIFO unavailable as /dev/kmsg... */
        unlink(fifo_path);

out:
        free(fifo_path);
        free(proc_path);
        umask(saved_umask);

        return ret;
}
609
610 static int setup_hostname(void) {
611         char *hn;
612         int r = 0;
613
614         hn = path_get_file_name(arg_directory);
615         if (hn) {
616                 hn = strdup(hn);
617                 if (!hn)
618                         return -ENOMEM;
619
620                 hostname_cleanup(hn);
621
622                 if (!isempty(hn))
623                         if (sethostname(hn, strlen(hn)) < 0)
624                                 r = -errno;
625
626                 free(hn);
627         }
628
629         return r;
630 }
631
632 static int setup_journal(const char *directory) {
633         sd_id128_t machine_id;
634         char *p = NULL, *b = NULL, *l, *q = NULL, *d = NULL;
635         int r;
636
637         if (arg_link_journal == LINK_NO)
638                 return 0;
639
640         p = strappend(directory, "/etc/machine-id");
641         if (!p) {
642                 log_error("Out of memory");
643                 r = -ENOMEM;
644                 goto finish;
645         }
646
647         r = read_one_line_file(p, &b);
648         if (r == -ENOENT && arg_link_journal == LINK_AUTO) {
649                 r = 0;
650                 goto finish;
651         } else if (r < 0) {
652                 log_error("Failed to read machine ID: %s", strerror(-r));
653                 return r;
654         }
655
656         l = strstrip(b);
657         if (isempty(l) && arg_link_journal == LINK_AUTO) {
658                 r = 0;
659                 goto finish;
660         }
661
662         /* Verify validaty */
663         r = sd_id128_from_string(l, &machine_id);
664         if (r < 0) {
665                 log_error("Failed to parse machine ID: %s", strerror(-r));
666                 goto finish;
667         }
668
669         free(p);
670         p = strappend("/var/log/journal/", l);
671         q = strjoin(directory, "/var/log/journal/", l, NULL);
672         if (!p || !q) {
673                 log_error("Out of memory");
674                 r = -ENOMEM;
675                 goto finish;
676         }
677
678         if (path_is_mount_point(p, false) > 0 ||
679             path_is_mount_point(q, false) > 0) {
680                 if (arg_link_journal != LINK_AUTO) {
681                         log_error("Journal already a mount point, refusing.");
682                         r = -EEXIST;
683                         goto finish;
684                 }
685
686                 r = 0;
687                 goto finish;
688         }
689
690         r = readlink_and_make_absolute(p, &d);
691         if (r >= 0) {
692                 if ((arg_link_journal == LINK_GUEST ||
693                      arg_link_journal == LINK_AUTO) &&
694                     path_equal(d, q)) {
695
696                         mkdir_p(q, 0755);
697
698                         r = 0;
699                         goto finish;
700                 }
701
702                 if (unlink(p) < 0) {
703                         log_error("Failed to remove symlink %s: %m", p);
704                         r = -errno;
705                         goto finish;
706                 }
707         } else if (r == -EINVAL) {
708
709                 if (arg_link_journal == LINK_GUEST &&
710                     rmdir(p) < 0) {
711
712                         if (errno == ENOTDIR)
713                                 log_error("%s already exists and is neither symlink nor directory.", p);
714                         else {
715                                 log_error("Failed to remove %s: %m", p);
716                                 r = -errno;
717                         }
718
719                         goto finish;
720                 }
721         } else if (r != -ENOENT) {
722                 log_error("readlink(%s) failed: %m", p);
723                 goto finish;
724         }
725
726         if (arg_link_journal == LINK_GUEST) {
727
728                 if (symlink(q, p) < 0) {
729                         log_error("Failed to symlink %s to %s: %m", q, p);
730                         r = -errno;
731                         goto finish;
732                 }
733
734                 mkdir_p(q, 0755);
735
736                 r = 0;
737                 goto finish;
738         }
739
740         if (arg_link_journal == LINK_HOST) {
741                 r = mkdir_p(p, 0755);
742                 if (r < 0) {
743                         log_error("Failed to create %s: %m", p);
744                         goto finish;
745                 }
746
747         } else if (access(p, F_OK) < 0) {
748                 r = 0;
749                 goto finish;
750         }
751
752         if (dir_is_empty(q) == 0) {
753                 log_error("%s not empty.", q);
754                 r = -ENOTEMPTY;
755                 goto finish;
756         }
757
758         r = mkdir_p(q, 0755);
759         if (r < 0) {
760                 log_error("Failed to create %s: %m", q);
761                 goto finish;
762         }
763
764         if (mount(p, q, "bind", MS_BIND, NULL) < 0) {
765                 log_error("Failed to bind mount journal from host into guest: %m");
766                 r = -errno;
767                 goto finish;
768         }
769
770         r = 0;
771
772 finish:
773         free(p);
774         free(q);
775         free(d);
776         free(b);
777         return r;
778
779 }
780
781 static int drop_capabilities(void) {
782         return capability_bounding_set_drop(~arg_retain, false);
783 }
784
/* Tells whether 'path' looks like the root of an OS tree. The presence of
 * <path>/bin/sh is used as the indicator.
 *
 * Returns 1 if the file exists, 0 if it cannot be accessed, and -ENOMEM if
 * the path could not be allocated. */
static int is_os_tree(const char *path) {
        char *shell;
        int found;

        if (asprintf(&shell, "%s/bin/sh", path) < 0)
                return -ENOMEM;

        found = access(shell, F_OK) >= 0;
        free(shell);

        return found ? 1 : 0;
}
798
799 static int process_pty(int master, sigset_t *mask) {
800
801         char in_buffer[LINE_MAX], out_buffer[LINE_MAX];
802         size_t in_buffer_full = 0, out_buffer_full = 0;
803         struct epoll_event stdin_ev, stdout_ev, master_ev, signal_ev;
804         bool stdin_readable = false, stdout_writable = false, master_readable = false, master_writable = false;
805         int ep = -1, signal_fd = -1, r;
806
807         fd_nonblock(STDIN_FILENO, 1);
808         fd_nonblock(STDOUT_FILENO, 1);
809         fd_nonblock(master, 1);
810
811         if ((signal_fd = signalfd(-1, mask, SFD_NONBLOCK|SFD_CLOEXEC)) < 0) {
812                 log_error("signalfd(): %m");
813                 r = -errno;
814                 goto finish;
815         }
816
817         if ((ep = epoll_create1(EPOLL_CLOEXEC)) < 0) {
818                 log_error("Failed to create epoll: %m");
819                 r = -errno;
820                 goto finish;
821         }
822
823         zero(stdin_ev);
824         stdin_ev.events = EPOLLIN|EPOLLET;
825         stdin_ev.data.fd = STDIN_FILENO;
826
827         zero(stdout_ev);
828         stdout_ev.events = EPOLLOUT|EPOLLET;
829         stdout_ev.data.fd = STDOUT_FILENO;
830
831         zero(master_ev);
832         master_ev.events = EPOLLIN|EPOLLOUT|EPOLLET;
833         master_ev.data.fd = master;
834
835         zero(signal_ev);
836         signal_ev.events = EPOLLIN;
837         signal_ev.data.fd = signal_fd;
838
839         if (epoll_ctl(ep, EPOLL_CTL_ADD, STDIN_FILENO, &stdin_ev) < 0 ||
840             epoll_ctl(ep, EPOLL_CTL_ADD, STDOUT_FILENO, &stdout_ev) < 0 ||
841             epoll_ctl(ep, EPOLL_CTL_ADD, master, &master_ev) < 0 ||
842             epoll_ctl(ep, EPOLL_CTL_ADD, signal_fd, &signal_ev) < 0) {
843                 log_error("Failed to regiser fds in epoll: %m");
844                 r = -errno;
845                 goto finish;
846         }
847
848         for (;;) {
849                 struct epoll_event ev[16];
850                 ssize_t k;
851                 int i, nfds;
852
853                 if ((nfds = epoll_wait(ep, ev, ELEMENTSOF(ev), -1)) < 0) {
854
855                         if (errno == EINTR || errno == EAGAIN)
856                                 continue;
857
858                         log_error("epoll_wait(): %m");
859                         r = -errno;
860                         goto finish;
861                 }
862
863                 assert(nfds >= 1);
864
865                 for (i = 0; i < nfds; i++) {
866                         if (ev[i].data.fd == STDIN_FILENO) {
867
868                                 if (ev[i].events & (EPOLLIN|EPOLLHUP))
869                                         stdin_readable = true;
870
871                         } else if (ev[i].data.fd == STDOUT_FILENO) {
872
873                                 if (ev[i].events & (EPOLLOUT|EPOLLHUP))
874                                         stdout_writable = true;
875
876                         } else if (ev[i].data.fd == master) {
877
878                                 if (ev[i].events & (EPOLLIN|EPOLLHUP))
879                                         master_readable = true;
880
881                                 if (ev[i].events & (EPOLLOUT|EPOLLHUP))
882                                         master_writable = true;
883
884                         } else if (ev[i].data.fd == signal_fd) {
885                                 struct signalfd_siginfo sfsi;
886                                 ssize_t n;
887
888                                 if ((n = read(signal_fd, &sfsi, sizeof(sfsi))) != sizeof(sfsi)) {
889
890                                         if (n >= 0) {
891                                                 log_error("Failed to read from signalfd: invalid block size");
892                                                 r = -EIO;
893                                                 goto finish;
894                                         }
895
896                                         if (errno != EINTR && errno != EAGAIN) {
897                                                 log_error("Failed to read from signalfd: %m");
898                                                 r = -errno;
899                                                 goto finish;
900                                         }
901                                 } else {
902
903                                         if (sfsi.ssi_signo == SIGWINCH) {
904                                                 struct winsize ws;
905
906                                                 /* The window size changed, let's forward that. */
907                                                 if (ioctl(STDIN_FILENO, TIOCGWINSZ, &ws) >= 0)
908                                                         ioctl(master, TIOCSWINSZ, &ws);
909                                         } else {
910                                                 r = 0;
911                                                 goto finish;
912                                         }
913                                 }
914                         }
915                 }
916
917                 while ((stdin_readable && in_buffer_full <= 0) ||
918                        (master_writable && in_buffer_full > 0) ||
919                        (master_readable && out_buffer_full <= 0) ||
920                        (stdout_writable && out_buffer_full > 0)) {
921
922                         if (stdin_readable && in_buffer_full < LINE_MAX) {
923
924                                 if ((k = read(STDIN_FILENO, in_buffer + in_buffer_full, LINE_MAX - in_buffer_full)) < 0) {
925
926                                         if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
927                                                 stdin_readable = false;
928                                         else {
929                                                 log_error("read(): %m");
930                                                 r = -errno;
931                                                 goto finish;
932                                         }
933                                 } else
934                                         in_buffer_full += (size_t) k;
935                         }
936
937                         if (master_writable && in_buffer_full > 0) {
938
939                                 if ((k = write(master, in_buffer, in_buffer_full)) < 0) {
940
941                                         if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
942                                                 master_writable = false;
943                                         else {
944                                                 log_error("write(): %m");
945                                                 r = -errno;
946                                                 goto finish;
947                                         }
948
949                                 } else {
950                                         assert(in_buffer_full >= (size_t) k);
951                                         memmove(in_buffer, in_buffer + k, in_buffer_full - k);
952                                         in_buffer_full -= k;
953                                 }
954                         }
955
956                         if (master_readable && out_buffer_full < LINE_MAX) {
957
958                                 if ((k = read(master, out_buffer + out_buffer_full, LINE_MAX - out_buffer_full)) < 0) {
959
960                                         if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
961                                                 master_readable = false;
962                                         else {
963                                                 log_error("read(): %m");
964                                                 r = -errno;
965                                                 goto finish;
966                                         }
967                                 }  else
968                                         out_buffer_full += (size_t) k;
969                         }
970
971                         if (stdout_writable && out_buffer_full > 0) {
972
973                                 if ((k = write(STDOUT_FILENO, out_buffer, out_buffer_full)) < 0) {
974
975                                         if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
976                                                 stdout_writable = false;
977                                         else {
978                                                 log_error("write(): %m");
979                                                 r = -errno;
980                                                 goto finish;
981                                         }
982
983                                 } else {
984                                         assert(out_buffer_full >= (size_t) k);
985                                         memmove(out_buffer, out_buffer + k, out_buffer_full - k);
986                                         out_buffer_full -= k;
987                                 }
988                         }
989                 }
990         }
991
992 finish:
993         if (ep >= 0)
994                 close_nointr_nofail(ep);
995
996         if (signal_fd >= 0)
997                 close_nointr_nofail(signal_fd);
998
999         return r;
1000 }
1001
1002 int main(int argc, char *argv[]) {
1003         pid_t pid = 0;
1004         int r = EXIT_FAILURE, k;
1005         char *oldcg = NULL, *newcg = NULL;
1006         char **controller = NULL;
1007         int master = -1;
1008         const char *console = NULL;
1009         struct termios saved_attr, raw_attr;
1010         sigset_t mask;
1011         bool saved_attr_valid = false;
1012         struct winsize ws;
1013         int kmsg_socket_pair[2] = { -1, -1 };
1014
1015         log_parse_environment();
1016         log_open();
1017
1018         if ((r = parse_argv(argc, argv)) <= 0)
1019                 goto finish;
1020
1021         if (arg_directory) {
1022                 char *p;
1023
1024                 p = path_make_absolute_cwd(arg_directory);
1025                 free(arg_directory);
1026                 arg_directory = p;
1027         } else
1028                 arg_directory = get_current_dir_name();
1029
1030         if (!arg_directory) {
1031                 log_error("Failed to determine path");
1032                 goto finish;
1033         }
1034
1035         path_kill_slashes(arg_directory);
1036
1037         if (geteuid() != 0) {
1038                 log_error("Need to be root.");
1039                 goto finish;
1040         }
1041
1042         if (sd_booted() <= 0) {
1043                 log_error("Not running on a systemd system.");
1044                 goto finish;
1045         }
1046
1047         if (path_equal(arg_directory, "/")) {
1048                 log_error("Spawning container on root directory not supported.");
1049                 goto finish;
1050         }
1051
1052         if (is_os_tree(arg_directory) <= 0) {
1053                 log_error("Directory %s doesn't look like an OS root directory. Refusing.", arg_directory);
1054                 goto finish;
1055         }
1056
1057         if ((k = cg_get_by_pid(SYSTEMD_CGROUP_CONTROLLER, 0, &oldcg)) < 0) {
1058                 log_error("Failed to determine current cgroup: %s", strerror(-k));
1059                 goto finish;
1060         }
1061
1062         if (asprintf(&newcg, "%s/nspawn-%lu", oldcg, (unsigned long) getpid()) < 0) {
1063                 log_error("Failed to allocate cgroup path.");
1064                 goto finish;
1065         }
1066
1067         k = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, newcg, 0);
1068         if (k < 0)  {
1069                 log_error("Failed to create cgroup: %s", strerror(-k));
1070                 goto finish;
1071         }
1072
1073         STRV_FOREACH(controller,arg_controllers) {
1074                 k = cg_create_and_attach(*controller, newcg, 0);
1075                 if (k < 0)
1076                         log_warning("Failed to create cgroup in controller %s: %s", *controller, strerror(-k));
1077         }
1078
1079         if ((master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY)) < 0) {
1080                 log_error("Failed to acquire pseudo tty: %m");
1081                 goto finish;
1082         }
1083
1084         if (!(console = ptsname(master))) {
1085                 log_error("Failed to determine tty name: %m");
1086                 goto finish;
1087         }
1088
1089         log_info("Spawning namespace container on %s (console is %s).", arg_directory, console);
1090
1091         if (ioctl(STDIN_FILENO, TIOCGWINSZ, &ws) >= 0)
1092                 ioctl(master, TIOCSWINSZ, &ws);
1093
1094         if (unlockpt(master) < 0) {
1095                 log_error("Failed to unlock tty: %m");
1096                 goto finish;
1097         }
1098
1099         if (tcgetattr(STDIN_FILENO, &saved_attr) < 0) {
1100                 log_error("Failed to get terminal attributes: %m");
1101                 goto finish;
1102         }
1103
1104         saved_attr_valid = true;
1105
1106         raw_attr = saved_attr;
1107         cfmakeraw(&raw_attr);
1108         raw_attr.c_lflag &= ~ECHO;
1109
1110         if (tcsetattr(STDIN_FILENO, TCSANOW, &raw_attr) < 0) {
1111                 log_error("Failed to set terminal attributes: %m");
1112                 goto finish;
1113         }
1114
1115         if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_NONBLOCK|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
1116                 log_error("Failed to create kmsg socket pair");
1117                 goto finish;
1118         }
1119
1120         assert_se(sigemptyset(&mask) == 0);
1121         sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
1122         assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
1123
1124         pid = syscall(__NR_clone, SIGCHLD|CLONE_NEWIPC|CLONE_NEWNS|CLONE_NEWPID|CLONE_NEWUTS|(arg_private_network ? CLONE_NEWNET : 0), NULL);
1125         if (pid < 0) {
1126                 if (errno == EINVAL)
1127                         log_error("clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
1128                 else
1129                         log_error("clone() failed: %m");
1130
1131                 goto finish;
1132         }
1133
1134         if (pid == 0) {
1135                 /* child */
1136
1137                 const char *home = NULL;
1138                 uid_t uid = (uid_t) -1;
1139                 gid_t gid = (gid_t) -1;
1140                 const char *envp[] = {
1141                         "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
1142                         "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
1143                         NULL, /* TERM */
1144                         NULL, /* HOME */
1145                         NULL, /* USER */
1146                         NULL, /* LOGNAME */
1147                         NULL, /* container_uuid */
1148                         NULL
1149                 };
1150
1151                 envp[2] = strv_find_prefix(environ, "TERM=");
1152
1153                 close_nointr_nofail(master);
1154
1155                 close_nointr(STDIN_FILENO);
1156                 close_nointr(STDOUT_FILENO);
1157                 close_nointr(STDERR_FILENO);
1158
1159                 close_all_fds(&kmsg_socket_pair[1], 1);
1160
1161                 reset_all_signal_handlers();
1162
1163                 assert_se(sigemptyset(&mask) == 0);
1164                 assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
1165
1166                 if (setsid() < 0)
1167                         goto child_fail;
1168
1169                 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0)
1170                         goto child_fail;
1171
1172                 /* Mark / as private, in case somebody marked it shared */
1173                 if (mount(NULL, "/", NULL, MS_PRIVATE|MS_REC, NULL) < 0)
1174                         goto child_fail;
1175
1176                 /* Turn directory into bind mount */
1177                 if (mount(arg_directory, arg_directory, "bind", MS_BIND, NULL) < 0) {
1178                         log_error("Failed to make bind mount.");
1179                         goto child_fail;
1180                 }
1181
1182                 if (arg_read_only)
1183                         if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL) < 0) {
1184                                 log_error("Failed to make read-only.");
1185                                 goto child_fail;
1186                         }
1187
1188                 if (mount_all(arg_directory) < 0)
1189                         goto child_fail;
1190
1191                 if (copy_devnodes(arg_directory) < 0)
1192                         goto child_fail;
1193
1194                 if (setup_dev_console(arg_directory, console) < 0)
1195                         goto child_fail;
1196
1197                 if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
1198                         goto child_fail;
1199
1200                 close_nointr_nofail(kmsg_socket_pair[1]);
1201
1202                 if (setup_timezone(arg_directory) < 0)
1203                         goto child_fail;
1204
1205                 if (setup_resolv_conf(arg_directory) < 0)
1206                         goto child_fail;
1207
1208                 if (setup_journal(arg_directory) < 0)
1209                         goto child_fail;
1210
1211                 if (chdir(arg_directory) < 0) {
1212                         log_error("chdir(%s) failed: %m", arg_directory);
1213                         goto child_fail;
1214                 }
1215
1216                 if (open_terminal("dev/console", O_RDWR) != STDIN_FILENO ||
1217                     dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
1218                     dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO)
1219                         goto child_fail;
1220
1221                 if (mount(arg_directory, "/", "bind", MS_MOVE, NULL) < 0) {
1222                         log_error("mount(MS_BIND) failed: %m");
1223                         goto child_fail;
1224                 }
1225
1226                 if (chroot(".") < 0) {
1227                         log_error("chroot() failed: %m");
1228                         goto child_fail;
1229                 }
1230
1231                 if (chdir("/") < 0) {
1232                         log_error("chdir() failed: %m");
1233                         goto child_fail;
1234                 }
1235
1236                 umask(0022);
1237
1238                 loopback_setup();
1239
1240                 if (drop_capabilities() < 0) {
1241                         log_error("drop_capabilities() failed: %m");
1242                         goto child_fail;
1243                 }
1244
1245                 if (arg_user) {
1246
1247                         if (get_user_creds((const char**)&arg_user, &uid, &gid, &home, NULL) < 0) {
1248                                 log_error("get_user_creds() failed: %m");
1249                                 goto child_fail;
1250                         }
1251
1252                         if (mkdir_parents_label(home, 0775) < 0) {
1253                                 log_error("mkdir_parents_label() failed: %m");
1254                                 goto child_fail;
1255                         }
1256
1257                         if (mkdir_safe_label(home, 0775, uid, gid) < 0) {
1258                                 log_error("mkdir_safe_label() failed: %m");
1259                                 goto child_fail;
1260                         }
1261
1262                         if (initgroups((const char*)arg_user, gid) < 0) {
1263                                 log_error("initgroups() failed: %m");
1264                                 goto child_fail;
1265                         }
1266
1267                         if (setresgid(gid, gid, gid) < 0) {
1268                                 log_error("setregid() failed: %m");
1269                                 goto child_fail;
1270                         }
1271
1272                         if (setresuid(uid, uid, uid) < 0) {
1273                                 log_error("setreuid() failed: %m");
1274                                 goto child_fail;
1275                         }
1276                 }
1277
1278                 if ((asprintf((char**)(envp + 3), "HOME=%s", home ? home: "/root") < 0) ||
1279                     (asprintf((char**)(envp + 4), "USER=%s", arg_user ? arg_user : "root") < 0) ||
1280                     (asprintf((char**)(envp + 5), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
1281                     log_error("Out of memory");
1282                     goto child_fail;
1283                 }
1284
1285                 if (arg_uuid) {
1286                         if (asprintf((char**)(envp + 6), "container_uuid=%s", arg_uuid) < 0) {
1287                                 log_error("Out of memory");
1288                                 goto child_fail;
1289                         }
1290                 }
1291
1292                 setup_hostname();
1293
1294                 if (arg_boot) {
1295                         char **a;
1296                         size_t l;
1297
1298                         /* Automatically search for the init system */
1299
1300                         l = 1 + argc - optind;
1301                         a = newa(char*, l + 1);
1302                         memcpy(a + 1, argv + optind, l * sizeof(char*));
1303
1304                         a[0] = (char*) "/usr/lib/systemd/systemd";
1305                         execve(a[0], a, (char**) envp);
1306
1307                         a[0] = (char*) "/lib/systemd/systemd";
1308                         execve(a[0], a, (char**) envp);
1309
1310                         a[0] = (char*) "/sbin/init";
1311                         execve(a[0], a, (char**) envp);
1312                 } else if (argc > optind)
1313                         execvpe(argv[optind], argv + optind, (char**) envp);
1314                 else {
1315                         chdir(home ? home : "/root");
1316                         execle("/bin/bash", "-bash", NULL, (char**) envp);
1317                 }
1318
1319                 log_error("execv() failed: %m");
1320
1321         child_fail:
1322                 _exit(EXIT_FAILURE);
1323         }
1324
1325         if (process_pty(master, &mask) < 0)
1326                 goto finish;
1327
1328         if (saved_attr_valid) {
1329                 tcsetattr(STDIN_FILENO, TCSANOW, &saved_attr);
1330                 saved_attr_valid = false;
1331         }
1332
1333         r = wait_for_terminate_and_warn(argc > optind ? argv[optind] : "bash", pid);
1334
1335         if (r < 0)
1336                 r = EXIT_FAILURE;
1337
1338 finish:
1339         if (saved_attr_valid)
1340                 tcsetattr(STDIN_FILENO, TCSANOW, &saved_attr);
1341
1342         if (master >= 0)
1343                 close_nointr_nofail(master);
1344
1345         close_pipe(kmsg_socket_pair);
1346
1347         if (oldcg)
1348                 cg_attach(SYSTEMD_CGROUP_CONTROLLER, oldcg, 0);
1349
1350         if (newcg)
1351                 cg_kill_recursive_and_wait(SYSTEMD_CGROUP_CONTROLLER, newcg, true);
1352
1353         free(arg_directory);
1354         strv_free(arg_controllers);
1355         free(oldcg);
1356         free(newcg);
1357
1358         return r;
1359 }