chiark / gitweb /
ab7a239ed59f7d7df91425a675e4a2feaa195722
[elogind.git] / src / nspawn / nspawn.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4   This file is part of systemd.
5
6   Copyright 2010 Lennart Poettering
7
8   systemd is free software; you can redistribute it and/or modify it
9   under the terms of the GNU Lesser General Public License as published by
10   the Free Software Foundation; either version 2.1 of the License, or
11   (at your option) any later version.
12
13   systemd is distributed in the hope that it will be useful, but
14   WITHOUT ANY WARRANTY; without even the implied warranty of
15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16   Lesser General Public License for more details.
17
18   You should have received a copy of the GNU Lesser General Public License
19   along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <signal.h>
23 #include <sched.h>
24 #include <unistd.h>
25 #include <sys/types.h>
26 #include <sys/syscall.h>
27 #include <sys/mount.h>
28 #include <sys/wait.h>
29 #include <stdlib.h>
30 #include <string.h>
31 #include <stdio.h>
32 #include <errno.h>
33 #include <sys/prctl.h>
34 #include <sys/capability.h>
35 #include <getopt.h>
36 #include <sys/epoll.h>
37 #include <termios.h>
38 #include <sys/signalfd.h>
39 #include <grp.h>
40 #include <linux/fs.h>
41 #include <sys/un.h>
42 #include <sys/socket.h>
43
44 #include <systemd/sd-daemon.h>
45
46 #include "log.h"
47 #include "util.h"
48 #include "mkdir.h"
49 #include "macro.h"
50 #include "audit.h"
51 #include "missing.h"
52 #include "cgroup-util.h"
53 #include "strv.h"
54 #include "path-util.h"
55 #include "loopback-setup.h"
56 #include "sd-id128.h"
57 #include "dev-setup.h"
58
/* Journal linking mode, selected with --link-journal= (or -j) and
 * consumed by setup_journal(). */
typedef enum LinkJournal {
        LINK_NO    = 0,   /* Leave the journal directories alone */
        LINK_AUTO  = 1,   /* Default: link only where that can be done quietly */
        LINK_HOST  = 2,   /* Journal lives on the host, bind mounted into the guest */
        LINK_GUEST = 3    /* Journal lives in the guest, symlinked from the host */
} LinkJournal;
65
66 static char *arg_directory = NULL;
67 static char *arg_user = NULL;
68 static char **arg_controllers = NULL;
69 static char *arg_uuid = NULL;
70 static bool arg_private_network = false;
71 static bool arg_read_only = false;
72 static bool arg_boot = false;
73 static LinkJournal arg_link_journal = LINK_AUTO;
74 static uint64_t arg_retain =
75         (1ULL << CAP_CHOWN) |
76         (1ULL << CAP_DAC_OVERRIDE) |
77         (1ULL << CAP_DAC_READ_SEARCH) |
78         (1ULL << CAP_FOWNER) |
79         (1ULL << CAP_FSETID) |
80         (1ULL << CAP_IPC_OWNER) |
81         (1ULL << CAP_KILL) |
82         (1ULL << CAP_LEASE) |
83         (1ULL << CAP_LINUX_IMMUTABLE) |
84         (1ULL << CAP_NET_BIND_SERVICE) |
85         (1ULL << CAP_NET_BROADCAST) |
86         (1ULL << CAP_NET_RAW) |
87         (1ULL << CAP_SETGID) |
88         (1ULL << CAP_SETFCAP) |
89         (1ULL << CAP_SETPCAP) |
90         (1ULL << CAP_SETUID) |
91         (1ULL << CAP_SYS_ADMIN) |
92         (1ULL << CAP_SYS_CHROOT) |
93         (1ULL << CAP_SYS_NICE) |
94         (1ULL << CAP_SYS_PTRACE) |
95         (1ULL << CAP_SYS_TTY_CONFIG) |
96         (1ULL << CAP_SYS_RESOURCE) |
97         (1ULL << CAP_SYS_BOOT);
98
/* Prints the command line usage summary to stdout. Always returns 0. */
static int help(void) {

        /* Note: -j is documented as "guest" here because that is the
         * mode parse_argv() actually selects for it. */
        printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
               "Spawn a minimal namespace container for debugging, testing and building.\n\n"
               "  -h --help               Show this help\n"
               "  -D --directory=NAME     Root directory for the container\n"
               "  -b --boot               Boot up full system (i.e. invoke init)\n"
               "  -u --user=USER          Run the command under specified user or uid\n"
               "  -C --controllers=LIST   Put the container in specified comma-separated cgroup hierarchies\n"
               "     --uuid=UUID          Set a specific machine UUID for the container\n"
               "     --private-network    Disable network in container\n"
               "     --read-only          Mount the root directory read-only\n"
               "     --capability=CAP     In addition to the default, retain specified capability\n"
               "     --link-journal=MODE  Link up guest journal, one of no, auto, guest, host\n"
               "  -j                      Equivalent to --link-journal=guest\n",
               program_invocation_short_name);

        return 0;
}
118
119 static int parse_argv(int argc, char *argv[]) {
120
121         enum {
122                 ARG_PRIVATE_NETWORK = 0x100,
123                 ARG_UUID,
124                 ARG_READ_ONLY,
125                 ARG_CAPABILITY,
126                 ARG_LINK_JOURNAL
127         };
128
129         static const struct option options[] = {
130                 { "help",            no_argument,       NULL, 'h'                 },
131                 { "directory",       required_argument, NULL, 'D'                 },
132                 { "user",            required_argument, NULL, 'u'                 },
133                 { "controllers",     required_argument, NULL, 'C'                 },
134                 { "private-network", no_argument,       NULL, ARG_PRIVATE_NETWORK },
135                 { "boot",            no_argument,       NULL, 'b'                 },
136                 { "uuid",            required_argument, NULL, ARG_UUID            },
137                 { "read-only",       no_argument,       NULL, ARG_READ_ONLY       },
138                 { "capability",      required_argument, NULL, ARG_CAPABILITY      },
139                 { "link-journal",    required_argument, NULL, ARG_LINK_JOURNAL    },
140                 { NULL,              0,                 NULL, 0                   }
141         };
142
143         int c;
144
145         assert(argc >= 0);
146         assert(argv);
147
148         while ((c = getopt_long(argc, argv, "+hD:u:C:bj", options, NULL)) >= 0) {
149
150                 switch (c) {
151
152                 case 'h':
153                         help();
154                         return 0;
155
156                 case 'D':
157                         free(arg_directory);
158                         arg_directory = canonicalize_file_name(optarg);
159                         if (!arg_directory) {
160                                 log_error("Failed to canonicalize root directory.");
161                                 return -ENOMEM;
162                         }
163
164                         break;
165
166                 case 'u':
167                         free(arg_user);
168                         if (!(arg_user = strdup(optarg))) {
169                                 log_error("Failed to duplicate user name.");
170                                 return -ENOMEM;
171                         }
172
173                         break;
174
175                 case 'C':
176                         strv_free(arg_controllers);
177                         arg_controllers = strv_split(optarg, ",");
178                         if (!arg_controllers) {
179                                 log_error("Failed to split controllers list.");
180                                 return -ENOMEM;
181                         }
182                         strv_uniq(arg_controllers);
183
184                         break;
185
186                 case ARG_PRIVATE_NETWORK:
187                         arg_private_network = true;
188                         break;
189
190                 case 'b':
191                         arg_boot = true;
192                         break;
193
194                 case ARG_UUID:
195                         arg_uuid = optarg;
196                         break;
197
198                 case ARG_READ_ONLY:
199                         arg_read_only = true;
200                         break;
201
202                 case ARG_CAPABILITY: {
203                         char *state, *word;
204                         size_t length;
205
206                         FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
207                                 cap_value_t cap;
208                                 char *t;
209
210                                 t = strndup(word, length);
211                                 if (!t)
212                                         return log_oom();
213
214                                 if (cap_from_name(t, &cap) < 0) {
215                                         log_error("Failed to parse capability %s.", t);
216                                         free(t);
217                                         return -EINVAL;
218                                 }
219
220                                 free(t);
221                                 arg_retain |= 1ULL << (uint64_t) cap;
222                         }
223
224                         break;
225                 }
226
227                 case 'j':
228                         arg_link_journal = LINK_GUEST;
229                         break;
230
231                 case ARG_LINK_JOURNAL:
232                         if (streq(optarg, "auto"))
233                                 arg_link_journal = LINK_AUTO;
234                         else if (streq(optarg, "no"))
235                                 arg_link_journal = LINK_NO;
236                         else if (streq(optarg, "guest"))
237                                 arg_link_journal = LINK_GUEST;
238                         else if (streq(optarg, "host"))
239                                 arg_link_journal = LINK_HOST;
240                         else {
241                                 log_error("Failed to parse link journal mode %s", optarg);
242                                 return -EINVAL;
243                         }
244
245                         break;
246
247                 case '?':
248                         return -EINVAL;
249
250                 default:
251                         log_error("Unknown option code %c", c);
252                         return -EINVAL;
253                 }
254         }
255
256         return 1;
257 }
258
259 static int mount_all(const char *dest) {
260
261         typedef struct MountPoint {
262                 const char *what;
263                 const char *where;
264                 const char *type;
265                 const char *options;
266                 unsigned long flags;
267                 bool fatal;
268         } MountPoint;
269
270         static const MountPoint mount_table[] = {
271                 { "proc",      "/proc",     "proc",  NULL,       MS_NOSUID|MS_NOEXEC|MS_NODEV, true  },
272                 { "/proc/sys", "/proc/sys", NULL,    NULL,       MS_BIND, true                       },   /* Bind mount first */
273                 { NULL,        "/proc/sys", NULL,    NULL,       MS_BIND|MS_RDONLY|MS_REMOUNT, true  },   /* Then, make it r/o */
274                 { "sysfs",     "/sys",      "sysfs", NULL,       MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true  },
275                 { "tmpfs",     "/dev",      "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME,     true  },
276                 { "/dev/pts",  "/dev/pts",  NULL,    NULL,       MS_BIND,                      true  },
277                 { "tmpfs",     "/dev/shm",  "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true  },
278                 { "tmpfs",     "/run",      "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true  },
279 #ifdef HAVE_SELINUX
280                 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND,                      false },  /* Bind mount first */
281                 { NULL,              "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, false },  /* Then, make it r/o */
282 #endif
283         };
284
285         unsigned k;
286         int r = 0;
287
288         for (k = 0; k < ELEMENTSOF(mount_table); k++) {
289                 char _cleanup_free_ *where = NULL;
290                 int t;
291
292                 if (asprintf(&where, "%s/%s", dest, mount_table[k].where) < 0) {
293                         log_oom();
294
295                         if (r == 0)
296                                 r = -ENOMEM;
297
298                         break;
299                 }
300
301                 t = path_is_mount_point(where, true);
302                 if (t < 0) {
303                         log_error("Failed to detect whether %s is a mount point: %s", where, strerror(-t));
304
305                         if (r == 0)
306                                 r = t;
307
308                         continue;
309                 }
310
311                 /* Skip this entry if it is not a remount. */
312                 if (mount_table[k].what && t > 0)
313                         continue;
314
315                 mkdir_p_label(where, 0755);
316
317                 if (mount(mount_table[k].what,
318                           where,
319                           mount_table[k].type,
320                           mount_table[k].flags,
321                           mount_table[k].options) < 0 &&
322                     mount_table[k].fatal) {
323
324                         log_error("mount(%s) failed: %m", where);
325
326                         if (r == 0)
327                                 r = -errno;
328                 }
329         }
330
331         return r;
332 }
333
/* Makes the host's time zone configuration visible below dest by bind
 * mounting /etc/localtime and /etc/timezone over the files of the same
 * name there, read-only. Mount failures are ignored (best effort).
 *
 * Returns 0, or the result of log_oom() if memory ran out. */
static int setup_timezone(const char *dest) {
        static const char * const files[] = {
                "/etc/localtime",
                "/etc/timezone"
        };
        size_t i;

        assert(dest);

        /* Fix the timezone, if possible */
        for (i = 0; i < sizeof(files) / sizeof(files[0]); i++) {
                char *target;

                target = strappend(dest, files[i]);
                if (!target)
                        return log_oom();

                /* Bind mount first, then flip the new mount to read-only */
                if (mount(files[i], target, "bind", MS_BIND, NULL) >= 0)
                        mount(files[i], target, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);

                free(target);
        }

        return 0;
}
360
361 static int setup_resolv_conf(const char *dest) {
362         char *where;
363
364         assert(dest);
365
366         if (arg_private_network)
367                 return 0;
368
369         /* Fix resolv.conf, if possible */
370         where = strappend(dest, "/etc/resolv.conf");
371         if (!where)
372                 return log_oom();
373
374         if (mount("/etc/resolv.conf", where, "bind", MS_BIND, NULL) >= 0)
375                 mount("/etc/resolv.conf", where, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);
376
377         free(where);
378
379         return 0;
380 }
381
382 static int setup_boot_id(const char *dest) {
383         char _cleanup_free_ *from = NULL, *to = NULL;
384         sd_id128_t rnd;
385         char as_uuid[37];
386         int r;
387
388         assert(dest);
389
390         /* Generate a new randomized boot ID, so that each boot-up of
391          * the container gets a new one */
392
393         from = strappend(dest, "/dev/proc-sys-kernel-random-boot-id");
394         to = strappend(dest, "/proc/sys/kernel/random/boot_id");
395         if (!from || !to)
396                 return log_oom();
397
398         r = sd_id128_randomize(&rnd);
399         if (r < 0) {
400                 log_error("Failed to generate random boot id: %s", strerror(-r));
401                 return r;
402         }
403
404         snprintf(as_uuid, sizeof(as_uuid),
405                  "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
406                  SD_ID128_FORMAT_VAL(rnd));
407         char_array_0(as_uuid);
408
409         r = write_one_line_file(from, as_uuid);
410         if (r < 0) {
411                 log_error("Failed to write boot id: %s", strerror(-r));
412                 return r;
413         }
414
415         if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
416                 log_error("Failed to bind mount boot id: %m");
417                 r = -errno;
418         } else
419                 mount(from, to, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);
420
421         unlink(from);
422         return r;
423 }
424
425 static int copy_devnodes(const char *dest) {
426
427         static const char devnodes[] =
428                 "null\0"
429                 "zero\0"
430                 "full\0"
431                 "random\0"
432                 "urandom\0"
433                 "tty\0"
434                 "ptmx\0";
435
436         const char *d;
437         int r = 0;
438         mode_t _cleanup_umask_ u;
439
440         assert(dest);
441
442         u = umask(0000);
443
444         NULSTR_FOREACH(d, devnodes) {
445                 struct stat st;
446                 char _cleanup_free_ *from = NULL, *to = NULL;
447
448                 asprintf(&from, "/dev/%s", d);
449                 asprintf(&to, "%s/dev/%s", dest, d);
450
451                 if (!from || !to) {
452                         log_oom();
453
454                         if (r == 0)
455                                 r = -ENOMEM;
456
457                         break;
458                 }
459
460                 if (stat(from, &st) < 0) {
461
462                         if (errno != ENOENT) {
463                                 log_error("Failed to stat %s: %m", from);
464                                 if (r == 0)
465                                         r = -errno;
466                         }
467
468                 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
469
470                         log_error("%s is not a char or block device, cannot copy", from);
471                         if (r == 0)
472                                 r = -EIO;
473
474                 } else if (mknod(to, st.st_mode, st.st_rdev) < 0) {
475
476                         log_error("mknod(%s) failed: %m", dest);
477                         if (r == 0)
478                                 r = -errno;
479                 }
480         }
481
482         return r;
483 }
484
485 static int setup_dev_console(const char *dest, const char *console) {
486         struct stat st;
487         char _cleanup_free_ *to = NULL;
488         int r;
489         mode_t _cleanup_umask_ u;
490
491         assert(dest);
492         assert(console);
493
494         u = umask(0000);
495
496         if (stat(console, &st) < 0) {
497                 log_error("Failed to stat %s: %m", console);
498                 return -errno;
499
500         } else if (!S_ISCHR(st.st_mode)) {
501                 log_error("/dev/console is not a char device");
502                 return -EIO;
503         }
504
505         r = chmod_and_chown(console, 0600, 0, 0);
506         if (r < 0) {
507                 log_error("Failed to correct access mode for TTY: %s", strerror(-r));
508                 return r;
509         }
510
511         if (asprintf(&to, "%s/dev/console", dest) < 0)
512                 return log_oom();
513
514         /* We need to bind mount the right tty to /dev/console since
515          * ptys can only exist on pts file systems. To have something
516          * to bind mount things on we create a device node first, that
517          * has the right major/minor (note that the major minor
518          * doesn't actually matter here, since we mount it over
519          * anyway). */
520
521         if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0) {
522                 log_error("mknod() for /dev/console failed: %m");
523                 return -errno;
524         }
525
526         if (mount(console, to, "bind", MS_BIND, NULL) < 0) {
527                 log_error("Bind mount for /dev/console failed: %m");
528                 return -errno;
529         }
530
531         return 0;
532 }
533
534 static int setup_kmsg(const char *dest, int kmsg_socket) {
535         char _cleanup_free_ *from = NULL, *to = NULL;
536         int r, fd, k;
537         mode_t _cleanup_umask_ u;
538         union {
539                 struct cmsghdr cmsghdr;
540                 uint8_t buf[CMSG_SPACE(sizeof(int))];
541         } control;
542         struct msghdr mh;
543         struct cmsghdr *cmsg;
544
545         assert(dest);
546         assert(kmsg_socket >= 0);
547
548         u = umask(0000);
549
550         /* We create the kmsg FIFO as /dev/kmsg, but immediately
551          * delete it after bind mounting it to /proc/kmsg. While FIFOs
552          * on the reading side behave very similar to /proc/kmsg,
553          * their writing side behaves differently from /dev/kmsg in
554          * that writing blocks when nothing is reading. In order to
555          * avoid any problems with containers deadlocking due to this
556          * we simply make /dev/kmsg unavailable to the container. */
557         if (asprintf(&from, "%s/dev/kmsg", dest) < 0 ||
558             asprintf(&to, "%s/proc/kmsg", dest) < 0)
559                 return log_oom();
560
561         if (mkfifo(from, 0600) < 0) {
562                 log_error("mkfifo() for /dev/kmsg failed: %m");
563                 return -errno;
564         }
565
566         r = chmod_and_chown(from, 0600, 0, 0);
567         if (r < 0) {
568                 log_error("Failed to correct access mode for /dev/kmsg: %s", strerror(-r));
569                 return r;
570         }
571
572         if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
573                 log_error("Bind mount for /proc/kmsg failed: %m");
574                 return -errno;
575         }
576
577         fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
578         if (fd < 0) {
579                 log_error("Failed to open fifo: %m");
580                 return -errno;
581         }
582
583         zero(mh);
584         zero(control);
585
586         mh.msg_control = &control;
587         mh.msg_controllen = sizeof(control);
588
589         cmsg = CMSG_FIRSTHDR(&mh);
590         cmsg->cmsg_level = SOL_SOCKET;
591         cmsg->cmsg_type = SCM_RIGHTS;
592         cmsg->cmsg_len = CMSG_LEN(sizeof(int));
593         memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
594
595         mh.msg_controllen = cmsg->cmsg_len;
596
597         /* Store away the fd in the socket, so that it stays open as
598          * long as we run the child */
599         k = sendmsg(kmsg_socket, &mh, MSG_DONTWAIT|MSG_NOSIGNAL);
600         close_nointr_nofail(fd);
601
602         if (k < 0) {
603                 log_error("Failed to send FIFO fd: %m");
604                 return -errno;
605         }
606
607         /* And now make the FIFO unavailable as /dev/kmsg... */
608         unlink(from);
609         return 0;
610 }
611
612 static int setup_hostname(void) {
613         char *hn;
614         int r = 0;
615
616         hn = path_get_file_name(arg_directory);
617         if (hn) {
618                 hn = strdup(hn);
619                 if (!hn)
620                         return -ENOMEM;
621
622                 hostname_cleanup(hn);
623
624                 if (!isempty(hn))
625                         if (sethostname(hn, strlen(hn)) < 0)
626                                 r = -errno;
627
628                 free(hn);
629         }
630
631         return r;
632 }
633
634 static int setup_journal(const char *directory) {
635         sd_id128_t machine_id;
636         char *p = NULL, *b = NULL, *l, *q = NULL, *d = NULL;
637         int r;
638
639         if (arg_link_journal == LINK_NO)
640                 return 0;
641
642         p = strappend(directory, "/etc/machine-id");
643         if (!p) {
644                 r = log_oom();
645                 goto finish;
646         }
647
648         r = read_one_line_file(p, &b);
649         if (r == -ENOENT && arg_link_journal == LINK_AUTO) {
650                 r = 0;
651                 goto finish;
652         } else if (r < 0) {
653                 log_error("Failed to read machine ID: %s", strerror(-r));
654                 return r;
655         }
656
657         l = strstrip(b);
658         if (isempty(l) && arg_link_journal == LINK_AUTO) {
659                 r = 0;
660                 goto finish;
661         }
662
663         /* Verify validaty */
664         r = sd_id128_from_string(l, &machine_id);
665         if (r < 0) {
666                 log_error("Failed to parse machine ID: %s", strerror(-r));
667                 goto finish;
668         }
669
670         free(p);
671         p = strappend("/var/log/journal/", l);
672         q = strjoin(directory, "/var/log/journal/", l, NULL);
673         if (!p || !q) {
674                 r = log_oom();
675                 goto finish;
676         }
677
678         if (path_is_mount_point(p, false) > 0 ||
679             path_is_mount_point(q, false) > 0) {
680                 if (arg_link_journal != LINK_AUTO) {
681                         log_error("Journal already a mount point, refusing.");
682                         r = -EEXIST;
683                         goto finish;
684                 }
685
686                 r = 0;
687                 goto finish;
688         }
689
690         r = readlink_and_make_absolute(p, &d);
691         if (r >= 0) {
692                 if ((arg_link_journal == LINK_GUEST ||
693                      arg_link_journal == LINK_AUTO) &&
694                     path_equal(d, q)) {
695
696                         mkdir_p(q, 0755);
697
698                         r = 0;
699                         goto finish;
700                 }
701
702                 if (unlink(p) < 0) {
703                         log_error("Failed to remove symlink %s: %m", p);
704                         r = -errno;
705                         goto finish;
706                 }
707         } else if (r == -EINVAL) {
708
709                 if (arg_link_journal == LINK_GUEST &&
710                     rmdir(p) < 0) {
711
712                         if (errno == ENOTDIR)
713                                 log_error("%s already exists and is neither symlink nor directory.", p);
714                         else {
715                                 log_error("Failed to remove %s: %m", p);
716                                 r = -errno;
717                         }
718
719                         goto finish;
720                 }
721         } else if (r != -ENOENT) {
722                 log_error("readlink(%s) failed: %m", p);
723                 goto finish;
724         }
725
726         if (arg_link_journal == LINK_GUEST) {
727
728                 if (symlink(q, p) < 0) {
729                         log_error("Failed to symlink %s to %s: %m", q, p);
730                         r = -errno;
731                         goto finish;
732                 }
733
734                 mkdir_p(q, 0755);
735
736                 r = 0;
737                 goto finish;
738         }
739
740         if (arg_link_journal == LINK_HOST) {
741                 r = mkdir_p(p, 0755);
742                 if (r < 0) {
743                         log_error("Failed to create %s: %m", p);
744                         goto finish;
745                 }
746
747         } else if (access(p, F_OK) < 0) {
748                 r = 0;
749                 goto finish;
750         }
751
752         if (dir_is_empty(q) == 0) {
753                 log_error("%s not empty.", q);
754                 r = -ENOTEMPTY;
755                 goto finish;
756         }
757
758         r = mkdir_p(q, 0755);
759         if (r < 0) {
760                 log_error("Failed to create %s: %m", q);
761                 goto finish;
762         }
763
764         if (mount(p, q, "bind", MS_BIND, NULL) < 0) {
765                 log_error("Failed to bind mount journal from host into guest: %m");
766                 r = -errno;
767                 goto finish;
768         }
769
770         r = 0;
771
772 finish:
773         free(p);
774         free(q);
775         free(d);
776         free(b);
777         return r;
778
779 }
780
781 static int drop_capabilities(void) {
782         return capability_bounding_set_drop(~arg_retain, false);
783 }
784
/* Checks whether path looks like the root of an OS tree. The presence
 * of <path>/bin/sh is used as indicator.
 *
 * Returns 1 if the file exists, 0 if it cannot be accessed, and
 * -ENOMEM if the path could not be allocated. */
static int is_os_tree(const char *path) {
        char *shell;
        int found;

        /* We use /bin/sh as flag file if something is an OS */
        if (asprintf(&shell, "%s/bin/sh", path) < 0)
                return -ENOMEM;

        found = access(shell, F_OK) >= 0;
        free(shell);

        return found ? 1 : 0;
}
798
799 static int process_pty(int master, sigset_t *mask) {
800
801         char in_buffer[LINE_MAX], out_buffer[LINE_MAX];
802         size_t in_buffer_full = 0, out_buffer_full = 0;
803         struct epoll_event stdin_ev, stdout_ev, master_ev, signal_ev;
804         bool stdin_readable = false, stdout_writable = false, master_readable = false, master_writable = false;
805         int ep = -1, signal_fd = -1, r;
806
807         fd_nonblock(STDIN_FILENO, 1);
808         fd_nonblock(STDOUT_FILENO, 1);
809         fd_nonblock(master, 1);
810
811         signal_fd = signalfd(-1, mask, SFD_NONBLOCK|SFD_CLOEXEC);
812         if (signal_fd < 0) {
813                 log_error("signalfd(): %m");
814                 r = -errno;
815                 goto finish;
816         }
817
818         ep = epoll_create1(EPOLL_CLOEXEC);
819         if (ep < 0) {
820                 log_error("Failed to create epoll: %m");
821                 r = -errno;
822                 goto finish;
823         }
824
825         zero(stdin_ev);
826         stdin_ev.events = EPOLLIN|EPOLLET;
827         stdin_ev.data.fd = STDIN_FILENO;
828
829         zero(stdout_ev);
830         stdout_ev.events = EPOLLOUT|EPOLLET;
831         stdout_ev.data.fd = STDOUT_FILENO;
832
833         zero(master_ev);
834         master_ev.events = EPOLLIN|EPOLLOUT|EPOLLET;
835         master_ev.data.fd = master;
836
837         zero(signal_ev);
838         signal_ev.events = EPOLLIN;
839         signal_ev.data.fd = signal_fd;
840
841         if (epoll_ctl(ep, EPOLL_CTL_ADD, STDIN_FILENO, &stdin_ev) < 0 ||
842             epoll_ctl(ep, EPOLL_CTL_ADD, STDOUT_FILENO, &stdout_ev) < 0 ||
843             epoll_ctl(ep, EPOLL_CTL_ADD, master, &master_ev) < 0 ||
844             epoll_ctl(ep, EPOLL_CTL_ADD, signal_fd, &signal_ev) < 0) {
845                 log_error("Failed to regiser fds in epoll: %m");
846                 r = -errno;
847                 goto finish;
848         }
849
850         for (;;) {
851                 struct epoll_event ev[16];
852                 ssize_t k;
853                 int i, nfds;
854
855                 nfds = epoll_wait(ep, ev, ELEMENTSOF(ev), -1);
856                 if (nfds < 0) {
857
858                         if (errno == EINTR || errno == EAGAIN)
859                                 continue;
860
861                         log_error("epoll_wait(): %m");
862                         r = -errno;
863                         goto finish;
864                 }
865
866                 assert(nfds >= 1);
867
868                 for (i = 0; i < nfds; i++) {
869                         if (ev[i].data.fd == STDIN_FILENO) {
870
871                                 if (ev[i].events & (EPOLLIN|EPOLLHUP))
872                                         stdin_readable = true;
873
874                         } else if (ev[i].data.fd == STDOUT_FILENO) {
875
876                                 if (ev[i].events & (EPOLLOUT|EPOLLHUP))
877                                         stdout_writable = true;
878
879                         } else if (ev[i].data.fd == master) {
880
881                                 if (ev[i].events & (EPOLLIN|EPOLLHUP))
882                                         master_readable = true;
883
884                                 if (ev[i].events & (EPOLLOUT|EPOLLHUP))
885                                         master_writable = true;
886
887                         } else if (ev[i].data.fd == signal_fd) {
888                                 struct signalfd_siginfo sfsi;
889                                 ssize_t n;
890
891                                 n = read(signal_fd, &sfsi, sizeof(sfsi));
892                                 if (n != sizeof(sfsi)) {
893
894                                         if (n >= 0) {
895                                                 log_error("Failed to read from signalfd: invalid block size");
896                                                 r = -EIO;
897                                                 goto finish;
898                                         }
899
900                                         if (errno != EINTR && errno != EAGAIN) {
901                                                 log_error("Failed to read from signalfd: %m");
902                                                 r = -errno;
903                                                 goto finish;
904                                         }
905                                 } else {
906
907                                         if (sfsi.ssi_signo == SIGWINCH) {
908                                                 struct winsize ws;
909
910                                                 /* The window size changed, let's forward that. */
911                                                 if (ioctl(STDIN_FILENO, TIOCGWINSZ, &ws) >= 0)
912                                                         ioctl(master, TIOCSWINSZ, &ws);
913                                         } else {
914                                                 r = 0;
915                                                 goto finish;
916                                         }
917                                 }
918                         }
919                 }
920
921                 while ((stdin_readable && in_buffer_full <= 0) ||
922                        (master_writable && in_buffer_full > 0) ||
923                        (master_readable && out_buffer_full <= 0) ||
924                        (stdout_writable && out_buffer_full > 0)) {
925
926                         if (stdin_readable && in_buffer_full < LINE_MAX) {
927
928                                 k = read(STDIN_FILENO, in_buffer + in_buffer_full, LINE_MAX - in_buffer_full);
929                                 if (k < 0) {
930
931                                         if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
932                                                 stdin_readable = false;
933                                         else {
934                                                 log_error("read(): %m");
935                                                 r = -errno;
936                                                 goto finish;
937                                         }
938                                 } else
939                                         in_buffer_full += (size_t) k;
940                         }
941
942                         if (master_writable && in_buffer_full > 0) {
943
944                                 k = write(master, in_buffer, in_buffer_full);
945                                 if (k < 0) {
946
947                                         if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
948                                                 master_writable = false;
949                                         else {
950                                                 log_error("write(): %m");
951                                                 r = -errno;
952                                                 goto finish;
953                                         }
954
955                                 } else {
956                                         assert(in_buffer_full >= (size_t) k);
957                                         memmove(in_buffer, in_buffer + k, in_buffer_full - k);
958                                         in_buffer_full -= k;
959                                 }
960                         }
961
962                         if (master_readable && out_buffer_full < LINE_MAX) {
963
964                                 k = read(master, out_buffer + out_buffer_full, LINE_MAX - out_buffer_full);
965                                 if (k < 0) {
966
967                                         if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
968                                                 master_readable = false;
969                                         else {
970                                                 log_error("read(): %m");
971                                                 r = -errno;
972                                                 goto finish;
973                                         }
974                                 }  else
975                                         out_buffer_full += (size_t) k;
976                         }
977
978                         if (stdout_writable && out_buffer_full > 0) {
979
980                                 k = write(STDOUT_FILENO, out_buffer, out_buffer_full);
981                                 if (k < 0) {
982
983                                         if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
984                                                 stdout_writable = false;
985                                         else {
986                                                 log_error("write(): %m");
987                                                 r = -errno;
988                                                 goto finish;
989                                         }
990
991                                 } else {
992                                         assert(out_buffer_full >= (size_t) k);
993                                         memmove(out_buffer, out_buffer + k, out_buffer_full - k);
994                                         out_buffer_full -= k;
995                                 }
996                         }
997                 }
998         }
999
1000 finish:
1001         if (ep >= 0)
1002                 close_nointr_nofail(ep);
1003
1004         if (signal_fd >= 0)
1005                 close_nointr_nofail(signal_fd);
1006
1007         return r;
1008 }
1009
1010 int main(int argc, char *argv[]) {
1011         pid_t pid = 0;
1012         int r = EXIT_FAILURE, k;
1013         char *oldcg = NULL, *newcg = NULL;
1014         char **controller = NULL;
1015         int master = -1;
1016         const char *console = NULL;
1017         struct termios saved_attr, raw_attr;
1018         sigset_t mask;
1019         bool saved_attr_valid = false;
1020         struct winsize ws;
1021         int kmsg_socket_pair[2] = { -1, -1 };
1022
1023         log_parse_environment();
1024         log_open();
1025
1026         r = parse_argv(argc, argv);
1027         if (r <= 0)
1028                 goto finish;
1029
1030         if (arg_directory) {
1031                 char *p;
1032
1033                 p = path_make_absolute_cwd(arg_directory);
1034                 free(arg_directory);
1035                 arg_directory = p;
1036         } else
1037                 arg_directory = get_current_dir_name();
1038
1039         if (!arg_directory) {
1040                 log_error("Failed to determine path");
1041                 goto finish;
1042         }
1043
1044         path_kill_slashes(arg_directory);
1045
1046         if (geteuid() != 0) {
1047                 log_error("Need to be root.");
1048                 goto finish;
1049         }
1050
1051         if (sd_booted() <= 0) {
1052                 log_error("Not running on a systemd system.");
1053                 goto finish;
1054         }
1055
1056         if (path_equal(arg_directory, "/")) {
1057                 log_error("Spawning container on root directory not supported.");
1058                 goto finish;
1059         }
1060
1061         if (is_os_tree(arg_directory) <= 0) {
1062                 log_error("Directory %s doesn't look like an OS root directory. Refusing.", arg_directory);
1063                 goto finish;
1064         }
1065
1066         k = cg_get_by_pid(SYSTEMD_CGROUP_CONTROLLER, 0, &oldcg);
1067         if (k < 0) {
1068                 log_error("Failed to determine current cgroup: %s", strerror(-k));
1069                 goto finish;
1070         }
1071
1072         if (asprintf(&newcg, "%s/nspawn-%lu", oldcg, (unsigned long) getpid()) < 0) {
1073                 log_error("Failed to allocate cgroup path.");
1074                 goto finish;
1075         }
1076
1077         k = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, newcg, 0);
1078         if (k < 0)  {
1079                 log_error("Failed to create cgroup: %s", strerror(-k));
1080                 goto finish;
1081         }
1082
1083         STRV_FOREACH(controller, arg_controllers) {
1084                 k = cg_create_and_attach(*controller, newcg, 0);
1085                 if (k < 0)
1086                         log_warning("Failed to create cgroup in controller %s: %s", *controller, strerror(-k));
1087         }
1088
1089         master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
1090         if (master < 0) {
1091                 log_error("Failed to acquire pseudo tty: %m");
1092                 goto finish;
1093         }
1094
1095         console = ptsname(master);
1096         if (!console) {
1097                 log_error("Failed to determine tty name: %m");
1098                 goto finish;
1099         }
1100
1101         log_info("Spawning namespace container on %s (console is %s).", arg_directory, console);
1102
1103         if (ioctl(STDIN_FILENO, TIOCGWINSZ, &ws) >= 0)
1104                 ioctl(master, TIOCSWINSZ, &ws);
1105
1106         if (unlockpt(master) < 0) {
1107                 log_error("Failed to unlock tty: %m");
1108                 goto finish;
1109         }
1110
1111         if (tcgetattr(STDIN_FILENO, &saved_attr) < 0) {
1112                 log_error("Failed to get terminal attributes: %m");
1113                 goto finish;
1114         }
1115
1116         saved_attr_valid = true;
1117
1118         raw_attr = saved_attr;
1119         cfmakeraw(&raw_attr);
1120         raw_attr.c_lflag &= ~ECHO;
1121
1122         if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_NONBLOCK|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
1123                 log_error("Failed to create kmsg socket pair");
1124                 goto finish;
1125         }
1126
1127         assert_se(sigemptyset(&mask) == 0);
1128         sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
1129         assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
1130
1131         for (;;) {
1132                 siginfo_t status;
1133
1134                 if (tcsetattr(STDIN_FILENO, TCSANOW, &raw_attr) < 0) {
1135                         log_error("Failed to set terminal attributes: %m");
1136                         goto finish;
1137                 }
1138
1139                 pid = syscall(__NR_clone, SIGCHLD|CLONE_NEWIPC|CLONE_NEWNS|CLONE_NEWPID|CLONE_NEWUTS|(arg_private_network ? CLONE_NEWNET : 0), NULL);
1140                 if (pid < 0) {
1141                         if (errno == EINVAL)
1142                                 log_error("clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
1143                         else
1144                                 log_error("clone() failed: %m");
1145
1146                         goto finish;
1147                 }
1148
1149                 if (pid == 0) {
1150                         /* child */
1151
1152                         const char *home = NULL;
1153                         uid_t uid = (uid_t) -1;
1154                         gid_t gid = (gid_t) -1;
1155                         const char *envp[] = {
1156                                 "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
1157                                 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
1158                                 NULL, /* TERM */
1159                                 NULL, /* HOME */
1160                                 NULL, /* USER */
1161                                 NULL, /* LOGNAME */
1162                                 NULL, /* container_uuid */
1163                                 NULL
1164                         };
1165
1166                         envp[2] = strv_find_prefix(environ, "TERM=");
1167
1168                         close_nointr_nofail(master);
1169
1170                         close_nointr(STDIN_FILENO);
1171                         close_nointr(STDOUT_FILENO);
1172                         close_nointr(STDERR_FILENO);
1173
1174                         close_all_fds(&kmsg_socket_pair[1], 1);
1175
1176                         reset_all_signal_handlers();
1177
1178                         assert_se(sigemptyset(&mask) == 0);
1179                         assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
1180
1181                         if (open_terminal(console, O_RDWR) != STDIN_FILENO ||
1182                             dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
1183                             dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO)
1184                                 goto child_fail;
1185
1186                         if (setsid() < 0) {
1187                                 log_error("setsid() failed: %m");
1188                                 goto child_fail;
1189                         }
1190
1191                         if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
1192                                 log_error("PR_SET_PDEATHSIG failed: %m");
1193                                 goto child_fail;
1194                         }
1195
1196                         /* Mark everything as slave, so that we still
1197                          * receive mounts from the real root, but don't
1198                          * propagate mounts to the real root. */
1199                         if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
1200                                 log_error("MS_SLAVE|MS_REC failed: %m");
1201                                 goto child_fail;
1202                         }
1203
1204                         /* Turn directory into bind mount */
1205                         if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REC, NULL) < 0) {
1206                                 log_error("Failed to make bind mount.");
1207                                 goto child_fail;
1208                         }
1209
1210                         if (arg_read_only)
1211                                 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY|MS_REC, NULL) < 0) {
1212                                         log_error("Failed to make read-only.");
1213                                         goto child_fail;
1214                                 }
1215
1216                         if (mount_all(arg_directory) < 0)
1217                                 goto child_fail;
1218
1219                         if (copy_devnodes(arg_directory) < 0)
1220                                 goto child_fail;
1221
1222                         dev_setup(arg_directory);
1223
1224                         if (setup_dev_console(arg_directory, console) < 0)
1225                                 goto child_fail;
1226
1227                         if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
1228                                 goto child_fail;
1229
1230                         close_nointr_nofail(kmsg_socket_pair[1]);
1231
1232                         if (setup_boot_id(arg_directory) < 0)
1233                                 goto child_fail;
1234
1235                         if (setup_timezone(arg_directory) < 0)
1236                                 goto child_fail;
1237
1238                         if (setup_resolv_conf(arg_directory) < 0)
1239                                 goto child_fail;
1240
1241                         if (setup_journal(arg_directory) < 0)
1242                                 goto child_fail;
1243
1244                         if (chdir(arg_directory) < 0) {
1245                                 log_error("chdir(%s) failed: %m", arg_directory);
1246                                 goto child_fail;
1247                         }
1248
1249                         if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) {
1250                                 log_error("mount(MS_MOVE) failed: %m");
1251                                 goto child_fail;
1252                         }
1253
1254                         if (chroot(".") < 0) {
1255                                 log_error("chroot() failed: %m");
1256                                 goto child_fail;
1257                         }
1258
1259                         if (chdir("/") < 0) {
1260                                 log_error("chdir() failed: %m");
1261                                 goto child_fail;
1262                         }
1263
1264                         umask(0022);
1265
1266                         loopback_setup();
1267
1268                         if (drop_capabilities() < 0) {
1269                                 log_error("drop_capabilities() failed: %m");
1270                                 goto child_fail;
1271                         }
1272
1273                         if (arg_user) {
1274
1275                                 if (get_user_creds((const char**)&arg_user, &uid, &gid, &home, NULL) < 0) {
1276                                         log_error("get_user_creds() failed: %m");
1277                                         goto child_fail;
1278                                 }
1279
1280                                 if (mkdir_parents_label(home, 0775) < 0) {
1281                                         log_error("mkdir_parents_label() failed: %m");
1282                                         goto child_fail;
1283                                 }
1284
1285                                 if (mkdir_safe_label(home, 0775, uid, gid) < 0) {
1286                                         log_error("mkdir_safe_label() failed: %m");
1287                                         goto child_fail;
1288                                 }
1289
1290                                 if (initgroups((const char*)arg_user, gid) < 0) {
1291                                         log_error("initgroups() failed: %m");
1292                                         goto child_fail;
1293                                 }
1294
1295                                 if (setresgid(gid, gid, gid) < 0) {
1296                                         log_error("setregid() failed: %m");
1297                                         goto child_fail;
1298                                 }
1299
1300                                 if (setresuid(uid, uid, uid) < 0) {
1301                                         log_error("setreuid() failed: %m");
1302                                         goto child_fail;
1303                                 }
1304                         }
1305
1306                         if ((asprintf((char**)(envp + 3), "HOME=%s", home ? home: "/root") < 0) ||
1307                             (asprintf((char**)(envp + 4), "USER=%s", arg_user ? arg_user : "root") < 0) ||
1308                             (asprintf((char**)(envp + 5), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
1309                                 log_oom();
1310                                 goto child_fail;
1311                         }
1312
1313                         if (arg_uuid) {
1314                                 if (asprintf((char**)(envp + 6), "container_uuid=%s", arg_uuid) < 0) {
1315                                         log_oom();
1316                                         goto child_fail;
1317                                 }
1318                         }
1319
1320                         setup_hostname();
1321
1322                         if (arg_boot) {
1323                                 char **a;
1324                                 size_t l;
1325
1326                                 /* Automatically search for the init system */
1327
1328                                 l = 1 + argc - optind;
1329                                 a = newa(char*, l + 1);
1330                                 memcpy(a + 1, argv + optind, l * sizeof(char*));
1331
1332                                 a[0] = (char*) "/usr/lib/systemd/systemd";
1333                                 execve(a[0], a, (char**) envp);
1334
1335                                 a[0] = (char*) "/lib/systemd/systemd";
1336                                 execve(a[0], a, (char**) envp);
1337
1338                                 a[0] = (char*) "/sbin/init";
1339                                 execve(a[0], a, (char**) envp);
1340                         } else if (argc > optind)
1341                                 execvpe(argv[optind], argv + optind, (char**) envp);
1342                         else {
1343                                 chdir(home ? home : "/root");
1344                                 execle("/bin/bash", "-bash", NULL, (char**) envp);
1345                         }
1346
1347                         log_error("execv() failed: %m");
1348
1349                 child_fail:
1350                         _exit(EXIT_FAILURE);
1351                 }
1352
1353                 if (process_pty(master, &mask) < 0)
1354                         goto finish;
1355
1356
1357                 if (saved_attr_valid)
1358                         tcsetattr(STDIN_FILENO, TCSANOW, &saved_attr);
1359
1360                 r = wait_for_terminate(pid, &status);
1361                 if (r < 0) {
1362                         r = EXIT_FAILURE;
1363                         break;
1364                 }
1365
1366                 if (status.si_code == CLD_EXITED) {
1367                         if (status.si_status != 0) {
1368                                 log_error("Container failed with error code %i.", status.si_status);
1369                                 r = status.si_status;
1370                                 break;
1371                         }
1372
1373                         log_debug("Container exited successfully.");
1374                         break;
1375                 } else if (status.si_code == CLD_KILLED &&
1376                            status.si_status == SIGINT) {
1377                         log_info("Container has been shut down.");
1378                         r = 0;
1379                         break;
1380                 } else if (status.si_code == CLD_KILLED &&
1381                            status.si_status == SIGHUP) {
1382                         log_info("Container is being rebooted.");
1383                         continue;
1384                 } else if (status.si_code == CLD_KILLED ||
1385                            status.si_code == CLD_DUMPED) {
1386
1387                         log_error("Container terminated by signal %s.", signal_to_string(status.si_status));
1388                         r = EXIT_FAILURE;
1389                         break;
1390                 } else {
1391                         log_error("Container failed due to unknown reason.");
1392                         r = EXIT_FAILURE;
1393                         break;
1394                 }
1395         }
1396
1397 finish:
1398         if (saved_attr_valid)
1399                 tcsetattr(STDIN_FILENO, TCSANOW, &saved_attr);
1400
1401         if (master >= 0)
1402                 close_nointr_nofail(master);
1403
1404         close_pipe(kmsg_socket_pair);
1405
1406         if (oldcg)
1407                 cg_attach(SYSTEMD_CGROUP_CONTROLLER, oldcg, 0);
1408
1409         if (newcg)
1410                 cg_kill_recursive_and_wait(SYSTEMD_CGROUP_CONTROLLER, newcg, true);
1411
1412         free(arg_directory);
1413         strv_free(arg_controllers);
1414         free(oldcg);
1415         free(newcg);
1416
1417         return r;
1418 }