chiark / gitweb /
8765b0185fbff5cde3bb269eb63e2d67f11d39bb
[elogind.git] / src / nspawn / nspawn.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4   This file is part of systemd.
5
6   Copyright 2010 Lennart Poettering
7
8   systemd is free software; you can redistribute it and/or modify it
9   under the terms of the GNU Lesser General Public License as published by
10   the Free Software Foundation; either version 2.1 of the License, or
11   (at your option) any later version.
12
13   systemd is distributed in the hope that it will be useful, but
14   WITHOUT ANY WARRANTY; without even the implied warranty of
15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16   Lesser General Public License for more details.
17
18   You should have received a copy of the GNU Lesser General Public License
19   along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <signal.h>
23 #include <sched.h>
24 #include <unistd.h>
25 #include <sys/types.h>
26 #include <sys/syscall.h>
27 #include <sys/mount.h>
28 #include <sys/wait.h>
29 #include <stdlib.h>
30 #include <string.h>
31 #include <stdio.h>
32 #include <errno.h>
33 #include <sys/prctl.h>
34 #include <sys/capability.h>
35 #include <getopt.h>
36 #include <sys/epoll.h>
37 #include <termios.h>
38 #include <sys/signalfd.h>
39 #include <grp.h>
40 #include <linux/fs.h>
41 #include <sys/un.h>
42 #include <sys/socket.h>
43
44 #include <systemd/sd-daemon.h>
45
46 #include "log.h"
47 #include "util.h"
48 #include "mkdir.h"
49 #include "audit.h"
50 #include "missing.h"
51 #include "cgroup-util.h"
52 #include "strv.h"
53 #include "path-util.h"
54 #include "loopback-setup.h"
55 #include "sd-id128.h"
56 #include "dev-setup.h"
57
/* How the container's journal directory is connected to the host's
 * (selected with --link-journal= or -j, acted upon in setup_journal()). */
typedef enum LinkJournal {
        LINK_NO,    /* Do nothing: setup_journal() returns immediately */
        LINK_AUTO,  /* Default: link up only if possible, silently skip otherwise */
        LINK_HOST,  /* Journal directory lives on the host and is bind mounted into the guest */
        LINK_GUEST  /* Journal directory lives in the guest, the host path is a symlink to it */
} LinkJournal;
64
/* Settings filled in by parse_argv(). */
static char *arg_directory = NULL;          /* -D/--directory=, canonicalized, heap allocated */
static char *arg_user = NULL;               /* -u/--user=, heap allocated copy */
static char **arg_controllers = NULL;       /* -C/--controllers=, split on "," and de-duplicated */
static char *arg_uuid = NULL;               /* --uuid=, assigned straight from optarg (not a copy) */
static bool arg_private_network = false;    /* --private-network */
static bool arg_read_only = false;          /* --read-only */
static bool arg_boot = false;               /* -b/--boot */
static LinkJournal arg_link_journal = LINK_AUTO; /* --link-journal= / -j */

/* Bit mask (1 << CAP_xxx) of the capabilities that are kept;
 * drop_capabilities() drops the complement of this set from the bounding
 * set. --capability= ORs further bits into it. */
static uint64_t arg_retain =
        (1ULL << CAP_CHOWN) |
        (1ULL << CAP_DAC_OVERRIDE) |
        (1ULL << CAP_DAC_READ_SEARCH) |
        (1ULL << CAP_FOWNER) |
        (1ULL << CAP_FSETID) |
        (1ULL << CAP_IPC_OWNER) |
        (1ULL << CAP_KILL) |
        (1ULL << CAP_LEASE) |
        (1ULL << CAP_LINUX_IMMUTABLE) |
        (1ULL << CAP_NET_BIND_SERVICE) |
        (1ULL << CAP_NET_BROADCAST) |
        (1ULL << CAP_NET_RAW) |
        (1ULL << CAP_SETGID) |
        (1ULL << CAP_SETFCAP) |
        (1ULL << CAP_SETPCAP) |
        (1ULL << CAP_SETUID) |
        (1ULL << CAP_SYS_ADMIN) |
        (1ULL << CAP_SYS_CHROOT) |
        (1ULL << CAP_SYS_NICE) |
        (1ULL << CAP_SYS_PTRACE) |
        (1ULL << CAP_SYS_TTY_CONFIG) |
        (1ULL << CAP_SYS_RESOURCE);
96
/* Prints the command line usage summary to stdout. Always returns 0. */
static int help(void) {

        /* Keep the -j line in sync with parse_argv(): -j selects LINK_GUEST,
         * i.e. it is the short form of --link-journal=guest. */
        printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
               "Spawn a minimal namespace container for debugging, testing and building.\n\n"
               "  -h --help               Show this help\n"
               "  -D --directory=NAME     Root directory for the container\n"
               "  -b --boot               Boot up full system (i.e. invoke init)\n"
               "  -u --user=USER          Run the command under specified user or uid\n"
               "  -C --controllers=LIST   Put the container in specified comma-separated cgroup hierarchies\n"
               "     --uuid=UUID          Set a specific machine UUID for the container\n"
               "     --private-network    Disable network in container\n"
               "     --read-only          Mount the root directory read-only\n"
               "     --capability=CAP     In addition to the default, retain specified capability\n"
               "     --link-journal=MODE  Link up guest journal, one of no, auto, guest, host\n"
               "  -j                      Equivalent to --link-journal=guest\n",
               program_invocation_short_name);

        return 0;
}
116
117 static int parse_argv(int argc, char *argv[]) {
118
119         enum {
120                 ARG_PRIVATE_NETWORK = 0x100,
121                 ARG_UUID,
122                 ARG_READ_ONLY,
123                 ARG_CAPABILITY,
124                 ARG_LINK_JOURNAL
125         };
126
127         static const struct option options[] = {
128                 { "help",            no_argument,       NULL, 'h'                 },
129                 { "directory",       required_argument, NULL, 'D'                 },
130                 { "user",            required_argument, NULL, 'u'                 },
131                 { "controllers",     required_argument, NULL, 'C'                 },
132                 { "private-network", no_argument,       NULL, ARG_PRIVATE_NETWORK },
133                 { "boot",            no_argument,       NULL, 'b'                 },
134                 { "uuid",            required_argument, NULL, ARG_UUID            },
135                 { "read-only",       no_argument,       NULL, ARG_READ_ONLY       },
136                 { "capability",      required_argument, NULL, ARG_CAPABILITY      },
137                 { "link-journal",    required_argument, NULL, ARG_LINK_JOURNAL    },
138                 { NULL,              0,                 NULL, 0                   }
139         };
140
141         int c;
142
143         assert(argc >= 0);
144         assert(argv);
145
146         while ((c = getopt_long(argc, argv, "+hD:u:C:bj", options, NULL)) >= 0) {
147
148                 switch (c) {
149
150                 case 'h':
151                         help();
152                         return 0;
153
154                 case 'D':
155                         free(arg_directory);
156                         arg_directory = canonicalize_file_name(optarg);
157                         if (!arg_directory) {
158                                 log_error("Failed to canonicalize root directory.");
159                                 return -ENOMEM;
160                         }
161
162                         break;
163
164                 case 'u':
165                         free(arg_user);
166                         if (!(arg_user = strdup(optarg))) {
167                                 log_error("Failed to duplicate user name.");
168                                 return -ENOMEM;
169                         }
170
171                         break;
172
173                 case 'C':
174                         strv_free(arg_controllers);
175                         arg_controllers = strv_split(optarg, ",");
176                         if (!arg_controllers) {
177                                 log_error("Failed to split controllers list.");
178                                 return -ENOMEM;
179                         }
180                         strv_uniq(arg_controllers);
181
182                         break;
183
184                 case ARG_PRIVATE_NETWORK:
185                         arg_private_network = true;
186                         break;
187
188                 case 'b':
189                         arg_boot = true;
190                         break;
191
192                 case ARG_UUID:
193                         arg_uuid = optarg;
194                         break;
195
196                 case ARG_READ_ONLY:
197                         arg_read_only = true;
198                         break;
199
200                 case ARG_CAPABILITY: {
201                         char *state, *word;
202                         size_t length;
203
204                         FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
205                                 cap_value_t cap;
206                                 char *t;
207
208                                 t = strndup(word, length);
209                                 if (!t)
210                                         return log_oom();
211
212                                 if (cap_from_name(t, &cap) < 0) {
213                                         log_error("Failed to parse capability %s.", t);
214                                         free(t);
215                                         return -EINVAL;
216                                 }
217
218                                 free(t);
219                                 arg_retain |= 1ULL << (uint64_t) cap;
220                         }
221
222                         break;
223                 }
224
225                 case 'j':
226                         arg_link_journal = LINK_GUEST;
227                         break;
228
229                 case ARG_LINK_JOURNAL:
230                         if (streq(optarg, "auto"))
231                                 arg_link_journal = LINK_AUTO;
232                         else if (streq(optarg, "no"))
233                                 arg_link_journal = LINK_NO;
234                         else if (streq(optarg, "guest"))
235                                 arg_link_journal = LINK_GUEST;
236                         else if (streq(optarg, "host"))
237                                 arg_link_journal = LINK_HOST;
238                         else {
239                                 log_error("Failed to parse link journal mode %s", optarg);
240                                 return -EINVAL;
241                         }
242
243                         break;
244
245                 case '?':
246                         return -EINVAL;
247
248                 default:
249                         log_error("Unknown option code %c", c);
250                         return -EINVAL;
251                 }
252         }
253
254         return 1;
255 }
256
/* Mounts the API file systems listed in mount_table below the container
 * root directory 'dest'.
 *
 * All entries are attempted even if an earlier one failed. Returns 0 on
 * success, otherwise the negative errno-style code of the first failure
 * (failures of entries not marked fatal are ignored). */
static int mount_all(const char *dest) {

        typedef struct MountPoint {
                const char *what;
                const char *where;
                const char *type;
                const char *options;
                unsigned long flags;
                bool fatal;
        } MountPoint;

        static const MountPoint mount_table[] = {
                { "proc",      "/proc",     "proc",  NULL,       MS_NOSUID|MS_NOEXEC|MS_NODEV, true  },
                { "/proc/sys", "/proc/sys", NULL,    NULL,       MS_BIND, true                       },   /* Bind mount first */
                { NULL,        "/proc/sys", NULL,    NULL,       MS_BIND|MS_RDONLY|MS_REMOUNT, true  },   /* Then, make it r/o */
                { "sysfs",     "/sys",      "sysfs", NULL,       MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true  },
                { "tmpfs",     "/dev",      "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME,     true  },
                { "/dev/pts",  "/dev/pts",  NULL,    NULL,       MS_BIND,                      true  },
                { "tmpfs",     "/run",      "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true  },
#ifdef HAVE_SELINUX
                { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND,                      false },  /* Bind mount first */
                { NULL,              "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, false },  /* Then, make it r/o */
#endif
        };

        unsigned k;
        int r = 0;

        for (k = 0; k < ELEMENTSOF(mount_table); k++) {
                char *where;
                int t;

                if (asprintf(&where, "%s/%s", dest, mount_table[k].where) < 0) {
                        log_oom();

                        if (r == 0)
                                r = -ENOMEM;

                        break;
                }

                t = path_is_mount_point(where, true);
                if (t < 0) {
                        log_error("Failed to detect whether %s is a mount point: %s", where, strerror(-t));
                        free(where);

                        if (r == 0)
                                r = t;

                        continue;
                }

                /* Skip this entry if it is not a remount. (Entries without
                 * a source are the remounts, they have to be applied to the
                 * mount point established by the preceding entry.) */
                if (mount_table[k].what && t > 0) {
                        free(where);
                        continue;
                }

                mkdir_p_label(where, 0755);

                if (mount(mount_table[k].what,
                          where,
                          mount_table[k].type,
                          mount_table[k].flags,
                          mount_table[k].options) < 0 &&
                    mount_table[k].fatal) {

                        /* Remember errno before logging can touch it */
                        int saved_errno = errno;

                        log_error("mount(%s) failed: %m", where);

                        if (r == 0)
                                r = -saved_errno;
                }

                free(where);
        }

        return r;
}
333
/* Makes the host's timezone configuration visible in the container by
 * bind mounting /etc/localtime and /etc/timezone read-only to the same
 * paths below 'dest'. Mount failures are ignored (best effort); only an
 * allocation failure is reported, via the return value of log_oom(). */
static int setup_timezone(const char *dest) {
        static const char * const sources[] = {
                "/etc/localtime",
                "/etc/timezone",
        };
        unsigned i;

        assert(dest);

        /* Fix the timezone, if possible */
        for (i = 0; i < ELEMENTSOF(sources); i++) {
                char *target;

                target = strappend(dest, sources[i]);
                if (!target)
                        return log_oom();

                /* Bind mount first, then make it r/o */
                if (mount(sources[i], target, "bind", MS_BIND, NULL) >= 0)
                        mount(sources[i], target, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);

                free(target);
        }

        return 0;
}
360
361 static int setup_resolv_conf(const char *dest) {
362         char *where;
363
364         assert(dest);
365
366         if (arg_private_network)
367                 return 0;
368
369         /* Fix resolv.conf, if possible */
370         where = strappend(dest, "/etc/resolv.conf");
371         if (!where)
372                 return log_oom();
373
374         if (mount("/etc/resolv.conf", where, "bind", MS_BIND, NULL) >= 0)
375                 mount("/etc/resolv.conf", where, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);
376
377         free(where);
378
379         return 0;
380 }
381
382 static int setup_boot_id(const char *dest) {
383         char *from = NULL, *to = NULL;
384         sd_id128_t rnd;
385         char as_uuid[37];
386         int r;
387
388         assert(dest);
389
390         /* Generate a new randomized boot ID, so that each boot-up of
391          * the container gets a new one */
392
393         from = strappend(dest, "/dev/proc-sys-kernel-random-boot-id");
394         if (!from) {
395                 r = log_oom();
396                 goto finish;
397         }
398
399         to = strappend(dest, "/proc/sys/kernel/random/boot_id");
400         if (!to) {
401                 r = log_oom();
402                 goto finish;
403         }
404
405         r = sd_id128_randomize(&rnd);
406         if (r < 0) {
407                 log_error("Failed to generate random boot id: %s", strerror(-r));
408                 goto finish;
409         }
410
411         snprintf(as_uuid, sizeof(as_uuid),
412                  "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
413                  SD_ID128_FORMAT_VAL(rnd));
414         char_array_0(as_uuid);
415
416         r = write_one_line_file(from, as_uuid);
417         if (r < 0) {
418                 log_error("Failed to write boot id: %s", strerror(-r));
419                 goto finish;
420         }
421
422         if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
423                 log_error("Failed to bind mount boot id: %m");
424                 r = -errno;
425         } else
426                 mount(from, to, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);
427
428         unlink(from);
429
430 finish:
431         free(from);
432         free(to);
433
434         return r;
435 }
436
437 static int copy_devnodes(const char *dest) {
438
439         static const char devnodes[] =
440                 "null\0"
441                 "zero\0"
442                 "full\0"
443                 "random\0"
444                 "urandom\0"
445                 "tty\0"
446                 "ptmx\0"
447                 "rtc0\0";
448
449         const char *d;
450         int r = 0;
451         mode_t u;
452
453         assert(dest);
454
455         u = umask(0000);
456
457         NULSTR_FOREACH(d, devnodes) {
458                 struct stat st;
459                 char *from = NULL, *to = NULL;
460
461                 asprintf(&from, "/dev/%s", d);
462                 asprintf(&to, "%s/dev/%s", dest, d);
463
464                 if (!from || !to) {
465                         log_error("Failed to allocate devnode path");
466
467                         free(from);
468                         free(to);
469
470                         from = to = NULL;
471
472                         if (r == 0)
473                                 r = -ENOMEM;
474
475                         break;
476                 }
477
478                 if (stat(from, &st) < 0) {
479
480                         if (errno != ENOENT) {
481                                 log_error("Failed to stat %s: %m", from);
482                                 if (r == 0)
483                                         r = -errno;
484                         }
485
486                 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
487
488                         log_error("%s is not a char or block device, cannot copy.", from);
489                         if (r == 0)
490                                 r = -EIO;
491
492                 } else if (mknod(to, st.st_mode, st.st_rdev) < 0) {
493
494                         log_error("mknod(%s) failed: %m", dest);
495                         if (r == 0)
496                                 r = -errno;
497                 }
498
499                 free(from);
500                 free(to);
501         }
502
503         umask(u);
504
505         return r;
506 }
507
/* Makes the TTY 'console' available as /dev/console below the container
 * root 'dest': the TTY is chmod'ed/chown'ed to 0600 root:root, a
 * placeholder device node is created and the TTY is bind mounted over it.
 *
 * Returns a negative errno-style value on failure. */
static int setup_dev_console(const char *dest, const char *console) {
        struct stat tty_st;
        char *node = NULL;
        mode_t saved_umask;
        int r;

        assert(dest);
        assert(console);

        saved_umask = umask(0000);

        if (stat(console, &tty_st) < 0) {
                log_error("Failed to stat %s: %m", console);
                r = -errno;
                goto finish;
        }

        if (!S_ISCHR(tty_st.st_mode)) {
                log_error("/dev/console is not a char device.");
                r = -EIO;
                goto finish;
        }

        r = chmod_and_chown(console, 0600, 0, 0);
        if (r < 0) {
                log_error("Failed to correct access mode for TTY: %s", strerror(-r));
                goto finish;
        }

        if (asprintf(&node, "%s/dev/console", dest) < 0) {
                r = log_oom();
                goto finish;
        }

        /* A pty can only exist on a pts file system, hence the right tty
         * has to be bind mounted to /dev/console. The bind mount needs
         * something to sit on, so a device node is created first. Its
         * major/minor is taken from the tty, but does not really matter,
         * since the node is mounted over anyway. */

        if (mknod(node, (tty_st.st_mode & ~07777) | 0600, tty_st.st_rdev) < 0) {
                log_error("mknod() for /dev/console failed: %m");
                r = -errno;
                goto finish;
        }

        if (mount(console, node, "bind", MS_BIND, NULL) < 0) {
                log_error("Bind mount for /dev/console failed: %m");
                r = -errno;
                goto finish;
        }

finish:
        free(node);
        umask(saved_umask);

        return r;
}
566
/* Provides a fake /proc/kmsg for the container rooted at 'dest': a FIFO is
 * created as /dev/kmsg, bind mounted to /proc/kmsg, opened, and the open
 * file descriptor is queued on 'kmsg_socket' as SCM_RIGHTS message.
 * Finally the FIFO is unlinked from /dev again.
 *
 * Returns a negative errno-style value on failure. */
static int setup_kmsg(const char *dest, int kmsg_socket) {
        char *fifo_path = NULL, *proc_path = NULL;
        int r, fifo_fd, sent;
        mode_t saved_umask;
        union {
                struct cmsghdr cmsghdr;
                uint8_t buf[CMSG_SPACE(sizeof(int))];
        } control;
        struct msghdr msg;
        struct cmsghdr *hdr;

        assert(dest);
        assert(kmsg_socket >= 0);

        saved_umask = umask(0000);

        /* The kmsg FIFO is created as /dev/kmsg, but deleted again right
         * after it has been bind mounted to /proc/kmsg. The reading side
         * of a FIFO behaves very similar to /proc/kmsg, but its writing
         * side differs from /dev/kmsg in that writing blocks when nothing
         * is reading. To keep containers from deadlocking due to this,
         * /dev/kmsg is simply made unavailable to the container. */
        if (asprintf(&fifo_path, "%s/dev/kmsg", dest) < 0) {
                r = log_oom();
                goto finish;
        }

        if (asprintf(&proc_path, "%s/proc/kmsg", dest) < 0) {
                r = log_oom();
                goto finish;
        }

        if (mkfifo(fifo_path, 0600) < 0) {
                log_error("mkfifo() for /dev/kmsg failed: %m");
                r = -errno;
                goto finish;
        }

        r = chmod_and_chown(fifo_path, 0600, 0, 0);
        if (r < 0) {
                log_error("Failed to correct access mode for /dev/kmsg: %s", strerror(-r));
                goto finish;
        }

        if (mount(fifo_path, proc_path, "bind", MS_BIND, NULL) < 0) {
                log_error("Bind mount for /proc/kmsg failed: %m");
                r = -errno;
                goto finish;
        }

        fifo_fd = open(fifo_path, O_RDWR|O_NDELAY|O_CLOEXEC);
        if (fifo_fd < 0) {
                log_error("Failed to open fifo: %m");
                r = -errno;
                goto finish;
        }

        /* Build a message that carries nothing but the fd as ancillary data */
        zero(msg);
        zero(control);

        msg.msg_control = &control;
        msg.msg_controllen = sizeof(control);

        hdr = CMSG_FIRSTHDR(&msg);
        hdr->cmsg_level = SOL_SOCKET;
        hdr->cmsg_type = SCM_RIGHTS;
        hdr->cmsg_len = CMSG_LEN(sizeof(int));
        memcpy(CMSG_DATA(hdr), &fifo_fd, sizeof(int));

        msg.msg_controllen = hdr->cmsg_len;

        /* Park the fd in the socket, so that it stays open for as long
         * as the child runs */
        sent = sendmsg(kmsg_socket, &msg, MSG_DONTWAIT|MSG_NOSIGNAL);
        close_nointr_nofail(fifo_fd);

        if (sent < 0) {
                log_error("Failed to send FIFO fd: %m");
                r = -errno;
                goto finish;
        }

        /* And now make the FIFO unavailable as /dev/kmsg... */
        unlink(fifo_path);

finish:
        free(fifo_path);
        free(proc_path);
        umask(saved_umask);

        return r;
}
660
661 static int setup_hostname(void) {
662         char *hn;
663         int r = 0;
664
665         hn = path_get_file_name(arg_directory);
666         if (hn) {
667                 hn = strdup(hn);
668                 if (!hn)
669                         return -ENOMEM;
670
671                 hostname_cleanup(hn);
672
673                 if (!isempty(hn))
674                         if (sethostname(hn, strlen(hn)) < 0)
675                                 r = -errno;
676
677                 free(hn);
678         }
679
680         return r;
681 }
682
683 static int setup_journal(const char *directory) {
684         sd_id128_t machine_id;
685         char *p = NULL, *b = NULL, *l, *q = NULL, *d = NULL;
686         int r;
687
688         if (arg_link_journal == LINK_NO)
689                 return 0;
690
691         p = strappend(directory, "/etc/machine-id");
692         if (!p) {
693                 r = log_oom();
694                 goto finish;
695         }
696
697         r = read_one_line_file(p, &b);
698         if (r == -ENOENT && arg_link_journal == LINK_AUTO) {
699                 r = 0;
700                 goto finish;
701         } else if (r < 0) {
702                 log_error("Failed to read machine ID: %s", strerror(-r));
703                 return r;
704         }
705
706         l = strstrip(b);
707         if (isempty(l) && arg_link_journal == LINK_AUTO) {
708                 r = 0;
709                 goto finish;
710         }
711
712         /* Verify validaty */
713         r = sd_id128_from_string(l, &machine_id);
714         if (r < 0) {
715                 log_error("Failed to parse machine ID: %s", strerror(-r));
716                 goto finish;
717         }
718
719         free(p);
720         p = strappend("/var/log/journal/", l);
721         q = strjoin(directory, "/var/log/journal/", l, NULL);
722         if (!p || !q) {
723                 r = log_oom();
724                 goto finish;
725         }
726
727         if (path_is_mount_point(p, false) > 0 ||
728             path_is_mount_point(q, false) > 0) {
729                 if (arg_link_journal != LINK_AUTO) {
730                         log_error("Journal already a mount point, refusing.");
731                         r = -EEXIST;
732                         goto finish;
733                 }
734
735                 r = 0;
736                 goto finish;
737         }
738
739         r = readlink_and_make_absolute(p, &d);
740         if (r >= 0) {
741                 if ((arg_link_journal == LINK_GUEST ||
742                      arg_link_journal == LINK_AUTO) &&
743                     path_equal(d, q)) {
744
745                         mkdir_p(q, 0755);
746
747                         r = 0;
748                         goto finish;
749                 }
750
751                 if (unlink(p) < 0) {
752                         log_error("Failed to remove symlink %s: %m", p);
753                         r = -errno;
754                         goto finish;
755                 }
756         } else if (r == -EINVAL) {
757
758                 if (arg_link_journal == LINK_GUEST &&
759                     rmdir(p) < 0) {
760
761                         if (errno == ENOTDIR)
762                                 log_error("%s already exists and is neither symlink nor directory.", p);
763                         else {
764                                 log_error("Failed to remove %s: %m", p);
765                                 r = -errno;
766                         }
767
768                         goto finish;
769                 }
770         } else if (r != -ENOENT) {
771                 log_error("readlink(%s) failed: %m", p);
772                 goto finish;
773         }
774
775         if (arg_link_journal == LINK_GUEST) {
776
777                 if (symlink(q, p) < 0) {
778                         log_error("Failed to symlink %s to %s: %m", q, p);
779                         r = -errno;
780                         goto finish;
781                 }
782
783                 mkdir_p(q, 0755);
784
785                 r = 0;
786                 goto finish;
787         }
788
789         if (arg_link_journal == LINK_HOST) {
790                 r = mkdir_p(p, 0755);
791                 if (r < 0) {
792                         log_error("Failed to create %s: %m", p);
793                         goto finish;
794                 }
795
796         } else if (access(p, F_OK) < 0) {
797                 r = 0;
798                 goto finish;
799         }
800
801         if (dir_is_empty(q) == 0) {
802                 log_error("%s not empty.", q);
803                 r = -ENOTEMPTY;
804                 goto finish;
805         }
806
807         r = mkdir_p(q, 0755);
808         if (r < 0) {
809                 log_error("Failed to create %s: %m", q);
810                 goto finish;
811         }
812
813         if (mount(p, q, "bind", MS_BIND, NULL) < 0) {
814                 log_error("Failed to bind mount journal from host into guest: %m");
815                 r = -errno;
816                 goto finish;
817         }
818
819         r = 0;
820
821 finish:
822         free(p);
823         free(q);
824         free(d);
825         free(b);
826         return r;
827
828 }
829
830 static int drop_capabilities(void) {
831         return capability_bounding_set_drop(~arg_retain, false);
832 }
833
/* Checks whether 'path' looks like the root directory of an OS tree.
 * The existence of bin/sh below it is used as the indicator.
 *
 * Returns 1 if it does, 0 if it does not, -ENOMEM on allocation failure. */
static int is_os_tree(const char *path) {
        char *marker = NULL;
        int found;

        if (asprintf(&marker, "%s/bin/sh", path) < 0)
                return -ENOMEM;

        found = access(marker, F_OK) >= 0;
        free(marker);

        return found;
}
847
848 static int process_pty(int master, sigset_t *mask) {
849
850         char in_buffer[LINE_MAX], out_buffer[LINE_MAX];
851         size_t in_buffer_full = 0, out_buffer_full = 0;
852         struct epoll_event stdin_ev, stdout_ev, master_ev, signal_ev;
853         bool stdin_readable = false, stdout_writable = false, master_readable = false, master_writable = false;
854         int ep = -1, signal_fd = -1, r;
855
856         fd_nonblock(STDIN_FILENO, 1);
857         fd_nonblock(STDOUT_FILENO, 1);
858         fd_nonblock(master, 1);
859
860         signal_fd = signalfd(-1, mask, SFD_NONBLOCK|SFD_CLOEXEC);
861         if (signal_fd < 0) {
862                 log_error("signalfd(): %m");
863                 r = -errno;
864                 goto finish;
865         }
866
867         ep = epoll_create1(EPOLL_CLOEXEC);
868         if (ep < 0) {
869                 log_error("Failed to create epoll: %m");
870                 r = -errno;
871                 goto finish;
872         }
873
874         zero(stdin_ev);
875         stdin_ev.events = EPOLLIN|EPOLLET;
876         stdin_ev.data.fd = STDIN_FILENO;
877
878         zero(stdout_ev);
879         stdout_ev.events = EPOLLOUT|EPOLLET;
880         stdout_ev.data.fd = STDOUT_FILENO;
881
882         zero(master_ev);
883         master_ev.events = EPOLLIN|EPOLLOUT|EPOLLET;
884         master_ev.data.fd = master;
885
886         zero(signal_ev);
887         signal_ev.events = EPOLLIN;
888         signal_ev.data.fd = signal_fd;
889
890         if (epoll_ctl(ep, EPOLL_CTL_ADD, STDIN_FILENO, &stdin_ev) < 0 ||
891             epoll_ctl(ep, EPOLL_CTL_ADD, STDOUT_FILENO, &stdout_ev) < 0 ||
892             epoll_ctl(ep, EPOLL_CTL_ADD, master, &master_ev) < 0 ||
893             epoll_ctl(ep, EPOLL_CTL_ADD, signal_fd, &signal_ev) < 0) {
894                 log_error("Failed to regiser fds in epoll: %m");
895                 r = -errno;
896                 goto finish;
897         }
898
899         for (;;) {
900                 struct epoll_event ev[16];
901                 ssize_t k;
902                 int i, nfds;
903
904                 nfds = epoll_wait(ep, ev, ELEMENTSOF(ev), -1);
905                 if (nfds < 0) {
906
907                         if (errno == EINTR || errno == EAGAIN)
908                                 continue;
909
910                         log_error("epoll_wait(): %m");
911                         r = -errno;
912                         goto finish;
913                 }
914
915                 assert(nfds >= 1);
916
917                 for (i = 0; i < nfds; i++) {
918                         if (ev[i].data.fd == STDIN_FILENO) {
919
920                                 if (ev[i].events & (EPOLLIN|EPOLLHUP))
921                                         stdin_readable = true;
922
923                         } else if (ev[i].data.fd == STDOUT_FILENO) {
924
925                                 if (ev[i].events & (EPOLLOUT|EPOLLHUP))
926                                         stdout_writable = true;
927
928                         } else if (ev[i].data.fd == master) {
929
930                                 if (ev[i].events & (EPOLLIN|EPOLLHUP))
931                                         master_readable = true;
932
933                                 if (ev[i].events & (EPOLLOUT|EPOLLHUP))
934                                         master_writable = true;
935
936                         } else if (ev[i].data.fd == signal_fd) {
937                                 struct signalfd_siginfo sfsi;
938                                 ssize_t n;
939
940                                 n = read(signal_fd, &sfsi, sizeof(sfsi));
941                                 if (n != sizeof(sfsi)) {
942
943                                         if (n >= 0) {
944                                                 log_error("Failed to read from signalfd: invalid block size");
945                                                 r = -EIO;
946                                                 goto finish;
947                                         }
948
949                                         if (errno != EINTR && errno != EAGAIN) {
950                                                 log_error("Failed to read from signalfd: %m");
951                                                 r = -errno;
952                                                 goto finish;
953                                         }
954                                 } else {
955
956                                         if (sfsi.ssi_signo == SIGWINCH) {
957                                                 struct winsize ws;
958
959                                                 /* The window size changed, let's forward that. */
960                                                 if (ioctl(STDIN_FILENO, TIOCGWINSZ, &ws) >= 0)
961                                                         ioctl(master, TIOCSWINSZ, &ws);
962                                         } else {
963                                                 r = 0;
964                                                 goto finish;
965                                         }
966                                 }
967                         }
968                 }
969
970                 while ((stdin_readable && in_buffer_full <= 0) ||
971                        (master_writable && in_buffer_full > 0) ||
972                        (master_readable && out_buffer_full <= 0) ||
973                        (stdout_writable && out_buffer_full > 0)) {
974
975                         if (stdin_readable && in_buffer_full < LINE_MAX) {
976
977                                 k = read(STDIN_FILENO, in_buffer + in_buffer_full, LINE_MAX - in_buffer_full);
978                                 if (k < 0) {
979
980                                         if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
981                                                 stdin_readable = false;
982                                         else {
983                                                 log_error("read(): %m");
984                                                 r = -errno;
985                                                 goto finish;
986                                         }
987                                 } else
988                                         in_buffer_full += (size_t) k;
989                         }
990
991                         if (master_writable && in_buffer_full > 0) {
992
993                                 k = write(master, in_buffer, in_buffer_full);
994                                 if (k < 0) {
995
996                                         if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
997                                                 master_writable = false;
998                                         else {
999                                                 log_error("write(): %m");
1000                                                 r = -errno;
1001                                                 goto finish;
1002                                         }
1003
1004                                 } else {
1005                                         assert(in_buffer_full >= (size_t) k);
1006                                         memmove(in_buffer, in_buffer + k, in_buffer_full - k);
1007                                         in_buffer_full -= k;
1008                                 }
1009                         }
1010
1011                         if (master_readable && out_buffer_full < LINE_MAX) {
1012
1013                                 k = read(master, out_buffer + out_buffer_full, LINE_MAX - out_buffer_full);
1014                                 if (k < 0) {
1015
1016                                         if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
1017                                                 master_readable = false;
1018                                         else {
1019                                                 log_error("read(): %m");
1020                                                 r = -errno;
1021                                                 goto finish;
1022                                         }
1023                                 }  else
1024                                         out_buffer_full += (size_t) k;
1025                         }
1026
1027                         if (stdout_writable && out_buffer_full > 0) {
1028
1029                                 k = write(STDOUT_FILENO, out_buffer, out_buffer_full);
1030                                 if (k < 0) {
1031
1032                                         if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
1033                                                 stdout_writable = false;
1034                                         else {
1035                                                 log_error("write(): %m");
1036                                                 r = -errno;
1037                                                 goto finish;
1038                                         }
1039
1040                                 } else {
1041                                         assert(out_buffer_full >= (size_t) k);
1042                                         memmove(out_buffer, out_buffer + k, out_buffer_full - k);
1043                                         out_buffer_full -= k;
1044                                 }
1045                         }
1046                 }
1047         }
1048
1049 finish:
1050         if (ep >= 0)
1051                 close_nointr_nofail(ep);
1052
1053         if (signal_fd >= 0)
1054                 close_nointr_nofail(signal_fd);
1055
1056         return r;
1057 }
1058
1059 int main(int argc, char *argv[]) {
1060         pid_t pid = 0;
1061         int r = EXIT_FAILURE, k;
1062         char *oldcg = NULL, *newcg = NULL;
1063         char **controller = NULL;
1064         int master = -1;
1065         const char *console = NULL;
1066         struct termios saved_attr, raw_attr;
1067         sigset_t mask;
1068         bool saved_attr_valid = false;
1069         struct winsize ws;
1070         int kmsg_socket_pair[2] = { -1, -1 };
1071
1072         log_parse_environment();
1073         log_open();
1074
1075         r = parse_argv(argc, argv);
1076         if (r <= 0)
1077                 goto finish;
1078
1079         if (arg_directory) {
1080                 char *p;
1081
1082                 p = path_make_absolute_cwd(arg_directory);
1083                 free(arg_directory);
1084                 arg_directory = p;
1085         } else
1086                 arg_directory = get_current_dir_name();
1087
1088         if (!arg_directory) {
1089                 log_error("Failed to determine path");
1090                 goto finish;
1091         }
1092
1093         path_kill_slashes(arg_directory);
1094
1095         if (geteuid() != 0) {
1096                 log_error("Need to be root.");
1097                 goto finish;
1098         }
1099
1100         if (sd_booted() <= 0) {
1101                 log_error("Not running on a systemd system.");
1102                 goto finish;
1103         }
1104
1105         if (path_equal(arg_directory, "/")) {
1106                 log_error("Spawning container on root directory not supported.");
1107                 goto finish;
1108         }
1109
1110         if (is_os_tree(arg_directory) <= 0) {
1111                 log_error("Directory %s doesn't look like an OS root directory. Refusing.", arg_directory);
1112                 goto finish;
1113         }
1114
1115         k = cg_get_by_pid(SYSTEMD_CGROUP_CONTROLLER, 0, &oldcg);
1116         if (k < 0) {
1117                 log_error("Failed to determine current cgroup: %s", strerror(-k));
1118                 goto finish;
1119         }
1120
1121         if (asprintf(&newcg, "%s/nspawn-%lu", oldcg, (unsigned long) getpid()) < 0) {
1122                 log_error("Failed to allocate cgroup path.");
1123                 goto finish;
1124         }
1125
1126         k = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, newcg, 0);
1127         if (k < 0)  {
1128                 log_error("Failed to create cgroup: %s", strerror(-k));
1129                 goto finish;
1130         }
1131
1132         STRV_FOREACH(controller, arg_controllers) {
1133                 k = cg_create_and_attach(*controller, newcg, 0);
1134                 if (k < 0)
1135                         log_warning("Failed to create cgroup in controller %s: %s", *controller, strerror(-k));
1136         }
1137
1138         master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
1139         if (master < 0) {
1140                 log_error("Failed to acquire pseudo tty: %m");
1141                 goto finish;
1142         }
1143
1144         console = ptsname(master);
1145         if (!console) {
1146                 log_error("Failed to determine tty name: %m");
1147                 goto finish;
1148         }
1149
1150         log_info("Spawning namespace container on %s (console is %s).", arg_directory, console);
1151
1152         if (ioctl(STDIN_FILENO, TIOCGWINSZ, &ws) >= 0)
1153                 ioctl(master, TIOCSWINSZ, &ws);
1154
1155         if (unlockpt(master) < 0) {
1156                 log_error("Failed to unlock tty: %m");
1157                 goto finish;
1158         }
1159
1160         if (tcgetattr(STDIN_FILENO, &saved_attr) < 0) {
1161                 log_error("Failed to get terminal attributes: %m");
1162                 goto finish;
1163         }
1164
1165         saved_attr_valid = true;
1166
1167         raw_attr = saved_attr;
1168         cfmakeraw(&raw_attr);
1169         raw_attr.c_lflag &= ~ECHO;
1170
1171         if (tcsetattr(STDIN_FILENO, TCSANOW, &raw_attr) < 0) {
1172                 log_error("Failed to set terminal attributes: %m");
1173                 goto finish;
1174         }
1175
1176         if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_NONBLOCK|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
1177                 log_error("Failed to create kmsg socket pair");
1178                 goto finish;
1179         }
1180
1181         assert_se(sigemptyset(&mask) == 0);
1182         sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
1183         assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
1184
1185         pid = syscall(__NR_clone, SIGCHLD|CLONE_NEWIPC|CLONE_NEWNS|CLONE_NEWPID|CLONE_NEWUTS|(arg_private_network ? CLONE_NEWNET : 0), NULL);
1186         if (pid < 0) {
1187                 if (errno == EINVAL)
1188                         log_error("clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
1189                 else
1190                         log_error("clone() failed: %m");
1191
1192                 goto finish;
1193         }
1194
1195         if (pid == 0) {
1196                 /* child */
1197
1198                 const char *home = NULL;
1199                 uid_t uid = (uid_t) -1;
1200                 gid_t gid = (gid_t) -1;
1201                 const char *envp[] = {
1202                         "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
1203                         "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
1204                         NULL, /* TERM */
1205                         NULL, /* HOME */
1206                         NULL, /* USER */
1207                         NULL, /* LOGNAME */
1208                         NULL, /* container_uuid */
1209                         NULL
1210                 };
1211
1212                 envp[2] = strv_find_prefix(environ, "TERM=");
1213
1214                 close_nointr_nofail(master);
1215
1216                 close_nointr(STDIN_FILENO);
1217                 close_nointr(STDOUT_FILENO);
1218                 close_nointr(STDERR_FILENO);
1219
1220                 close_all_fds(&kmsg_socket_pair[1], 1);
1221
1222                 reset_all_signal_handlers();
1223
1224                 assert_se(sigemptyset(&mask) == 0);
1225                 assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
1226
1227                 if (open_terminal(console, O_RDWR) != STDIN_FILENO ||
1228                     dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
1229                     dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO)
1230                         goto child_fail;
1231
1232                 if (setsid() < 0) {
1233                         log_error("setsid() failed: %m");
1234                         goto child_fail;
1235                 }
1236
1237                 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
1238                         log_error("PR_SET_PDEATHSIG failed: %m");
1239                         goto child_fail;
1240                 }
1241
1242                 /* Mark everything as slave, so that we still
1243                  * receive mounts from the real root, but don't
1244                  * propagate mounts to the real root. */
1245                 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
1246                         log_error("MS_SLAVE|MS_REC failed: %m");
1247                         goto child_fail;
1248                 }
1249
1250                 /* Turn directory into bind mount */
1251                 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REC, NULL) < 0) {
1252                         log_error("Failed to make bind mount.");
1253                         goto child_fail;
1254                 }
1255
1256                 if (arg_read_only)
1257                         if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY|MS_REC, NULL) < 0) {
1258                                 log_error("Failed to make read-only.");
1259                                 goto child_fail;
1260                         }
1261
1262                 if (mount_all(arg_directory) < 0)
1263                         goto child_fail;
1264
1265                 if (copy_devnodes(arg_directory) < 0)
1266                         goto child_fail;
1267
1268                 dev_setup(arg_directory);
1269
1270                 if (setup_dev_console(arg_directory, console) < 0)
1271                         goto child_fail;
1272
1273                 if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
1274                         goto child_fail;
1275
1276                 close_nointr_nofail(kmsg_socket_pair[1]);
1277
1278                 if (setup_boot_id(arg_directory) < 0)
1279                         goto child_fail;
1280
1281                 if (setup_timezone(arg_directory) < 0)
1282                         goto child_fail;
1283
1284                 if (setup_resolv_conf(arg_directory) < 0)
1285                         goto child_fail;
1286
1287                 if (setup_journal(arg_directory) < 0)
1288                         goto child_fail;
1289
1290                 if (chdir(arg_directory) < 0) {
1291                         log_error("chdir(%s) failed: %m", arg_directory);
1292                         goto child_fail;
1293                 }
1294
1295                 if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) {
1296                         log_error("mount(MS_MOVE) failed: %m");
1297                         goto child_fail;
1298                 }
1299
1300                 if (chroot(".") < 0) {
1301                         log_error("chroot() failed: %m");
1302                         goto child_fail;
1303                 }
1304
1305                 if (chdir("/") < 0) {
1306                         log_error("chdir() failed: %m");
1307                         goto child_fail;
1308                 }
1309
1310                 umask(0022);
1311
1312                 loopback_setup();
1313
1314                 if (drop_capabilities() < 0) {
1315                         log_error("drop_capabilities() failed: %m");
1316                         goto child_fail;
1317                 }
1318
1319                 if (arg_user) {
1320
1321                         if (get_user_creds((const char**)&arg_user, &uid, &gid, &home, NULL) < 0) {
1322                                 log_error("get_user_creds() failed: %m");
1323                                 goto child_fail;
1324                         }
1325
1326                         if (mkdir_parents_label(home, 0775) < 0) {
1327                                 log_error("mkdir_parents_label() failed: %m");
1328                                 goto child_fail;
1329                         }
1330
1331                         if (mkdir_safe_label(home, 0775, uid, gid) < 0) {
1332                                 log_error("mkdir_safe_label() failed: %m");
1333                                 goto child_fail;
1334                         }
1335
1336                         if (initgroups((const char*)arg_user, gid) < 0) {
1337                                 log_error("initgroups() failed: %m");
1338                                 goto child_fail;
1339                         }
1340
1341                         if (setresgid(gid, gid, gid) < 0) {
1342                                 log_error("setregid() failed: %m");
1343                                 goto child_fail;
1344                         }
1345
1346                         if (setresuid(uid, uid, uid) < 0) {
1347                                 log_error("setreuid() failed: %m");
1348                                 goto child_fail;
1349                         }
1350                 }
1351
1352                 if ((asprintf((char**)(envp + 3), "HOME=%s", home ? home: "/root") < 0) ||
1353                     (asprintf((char**)(envp + 4), "USER=%s", arg_user ? arg_user : "root") < 0) ||
1354                     (asprintf((char**)(envp + 5), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
1355                     log_oom();
1356                     goto child_fail;
1357                 }
1358
1359                 if (arg_uuid) {
1360                         if (asprintf((char**)(envp + 6), "container_uuid=%s", arg_uuid) < 0) {
1361                                 log_oom();
1362                                 goto child_fail;
1363                         }
1364                 }
1365
1366                 setup_hostname();
1367
1368                 if (arg_boot) {
1369                         char **a;
1370                         size_t l;
1371
1372                         /* Automatically search for the init system */
1373
1374                         l = 1 + argc - optind;
1375                         a = newa(char*, l + 1);
1376                         memcpy(a + 1, argv + optind, l * sizeof(char*));
1377
1378                         a[0] = (char*) "/usr/lib/systemd/systemd";
1379                         execve(a[0], a, (char**) envp);
1380
1381                         a[0] = (char*) "/lib/systemd/systemd";
1382                         execve(a[0], a, (char**) envp);
1383
1384                         a[0] = (char*) "/sbin/init";
1385                         execve(a[0], a, (char**) envp);
1386                 } else if (argc > optind)
1387                         execvpe(argv[optind], argv + optind, (char**) envp);
1388                 else {
1389                         chdir(home ? home : "/root");
1390                         execle("/bin/bash", "-bash", NULL, (char**) envp);
1391                 }
1392
1393                 log_error("execv() failed: %m");
1394
1395         child_fail:
1396                 _exit(EXIT_FAILURE);
1397         }
1398
1399         if (process_pty(master, &mask) < 0)
1400                 goto finish;
1401
1402         if (saved_attr_valid) {
1403                 tcsetattr(STDIN_FILENO, TCSANOW, &saved_attr);
1404                 saved_attr_valid = false;
1405         }
1406
1407         r = wait_for_terminate_and_warn(argc > optind ? argv[optind] : "bash", pid);
1408
1409         if (r < 0)
1410                 r = EXIT_FAILURE;
1411
1412 finish:
1413         if (saved_attr_valid)
1414                 tcsetattr(STDIN_FILENO, TCSANOW, &saved_attr);
1415
1416         if (master >= 0)
1417                 close_nointr_nofail(master);
1418
1419         close_pipe(kmsg_socket_pair);
1420
1421         if (oldcg)
1422                 cg_attach(SYSTEMD_CGROUP_CONTROLLER, oldcg, 0);
1423
1424         if (newcg)
1425                 cg_kill_recursive_and_wait(SYSTEMD_CGROUP_CONTROLLER, newcg, true);
1426
1427         free(arg_directory);
1428         strv_free(arg_controllers);
1429         free(oldcg);
1430         free(newcg);
1431
1432         return r;
1433 }