chiark / gitweb /
nspawn: don't provide /dev/rtc0 in the container
[elogind.git] / src / nspawn / nspawn.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4   This file is part of systemd.
5
6   Copyright 2010 Lennart Poettering
7
8   systemd is free software; you can redistribute it and/or modify it
9   under the terms of the GNU Lesser General Public License as published by
10   the Free Software Foundation; either version 2.1 of the License, or
11   (at your option) any later version.
12
13   systemd is distributed in the hope that it will be useful, but
14   WITHOUT ANY WARRANTY; without even the implied warranty of
15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16   Lesser General Public License for more details.
17
18   You should have received a copy of the GNU Lesser General Public License
19   along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <signal.h>
23 #include <sched.h>
24 #include <unistd.h>
25 #include <sys/types.h>
26 #include <sys/syscall.h>
27 #include <sys/mount.h>
28 #include <sys/wait.h>
29 #include <stdlib.h>
30 #include <string.h>
31 #include <stdio.h>
32 #include <errno.h>
33 #include <sys/prctl.h>
34 #include <sys/capability.h>
35 #include <getopt.h>
36 #include <sys/epoll.h>
37 #include <termios.h>
38 #include <sys/signalfd.h>
39 #include <grp.h>
40 #include <linux/fs.h>
41 #include <sys/un.h>
42 #include <sys/socket.h>
43
44 #include <systemd/sd-daemon.h>
45
46 #include "log.h"
47 #include "util.h"
48 #include "mkdir.h"
49 #include "audit.h"
50 #include "missing.h"
51 #include "cgroup-util.h"
52 #include "strv.h"
53 #include "path-util.h"
54 #include "loopback-setup.h"
55 #include "sd-id128.h"
56 #include "dev-setup.h"
57
/* Selects how setup_journal() connects the host's and the container's
 * /var/log/journal/<machine-id> directories. */
typedef enum LinkJournal {
        LINK_NO    = 0,  /* do nothing at all */
        LINK_AUTO  = 1,  /* default; missing or empty machine ID and existing mount points are skipped silently */
        LINK_HOST  = 2,  /* keep the data on the host, bind mount it into the guest */
        LINK_GUEST = 3   /* keep the data in the guest, symlink the host path to it */
} LinkJournal;
64
65 static char *arg_directory = NULL;
66 static char *arg_user = NULL;
67 static char **arg_controllers = NULL;
68 static char *arg_uuid = NULL;
69 static bool arg_private_network = false;
70 static bool arg_read_only = false;
71 static bool arg_boot = false;
72 static LinkJournal arg_link_journal = LINK_AUTO;
73 static uint64_t arg_retain =
74         (1ULL << CAP_CHOWN) |
75         (1ULL << CAP_DAC_OVERRIDE) |
76         (1ULL << CAP_DAC_READ_SEARCH) |
77         (1ULL << CAP_FOWNER) |
78         (1ULL << CAP_FSETID) |
79         (1ULL << CAP_IPC_OWNER) |
80         (1ULL << CAP_KILL) |
81         (1ULL << CAP_LEASE) |
82         (1ULL << CAP_LINUX_IMMUTABLE) |
83         (1ULL << CAP_NET_BIND_SERVICE) |
84         (1ULL << CAP_NET_BROADCAST) |
85         (1ULL << CAP_NET_RAW) |
86         (1ULL << CAP_SETGID) |
87         (1ULL << CAP_SETFCAP) |
88         (1ULL << CAP_SETPCAP) |
89         (1ULL << CAP_SETUID) |
90         (1ULL << CAP_SYS_ADMIN) |
91         (1ULL << CAP_SYS_CHROOT) |
92         (1ULL << CAP_SYS_NICE) |
93         (1ULL << CAP_SYS_PTRACE) |
94         (1ULL << CAP_SYS_TTY_CONFIG) |
95         (1ULL << CAP_SYS_RESOURCE);
96
/* Print the usage summary to stdout.
 *
 * Returns: always 0. */
static int help(void) {

        /* Note: -j is described as "guest" because that is the mode
         * parse_argv() actually selects for it (LINK_GUEST). */
        printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
               "Spawn a minimal namespace container for debugging, testing and building.\n\n"
               "  -h --help               Show this help\n"
               "  -D --directory=NAME     Root directory for the container\n"
               "  -b --boot               Boot up full system (i.e. invoke init)\n"
               "  -u --user=USER          Run the command under specified user or uid\n"
               "  -C --controllers=LIST   Put the container in specified comma-separated cgroup hierarchies\n"
               "     --uuid=UUID          Set a specific machine UUID for the container\n"
               "     --private-network    Disable network in container\n"
               "     --read-only          Mount the root directory read-only\n"
               "     --capability=CAP     In addition to the default, retain specified capability\n"
               "     --link-journal=MODE  Link up guest journal, one of no, auto, guest, host\n"
               "  -j                      Equivalent to --link-journal=guest\n",
               program_invocation_short_name);

        return 0;
}
116
117 static int parse_argv(int argc, char *argv[]) {
118
119         enum {
120                 ARG_PRIVATE_NETWORK = 0x100,
121                 ARG_UUID,
122                 ARG_READ_ONLY,
123                 ARG_CAPABILITY,
124                 ARG_LINK_JOURNAL
125         };
126
127         static const struct option options[] = {
128                 { "help",            no_argument,       NULL, 'h'                 },
129                 { "directory",       required_argument, NULL, 'D'                 },
130                 { "user",            required_argument, NULL, 'u'                 },
131                 { "controllers",     required_argument, NULL, 'C'                 },
132                 { "private-network", no_argument,       NULL, ARG_PRIVATE_NETWORK },
133                 { "boot",            no_argument,       NULL, 'b'                 },
134                 { "uuid",            required_argument, NULL, ARG_UUID            },
135                 { "read-only",       no_argument,       NULL, ARG_READ_ONLY       },
136                 { "capability",      required_argument, NULL, ARG_CAPABILITY      },
137                 { "link-journal",    required_argument, NULL, ARG_LINK_JOURNAL    },
138                 { NULL,              0,                 NULL, 0                   }
139         };
140
141         int c;
142
143         assert(argc >= 0);
144         assert(argv);
145
146         while ((c = getopt_long(argc, argv, "+hD:u:C:bj", options, NULL)) >= 0) {
147
148                 switch (c) {
149
150                 case 'h':
151                         help();
152                         return 0;
153
154                 case 'D':
155                         free(arg_directory);
156                         arg_directory = canonicalize_file_name(optarg);
157                         if (!arg_directory) {
158                                 log_error("Failed to canonicalize root directory.");
159                                 return -ENOMEM;
160                         }
161
162                         break;
163
164                 case 'u':
165                         free(arg_user);
166                         if (!(arg_user = strdup(optarg))) {
167                                 log_error("Failed to duplicate user name.");
168                                 return -ENOMEM;
169                         }
170
171                         break;
172
173                 case 'C':
174                         strv_free(arg_controllers);
175                         arg_controllers = strv_split(optarg, ",");
176                         if (!arg_controllers) {
177                                 log_error("Failed to split controllers list.");
178                                 return -ENOMEM;
179                         }
180                         strv_uniq(arg_controllers);
181
182                         break;
183
184                 case ARG_PRIVATE_NETWORK:
185                         arg_private_network = true;
186                         break;
187
188                 case 'b':
189                         arg_boot = true;
190                         break;
191
192                 case ARG_UUID:
193                         arg_uuid = optarg;
194                         break;
195
196                 case ARG_READ_ONLY:
197                         arg_read_only = true;
198                         break;
199
200                 case ARG_CAPABILITY: {
201                         char *state, *word;
202                         size_t length;
203
204                         FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
205                                 cap_value_t cap;
206                                 char *t;
207
208                                 t = strndup(word, length);
209                                 if (!t)
210                                         return log_oom();
211
212                                 if (cap_from_name(t, &cap) < 0) {
213                                         log_error("Failed to parse capability %s.", t);
214                                         free(t);
215                                         return -EINVAL;
216                                 }
217
218                                 free(t);
219                                 arg_retain |= 1ULL << (uint64_t) cap;
220                         }
221
222                         break;
223                 }
224
225                 case 'j':
226                         arg_link_journal = LINK_GUEST;
227                         break;
228
229                 case ARG_LINK_JOURNAL:
230                         if (streq(optarg, "auto"))
231                                 arg_link_journal = LINK_AUTO;
232                         else if (streq(optarg, "no"))
233                                 arg_link_journal = LINK_NO;
234                         else if (streq(optarg, "guest"))
235                                 arg_link_journal = LINK_GUEST;
236                         else if (streq(optarg, "host"))
237                                 arg_link_journal = LINK_HOST;
238                         else {
239                                 log_error("Failed to parse link journal mode %s", optarg);
240                                 return -EINVAL;
241                         }
242
243                         break;
244
245                 case '?':
246                         return -EINVAL;
247
248                 default:
249                         log_error("Unknown option code %c", c);
250                         return -EINVAL;
251                 }
252         }
253
254         return 1;
255 }
256
/* Mount the API file systems listed in mount_table below beneath the
 * container root.
 *
 * dest: path of the container root directory.
 *
 * Every table entry is attempted even if an earlier one failed; only an
 * allocation failure aborts the loop.
 *
 * Returns: 0 on success, otherwise the first error encountered as a
 *          negative errno-style value. Mount failures of entries that
 *          are not marked fatal are ignored. */
static int mount_all(const char *dest) {

        typedef struct MountPoint {
                const char *what;
                const char *where;
                const char *type;
                const char *options;
                unsigned long flags;
                bool fatal;
        } MountPoint;

        static const MountPoint mount_table[] = {
                { "proc",      "/proc",     "proc",  NULL,       MS_NOSUID|MS_NOEXEC|MS_NODEV, true  },
                { "/proc/sys", "/proc/sys", NULL,    NULL,       MS_BIND, true                       },   /* Bind mount first */
                { NULL,        "/proc/sys", NULL,    NULL,       MS_BIND|MS_RDONLY|MS_REMOUNT, true  },   /* Then, make it r/o */
                { "sysfs",     "/sys",      "sysfs", NULL,       MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true  },
                { "tmpfs",     "/dev",      "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME,     true  },
                { "/dev/pts",  "/dev/pts",  NULL,    NULL,       MS_BIND,                      true  },
                { "tmpfs",     "/run",      "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true  },
#ifdef HAVE_SELINUX
                { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND,                      false },  /* Bind mount first */
                { NULL,              "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, false },  /* Then, make it r/o */
#endif
        };

        unsigned k;
        int r = 0;

        for (k = 0; k < ELEMENTSOF(mount_table); k++) {
                char *where = NULL;
                int t;

                if (asprintf(&where, "%s/%s", dest, mount_table[k].where) < 0) {
                        log_oom();

                        if (r == 0)
                                r = -ENOMEM;

                        break;
                }

                t = path_is_mount_point(where, true);
                if (t < 0) {
                        log_error("Failed to detect whether %s is a mount point: %s", where, strerror(-t));
                        free(where);

                        if (r == 0)
                                r = t;

                        continue;
                }

                /* Skip this entry if it is not a remount: something is
                 * mounted there already. Entries without a source ("what")
                 * are the remount steps and are always applied. */
                if (mount_table[k].what && t > 0) {
                        free(where);
                        continue;
                }

                /* Best effort; a missing directory shows up as a mount() failure below. */
                mkdir_p_label(where, 0755);

                if (mount(mount_table[k].what,
                          where,
                          mount_table[k].type,
                          mount_table[k].flags,
                          mount_table[k].options) < 0 &&
                    mount_table[k].fatal) {

                        log_error("mount(%s) failed: %m", where);

                        if (r == 0)
                                r = -errno;
                }

                free(where);
        }

        return r;
}
333
/* Make the host's time zone configuration visible in the container by
 * bind mounting the host files read-only over their counterparts below
 * dest. Mount failures are ignored ("if possible").
 *
 * Returns: 0, or the result of log_oom() if a path cannot be allocated. */
static int setup_timezone(const char *dest) {
        static const char * const tz_files[] = {
                "/etc/localtime",
                "/etc/timezone"
        };
        unsigned i;

        assert(dest);

        for (i = 0; i < ELEMENTSOF(tz_files); i++) {
                char *target;

                target = strappend(dest, tz_files[i]);
                if (!target)
                        return log_oom();

                /* Bind first, then flip the new mount to read-only. */
                if (mount(tz_files[i], target, "bind", MS_BIND, NULL) >= 0)
                        mount(tz_files[i], target, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);

                free(target);
        }

        return 0;
}
360
361 static int setup_resolv_conf(const char *dest) {
362         char *where;
363
364         assert(dest);
365
366         if (arg_private_network)
367                 return 0;
368
369         /* Fix resolv.conf, if possible */
370         where = strappend(dest, "/etc/resolv.conf");
371         if (!where)
372                 return log_oom();
373
374         if (mount("/etc/resolv.conf", where, "bind", MS_BIND, NULL) >= 0)
375                 mount("/etc/resolv.conf", where, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);
376
377         free(where);
378
379         return 0;
380 }
381
382 static int setup_boot_id(const char *dest) {
383         char *from = NULL, *to = NULL;
384         sd_id128_t rnd;
385         char as_uuid[37];
386         int r;
387
388         assert(dest);
389
390         /* Generate a new randomized boot ID, so that each boot-up of
391          * the container gets a new one */
392
393         from = strappend(dest, "/dev/proc-sys-kernel-random-boot-id");
394         if (!from) {
395                 r = log_oom();
396                 goto finish;
397         }
398
399         to = strappend(dest, "/proc/sys/kernel/random/boot_id");
400         if (!to) {
401                 r = log_oom();
402                 goto finish;
403         }
404
405         r = sd_id128_randomize(&rnd);
406         if (r < 0) {
407                 log_error("Failed to generate random boot id: %s", strerror(-r));
408                 goto finish;
409         }
410
411         snprintf(as_uuid, sizeof(as_uuid),
412                  "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
413                  SD_ID128_FORMAT_VAL(rnd));
414         char_array_0(as_uuid);
415
416         r = write_one_line_file(from, as_uuid);
417         if (r < 0) {
418                 log_error("Failed to write boot id: %s", strerror(-r));
419                 goto finish;
420         }
421
422         if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
423                 log_error("Failed to bind mount boot id: %m");
424                 r = -errno;
425         } else
426                 mount(from, to, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);
427
428         unlink(from);
429
430 finish:
431         free(from);
432         free(to);
433
434         return r;
435 }
436
437 static int copy_devnodes(const char *dest) {
438
439         static const char devnodes[] =
440                 "null\0"
441                 "zero\0"
442                 "full\0"
443                 "random\0"
444                 "urandom\0"
445                 "tty\0"
446                 "ptmx\0";
447
448         const char *d;
449         int r = 0;
450         mode_t u;
451
452         assert(dest);
453
454         u = umask(0000);
455
456         NULSTR_FOREACH(d, devnodes) {
457                 struct stat st;
458                 char *from = NULL, *to = NULL;
459
460                 asprintf(&from, "/dev/%s", d);
461                 asprintf(&to, "%s/dev/%s", dest, d);
462
463                 if (!from || !to) {
464                         log_error("Failed to allocate devnode path");
465
466                         free(from);
467                         free(to);
468
469                         from = to = NULL;
470
471                         if (r == 0)
472                                 r = -ENOMEM;
473
474                         break;
475                 }
476
477                 if (stat(from, &st) < 0) {
478
479                         if (errno != ENOENT) {
480                                 log_error("Failed to stat %s: %m", from);
481                                 if (r == 0)
482                                         r = -errno;
483                         }
484
485                 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
486
487                         log_error("%s is not a char or block device, cannot copy.", from);
488                         if (r == 0)
489                                 r = -EIO;
490
491                 } else if (mknod(to, st.st_mode, st.st_rdev) < 0) {
492
493                         log_error("mknod(%s) failed: %m", dest);
494                         if (r == 0)
495                                 r = -errno;
496                 }
497
498                 free(from);
499                 free(to);
500         }
501
502         umask(u);
503
504         return r;
505 }
506
/* Set up /dev/console inside the container as a bind mount of the given
 * TTY device.
 *
 * dest:    path of the container root directory.
 * console: path of the TTY to expose; its mode/ownership is changed to
 *          0600 root:root first.
 *
 * Returns: negative errno-style value on failure, otherwise the
 *          (non-negative) result of chmod_and_chown(). */
static int setup_dev_console(const char *dest, const char *console) {
        char *node = NULL;
        struct stat tty_st;
        mode_t saved_umask;
        int r;

        assert(dest);
        assert(console);

        saved_umask = umask(0000);

        if (stat(console, &tty_st) < 0) {
                log_error("Failed to stat %s: %m", console);
                r = -errno;
                goto finish;
        }

        if (!S_ISCHR(tty_st.st_mode)) {
                log_error("/dev/console is not a char device.");
                r = -EIO;
                goto finish;
        }

        r = chmod_and_chown(console, 0600, 0, 0);
        if (r < 0) {
                log_error("Failed to correct access mode for TTY: %s", strerror(-r));
                goto finish;
        }

        if (asprintf(&node, "%s/dev/console", dest) < 0) {
                r = log_oom();
                goto finish;
        }

        /* ptys can only live on a pts file system, so the real TTY has
         * to be bind mounted onto /dev/console. The device node made
         * here merely serves as the mount target; its major/minor is
         * irrelevant because it gets covered by the mount right away. */
        if (mknod(node, (tty_st.st_mode & ~07777) | 0600, tty_st.st_rdev) < 0) {
                log_error("mknod() for /dev/console failed: %m");
                r = -errno;
                goto finish;
        }

        if (mount(console, node, "bind", MS_BIND, NULL) < 0) {
                log_error("Bind mount for /dev/console failed: %m");
                r = -errno;
                goto finish;
        }

finish:
        free(node);
        umask(saved_umask);

        return r;
}
565
/* Provide a fake /proc/kmsg in the container, backed by a FIFO.
 *
 * dest:        path of the container root directory.
 * kmsg_socket: socket over which an open fd of the FIFO is sent
 *              (SCM_RIGHTS) so that it stays referenced.
 *
 * Returns: negative errno-style value on failure, otherwise the
 *          (non-negative) result of chmod_and_chown(). */
static int setup_kmsg(const char *dest, int kmsg_socket) {
        char *fifo = NULL, *target = NULL;
        int r, fifo_fd, sent;
        mode_t saved_umask;
        union {
                struct cmsghdr cmsghdr;
                uint8_t buf[CMSG_SPACE(sizeof(int))];
        } control;
        struct msghdr mh;
        struct cmsghdr *cmsg;

        assert(dest);
        assert(kmsg_socket >= 0);

        saved_umask = umask(0000);

        /* The FIFO is created as /dev/kmsg, bind mounted to /proc/kmsg
         * and then deleted again right away. Reading from a FIFO is
         * very much like reading from /proc/kmsg, but writing is not
         * like writing to /dev/kmsg: a FIFO write blocks while nobody
         * reads. To keep containers from deadlocking on that, /dev/kmsg
         * is simply not made available to them. */
        if (asprintf(&fifo, "%s/dev/kmsg", dest) < 0) {
                r = log_oom();
                goto finish;
        }

        if (asprintf(&target, "%s/proc/kmsg", dest) < 0) {
                r = log_oom();
                goto finish;
        }

        if (mkfifo(fifo, 0600) < 0) {
                log_error("mkfifo() for /dev/kmsg failed: %m");
                r = -errno;
                goto finish;
        }

        r = chmod_and_chown(fifo, 0600, 0, 0);
        if (r < 0) {
                log_error("Failed to correct access mode for /dev/kmsg: %s", strerror(-r));
                goto finish;
        }

        if (mount(fifo, target, "bind", MS_BIND, NULL) < 0) {
                log_error("Bind mount for /proc/kmsg failed: %m");
                r = -errno;
                goto finish;
        }

        fifo_fd = open(fifo, O_RDWR|O_NDELAY|O_CLOEXEC);
        if (fifo_fd < 0) {
                log_error("Failed to open fifo: %m");
                r = -errno;
                goto finish;
        }

        /* Build a message that carries nothing but the fd as ancillary data. */
        zero(mh);
        zero(control);

        mh.msg_control = &control;
        mh.msg_controllen = sizeof(control);

        cmsg = CMSG_FIRSTHDR(&mh);
        cmsg->cmsg_level = SOL_SOCKET;
        cmsg->cmsg_type = SCM_RIGHTS;
        cmsg->cmsg_len = CMSG_LEN(sizeof(int));
        memcpy(CMSG_DATA(cmsg), &fifo_fd, sizeof(int));

        mh.msg_controllen = cmsg->cmsg_len;

        /* Park the fd in the socket so that it remains open for as
         * long as the child runs; our own copy can go right after. */
        sent = sendmsg(kmsg_socket, &mh, MSG_DONTWAIT|MSG_NOSIGNAL);
        close_nointr_nofail(fifo_fd);

        if (sent < 0) {
                log_error("Failed to send FIFO fd: %m");
                r = -errno;
                goto finish;
        }

        /* Finally, take the FIFO away from /dev/kmsg again. */
        unlink(fifo);

finish:
        free(fifo);
        free(target);
        umask(saved_umask);

        return r;
}
659
660 static int setup_hostname(void) {
661         char *hn;
662         int r = 0;
663
664         hn = path_get_file_name(arg_directory);
665         if (hn) {
666                 hn = strdup(hn);
667                 if (!hn)
668                         return -ENOMEM;
669
670                 hostname_cleanup(hn);
671
672                 if (!isempty(hn))
673                         if (sethostname(hn, strlen(hn)) < 0)
674                                 r = -errno;
675
676                 free(hn);
677         }
678
679         return r;
680 }
681
682 static int setup_journal(const char *directory) {
683         sd_id128_t machine_id;
684         char *p = NULL, *b = NULL, *l, *q = NULL, *d = NULL;
685         int r;
686
687         if (arg_link_journal == LINK_NO)
688                 return 0;
689
690         p = strappend(directory, "/etc/machine-id");
691         if (!p) {
692                 r = log_oom();
693                 goto finish;
694         }
695
696         r = read_one_line_file(p, &b);
697         if (r == -ENOENT && arg_link_journal == LINK_AUTO) {
698                 r = 0;
699                 goto finish;
700         } else if (r < 0) {
701                 log_error("Failed to read machine ID: %s", strerror(-r));
702                 return r;
703         }
704
705         l = strstrip(b);
706         if (isempty(l) && arg_link_journal == LINK_AUTO) {
707                 r = 0;
708                 goto finish;
709         }
710
711         /* Verify validaty */
712         r = sd_id128_from_string(l, &machine_id);
713         if (r < 0) {
714                 log_error("Failed to parse machine ID: %s", strerror(-r));
715                 goto finish;
716         }
717
718         free(p);
719         p = strappend("/var/log/journal/", l);
720         q = strjoin(directory, "/var/log/journal/", l, NULL);
721         if (!p || !q) {
722                 r = log_oom();
723                 goto finish;
724         }
725
726         if (path_is_mount_point(p, false) > 0 ||
727             path_is_mount_point(q, false) > 0) {
728                 if (arg_link_journal != LINK_AUTO) {
729                         log_error("Journal already a mount point, refusing.");
730                         r = -EEXIST;
731                         goto finish;
732                 }
733
734                 r = 0;
735                 goto finish;
736         }
737
738         r = readlink_and_make_absolute(p, &d);
739         if (r >= 0) {
740                 if ((arg_link_journal == LINK_GUEST ||
741                      arg_link_journal == LINK_AUTO) &&
742                     path_equal(d, q)) {
743
744                         mkdir_p(q, 0755);
745
746                         r = 0;
747                         goto finish;
748                 }
749
750                 if (unlink(p) < 0) {
751                         log_error("Failed to remove symlink %s: %m", p);
752                         r = -errno;
753                         goto finish;
754                 }
755         } else if (r == -EINVAL) {
756
757                 if (arg_link_journal == LINK_GUEST &&
758                     rmdir(p) < 0) {
759
760                         if (errno == ENOTDIR)
761                                 log_error("%s already exists and is neither symlink nor directory.", p);
762                         else {
763                                 log_error("Failed to remove %s: %m", p);
764                                 r = -errno;
765                         }
766
767                         goto finish;
768                 }
769         } else if (r != -ENOENT) {
770                 log_error("readlink(%s) failed: %m", p);
771                 goto finish;
772         }
773
774         if (arg_link_journal == LINK_GUEST) {
775
776                 if (symlink(q, p) < 0) {
777                         log_error("Failed to symlink %s to %s: %m", q, p);
778                         r = -errno;
779                         goto finish;
780                 }
781
782                 mkdir_p(q, 0755);
783
784                 r = 0;
785                 goto finish;
786         }
787
788         if (arg_link_journal == LINK_HOST) {
789                 r = mkdir_p(p, 0755);
790                 if (r < 0) {
791                         log_error("Failed to create %s: %m", p);
792                         goto finish;
793                 }
794
795         } else if (access(p, F_OK) < 0) {
796                 r = 0;
797                 goto finish;
798         }
799
800         if (dir_is_empty(q) == 0) {
801                 log_error("%s not empty.", q);
802                 r = -ENOTEMPTY;
803                 goto finish;
804         }
805
806         r = mkdir_p(q, 0755);
807         if (r < 0) {
808                 log_error("Failed to create %s: %m", q);
809                 goto finish;
810         }
811
812         if (mount(p, q, "bind", MS_BIND, NULL) < 0) {
813                 log_error("Failed to bind mount journal from host into guest: %m");
814                 r = -errno;
815                 goto finish;
816         }
817
818         r = 0;
819
820 finish:
821         free(p);
822         free(q);
823         free(d);
824         free(b);
825         return r;
826
827 }
828
829 static int drop_capabilities(void) {
830         return capability_bounding_set_drop(~arg_retain, false);
831 }
832
/* Check whether the directory tree at path looks like an OS tree. The
 * existence of <path>/bin/sh is used as the indicator.
 *
 * Returns: 1 if <path>/bin/sh exists, 0 if it cannot be accessed,
 *          -ENOMEM if the path string cannot be allocated. */
static int is_os_tree(const char *path) {
        char *shell = NULL;
        int found;

        if (asprintf(&shell, "%s/bin/sh", path) < 0)
                return -ENOMEM;

        found = access(shell, F_OK) >= 0;
        free(shell);

        return found ? 1 : 0;
}
846
847 static int process_pty(int master, sigset_t *mask) {
848
849         char in_buffer[LINE_MAX], out_buffer[LINE_MAX];
850         size_t in_buffer_full = 0, out_buffer_full = 0;
851         struct epoll_event stdin_ev, stdout_ev, master_ev, signal_ev;
852         bool stdin_readable = false, stdout_writable = false, master_readable = false, master_writable = false;
853         int ep = -1, signal_fd = -1, r;
854
855         fd_nonblock(STDIN_FILENO, 1);
856         fd_nonblock(STDOUT_FILENO, 1);
857         fd_nonblock(master, 1);
858
859         signal_fd = signalfd(-1, mask, SFD_NONBLOCK|SFD_CLOEXEC);
860         if (signal_fd < 0) {
861                 log_error("signalfd(): %m");
862                 r = -errno;
863                 goto finish;
864         }
865
866         ep = epoll_create1(EPOLL_CLOEXEC);
867         if (ep < 0) {
868                 log_error("Failed to create epoll: %m");
869                 r = -errno;
870                 goto finish;
871         }
872
873         zero(stdin_ev);
874         stdin_ev.events = EPOLLIN|EPOLLET;
875         stdin_ev.data.fd = STDIN_FILENO;
876
877         zero(stdout_ev);
878         stdout_ev.events = EPOLLOUT|EPOLLET;
879         stdout_ev.data.fd = STDOUT_FILENO;
880
881         zero(master_ev);
882         master_ev.events = EPOLLIN|EPOLLOUT|EPOLLET;
883         master_ev.data.fd = master;
884
885         zero(signal_ev);
886         signal_ev.events = EPOLLIN;
887         signal_ev.data.fd = signal_fd;
888
889         if (epoll_ctl(ep, EPOLL_CTL_ADD, STDIN_FILENO, &stdin_ev) < 0 ||
890             epoll_ctl(ep, EPOLL_CTL_ADD, STDOUT_FILENO, &stdout_ev) < 0 ||
891             epoll_ctl(ep, EPOLL_CTL_ADD, master, &master_ev) < 0 ||
892             epoll_ctl(ep, EPOLL_CTL_ADD, signal_fd, &signal_ev) < 0) {
893                 log_error("Failed to regiser fds in epoll: %m");
894                 r = -errno;
895                 goto finish;
896         }
897
898         for (;;) {
899                 struct epoll_event ev[16];
900                 ssize_t k;
901                 int i, nfds;
902
903                 nfds = epoll_wait(ep, ev, ELEMENTSOF(ev), -1);
904                 if (nfds < 0) {
905
906                         if (errno == EINTR || errno == EAGAIN)
907                                 continue;
908
909                         log_error("epoll_wait(): %m");
910                         r = -errno;
911                         goto finish;
912                 }
913
914                 assert(nfds >= 1);
915
916                 for (i = 0; i < nfds; i++) {
917                         if (ev[i].data.fd == STDIN_FILENO) {
918
919                                 if (ev[i].events & (EPOLLIN|EPOLLHUP))
920                                         stdin_readable = true;
921
922                         } else if (ev[i].data.fd == STDOUT_FILENO) {
923
924                                 if (ev[i].events & (EPOLLOUT|EPOLLHUP))
925                                         stdout_writable = true;
926
927                         } else if (ev[i].data.fd == master) {
928
929                                 if (ev[i].events & (EPOLLIN|EPOLLHUP))
930                                         master_readable = true;
931
932                                 if (ev[i].events & (EPOLLOUT|EPOLLHUP))
933                                         master_writable = true;
934
935                         } else if (ev[i].data.fd == signal_fd) {
936                                 struct signalfd_siginfo sfsi;
937                                 ssize_t n;
938
939                                 n = read(signal_fd, &sfsi, sizeof(sfsi));
940                                 if (n != sizeof(sfsi)) {
941
942                                         if (n >= 0) {
943                                                 log_error("Failed to read from signalfd: invalid block size");
944                                                 r = -EIO;
945                                                 goto finish;
946                                         }
947
948                                         if (errno != EINTR && errno != EAGAIN) {
949                                                 log_error("Failed to read from signalfd: %m");
950                                                 r = -errno;
951                                                 goto finish;
952                                         }
953                                 } else {
954
955                                         if (sfsi.ssi_signo == SIGWINCH) {
956                                                 struct winsize ws;
957
958                                                 /* The window size changed, let's forward that. */
959                                                 if (ioctl(STDIN_FILENO, TIOCGWINSZ, &ws) >= 0)
960                                                         ioctl(master, TIOCSWINSZ, &ws);
961                                         } else {
962                                                 r = 0;
963                                                 goto finish;
964                                         }
965                                 }
966                         }
967                 }
968
969                 while ((stdin_readable && in_buffer_full <= 0) ||
970                        (master_writable && in_buffer_full > 0) ||
971                        (master_readable && out_buffer_full <= 0) ||
972                        (stdout_writable && out_buffer_full > 0)) {
973
974                         if (stdin_readable && in_buffer_full < LINE_MAX) {
975
976                                 k = read(STDIN_FILENO, in_buffer + in_buffer_full, LINE_MAX - in_buffer_full);
977                                 if (k < 0) {
978
979                                         if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
980                                                 stdin_readable = false;
981                                         else {
982                                                 log_error("read(): %m");
983                                                 r = -errno;
984                                                 goto finish;
985                                         }
986                                 } else
987                                         in_buffer_full += (size_t) k;
988                         }
989
990                         if (master_writable && in_buffer_full > 0) {
991
992                                 k = write(master, in_buffer, in_buffer_full);
993                                 if (k < 0) {
994
995                                         if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
996                                                 master_writable = false;
997                                         else {
998                                                 log_error("write(): %m");
999                                                 r = -errno;
1000                                                 goto finish;
1001                                         }
1002
1003                                 } else {
1004                                         assert(in_buffer_full >= (size_t) k);
1005                                         memmove(in_buffer, in_buffer + k, in_buffer_full - k);
1006                                         in_buffer_full -= k;
1007                                 }
1008                         }
1009
1010                         if (master_readable && out_buffer_full < LINE_MAX) {
1011
1012                                 k = read(master, out_buffer + out_buffer_full, LINE_MAX - out_buffer_full);
1013                                 if (k < 0) {
1014
1015                                         if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
1016                                                 master_readable = false;
1017                                         else {
1018                                                 log_error("read(): %m");
1019                                                 r = -errno;
1020                                                 goto finish;
1021                                         }
1022                                 }  else
1023                                         out_buffer_full += (size_t) k;
1024                         }
1025
1026                         if (stdout_writable && out_buffer_full > 0) {
1027
1028                                 k = write(STDOUT_FILENO, out_buffer, out_buffer_full);
1029                                 if (k < 0) {
1030
1031                                         if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
1032                                                 stdout_writable = false;
1033                                         else {
1034                                                 log_error("write(): %m");
1035                                                 r = -errno;
1036                                                 goto finish;
1037                                         }
1038
1039                                 } else {
1040                                         assert(out_buffer_full >= (size_t) k);
1041                                         memmove(out_buffer, out_buffer + k, out_buffer_full - k);
1042                                         out_buffer_full -= k;
1043                                 }
1044                         }
1045                 }
1046         }
1047
1048 finish:
1049         if (ep >= 0)
1050                 close_nointr_nofail(ep);
1051
1052         if (signal_fd >= 0)
1053                 close_nointr_nofail(signal_fd);
1054
1055         return r;
1056 }
1057
1058 int main(int argc, char *argv[]) {
1059         pid_t pid = 0;
1060         int r = EXIT_FAILURE, k;
1061         char *oldcg = NULL, *newcg = NULL;
1062         char **controller = NULL;
1063         int master = -1;
1064         const char *console = NULL;
1065         struct termios saved_attr, raw_attr;
1066         sigset_t mask;
1067         bool saved_attr_valid = false;
1068         struct winsize ws;
1069         int kmsg_socket_pair[2] = { -1, -1 };
1070
1071         log_parse_environment();
1072         log_open();
1073
1074         r = parse_argv(argc, argv);
1075         if (r <= 0)
1076                 goto finish;
1077
1078         if (arg_directory) {
1079                 char *p;
1080
1081                 p = path_make_absolute_cwd(arg_directory);
1082                 free(arg_directory);
1083                 arg_directory = p;
1084         } else
1085                 arg_directory = get_current_dir_name();
1086
1087         if (!arg_directory) {
1088                 log_error("Failed to determine path");
1089                 goto finish;
1090         }
1091
1092         path_kill_slashes(arg_directory);
1093
1094         if (geteuid() != 0) {
1095                 log_error("Need to be root.");
1096                 goto finish;
1097         }
1098
1099         if (sd_booted() <= 0) {
1100                 log_error("Not running on a systemd system.");
1101                 goto finish;
1102         }
1103
1104         if (path_equal(arg_directory, "/")) {
1105                 log_error("Spawning container on root directory not supported.");
1106                 goto finish;
1107         }
1108
1109         if (is_os_tree(arg_directory) <= 0) {
1110                 log_error("Directory %s doesn't look like an OS root directory. Refusing.", arg_directory);
1111                 goto finish;
1112         }
1113
1114         k = cg_get_by_pid(SYSTEMD_CGROUP_CONTROLLER, 0, &oldcg);
1115         if (k < 0) {
1116                 log_error("Failed to determine current cgroup: %s", strerror(-k));
1117                 goto finish;
1118         }
1119
1120         if (asprintf(&newcg, "%s/nspawn-%lu", oldcg, (unsigned long) getpid()) < 0) {
1121                 log_error("Failed to allocate cgroup path.");
1122                 goto finish;
1123         }
1124
1125         k = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, newcg, 0);
1126         if (k < 0)  {
1127                 log_error("Failed to create cgroup: %s", strerror(-k));
1128                 goto finish;
1129         }
1130
1131         STRV_FOREACH(controller, arg_controllers) {
1132                 k = cg_create_and_attach(*controller, newcg, 0);
1133                 if (k < 0)
1134                         log_warning("Failed to create cgroup in controller %s: %s", *controller, strerror(-k));
1135         }
1136
1137         master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
1138         if (master < 0) {
1139                 log_error("Failed to acquire pseudo tty: %m");
1140                 goto finish;
1141         }
1142
1143         console = ptsname(master);
1144         if (!console) {
1145                 log_error("Failed to determine tty name: %m");
1146                 goto finish;
1147         }
1148
1149         log_info("Spawning namespace container on %s (console is %s).", arg_directory, console);
1150
1151         if (ioctl(STDIN_FILENO, TIOCGWINSZ, &ws) >= 0)
1152                 ioctl(master, TIOCSWINSZ, &ws);
1153
1154         if (unlockpt(master) < 0) {
1155                 log_error("Failed to unlock tty: %m");
1156                 goto finish;
1157         }
1158
1159         if (tcgetattr(STDIN_FILENO, &saved_attr) < 0) {
1160                 log_error("Failed to get terminal attributes: %m");
1161                 goto finish;
1162         }
1163
1164         saved_attr_valid = true;
1165
1166         raw_attr = saved_attr;
1167         cfmakeraw(&raw_attr);
1168         raw_attr.c_lflag &= ~ECHO;
1169
1170         if (tcsetattr(STDIN_FILENO, TCSANOW, &raw_attr) < 0) {
1171                 log_error("Failed to set terminal attributes: %m");
1172                 goto finish;
1173         }
1174
1175         if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_NONBLOCK|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
1176                 log_error("Failed to create kmsg socket pair");
1177                 goto finish;
1178         }
1179
1180         assert_se(sigemptyset(&mask) == 0);
1181         sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
1182         assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
1183
1184         pid = syscall(__NR_clone, SIGCHLD|CLONE_NEWIPC|CLONE_NEWNS|CLONE_NEWPID|CLONE_NEWUTS|(arg_private_network ? CLONE_NEWNET : 0), NULL);
1185         if (pid < 0) {
1186                 if (errno == EINVAL)
1187                         log_error("clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
1188                 else
1189                         log_error("clone() failed: %m");
1190
1191                 goto finish;
1192         }
1193
1194         if (pid == 0) {
1195                 /* child */
1196
1197                 const char *home = NULL;
1198                 uid_t uid = (uid_t) -1;
1199                 gid_t gid = (gid_t) -1;
1200                 const char *envp[] = {
1201                         "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
1202                         "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
1203                         NULL, /* TERM */
1204                         NULL, /* HOME */
1205                         NULL, /* USER */
1206                         NULL, /* LOGNAME */
1207                         NULL, /* container_uuid */
1208                         NULL
1209                 };
1210
1211                 envp[2] = strv_find_prefix(environ, "TERM=");
1212
1213                 close_nointr_nofail(master);
1214
1215                 close_nointr(STDIN_FILENO);
1216                 close_nointr(STDOUT_FILENO);
1217                 close_nointr(STDERR_FILENO);
1218
1219                 close_all_fds(&kmsg_socket_pair[1], 1);
1220
1221                 reset_all_signal_handlers();
1222
1223                 assert_se(sigemptyset(&mask) == 0);
1224                 assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
1225
1226                 if (open_terminal(console, O_RDWR) != STDIN_FILENO ||
1227                     dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
1228                     dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO)
1229                         goto child_fail;
1230
1231                 if (setsid() < 0) {
1232                         log_error("setsid() failed: %m");
1233                         goto child_fail;
1234                 }
1235
1236                 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
1237                         log_error("PR_SET_PDEATHSIG failed: %m");
1238                         goto child_fail;
1239                 }
1240
1241                 /* Mark everything as slave, so that we still
1242                  * receive mounts from the real root, but don't
1243                  * propagate mounts to the real root. */
1244                 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
1245                         log_error("MS_SLAVE|MS_REC failed: %m");
1246                         goto child_fail;
1247                 }
1248
1249                 /* Turn directory into bind mount */
1250                 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REC, NULL) < 0) {
1251                         log_error("Failed to make bind mount.");
1252                         goto child_fail;
1253                 }
1254
1255                 if (arg_read_only)
1256                         if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY|MS_REC, NULL) < 0) {
1257                                 log_error("Failed to make read-only.");
1258                                 goto child_fail;
1259                         }
1260
1261                 if (mount_all(arg_directory) < 0)
1262                         goto child_fail;
1263
1264                 if (copy_devnodes(arg_directory) < 0)
1265                         goto child_fail;
1266
1267                 dev_setup(arg_directory);
1268
1269                 if (setup_dev_console(arg_directory, console) < 0)
1270                         goto child_fail;
1271
1272                 if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
1273                         goto child_fail;
1274
1275                 close_nointr_nofail(kmsg_socket_pair[1]);
1276
1277                 if (setup_boot_id(arg_directory) < 0)
1278                         goto child_fail;
1279
1280                 if (setup_timezone(arg_directory) < 0)
1281                         goto child_fail;
1282
1283                 if (setup_resolv_conf(arg_directory) < 0)
1284                         goto child_fail;
1285
1286                 if (setup_journal(arg_directory) < 0)
1287                         goto child_fail;
1288
1289                 if (chdir(arg_directory) < 0) {
1290                         log_error("chdir(%s) failed: %m", arg_directory);
1291                         goto child_fail;
1292                 }
1293
1294                 if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) {
1295                         log_error("mount(MS_MOVE) failed: %m");
1296                         goto child_fail;
1297                 }
1298
1299                 if (chroot(".") < 0) {
1300                         log_error("chroot() failed: %m");
1301                         goto child_fail;
1302                 }
1303
1304                 if (chdir("/") < 0) {
1305                         log_error("chdir() failed: %m");
1306                         goto child_fail;
1307                 }
1308
1309                 umask(0022);
1310
1311                 loopback_setup();
1312
1313                 if (drop_capabilities() < 0) {
1314                         log_error("drop_capabilities() failed: %m");
1315                         goto child_fail;
1316                 }
1317
1318                 if (arg_user) {
1319
1320                         if (get_user_creds((const char**)&arg_user, &uid, &gid, &home, NULL) < 0) {
1321                                 log_error("get_user_creds() failed: %m");
1322                                 goto child_fail;
1323                         }
1324
1325                         if (mkdir_parents_label(home, 0775) < 0) {
1326                                 log_error("mkdir_parents_label() failed: %m");
1327                                 goto child_fail;
1328                         }
1329
1330                         if (mkdir_safe_label(home, 0775, uid, gid) < 0) {
1331                                 log_error("mkdir_safe_label() failed: %m");
1332                                 goto child_fail;
1333                         }
1334
1335                         if (initgroups((const char*)arg_user, gid) < 0) {
1336                                 log_error("initgroups() failed: %m");
1337                                 goto child_fail;
1338                         }
1339
1340                         if (setresgid(gid, gid, gid) < 0) {
1341                                 log_error("setregid() failed: %m");
1342                                 goto child_fail;
1343                         }
1344
1345                         if (setresuid(uid, uid, uid) < 0) {
1346                                 log_error("setreuid() failed: %m");
1347                                 goto child_fail;
1348                         }
1349                 }
1350
1351                 if ((asprintf((char**)(envp + 3), "HOME=%s", home ? home: "/root") < 0) ||
1352                     (asprintf((char**)(envp + 4), "USER=%s", arg_user ? arg_user : "root") < 0) ||
1353                     (asprintf((char**)(envp + 5), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
1354                     log_oom();
1355                     goto child_fail;
1356                 }
1357
1358                 if (arg_uuid) {
1359                         if (asprintf((char**)(envp + 6), "container_uuid=%s", arg_uuid) < 0) {
1360                                 log_oom();
1361                                 goto child_fail;
1362                         }
1363                 }
1364
1365                 setup_hostname();
1366
1367                 if (arg_boot) {
1368                         char **a;
1369                         size_t l;
1370
1371                         /* Automatically search for the init system */
1372
1373                         l = 1 + argc - optind;
1374                         a = newa(char*, l + 1);
1375                         memcpy(a + 1, argv + optind, l * sizeof(char*));
1376
1377                         a[0] = (char*) "/usr/lib/systemd/systemd";
1378                         execve(a[0], a, (char**) envp);
1379
1380                         a[0] = (char*) "/lib/systemd/systemd";
1381                         execve(a[0], a, (char**) envp);
1382
1383                         a[0] = (char*) "/sbin/init";
1384                         execve(a[0], a, (char**) envp);
1385                 } else if (argc > optind)
1386                         execvpe(argv[optind], argv + optind, (char**) envp);
1387                 else {
1388                         chdir(home ? home : "/root");
1389                         execle("/bin/bash", "-bash", NULL, (char**) envp);
1390                 }
1391
1392                 log_error("execv() failed: %m");
1393
1394         child_fail:
1395                 _exit(EXIT_FAILURE);
1396         }
1397
1398         if (process_pty(master, &mask) < 0)
1399                 goto finish;
1400
1401         if (saved_attr_valid) {
1402                 tcsetattr(STDIN_FILENO, TCSANOW, &saved_attr);
1403                 saved_attr_valid = false;
1404         }
1405
1406         r = wait_for_terminate_and_warn(argc > optind ? argv[optind] : "bash", pid);
1407
1408         if (r < 0)
1409                 r = EXIT_FAILURE;
1410
1411 finish:
1412         if (saved_attr_valid)
1413                 tcsetattr(STDIN_FILENO, TCSANOW, &saved_attr);
1414
1415         if (master >= 0)
1416                 close_nointr_nofail(master);
1417
1418         close_pipe(kmsg_socket_pair);
1419
1420         if (oldcg)
1421                 cg_attach(SYSTEMD_CGROUP_CONTROLLER, oldcg, 0);
1422
1423         if (newcg)
1424                 cg_kill_recursive_and_wait(SYSTEMD_CGROUP_CONTROLLER, newcg, true);
1425
1426         free(arg_directory);
1427         strv_free(arg_controllers);
1428         free(oldcg);
1429         free(newcg);
1430
1431         return r;
1432 }