chiark / gitweb /
7f084ef2d01274a4ddefca9fa6351a9fc957e95a
[elogind.git] / src / nspawn / nspawn.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4   This file is part of systemd.
5
6   Copyright 2010 Lennart Poettering
7
8   systemd is free software; you can redistribute it and/or modify it
9   under the terms of the GNU Lesser General Public License as published by
10   the Free Software Foundation; either version 2.1 of the License, or
11   (at your option) any later version.
12
13   systemd is distributed in the hope that it will be useful, but
14   WITHOUT ANY WARRANTY; without even the implied warranty of
15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16   Lesser General Public License for more details.
17
18   You should have received a copy of the GNU Lesser General Public License
19   along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <signal.h>
23 #include <sched.h>
24 #include <unistd.h>
25 #include <sys/types.h>
26 #include <sys/syscall.h>
27 #include <sys/mount.h>
28 #include <sys/wait.h>
29 #include <stdlib.h>
30 #include <string.h>
31 #include <stdio.h>
32 #include <errno.h>
33 #include <sys/prctl.h>
34 #include <sys/capability.h>
35 #include <getopt.h>
36 #include <sys/epoll.h>
37 #include <termios.h>
38 #include <sys/signalfd.h>
39 #include <grp.h>
40 #include <linux/fs.h>
41 #include <sys/un.h>
42 #include <sys/socket.h>
43
44 #include <systemd/sd-daemon.h>
45
46 #include "log.h"
47 #include "util.h"
48 #include "mkdir.h"
49 #include "audit.h"
50 #include "missing.h"
51 #include "cgroup-util.h"
52 #include "strv.h"
53 #include "path-util.h"
54 #include "loopback-setup.h"
55 #include "sd-id128.h"
56 #include "dev-setup.h"
57
58 typedef enum LinkJournal {  /* How setup_journal() connects host and guest journal dirs (--link-journal=, -j) */
59         LINK_NO,             /* setup_journal() returns immediately without touching anything */
60         LINK_AUTO,           /* default: bind mount the host journal dir into the guest only if it already exists; missing machine ID or existing mounts are not errors */
61         LINK_HOST,           /* create the host-side /var/log/journal/<machine-id> if needed and bind mount it into the guest */
62         LINK_GUEST           /* make the host-side /var/log/journal/<machine-id> a symlink to the journal directory inside the container tree */
63 } LinkJournal;
64
65 static char *arg_directory = NULL;  /* -D/--directory: canonicalized container root (heap allocated) */
66 static char *arg_user = NULL;  /* -u/--user: user name or uid string (heap allocated copy) */
67 static char **arg_controllers = NULL;  /* -C/--controllers: comma-separated list split into a de-duplicated string vector */
68 static char *arg_uuid = NULL;  /* --uuid: points directly at the argv string, not a private copy */
69 static bool arg_private_network = false;  /* --private-network; also makes setup_resolv_conf() a no-op */
70 static bool arg_read_only = false;  /* --read-only */
71 static bool arg_boot = false;  /* -b/--boot */
72 static LinkJournal arg_link_journal = LINK_AUTO;  /* --link-journal=MODE / -j */
73 static uint64_t arg_retain =  /* bitmask (1 << CAP_*) of capabilities to keep; drop_capabilities() drops the complement, --capability= adds bits */
74         (1ULL << CAP_CHOWN) |
75         (1ULL << CAP_DAC_OVERRIDE) |
76         (1ULL << CAP_DAC_READ_SEARCH) |
77         (1ULL << CAP_FOWNER) |
78         (1ULL << CAP_FSETID) |
79         (1ULL << CAP_IPC_OWNER) |
80         (1ULL << CAP_KILL) |
81         (1ULL << CAP_LEASE) |
82         (1ULL << CAP_LINUX_IMMUTABLE) |
83         (1ULL << CAP_NET_BIND_SERVICE) |
84         (1ULL << CAP_NET_BROADCAST) |
85         (1ULL << CAP_NET_RAW) |
86         (1ULL << CAP_SETGID) |
87         (1ULL << CAP_SETFCAP) |
88         (1ULL << CAP_SETPCAP) |
89         (1ULL << CAP_SETUID) |
90         (1ULL << CAP_SYS_ADMIN) |
91         (1ULL << CAP_SYS_CHROOT) |
92         (1ULL << CAP_SYS_NICE) |
93         (1ULL << CAP_SYS_PTRACE) |
94         (1ULL << CAP_SYS_TTY_CONFIG) |
95         (1ULL << CAP_SYS_RESOURCE) |
96         (1ULL << CAP_SYS_BOOT);
97
/* Prints the command line usage summary to stdout. Always returns 0. */
static int help(void) {

        /* -j is implemented in parse_argv() by selecting LINK_GUEST, hence
         * it is documented as the short form of --link-journal=guest. */
        printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
               "Spawn a minimal namespace container for debugging, testing and building.\n\n"
               "  -h --help               Show this help\n"
               "  -D --directory=NAME     Root directory for the container\n"
               "  -b --boot               Boot up full system (i.e. invoke init)\n"
               "  -u --user=USER          Run the command under specified user or uid\n"
               "  -C --controllers=LIST   Put the container in specified comma-separated cgroup hierarchies\n"
               "     --uuid=UUID          Set a specific machine UUID for the container\n"
               "     --private-network    Disable network in container\n"
               "     --read-only          Mount the root directory read-only\n"
               "     --capability=CAP     In addition to the default, retain specified capability\n"
               "     --link-journal=MODE  Link up guest journal, one of no, auto, guest, host\n"
               "  -j                      Equivalent to --link-journal=guest\n",
               program_invocation_short_name);

        return 0;
}
117
118 static int parse_argv(int argc, char *argv[]) {
119
120         enum {
121                 ARG_PRIVATE_NETWORK = 0x100,
122                 ARG_UUID,
123                 ARG_READ_ONLY,
124                 ARG_CAPABILITY,
125                 ARG_LINK_JOURNAL
126         };
127
128         static const struct option options[] = {
129                 { "help",            no_argument,       NULL, 'h'                 },
130                 { "directory",       required_argument, NULL, 'D'                 },
131                 { "user",            required_argument, NULL, 'u'                 },
132                 { "controllers",     required_argument, NULL, 'C'                 },
133                 { "private-network", no_argument,       NULL, ARG_PRIVATE_NETWORK },
134                 { "boot",            no_argument,       NULL, 'b'                 },
135                 { "uuid",            required_argument, NULL, ARG_UUID            },
136                 { "read-only",       no_argument,       NULL, ARG_READ_ONLY       },
137                 { "capability",      required_argument, NULL, ARG_CAPABILITY      },
138                 { "link-journal",    required_argument, NULL, ARG_LINK_JOURNAL    },
139                 { NULL,              0,                 NULL, 0                   }
140         };
141
142         int c;
143
144         assert(argc >= 0);
145         assert(argv);
146
147         while ((c = getopt_long(argc, argv, "+hD:u:C:bj", options, NULL)) >= 0) {
148
149                 switch (c) {
150
151                 case 'h':
152                         help();
153                         return 0;
154
155                 case 'D':
156                         free(arg_directory);
157                         arg_directory = canonicalize_file_name(optarg);
158                         if (!arg_directory) {
159                                 log_error("Failed to canonicalize root directory.");
160                                 return -ENOMEM;
161                         }
162
163                         break;
164
165                 case 'u':
166                         free(arg_user);
167                         if (!(arg_user = strdup(optarg))) {
168                                 log_error("Failed to duplicate user name.");
169                                 return -ENOMEM;
170                         }
171
172                         break;
173
174                 case 'C':
175                         strv_free(arg_controllers);
176                         arg_controllers = strv_split(optarg, ",");
177                         if (!arg_controllers) {
178                                 log_error("Failed to split controllers list.");
179                                 return -ENOMEM;
180                         }
181                         strv_uniq(arg_controllers);
182
183                         break;
184
185                 case ARG_PRIVATE_NETWORK:
186                         arg_private_network = true;
187                         break;
188
189                 case 'b':
190                         arg_boot = true;
191                         break;
192
193                 case ARG_UUID:
194                         arg_uuid = optarg;
195                         break;
196
197                 case ARG_READ_ONLY:
198                         arg_read_only = true;
199                         break;
200
201                 case ARG_CAPABILITY: {
202                         char *state, *word;
203                         size_t length;
204
205                         FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
206                                 cap_value_t cap;
207                                 char *t;
208
209                                 t = strndup(word, length);
210                                 if (!t)
211                                         return log_oom();
212
213                                 if (cap_from_name(t, &cap) < 0) {
214                                         log_error("Failed to parse capability %s.", t);
215                                         free(t);
216                                         return -EINVAL;
217                                 }
218
219                                 free(t);
220                                 arg_retain |= 1ULL << (uint64_t) cap;
221                         }
222
223                         break;
224                 }
225
226                 case 'j':
227                         arg_link_journal = LINK_GUEST;
228                         break;
229
230                 case ARG_LINK_JOURNAL:
231                         if (streq(optarg, "auto"))
232                                 arg_link_journal = LINK_AUTO;
233                         else if (streq(optarg, "no"))
234                                 arg_link_journal = LINK_NO;
235                         else if (streq(optarg, "guest"))
236                                 arg_link_journal = LINK_GUEST;
237                         else if (streq(optarg, "host"))
238                                 arg_link_journal = LINK_HOST;
239                         else {
240                                 log_error("Failed to parse link journal mode %s", optarg);
241                                 return -EINVAL;
242                         }
243
244                         break;
245
246                 case '?':
247                         return -EINVAL;
248
249                 default:
250                         log_error("Unknown option code %c", c);
251                         return -EINVAL;
252                 }
253         }
254
255         return 1;
256 }
257
/* Mounts the API file systems (/proc, /sys, /dev, /dev/pts, /run, ...) listed
 * in the table below underneath the container root "dest".
 *
 * All entries are attempted even if an earlier one failed. Returns 0 on
 * success, otherwise the negative errno-style code of the first failure
 * (mount failures of entries not marked fatal are ignored). */
static int mount_all(const char *dest) {

        typedef struct MountPoint {
                const char *what;
                const char *where;
                const char *type;
                const char *options;
                unsigned long flags;
                bool fatal;
        } MountPoint;

        static const MountPoint mount_table[] = {
                { "proc",      "/proc",     "proc",  NULL,       MS_NOSUID|MS_NOEXEC|MS_NODEV, true  },
                { "/proc/sys", "/proc/sys", NULL,    NULL,       MS_BIND, true                       },   /* Bind mount first */
                { NULL,        "/proc/sys", NULL,    NULL,       MS_BIND|MS_RDONLY|MS_REMOUNT, true  },   /* Then, make it r/o */
                { "sysfs",     "/sys",      "sysfs", NULL,       MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true  },
                { "tmpfs",     "/dev",      "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME,     true  },
                { "/dev/pts",  "/dev/pts",  NULL,    NULL,       MS_BIND,                      true  },
                { "tmpfs",     "/run",      "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true  },
#ifdef HAVE_SELINUX
                { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND,                      false },  /* Bind mount first */
                { NULL,              "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, false },  /* Then, make it r/o */
#endif
        };

        unsigned k;
        int r = 0;

        for (k = 0; k < ELEMENTSOF(mount_table); k++) {
                char *where = NULL;
                int t;

                if (asprintf(&where, "%s/%s", dest, mount_table[k].where) < 0) {
                        log_oom();

                        if (r == 0)
                                r = -ENOMEM;

                        break;
                }

                t = path_is_mount_point(where, true);
                if (t < 0) {
                        log_error("Failed to detect whether %s is a mount point: %s", where, strerror(-t));
                        free(where);

                        if (r == 0)
                                r = t;

                        continue;
                }

                /* Something is mounted here already: leave it alone, unless
                 * this entry is a remount (i.e. has no source). The path
                 * must still be released before moving on. */
                if (mount_table[k].what && t > 0) {
                        free(where);
                        continue;
                }

                /* Best effort, mount() below reports the actual problem */
                mkdir_p_label(where, 0755);

                if (mount(mount_table[k].what,
                          where,
                          mount_table[k].type,
                          mount_table[k].flags,
                          mount_table[k].options) < 0 &&
                    mount_table[k].fatal) {

                        log_error("mount(%s) failed: %m", where);

                        if (r == 0)
                                r = -errno;
                }

                free(where);
        }

        return r;
}
334
/* Makes the host's time zone configuration visible in the container by bind
 * mounting it read-only over the same paths below "dest". Mount failures are
 * ignored ("if possible"); only running out of memory is reported. */
static int setup_timezone(const char *dest) {

        static const char * const tz_files[] = {
                "/etc/localtime",
                "/etc/timezone",
        };

        unsigned i;

        assert(dest);

        for (i = 0; i < ELEMENTSOF(tz_files); i++) {
                char *target;

                target = strappend(dest, tz_files[i]);
                if (!target)
                        return log_oom();

                /* Bind first, then remount the bind read-only */
                if (mount(tz_files[i], target, "bind", MS_BIND, NULL) >= 0)
                        mount(tz_files[i], target, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);

                free(target);
        }

        return 0;
}
361
362 static int setup_resolv_conf(const char *dest) {
363         char *where;
364
365         assert(dest);
366
367         if (arg_private_network)
368                 return 0;
369
370         /* Fix resolv.conf, if possible */
371         where = strappend(dest, "/etc/resolv.conf");
372         if (!where)
373                 return log_oom();
374
375         if (mount("/etc/resolv.conf", where, "bind", MS_BIND, NULL) >= 0)
376                 mount("/etc/resolv.conf", where, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);
377
378         free(where);
379
380         return 0;
381 }
382
383 static int setup_boot_id(const char *dest) {
384         char *from = NULL, *to = NULL;
385         sd_id128_t rnd;
386         char as_uuid[37];
387         int r;
388
389         assert(dest);
390
391         /* Generate a new randomized boot ID, so that each boot-up of
392          * the container gets a new one */
393
394         from = strappend(dest, "/dev/proc-sys-kernel-random-boot-id");
395         if (!from) {
396                 r = log_oom();
397                 goto finish;
398         }
399
400         to = strappend(dest, "/proc/sys/kernel/random/boot_id");
401         if (!to) {
402                 r = log_oom();
403                 goto finish;
404         }
405
406         r = sd_id128_randomize(&rnd);
407         if (r < 0) {
408                 log_error("Failed to generate random boot id: %s", strerror(-r));
409                 goto finish;
410         }
411
412         snprintf(as_uuid, sizeof(as_uuid),
413                  "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
414                  SD_ID128_FORMAT_VAL(rnd));
415         char_array_0(as_uuid);
416
417         r = write_one_line_file(from, as_uuid);
418         if (r < 0) {
419                 log_error("Failed to write boot id: %s", strerror(-r));
420                 goto finish;
421         }
422
423         if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
424                 log_error("Failed to bind mount boot id: %m");
425                 r = -errno;
426         } else
427                 mount(from, to, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);
428
429         unlink(from);
430
431 finish:
432         free(from);
433         free(to);
434
435         return r;
436 }
437
438 static int copy_devnodes(const char *dest) {
439
440         static const char devnodes[] =
441                 "null\0"
442                 "zero\0"
443                 "full\0"
444                 "random\0"
445                 "urandom\0"
446                 "tty\0"
447                 "ptmx\0";
448
449         const char *d;
450         int r = 0;
451         mode_t u;
452
453         assert(dest);
454
455         u = umask(0000);
456
457         NULSTR_FOREACH(d, devnodes) {
458                 struct stat st;
459                 char *from = NULL, *to = NULL;
460
461                 asprintf(&from, "/dev/%s", d);
462                 asprintf(&to, "%s/dev/%s", dest, d);
463
464                 if (!from || !to) {
465                         log_error("Failed to allocate devnode path");
466
467                         free(from);
468                         free(to);
469
470                         from = to = NULL;
471
472                         if (r == 0)
473                                 r = -ENOMEM;
474
475                         break;
476                 }
477
478                 if (stat(from, &st) < 0) {
479
480                         if (errno != ENOENT) {
481                                 log_error("Failed to stat %s: %m", from);
482                                 if (r == 0)
483                                         r = -errno;
484                         }
485
486                 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
487
488                         log_error("%s is not a char or block device, cannot copy.", from);
489                         if (r == 0)
490                                 r = -EIO;
491
492                 } else if (mknod(to, st.st_mode, st.st_rdev) < 0) {
493
494                         log_error("mknod(%s) failed: %m", dest);
495                         if (r == 0)
496                                 r = -errno;
497                 }
498
499                 free(from);
500                 free(to);
501         }
502
503         umask(u);
504
505         return r;
506 }
507
/* Makes the TTY "console" available as /dev/console inside the container
 * tree at "dest": the TTY's access mode/ownership is reset to 0600 root:root,
 * a placeholder device node is created and the TTY is bind mounted over it.
 *
 * Returns a negative errno-style value on failure. */
static int setup_dev_console(const char *dest, const char *console) {
        struct stat tty_st;
        char *node = NULL;
        mode_t saved_umask;
        int r;

        assert(dest);
        assert(console);

        saved_umask = umask(0000);

        if (stat(console, &tty_st) < 0) {
                log_error("Failed to stat %s: %m", console);
                r = -errno;
                goto out;
        }

        if (!S_ISCHR(tty_st.st_mode)) {
                log_error("/dev/console is not a char device.");
                r = -EIO;
                goto out;
        }

        r = chmod_and_chown(console, 0600, 0, 0);
        if (r < 0) {
                log_error("Failed to correct access mode for TTY: %s", strerror(-r));
                goto out;
        }

        if (asprintf(&node, "%s/dev/console", dest) < 0) {
                r = log_oom();
                goto out;
        }

        /* ptys can only live on pts file systems, hence the right tty
         * has to be bind mounted to /dev/console. A bind mount needs
         * something to be mounted onto, so a device node is created
         * first. Its major/minor is taken from the tty, but does not
         * really matter since the node is mounted over anyway. */

        if (mknod(node, (tty_st.st_mode & ~07777) | 0600, tty_st.st_rdev) < 0) {
                log_error("mknod() for /dev/console failed: %m");
                r = -errno;
                goto out;
        }

        if (mount(console, node, "bind", MS_BIND, NULL) < 0) {
                log_error("Bind mount for /dev/console failed: %m");
                r = -errno;
                goto out;
        }

out:
        free(node);
        umask(saved_umask);

        return r;
}
566
/* Provides a fake /proc/kmsg to the container: a FIFO is created as
 * <dest>/dev/kmsg, bind mounted to <dest>/proc/kmsg, an open fd to it is
 * parked in "kmsg_socket" and the FIFO is then unlinked from /dev again.
 *
 * Returns a negative errno-style value on failure. */
static int setup_kmsg(const char *dest, int kmsg_socket) {
        char *fifo = NULL, *target = NULL;
        int r, fifo_fd, sent;
        mode_t saved_umask;
        union {
                struct cmsghdr cmsghdr;
                uint8_t buf[CMSG_SPACE(sizeof(int))];
        } control;
        struct msghdr mh;
        struct cmsghdr *cmsg;

        assert(dest);
        assert(kmsg_socket >= 0);

        saved_umask = umask(0000);

        /* The FIFO is created as /dev/kmsg but removed from there
         * right after it has been bind mounted to /proc/kmsg. On the
         * reading side a FIFO behaves much like /proc/kmsg. On the
         * writing side it differs from /dev/kmsg, though: writes
         * block while nobody is reading. To keep containers from
         * deadlocking on that, /dev/kmsg is simply not made
         * available to them. */
        if (asprintf(&fifo, "%s/dev/kmsg", dest) < 0) {
                r = log_oom();
                goto out;
        }

        if (asprintf(&target, "%s/proc/kmsg", dest) < 0) {
                r = log_oom();
                goto out;
        }

        if (mkfifo(fifo, 0600) < 0) {
                log_error("mkfifo() for /dev/kmsg failed: %m");
                r = -errno;
                goto out;
        }

        r = chmod_and_chown(fifo, 0600, 0, 0);
        if (r < 0) {
                log_error("Failed to correct access mode for /dev/kmsg: %s", strerror(-r));
                goto out;
        }

        if (mount(fifo, target, "bind", MS_BIND, NULL) < 0) {
                log_error("Bind mount for /proc/kmsg failed: %m");
                r = -errno;
                goto out;
        }

        fifo_fd = open(fifo, O_RDWR|O_NDELAY|O_CLOEXEC);
        if (fifo_fd < 0) {
                log_error("Failed to open fifo: %m");
                r = -errno;
                goto out;
        }

        /* Build a message that carries nothing but the fd (SCM_RIGHTS) */
        zero(mh);
        zero(control);

        mh.msg_control = &control;
        mh.msg_controllen = sizeof(control);

        cmsg = CMSG_FIRSTHDR(&mh);
        cmsg->cmsg_level = SOL_SOCKET;
        cmsg->cmsg_type = SCM_RIGHTS;
        cmsg->cmsg_len = CMSG_LEN(sizeof(int));
        memcpy(CMSG_DATA(cmsg), &fifo_fd, sizeof(int));

        mh.msg_controllen = cmsg->cmsg_len;

        /* Park the fd in the socket, so that the FIFO stays open for
         * as long as the child runs. Our own copy is not needed
         * anymore after that. */
        sent = sendmsg(kmsg_socket, &mh, MSG_DONTWAIT|MSG_NOSIGNAL);
        close_nointr_nofail(fifo_fd);

        if (sent < 0) {
                log_error("Failed to send FIFO fd: %m");
                r = -errno;
                goto out;
        }

        /* Finally, hide the FIFO from /dev/kmsg */
        unlink(fifo);

out:
        free(fifo);
        free(target);
        umask(saved_umask);

        return r;
}
660
661 static int setup_hostname(void) {
662         char *hn;
663         int r = 0;
664
665         hn = path_get_file_name(arg_directory);
666         if (hn) {
667                 hn = strdup(hn);
668                 if (!hn)
669                         return -ENOMEM;
670
671                 hostname_cleanup(hn);
672
673                 if (!isempty(hn))
674                         if (sethostname(hn, strlen(hn)) < 0)
675                                 r = -errno;
676
677                 free(hn);
678         }
679
680         return r;
681 }
682
683 static int setup_journal(const char *directory) {
684         sd_id128_t machine_id;
685         char *p = NULL, *b = NULL, *l, *q = NULL, *d = NULL;
686         int r;
687
688         if (arg_link_journal == LINK_NO)
689                 return 0;
690
691         p = strappend(directory, "/etc/machine-id");
692         if (!p) {
693                 r = log_oom();
694                 goto finish;
695         }
696
697         r = read_one_line_file(p, &b);
698         if (r == -ENOENT && arg_link_journal == LINK_AUTO) {
699                 r = 0;
700                 goto finish;
701         } else if (r < 0) {
702                 log_error("Failed to read machine ID: %s", strerror(-r));
703                 return r;
704         }
705
706         l = strstrip(b);
707         if (isempty(l) && arg_link_journal == LINK_AUTO) {
708                 r = 0;
709                 goto finish;
710         }
711
712         /* Verify validaty */
713         r = sd_id128_from_string(l, &machine_id);
714         if (r < 0) {
715                 log_error("Failed to parse machine ID: %s", strerror(-r));
716                 goto finish;
717         }
718
719         free(p);
720         p = strappend("/var/log/journal/", l);
721         q = strjoin(directory, "/var/log/journal/", l, NULL);
722         if (!p || !q) {
723                 r = log_oom();
724                 goto finish;
725         }
726
727         if (path_is_mount_point(p, false) > 0 ||
728             path_is_mount_point(q, false) > 0) {
729                 if (arg_link_journal != LINK_AUTO) {
730                         log_error("Journal already a mount point, refusing.");
731                         r = -EEXIST;
732                         goto finish;
733                 }
734
735                 r = 0;
736                 goto finish;
737         }
738
739         r = readlink_and_make_absolute(p, &d);
740         if (r >= 0) {
741                 if ((arg_link_journal == LINK_GUEST ||
742                      arg_link_journal == LINK_AUTO) &&
743                     path_equal(d, q)) {
744
745                         mkdir_p(q, 0755);
746
747                         r = 0;
748                         goto finish;
749                 }
750
751                 if (unlink(p) < 0) {
752                         log_error("Failed to remove symlink %s: %m", p);
753                         r = -errno;
754                         goto finish;
755                 }
756         } else if (r == -EINVAL) {
757
758                 if (arg_link_journal == LINK_GUEST &&
759                     rmdir(p) < 0) {
760
761                         if (errno == ENOTDIR)
762                                 log_error("%s already exists and is neither symlink nor directory.", p);
763                         else {
764                                 log_error("Failed to remove %s: %m", p);
765                                 r = -errno;
766                         }
767
768                         goto finish;
769                 }
770         } else if (r != -ENOENT) {
771                 log_error("readlink(%s) failed: %m", p);
772                 goto finish;
773         }
774
775         if (arg_link_journal == LINK_GUEST) {
776
777                 if (symlink(q, p) < 0) {
778                         log_error("Failed to symlink %s to %s: %m", q, p);
779                         r = -errno;
780                         goto finish;
781                 }
782
783                 mkdir_p(q, 0755);
784
785                 r = 0;
786                 goto finish;
787         }
788
789         if (arg_link_journal == LINK_HOST) {
790                 r = mkdir_p(p, 0755);
791                 if (r < 0) {
792                         log_error("Failed to create %s: %m", p);
793                         goto finish;
794                 }
795
796         } else if (access(p, F_OK) < 0) {
797                 r = 0;
798                 goto finish;
799         }
800
801         if (dir_is_empty(q) == 0) {
802                 log_error("%s not empty.", q);
803                 r = -ENOTEMPTY;
804                 goto finish;
805         }
806
807         r = mkdir_p(q, 0755);
808         if (r < 0) {
809                 log_error("Failed to create %s: %m", q);
810                 goto finish;
811         }
812
813         if (mount(p, q, "bind", MS_BIND, NULL) < 0) {
814                 log_error("Failed to bind mount journal from host into guest: %m");
815                 r = -errno;
816                 goto finish;
817         }
818
819         r = 0;
820
821 finish:
822         free(p);
823         free(q);
824         free(d);
825         free(b);
826         return r;
827
828 }
829
830 static int drop_capabilities(void) {
831         return capability_bounding_set_drop(~arg_retain, false);
832 }
833
/* Checks whether "path" looks like the root of an OS installation. The
 * existence of <path>/bin/sh is used as the indicator.
 *
 * Returns 1 if it does, 0 if it does not, -ENOMEM if out of memory. */
static int is_os_tree(const char *path) {
        char *shell = NULL;
        int found;

        if (asprintf(&shell, "%s/bin/sh", path) < 0)
                return -ENOMEM;

        found = access(shell, F_OK) >= 0;
        free(shell);

        return found;
}
847
848 static int process_pty(int master, sigset_t *mask) {
849
850         char in_buffer[LINE_MAX], out_buffer[LINE_MAX];
851         size_t in_buffer_full = 0, out_buffer_full = 0;
852         struct epoll_event stdin_ev, stdout_ev, master_ev, signal_ev;
853         bool stdin_readable = false, stdout_writable = false, master_readable = false, master_writable = false;
854         int ep = -1, signal_fd = -1, r;
855
856         fd_nonblock(STDIN_FILENO, 1);
857         fd_nonblock(STDOUT_FILENO, 1);
858         fd_nonblock(master, 1);
859
860         signal_fd = signalfd(-1, mask, SFD_NONBLOCK|SFD_CLOEXEC);
861         if (signal_fd < 0) {
862                 log_error("signalfd(): %m");
863                 r = -errno;
864                 goto finish;
865         }
866
867         ep = epoll_create1(EPOLL_CLOEXEC);
868         if (ep < 0) {
869                 log_error("Failed to create epoll: %m");
870                 r = -errno;
871                 goto finish;
872         }
873
874         zero(stdin_ev);
875         stdin_ev.events = EPOLLIN|EPOLLET;
876         stdin_ev.data.fd = STDIN_FILENO;
877
878         zero(stdout_ev);
879         stdout_ev.events = EPOLLOUT|EPOLLET;
880         stdout_ev.data.fd = STDOUT_FILENO;
881
882         zero(master_ev);
883         master_ev.events = EPOLLIN|EPOLLOUT|EPOLLET;
884         master_ev.data.fd = master;
885
886         zero(signal_ev);
887         signal_ev.events = EPOLLIN;
888         signal_ev.data.fd = signal_fd;
889
890         if (epoll_ctl(ep, EPOLL_CTL_ADD, STDIN_FILENO, &stdin_ev) < 0 ||
891             epoll_ctl(ep, EPOLL_CTL_ADD, STDOUT_FILENO, &stdout_ev) < 0 ||
892             epoll_ctl(ep, EPOLL_CTL_ADD, master, &master_ev) < 0 ||
893             epoll_ctl(ep, EPOLL_CTL_ADD, signal_fd, &signal_ev) < 0) {
894                 log_error("Failed to regiser fds in epoll: %m");
895                 r = -errno;
896                 goto finish;
897         }
898
899         for (;;) {
900                 struct epoll_event ev[16];
901                 ssize_t k;
902                 int i, nfds;
903
904                 nfds = epoll_wait(ep, ev, ELEMENTSOF(ev), -1);
905                 if (nfds < 0) {
906
907                         if (errno == EINTR || errno == EAGAIN)
908                                 continue;
909
910                         log_error("epoll_wait(): %m");
911                         r = -errno;
912                         goto finish;
913                 }
914
915                 assert(nfds >= 1);
916
917                 for (i = 0; i < nfds; i++) {
918                         if (ev[i].data.fd == STDIN_FILENO) {
919
920                                 if (ev[i].events & (EPOLLIN|EPOLLHUP))
921                                         stdin_readable = true;
922
923                         } else if (ev[i].data.fd == STDOUT_FILENO) {
924
925                                 if (ev[i].events & (EPOLLOUT|EPOLLHUP))
926                                         stdout_writable = true;
927
928                         } else if (ev[i].data.fd == master) {
929
930                                 if (ev[i].events & (EPOLLIN|EPOLLHUP))
931                                         master_readable = true;
932
933                                 if (ev[i].events & (EPOLLOUT|EPOLLHUP))
934                                         master_writable = true;
935
936                         } else if (ev[i].data.fd == signal_fd) {
937                                 struct signalfd_siginfo sfsi;
938                                 ssize_t n;
939
940                                 n = read(signal_fd, &sfsi, sizeof(sfsi));
941                                 if (n != sizeof(sfsi)) {
942
943                                         if (n >= 0) {
944                                                 log_error("Failed to read from signalfd: invalid block size");
945                                                 r = -EIO;
946                                                 goto finish;
947                                         }
948
949                                         if (errno != EINTR && errno != EAGAIN) {
950                                                 log_error("Failed to read from signalfd: %m");
951                                                 r = -errno;
952                                                 goto finish;
953                                         }
954                                 } else {
955
956                                         if (sfsi.ssi_signo == SIGWINCH) {
957                                                 struct winsize ws;
958
959                                                 /* The window size changed, let's forward that. */
960                                                 if (ioctl(STDIN_FILENO, TIOCGWINSZ, &ws) >= 0)
961                                                         ioctl(master, TIOCSWINSZ, &ws);
962                                         } else {
963                                                 r = 0;
964                                                 goto finish;
965                                         }
966                                 }
967                         }
968                 }
969
970                 while ((stdin_readable && in_buffer_full <= 0) ||
971                        (master_writable && in_buffer_full > 0) ||
972                        (master_readable && out_buffer_full <= 0) ||
973                        (stdout_writable && out_buffer_full > 0)) {
974
975                         if (stdin_readable && in_buffer_full < LINE_MAX) {
976
977                                 k = read(STDIN_FILENO, in_buffer + in_buffer_full, LINE_MAX - in_buffer_full);
978                                 if (k < 0) {
979
980                                         if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
981                                                 stdin_readable = false;
982                                         else {
983                                                 log_error("read(): %m");
984                                                 r = -errno;
985                                                 goto finish;
986                                         }
987                                 } else
988                                         in_buffer_full += (size_t) k;
989                         }
990
991                         if (master_writable && in_buffer_full > 0) {
992
993                                 k = write(master, in_buffer, in_buffer_full);
994                                 if (k < 0) {
995
996                                         if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
997                                                 master_writable = false;
998                                         else {
999                                                 log_error("write(): %m");
1000                                                 r = -errno;
1001                                                 goto finish;
1002                                         }
1003
1004                                 } else {
1005                                         assert(in_buffer_full >= (size_t) k);
1006                                         memmove(in_buffer, in_buffer + k, in_buffer_full - k);
1007                                         in_buffer_full -= k;
1008                                 }
1009                         }
1010
1011                         if (master_readable && out_buffer_full < LINE_MAX) {
1012
1013                                 k = read(master, out_buffer + out_buffer_full, LINE_MAX - out_buffer_full);
1014                                 if (k < 0) {
1015
1016                                         if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
1017                                                 master_readable = false;
1018                                         else {
1019                                                 log_error("read(): %m");
1020                                                 r = -errno;
1021                                                 goto finish;
1022                                         }
1023                                 }  else
1024                                         out_buffer_full += (size_t) k;
1025                         }
1026
1027                         if (stdout_writable && out_buffer_full > 0) {
1028
1029                                 k = write(STDOUT_FILENO, out_buffer, out_buffer_full);
1030                                 if (k < 0) {
1031
1032                                         if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
1033                                                 stdout_writable = false;
1034                                         else {
1035                                                 log_error("write(): %m");
1036                                                 r = -errno;
1037                                                 goto finish;
1038                                         }
1039
1040                                 } else {
1041                                         assert(out_buffer_full >= (size_t) k);
1042                                         memmove(out_buffer, out_buffer + k, out_buffer_full - k);
1043                                         out_buffer_full -= k;
1044                                 }
1045                         }
1046                 }
1047         }
1048
1049 finish:
1050         if (ep >= 0)
1051                 close_nointr_nofail(ep);
1052
1053         if (signal_fd >= 0)
1054                 close_nointr_nofail(signal_fd);
1055
1056         return r;
1057 }
1058
1059 int main(int argc, char *argv[]) {
1060         pid_t pid = 0;
1061         int r = EXIT_FAILURE, k;
1062         char *oldcg = NULL, *newcg = NULL;
1063         char **controller = NULL;
1064         int master = -1;
1065         const char *console = NULL;
1066         struct termios saved_attr, raw_attr;
1067         sigset_t mask;
1068         bool saved_attr_valid = false;
1069         struct winsize ws;
1070         int kmsg_socket_pair[2] = { -1, -1 };
1071
1072         log_parse_environment();
1073         log_open();
1074
1075         r = parse_argv(argc, argv);
1076         if (r <= 0)
1077                 goto finish;
1078
1079         if (arg_directory) {
1080                 char *p;
1081
1082                 p = path_make_absolute_cwd(arg_directory);
1083                 free(arg_directory);
1084                 arg_directory = p;
1085         } else
1086                 arg_directory = get_current_dir_name();
1087
1088         if (!arg_directory) {
1089                 log_error("Failed to determine path");
1090                 goto finish;
1091         }
1092
1093         path_kill_slashes(arg_directory);
1094
1095         if (geteuid() != 0) {
1096                 log_error("Need to be root.");
1097                 goto finish;
1098         }
1099
1100         if (sd_booted() <= 0) {
1101                 log_error("Not running on a systemd system.");
1102                 goto finish;
1103         }
1104
1105         if (path_equal(arg_directory, "/")) {
1106                 log_error("Spawning container on root directory not supported.");
1107                 goto finish;
1108         }
1109
1110         if (is_os_tree(arg_directory) <= 0) {
1111                 log_error("Directory %s doesn't look like an OS root directory. Refusing.", arg_directory);
1112                 goto finish;
1113         }
1114
1115         k = cg_get_by_pid(SYSTEMD_CGROUP_CONTROLLER, 0, &oldcg);
1116         if (k < 0) {
1117                 log_error("Failed to determine current cgroup: %s", strerror(-k));
1118                 goto finish;
1119         }
1120
1121         if (asprintf(&newcg, "%s/nspawn-%lu", oldcg, (unsigned long) getpid()) < 0) {
1122                 log_error("Failed to allocate cgroup path.");
1123                 goto finish;
1124         }
1125
1126         k = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, newcg, 0);
1127         if (k < 0)  {
1128                 log_error("Failed to create cgroup: %s", strerror(-k));
1129                 goto finish;
1130         }
1131
1132         STRV_FOREACH(controller, arg_controllers) {
1133                 k = cg_create_and_attach(*controller, newcg, 0);
1134                 if (k < 0)
1135                         log_warning("Failed to create cgroup in controller %s: %s", *controller, strerror(-k));
1136         }
1137
1138         master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
1139         if (master < 0) {
1140                 log_error("Failed to acquire pseudo tty: %m");
1141                 goto finish;
1142         }
1143
1144         console = ptsname(master);
1145         if (!console) {
1146                 log_error("Failed to determine tty name: %m");
1147                 goto finish;
1148         }
1149
1150         log_info("Spawning namespace container on %s (console is %s).", arg_directory, console);
1151
1152         if (ioctl(STDIN_FILENO, TIOCGWINSZ, &ws) >= 0)
1153                 ioctl(master, TIOCSWINSZ, &ws);
1154
1155         if (unlockpt(master) < 0) {
1156                 log_error("Failed to unlock tty: %m");
1157                 goto finish;
1158         }
1159
1160         if (tcgetattr(STDIN_FILENO, &saved_attr) < 0) {
1161                 log_error("Failed to get terminal attributes: %m");
1162                 goto finish;
1163         }
1164
1165         saved_attr_valid = true;
1166
1167         raw_attr = saved_attr;
1168         cfmakeraw(&raw_attr);
1169         raw_attr.c_lflag &= ~ECHO;
1170
1171         if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_NONBLOCK|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
1172                 log_error("Failed to create kmsg socket pair");
1173                 goto finish;
1174         }
1175
1176         assert_se(sigemptyset(&mask) == 0);
1177         sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
1178         assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
1179
1180         for (;;) {
1181                 siginfo_t status;
1182
1183                 if (tcsetattr(STDIN_FILENO, TCSANOW, &raw_attr) < 0) {
1184                         log_error("Failed to set terminal attributes: %m");
1185                         goto finish;
1186                 }
1187
1188                 pid = syscall(__NR_clone, SIGCHLD|CLONE_NEWIPC|CLONE_NEWNS|CLONE_NEWPID|CLONE_NEWUTS|(arg_private_network ? CLONE_NEWNET : 0), NULL);
1189                 if (pid < 0) {
1190                         if (errno == EINVAL)
1191                                 log_error("clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
1192                         else
1193                                 log_error("clone() failed: %m");
1194
1195                         goto finish;
1196                 }
1197
1198                 if (pid == 0) {
1199                         /* child */
1200
1201                         const char *home = NULL;
1202                         uid_t uid = (uid_t) -1;
1203                         gid_t gid = (gid_t) -1;
1204                         const char *envp[] = {
1205                                 "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
1206                                 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
1207                                 NULL, /* TERM */
1208                                 NULL, /* HOME */
1209                                 NULL, /* USER */
1210                                 NULL, /* LOGNAME */
1211                                 NULL, /* container_uuid */
1212                                 NULL
1213                         };
1214
1215                         envp[2] = strv_find_prefix(environ, "TERM=");
1216
1217                         close_nointr_nofail(master);
1218
1219                         close_nointr(STDIN_FILENO);
1220                         close_nointr(STDOUT_FILENO);
1221                         close_nointr(STDERR_FILENO);
1222
1223                         close_all_fds(&kmsg_socket_pair[1], 1);
1224
1225                         reset_all_signal_handlers();
1226
1227                         assert_se(sigemptyset(&mask) == 0);
1228                         assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
1229
1230                         if (open_terminal(console, O_RDWR) != STDIN_FILENO ||
1231                             dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
1232                             dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO)
1233                                 goto child_fail;
1234
1235                         if (setsid() < 0) {
1236                                 log_error("setsid() failed: %m");
1237                                 goto child_fail;
1238                         }
1239
1240                         if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
1241                                 log_error("PR_SET_PDEATHSIG failed: %m");
1242                                 goto child_fail;
1243                         }
1244
1245                         /* Mark everything as slave, so that we still
1246                          * receive mounts from the real root, but don't
1247                          * propagate mounts to the real root. */
1248                         if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
1249                                 log_error("MS_SLAVE|MS_REC failed: %m");
1250                                 goto child_fail;
1251                         }
1252
1253                         /* Turn directory into bind mount */
1254                         if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REC, NULL) < 0) {
1255                                 log_error("Failed to make bind mount.");
1256                                 goto child_fail;
1257                         }
1258
1259                         if (arg_read_only)
1260                                 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY|MS_REC, NULL) < 0) {
1261                                         log_error("Failed to make read-only.");
1262                                         goto child_fail;
1263                                 }
1264
1265                         if (mount_all(arg_directory) < 0)
1266                                 goto child_fail;
1267
1268                         if (copy_devnodes(arg_directory) < 0)
1269                                 goto child_fail;
1270
1271                         dev_setup(arg_directory);
1272
1273                         if (setup_dev_console(arg_directory, console) < 0)
1274                                 goto child_fail;
1275
1276                         if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
1277                                 goto child_fail;
1278
1279                         close_nointr_nofail(kmsg_socket_pair[1]);
1280
1281                         if (setup_boot_id(arg_directory) < 0)
1282                                 goto child_fail;
1283
1284                         if (setup_timezone(arg_directory) < 0)
1285                                 goto child_fail;
1286
1287                         if (setup_resolv_conf(arg_directory) < 0)
1288                                 goto child_fail;
1289
1290                         if (setup_journal(arg_directory) < 0)
1291                                 goto child_fail;
1292
1293                         if (chdir(arg_directory) < 0) {
1294                                 log_error("chdir(%s) failed: %m", arg_directory);
1295                                 goto child_fail;
1296                         }
1297
1298                         if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) {
1299                                 log_error("mount(MS_MOVE) failed: %m");
1300                                 goto child_fail;
1301                         }
1302
1303                         if (chroot(".") < 0) {
1304                                 log_error("chroot() failed: %m");
1305                                 goto child_fail;
1306                         }
1307
1308                         if (chdir("/") < 0) {
1309                                 log_error("chdir() failed: %m");
1310                                 goto child_fail;
1311                         }
1312
1313                         umask(0022);
1314
1315                         loopback_setup();
1316
1317                         if (drop_capabilities() < 0) {
1318                                 log_error("drop_capabilities() failed: %m");
1319                                 goto child_fail;
1320                         }
1321
1322                         if (arg_user) {
1323
1324                                 if (get_user_creds((const char**)&arg_user, &uid, &gid, &home, NULL) < 0) {
1325                                         log_error("get_user_creds() failed: %m");
1326                                         goto child_fail;
1327                                 }
1328
1329                                 if (mkdir_parents_label(home, 0775) < 0) {
1330                                         log_error("mkdir_parents_label() failed: %m");
1331                                         goto child_fail;
1332                                 }
1333
1334                                 if (mkdir_safe_label(home, 0775, uid, gid) < 0) {
1335                                         log_error("mkdir_safe_label() failed: %m");
1336                                         goto child_fail;
1337                                 }
1338
1339                                 if (initgroups((const char*)arg_user, gid) < 0) {
1340                                         log_error("initgroups() failed: %m");
1341                                         goto child_fail;
1342                                 }
1343
1344                                 if (setresgid(gid, gid, gid) < 0) {
1345                                         log_error("setregid() failed: %m");
1346                                         goto child_fail;
1347                                 }
1348
1349                                 if (setresuid(uid, uid, uid) < 0) {
1350                                         log_error("setreuid() failed: %m");
1351                                         goto child_fail;
1352                                 }
1353                         }
1354
1355                         if ((asprintf((char**)(envp + 3), "HOME=%s", home ? home: "/root") < 0) ||
1356                             (asprintf((char**)(envp + 4), "USER=%s", arg_user ? arg_user : "root") < 0) ||
1357                             (asprintf((char**)(envp + 5), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
1358                                 log_oom();
1359                                 goto child_fail;
1360                         }
1361
1362                         if (arg_uuid) {
1363                                 if (asprintf((char**)(envp + 6), "container_uuid=%s", arg_uuid) < 0) {
1364                                         log_oom();
1365                                         goto child_fail;
1366                                 }
1367                         }
1368
1369                         setup_hostname();
1370
1371                         if (arg_boot) {
1372                                 char **a;
1373                                 size_t l;
1374
1375                                 /* Automatically search for the init system */
1376
1377                                 l = 1 + argc - optind;
1378                                 a = newa(char*, l + 1);
1379                                 memcpy(a + 1, argv + optind, l * sizeof(char*));
1380
1381                                 a[0] = (char*) "/usr/lib/systemd/systemd";
1382                                 execve(a[0], a, (char**) envp);
1383
1384                                 a[0] = (char*) "/lib/systemd/systemd";
1385                                 execve(a[0], a, (char**) envp);
1386
1387                                 a[0] = (char*) "/sbin/init";
1388                                 execve(a[0], a, (char**) envp);
1389                         } else if (argc > optind)
1390                                 execvpe(argv[optind], argv + optind, (char**) envp);
1391                         else {
1392                                 chdir(home ? home : "/root");
1393                                 execle("/bin/bash", "-bash", NULL, (char**) envp);
1394                         }
1395
1396                         log_error("execv() failed: %m");
1397
1398                 child_fail:
1399                         _exit(EXIT_FAILURE);
1400                 }
1401
1402                 if (process_pty(master, &mask) < 0)
1403                         goto finish;
1404
1405
1406                 if (saved_attr_valid)
1407                         tcsetattr(STDIN_FILENO, TCSANOW, &saved_attr);
1408
1409                 r = wait_for_terminate(pid, &status);
1410                 if (r < 0) {
1411                         r = EXIT_FAILURE;
1412                         break;
1413                 }
1414
1415                 if (status.si_code == CLD_EXITED) {
1416                         if (status.si_status != 0) {
1417                                 log_error("Container failed with error code %i.", status.si_status);
1418                                 r = status.si_status;
1419                                 break;
1420                         }
1421
1422                         log_debug("Container exited successfully.");
1423                         break;
1424                 } else if (status.si_code == CLD_KILLED &&
1425                            status.si_status == SIGINT) {
1426                         log_info("Container has been shut down.");
1427                         r = 0;
1428                         break;
1429                 } else if (status.si_code == CLD_KILLED &&
1430                            status.si_status == SIGHUP) {
1431                         log_info("Container is being rebooted.");
1432                         continue;
1433                 } else if (status.si_code == CLD_KILLED ||
1434                            status.si_code == CLD_DUMPED) {
1435
1436                         log_error("Container terminated by signal %s.", signal_to_string(status.si_status));
1437                         r = EXIT_FAILURE;
1438                         break;
1439                 } else {
1440                         log_error("Container failed due to unknown reason.");
1441                         r = EXIT_FAILURE;
1442                         break;
1443                 }
1444         }
1445
1446 finish:
1447         if (saved_attr_valid)
1448                 tcsetattr(STDIN_FILENO, TCSANOW, &saved_attr);
1449
1450         if (master >= 0)
1451                 close_nointr_nofail(master);
1452
1453         close_pipe(kmsg_socket_pair);
1454
1455         if (oldcg)
1456                 cg_attach(SYSTEMD_CGROUP_CONTROLLER, oldcg, 0);
1457
1458         if (newcg)
1459                 cg_kill_recursive_and_wait(SYSTEMD_CGROUP_CONTROLLER, newcg, true);
1460
1461         free(arg_directory);
1462         strv_free(arg_controllers);
1463         free(oldcg);
1464         free(newcg);
1465
1466         return r;
1467 }