chiark / gitweb /
770019bb8a4e2740bd43cb953d51bba3322f38d8
[elogind.git] / src / nspawn / nspawn.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4   This file is part of systemd.
5
6   Copyright 2010 Lennart Poettering
7
8   systemd is free software; you can redistribute it and/or modify it
9   under the terms of the GNU Lesser General Public License as published by
10   the Free Software Foundation; either version 2.1 of the License, or
11   (at your option) any later version.
12
13   systemd is distributed in the hope that it will be useful, but
14   WITHOUT ANY WARRANTY; without even the implied warranty of
15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16   Lesser General Public License for more details.
17
18   You should have received a copy of the GNU Lesser General Public License
19   along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <signal.h>
23 #include <sched.h>
24 #include <unistd.h>
25 #include <sys/types.h>
26 #include <sys/syscall.h>
27 #include <sys/mount.h>
28 #include <sys/wait.h>
29 #include <stdlib.h>
30 #include <string.h>
31 #include <stdio.h>
32 #include <errno.h>
33 #include <sys/prctl.h>
34 #include <sys/capability.h>
35 #include <getopt.h>
36 #include <sys/epoll.h>
37 #include <termios.h>
38 #include <sys/signalfd.h>
39 #include <grp.h>
40 #include <linux/fs.h>
41 #include <sys/un.h>
42 #include <sys/socket.h>
43
44 #include <systemd/sd-daemon.h>
45
46 #include "log.h"
47 #include "util.h"
48 #include "mkdir.h"
49 #include "audit.h"
50 #include "missing.h"
51 #include "cgroup-util.h"
52 #include "strv.h"
53 #include "path-util.h"
54 #include "loopback-setup.h"
55 #include "sd-id128.h"
56 #include "dev-setup.h"
57
/* Selects how setup_journal() connects the container's journal directory
 * with the host's /var/log/journal/<machine-id>. */
typedef enum LinkJournal {
        /* Do nothing at all. */
        LINK_NO,

        /* Bind mount the host directory into the guest if it exists;
         * missing prerequisites are silently tolerated. */
        LINK_AUTO,

        /* Create the directory on the host and bind mount it into the guest. */
        LINK_HOST,

        /* Keep the journal in the guest and symlink to it from the host. */
        LINK_GUEST
} LinkJournal;
64
65 static char *arg_directory = NULL;
66 static char *arg_user = NULL;
67 static char **arg_controllers = NULL;
68 static char *arg_uuid = NULL;
69 static bool arg_private_network = false;
70 static bool arg_read_only = false;
71 static bool arg_boot = false;
72 static LinkJournal arg_link_journal = LINK_AUTO;
73 static uint64_t arg_retain =
74         (1ULL << CAP_CHOWN) |
75         (1ULL << CAP_DAC_OVERRIDE) |
76         (1ULL << CAP_DAC_READ_SEARCH) |
77         (1ULL << CAP_FOWNER) |
78         (1ULL << CAP_FSETID) |
79         (1ULL << CAP_IPC_OWNER) |
80         (1ULL << CAP_KILL) |
81         (1ULL << CAP_LEASE) |
82         (1ULL << CAP_LINUX_IMMUTABLE) |
83         (1ULL << CAP_NET_BIND_SERVICE) |
84         (1ULL << CAP_NET_BROADCAST) |
85         (1ULL << CAP_NET_RAW) |
86         (1ULL << CAP_SETGID) |
87         (1ULL << CAP_SETFCAP) |
88         (1ULL << CAP_SETPCAP) |
89         (1ULL << CAP_SETUID) |
90         (1ULL << CAP_SYS_ADMIN) |
91         (1ULL << CAP_SYS_CHROOT) |
92         (1ULL << CAP_SYS_NICE) |
93         (1ULL << CAP_SYS_PTRACE) |
94         (1ULL << CAP_SYS_TTY_CONFIG) |
95         (1ULL << CAP_SYS_RESOURCE) |
96         (1ULL << CAP_SYS_BOOT);
97
/* Print the usage summary to stdout. Always returns 0. */
static int help(void) {

        /* Note: -j selects LINK_GUEST in parse_argv(), so it is documented
         * as the short form of --link-journal=guest. */
        printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
               "Spawn a minimal namespace container for debugging, testing and building.\n\n"
               "  -h --help               Show this help\n"
               "  -D --directory=NAME     Root directory for the container\n"
               "  -b --boot               Boot up full system (i.e. invoke init)\n"
               "  -u --user=USER          Run the command under specified user or uid\n"
               "  -C --controllers=LIST   Put the container in specified comma-separated cgroup hierarchies\n"
               "     --uuid=UUID          Set a specific machine UUID for the container\n"
               "     --private-network    Disable network in container\n"
               "     --read-only          Mount the root directory read-only\n"
               "     --capability=CAP     In addition to the default, retain specified capability\n"
               "     --link-journal=MODE  Link up guest journal, one of no, auto, guest, host\n"
               "  -j                      Equivalent to --link-journal=guest\n",
               program_invocation_short_name);

        return 0;
}
117
118 static int parse_argv(int argc, char *argv[]) {
119
120         enum {
121                 ARG_PRIVATE_NETWORK = 0x100,
122                 ARG_UUID,
123                 ARG_READ_ONLY,
124                 ARG_CAPABILITY,
125                 ARG_LINK_JOURNAL
126         };
127
128         static const struct option options[] = {
129                 { "help",            no_argument,       NULL, 'h'                 },
130                 { "directory",       required_argument, NULL, 'D'                 },
131                 { "user",            required_argument, NULL, 'u'                 },
132                 { "controllers",     required_argument, NULL, 'C'                 },
133                 { "private-network", no_argument,       NULL, ARG_PRIVATE_NETWORK },
134                 { "boot",            no_argument,       NULL, 'b'                 },
135                 { "uuid",            required_argument, NULL, ARG_UUID            },
136                 { "read-only",       no_argument,       NULL, ARG_READ_ONLY       },
137                 { "capability",      required_argument, NULL, ARG_CAPABILITY      },
138                 { "link-journal",    required_argument, NULL, ARG_LINK_JOURNAL    },
139                 { NULL,              0,                 NULL, 0                   }
140         };
141
142         int c;
143
144         assert(argc >= 0);
145         assert(argv);
146
147         while ((c = getopt_long(argc, argv, "+hD:u:C:bj", options, NULL)) >= 0) {
148
149                 switch (c) {
150
151                 case 'h':
152                         help();
153                         return 0;
154
155                 case 'D':
156                         free(arg_directory);
157                         arg_directory = canonicalize_file_name(optarg);
158                         if (!arg_directory) {
159                                 log_error("Failed to canonicalize root directory.");
160                                 return -ENOMEM;
161                         }
162
163                         break;
164
165                 case 'u':
166                         free(arg_user);
167                         if (!(arg_user = strdup(optarg))) {
168                                 log_error("Failed to duplicate user name.");
169                                 return -ENOMEM;
170                         }
171
172                         break;
173
174                 case 'C':
175                         strv_free(arg_controllers);
176                         arg_controllers = strv_split(optarg, ",");
177                         if (!arg_controllers) {
178                                 log_error("Failed to split controllers list.");
179                                 return -ENOMEM;
180                         }
181                         strv_uniq(arg_controllers);
182
183                         break;
184
185                 case ARG_PRIVATE_NETWORK:
186                         arg_private_network = true;
187                         break;
188
189                 case 'b':
190                         arg_boot = true;
191                         break;
192
193                 case ARG_UUID:
194                         arg_uuid = optarg;
195                         break;
196
197                 case ARG_READ_ONLY:
198                         arg_read_only = true;
199                         break;
200
201                 case ARG_CAPABILITY: {
202                         char *state, *word;
203                         size_t length;
204
205                         FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
206                                 cap_value_t cap;
207                                 char *t;
208
209                                 t = strndup(word, length);
210                                 if (!t)
211                                         return log_oom();
212
213                                 if (cap_from_name(t, &cap) < 0) {
214                                         log_error("Failed to parse capability %s.", t);
215                                         free(t);
216                                         return -EINVAL;
217                                 }
218
219                                 free(t);
220                                 arg_retain |= 1ULL << (uint64_t) cap;
221                         }
222
223                         break;
224                 }
225
226                 case 'j':
227                         arg_link_journal = LINK_GUEST;
228                         break;
229
230                 case ARG_LINK_JOURNAL:
231                         if (streq(optarg, "auto"))
232                                 arg_link_journal = LINK_AUTO;
233                         else if (streq(optarg, "no"))
234                                 arg_link_journal = LINK_NO;
235                         else if (streq(optarg, "guest"))
236                                 arg_link_journal = LINK_GUEST;
237                         else if (streq(optarg, "host"))
238                                 arg_link_journal = LINK_HOST;
239                         else {
240                                 log_error("Failed to parse link journal mode %s", optarg);
241                                 return -EINVAL;
242                         }
243
244                         break;
245
246                 case '?':
247                         return -EINVAL;
248
249                 default:
250                         log_error("Unknown option code %c", c);
251                         return -EINVAL;
252                 }
253         }
254
255         return 1;
256 }
257
/*
 * Mount the API file systems listed in mount_table below the container
 * root 'dest'.
 *
 * Entries whose target already is a mount point are skipped, unless the
 * entry is a remount (what == NULL). All entries are attempted even if an
 * earlier one failed; the first error encountered is returned as a negative
 * errno-style value, 0 if everything went fine. Mount failures of entries
 * not marked fatal are ignored.
 */
static int mount_all(const char *dest) {

        typedef struct MountPoint {
                const char *what;
                const char *where;
                const char *type;
                const char *options;
                unsigned long flags;
                bool fatal;
        } MountPoint;

        static const MountPoint mount_table[] = {
                { "proc",      "/proc",     "proc",  NULL,       MS_NOSUID|MS_NOEXEC|MS_NODEV, true  },
                { "/proc/sys", "/proc/sys", NULL,    NULL,       MS_BIND, true                       },   /* Bind mount first */
                { NULL,        "/proc/sys", NULL,    NULL,       MS_BIND|MS_RDONLY|MS_REMOUNT, true  },   /* Then, make it r/o */
                { "sysfs",     "/sys",      "sysfs", NULL,       MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true  },
                { "tmpfs",     "/dev",      "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME,     true  },
                { "/dev/pts",  "/dev/pts",  NULL,    NULL,       MS_BIND,                      true  },
                { "tmpfs",     "/dev/shm",  "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true  },
                { "tmpfs",     "/run",      "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true  },
#ifdef HAVE_SELINUX
                { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND,                      false },  /* Bind mount first */
                { NULL,              "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, false },  /* Then, make it r/o */
#endif
        };

        unsigned k;
        int r = 0;

        for (k = 0; k < ELEMENTSOF(mount_table); k++) {
                char *where;
                int t;

                if (asprintf(&where, "%s/%s", dest, mount_table[k].where) < 0) {
                        log_oom();

                        if (r == 0)
                                r = -ENOMEM;

                        break;
                }

                t = path_is_mount_point(where, true);
                if (t < 0) {
                        log_error("Failed to detect whether %s is a mount point: %s", where, strerror(-t));
                        free(where);

                        if (r == 0)
                                r = t;

                        continue;
                }

                /* Skip this entry if it is not a remount. */
                if (mount_table[k].what && t > 0) {
                        /* The path must be released on this path too,
                         * otherwise it leaks for every pre-mounted entry. */
                        free(where);
                        continue;
                }

                /* Best effort: if this fails, mount() below will tell. */
                mkdir_p_label(where, 0755);

                if (mount(mount_table[k].what,
                          where,
                          mount_table[k].type,
                          mount_table[k].flags,
                          mount_table[k].options) < 0 &&
                    mount_table[k].fatal) {

                        /* Remember errno before logging can overwrite it. */
                        int saved_errno = errno;

                        log_error("mount(%s) failed: %m", where);

                        if (r == 0)
                                r = -saved_errno;
                }

                free(where);
        }

        return r;
}
335
/*
 * Make the host's time zone configuration visible in the container by bind
 * mounting the relevant files read-only over their counterparts below
 * 'dest'. Mount failures are ignored; only running out of memory is
 * reported (via log_oom()). Returns 0 otherwise.
 */
static int setup_timezone(const char *dest) {
        static const char * const host_files[] = {
                "/etc/localtime",
                "/etc/timezone",
        };
        size_t i;

        assert(dest);

        /* Fix the timezone, if possible */
        for (i = 0; i < sizeof(host_files) / sizeof(host_files[0]); i++) {
                const char *source = host_files[i];
                char *target;

                target = strappend(dest, source);
                if (!target)
                        return log_oom();

                /* Bind first, then flip the new mount to read-only. */
                if (mount(source, target, "bind", MS_BIND, NULL) >= 0)
                        mount(source, target, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);

                free(target);
        }

        return 0;
}
362
363 static int setup_resolv_conf(const char *dest) {
364         char *where;
365
366         assert(dest);
367
368         if (arg_private_network)
369                 return 0;
370
371         /* Fix resolv.conf, if possible */
372         where = strappend(dest, "/etc/resolv.conf");
373         if (!where)
374                 return log_oom();
375
376         if (mount("/etc/resolv.conf", where, "bind", MS_BIND, NULL) >= 0)
377                 mount("/etc/resolv.conf", where, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);
378
379         free(where);
380
381         return 0;
382 }
383
384 static int setup_boot_id(const char *dest) {
385         char *from = NULL, *to = NULL;
386         sd_id128_t rnd;
387         char as_uuid[37];
388         int r;
389
390         assert(dest);
391
392         /* Generate a new randomized boot ID, so that each boot-up of
393          * the container gets a new one */
394
395         from = strappend(dest, "/dev/proc-sys-kernel-random-boot-id");
396         if (!from) {
397                 r = log_oom();
398                 goto finish;
399         }
400
401         to = strappend(dest, "/proc/sys/kernel/random/boot_id");
402         if (!to) {
403                 r = log_oom();
404                 goto finish;
405         }
406
407         r = sd_id128_randomize(&rnd);
408         if (r < 0) {
409                 log_error("Failed to generate random boot id: %s", strerror(-r));
410                 goto finish;
411         }
412
413         snprintf(as_uuid, sizeof(as_uuid),
414                  "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
415                  SD_ID128_FORMAT_VAL(rnd));
416         char_array_0(as_uuid);
417
418         r = write_one_line_file(from, as_uuid);
419         if (r < 0) {
420                 log_error("Failed to write boot id: %s", strerror(-r));
421                 goto finish;
422         }
423
424         if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
425                 log_error("Failed to bind mount boot id: %m");
426                 r = -errno;
427         } else
428                 mount(from, to, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);
429
430         unlink(from);
431
432 finish:
433         free(from);
434         free(to);
435
436         return r;
437 }
438
439 static int copy_devnodes(const char *dest) {
440
441         static const char devnodes[] =
442                 "null\0"
443                 "zero\0"
444                 "full\0"
445                 "random\0"
446                 "urandom\0"
447                 "tty\0"
448                 "ptmx\0";
449
450         const char *d;
451         int r = 0;
452         mode_t u;
453
454         assert(dest);
455
456         u = umask(0000);
457
458         NULSTR_FOREACH(d, devnodes) {
459                 struct stat st;
460                 char *from = NULL, *to = NULL;
461
462                 asprintf(&from, "/dev/%s", d);
463                 asprintf(&to, "%s/dev/%s", dest, d);
464
465                 if (!from || !to) {
466                         log_error("Failed to allocate devnode path");
467
468                         free(from);
469                         free(to);
470
471                         from = to = NULL;
472
473                         if (r == 0)
474                                 r = -ENOMEM;
475
476                         break;
477                 }
478
479                 if (stat(from, &st) < 0) {
480
481                         if (errno != ENOENT) {
482                                 log_error("Failed to stat %s: %m", from);
483                                 if (r == 0)
484                                         r = -errno;
485                         }
486
487                 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
488
489                         log_error("%s is not a char or block device, cannot copy.", from);
490                         if (r == 0)
491                                 r = -EIO;
492
493                 } else if (mknod(to, st.st_mode, st.st_rdev) < 0) {
494
495                         log_error("mknod(%s) failed: %m", dest);
496                         if (r == 0)
497                                 r = -errno;
498                 }
499
500                 free(from);
501                 free(to);
502         }
503
504         umask(u);
505
506         return r;
507 }
508
/*
 * Provide /dev/console inside the container below 'dest' by bind mounting
 * the TTY 'console' onto a freshly created device node.
 *
 * The TTY's access mode and ownership are reset to 0600 root:root first.
 * Returns a negative errno-style value on failure.
 */
static int setup_dev_console(const char *dest, const char *console) {
        char *node = NULL;
        struct stat tty_st;
        mode_t saved_umask;
        int r;

        assert(dest);
        assert(console);

        saved_umask = umask(0000);

        if (stat(console, &tty_st) < 0) {
                log_error("Failed to stat %s: %m", console);
                r = -errno;
                goto finish;
        }

        if (!S_ISCHR(tty_st.st_mode)) {
                log_error("/dev/console is not a char device.");
                r = -EIO;
                goto finish;
        }

        r = chmod_and_chown(console, 0600, 0, 0);
        if (r < 0) {
                log_error("Failed to correct access mode for TTY: %s", strerror(-r));
                goto finish;
        }

        if (asprintf(&node, "%s/dev/console", dest) < 0) {
                r = log_oom();
                goto finish;
        }

        /* A pty can only live on a pts file system, hence the container's
         * /dev/console has to be a bind mount of the real tty. The bind
         * mount needs something to sit on, so a device node is created
         * first. Its major/minor is irrelevant, as it is mounted over
         * right away. */

        if (mknod(node, (tty_st.st_mode & ~07777) | 0600, tty_st.st_rdev) < 0) {
                log_error("mknod() for /dev/console failed: %m");
                r = -errno;
                goto finish;
        }

        if (mount(console, node, "bind", MS_BIND, NULL) < 0) {
                log_error("Bind mount for /dev/console failed: %m");
                r = -errno;
        }

finish:
        free(node);
        umask(saved_umask);

        return r;
}
567
/*
 * Set up a fake /proc/kmsg for the container below 'dest': a FIFO is
 * created as /dev/kmsg, bind mounted over /proc/kmsg, and an open file
 * descriptor for it is sent over 'kmsg_socket' as SCM_RIGHTS ancillary
 * data. Finally the FIFO is unlinked from /dev again.
 *
 * Returns a negative errno-style value on failure.
 */
static int setup_kmsg(const char *dest, int kmsg_socket) {
        char *fifo_path = NULL, *proc_path = NULL;
        int r, fifo_fd, sent;
        mode_t saved_umask;
        union {
                struct cmsghdr cmsghdr;
                uint8_t buf[CMSG_SPACE(sizeof(int))];
        } control;
        struct msghdr mh;
        struct cmsghdr *cmsg;

        assert(dest);
        assert(kmsg_socket >= 0);

        saved_umask = umask(0000);

        /* The FIFO is created as /dev/kmsg, but removed from there as soon
         * as it is bind mounted to /proc/kmsg. Reading from a FIFO behaves
         * much like reading /proc/kmsg, but writing to one blocks while
         * nobody reads, which is unlike /dev/kmsg. To keep containers from
         * deadlocking on that, /dev/kmsg is simply not made available. */
        if (asprintf(&fifo_path, "%s/dev/kmsg", dest) < 0) {
                r = log_oom();
                goto finish;
        }

        if (asprintf(&proc_path, "%s/proc/kmsg", dest) < 0) {
                r = log_oom();
                goto finish;
        }

        if (mkfifo(fifo_path, 0600) < 0) {
                log_error("mkfifo() for /dev/kmsg failed: %m");
                r = -errno;
                goto finish;
        }

        r = chmod_and_chown(fifo_path, 0600, 0, 0);
        if (r < 0) {
                log_error("Failed to correct access mode for /dev/kmsg: %s", strerror(-r));
                goto finish;
        }

        if (mount(fifo_path, proc_path, "bind", MS_BIND, NULL) < 0) {
                log_error("Bind mount for /proc/kmsg failed: %m");
                r = -errno;
                goto finish;
        }

        fifo_fd = open(fifo_path, O_RDWR|O_NDELAY|O_CLOEXEC);
        if (fifo_fd < 0) {
                log_error("Failed to open fifo: %m");
                r = -errno;
                goto finish;
        }

        /* Build a message that carries nothing but the fd as ancillary data. */
        zero(control);
        zero(mh);

        mh.msg_control = &control;
        mh.msg_controllen = sizeof(control);

        cmsg = CMSG_FIRSTHDR(&mh);
        cmsg->cmsg_len = CMSG_LEN(sizeof(int));
        cmsg->cmsg_level = SOL_SOCKET;
        cmsg->cmsg_type = SCM_RIGHTS;
        memcpy(CMSG_DATA(cmsg), &fifo_fd, sizeof(int));

        mh.msg_controllen = cmsg->cmsg_len;

        /* Park the fd in the socket, so that it stays open for as long as
         * the child runs */
        sent = sendmsg(kmsg_socket, &mh, MSG_DONTWAIT|MSG_NOSIGNAL);
        close_nointr_nofail(fifo_fd);

        if (sent < 0) {
                log_error("Failed to send FIFO fd: %m");
                r = -errno;
                goto finish;
        }

        /* And now make the FIFO unavailable as /dev/kmsg... */
        unlink(fifo_path);

finish:
        free(proc_path);
        free(fifo_path);
        umask(saved_umask);

        return r;
}
661
662 static int setup_hostname(void) {
663         char *hn;
664         int r = 0;
665
666         hn = path_get_file_name(arg_directory);
667         if (hn) {
668                 hn = strdup(hn);
669                 if (!hn)
670                         return -ENOMEM;
671
672                 hostname_cleanup(hn);
673
674                 if (!isempty(hn))
675                         if (sethostname(hn, strlen(hn)) < 0)
676                                 r = -errno;
677
678                 free(hn);
679         }
680
681         return r;
682 }
683
684 static int setup_journal(const char *directory) {
685         sd_id128_t machine_id;
686         char *p = NULL, *b = NULL, *l, *q = NULL, *d = NULL;
687         int r;
688
689         if (arg_link_journal == LINK_NO)
690                 return 0;
691
692         p = strappend(directory, "/etc/machine-id");
693         if (!p) {
694                 r = log_oom();
695                 goto finish;
696         }
697
698         r = read_one_line_file(p, &b);
699         if (r == -ENOENT && arg_link_journal == LINK_AUTO) {
700                 r = 0;
701                 goto finish;
702         } else if (r < 0) {
703                 log_error("Failed to read machine ID: %s", strerror(-r));
704                 return r;
705         }
706
707         l = strstrip(b);
708         if (isempty(l) && arg_link_journal == LINK_AUTO) {
709                 r = 0;
710                 goto finish;
711         }
712
713         /* Verify validaty */
714         r = sd_id128_from_string(l, &machine_id);
715         if (r < 0) {
716                 log_error("Failed to parse machine ID: %s", strerror(-r));
717                 goto finish;
718         }
719
720         free(p);
721         p = strappend("/var/log/journal/", l);
722         q = strjoin(directory, "/var/log/journal/", l, NULL);
723         if (!p || !q) {
724                 r = log_oom();
725                 goto finish;
726         }
727
728         if (path_is_mount_point(p, false) > 0 ||
729             path_is_mount_point(q, false) > 0) {
730                 if (arg_link_journal != LINK_AUTO) {
731                         log_error("Journal already a mount point, refusing.");
732                         r = -EEXIST;
733                         goto finish;
734                 }
735
736                 r = 0;
737                 goto finish;
738         }
739
740         r = readlink_and_make_absolute(p, &d);
741         if (r >= 0) {
742                 if ((arg_link_journal == LINK_GUEST ||
743                      arg_link_journal == LINK_AUTO) &&
744                     path_equal(d, q)) {
745
746                         mkdir_p(q, 0755);
747
748                         r = 0;
749                         goto finish;
750                 }
751
752                 if (unlink(p) < 0) {
753                         log_error("Failed to remove symlink %s: %m", p);
754                         r = -errno;
755                         goto finish;
756                 }
757         } else if (r == -EINVAL) {
758
759                 if (arg_link_journal == LINK_GUEST &&
760                     rmdir(p) < 0) {
761
762                         if (errno == ENOTDIR)
763                                 log_error("%s already exists and is neither symlink nor directory.", p);
764                         else {
765                                 log_error("Failed to remove %s: %m", p);
766                                 r = -errno;
767                         }
768
769                         goto finish;
770                 }
771         } else if (r != -ENOENT) {
772                 log_error("readlink(%s) failed: %m", p);
773                 goto finish;
774         }
775
776         if (arg_link_journal == LINK_GUEST) {
777
778                 if (symlink(q, p) < 0) {
779                         log_error("Failed to symlink %s to %s: %m", q, p);
780                         r = -errno;
781                         goto finish;
782                 }
783
784                 mkdir_p(q, 0755);
785
786                 r = 0;
787                 goto finish;
788         }
789
790         if (arg_link_journal == LINK_HOST) {
791                 r = mkdir_p(p, 0755);
792                 if (r < 0) {
793                         log_error("Failed to create %s: %m", p);
794                         goto finish;
795                 }
796
797         } else if (access(p, F_OK) < 0) {
798                 r = 0;
799                 goto finish;
800         }
801
802         if (dir_is_empty(q) == 0) {
803                 log_error("%s not empty.", q);
804                 r = -ENOTEMPTY;
805                 goto finish;
806         }
807
808         r = mkdir_p(q, 0755);
809         if (r < 0) {
810                 log_error("Failed to create %s: %m", q);
811                 goto finish;
812         }
813
814         if (mount(p, q, "bind", MS_BIND, NULL) < 0) {
815                 log_error("Failed to bind mount journal from host into guest: %m");
816                 r = -errno;
817                 goto finish;
818         }
819
820         r = 0;
821
822 finish:
823         free(p);
824         free(q);
825         free(d);
826         free(b);
827         return r;
828
829 }
830
831 static int drop_capabilities(void) {
832         return capability_bounding_set_drop(~arg_retain, false);
833 }
834
/*
 * Check whether 'path' looks like the root of an OS installation; the
 * presence of <path>/bin/sh is used as the indicator.
 *
 * Returns 1 if the file exists, 0 if it cannot be accessed, and -ENOMEM if
 * the path to probe cannot be allocated.
 */
static int is_os_tree(const char *path) {
        static const char probe[] = "/bin/sh";
        size_t prefix_len;
        char *candidate;
        int found;

        prefix_len = strlen(path);

        /* sizeof(probe) accounts for the terminating NUL. */
        candidate = malloc(prefix_len + sizeof(probe));
        if (!candidate)
                return -ENOMEM;

        memcpy(candidate, path, prefix_len);
        memcpy(candidate + prefix_len, probe, sizeof(probe));

        found = access(candidate, F_OK) >= 0;
        free(candidate);

        return found;
}
848
849 static int process_pty(int master, sigset_t *mask) {
850
851         char in_buffer[LINE_MAX], out_buffer[LINE_MAX];
852         size_t in_buffer_full = 0, out_buffer_full = 0;
853         struct epoll_event stdin_ev, stdout_ev, master_ev, signal_ev;
854         bool stdin_readable = false, stdout_writable = false, master_readable = false, master_writable = false;
855         int ep = -1, signal_fd = -1, r;
856
857         fd_nonblock(STDIN_FILENO, 1);
858         fd_nonblock(STDOUT_FILENO, 1);
859         fd_nonblock(master, 1);
860
861         signal_fd = signalfd(-1, mask, SFD_NONBLOCK|SFD_CLOEXEC);
862         if (signal_fd < 0) {
863                 log_error("signalfd(): %m");
864                 r = -errno;
865                 goto finish;
866         }
867
868         ep = epoll_create1(EPOLL_CLOEXEC);
869         if (ep < 0) {
870                 log_error("Failed to create epoll: %m");
871                 r = -errno;
872                 goto finish;
873         }
874
875         zero(stdin_ev);
876         stdin_ev.events = EPOLLIN|EPOLLET;
877         stdin_ev.data.fd = STDIN_FILENO;
878
879         zero(stdout_ev);
880         stdout_ev.events = EPOLLOUT|EPOLLET;
881         stdout_ev.data.fd = STDOUT_FILENO;
882
883         zero(master_ev);
884         master_ev.events = EPOLLIN|EPOLLOUT|EPOLLET;
885         master_ev.data.fd = master;
886
887         zero(signal_ev);
888         signal_ev.events = EPOLLIN;
889         signal_ev.data.fd = signal_fd;
890
891         if (epoll_ctl(ep, EPOLL_CTL_ADD, STDIN_FILENO, &stdin_ev) < 0 ||
892             epoll_ctl(ep, EPOLL_CTL_ADD, STDOUT_FILENO, &stdout_ev) < 0 ||
893             epoll_ctl(ep, EPOLL_CTL_ADD, master, &master_ev) < 0 ||
894             epoll_ctl(ep, EPOLL_CTL_ADD, signal_fd, &signal_ev) < 0) {
895                 log_error("Failed to regiser fds in epoll: %m");
896                 r = -errno;
897                 goto finish;
898         }
899
900         for (;;) {
901                 struct epoll_event ev[16];
902                 ssize_t k;
903                 int i, nfds;
904
905                 nfds = epoll_wait(ep, ev, ELEMENTSOF(ev), -1);
906                 if (nfds < 0) {
907
908                         if (errno == EINTR || errno == EAGAIN)
909                                 continue;
910
911                         log_error("epoll_wait(): %m");
912                         r = -errno;
913                         goto finish;
914                 }
915
916                 assert(nfds >= 1);
917
918                 for (i = 0; i < nfds; i++) {
919                         if (ev[i].data.fd == STDIN_FILENO) {
920
921                                 if (ev[i].events & (EPOLLIN|EPOLLHUP))
922                                         stdin_readable = true;
923
924                         } else if (ev[i].data.fd == STDOUT_FILENO) {
925
926                                 if (ev[i].events & (EPOLLOUT|EPOLLHUP))
927                                         stdout_writable = true;
928
929                         } else if (ev[i].data.fd == master) {
930
931                                 if (ev[i].events & (EPOLLIN|EPOLLHUP))
932                                         master_readable = true;
933
934                                 if (ev[i].events & (EPOLLOUT|EPOLLHUP))
935                                         master_writable = true;
936
937                         } else if (ev[i].data.fd == signal_fd) {
938                                 struct signalfd_siginfo sfsi;
939                                 ssize_t n;
940
941                                 n = read(signal_fd, &sfsi, sizeof(sfsi));
942                                 if (n != sizeof(sfsi)) {
943
944                                         if (n >= 0) {
945                                                 log_error("Failed to read from signalfd: invalid block size");
946                                                 r = -EIO;
947                                                 goto finish;
948                                         }
949
950                                         if (errno != EINTR && errno != EAGAIN) {
951                                                 log_error("Failed to read from signalfd: %m");
952                                                 r = -errno;
953                                                 goto finish;
954                                         }
955                                 } else {
956
957                                         if (sfsi.ssi_signo == SIGWINCH) {
958                                                 struct winsize ws;
959
960                                                 /* The window size changed, let's forward that. */
961                                                 if (ioctl(STDIN_FILENO, TIOCGWINSZ, &ws) >= 0)
962                                                         ioctl(master, TIOCSWINSZ, &ws);
963                                         } else {
964                                                 r = 0;
965                                                 goto finish;
966                                         }
967                                 }
968                         }
969                 }
970
971                 while ((stdin_readable && in_buffer_full <= 0) ||
972                        (master_writable && in_buffer_full > 0) ||
973                        (master_readable && out_buffer_full <= 0) ||
974                        (stdout_writable && out_buffer_full > 0)) {
975
976                         if (stdin_readable && in_buffer_full < LINE_MAX) {
977
978                                 k = read(STDIN_FILENO, in_buffer + in_buffer_full, LINE_MAX - in_buffer_full);
979                                 if (k < 0) {
980
981                                         if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
982                                                 stdin_readable = false;
983                                         else {
984                                                 log_error("read(): %m");
985                                                 r = -errno;
986                                                 goto finish;
987                                         }
988                                 } else
989                                         in_buffer_full += (size_t) k;
990                         }
991
992                         if (master_writable && in_buffer_full > 0) {
993
994                                 k = write(master, in_buffer, in_buffer_full);
995                                 if (k < 0) {
996
997                                         if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
998                                                 master_writable = false;
999                                         else {
1000                                                 log_error("write(): %m");
1001                                                 r = -errno;
1002                                                 goto finish;
1003                                         }
1004
1005                                 } else {
1006                                         assert(in_buffer_full >= (size_t) k);
1007                                         memmove(in_buffer, in_buffer + k, in_buffer_full - k);
1008                                         in_buffer_full -= k;
1009                                 }
1010                         }
1011
1012                         if (master_readable && out_buffer_full < LINE_MAX) {
1013
1014                                 k = read(master, out_buffer + out_buffer_full, LINE_MAX - out_buffer_full);
1015                                 if (k < 0) {
1016
1017                                         if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
1018                                                 master_readable = false;
1019                                         else {
1020                                                 log_error("read(): %m");
1021                                                 r = -errno;
1022                                                 goto finish;
1023                                         }
1024                                 }  else
1025                                         out_buffer_full += (size_t) k;
1026                         }
1027
1028                         if (stdout_writable && out_buffer_full > 0) {
1029
1030                                 k = write(STDOUT_FILENO, out_buffer, out_buffer_full);
1031                                 if (k < 0) {
1032
1033                                         if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
1034                                                 stdout_writable = false;
1035                                         else {
1036                                                 log_error("write(): %m");
1037                                                 r = -errno;
1038                                                 goto finish;
1039                                         }
1040
1041                                 } else {
1042                                         assert(out_buffer_full >= (size_t) k);
1043                                         memmove(out_buffer, out_buffer + k, out_buffer_full - k);
1044                                         out_buffer_full -= k;
1045                                 }
1046                         }
1047                 }
1048         }
1049
1050 finish:
1051         if (ep >= 0)
1052                 close_nointr_nofail(ep);
1053
1054         if (signal_fd >= 0)
1055                 close_nointr_nofail(signal_fd);
1056
1057         return r;
1058 }
1059
1060 int main(int argc, char *argv[]) {
1061         pid_t pid = 0;
1062         int r = EXIT_FAILURE, k;
1063         char *oldcg = NULL, *newcg = NULL;
1064         char **controller = NULL;
1065         int master = -1;
1066         const char *console = NULL;
1067         struct termios saved_attr, raw_attr;
1068         sigset_t mask;
1069         bool saved_attr_valid = false;
1070         struct winsize ws;
1071         int kmsg_socket_pair[2] = { -1, -1 };
1072
1073         log_parse_environment();
1074         log_open();
1075
1076         r = parse_argv(argc, argv);
1077         if (r <= 0)
1078                 goto finish;
1079
1080         if (arg_directory) {
1081                 char *p;
1082
1083                 p = path_make_absolute_cwd(arg_directory);
1084                 free(arg_directory);
1085                 arg_directory = p;
1086         } else
1087                 arg_directory = get_current_dir_name();
1088
1089         if (!arg_directory) {
1090                 log_error("Failed to determine path");
1091                 goto finish;
1092         }
1093
1094         path_kill_slashes(arg_directory);
1095
1096         if (geteuid() != 0) {
1097                 log_error("Need to be root.");
1098                 goto finish;
1099         }
1100
1101         if (sd_booted() <= 0) {
1102                 log_error("Not running on a systemd system.");
1103                 goto finish;
1104         }
1105
1106         if (path_equal(arg_directory, "/")) {
1107                 log_error("Spawning container on root directory not supported.");
1108                 goto finish;
1109         }
1110
1111         if (is_os_tree(arg_directory) <= 0) {
1112                 log_error("Directory %s doesn't look like an OS root directory. Refusing.", arg_directory);
1113                 goto finish;
1114         }
1115
1116         k = cg_get_by_pid(SYSTEMD_CGROUP_CONTROLLER, 0, &oldcg);
1117         if (k < 0) {
1118                 log_error("Failed to determine current cgroup: %s", strerror(-k));
1119                 goto finish;
1120         }
1121
1122         if (asprintf(&newcg, "%s/nspawn-%lu", oldcg, (unsigned long) getpid()) < 0) {
1123                 log_error("Failed to allocate cgroup path.");
1124                 goto finish;
1125         }
1126
1127         k = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, newcg, 0);
1128         if (k < 0)  {
1129                 log_error("Failed to create cgroup: %s", strerror(-k));
1130                 goto finish;
1131         }
1132
1133         STRV_FOREACH(controller, arg_controllers) {
1134                 k = cg_create_and_attach(*controller, newcg, 0);
1135                 if (k < 0)
1136                         log_warning("Failed to create cgroup in controller %s: %s", *controller, strerror(-k));
1137         }
1138
1139         master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
1140         if (master < 0) {
1141                 log_error("Failed to acquire pseudo tty: %m");
1142                 goto finish;
1143         }
1144
1145         console = ptsname(master);
1146         if (!console) {
1147                 log_error("Failed to determine tty name: %m");
1148                 goto finish;
1149         }
1150
1151         log_info("Spawning namespace container on %s (console is %s).", arg_directory, console);
1152
1153         if (ioctl(STDIN_FILENO, TIOCGWINSZ, &ws) >= 0)
1154                 ioctl(master, TIOCSWINSZ, &ws);
1155
1156         if (unlockpt(master) < 0) {
1157                 log_error("Failed to unlock tty: %m");
1158                 goto finish;
1159         }
1160
1161         if (tcgetattr(STDIN_FILENO, &saved_attr) < 0) {
1162                 log_error("Failed to get terminal attributes: %m");
1163                 goto finish;
1164         }
1165
1166         saved_attr_valid = true;
1167
1168         raw_attr = saved_attr;
1169         cfmakeraw(&raw_attr);
1170         raw_attr.c_lflag &= ~ECHO;
1171
1172         if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_NONBLOCK|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
1173                 log_error("Failed to create kmsg socket pair");
1174                 goto finish;
1175         }
1176
1177         assert_se(sigemptyset(&mask) == 0);
1178         sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
1179         assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
1180
1181         for (;;) {
1182                 siginfo_t status;
1183
1184                 if (tcsetattr(STDIN_FILENO, TCSANOW, &raw_attr) < 0) {
1185                         log_error("Failed to set terminal attributes: %m");
1186                         goto finish;
1187                 }
1188
1189                 pid = syscall(__NR_clone, SIGCHLD|CLONE_NEWIPC|CLONE_NEWNS|CLONE_NEWPID|CLONE_NEWUTS|(arg_private_network ? CLONE_NEWNET : 0), NULL);
1190                 if (pid < 0) {
1191                         if (errno == EINVAL)
1192                                 log_error("clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
1193                         else
1194                                 log_error("clone() failed: %m");
1195
1196                         goto finish;
1197                 }
1198
1199                 if (pid == 0) {
1200                         /* child */
1201
1202                         const char *home = NULL;
1203                         uid_t uid = (uid_t) -1;
1204                         gid_t gid = (gid_t) -1;
1205                         const char *envp[] = {
1206                                 "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
1207                                 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
1208                                 NULL, /* TERM */
1209                                 NULL, /* HOME */
1210                                 NULL, /* USER */
1211                                 NULL, /* LOGNAME */
1212                                 NULL, /* container_uuid */
1213                                 NULL
1214                         };
1215
1216                         envp[2] = strv_find_prefix(environ, "TERM=");
1217
1218                         close_nointr_nofail(master);
1219
1220                         close_nointr(STDIN_FILENO);
1221                         close_nointr(STDOUT_FILENO);
1222                         close_nointr(STDERR_FILENO);
1223
1224                         close_all_fds(&kmsg_socket_pair[1], 1);
1225
1226                         reset_all_signal_handlers();
1227
1228                         assert_se(sigemptyset(&mask) == 0);
1229                         assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
1230
1231                         if (open_terminal(console, O_RDWR) != STDIN_FILENO ||
1232                             dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
1233                             dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO)
1234                                 goto child_fail;
1235
1236                         if (setsid() < 0) {
1237                                 log_error("setsid() failed: %m");
1238                                 goto child_fail;
1239                         }
1240
1241                         if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
1242                                 log_error("PR_SET_PDEATHSIG failed: %m");
1243                                 goto child_fail;
1244                         }
1245
1246                         /* Mark everything as slave, so that we still
1247                          * receive mounts from the real root, but don't
1248                          * propagate mounts to the real root. */
1249                         if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
1250                                 log_error("MS_SLAVE|MS_REC failed: %m");
1251                                 goto child_fail;
1252                         }
1253
1254                         /* Turn directory into bind mount */
1255                         if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REC, NULL) < 0) {
1256                                 log_error("Failed to make bind mount.");
1257                                 goto child_fail;
1258                         }
1259
1260                         if (arg_read_only)
1261                                 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY|MS_REC, NULL) < 0) {
1262                                         log_error("Failed to make read-only.");
1263                                         goto child_fail;
1264                                 }
1265
1266                         if (mount_all(arg_directory) < 0)
1267                                 goto child_fail;
1268
1269                         if (copy_devnodes(arg_directory) < 0)
1270                                 goto child_fail;
1271
1272                         dev_setup(arg_directory);
1273
1274                         if (setup_dev_console(arg_directory, console) < 0)
1275                                 goto child_fail;
1276
1277                         if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
1278                                 goto child_fail;
1279
1280                         close_nointr_nofail(kmsg_socket_pair[1]);
1281
1282                         if (setup_boot_id(arg_directory) < 0)
1283                                 goto child_fail;
1284
1285                         if (setup_timezone(arg_directory) < 0)
1286                                 goto child_fail;
1287
1288                         if (setup_resolv_conf(arg_directory) < 0)
1289                                 goto child_fail;
1290
1291                         if (setup_journal(arg_directory) < 0)
1292                                 goto child_fail;
1293
1294                         if (chdir(arg_directory) < 0) {
1295                                 log_error("chdir(%s) failed: %m", arg_directory);
1296                                 goto child_fail;
1297                         }
1298
1299                         if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) {
1300                                 log_error("mount(MS_MOVE) failed: %m");
1301                                 goto child_fail;
1302                         }
1303
1304                         if (chroot(".") < 0) {
1305                                 log_error("chroot() failed: %m");
1306                                 goto child_fail;
1307                         }
1308
1309                         if (chdir("/") < 0) {
1310                                 log_error("chdir() failed: %m");
1311                                 goto child_fail;
1312                         }
1313
1314                         umask(0022);
1315
1316                         loopback_setup();
1317
1318                         if (drop_capabilities() < 0) {
1319                                 log_error("drop_capabilities() failed: %m");
1320                                 goto child_fail;
1321                         }
1322
1323                         if (arg_user) {
1324
1325                                 if (get_user_creds((const char**)&arg_user, &uid, &gid, &home, NULL) < 0) {
1326                                         log_error("get_user_creds() failed: %m");
1327                                         goto child_fail;
1328                                 }
1329
1330                                 if (mkdir_parents_label(home, 0775) < 0) {
1331                                         log_error("mkdir_parents_label() failed: %m");
1332                                         goto child_fail;
1333                                 }
1334
1335                                 if (mkdir_safe_label(home, 0775, uid, gid) < 0) {
1336                                         log_error("mkdir_safe_label() failed: %m");
1337                                         goto child_fail;
1338                                 }
1339
1340                                 if (initgroups((const char*)arg_user, gid) < 0) {
1341                                         log_error("initgroups() failed: %m");
1342                                         goto child_fail;
1343                                 }
1344
1345                                 if (setresgid(gid, gid, gid) < 0) {
1346                                         log_error("setregid() failed: %m");
1347                                         goto child_fail;
1348                                 }
1349
1350                                 if (setresuid(uid, uid, uid) < 0) {
1351                                         log_error("setreuid() failed: %m");
1352                                         goto child_fail;
1353                                 }
1354                         }
1355
1356                         if ((asprintf((char**)(envp + 3), "HOME=%s", home ? home: "/root") < 0) ||
1357                             (asprintf((char**)(envp + 4), "USER=%s", arg_user ? arg_user : "root") < 0) ||
1358                             (asprintf((char**)(envp + 5), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
1359                                 log_oom();
1360                                 goto child_fail;
1361                         }
1362
1363                         if (arg_uuid) {
1364                                 if (asprintf((char**)(envp + 6), "container_uuid=%s", arg_uuid) < 0) {
1365                                         log_oom();
1366                                         goto child_fail;
1367                                 }
1368                         }
1369
1370                         setup_hostname();
1371
1372                         if (arg_boot) {
1373                                 char **a;
1374                                 size_t l;
1375
1376                                 /* Automatically search for the init system */
1377
1378                                 l = 1 + argc - optind;
1379                                 a = newa(char*, l + 1);
1380                                 memcpy(a + 1, argv + optind, l * sizeof(char*));
1381
1382                                 a[0] = (char*) "/usr/lib/systemd/systemd";
1383                                 execve(a[0], a, (char**) envp);
1384
1385                                 a[0] = (char*) "/lib/systemd/systemd";
1386                                 execve(a[0], a, (char**) envp);
1387
1388                                 a[0] = (char*) "/sbin/init";
1389                                 execve(a[0], a, (char**) envp);
1390                         } else if (argc > optind)
1391                                 execvpe(argv[optind], argv + optind, (char**) envp);
1392                         else {
1393                                 chdir(home ? home : "/root");
1394                                 execle("/bin/bash", "-bash", NULL, (char**) envp);
1395                         }
1396
1397                         log_error("execv() failed: %m");
1398
1399                 child_fail:
1400                         _exit(EXIT_FAILURE);
1401                 }
1402
1403                 if (process_pty(master, &mask) < 0)
1404                         goto finish;
1405
1406
1407                 if (saved_attr_valid)
1408                         tcsetattr(STDIN_FILENO, TCSANOW, &saved_attr);
1409
1410                 r = wait_for_terminate(pid, &status);
1411                 if (r < 0) {
1412                         r = EXIT_FAILURE;
1413                         break;
1414                 }
1415
1416                 if (status.si_code == CLD_EXITED) {
1417                         if (status.si_status != 0) {
1418                                 log_error("Container failed with error code %i.", status.si_status);
1419                                 r = status.si_status;
1420                                 break;
1421                         }
1422
1423                         log_debug("Container exited successfully.");
1424                         break;
1425                 } else if (status.si_code == CLD_KILLED &&
1426                            status.si_status == SIGINT) {
1427                         log_info("Container has been shut down.");
1428                         r = 0;
1429                         break;
1430                 } else if (status.si_code == CLD_KILLED &&
1431                            status.si_status == SIGHUP) {
1432                         log_info("Container is being rebooted.");
1433                         continue;
1434                 } else if (status.si_code == CLD_KILLED ||
1435                            status.si_code == CLD_DUMPED) {
1436
1437                         log_error("Container terminated by signal %s.", signal_to_string(status.si_status));
1438                         r = EXIT_FAILURE;
1439                         break;
1440                 } else {
1441                         log_error("Container failed due to unknown reason.");
1442                         r = EXIT_FAILURE;
1443                         break;
1444                 }
1445         }
1446
1447 finish:
1448         if (saved_attr_valid)
1449                 tcsetattr(STDIN_FILENO, TCSANOW, &saved_attr);
1450
1451         if (master >= 0)
1452                 close_nointr_nofail(master);
1453
1454         close_pipe(kmsg_socket_pair);
1455
1456         if (oldcg)
1457                 cg_attach(SYSTEMD_CGROUP_CONTROLLER, oldcg, 0);
1458
1459         if (newcg)
1460                 cg_kill_recursive_and_wait(SYSTEMD_CGROUP_CONTROLLER, newcg, true);
1461
1462         free(arg_directory);
1463         strv_free(arg_controllers);
1464         free(oldcg);
1465         free(newcg);
1466
1467         return r;
1468 }