chiark / gitweb /
c46f63ba29a4d7ab89bf9dec972e010db18e60e2
[elogind.git] / src / nspawn / nspawn.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4   This file is part of systemd.
5
6   Copyright 2010 Lennart Poettering
7
8   systemd is free software; you can redistribute it and/or modify it
9   under the terms of the GNU Lesser General Public License as published by
10   the Free Software Foundation; either version 2.1 of the License, or
11   (at your option) any later version.
12
13   systemd is distributed in the hope that it will be useful, but
14   WITHOUT ANY WARRANTY; without even the implied warranty of
15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16   Lesser General Public License for more details.
17
18   You should have received a copy of the GNU Lesser General Public License
19   along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <signal.h>
23 #include <sched.h>
24 #include <unistd.h>
25 #include <sys/types.h>
26 #include <sys/syscall.h>
27 #include <sys/mount.h>
28 #include <sys/wait.h>
29 #include <stdlib.h>
30 #include <string.h>
31 #include <stdio.h>
32 #include <errno.h>
33 #include <sys/prctl.h>
34 #include <sys/capability.h>
35 #include <getopt.h>
36 #include <sys/epoll.h>
37 #include <termios.h>
38 #include <sys/signalfd.h>
39 #include <grp.h>
40 #include <linux/fs.h>
41 #include <sys/un.h>
42 #include <sys/socket.h>
43
44 #include <systemd/sd-daemon.h>
45
46 #include "log.h"
47 #include "util.h"
48 #include "mkdir.h"
49 #include "macro.h"
50 #include "audit.h"
51 #include "missing.h"
52 #include "cgroup-util.h"
53 #include "strv.h"
54 #include "path-util.h"
55 #include "loopback-setup.h"
56 #include "sd-id128.h"
57 #include "dev-setup.h"
58
/* Journal linking mode between host and container, chosen with
 * --link-journal= (or -j) and consumed by setup_journal(). */
typedef enum LinkJournal {
        LINK_NO = 0,    /* setup_journal() returns immediately */
        LINK_AUTO,      /* default; a missing or empty machine ID is silently tolerated */
        LINK_HOST,      /* host journal directory is bind mounted into the guest */
        LINK_GUEST      /* host journal path is made a symlink to the guest directory */
} LinkJournal;
65
66 static char *arg_directory = NULL;
67 static char *arg_user = NULL;
68 static char **arg_controllers = NULL;
69 static char *arg_uuid = NULL;
70 static bool arg_private_network = false;
71 static bool arg_read_only = false;
72 static bool arg_boot = false;
73 static LinkJournal arg_link_journal = LINK_AUTO;
74 static uint64_t arg_retain =
75         (1ULL << CAP_CHOWN) |
76         (1ULL << CAP_DAC_OVERRIDE) |
77         (1ULL << CAP_DAC_READ_SEARCH) |
78         (1ULL << CAP_FOWNER) |
79         (1ULL << CAP_FSETID) |
80         (1ULL << CAP_IPC_OWNER) |
81         (1ULL << CAP_KILL) |
82         (1ULL << CAP_LEASE) |
83         (1ULL << CAP_LINUX_IMMUTABLE) |
84         (1ULL << CAP_NET_BIND_SERVICE) |
85         (1ULL << CAP_NET_BROADCAST) |
86         (1ULL << CAP_NET_RAW) |
87         (1ULL << CAP_SETGID) |
88         (1ULL << CAP_SETFCAP) |
89         (1ULL << CAP_SETPCAP) |
90         (1ULL << CAP_SETUID) |
91         (1ULL << CAP_SYS_ADMIN) |
92         (1ULL << CAP_SYS_CHROOT) |
93         (1ULL << CAP_SYS_NICE) |
94         (1ULL << CAP_SYS_PTRACE) |
95         (1ULL << CAP_SYS_TTY_CONFIG) |
96         (1ULL << CAP_SYS_RESOURCE) |
97         (1ULL << CAP_SYS_BOOT);
98
/* Prints the usage summary to stdout, prefixed with the invoked
 * program name. Always returns 0.
 *
 * Keep the option descriptions in sync with parse_argv(): -j selects
 * the "guest" journal link mode there, so it is documented as such. */
static int help(void) {

        printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
               "Spawn a minimal namespace container for debugging, testing and building.\n\n"
               "  -h --help               Show this help\n"
               "  -D --directory=NAME     Root directory for the container\n"
               "  -b --boot               Boot up full system (i.e. invoke init)\n"
               "  -u --user=USER          Run the command under specified user or uid\n"
               "  -C --controllers=LIST   Put the container in specified comma-separated cgroup hierarchies\n"
               "     --uuid=UUID          Set a specific machine UUID for the container\n"
               "     --private-network    Disable network in container\n"
               "     --read-only          Mount the root directory read-only\n"
               "     --capability=CAP     In addition to the default, retain specified capability\n"
               "     --link-journal=MODE  Link up guest journal, one of no, auto, guest, host\n"
               "  -j                      Equivalent to --link-journal=guest\n",
               program_invocation_short_name);

        return 0;
}
118
119 static int parse_argv(int argc, char *argv[]) {
120
121         enum {
122                 ARG_PRIVATE_NETWORK = 0x100,
123                 ARG_UUID,
124                 ARG_READ_ONLY,
125                 ARG_CAPABILITY,
126                 ARG_LINK_JOURNAL
127         };
128
129         static const struct option options[] = {
130                 { "help",            no_argument,       NULL, 'h'                 },
131                 { "directory",       required_argument, NULL, 'D'                 },
132                 { "user",            required_argument, NULL, 'u'                 },
133                 { "controllers",     required_argument, NULL, 'C'                 },
134                 { "private-network", no_argument,       NULL, ARG_PRIVATE_NETWORK },
135                 { "boot",            no_argument,       NULL, 'b'                 },
136                 { "uuid",            required_argument, NULL, ARG_UUID            },
137                 { "read-only",       no_argument,       NULL, ARG_READ_ONLY       },
138                 { "capability",      required_argument, NULL, ARG_CAPABILITY      },
139                 { "link-journal",    required_argument, NULL, ARG_LINK_JOURNAL    },
140                 { NULL,              0,                 NULL, 0                   }
141         };
142
143         int c;
144
145         assert(argc >= 0);
146         assert(argv);
147
148         while ((c = getopt_long(argc, argv, "+hD:u:C:bj", options, NULL)) >= 0) {
149
150                 switch (c) {
151
152                 case 'h':
153                         help();
154                         return 0;
155
156                 case 'D':
157                         free(arg_directory);
158                         arg_directory = canonicalize_file_name(optarg);
159                         if (!arg_directory) {
160                                 log_error("Failed to canonicalize root directory.");
161                                 return -ENOMEM;
162                         }
163
164                         break;
165
166                 case 'u':
167                         free(arg_user);
168                         if (!(arg_user = strdup(optarg))) {
169                                 log_error("Failed to duplicate user name.");
170                                 return -ENOMEM;
171                         }
172
173                         break;
174
175                 case 'C':
176                         strv_free(arg_controllers);
177                         arg_controllers = strv_split(optarg, ",");
178                         if (!arg_controllers) {
179                                 log_error("Failed to split controllers list.");
180                                 return -ENOMEM;
181                         }
182                         strv_uniq(arg_controllers);
183
184                         break;
185
186                 case ARG_PRIVATE_NETWORK:
187                         arg_private_network = true;
188                         break;
189
190                 case 'b':
191                         arg_boot = true;
192                         break;
193
194                 case ARG_UUID:
195                         arg_uuid = optarg;
196                         break;
197
198                 case ARG_READ_ONLY:
199                         arg_read_only = true;
200                         break;
201
202                 case ARG_CAPABILITY: {
203                         char *state, *word;
204                         size_t length;
205
206                         FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
207                                 cap_value_t cap;
208                                 char *t;
209
210                                 t = strndup(word, length);
211                                 if (!t)
212                                         return log_oom();
213
214                                 if (cap_from_name(t, &cap) < 0) {
215                                         log_error("Failed to parse capability %s.", t);
216                                         free(t);
217                                         return -EINVAL;
218                                 }
219
220                                 free(t);
221                                 arg_retain |= 1ULL << (uint64_t) cap;
222                         }
223
224                         break;
225                 }
226
227                 case 'j':
228                         arg_link_journal = LINK_GUEST;
229                         break;
230
231                 case ARG_LINK_JOURNAL:
232                         if (streq(optarg, "auto"))
233                                 arg_link_journal = LINK_AUTO;
234                         else if (streq(optarg, "no"))
235                                 arg_link_journal = LINK_NO;
236                         else if (streq(optarg, "guest"))
237                                 arg_link_journal = LINK_GUEST;
238                         else if (streq(optarg, "host"))
239                                 arg_link_journal = LINK_HOST;
240                         else {
241                                 log_error("Failed to parse link journal mode %s", optarg);
242                                 return -EINVAL;
243                         }
244
245                         break;
246
247                 case '?':
248                         return -EINVAL;
249
250                 default:
251                         log_error("Unknown option code %c", c);
252                         return -EINVAL;
253                 }
254         }
255
256         return 1;
257 }
258
259 static int mount_all(const char *dest) {
260
261         typedef struct MountPoint {
262                 const char *what;
263                 const char *where;
264                 const char *type;
265                 const char *options;
266                 unsigned long flags;
267                 bool fatal;
268         } MountPoint;
269
270         static const MountPoint mount_table[] = {
271                 { "proc",      "/proc",     "proc",  NULL,       MS_NOSUID|MS_NOEXEC|MS_NODEV, true  },
272                 { "/proc/sys", "/proc/sys", NULL,    NULL,       MS_BIND, true                       },   /* Bind mount first */
273                 { NULL,        "/proc/sys", NULL,    NULL,       MS_BIND|MS_RDONLY|MS_REMOUNT, true  },   /* Then, make it r/o */
274                 { "sysfs",     "/sys",      "sysfs", NULL,       MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true  },
275                 { "tmpfs",     "/dev",      "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME,     true  },
276                 { "/dev/pts",  "/dev/pts",  NULL,    NULL,       MS_BIND,                      true  },
277                 { "tmpfs",     "/dev/shm",  "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true  },
278                 { "tmpfs",     "/run",      "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true  },
279 #ifdef HAVE_SELINUX
280                 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND,                      false },  /* Bind mount first */
281                 { NULL,              "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, false },  /* Then, make it r/o */
282 #endif
283         };
284
285         unsigned k;
286         int r = 0;
287         char _cleanup_free_ *where = NULL;
288
289         for (k = 0; k < ELEMENTSOF(mount_table); k++) {
290                 int t;
291
292                 if (asprintf(&where, "%s/%s", dest, mount_table[k].where) < 0) {
293                         log_oom();
294
295                         if (r == 0)
296                                 r = -ENOMEM;
297
298                         break;
299                 }
300
301                 t = path_is_mount_point(where, true);
302                 if (t < 0) {
303                         log_error("Failed to detect whether %s is a mount point: %s", where, strerror(-t));
304
305                         if (r == 0)
306                                 r = t;
307
308                         continue;
309                 }
310
311                 /* Skip this entry if it is not a remount. */
312                 if (mount_table[k].what && t > 0)
313                         continue;
314
315                 mkdir_p_label(where, 0755);
316
317                 if (mount(mount_table[k].what,
318                           where,
319                           mount_table[k].type,
320                           mount_table[k].flags,
321                           mount_table[k].options) < 0 &&
322                     mount_table[k].fatal) {
323
324                         log_error("mount(%s) failed: %m", where);
325
326                         if (r == 0)
327                                 r = -errno;
328                 }
329         }
330
331         return r;
332 }
333
/* Makes the container use the host's timezone by bind mounting the
 * host's timezone files read-only over the same paths below 'dest'.
 * Mount failures are ignored (best effort); only running out of memory
 * is reported. Returns 0, or the result of log_oom(). */
static int setup_timezone(const char *dest) {
        static const char *const host_files[] = {
                "/etc/localtime",
                "/etc/timezone",
                NULL
        };
        const char *const *f;

        assert(dest);

        /* Fix the timezone, if possible */
        for (f = host_files; *f; f++) {
                char *target;

                target = strappend(dest, *f);
                if (!target)
                        return log_oom();

                /* Bind mount first, then remount to make it read-only. */
                if (mount(*f, target, "bind", MS_BIND, NULL) >= 0)
                        mount(*f, target, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);

                free(target);
        }

        return 0;
}
360
361 static int setup_resolv_conf(const char *dest) {
362         char *where;
363
364         assert(dest);
365
366         if (arg_private_network)
367                 return 0;
368
369         /* Fix resolv.conf, if possible */
370         where = strappend(dest, "/etc/resolv.conf");
371         if (!where)
372                 return log_oom();
373
374         if (mount("/etc/resolv.conf", where, "bind", MS_BIND, NULL) >= 0)
375                 mount("/etc/resolv.conf", where, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);
376
377         free(where);
378
379         return 0;
380 }
381
382 static int setup_boot_id(const char *dest) {
383         char *from = NULL, *to = NULL;
384         sd_id128_t rnd;
385         char as_uuid[37];
386         int r;
387
388         assert(dest);
389
390         /* Generate a new randomized boot ID, so that each boot-up of
391          * the container gets a new one */
392
393         from = strappend(dest, "/dev/proc-sys-kernel-random-boot-id");
394         if (!from) {
395                 r = log_oom();
396                 goto finish;
397         }
398
399         to = strappend(dest, "/proc/sys/kernel/random/boot_id");
400         if (!to) {
401                 r = log_oom();
402                 goto finish;
403         }
404
405         r = sd_id128_randomize(&rnd);
406         if (r < 0) {
407                 log_error("Failed to generate random boot id: %s", strerror(-r));
408                 goto finish;
409         }
410
411         snprintf(as_uuid, sizeof(as_uuid),
412                  "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
413                  SD_ID128_FORMAT_VAL(rnd));
414         char_array_0(as_uuid);
415
416         r = write_one_line_file(from, as_uuid);
417         if (r < 0) {
418                 log_error("Failed to write boot id: %s", strerror(-r));
419                 goto finish;
420         }
421
422         if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
423                 log_error("Failed to bind mount boot id: %m");
424                 r = -errno;
425         } else
426                 mount(from, to, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);
427
428         unlink(from);
429
430 finish:
431         free(from);
432         free(to);
433
434         return r;
435 }
436
437 static int copy_devnodes(const char *dest) {
438
439         static const char devnodes[] =
440                 "null\0"
441                 "zero\0"
442                 "full\0"
443                 "random\0"
444                 "urandom\0"
445                 "tty\0"
446                 "ptmx\0";
447
448         const char *d;
449         int r = 0;
450         mode_t u;
451
452         assert(dest);
453
454         u = umask(0000);
455
456         NULSTR_FOREACH(d, devnodes) {
457                 struct stat st;
458                 char *from = NULL, *to = NULL;
459
460                 asprintf(&from, "/dev/%s", d);
461                 asprintf(&to, "%s/dev/%s", dest, d);
462
463                 if (!from || !to) {
464                         log_error("Failed to allocate devnode path");
465
466                         free(from);
467                         free(to);
468
469                         from = to = NULL;
470
471                         if (r == 0)
472                                 r = -ENOMEM;
473
474                         break;
475                 }
476
477                 if (stat(from, &st) < 0) {
478
479                         if (errno != ENOENT) {
480                                 log_error("Failed to stat %s: %m", from);
481                                 if (r == 0)
482                                         r = -errno;
483                         }
484
485                 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
486
487                         log_error("%s is not a char or block device, cannot copy.", from);
488                         if (r == 0)
489                                 r = -EIO;
490
491                 } else if (mknod(to, st.st_mode, st.st_rdev) < 0) {
492
493                         log_error("mknod(%s) failed: %m", dest);
494                         if (r == 0)
495                                 r = -errno;
496                 }
497
498                 free(from);
499                 free(to);
500         }
501
502         umask(u);
503
504         return r;
505 }
506
/* Makes the TTY 'console' available as <dest>/dev/console.
 *
 * The TTY must be a character device. Its access mode and ownership
 * are reset to 0600 root:root, a device node is created at
 * <dest>/dev/console, and the TTY is bind mounted on top of it. The
 * umask is cleared for the duration of the call.
 *
 * Returns a non-negative value on success, negative errno-style value
 * on failure. */
static int setup_dev_console(const char *dest, const char *console) {
        struct stat st;
        char *to = NULL;
        int r;
        mode_t u;

        assert(dest);
        assert(console);

        u = umask(0000);

        if (stat(console, &st) < 0) {
                /* Take the error code before logging, which may
                 * overwrite errno. */
                r = -errno;
                log_error("Failed to stat %s: %m", console);
                goto finish;

        } else if (!S_ISCHR(st.st_mode)) {
                log_error("/dev/console is not a char device.");
                r = -EIO;
                goto finish;
        }

        r = chmod_and_chown(console, 0600, 0, 0);
        if (r < 0) {
                log_error("Failed to correct access mode for TTY: %s", strerror(-r));
                goto finish;
        }

        if (asprintf(&to, "%s/dev/console", dest) < 0) {
                to = NULL;
                r = log_oom();
                goto finish;
        }

        /* We need to bind mount the right tty to /dev/console since
         * ptys can only exist on pts file systems. To have something
         * to bind mount things on we create a device node first, that
         * has the right major/minor (note that the major minor
         * doesn't actually matter here, since we mount it over
         * anyway). */

        if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0) {
                r = -errno;
                log_error("mknod() for /dev/console failed: %m");
                goto finish;
        }

        if (mount(console, to, "bind", MS_BIND, NULL) < 0) {
                r = -errno;
                log_error("Bind mount for /dev/console failed: %m");
                goto finish;
        }

finish:
        free(to);
        umask(u);

        return r;
}
565
566 static int setup_kmsg(const char *dest, int kmsg_socket) {
567         char *from = NULL, *to = NULL;
568         int r, fd, k;
569         mode_t u;
570         union {
571                 struct cmsghdr cmsghdr;
572                 uint8_t buf[CMSG_SPACE(sizeof(int))];
573         } control;
574         struct msghdr mh;
575         struct cmsghdr *cmsg;
576
577         assert(dest);
578         assert(kmsg_socket >= 0);
579
580         u = umask(0000);
581
582         /* We create the kmsg FIFO as /dev/kmsg, but immediately
583          * delete it after bind mounting it to /proc/kmsg. While FIFOs
584          * on the reading side behave very similar to /proc/kmsg,
585          * their writing side behaves differently from /dev/kmsg in
586          * that writing blocks when nothing is reading. In order to
587          * avoid any problems with containers deadlocking due to this
588          * we simply make /dev/kmsg unavailable to the container. */
589         if (asprintf(&from, "%s/dev/kmsg", dest) < 0) {
590                 r = log_oom();
591                 goto finish;
592         }
593
594         if (asprintf(&to, "%s/proc/kmsg", dest) < 0) {
595                 r = log_oom();
596                 goto finish;
597         }
598
599         if (mkfifo(from, 0600) < 0) {
600                 log_error("mkfifo() for /dev/kmsg failed: %m");
601                 r = -errno;
602                 goto finish;
603         }
604
605         r = chmod_and_chown(from, 0600, 0, 0);
606         if (r < 0) {
607                 log_error("Failed to correct access mode for /dev/kmsg: %s", strerror(-r));
608                 goto finish;
609         }
610
611         if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
612                 log_error("Bind mount for /proc/kmsg failed: %m");
613                 r = -errno;
614                 goto finish;
615         }
616
617         fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
618         if (fd < 0) {
619                 log_error("Failed to open fifo: %m");
620                 r = -errno;
621                 goto finish;
622         }
623
624         zero(mh);
625         zero(control);
626
627         mh.msg_control = &control;
628         mh.msg_controllen = sizeof(control);
629
630         cmsg = CMSG_FIRSTHDR(&mh);
631         cmsg->cmsg_level = SOL_SOCKET;
632         cmsg->cmsg_type = SCM_RIGHTS;
633         cmsg->cmsg_len = CMSG_LEN(sizeof(int));
634         memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
635
636         mh.msg_controllen = cmsg->cmsg_len;
637
638         /* Store away the fd in the socket, so that it stays open as
639          * long as we run the child */
640         k = sendmsg(kmsg_socket, &mh, MSG_DONTWAIT|MSG_NOSIGNAL);
641         close_nointr_nofail(fd);
642
643         if (k < 0) {
644                 log_error("Failed to send FIFO fd: %m");
645                 r = -errno;
646                 goto finish;
647         }
648
649         /* And now make the FIFO unavailable as /dev/kmsg... */
650         unlink(from);
651
652 finish:
653         free(from);
654         free(to);
655         umask(u);
656
657         return r;
658 }
659
660 static int setup_hostname(void) {
661         char *hn;
662         int r = 0;
663
664         hn = path_get_file_name(arg_directory);
665         if (hn) {
666                 hn = strdup(hn);
667                 if (!hn)
668                         return -ENOMEM;
669
670                 hostname_cleanup(hn);
671
672                 if (!isempty(hn))
673                         if (sethostname(hn, strlen(hn)) < 0)
674                                 r = -errno;
675
676                 free(hn);
677         }
678
679         return r;
680 }
681
682 static int setup_journal(const char *directory) {
683         sd_id128_t machine_id;
684         char *p = NULL, *b = NULL, *l, *q = NULL, *d = NULL;
685         int r;
686
687         if (arg_link_journal == LINK_NO)
688                 return 0;
689
690         p = strappend(directory, "/etc/machine-id");
691         if (!p) {
692                 r = log_oom();
693                 goto finish;
694         }
695
696         r = read_one_line_file(p, &b);
697         if (r == -ENOENT && arg_link_journal == LINK_AUTO) {
698                 r = 0;
699                 goto finish;
700         } else if (r < 0) {
701                 log_error("Failed to read machine ID: %s", strerror(-r));
702                 return r;
703         }
704
705         l = strstrip(b);
706         if (isempty(l) && arg_link_journal == LINK_AUTO) {
707                 r = 0;
708                 goto finish;
709         }
710
711         /* Verify validaty */
712         r = sd_id128_from_string(l, &machine_id);
713         if (r < 0) {
714                 log_error("Failed to parse machine ID: %s", strerror(-r));
715                 goto finish;
716         }
717
718         free(p);
719         p = strappend("/var/log/journal/", l);
720         q = strjoin(directory, "/var/log/journal/", l, NULL);
721         if (!p || !q) {
722                 r = log_oom();
723                 goto finish;
724         }
725
726         if (path_is_mount_point(p, false) > 0 ||
727             path_is_mount_point(q, false) > 0) {
728                 if (arg_link_journal != LINK_AUTO) {
729                         log_error("Journal already a mount point, refusing.");
730                         r = -EEXIST;
731                         goto finish;
732                 }
733
734                 r = 0;
735                 goto finish;
736         }
737
738         r = readlink_and_make_absolute(p, &d);
739         if (r >= 0) {
740                 if ((arg_link_journal == LINK_GUEST ||
741                      arg_link_journal == LINK_AUTO) &&
742                     path_equal(d, q)) {
743
744                         mkdir_p(q, 0755);
745
746                         r = 0;
747                         goto finish;
748                 }
749
750                 if (unlink(p) < 0) {
751                         log_error("Failed to remove symlink %s: %m", p);
752                         r = -errno;
753                         goto finish;
754                 }
755         } else if (r == -EINVAL) {
756
757                 if (arg_link_journal == LINK_GUEST &&
758                     rmdir(p) < 0) {
759
760                         if (errno == ENOTDIR)
761                                 log_error("%s already exists and is neither symlink nor directory.", p);
762                         else {
763                                 log_error("Failed to remove %s: %m", p);
764                                 r = -errno;
765                         }
766
767                         goto finish;
768                 }
769         } else if (r != -ENOENT) {
770                 log_error("readlink(%s) failed: %m", p);
771                 goto finish;
772         }
773
774         if (arg_link_journal == LINK_GUEST) {
775
776                 if (symlink(q, p) < 0) {
777                         log_error("Failed to symlink %s to %s: %m", q, p);
778                         r = -errno;
779                         goto finish;
780                 }
781
782                 mkdir_p(q, 0755);
783
784                 r = 0;
785                 goto finish;
786         }
787
788         if (arg_link_journal == LINK_HOST) {
789                 r = mkdir_p(p, 0755);
790                 if (r < 0) {
791                         log_error("Failed to create %s: %m", p);
792                         goto finish;
793                 }
794
795         } else if (access(p, F_OK) < 0) {
796                 r = 0;
797                 goto finish;
798         }
799
800         if (dir_is_empty(q) == 0) {
801                 log_error("%s not empty.", q);
802                 r = -ENOTEMPTY;
803                 goto finish;
804         }
805
806         r = mkdir_p(q, 0755);
807         if (r < 0) {
808                 log_error("Failed to create %s: %m", q);
809                 goto finish;
810         }
811
812         if (mount(p, q, "bind", MS_BIND, NULL) < 0) {
813                 log_error("Failed to bind mount journal from host into guest: %m");
814                 r = -errno;
815                 goto finish;
816         }
817
818         r = 0;
819
820 finish:
821         free(p);
822         free(q);
823         free(d);
824         free(b);
825         return r;
826
827 }
828
829 static int drop_capabilities(void) {
830         return capability_bounding_set_drop(~arg_retain, false);
831 }
832
/* Checks whether 'path' looks like the root of an OS tree, using the
 * presence of <path>/bin/sh as the indicator.
 *
 * Returns 1 if the file is accessible, 0 if it is not, and -ENOMEM if
 * the path to probe cannot be allocated. */
static int is_os_tree(const char *path) {
        char *shell;
        int found;

        /* We use /bin/sh as flag file if something is an OS */
        if (asprintf(&shell, "%s/bin/sh", path) < 0)
                return -ENOMEM;

        found = access(shell, F_OK) >= 0;
        free(shell);

        return found ? 1 : 0;
}
846
847 static int process_pty(int master, sigset_t *mask) {
848
849         char in_buffer[LINE_MAX], out_buffer[LINE_MAX];
850         size_t in_buffer_full = 0, out_buffer_full = 0;
851         struct epoll_event stdin_ev, stdout_ev, master_ev, signal_ev;
852         bool stdin_readable = false, stdout_writable = false, master_readable = false, master_writable = false;
853         int ep = -1, signal_fd = -1, r;
854
855         fd_nonblock(STDIN_FILENO, 1);
856         fd_nonblock(STDOUT_FILENO, 1);
857         fd_nonblock(master, 1);
858
859         signal_fd = signalfd(-1, mask, SFD_NONBLOCK|SFD_CLOEXEC);
860         if (signal_fd < 0) {
861                 log_error("signalfd(): %m");
862                 r = -errno;
863                 goto finish;
864         }
865
866         ep = epoll_create1(EPOLL_CLOEXEC);
867         if (ep < 0) {
868                 log_error("Failed to create epoll: %m");
869                 r = -errno;
870                 goto finish;
871         }
872
873         zero(stdin_ev);
874         stdin_ev.events = EPOLLIN|EPOLLET;
875         stdin_ev.data.fd = STDIN_FILENO;
876
877         zero(stdout_ev);
878         stdout_ev.events = EPOLLOUT|EPOLLET;
879         stdout_ev.data.fd = STDOUT_FILENO;
880
881         zero(master_ev);
882         master_ev.events = EPOLLIN|EPOLLOUT|EPOLLET;
883         master_ev.data.fd = master;
884
885         zero(signal_ev);
886         signal_ev.events = EPOLLIN;
887         signal_ev.data.fd = signal_fd;
888
889         if (epoll_ctl(ep, EPOLL_CTL_ADD, STDIN_FILENO, &stdin_ev) < 0 ||
890             epoll_ctl(ep, EPOLL_CTL_ADD, STDOUT_FILENO, &stdout_ev) < 0 ||
891             epoll_ctl(ep, EPOLL_CTL_ADD, master, &master_ev) < 0 ||
892             epoll_ctl(ep, EPOLL_CTL_ADD, signal_fd, &signal_ev) < 0) {
893                 log_error("Failed to regiser fds in epoll: %m");
894                 r = -errno;
895                 goto finish;
896         }
897
898         for (;;) {
899                 struct epoll_event ev[16];
900                 ssize_t k;
901                 int i, nfds;
902
903                 nfds = epoll_wait(ep, ev, ELEMENTSOF(ev), -1);
904                 if (nfds < 0) {
905
906                         if (errno == EINTR || errno == EAGAIN)
907                                 continue;
908
909                         log_error("epoll_wait(): %m");
910                         r = -errno;
911                         goto finish;
912                 }
913
914                 assert(nfds >= 1);
915
916                 for (i = 0; i < nfds; i++) {
917                         if (ev[i].data.fd == STDIN_FILENO) {
918
919                                 if (ev[i].events & (EPOLLIN|EPOLLHUP))
920                                         stdin_readable = true;
921
922                         } else if (ev[i].data.fd == STDOUT_FILENO) {
923
924                                 if (ev[i].events & (EPOLLOUT|EPOLLHUP))
925                                         stdout_writable = true;
926
927                         } else if (ev[i].data.fd == master) {
928
929                                 if (ev[i].events & (EPOLLIN|EPOLLHUP))
930                                         master_readable = true;
931
932                                 if (ev[i].events & (EPOLLOUT|EPOLLHUP))
933                                         master_writable = true;
934
935                         } else if (ev[i].data.fd == signal_fd) {
936                                 struct signalfd_siginfo sfsi;
937                                 ssize_t n;
938
939                                 n = read(signal_fd, &sfsi, sizeof(sfsi));
940                                 if (n != sizeof(sfsi)) {
941
942                                         if (n >= 0) {
943                                                 log_error("Failed to read from signalfd: invalid block size");
944                                                 r = -EIO;
945                                                 goto finish;
946                                         }
947
948                                         if (errno != EINTR && errno != EAGAIN) {
949                                                 log_error("Failed to read from signalfd: %m");
950                                                 r = -errno;
951                                                 goto finish;
952                                         }
953                                 } else {
954
955                                         if (sfsi.ssi_signo == SIGWINCH) {
956                                                 struct winsize ws;
957
958                                                 /* The window size changed, let's forward that. */
959                                                 if (ioctl(STDIN_FILENO, TIOCGWINSZ, &ws) >= 0)
960                                                         ioctl(master, TIOCSWINSZ, &ws);
961                                         } else {
962                                                 r = 0;
963                                                 goto finish;
964                                         }
965                                 }
966                         }
967                 }
968
969                 while ((stdin_readable && in_buffer_full <= 0) ||
970                        (master_writable && in_buffer_full > 0) ||
971                        (master_readable && out_buffer_full <= 0) ||
972                        (stdout_writable && out_buffer_full > 0)) {
973
974                         if (stdin_readable && in_buffer_full < LINE_MAX) {
975
976                                 k = read(STDIN_FILENO, in_buffer + in_buffer_full, LINE_MAX - in_buffer_full);
977                                 if (k < 0) {
978
979                                         if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
980                                                 stdin_readable = false;
981                                         else {
982                                                 log_error("read(): %m");
983                                                 r = -errno;
984                                                 goto finish;
985                                         }
986                                 } else
987                                         in_buffer_full += (size_t) k;
988                         }
989
990                         if (master_writable && in_buffer_full > 0) {
991
992                                 k = write(master, in_buffer, in_buffer_full);
993                                 if (k < 0) {
994
995                                         if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
996                                                 master_writable = false;
997                                         else {
998                                                 log_error("write(): %m");
999                                                 r = -errno;
1000                                                 goto finish;
1001                                         }
1002
1003                                 } else {
1004                                         assert(in_buffer_full >= (size_t) k);
1005                                         memmove(in_buffer, in_buffer + k, in_buffer_full - k);
1006                                         in_buffer_full -= k;
1007                                 }
1008                         }
1009
1010                         if (master_readable && out_buffer_full < LINE_MAX) {
1011
1012                                 k = read(master, out_buffer + out_buffer_full, LINE_MAX - out_buffer_full);
1013                                 if (k < 0) {
1014
1015                                         if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
1016                                                 master_readable = false;
1017                                         else {
1018                                                 log_error("read(): %m");
1019                                                 r = -errno;
1020                                                 goto finish;
1021                                         }
1022                                 }  else
1023                                         out_buffer_full += (size_t) k;
1024                         }
1025
1026                         if (stdout_writable && out_buffer_full > 0) {
1027
1028                                 k = write(STDOUT_FILENO, out_buffer, out_buffer_full);
1029                                 if (k < 0) {
1030
1031                                         if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
1032                                                 stdout_writable = false;
1033                                         else {
1034                                                 log_error("write(): %m");
1035                                                 r = -errno;
1036                                                 goto finish;
1037                                         }
1038
1039                                 } else {
1040                                         assert(out_buffer_full >= (size_t) k);
1041                                         memmove(out_buffer, out_buffer + k, out_buffer_full - k);
1042                                         out_buffer_full -= k;
1043                                 }
1044                         }
1045                 }
1046         }
1047
1048 finish:
1049         if (ep >= 0)
1050                 close_nointr_nofail(ep);
1051
1052         if (signal_fd >= 0)
1053                 close_nointr_nofail(signal_fd);
1054
1055         return r;
1056 }
1057
1058 int main(int argc, char *argv[]) {
1059         pid_t pid = 0;
1060         int r = EXIT_FAILURE, k;
1061         char *oldcg = NULL, *newcg = NULL;
1062         char **controller = NULL;
1063         int master = -1;
1064         const char *console = NULL;
1065         struct termios saved_attr, raw_attr;
1066         sigset_t mask;
1067         bool saved_attr_valid = false;
1068         struct winsize ws;
1069         int kmsg_socket_pair[2] = { -1, -1 };
1070
1071         log_parse_environment();
1072         log_open();
1073
1074         r = parse_argv(argc, argv);
1075         if (r <= 0)
1076                 goto finish;
1077
1078         if (arg_directory) {
1079                 char *p;
1080
1081                 p = path_make_absolute_cwd(arg_directory);
1082                 free(arg_directory);
1083                 arg_directory = p;
1084         } else
1085                 arg_directory = get_current_dir_name();
1086
1087         if (!arg_directory) {
1088                 log_error("Failed to determine path");
1089                 goto finish;
1090         }
1091
1092         path_kill_slashes(arg_directory);
1093
1094         if (geteuid() != 0) {
1095                 log_error("Need to be root.");
1096                 goto finish;
1097         }
1098
1099         if (sd_booted() <= 0) {
1100                 log_error("Not running on a systemd system.");
1101                 goto finish;
1102         }
1103
1104         if (path_equal(arg_directory, "/")) {
1105                 log_error("Spawning container on root directory not supported.");
1106                 goto finish;
1107         }
1108
1109         if (is_os_tree(arg_directory) <= 0) {
1110                 log_error("Directory %s doesn't look like an OS root directory. Refusing.", arg_directory);
1111                 goto finish;
1112         }
1113
1114         k = cg_get_by_pid(SYSTEMD_CGROUP_CONTROLLER, 0, &oldcg);
1115         if (k < 0) {
1116                 log_error("Failed to determine current cgroup: %s", strerror(-k));
1117                 goto finish;
1118         }
1119
1120         if (asprintf(&newcg, "%s/nspawn-%lu", oldcg, (unsigned long) getpid()) < 0) {
1121                 log_error("Failed to allocate cgroup path.");
1122                 goto finish;
1123         }
1124
1125         k = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, newcg, 0);
1126         if (k < 0)  {
1127                 log_error("Failed to create cgroup: %s", strerror(-k));
1128                 goto finish;
1129         }
1130
1131         STRV_FOREACH(controller, arg_controllers) {
1132                 k = cg_create_and_attach(*controller, newcg, 0);
1133                 if (k < 0)
1134                         log_warning("Failed to create cgroup in controller %s: %s", *controller, strerror(-k));
1135         }
1136
1137         master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
1138         if (master < 0) {
1139                 log_error("Failed to acquire pseudo tty: %m");
1140                 goto finish;
1141         }
1142
1143         console = ptsname(master);
1144         if (!console) {
1145                 log_error("Failed to determine tty name: %m");
1146                 goto finish;
1147         }
1148
1149         log_info("Spawning namespace container on %s (console is %s).", arg_directory, console);
1150
1151         if (ioctl(STDIN_FILENO, TIOCGWINSZ, &ws) >= 0)
1152                 ioctl(master, TIOCSWINSZ, &ws);
1153
1154         if (unlockpt(master) < 0) {
1155                 log_error("Failed to unlock tty: %m");
1156                 goto finish;
1157         }
1158
1159         if (tcgetattr(STDIN_FILENO, &saved_attr) < 0) {
1160                 log_error("Failed to get terminal attributes: %m");
1161                 goto finish;
1162         }
1163
1164         saved_attr_valid = true;
1165
1166         raw_attr = saved_attr;
1167         cfmakeraw(&raw_attr);
1168         raw_attr.c_lflag &= ~ECHO;
1169
1170         if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_NONBLOCK|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
1171                 log_error("Failed to create kmsg socket pair");
1172                 goto finish;
1173         }
1174
1175         assert_se(sigemptyset(&mask) == 0);
1176         sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
1177         assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
1178
1179         for (;;) {
1180                 siginfo_t status;
1181
1182                 if (tcsetattr(STDIN_FILENO, TCSANOW, &raw_attr) < 0) {
1183                         log_error("Failed to set terminal attributes: %m");
1184                         goto finish;
1185                 }
1186
1187                 pid = syscall(__NR_clone, SIGCHLD|CLONE_NEWIPC|CLONE_NEWNS|CLONE_NEWPID|CLONE_NEWUTS|(arg_private_network ? CLONE_NEWNET : 0), NULL);
1188                 if (pid < 0) {
1189                         if (errno == EINVAL)
1190                                 log_error("clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
1191                         else
1192                                 log_error("clone() failed: %m");
1193
1194                         goto finish;
1195                 }
1196
1197                 if (pid == 0) {
1198                         /* child */
1199
1200                         const char *home = NULL;
1201                         uid_t uid = (uid_t) -1;
1202                         gid_t gid = (gid_t) -1;
1203                         const char *envp[] = {
1204                                 "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
1205                                 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
1206                                 NULL, /* TERM */
1207                                 NULL, /* HOME */
1208                                 NULL, /* USER */
1209                                 NULL, /* LOGNAME */
1210                                 NULL, /* container_uuid */
1211                                 NULL
1212                         };
1213
1214                         envp[2] = strv_find_prefix(environ, "TERM=");
1215
1216                         close_nointr_nofail(master);
1217
1218                         close_nointr(STDIN_FILENO);
1219                         close_nointr(STDOUT_FILENO);
1220                         close_nointr(STDERR_FILENO);
1221
1222                         close_all_fds(&kmsg_socket_pair[1], 1);
1223
1224                         reset_all_signal_handlers();
1225
1226                         assert_se(sigemptyset(&mask) == 0);
1227                         assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
1228
1229                         if (open_terminal(console, O_RDWR) != STDIN_FILENO ||
1230                             dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
1231                             dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO)
1232                                 goto child_fail;
1233
1234                         if (setsid() < 0) {
1235                                 log_error("setsid() failed: %m");
1236                                 goto child_fail;
1237                         }
1238
1239                         if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
1240                                 log_error("PR_SET_PDEATHSIG failed: %m");
1241                                 goto child_fail;
1242                         }
1243
1244                         /* Mark everything as slave, so that we still
1245                          * receive mounts from the real root, but don't
1246                          * propagate mounts to the real root. */
1247                         if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
1248                                 log_error("MS_SLAVE|MS_REC failed: %m");
1249                                 goto child_fail;
1250                         }
1251
1252                         /* Turn directory into bind mount */
1253                         if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REC, NULL) < 0) {
1254                                 log_error("Failed to make bind mount.");
1255                                 goto child_fail;
1256                         }
1257
1258                         if (arg_read_only)
1259                                 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY|MS_REC, NULL) < 0) {
1260                                         log_error("Failed to make read-only.");
1261                                         goto child_fail;
1262                                 }
1263
1264                         if (mount_all(arg_directory) < 0)
1265                                 goto child_fail;
1266
1267                         if (copy_devnodes(arg_directory) < 0)
1268                                 goto child_fail;
1269
1270                         dev_setup(arg_directory);
1271
1272                         if (setup_dev_console(arg_directory, console) < 0)
1273                                 goto child_fail;
1274
1275                         if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
1276                                 goto child_fail;
1277
1278                         close_nointr_nofail(kmsg_socket_pair[1]);
1279
1280                         if (setup_boot_id(arg_directory) < 0)
1281                                 goto child_fail;
1282
1283                         if (setup_timezone(arg_directory) < 0)
1284                                 goto child_fail;
1285
1286                         if (setup_resolv_conf(arg_directory) < 0)
1287                                 goto child_fail;
1288
1289                         if (setup_journal(arg_directory) < 0)
1290                                 goto child_fail;
1291
1292                         if (chdir(arg_directory) < 0) {
1293                                 log_error("chdir(%s) failed: %m", arg_directory);
1294                                 goto child_fail;
1295                         }
1296
1297                         if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) {
1298                                 log_error("mount(MS_MOVE) failed: %m");
1299                                 goto child_fail;
1300                         }
1301
1302                         if (chroot(".") < 0) {
1303                                 log_error("chroot() failed: %m");
1304                                 goto child_fail;
1305                         }
1306
1307                         if (chdir("/") < 0) {
1308                                 log_error("chdir() failed: %m");
1309                                 goto child_fail;
1310                         }
1311
1312                         umask(0022);
1313
1314                         loopback_setup();
1315
1316                         if (drop_capabilities() < 0) {
1317                                 log_error("drop_capabilities() failed: %m");
1318                                 goto child_fail;
1319                         }
1320
1321                         if (arg_user) {
1322
1323                                 if (get_user_creds((const char**)&arg_user, &uid, &gid, &home, NULL) < 0) {
1324                                         log_error("get_user_creds() failed: %m");
1325                                         goto child_fail;
1326                                 }
1327
1328                                 if (mkdir_parents_label(home, 0775) < 0) {
1329                                         log_error("mkdir_parents_label() failed: %m");
1330                                         goto child_fail;
1331                                 }
1332
1333                                 if (mkdir_safe_label(home, 0775, uid, gid) < 0) {
1334                                         log_error("mkdir_safe_label() failed: %m");
1335                                         goto child_fail;
1336                                 }
1337
1338                                 if (initgroups((const char*)arg_user, gid) < 0) {
1339                                         log_error("initgroups() failed: %m");
1340                                         goto child_fail;
1341                                 }
1342
1343                                 if (setresgid(gid, gid, gid) < 0) {
1344                                         log_error("setregid() failed: %m");
1345                                         goto child_fail;
1346                                 }
1347
1348                                 if (setresuid(uid, uid, uid) < 0) {
1349                                         log_error("setreuid() failed: %m");
1350                                         goto child_fail;
1351                                 }
1352                         }
1353
1354                         if ((asprintf((char**)(envp + 3), "HOME=%s", home ? home: "/root") < 0) ||
1355                             (asprintf((char**)(envp + 4), "USER=%s", arg_user ? arg_user : "root") < 0) ||
1356                             (asprintf((char**)(envp + 5), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
1357                                 log_oom();
1358                                 goto child_fail;
1359                         }
1360
1361                         if (arg_uuid) {
1362                                 if (asprintf((char**)(envp + 6), "container_uuid=%s", arg_uuid) < 0) {
1363                                         log_oom();
1364                                         goto child_fail;
1365                                 }
1366                         }
1367
1368                         setup_hostname();
1369
1370                         if (arg_boot) {
1371                                 char **a;
1372                                 size_t l;
1373
1374                                 /* Automatically search for the init system */
1375
1376                                 l = 1 + argc - optind;
1377                                 a = newa(char*, l + 1);
1378                                 memcpy(a + 1, argv + optind, l * sizeof(char*));
1379
1380                                 a[0] = (char*) "/usr/lib/systemd/systemd";
1381                                 execve(a[0], a, (char**) envp);
1382
1383                                 a[0] = (char*) "/lib/systemd/systemd";
1384                                 execve(a[0], a, (char**) envp);
1385
1386                                 a[0] = (char*) "/sbin/init";
1387                                 execve(a[0], a, (char**) envp);
1388                         } else if (argc > optind)
1389                                 execvpe(argv[optind], argv + optind, (char**) envp);
1390                         else {
1391                                 chdir(home ? home : "/root");
1392                                 execle("/bin/bash", "-bash", NULL, (char**) envp);
1393                         }
1394
1395                         log_error("execv() failed: %m");
1396
1397                 child_fail:
1398                         _exit(EXIT_FAILURE);
1399                 }
1400
1401                 if (process_pty(master, &mask) < 0)
1402                         goto finish;
1403
1404
1405                 if (saved_attr_valid)
1406                         tcsetattr(STDIN_FILENO, TCSANOW, &saved_attr);
1407
1408                 r = wait_for_terminate(pid, &status);
1409                 if (r < 0) {
1410                         r = EXIT_FAILURE;
1411                         break;
1412                 }
1413
1414                 if (status.si_code == CLD_EXITED) {
1415                         if (status.si_status != 0) {
1416                                 log_error("Container failed with error code %i.", status.si_status);
1417                                 r = status.si_status;
1418                                 break;
1419                         }
1420
1421                         log_debug("Container exited successfully.");
1422                         break;
1423                 } else if (status.si_code == CLD_KILLED &&
1424                            status.si_status == SIGINT) {
1425                         log_info("Container has been shut down.");
1426                         r = 0;
1427                         break;
1428                 } else if (status.si_code == CLD_KILLED &&
1429                            status.si_status == SIGHUP) {
1430                         log_info("Container is being rebooted.");
1431                         continue;
1432                 } else if (status.si_code == CLD_KILLED ||
1433                            status.si_code == CLD_DUMPED) {
1434
1435                         log_error("Container terminated by signal %s.", signal_to_string(status.si_status));
1436                         r = EXIT_FAILURE;
1437                         break;
1438                 } else {
1439                         log_error("Container failed due to unknown reason.");
1440                         r = EXIT_FAILURE;
1441                         break;
1442                 }
1443         }
1444
1445 finish:
1446         if (saved_attr_valid)
1447                 tcsetattr(STDIN_FILENO, TCSANOW, &saved_attr);
1448
1449         if (master >= 0)
1450                 close_nointr_nofail(master);
1451
1452         close_pipe(kmsg_socket_pair);
1453
1454         if (oldcg)
1455                 cg_attach(SYSTEMD_CGROUP_CONTROLLER, oldcg, 0);
1456
1457         if (newcg)
1458                 cg_kill_recursive_and_wait(SYSTEMD_CGROUP_CONTROLLER, newcg, true);
1459
1460         free(arg_directory);
1461         strv_free(arg_controllers);
1462         free(oldcg);
1463         free(newcg);
1464
1465         return r;
1466 }