chiark / gitweb /
use "Out of memory." consistently (or with "\n")
[elogind.git] / src / nspawn / nspawn.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4   This file is part of systemd.
5
6   Copyright 2010 Lennart Poettering
7
8   systemd is free software; you can redistribute it and/or modify it
9   under the terms of the GNU Lesser General Public License as published by
10   the Free Software Foundation; either version 2.1 of the License, or
11   (at your option) any later version.
12
13   systemd is distributed in the hope that it will be useful, but
14   WITHOUT ANY WARRANTY; without even the implied warranty of
15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16   Lesser General Public License for more details.
17
18   You should have received a copy of the GNU Lesser General Public License
19   along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <signal.h>
23 #include <sched.h>
24 #include <unistd.h>
25 #include <sys/types.h>
26 #include <sys/syscall.h>
27 #include <sys/mount.h>
28 #include <sys/wait.h>
29 #include <stdlib.h>
30 #include <string.h>
31 #include <stdio.h>
32 #include <errno.h>
33 #include <sys/prctl.h>
34 #include <sys/capability.h>
35 #include <getopt.h>
36 #include <sys/epoll.h>
37 #include <termios.h>
38 #include <sys/signalfd.h>
39 #include <grp.h>
40 #include <linux/fs.h>
41 #include <sys/un.h>
42 #include <sys/socket.h>
43
44 #include <systemd/sd-daemon.h>
45
46 #include "log.h"
47 #include "util.h"
48 #include "mkdir.h"
49 #include "audit.h"
50 #include "missing.h"
51 #include "cgroup-util.h"
52 #include "strv.h"
53 #include "path-util.h"
54 #include "loopback-setup.h"
55 #include "sd-id128.h"
56
/* Selects how setup_journal() ties the container's journal directory
 * to the host's; chosen with --link-journal= or -j. */
typedef enum LinkJournal {
        LINK_NO = 0,    /* setup_journal() returns immediately, nothing is linked */
        LINK_AUTO,      /* default; missing or unusable prerequisites are tolerated silently */
        LINK_HOST,      /* host journal directory is bind mounted into the guest */
        LINK_GUEST      /* host journal path becomes a symlink to the guest directory */
} LinkJournal;
63
64 static char *arg_directory = NULL;
65 static char *arg_user = NULL;
66 static char **arg_controllers = NULL;
67 static char *arg_uuid = NULL;
68 static bool arg_private_network = false;
69 static bool arg_read_only = false;
70 static bool arg_boot = false;
71 static LinkJournal arg_link_journal = LINK_AUTO;
72 static uint64_t arg_retain =
73         (1ULL << CAP_CHOWN) |
74         (1ULL << CAP_DAC_OVERRIDE) |
75         (1ULL << CAP_DAC_READ_SEARCH) |
76         (1ULL << CAP_FOWNER) |
77         (1ULL << CAP_FSETID) |
78         (1ULL << CAP_IPC_OWNER) |
79         (1ULL << CAP_KILL) |
80         (1ULL << CAP_LEASE) |
81         (1ULL << CAP_LINUX_IMMUTABLE) |
82         (1ULL << CAP_NET_BIND_SERVICE) |
83         (1ULL << CAP_NET_BROADCAST) |
84         (1ULL << CAP_NET_RAW) |
85         (1ULL << CAP_SETGID) |
86         (1ULL << CAP_SETFCAP) |
87         (1ULL << CAP_SETPCAP) |
88         (1ULL << CAP_SETUID) |
89         (1ULL << CAP_SYS_ADMIN) |
90         (1ULL << CAP_SYS_CHROOT) |
91         (1ULL << CAP_SYS_NICE) |
92         (1ULL << CAP_SYS_PTRACE) |
93         (1ULL << CAP_SYS_TTY_CONFIG) |
94         (1ULL << CAP_SYS_RESOURCE);
95
/* Prints the usage summary to stdout. Always returns 0. */
static int help(void) {

        /* Note: -j is handled in parse_argv() by selecting LINK_GUEST,
         * hence it is documented as --link-journal=guest here. */
        printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
               "Spawn a minimal namespace container for debugging, testing and building.\n\n"
               "  -h --help               Show this help\n"
               "  -D --directory=NAME     Root directory for the container\n"
               "  -b --boot               Boot up full system (i.e. invoke init)\n"
               "  -u --user=USER          Run the command under specified user or uid\n"
               "  -C --controllers=LIST   Put the container in specified comma-separated cgroup hierarchies\n"
               "     --uuid=UUID          Set a specific machine UUID for the container\n"
               "     --private-network    Disable network in container\n"
               "     --read-only          Mount the root directory read-only\n"
               "     --capability=CAP     In addition to the default, retain specified capability\n"
               "     --link-journal=MODE  Link up guest journal, one of no, auto, guest, host\n"
               "  -j                      Equivalent to --link-journal=guest\n",
               program_invocation_short_name);

        return 0;
}
115
116 static int parse_argv(int argc, char *argv[]) {
117
118         enum {
119                 ARG_PRIVATE_NETWORK = 0x100,
120                 ARG_UUID,
121                 ARG_READ_ONLY,
122                 ARG_CAPABILITY,
123                 ARG_LINK_JOURNAL
124         };
125
126         static const struct option options[] = {
127                 { "help",            no_argument,       NULL, 'h'                 },
128                 { "directory",       required_argument, NULL, 'D'                 },
129                 { "user",            required_argument, NULL, 'u'                 },
130                 { "controllers",     required_argument, NULL, 'C'                 },
131                 { "private-network", no_argument,       NULL, ARG_PRIVATE_NETWORK },
132                 { "boot",            no_argument,       NULL, 'b'                 },
133                 { "uuid",            required_argument, NULL, ARG_UUID            },
134                 { "read-only",       no_argument,       NULL, ARG_READ_ONLY       },
135                 { "capability",      required_argument, NULL, ARG_CAPABILITY      },
136                 { "link-journal",    required_argument, NULL, ARG_LINK_JOURNAL    },
137                 { NULL,              0,                 NULL, 0                   }
138         };
139
140         int c;
141
142         assert(argc >= 0);
143         assert(argv);
144
145         while ((c = getopt_long(argc, argv, "+hD:u:C:bj", options, NULL)) >= 0) {
146
147                 switch (c) {
148
149                 case 'h':
150                         help();
151                         return 0;
152
153                 case 'D':
154                         free(arg_directory);
155                         arg_directory = canonicalize_file_name(optarg);
156                         if (!arg_directory) {
157                                 log_error("Failed to canonicalize root directory.");
158                                 return -ENOMEM;
159                         }
160
161                         break;
162
163                 case 'u':
164                         free(arg_user);
165                         if (!(arg_user = strdup(optarg))) {
166                                 log_error("Failed to duplicate user name.");
167                                 return -ENOMEM;
168                         }
169
170                         break;
171
172                 case 'C':
173                         strv_free(arg_controllers);
174                         arg_controllers = strv_split(optarg, ",");
175                         if (!arg_controllers) {
176                                 log_error("Failed to split controllers list.");
177                                 return -ENOMEM;
178                         }
179                         strv_uniq(arg_controllers);
180
181                         break;
182
183                 case ARG_PRIVATE_NETWORK:
184                         arg_private_network = true;
185                         break;
186
187                 case 'b':
188                         arg_boot = true;
189                         break;
190
191                 case ARG_UUID:
192                         arg_uuid = optarg;
193                         break;
194
195                 case ARG_READ_ONLY:
196                         arg_read_only = true;
197                         break;
198
199                 case ARG_CAPABILITY: {
200                         char *state, *word;
201                         size_t length;
202
203                         FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
204                                 cap_value_t cap;
205                                 char *t;
206
207                                 t = strndup(word, length);
208                                 if (!t) {
209                                         log_error("Out of memory.");
210                                         return -ENOMEM;
211                                 }
212
213                                 if (cap_from_name(t, &cap) < 0) {
214                                         log_error("Failed to parse capability %s.", t);
215                                         free(t);
216                                         return -EINVAL;
217                                 }
218
219                                 free(t);
220                                 arg_retain |= 1ULL << (uint64_t) cap;
221                         }
222
223                         break;
224                 }
225
226                 case 'j':
227                         arg_link_journal = LINK_GUEST;
228                         break;
229
230                 case ARG_LINK_JOURNAL:
231                         if (streq(optarg, "auto"))
232                                 arg_link_journal = LINK_AUTO;
233                         else if (streq(optarg, "no"))
234                                 arg_link_journal = LINK_NO;
235                         else if (streq(optarg, "guest"))
236                                 arg_link_journal = LINK_GUEST;
237                         else if (streq(optarg, "host"))
238                                 arg_link_journal = LINK_HOST;
239                         else {
240                                 log_error("Failed to parse link journal mode %s", optarg);
241                                 return -EINVAL;
242                         }
243
244                         break;
245
246                 case '?':
247                         return -EINVAL;
248
249                 default:
250                         log_error("Unknown option code %c", c);
251                         return -EINVAL;
252                 }
253         }
254
255         return 1;
256 }
257
/* Performs the mounts listed in mount_table, each one below the
 * container root "dest".
 *
 * All entries are attempted even if an earlier one failed; only an
 * allocation failure aborts the loop. Mount failures of entries not
 * marked fatal are ignored. Returns 0 on success, otherwise the first
 * error that was recorded (negative errno-style). */
static int mount_all(const char *dest) {

        typedef struct MountPoint {
                const char *what;
                const char *where;
                const char *type;
                const char *options;
                unsigned long flags;
                bool fatal;
        } MountPoint;

        static const MountPoint mount_table[] = {
                { "proc",      "/proc",     "proc",  NULL,       MS_NOSUID|MS_NOEXEC|MS_NODEV, true  },
                { "/proc/sys", "/proc/sys", "bind",  NULL,       MS_BIND, true                       },   /* Bind mount first */
                { "/proc/sys", "/proc/sys", "bind",  NULL,       MS_BIND|MS_RDONLY|MS_REMOUNT, true  },   /* Then, make it r/o */
                { "/sys",      "/sys",      "bind",  NULL,       MS_BIND,                      true  },   /* Bind mount first */
                { "/sys",      "/sys",      "bind",  NULL,       MS_BIND|MS_RDONLY|MS_REMOUNT, true  },   /* Then, make it r/o */
                { "tmpfs",     "/dev",      "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME,     true  },
                { "/dev/pts",  "/dev/pts",  "bind",  NULL,       MS_BIND,                      true  },
                { "tmpfs",     "/run",      "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true  },
#ifdef HAVE_SELINUX
                { "/sys/fs/selinux", "/sys/fs/selinux", "bind", NULL, MS_BIND,                      false },  /* Bind mount first */
                { "/sys/fs/selinux", "/sys/fs/selinux", "bind", NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, false },  /* Then, make it r/o */
#endif
        };

        const MountPoint *entry;
        int result = 0;

        for (entry = mount_table; entry < mount_table + ELEMENTSOF(mount_table); entry++) {
                char *target;
                int is_mp;

                if (asprintf(&target, "%s/%s", dest, entry->where) < 0) {
                        log_error("Out of memory.");

                        if (result == 0)
                                result = -ENOMEM;

                        break;
                }

                /* Only used to detect failure; a positive or zero
                 * answer does not change what we do next. */
                is_mp = path_is_mount_point(target, false);
                if (is_mp < 0) {
                        log_error("Failed to detect whether %s is a mount point: %s", target, strerror(-is_mp));
                        free(target);

                        if (result == 0)
                                result = is_mp;

                        continue;
                }

                /* Best effort, the mount() below reports real problems */
                mkdir_p_label(target, 0755);

                /* The mount is always attempted; "fatal" merely decides
                 * whether a failure is reported. */
                if (mount(entry->what, target, entry->type, entry->flags, entry->options) < 0 &&
                    entry->fatal) {

                        log_error("mount(%s) failed: %m", target);

                        if (result == 0)
                                result = -errno;
                }

                free(target);
        }

        return result;
}
331
/* Makes the host's time zone configuration visible in the container by
 * bind mounting /etc/localtime and /etc/timezone read-only onto the
 * same paths below "dest".
 *
 * Mount failures are ignored on purpose ("if possible"); the only error
 * returned is -ENOMEM. */
static int setup_timezone(const char *dest) {
        static const char *const tz_files[] = {
                "/etc/localtime",
                "/etc/timezone",
        };
        unsigned i;

        assert(dest);

        for (i = 0; i < ELEMENTSOF(tz_files); i++) {
                char *target;

                if (asprintf(&target, "%s%s", dest, tz_files[i]) < 0) {
                        log_error("Out of memory.");
                        return -ENOMEM;
                }

                /* Bind mount first, then remount it r/o */
                if (mount(tz_files[i], target, "bind", MS_BIND, NULL) >= 0)
                        mount(tz_files[i], target, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);

                free(target);
        }

        return 0;
}
360
361 static int setup_resolv_conf(const char *dest) {
362         char *where;
363
364         assert(dest);
365
366         if (arg_private_network)
367                 return 0;
368
369         /* Fix resolv.conf, if possible */
370         if (asprintf(&where, "%s/etc/resolv.conf", dest) < 0) {
371                 log_error("Out of memory.");
372                 return -ENOMEM;
373         }
374
375         if (mount("/etc/resolv.conf", where, "bind", MS_BIND, NULL) >= 0)
376                 mount("/etc/resolv.conf", where, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);
377
378         free(where);
379
380         return 0;
381 }
382
383 static int copy_devnodes(const char *dest) {
384
385         static const char devnodes[] =
386                 "null\0"
387                 "zero\0"
388                 "full\0"
389                 "random\0"
390                 "urandom\0"
391                 "tty\0"
392                 "ptmx\0"
393                 "rtc0\0";
394
395         const char *d;
396         int r = 0;
397         mode_t u;
398
399         assert(dest);
400
401         u = umask(0000);
402
403         NULSTR_FOREACH(d, devnodes) {
404                 struct stat st;
405                 char *from = NULL, *to = NULL;
406
407                 asprintf(&from, "/dev/%s", d);
408                 asprintf(&to, "%s/dev/%s", dest, d);
409
410                 if (!from || !to) {
411                         log_error("Failed to allocate devnode path");
412
413                         free(from);
414                         free(to);
415
416                         from = to = NULL;
417
418                         if (r == 0)
419                                 r = -ENOMEM;
420
421                         break;
422                 }
423
424                 if (stat(from, &st) < 0) {
425
426                         if (errno != ENOENT) {
427                                 log_error("Failed to stat %s: %m", from);
428                                 if (r == 0)
429                                         r = -errno;
430                         }
431
432                 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
433
434                         log_error("%s is not a char or block device, cannot copy.", from);
435                         if (r == 0)
436                                 r = -EIO;
437
438                 } else if (mknod(to, st.st_mode, st.st_rdev) < 0) {
439
440                         log_error("mknod(%s) failed: %m", dest);
441                         if (r == 0)
442                                 r = -errno;
443                 }
444
445                 free(from);
446                 free(to);
447         }
448
449         umask(u);
450
451         return r;
452 }
453
/* Provides "<dest>/dev/console" by creating a device node there and
 * bind mounting the TTY "console" on top of it.
 *
 * Before that, the access mode and ownership of "console" are set to
 * 0600/0/0. Returns a negative errno-style value on failure (after
 * logging), otherwise the (non-negative) result of chmod_and_chown().
 * The umask is cleared for the duration of the call. */
static int setup_dev_console(const char *dest, const char *console) {
        struct stat console_st;
        char *node = NULL;
        mode_t saved_umask;
        int r;

        assert(dest);
        assert(console);

        saved_umask = umask(0000);

        if (stat(console, &console_st) < 0) {
                log_error("Failed to stat %s: %m", console);
                r = -errno;
                goto finish;
        }

        if (!S_ISCHR(console_st.st_mode)) {
                log_error("/dev/console is not a char device.");
                r = -EIO;
                goto finish;
        }

        r = chmod_and_chown(console, 0600, 0, 0);
        if (r < 0) {
                log_error("Failed to correct access mode for TTY: %s", strerror(-r));
                goto finish;
        }

        if (asprintf(&node, "%s/dev/console", dest) < 0) {
                log_error("Out of memory.");
                r = -ENOMEM;
                goto finish;
        }

        /* ptys can only live on pts file systems, so the right tty has
         * to be bind mounted onto /dev/console. A device node is
         * created first merely to have something to mount onto; its
         * major/minor is irrelevant since it gets covered right
         * away. */

        if (mknod(node, (console_st.st_mode & ~07777) | 0600, console_st.st_rdev) < 0) {
                log_error("mknod() for /dev/console failed: %m");
                r = -errno;
                goto finish;
        }

        if (mount(console, node, "bind", MS_BIND, NULL) < 0) {
                log_error("Bind mount for /dev/console failed: %m");
                r = -errno;
                goto finish;
        }

finish:
        free(node);
        umask(saved_umask);

        return r;
}
513
/* Gives the container a /proc/kmsg that is backed by a FIFO.
 *
 * The FIFO is created as "<dest>/dev/kmsg", bind mounted onto
 * "<dest>/proc/kmsg", opened, and the resulting fd is queued on
 * "kmsg_socket" as SCM_RIGHTS ancillary data. Finally the FIFO is
 * unlinked from /dev again.
 *
 * Returns a negative errno-style value on failure (after logging),
 * otherwise the (non-negative) result of chmod_and_chown(). The umask
 * is cleared for the duration of the call. */
static int setup_kmsg(const char *dest, int kmsg_socket) {
        char *fifo_path = NULL, *proc_path = NULL;
        int r, fifo_fd, sent;
        mode_t saved_umask;
        union {
                struct cmsghdr cmsghdr;
                uint8_t buf[CMSG_SPACE(sizeof(int))];
        } control;
        struct msghdr mh;
        struct cmsghdr *cmsg;

        assert(dest);
        assert(kmsg_socket >= 0);

        saved_umask = umask(0000);

        /* The FIFO is created as /dev/kmsg, but removed again right
         * after it has been bind mounted to /proc/kmsg. On the reading
         * side a FIFO behaves much like /proc/kmsg, but on the writing
         * side it differs from /dev/kmsg: writes block as long as
         * nobody reads. To keep containers from deadlocking on that,
         * /dev/kmsg is simply not made available to them. */
        if (asprintf(&fifo_path, "%s/dev/kmsg", dest) < 0) {
                log_error("Out of memory.");
                r = -ENOMEM;
                goto finish;
        }

        if (asprintf(&proc_path, "%s/proc/kmsg", dest) < 0) {
                log_error("Out of memory.");
                r = -ENOMEM;
                goto finish;
        }

        if (mkfifo(fifo_path, 0600) < 0) {
                log_error("mkfifo() for /dev/kmsg failed: %m");
                r = -errno;
                goto finish;
        }

        r = chmod_and_chown(fifo_path, 0600, 0, 0);
        if (r < 0) {
                log_error("Failed to correct access mode for /dev/kmsg: %s", strerror(-r));
                goto finish;
        }

        if (mount(fifo_path, proc_path, "bind", MS_BIND, NULL) < 0) {
                log_error("Bind mount for /proc/kmsg failed: %m");
                r = -errno;
                goto finish;
        }

        fifo_fd = open(fifo_path, O_RDWR|O_NDELAY|O_CLOEXEC);
        if (fifo_fd < 0) {
                log_error("Failed to open fifo: %m");
                r = -errno;
                goto finish;
        }

        /* Build a message that carries nothing but the fd */
        zero(mh);
        zero(control);

        mh.msg_control = &control;
        mh.msg_controllen = sizeof(control);

        cmsg = CMSG_FIRSTHDR(&mh);
        cmsg->cmsg_level = SOL_SOCKET;
        cmsg->cmsg_type = SCM_RIGHTS;
        cmsg->cmsg_len = CMSG_LEN(sizeof(int));
        memcpy(CMSG_DATA(cmsg), &fifo_fd, sizeof(int));

        mh.msg_controllen = cmsg->cmsg_len;

        /* Park the fd in the socket, so that it stays open for as long
         * as the child runs; our own copy is not needed afterwards. */
        sent = sendmsg(kmsg_socket, &mh, MSG_DONTWAIT|MSG_NOSIGNAL);
        close_nointr_nofail(fifo_fd);

        if (sent < 0) {
                log_error("Failed to send FIFO fd: %m");
                r = -errno;
                goto finish;
        }

        /* And now make the FIFO unavailable as /dev/kmsg... */
        unlink(fifo_path);

finish:
        free(fifo_path);
        free(proc_path);
        umask(saved_umask);

        return r;
}
609
610 static int setup_hostname(void) {
611         char *hn;
612         int r = 0;
613
614         hn = path_get_file_name(arg_directory);
615         if (hn) {
616                 hn = strdup(hn);
617                 if (!hn)
618                         return -ENOMEM;
619
620                 hostname_cleanup(hn);
621
622                 if (!isempty(hn))
623                         if (sethostname(hn, strlen(hn)) < 0)
624                                 r = -errno;
625
626                 free(hn);
627         }
628
629         return r;
630 }
631
632 static int setup_journal(const char *directory) {
633         sd_id128_t machine_id;
634         char *p = NULL, *b = NULL, *l, *q = NULL, *d = NULL;
635         int r;
636
637         if (arg_link_journal == LINK_NO)
638                 return 0;
639
640         p = strappend(directory, "/etc/machine-id");
641         if (!p) {
642                 log_error("Out of memory.");
643                 r = -ENOMEM;
644                 goto finish;
645         }
646
647         r = read_one_line_file(p, &b);
648         if (r == -ENOENT && arg_link_journal == LINK_AUTO) {
649                 r = 0;
650                 goto finish;
651         } else if (r < 0) {
652                 log_error("Failed to read machine ID: %s", strerror(-r));
653                 return r;
654         }
655
656         l = strstrip(b);
657         if (isempty(l) && arg_link_journal == LINK_AUTO) {
658                 r = 0;
659                 goto finish;
660         }
661
662         /* Verify validaty */
663         r = sd_id128_from_string(l, &machine_id);
664         if (r < 0) {
665                 log_error("Failed to parse machine ID: %s", strerror(-r));
666                 goto finish;
667         }
668
669         free(p);
670         p = strappend("/var/log/journal/", l);
671         q = strjoin(directory, "/var/log/journal/", l, NULL);
672         if (!p || !q) {
673                 log_error("Out of memory.");
674                 r = -ENOMEM;
675                 goto finish;
676         }
677
678         if (path_is_mount_point(p, false) > 0 ||
679             path_is_mount_point(q, false) > 0) {
680                 if (arg_link_journal != LINK_AUTO) {
681                         log_error("Journal already a mount point, refusing.");
682                         r = -EEXIST;
683                         goto finish;
684                 }
685
686                 r = 0;
687                 goto finish;
688         }
689
690         r = readlink_and_make_absolute(p, &d);
691         if (r >= 0) {
692                 if ((arg_link_journal == LINK_GUEST ||
693                      arg_link_journal == LINK_AUTO) &&
694                     path_equal(d, q)) {
695
696                         mkdir_p(q, 0755);
697
698                         r = 0;
699                         goto finish;
700                 }
701
702                 if (unlink(p) < 0) {
703                         log_error("Failed to remove symlink %s: %m", p);
704                         r = -errno;
705                         goto finish;
706                 }
707         } else if (r == -EINVAL) {
708
709                 if (arg_link_journal == LINK_GUEST &&
710                     rmdir(p) < 0) {
711
712                         if (errno == ENOTDIR)
713                                 log_error("%s already exists and is neither symlink nor directory.", p);
714                         else {
715                                 log_error("Failed to remove %s: %m", p);
716                                 r = -errno;
717                         }
718
719                         goto finish;
720                 }
721         } else if (r != -ENOENT) {
722                 log_error("readlink(%s) failed: %m", p);
723                 goto finish;
724         }
725
726         if (arg_link_journal == LINK_GUEST) {
727
728                 if (symlink(q, p) < 0) {
729                         log_error("Failed to symlink %s to %s: %m", q, p);
730                         r = -errno;
731                         goto finish;
732                 }
733
734                 mkdir_p(q, 0755);
735
736                 r = 0;
737                 goto finish;
738         }
739
740         if (arg_link_journal == LINK_HOST) {
741                 r = mkdir_p(p, 0755);
742                 if (r < 0) {
743                         log_error("Failed to create %s: %m", p);
744                         goto finish;
745                 }
746
747         } else if (access(p, F_OK) < 0) {
748                 r = 0;
749                 goto finish;
750         }
751
752         if (dir_is_empty(q) == 0) {
753                 log_error("%s not empty.", q);
754                 r = -ENOTEMPTY;
755                 goto finish;
756         }
757
758         r = mkdir_p(q, 0755);
759         if (r < 0) {
760                 log_error("Failed to create %s: %m", q);
761                 goto finish;
762         }
763
764         if (mount(p, q, "bind", MS_BIND, NULL) < 0) {
765                 log_error("Failed to bind mount journal from host into guest: %m");
766                 r = -errno;
767                 goto finish;
768         }
769
770         r = 0;
771
772 finish:
773         free(p);
774         free(q);
775         free(d);
776         free(b);
777         return r;
778
779 }
780
781 static int drop_capabilities(void) {
782         return capability_bounding_set_drop(~arg_retain, false);
783 }
784
/* Checks whether "path" looks like the root of an OS tree, using the
 * existence of "<path>/bin/sh" as the indicator.
 *
 * Returns 1 if the file is accessible, 0 if not, -ENOMEM if the path
 * could not be allocated. */
static int is_os_tree(const char *path) {
        char *shell;
        int found;

        if (asprintf(&shell, "%s/bin/sh", path) < 0)
                return -ENOMEM;

        found = access(shell, F_OK) >= 0;
        free(shell);

        return found ? 1 : 0;
}
798
799 static int process_pty(int master, sigset_t *mask) {
800
801         char in_buffer[LINE_MAX], out_buffer[LINE_MAX];
802         size_t in_buffer_full = 0, out_buffer_full = 0;
803         struct epoll_event stdin_ev, stdout_ev, master_ev, signal_ev;
804         bool stdin_readable = false, stdout_writable = false, master_readable = false, master_writable = false;
805         int ep = -1, signal_fd = -1, r;
806
807         fd_nonblock(STDIN_FILENO, 1);
808         fd_nonblock(STDOUT_FILENO, 1);
809         fd_nonblock(master, 1);
810
811         signal_fd = signalfd(-1, mask, SFD_NONBLOCK|SFD_CLOEXEC);
812         if (signal_fd < 0) {
813                 log_error("signalfd(): %m");
814                 r = -errno;
815                 goto finish;
816         }
817
818         ep = epoll_create1(EPOLL_CLOEXEC);
819         if (ep < 0) {
820                 log_error("Failed to create epoll: %m");
821                 r = -errno;
822                 goto finish;
823         }
824
825         zero(stdin_ev);
826         stdin_ev.events = EPOLLIN|EPOLLET;
827         stdin_ev.data.fd = STDIN_FILENO;
828
829         zero(stdout_ev);
830         stdout_ev.events = EPOLLOUT|EPOLLET;
831         stdout_ev.data.fd = STDOUT_FILENO;
832
833         zero(master_ev);
834         master_ev.events = EPOLLIN|EPOLLOUT|EPOLLET;
835         master_ev.data.fd = master;
836
837         zero(signal_ev);
838         signal_ev.events = EPOLLIN;
839         signal_ev.data.fd = signal_fd;
840
841         if (epoll_ctl(ep, EPOLL_CTL_ADD, STDIN_FILENO, &stdin_ev) < 0 ||
842             epoll_ctl(ep, EPOLL_CTL_ADD, STDOUT_FILENO, &stdout_ev) < 0 ||
843             epoll_ctl(ep, EPOLL_CTL_ADD, master, &master_ev) < 0 ||
844             epoll_ctl(ep, EPOLL_CTL_ADD, signal_fd, &signal_ev) < 0) {
845                 log_error("Failed to regiser fds in epoll: %m");
846                 r = -errno;
847                 goto finish;
848         }
849
850         for (;;) {
851                 struct epoll_event ev[16];
852                 ssize_t k;
853                 int i, nfds;
854
855                 nfds = epoll_wait(ep, ev, ELEMENTSOF(ev), -1);
856                 if (nfds < 0) {
857
858                         if (errno == EINTR || errno == EAGAIN)
859                                 continue;
860
861                         log_error("epoll_wait(): %m");
862                         r = -errno;
863                         goto finish;
864                 }
865
866                 assert(nfds >= 1);
867
868                 for (i = 0; i < nfds; i++) {
869                         if (ev[i].data.fd == STDIN_FILENO) {
870
871                                 if (ev[i].events & (EPOLLIN|EPOLLHUP))
872                                         stdin_readable = true;
873
874                         } else if (ev[i].data.fd == STDOUT_FILENO) {
875
876                                 if (ev[i].events & (EPOLLOUT|EPOLLHUP))
877                                         stdout_writable = true;
878
879                         } else if (ev[i].data.fd == master) {
880
881                                 if (ev[i].events & (EPOLLIN|EPOLLHUP))
882                                         master_readable = true;
883
884                                 if (ev[i].events & (EPOLLOUT|EPOLLHUP))
885                                         master_writable = true;
886
887                         } else if (ev[i].data.fd == signal_fd) {
888                                 struct signalfd_siginfo sfsi;
889                                 ssize_t n;
890
891                                 n = read(signal_fd, &sfsi, sizeof(sfsi));
892                                 if (n != sizeof(sfsi)) {
893
894                                         if (n >= 0) {
895                                                 log_error("Failed to read from signalfd: invalid block size");
896                                                 r = -EIO;
897                                                 goto finish;
898                                         }
899
900                                         if (errno != EINTR && errno != EAGAIN) {
901                                                 log_error("Failed to read from signalfd: %m");
902                                                 r = -errno;
903                                                 goto finish;
904                                         }
905                                 } else {
906
907                                         if (sfsi.ssi_signo == SIGWINCH) {
908                                                 struct winsize ws;
909
910                                                 /* The window size changed, let's forward that. */
911                                                 if (ioctl(STDIN_FILENO, TIOCGWINSZ, &ws) >= 0)
912                                                         ioctl(master, TIOCSWINSZ, &ws);
913                                         } else {
914                                                 r = 0;
915                                                 goto finish;
916                                         }
917                                 }
918                         }
919                 }
920
921                 while ((stdin_readable && in_buffer_full <= 0) ||
922                        (master_writable && in_buffer_full > 0) ||
923                        (master_readable && out_buffer_full <= 0) ||
924                        (stdout_writable && out_buffer_full > 0)) {
925
926                         if (stdin_readable && in_buffer_full < LINE_MAX) {
927
928                                 k = read(STDIN_FILENO, in_buffer + in_buffer_full, LINE_MAX - in_buffer_full);
929                                 if (k < 0) {
930
931                                         if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
932                                                 stdin_readable = false;
933                                         else {
934                                                 log_error("read(): %m");
935                                                 r = -errno;
936                                                 goto finish;
937                                         }
938                                 } else
939                                         in_buffer_full += (size_t) k;
940                         }
941
942                         if (master_writable && in_buffer_full > 0) {
943
944                                 k = write(master, in_buffer, in_buffer_full);
945                                 if (k < 0) {
946
947                                         if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
948                                                 master_writable = false;
949                                         else {
950                                                 log_error("write(): %m");
951                                                 r = -errno;
952                                                 goto finish;
953                                         }
954
955                                 } else {
956                                         assert(in_buffer_full >= (size_t) k);
957                                         memmove(in_buffer, in_buffer + k, in_buffer_full - k);
958                                         in_buffer_full -= k;
959                                 }
960                         }
961
962                         if (master_readable && out_buffer_full < LINE_MAX) {
963
964                                 k = read(master, out_buffer + out_buffer_full, LINE_MAX - out_buffer_full);
965                                 if (k < 0) {
966
967                                         if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
968                                                 master_readable = false;
969                                         else {
970                                                 log_error("read(): %m");
971                                                 r = -errno;
972                                                 goto finish;
973                                         }
974                                 }  else
975                                         out_buffer_full += (size_t) k;
976                         }
977
978                         if (stdout_writable && out_buffer_full > 0) {
979
980                                 k = write(STDOUT_FILENO, out_buffer, out_buffer_full);
981                                 if (k < 0) {
982
983                                         if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
984                                                 stdout_writable = false;
985                                         else {
986                                                 log_error("write(): %m");
987                                                 r = -errno;
988                                                 goto finish;
989                                         }
990
991                                 } else {
992                                         assert(out_buffer_full >= (size_t) k);
993                                         memmove(out_buffer, out_buffer + k, out_buffer_full - k);
994                                         out_buffer_full -= k;
995                                 }
996                         }
997                 }
998         }
999
1000 finish:
1001         if (ep >= 0)
1002                 close_nointr_nofail(ep);
1003
1004         if (signal_fd >= 0)
1005                 close_nointr_nofail(signal_fd);
1006
1007         return r;
1008 }
1009
1010 int main(int argc, char *argv[]) {
1011         pid_t pid = 0;
1012         int r = EXIT_FAILURE, k;
1013         char *oldcg = NULL, *newcg = NULL;
1014         char **controller = NULL;
1015         int master = -1;
1016         const char *console = NULL;
1017         struct termios saved_attr, raw_attr;
1018         sigset_t mask;
1019         bool saved_attr_valid = false;
1020         struct winsize ws;
1021         int kmsg_socket_pair[2] = { -1, -1 };
1022
1023         log_parse_environment();
1024         log_open();
1025
1026         r = parse_argv(argc, argv);
1027         if (r <= 0)
1028                 goto finish;
1029
1030         if (arg_directory) {
1031                 char *p;
1032
1033                 p = path_make_absolute_cwd(arg_directory);
1034                 free(arg_directory);
1035                 arg_directory = p;
1036         } else
1037                 arg_directory = get_current_dir_name();
1038
1039         if (!arg_directory) {
1040                 log_error("Failed to determine path");
1041                 goto finish;
1042         }
1043
1044         path_kill_slashes(arg_directory);
1045
1046         if (geteuid() != 0) {
1047                 log_error("Need to be root.");
1048                 goto finish;
1049         }
1050
1051         if (sd_booted() <= 0) {
1052                 log_error("Not running on a systemd system.");
1053                 goto finish;
1054         }
1055
1056         if (path_equal(arg_directory, "/")) {
1057                 log_error("Spawning container on root directory not supported.");
1058                 goto finish;
1059         }
1060
1061         if (is_os_tree(arg_directory) <= 0) {
1062                 log_error("Directory %s doesn't look like an OS root directory. Refusing.", arg_directory);
1063                 goto finish;
1064         }
1065
1066         k = cg_get_by_pid(SYSTEMD_CGROUP_CONTROLLER, 0, &oldcg);
1067         if (k < 0) {
1068                 log_error("Failed to determine current cgroup: %s", strerror(-k));
1069                 goto finish;
1070         }
1071
1072         if (asprintf(&newcg, "%s/nspawn-%lu", oldcg, (unsigned long) getpid()) < 0) {
1073                 log_error("Failed to allocate cgroup path.");
1074                 goto finish;
1075         }
1076
1077         k = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, newcg, 0);
1078         if (k < 0)  {
1079                 log_error("Failed to create cgroup: %s", strerror(-k));
1080                 goto finish;
1081         }
1082
1083         STRV_FOREACH(controller, arg_controllers) {
1084                 k = cg_create_and_attach(*controller, newcg, 0);
1085                 if (k < 0)
1086                         log_warning("Failed to create cgroup in controller %s: %s", *controller, strerror(-k));
1087         }
1088
1089         master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
1090         if (master < 0) {
1091                 log_error("Failed to acquire pseudo tty: %m");
1092                 goto finish;
1093         }
1094
1095         console = ptsname(master);
1096         if (!console) {
1097                 log_error("Failed to determine tty name: %m");
1098                 goto finish;
1099         }
1100
1101         log_info("Spawning namespace container on %s (console is %s).", arg_directory, console);
1102
1103         if (ioctl(STDIN_FILENO, TIOCGWINSZ, &ws) >= 0)
1104                 ioctl(master, TIOCSWINSZ, &ws);
1105
1106         if (unlockpt(master) < 0) {
1107                 log_error("Failed to unlock tty: %m");
1108                 goto finish;
1109         }
1110
1111         if (tcgetattr(STDIN_FILENO, &saved_attr) < 0) {
1112                 log_error("Failed to get terminal attributes: %m");
1113                 goto finish;
1114         }
1115
1116         saved_attr_valid = true;
1117
1118         raw_attr = saved_attr;
1119         cfmakeraw(&raw_attr);
1120         raw_attr.c_lflag &= ~ECHO;
1121
1122         if (tcsetattr(STDIN_FILENO, TCSANOW, &raw_attr) < 0) {
1123                 log_error("Failed to set terminal attributes: %m");
1124                 goto finish;
1125         }
1126
1127         if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_NONBLOCK|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
1128                 log_error("Failed to create kmsg socket pair");
1129                 goto finish;
1130         }
1131
1132         assert_se(sigemptyset(&mask) == 0);
1133         sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
1134         assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
1135
1136         pid = syscall(__NR_clone, SIGCHLD|CLONE_NEWIPC|CLONE_NEWNS|CLONE_NEWPID|CLONE_NEWUTS|(arg_private_network ? CLONE_NEWNET : 0), NULL);
1137         if (pid < 0) {
1138                 if (errno == EINVAL)
1139                         log_error("clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
1140                 else
1141                         log_error("clone() failed: %m");
1142
1143                 goto finish;
1144         }
1145
1146         if (pid == 0) {
1147                 /* child */
1148
1149                 const char *home = NULL;
1150                 uid_t uid = (uid_t) -1;
1151                 gid_t gid = (gid_t) -1;
1152                 const char *envp[] = {
1153                         "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
1154                         "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
1155                         NULL, /* TERM */
1156                         NULL, /* HOME */
1157                         NULL, /* USER */
1158                         NULL, /* LOGNAME */
1159                         NULL, /* container_uuid */
1160                         NULL
1161                 };
1162
1163                 envp[2] = strv_find_prefix(environ, "TERM=");
1164
1165                 close_nointr_nofail(master);
1166
1167                 close_nointr(STDIN_FILENO);
1168                 close_nointr(STDOUT_FILENO);
1169                 close_nointr(STDERR_FILENO);
1170
1171                 close_all_fds(&kmsg_socket_pair[1], 1);
1172
1173                 reset_all_signal_handlers();
1174
1175                 assert_se(sigemptyset(&mask) == 0);
1176                 assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
1177
1178                 if (open_terminal(console, O_RDWR) != STDIN_FILENO ||
1179                     dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
1180                     dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO)
1181                         goto child_fail;
1182
1183                 if (setsid() < 0) {
1184                         log_error("setsid() failed: %m");
1185                         goto child_fail;
1186                 }
1187
1188                 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
1189                         log_error("PR_SET_PDEATHSIG failed: %m");
1190                         goto child_fail;
1191                 }
1192
1193                 /* Mark / as private, in case somebody marked it shared */
1194                 if (mount(NULL, "/", NULL, MS_PRIVATE|MS_REC, NULL) < 0) {
1195                         log_error("MS_PRIVATE|MS_REC failed: %m");
1196                         goto child_fail;
1197                 }
1198
1199                 /* Turn directory into bind mount */
1200                 if (mount(arg_directory, arg_directory, "bind", MS_BIND, NULL) < 0) {
1201                         log_error("Failed to make bind mount.");
1202                         goto child_fail;
1203                 }
1204
1205                 if (arg_read_only)
1206                         if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL) < 0) {
1207                                 log_error("Failed to make read-only.");
1208                                 goto child_fail;
1209                         }
1210
1211                 if (mount_all(arg_directory) < 0)
1212                         goto child_fail;
1213
1214                 if (copy_devnodes(arg_directory) < 0)
1215                         goto child_fail;
1216
1217                 if (setup_dev_console(arg_directory, console) < 0)
1218                         goto child_fail;
1219
1220                 if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
1221                         goto child_fail;
1222
1223                 close_nointr_nofail(kmsg_socket_pair[1]);
1224
1225                 if (setup_timezone(arg_directory) < 0)
1226                         goto child_fail;
1227
1228                 if (setup_resolv_conf(arg_directory) < 0)
1229                         goto child_fail;
1230
1231                 if (setup_journal(arg_directory) < 0)
1232                         goto child_fail;
1233
1234                 if (chdir(arg_directory) < 0) {
1235                         log_error("chdir(%s) failed: %m", arg_directory);
1236                         goto child_fail;
1237                 }
1238
1239                 if (mount(arg_directory, "/", "bind", MS_MOVE, NULL) < 0) {
1240                         log_error("mount(MS_BIND) failed: %m");
1241                         goto child_fail;
1242                 }
1243
1244                 if (chroot(".") < 0) {
1245                         log_error("chroot() failed: %m");
1246                         goto child_fail;
1247                 }
1248
1249                 if (chdir("/") < 0) {
1250                         log_error("chdir() failed: %m");
1251                         goto child_fail;
1252                 }
1253
1254                 umask(0022);
1255
1256                 loopback_setup();
1257
1258                 if (drop_capabilities() < 0) {
1259                         log_error("drop_capabilities() failed: %m");
1260                         goto child_fail;
1261                 }
1262
1263                 if (arg_user) {
1264
1265                         if (get_user_creds((const char**)&arg_user, &uid, &gid, &home, NULL) < 0) {
1266                                 log_error("get_user_creds() failed: %m");
1267                                 goto child_fail;
1268                         }
1269
1270                         if (mkdir_parents_label(home, 0775) < 0) {
1271                                 log_error("mkdir_parents_label() failed: %m");
1272                                 goto child_fail;
1273                         }
1274
1275                         if (mkdir_safe_label(home, 0775, uid, gid) < 0) {
1276                                 log_error("mkdir_safe_label() failed: %m");
1277                                 goto child_fail;
1278                         }
1279
1280                         if (initgroups((const char*)arg_user, gid) < 0) {
1281                                 log_error("initgroups() failed: %m");
1282                                 goto child_fail;
1283                         }
1284
1285                         if (setresgid(gid, gid, gid) < 0) {
1286                                 log_error("setregid() failed: %m");
1287                                 goto child_fail;
1288                         }
1289
1290                         if (setresuid(uid, uid, uid) < 0) {
1291                                 log_error("setreuid() failed: %m");
1292                                 goto child_fail;
1293                         }
1294                 }
1295
1296                 if ((asprintf((char**)(envp + 3), "HOME=%s", home ? home: "/root") < 0) ||
1297                     (asprintf((char**)(envp + 4), "USER=%s", arg_user ? arg_user : "root") < 0) ||
1298                     (asprintf((char**)(envp + 5), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
1299                     log_error("Out of memory.");
1300                     goto child_fail;
1301                 }
1302
1303                 if (arg_uuid) {
1304                         if (asprintf((char**)(envp + 6), "container_uuid=%s", arg_uuid) < 0) {
1305                                 log_error("Out of memory.");
1306                                 goto child_fail;
1307                         }
1308                 }
1309
1310                 setup_hostname();
1311
1312                 if (arg_boot) {
1313                         char **a;
1314                         size_t l;
1315
1316                         /* Automatically search for the init system */
1317
1318                         l = 1 + argc - optind;
1319                         a = newa(char*, l + 1);
1320                         memcpy(a + 1, argv + optind, l * sizeof(char*));
1321
1322                         a[0] = (char*) "/usr/lib/systemd/systemd";
1323                         execve(a[0], a, (char**) envp);
1324
1325                         a[0] = (char*) "/lib/systemd/systemd";
1326                         execve(a[0], a, (char**) envp);
1327
1328                         a[0] = (char*) "/sbin/init";
1329                         execve(a[0], a, (char**) envp);
1330                 } else if (argc > optind)
1331                         execvpe(argv[optind], argv + optind, (char**) envp);
1332                 else {
1333                         chdir(home ? home : "/root");
1334                         execle("/bin/bash", "-bash", NULL, (char**) envp);
1335                 }
1336
1337                 log_error("execv() failed: %m");
1338
1339         child_fail:
1340                 _exit(EXIT_FAILURE);
1341         }
1342
1343         if (process_pty(master, &mask) < 0)
1344                 goto finish;
1345
1346         if (saved_attr_valid) {
1347                 tcsetattr(STDIN_FILENO, TCSANOW, &saved_attr);
1348                 saved_attr_valid = false;
1349         }
1350
1351         r = wait_for_terminate_and_warn(argc > optind ? argv[optind] : "bash", pid);
1352
1353         if (r < 0)
1354                 r = EXIT_FAILURE;
1355
1356 finish:
1357         if (saved_attr_valid)
1358                 tcsetattr(STDIN_FILENO, TCSANOW, &saved_attr);
1359
1360         if (master >= 0)
1361                 close_nointr_nofail(master);
1362
1363         close_pipe(kmsg_socket_pair);
1364
1365         if (oldcg)
1366                 cg_attach(SYSTEMD_CGROUP_CONTROLLER, oldcg, 0);
1367
1368         if (newcg)
1369                 cg_kill_recursive_and_wait(SYSTEMD_CGROUP_CONTROLLER, newcg, true);
1370
1371         free(arg_directory);
1372         strv_free(arg_controllers);
1373         free(oldcg);
1374         free(newcg);
1375
1376         return r;
1377 }