chiark / gitweb /
nspawn: move container into its own name=systemd cgroup
[elogind.git] / src / nspawn.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4   This file is part of systemd.
5
6   Copyright 2010 Lennart Poettering
7
8   systemd is free software; you can redistribute it and/or modify it
9   under the terms of the GNU General Public License as published by
10   the Free Software Foundation; either version 2 of the License, or
11   (at your option) any later version.
12
13   systemd is distributed in the hope that it will be useful, but
14   WITHOUT ANY WARRANTY; without even the implied warranty of
15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16   General Public License for more details.
17
18   You should have received a copy of the GNU General Public License
19   along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <signal.h>
23 #include <sched.h>
24 #include <unistd.h>
25 #include <sys/types.h>
26 #include <sys/syscall.h>
27 #include <sys/mount.h>
28 #include <sys/wait.h>
29 #include <stdlib.h>
30 #include <string.h>
31 #include <stdio.h>
32 #include <errno.h>
33 #include <sys/prctl.h>
34 #include <sys/capability.h>
35 #include <getopt.h>
36
37 #include "log.h"
38 #include "util.h"
39 #include "missing.h"
40 #include "cgroup-util.h"
41 #include "sd-daemon.h"
42
/* Root directory of the container. Set from -D/--directory in parse_argv();
 * main() turns it into an absolute path (or falls back to the current
 * working directory when it was not given) and frees it before exiting. */
43 static char *arg_directory = NULL;
44
/* Print the command line usage summary to stdout. Always returns 0. */
static int help(void) {
        static const char option_summary[] =
                "Spawn a minimal namespace container for debugging, testing and building.\n\n"
                "  -h --help            Show this help\n"
                "  -D --directory=NAME  Root directory for the container\n";

        /* The synopsis line is prefixed with the name we were invoked as. */
        printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n%s",
               program_invocation_short_name,
               option_summary);

        return 0;
}
55
56 static int parse_argv(int argc, char *argv[]) {
57
58         static const struct option options[] = {
59                 { "help",      no_argument,       NULL, 'h' },
60                 { "directory", required_argument, NULL, 'D' },
61                 { NULL,        0,                 NULL, 0   }
62         };
63
64         int c;
65
66         assert(argc >= 0);
67         assert(argv);
68
69         while ((c = getopt_long(argc, argv, "+hD:", options, NULL)) >= 0) {
70
71                 switch (c) {
72
73                 case 'h':
74                         help();
75                         return 0;
76
77                 case 'D':
78                         free(arg_directory);
79                         if (!(arg_directory = strdup(optarg))) {
80                                 log_error("Failed to duplicate root directory.");
81                                 return -ENOMEM;
82                         }
83
84                         break;
85
86                 case '?':
87                         return -EINVAL;
88
89                 default:
90                         log_error("Unknown option code %c", c);
91                         return -EINVAL;
92                 }
93         }
94
95         return 1;
96 }
97
/* Mount the API file systems listed in mount_table below dest.
 *
 * Every table entry is tried even if an earlier one failed; only an
 * allocation failure stops the loop. Returns 0 if everything worked,
 * otherwise the negative errno-style code of the first failure.
 * Failures of entries not marked fatal are ignored silently. */
static int mount_all(const char *dest) {

        typedef struct MountPoint {
                const char *what;
                const char *where;
                const char *type;
                const char *options;
                unsigned long flags;
                bool fatal;
        } MountPoint;

        static const MountPoint mount_table[] = {
                { "proc",      "/proc",     "proc",      NULL,        MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
                { "/proc/sys", "/proc/sys", "bind",      NULL,        MS_BIND, true },                      /* Bind mount first */
                { "/proc/sys", "/proc/sys", "bind",      NULL,        MS_BIND|MS_RDONLY|MS_REMOUNT, true }, /* Then, make it r/o */
                { "sysfs",     "/sys",      "sysfs",     NULL,        MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_RDONLY, true },
                { "tmpfs",     "/dev",      "tmpfs",     "mode=755",  MS_NOSUID, true },
                { "/dev/pts",  "/dev/pts",  "bind",      NULL,        MS_BIND, true },
                { "tmpfs",     "/dev/.run", "tmpfs",     "mode=755",  MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
#ifdef HAVE_SELINUX
                { "selinux",   "/selinux",  "selinuxfs", NULL,        MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_RDONLY, false },
#endif
        };

        unsigned k;
        int r = 0;

        for (k = 0; k < ELEMENTSOF(mount_table); k++) {
                char *where;
                int t;

                if (asprintf(&where, "%s/%s", dest, mount_table[k].where) < 0) {
                        log_error("Out of memory");

                        if (r == 0)
                                r = -ENOMEM;

                        break;
                }

                /* Only a failure of the check is acted upon here; a
                 * non-negative result does not change what we do. */
                t = path_is_mount_point(where);
                if (t < 0) {
                        log_error("Failed to detect whether %s is a mount point: %s", where, strerror(-t));
                        free(where);

                        if (r == 0)
                                r = t;

                        continue;
                }

                mkdir_p(where, 0755);

                if (mount(mount_table[k].what,
                          where,
                          mount_table[k].type,
                          mount_table[k].flags,
                          mount_table[k].options) < 0 &&
                    mount_table[k].fatal) {

                        /* Remember errno before logging, since the
                         * logging call may overwrite it. */
                        int saved_errno = errno;

                        log_error("mount(%s) failed: %m", where);

                        if (r == 0)
                                r = -saved_errno;
                }

                free(where);
        }

        return r;
}
168
169 static int copy_devnodes(const char *dest) {
170
171         static const char devnodes[] =
172                 "null\0"
173                 "zero\0"
174                 "full\0"
175                 "random\0"
176                 "urandom\0"
177                 "tty\0"
178                 "ptmx\0"
179                 "kmsg\0"
180                 "rtc0\0";
181
182         const char *d;
183         int r = 0, k;
184         char *tty = NULL;
185         dev_t tty_devnum;
186         mode_t u;
187
188         u = umask(0000);
189
190         NULSTR_FOREACH(d, devnodes) {
191                 char *from = NULL, *to = NULL;
192                 struct stat st;
193
194                 asprintf(&from, "/dev/%s", d);
195                 asprintf(&to, "%s/dev/%s", dest, d);
196
197                 if (!from || !to) {
198                         log_error("Failed to allocate devnode path");
199
200                         free(from);
201                         free(to);
202
203                         if (r == 0)
204                                 r = -ENOMEM;
205
206                         break;
207                 }
208
209                 if (stat(from, &st) < 0) {
210
211                         if (errno != ENOENT) {
212                                 log_error("Failed to stat %s: %m", from);
213
214                                 if (r == 0)
215                                         r = -errno;
216                         }
217
218                 } else {
219                         if (mknod(to, st.st_mode, st.st_rdev) < 0) {
220                                 log_error("mknod(%s) failed: %m", dest);
221
222                                 if (r == 0)
223                                         r = -errno;
224                         }
225                 }
226
227                 free(from);
228                 free(to);
229         }
230
231         if ((k = get_ctty(&tty, &tty_devnum)) < 0) {
232                 log_error("Failed to determine controlling tty: %s", strerror(-k));
233
234                 if (r == 0)
235                         r = k;
236         } else {
237                 char *from = NULL, *to = NULL;
238
239                 asprintf(&from, "/dev/%s", tty);
240                 asprintf(&to, "%s/dev/console", dest);
241
242                 if (!from || !to) {
243                         log_error("Out of memory");
244
245                         if (r == 0)
246                                 r = k;
247                 } else {
248                         /* We need to bind mount our own tty on
249                          * /dev/console, since ptys cannot be used
250                          * unless on a devpts file system. But to bind
251                          * mount it we first have to create a device
252                          * node where we can bind mount it on. This is
253                          * kinda ugly since the TTY will very likely
254                          * be owned by a user/group that does not
255                          * exist in the container. */
256
257                         if (mknod(to, S_IFCHR|0600, tty_devnum) < 0) {
258                                 log_error("mknod for /dev/console failed: %m");
259
260                                 if (r == 0)
261                                         r = -errno;
262                         }
263
264                         if (mount(from, to, "bind", MS_BIND|MS_RDONLY, NULL) < 0) {
265                                 log_error("bind mount for /dev/console failed: %m");
266
267                                 if (r == 0)
268                                         r = -errno;
269                         }
270                 }
271
272                 free(from);
273                 free(to);
274         }
275
276         free(tty);
277
278         umask(u);
279
280         return r;
281 }
282
/* Remove every capability that is not listed in retain[] from the
 * capability bounding set of the calling process.
 *
 * Capability numbers the kernel does not know (prctl() failing with
 * EINVAL) are skipped. Returns 0 on success, or the negative errno of
 * the first prctl() call that failed for any other reason. */
static int drop_capabilities(void) {
        static const unsigned long retain[] = {
                CAP_CHOWN,
                CAP_DAC_OVERRIDE,
                CAP_DAC_READ_SEARCH,
                CAP_FOWNER,
                CAP_FSETID,
                CAP_IPC_OWNER,
                CAP_KILL,
                CAP_LEASE,
                CAP_LINUX_IMMUTABLE,
                CAP_NET_BIND_SERVICE,
                CAP_NET_BROADCAST,
                CAP_NET_RAW,
                CAP_SETGID,
                CAP_SETFCAP,
                CAP_SETPCAP,
                CAP_SETUID,
                CAP_SYS_ADMIN,
                CAP_SYS_CHROOT,
                CAP_SYS_NICE,
                CAP_SYS_PTRACE,
                CAP_SYS_TTY_CONFIG
        };

        unsigned long l;

        /* Walk at least up to capability 63, or further if the headers
         * we were built against define a higher CAP_LAST_CAP. */
        for (l = 0; l <= MAX(63LU, (unsigned long) CAP_LAST_CAP); l ++) {
                unsigned i;

                for (i = 0; i < ELEMENTSOF(retain); i++)
                        if (retain[i] == l)
                                break;

                /* Found in the retain list, keep it */
                if (i < ELEMENTSOF(retain))
                        continue;

                if (prctl(PR_CAPBSET_DROP, l) < 0) {
                        /* Remember errno before logging, since the
                         * logging call may overwrite it. */
                        int saved_errno = errno;

                        /* If this capability is not known, EINVAL
                         * will be returned, let's ignore this. */
                        if (saved_errno == EINVAL)
                                continue;

                        log_error("PR_CAPBSET_DROP failed: %m");
                        return -saved_errno;
                }
        }

        return 0;
}
334
/* Check whether path looks like the root of an OS installation.
 *
 * The presence of <path>/bin/sh is used as the indicator. Returns 1 if
 * it is accessible, 0 if it is not, and -ENOMEM if the path to test
 * could not be allocated. */
static int is_os_tree(const char *path) {
        char *shell_path = NULL;
        int found;

        if (asprintf(&shell_path, "%s/bin/sh", path) < 0)
                return -ENOMEM;

        found = access(shell_path, F_OK) >= 0;
        free(shell_path);

        return found;
}
348
349
350 int main(int argc, char *argv[]) {
351         pid_t pid = 0;
352         int r = EXIT_FAILURE, k;
353         char *oldcg = NULL, *newcg = NULL;
354
355         log_parse_environment();
356         log_open();
357
358         if ((r = parse_argv(argc, argv)) <= 0)
359                 goto finish;
360
361         if (arg_directory) {
362                 char *p;
363
364                 p = path_make_absolute_cwd(arg_directory);
365                 free(arg_directory);
366                 arg_directory = p;
367         } else
368                 arg_directory = get_current_dir_name();
369
370         if (!arg_directory) {
371                 log_error("Failed to determine path");
372                 goto finish;
373         }
374
375         path_kill_slashes(arg_directory);
376
377         if (geteuid() != 0) {
378                 log_error("Need to be root.");
379                 goto finish;
380         }
381
382         if (sd_booted() <= 0) {
383                 log_error("Not running on a systemd system.");
384                 goto finish;
385         }
386
387         if (path_equal(arg_directory, "/")) {
388                 log_error("Spawning container on root directory not supported.");
389                 goto finish;
390         }
391
392         if (is_os_tree(arg_directory) <= 0) {
393                 log_error("Directory %s doesn't look like an OS root directory. Refusing.", arg_directory);
394                 goto finish;
395         }
396
397         log_info("Spawning namespace container on %s.", arg_directory);
398
399         if ((k = cg_get_by_pid(SYSTEMD_CGROUP_CONTROLLER, 0, &oldcg)) < 0) {
400                 log_error("Failed to determine current cgroup: %s", strerror(-k));
401                 goto finish;
402         }
403
404         if (asprintf(&newcg, "%s/nspawn-%lu", oldcg, (unsigned long) getpid()) < 0) {
405                 log_error("Failed to allocate cgroup path.");
406                 goto finish;
407         }
408
409         if ((k = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, newcg, 0)) < 0)  {
410                 log_error("Failed to create cgroup: %s", strerror(-k));
411                 goto finish;
412         }
413
414         if ((pid = syscall(__NR_clone, SIGCHLD|CLONE_NEWIPC|CLONE_NEWNS|CLONE_NEWPID|CLONE_NEWUTS, NULL)) < 0) {
415                 log_error("clone() failed: %m");
416                 goto finish;
417         }
418
419         if (pid == 0) {
420                 const char *hn;
421                 const char *envp[] = {
422                         "HOME=/root",
423                         "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
424                         NULL
425                 };
426
427                 /* child */
428
429                 if (mount_all(arg_directory) < 0)
430                         goto child_fail;
431
432                 if (copy_devnodes(arg_directory) < 0)
433                         goto child_fail;
434
435                 if (chdir(arg_directory) < 0) {
436                         log_error("chdir(%s) failed: %m", arg_directory);
437                         goto child_fail;
438                 }
439                 if (mount(arg_directory, "/", "bind", MS_BIND|MS_MOVE, NULL) < 0) {
440                         log_error("mount(MS_MOVE) failed: %m");
441                         goto child_fail;
442                 }
443
444                 if (chroot(".") < 0) {
445                         log_error("chroot() failed: %m");
446                         goto child_fail;
447                 }
448
449                 if (chdir("/") < 0) {
450                         log_error("chdir() failed: %m");
451                         goto child_fail;
452                 }
453
454                 if (drop_capabilities() < 0)
455                         goto child_fail;
456
457                 if ((hn = file_name_from_path(arg_directory)))
458                         sethostname(hn, strlen(hn));
459
460                 if (argc > optind)
461                         execvpe(argv[optind], argv + optind, (char**) envp);
462                 else {
463                         chdir("/root");
464                         execle("/bin/bash", "-bash", NULL, (char**) envp);
465                 }
466
467                 log_error("execv() failed: %m");
468
469         child_fail:
470                 _exit(EXIT_FAILURE);
471         }
472
473         r = wait_for_terminate_and_warn(argc > optind ? argv[optind] : "bash", pid);
474
475         if (r < 0)
476                 r = EXIT_FAILURE;
477
478 finish:
479         if (oldcg)
480                 cg_attach(SYSTEMD_CGROUP_CONTROLLER, oldcg, 0);
481
482         if (newcg)
483                 cg_kill_recursive_and_wait(SYSTEMD_CGROUP_CONTROLLER, newcg, true);
484
485         free(arg_directory);
486         free(oldcg);
487         free(newcg);
488
489         return r;
490 }