chiark / gitweb /
nspawn: _cleanup_free_ more
[elogind.git] / src / nspawn / nspawn.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4   This file is part of systemd.
5
6   Copyright 2010 Lennart Poettering
7
8   systemd is free software; you can redistribute it and/or modify it
9   under the terms of the GNU Lesser General Public License as published by
10   the Free Software Foundation; either version 2.1 of the License, or
11   (at your option) any later version.
12
13   systemd is distributed in the hope that it will be useful, but
14   WITHOUT ANY WARRANTY; without even the implied warranty of
15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16   Lesser General Public License for more details.
17
18   You should have received a copy of the GNU Lesser General Public License
19   along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <signal.h>
23 #include <sched.h>
24 #include <unistd.h>
25 #include <sys/types.h>
26 #include <sys/syscall.h>
27 #include <sys/mount.h>
28 #include <sys/wait.h>
29 #include <stdlib.h>
30 #include <string.h>
31 #include <stdio.h>
32 #include <errno.h>
33 #include <sys/prctl.h>
34 #include <sys/capability.h>
35 #include <getopt.h>
36 #include <sys/epoll.h>
37 #include <termios.h>
38 #include <sys/signalfd.h>
39 #include <grp.h>
40 #include <linux/fs.h>
41 #include <sys/un.h>
42 #include <sys/socket.h>
43
44 #include <systemd/sd-daemon.h>
45
46 #include "log.h"
47 #include "util.h"
48 #include "mkdir.h"
49 #include "macro.h"
50 #include "audit.h"
51 #include "missing.h"
52 #include "cgroup-util.h"
53 #include "strv.h"
54 #include "path-util.h"
55 #include "loopback-setup.h"
56 #include "sd-id128.h"
57 #include "dev-setup.h"
58
/* How the journal directory of the container and the host's
 * /var/log/journal/<machine-id> are tied together. Selected with
 * --link-journal= (or -j) in parse_argv() and acted upon in
 * setup_journal(). */
typedef enum LinkJournal {
        LINK_NO,    /* setup_journal() returns immediately without touching anything */
        LINK_AUTO,  /* default; setup_journal() quietly succeeds when the machine ID or the host directory is missing */
        LINK_HOST,  /* the host directory is created and bind mounted into the container */
        LINK_GUEST  /* the host path becomes a symlink pointing into the container tree */
} LinkJournal;
65
/* Command line state, filled in by parse_argv(). */
static char *arg_directory = NULL;        /* -D/--directory=, canonicalized, heap allocated */
static char *arg_user = NULL;             /* -u/--user=, heap allocated copy */
static char **arg_controllers = NULL;     /* -C/--controllers=, split on "," with duplicates removed */
static char *arg_uuid = NULL;             /* --uuid=; points straight at optarg, not a copy */
static bool arg_private_network = false;  /* --private-network; also makes setup_resolv_conf() a no-op */
static bool arg_read_only = false;        /* --read-only */
static bool arg_boot = false;             /* -b/--boot */
static LinkJournal arg_link_journal = LINK_AUTO; /* --link-journal= / -j */

/* Bit mask (one bit per CAP_* number) of the capabilities that are kept;
 * drop_capabilities() passes the complement to
 * capability_bounding_set_drop(). --capability= ORs further bits in. */
static uint64_t arg_retain =
        (1ULL << CAP_CHOWN) |
        (1ULL << CAP_DAC_OVERRIDE) |
        (1ULL << CAP_DAC_READ_SEARCH) |
        (1ULL << CAP_FOWNER) |
        (1ULL << CAP_FSETID) |
        (1ULL << CAP_IPC_OWNER) |
        (1ULL << CAP_KILL) |
        (1ULL << CAP_LEASE) |
        (1ULL << CAP_LINUX_IMMUTABLE) |
        (1ULL << CAP_NET_BIND_SERVICE) |
        (1ULL << CAP_NET_BROADCAST) |
        (1ULL << CAP_NET_RAW) |
        (1ULL << CAP_SETGID) |
        (1ULL << CAP_SETFCAP) |
        (1ULL << CAP_SETPCAP) |
        (1ULL << CAP_SETUID) |
        (1ULL << CAP_SYS_ADMIN) |
        (1ULL << CAP_SYS_CHROOT) |
        (1ULL << CAP_SYS_NICE) |
        (1ULL << CAP_SYS_PTRACE) |
        (1ULL << CAP_SYS_TTY_CONFIG) |
        (1ULL << CAP_SYS_RESOURCE) |
        (1ULL << CAP_SYS_BOOT);
98
/* Print the usage summary to stdout.
 *
 * The text for -j has to agree with parse_argv(), which maps -j to
 * LINK_GUEST, i.e. the same as --link-journal=guest.
 *
 * Always returns 0. */
static int help(void) {

        printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
               "Spawn a minimal namespace container for debugging, testing and building.\n\n"
               "  -h --help               Show this help\n"
               "  -D --directory=NAME     Root directory for the container\n"
               "  -b --boot               Boot up full system (i.e. invoke init)\n"
               "  -u --user=USER          Run the command under specified user or uid\n"
               "  -C --controllers=LIST   Put the container in specified comma-separated cgroup hierarchies\n"
               "     --uuid=UUID          Set a specific machine UUID for the container\n"
               "     --private-network    Disable network in container\n"
               "     --read-only          Mount the root directory read-only\n"
               "     --capability=CAP     In addition to the default, retain specified capability\n"
               "     --link-journal=MODE  Link up guest journal, one of no, auto, guest, host\n"
               "  -j                      Equivalent to --link-journal=guest\n",
               program_invocation_short_name);

        return 0;
}
118
119 static int parse_argv(int argc, char *argv[]) {
120
121         enum {
122                 ARG_PRIVATE_NETWORK = 0x100,
123                 ARG_UUID,
124                 ARG_READ_ONLY,
125                 ARG_CAPABILITY,
126                 ARG_LINK_JOURNAL
127         };
128
129         static const struct option options[] = {
130                 { "help",            no_argument,       NULL, 'h'                 },
131                 { "directory",       required_argument, NULL, 'D'                 },
132                 { "user",            required_argument, NULL, 'u'                 },
133                 { "controllers",     required_argument, NULL, 'C'                 },
134                 { "private-network", no_argument,       NULL, ARG_PRIVATE_NETWORK },
135                 { "boot",            no_argument,       NULL, 'b'                 },
136                 { "uuid",            required_argument, NULL, ARG_UUID            },
137                 { "read-only",       no_argument,       NULL, ARG_READ_ONLY       },
138                 { "capability",      required_argument, NULL, ARG_CAPABILITY      },
139                 { "link-journal",    required_argument, NULL, ARG_LINK_JOURNAL    },
140                 { NULL,              0,                 NULL, 0                   }
141         };
142
143         int c;
144
145         assert(argc >= 0);
146         assert(argv);
147
148         while ((c = getopt_long(argc, argv, "+hD:u:C:bj", options, NULL)) >= 0) {
149
150                 switch (c) {
151
152                 case 'h':
153                         help();
154                         return 0;
155
156                 case 'D':
157                         free(arg_directory);
158                         arg_directory = canonicalize_file_name(optarg);
159                         if (!arg_directory) {
160                                 log_error("Failed to canonicalize root directory.");
161                                 return -ENOMEM;
162                         }
163
164                         break;
165
166                 case 'u':
167                         free(arg_user);
168                         if (!(arg_user = strdup(optarg))) {
169                                 log_error("Failed to duplicate user name.");
170                                 return -ENOMEM;
171                         }
172
173                         break;
174
175                 case 'C':
176                         strv_free(arg_controllers);
177                         arg_controllers = strv_split(optarg, ",");
178                         if (!arg_controllers) {
179                                 log_error("Failed to split controllers list.");
180                                 return -ENOMEM;
181                         }
182                         strv_uniq(arg_controllers);
183
184                         break;
185
186                 case ARG_PRIVATE_NETWORK:
187                         arg_private_network = true;
188                         break;
189
190                 case 'b':
191                         arg_boot = true;
192                         break;
193
194                 case ARG_UUID:
195                         arg_uuid = optarg;
196                         break;
197
198                 case ARG_READ_ONLY:
199                         arg_read_only = true;
200                         break;
201
202                 case ARG_CAPABILITY: {
203                         char *state, *word;
204                         size_t length;
205
206                         FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
207                                 cap_value_t cap;
208                                 char *t;
209
210                                 t = strndup(word, length);
211                                 if (!t)
212                                         return log_oom();
213
214                                 if (cap_from_name(t, &cap) < 0) {
215                                         log_error("Failed to parse capability %s.", t);
216                                         free(t);
217                                         return -EINVAL;
218                                 }
219
220                                 free(t);
221                                 arg_retain |= 1ULL << (uint64_t) cap;
222                         }
223
224                         break;
225                 }
226
227                 case 'j':
228                         arg_link_journal = LINK_GUEST;
229                         break;
230
231                 case ARG_LINK_JOURNAL:
232                         if (streq(optarg, "auto"))
233                                 arg_link_journal = LINK_AUTO;
234                         else if (streq(optarg, "no"))
235                                 arg_link_journal = LINK_NO;
236                         else if (streq(optarg, "guest"))
237                                 arg_link_journal = LINK_GUEST;
238                         else if (streq(optarg, "host"))
239                                 arg_link_journal = LINK_HOST;
240                         else {
241                                 log_error("Failed to parse link journal mode %s", optarg);
242                                 return -EINVAL;
243                         }
244
245                         break;
246
247                 case '?':
248                         return -EINVAL;
249
250                 default:
251                         log_error("Unknown option code %c", c);
252                         return -EINVAL;
253                 }
254         }
255
256         return 1;
257 }
258
259 static int mount_all(const char *dest) {
260
261         typedef struct MountPoint {
262                 const char *what;
263                 const char *where;
264                 const char *type;
265                 const char *options;
266                 unsigned long flags;
267                 bool fatal;
268         } MountPoint;
269
270         static const MountPoint mount_table[] = {
271                 { "proc",      "/proc",     "proc",  NULL,       MS_NOSUID|MS_NOEXEC|MS_NODEV, true  },
272                 { "/proc/sys", "/proc/sys", NULL,    NULL,       MS_BIND, true                       },   /* Bind mount first */
273                 { NULL,        "/proc/sys", NULL,    NULL,       MS_BIND|MS_RDONLY|MS_REMOUNT, true  },   /* Then, make it r/o */
274                 { "sysfs",     "/sys",      "sysfs", NULL,       MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true  },
275                 { "tmpfs",     "/dev",      "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME,     true  },
276                 { "/dev/pts",  "/dev/pts",  NULL,    NULL,       MS_BIND,                      true  },
277                 { "tmpfs",     "/dev/shm",  "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true  },
278                 { "tmpfs",     "/run",      "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true  },
279 #ifdef HAVE_SELINUX
280                 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND,                      false },  /* Bind mount first */
281                 { NULL,              "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, false },  /* Then, make it r/o */
282 #endif
283         };
284
285         unsigned k;
286         int r = 0;
287         char _cleanup_free_ *where = NULL;
288
289         for (k = 0; k < ELEMENTSOF(mount_table); k++) {
290                 int t;
291
292                 if (asprintf(&where, "%s/%s", dest, mount_table[k].where) < 0) {
293                         log_oom();
294
295                         if (r == 0)
296                                 r = -ENOMEM;
297
298                         break;
299                 }
300
301                 t = path_is_mount_point(where, true);
302                 if (t < 0) {
303                         log_error("Failed to detect whether %s is a mount point: %s", where, strerror(-t));
304
305                         if (r == 0)
306                                 r = t;
307
308                         continue;
309                 }
310
311                 /* Skip this entry if it is not a remount. */
312                 if (mount_table[k].what && t > 0)
313                         continue;
314
315                 mkdir_p_label(where, 0755);
316
317                 if (mount(mount_table[k].what,
318                           where,
319                           mount_table[k].type,
320                           mount_table[k].flags,
321                           mount_table[k].options) < 0 &&
322                     mount_table[k].fatal) {
323
324                         log_error("mount(%s) failed: %m", where);
325
326                         if (r == 0)
327                                 r = -errno;
328                 }
329         }
330
331         return r;
332 }
333
/* Make the host's time zone configuration visible inside the container
 * by bind mounting /etc/localtime and /etc/timezone read-only over the
 * files of the same name below "dest".
 *
 * Mount failures are deliberately ignored; only running out of memory
 * is reported (via log_oom()). Returns 0 otherwise. */
static int setup_timezone(const char *dest) {
        static const char * const files[] = {
                "/etc/localtime",
                "/etc/timezone"
        };
        unsigned i;

        assert(dest);

        for (i = 0; i < ELEMENTSOF(files); i++) {
                char *target;

                target = strappend(dest, files[i]);
                if (!target)
                        return log_oom();

                /* Bind first, then flip the new mount to read-only */
                if (mount(files[i], target, "bind", MS_BIND, NULL) >= 0)
                        mount(files[i], target, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);

                free(target);
        }

        return 0;
}
360
361 static int setup_resolv_conf(const char *dest) {
362         char *where;
363
364         assert(dest);
365
366         if (arg_private_network)
367                 return 0;
368
369         /* Fix resolv.conf, if possible */
370         where = strappend(dest, "/etc/resolv.conf");
371         if (!where)
372                 return log_oom();
373
374         if (mount("/etc/resolv.conf", where, "bind", MS_BIND, NULL) >= 0)
375                 mount("/etc/resolv.conf", where, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);
376
377         free(where);
378
379         return 0;
380 }
381
382 static int setup_boot_id(const char *dest) {
383         char _cleanup_free_ *from = NULL, *to = NULL;
384         sd_id128_t rnd;
385         char as_uuid[37];
386         int r;
387
388         assert(dest);
389
390         /* Generate a new randomized boot ID, so that each boot-up of
391          * the container gets a new one */
392
393         from = strappend(dest, "/dev/proc-sys-kernel-random-boot-id");
394         to = strappend(dest, "/proc/sys/kernel/random/boot_id");
395         if (!from || !to)
396                 return log_oom();
397
398         r = sd_id128_randomize(&rnd);
399         if (r < 0) {
400                 log_error("Failed to generate random boot id: %s", strerror(-r));
401                 return r;
402         }
403
404         snprintf(as_uuid, sizeof(as_uuid),
405                  "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
406                  SD_ID128_FORMAT_VAL(rnd));
407         char_array_0(as_uuid);
408
409         r = write_one_line_file(from, as_uuid);
410         if (r < 0) {
411                 log_error("Failed to write boot id: %s", strerror(-r));
412                 return r;
413         }
414
415         if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
416                 log_error("Failed to bind mount boot id: %m");
417                 r = -errno;
418         } else
419                 mount(from, to, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);
420
421         unlink(from);
422         return r;
423 }
424
425 static int copy_devnodes(const char *dest) {
426
427         static const char devnodes[] =
428                 "null\0"
429                 "zero\0"
430                 "full\0"
431                 "random\0"
432                 "urandom\0"
433                 "tty\0"
434                 "ptmx\0";
435
436         const char *d;
437         int r = 0;
438         mode_t u;
439
440         assert(dest);
441
442         u = umask(0000);
443
444         NULSTR_FOREACH(d, devnodes) {
445                 struct stat st;
446                 char _cleanup_free_ *from = NULL, *to = NULL;
447
448                 asprintf(&from, "/dev/%s", d);
449                 asprintf(&to, "%s/dev/%s", dest, d);
450
451                 if (!from || !to) {
452                         log_oom();
453
454                         if (r == 0)
455                                 r = -ENOMEM;
456
457                         break;
458                 }
459
460                 if (stat(from, &st) < 0) {
461
462                         if (errno != ENOENT) {
463                                 log_error("Failed to stat %s: %m", from);
464                                 if (r == 0)
465                                         r = -errno;
466                         }
467
468                 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
469
470                         log_error("%s is not a char or block device, cannot copy", from);
471                         if (r == 0)
472                                 r = -EIO;
473
474                 } else if (mknod(to, st.st_mode, st.st_rdev) < 0) {
475
476                         log_error("mknod(%s) failed: %m", dest);
477                         if (r == 0)
478                                 r = -errno;
479                 }
480         }
481
482         umask(u);
483
484         return r;
485 }
486
487 static int setup_dev_console(const char *dest, const char *console) {
488         struct stat st;
489         char _cleanup_free_ *to = NULL;
490         int r;
491         mode_t u;
492
493         assert(dest);
494         assert(console);
495
496         u = umask(0000);
497
498         if (stat(console, &st) < 0) {
499                 log_error("Failed to stat %s: %m", console);
500                 r = -errno;
501                 goto finish;
502
503         } else if (!S_ISCHR(st.st_mode)) {
504                 log_error("/dev/console is not a char device.");
505                 r = -EIO;
506                 goto finish;
507         }
508
509         r = chmod_and_chown(console, 0600, 0, 0);
510         if (r < 0) {
511                 log_error("Failed to correct access mode for TTY: %s", strerror(-r));
512                 goto finish;
513         }
514
515         if (asprintf(&to, "%s/dev/console", dest) < 0) {
516                 r = log_oom();
517                 goto finish;
518         }
519
520         /* We need to bind mount the right tty to /dev/console since
521          * ptys can only exist on pts file systems. To have something
522          * to bind mount things on we create a device node first, that
523          * has the right major/minor (note that the major minor
524          * doesn't actually matter here, since we mount it over
525          * anyway). */
526
527         if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0) {
528                 log_error("mknod() for /dev/console failed: %m");
529                 r = -errno;
530                 goto finish;
531         }
532
533         if (mount(console, to, "bind", MS_BIND, NULL) < 0) {
534                 log_error("Bind mount for /dev/console failed: %m");
535                 r = -errno;
536                 goto finish;
537         }
538
539 finish:
540         umask(u);
541
542         return r;
543 }
544
545 static int setup_kmsg(const char *dest, int kmsg_socket) {
546         char _cleanup_free_ *from = NULL, *to = NULL;
547         int r, fd, k;
548         mode_t u;
549         union {
550                 struct cmsghdr cmsghdr;
551                 uint8_t buf[CMSG_SPACE(sizeof(int))];
552         } control;
553         struct msghdr mh;
554         struct cmsghdr *cmsg;
555
556         assert(dest);
557         assert(kmsg_socket >= 0);
558
559         u = umask(0000);
560
561         /* We create the kmsg FIFO as /dev/kmsg, but immediately
562          * delete it after bind mounting it to /proc/kmsg. While FIFOs
563          * on the reading side behave very similar to /proc/kmsg,
564          * their writing side behaves differently from /dev/kmsg in
565          * that writing blocks when nothing is reading. In order to
566          * avoid any problems with containers deadlocking due to this
567          * we simply make /dev/kmsg unavailable to the container. */
568         if (asprintf(&from, "%s/dev/kmsg", dest) < 0) {
569                 r = log_oom();
570                 goto finish;
571         }
572
573         if (asprintf(&to, "%s/proc/kmsg", dest) < 0) {
574                 r = log_oom();
575                 goto finish;
576         }
577
578         if (mkfifo(from, 0600) < 0) {
579                 log_error("mkfifo() for /dev/kmsg failed: %m");
580                 r = -errno;
581                 goto finish;
582         }
583
584         r = chmod_and_chown(from, 0600, 0, 0);
585         if (r < 0) {
586                 log_error("Failed to correct access mode for /dev/kmsg: %s", strerror(-r));
587                 goto finish;
588         }
589
590         if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
591                 log_error("Bind mount for /proc/kmsg failed: %m");
592                 r = -errno;
593                 goto finish;
594         }
595
596         fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
597         if (fd < 0) {
598                 log_error("Failed to open fifo: %m");
599                 r = -errno;
600                 goto finish;
601         }
602
603         zero(mh);
604         zero(control);
605
606         mh.msg_control = &control;
607         mh.msg_controllen = sizeof(control);
608
609         cmsg = CMSG_FIRSTHDR(&mh);
610         cmsg->cmsg_level = SOL_SOCKET;
611         cmsg->cmsg_type = SCM_RIGHTS;
612         cmsg->cmsg_len = CMSG_LEN(sizeof(int));
613         memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
614
615         mh.msg_controllen = cmsg->cmsg_len;
616
617         /* Store away the fd in the socket, so that it stays open as
618          * long as we run the child */
619         k = sendmsg(kmsg_socket, &mh, MSG_DONTWAIT|MSG_NOSIGNAL);
620         close_nointr_nofail(fd);
621
622         if (k < 0) {
623                 log_error("Failed to send FIFO fd: %m");
624                 r = -errno;
625                 goto finish;
626         }
627
628         /* And now make the FIFO unavailable as /dev/kmsg... */
629         unlink(from);
630
631 finish:
632         umask(u);
633
634         return r;
635 }
636
637 static int setup_hostname(void) {
638         char *hn;
639         int r = 0;
640
641         hn = path_get_file_name(arg_directory);
642         if (hn) {
643                 hn = strdup(hn);
644                 if (!hn)
645                         return -ENOMEM;
646
647                 hostname_cleanup(hn);
648
649                 if (!isempty(hn))
650                         if (sethostname(hn, strlen(hn)) < 0)
651                                 r = -errno;
652
653                 free(hn);
654         }
655
656         return r;
657 }
658
659 static int setup_journal(const char *directory) {
660         sd_id128_t machine_id;
661         char *p = NULL, *b = NULL, *l, *q = NULL, *d = NULL;
662         int r;
663
664         if (arg_link_journal == LINK_NO)
665                 return 0;
666
667         p = strappend(directory, "/etc/machine-id");
668         if (!p) {
669                 r = log_oom();
670                 goto finish;
671         }
672
673         r = read_one_line_file(p, &b);
674         if (r == -ENOENT && arg_link_journal == LINK_AUTO) {
675                 r = 0;
676                 goto finish;
677         } else if (r < 0) {
678                 log_error("Failed to read machine ID: %s", strerror(-r));
679                 return r;
680         }
681
682         l = strstrip(b);
683         if (isempty(l) && arg_link_journal == LINK_AUTO) {
684                 r = 0;
685                 goto finish;
686         }
687
688         /* Verify validaty */
689         r = sd_id128_from_string(l, &machine_id);
690         if (r < 0) {
691                 log_error("Failed to parse machine ID: %s", strerror(-r));
692                 goto finish;
693         }
694
695         free(p);
696         p = strappend("/var/log/journal/", l);
697         q = strjoin(directory, "/var/log/journal/", l, NULL);
698         if (!p || !q) {
699                 r = log_oom();
700                 goto finish;
701         }
702
703         if (path_is_mount_point(p, false) > 0 ||
704             path_is_mount_point(q, false) > 0) {
705                 if (arg_link_journal != LINK_AUTO) {
706                         log_error("Journal already a mount point, refusing.");
707                         r = -EEXIST;
708                         goto finish;
709                 }
710
711                 r = 0;
712                 goto finish;
713         }
714
715         r = readlink_and_make_absolute(p, &d);
716         if (r >= 0) {
717                 if ((arg_link_journal == LINK_GUEST ||
718                      arg_link_journal == LINK_AUTO) &&
719                     path_equal(d, q)) {
720
721                         mkdir_p(q, 0755);
722
723                         r = 0;
724                         goto finish;
725                 }
726
727                 if (unlink(p) < 0) {
728                         log_error("Failed to remove symlink %s: %m", p);
729                         r = -errno;
730                         goto finish;
731                 }
732         } else if (r == -EINVAL) {
733
734                 if (arg_link_journal == LINK_GUEST &&
735                     rmdir(p) < 0) {
736
737                         if (errno == ENOTDIR)
738                                 log_error("%s already exists and is neither symlink nor directory.", p);
739                         else {
740                                 log_error("Failed to remove %s: %m", p);
741                                 r = -errno;
742                         }
743
744                         goto finish;
745                 }
746         } else if (r != -ENOENT) {
747                 log_error("readlink(%s) failed: %m", p);
748                 goto finish;
749         }
750
751         if (arg_link_journal == LINK_GUEST) {
752
753                 if (symlink(q, p) < 0) {
754                         log_error("Failed to symlink %s to %s: %m", q, p);
755                         r = -errno;
756                         goto finish;
757                 }
758
759                 mkdir_p(q, 0755);
760
761                 r = 0;
762                 goto finish;
763         }
764
765         if (arg_link_journal == LINK_HOST) {
766                 r = mkdir_p(p, 0755);
767                 if (r < 0) {
768                         log_error("Failed to create %s: %m", p);
769                         goto finish;
770                 }
771
772         } else if (access(p, F_OK) < 0) {
773                 r = 0;
774                 goto finish;
775         }
776
777         if (dir_is_empty(q) == 0) {
778                 log_error("%s not empty.", q);
779                 r = -ENOTEMPTY;
780                 goto finish;
781         }
782
783         r = mkdir_p(q, 0755);
784         if (r < 0) {
785                 log_error("Failed to create %s: %m", q);
786                 goto finish;
787         }
788
789         if (mount(p, q, "bind", MS_BIND, NULL) < 0) {
790                 log_error("Failed to bind mount journal from host into guest: %m");
791                 r = -errno;
792                 goto finish;
793         }
794
795         r = 0;
796
797 finish:
798         free(p);
799         free(q);
800         free(d);
801         free(b);
802         return r;
803
804 }
805
806 static int drop_capabilities(void) {
807         return capability_bounding_set_drop(~arg_retain, false);
808 }
809
/* Check whether "path" looks like the root of an OS installation. The
 * presence of <path>/bin/sh is used as the indicator.
 *
 * Returns 1 if the file is accessible, 0 if it is not, and -ENOMEM if
 * the path could not be assembled. */
static int is_os_tree(const char *path) {
        char *probe;
        int found;

        if (asprintf(&probe, "%s/bin/sh", path) < 0)
                return -ENOMEM;

        found = access(probe, F_OK) >= 0;
        free(probe);

        return found ? 1 : 0;
}
823
824 static int process_pty(int master, sigset_t *mask) {
825
826         char in_buffer[LINE_MAX], out_buffer[LINE_MAX];
827         size_t in_buffer_full = 0, out_buffer_full = 0;
828         struct epoll_event stdin_ev, stdout_ev, master_ev, signal_ev;
829         bool stdin_readable = false, stdout_writable = false, master_readable = false, master_writable = false;
830         int ep = -1, signal_fd = -1, r;
831
832         fd_nonblock(STDIN_FILENO, 1);
833         fd_nonblock(STDOUT_FILENO, 1);
834         fd_nonblock(master, 1);
835
836         signal_fd = signalfd(-1, mask, SFD_NONBLOCK|SFD_CLOEXEC);
837         if (signal_fd < 0) {
838                 log_error("signalfd(): %m");
839                 r = -errno;
840                 goto finish;
841         }
842
843         ep = epoll_create1(EPOLL_CLOEXEC);
844         if (ep < 0) {
845                 log_error("Failed to create epoll: %m");
846                 r = -errno;
847                 goto finish;
848         }
849
850         zero(stdin_ev);
851         stdin_ev.events = EPOLLIN|EPOLLET;
852         stdin_ev.data.fd = STDIN_FILENO;
853
854         zero(stdout_ev);
855         stdout_ev.events = EPOLLOUT|EPOLLET;
856         stdout_ev.data.fd = STDOUT_FILENO;
857
858         zero(master_ev);
859         master_ev.events = EPOLLIN|EPOLLOUT|EPOLLET;
860         master_ev.data.fd = master;
861
862         zero(signal_ev);
863         signal_ev.events = EPOLLIN;
864         signal_ev.data.fd = signal_fd;
865
866         if (epoll_ctl(ep, EPOLL_CTL_ADD, STDIN_FILENO, &stdin_ev) < 0 ||
867             epoll_ctl(ep, EPOLL_CTL_ADD, STDOUT_FILENO, &stdout_ev) < 0 ||
868             epoll_ctl(ep, EPOLL_CTL_ADD, master, &master_ev) < 0 ||
869             epoll_ctl(ep, EPOLL_CTL_ADD, signal_fd, &signal_ev) < 0) {
870                 log_error("Failed to regiser fds in epoll: %m");
871                 r = -errno;
872                 goto finish;
873         }
874
875         for (;;) {
876                 struct epoll_event ev[16];
877                 ssize_t k;
878                 int i, nfds;
879
880                 nfds = epoll_wait(ep, ev, ELEMENTSOF(ev), -1);
881                 if (nfds < 0) {
882
883                         if (errno == EINTR || errno == EAGAIN)
884                                 continue;
885
886                         log_error("epoll_wait(): %m");
887                         r = -errno;
888                         goto finish;
889                 }
890
891                 assert(nfds >= 1);
892
893                 for (i = 0; i < nfds; i++) {
894                         if (ev[i].data.fd == STDIN_FILENO) {
895
896                                 if (ev[i].events & (EPOLLIN|EPOLLHUP))
897                                         stdin_readable = true;
898
899                         } else if (ev[i].data.fd == STDOUT_FILENO) {
900
901                                 if (ev[i].events & (EPOLLOUT|EPOLLHUP))
902                                         stdout_writable = true;
903
904                         } else if (ev[i].data.fd == master) {
905
906                                 if (ev[i].events & (EPOLLIN|EPOLLHUP))
907                                         master_readable = true;
908
909                                 if (ev[i].events & (EPOLLOUT|EPOLLHUP))
910                                         master_writable = true;
911
912                         } else if (ev[i].data.fd == signal_fd) {
913                                 struct signalfd_siginfo sfsi;
914                                 ssize_t n;
915
916                                 n = read(signal_fd, &sfsi, sizeof(sfsi));
917                                 if (n != sizeof(sfsi)) {
918
919                                         if (n >= 0) {
920                                                 log_error("Failed to read from signalfd: invalid block size");
921                                                 r = -EIO;
922                                                 goto finish;
923                                         }
924
925                                         if (errno != EINTR && errno != EAGAIN) {
926                                                 log_error("Failed to read from signalfd: %m");
927                                                 r = -errno;
928                                                 goto finish;
929                                         }
930                                 } else {
931
932                                         if (sfsi.ssi_signo == SIGWINCH) {
933                                                 struct winsize ws;
934
935                                                 /* The window size changed, let's forward that. */
936                                                 if (ioctl(STDIN_FILENO, TIOCGWINSZ, &ws) >= 0)
937                                                         ioctl(master, TIOCSWINSZ, &ws);
938                                         } else {
939                                                 r = 0;
940                                                 goto finish;
941                                         }
942                                 }
943                         }
944                 }
945
946                 while ((stdin_readable && in_buffer_full <= 0) ||
947                        (master_writable && in_buffer_full > 0) ||
948                        (master_readable && out_buffer_full <= 0) ||
949                        (stdout_writable && out_buffer_full > 0)) {
950
951                         if (stdin_readable && in_buffer_full < LINE_MAX) {
952
953                                 k = read(STDIN_FILENO, in_buffer + in_buffer_full, LINE_MAX - in_buffer_full);
954                                 if (k < 0) {
955
956                                         if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
957                                                 stdin_readable = false;
958                                         else {
959                                                 log_error("read(): %m");
960                                                 r = -errno;
961                                                 goto finish;
962                                         }
963                                 } else
964                                         in_buffer_full += (size_t) k;
965                         }
966
967                         if (master_writable && in_buffer_full > 0) {
968
969                                 k = write(master, in_buffer, in_buffer_full);
970                                 if (k < 0) {
971
972                                         if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
973                                                 master_writable = false;
974                                         else {
975                                                 log_error("write(): %m");
976                                                 r = -errno;
977                                                 goto finish;
978                                         }
979
980                                 } else {
981                                         assert(in_buffer_full >= (size_t) k);
982                                         memmove(in_buffer, in_buffer + k, in_buffer_full - k);
983                                         in_buffer_full -= k;
984                                 }
985                         }
986
987                         if (master_readable && out_buffer_full < LINE_MAX) {
988
989                                 k = read(master, out_buffer + out_buffer_full, LINE_MAX - out_buffer_full);
990                                 if (k < 0) {
991
992                                         if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
993                                                 master_readable = false;
994                                         else {
995                                                 log_error("read(): %m");
996                                                 r = -errno;
997                                                 goto finish;
998                                         }
999                                 }  else
1000                                         out_buffer_full += (size_t) k;
1001                         }
1002
1003                         if (stdout_writable && out_buffer_full > 0) {
1004
1005                                 k = write(STDOUT_FILENO, out_buffer, out_buffer_full);
1006                                 if (k < 0) {
1007
1008                                         if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
1009                                                 stdout_writable = false;
1010                                         else {
1011                                                 log_error("write(): %m");
1012                                                 r = -errno;
1013                                                 goto finish;
1014                                         }
1015
1016                                 } else {
1017                                         assert(out_buffer_full >= (size_t) k);
1018                                         memmove(out_buffer, out_buffer + k, out_buffer_full - k);
1019                                         out_buffer_full -= k;
1020                                 }
1021                         }
1022                 }
1023         }
1024
1025 finish:
1026         if (ep >= 0)
1027                 close_nointr_nofail(ep);
1028
1029         if (signal_fd >= 0)
1030                 close_nointr_nofail(signal_fd);
1031
1032         return r;
1033 }
1034
1035 int main(int argc, char *argv[]) {
1036         pid_t pid = 0;
1037         int r = EXIT_FAILURE, k;
1038         char *oldcg = NULL, *newcg = NULL;
1039         char **controller = NULL;
1040         int master = -1;
1041         const char *console = NULL;
1042         struct termios saved_attr, raw_attr;
1043         sigset_t mask;
1044         bool saved_attr_valid = false;
1045         struct winsize ws;
1046         int kmsg_socket_pair[2] = { -1, -1 };
1047
1048         log_parse_environment();
1049         log_open();
1050
1051         r = parse_argv(argc, argv);
1052         if (r <= 0)
1053                 goto finish;
1054
1055         if (arg_directory) {
1056                 char *p;
1057
1058                 p = path_make_absolute_cwd(arg_directory);
1059                 free(arg_directory);
1060                 arg_directory = p;
1061         } else
1062                 arg_directory = get_current_dir_name();
1063
1064         if (!arg_directory) {
1065                 log_error("Failed to determine path");
1066                 goto finish;
1067         }
1068
1069         path_kill_slashes(arg_directory);
1070
1071         if (geteuid() != 0) {
1072                 log_error("Need to be root.");
1073                 goto finish;
1074         }
1075
1076         if (sd_booted() <= 0) {
1077                 log_error("Not running on a systemd system.");
1078                 goto finish;
1079         }
1080
1081         if (path_equal(arg_directory, "/")) {
1082                 log_error("Spawning container on root directory not supported.");
1083                 goto finish;
1084         }
1085
1086         if (is_os_tree(arg_directory) <= 0) {
1087                 log_error("Directory %s doesn't look like an OS root directory. Refusing.", arg_directory);
1088                 goto finish;
1089         }
1090
1091         k = cg_get_by_pid(SYSTEMD_CGROUP_CONTROLLER, 0, &oldcg);
1092         if (k < 0) {
1093                 log_error("Failed to determine current cgroup: %s", strerror(-k));
1094                 goto finish;
1095         }
1096
1097         if (asprintf(&newcg, "%s/nspawn-%lu", oldcg, (unsigned long) getpid()) < 0) {
1098                 log_error("Failed to allocate cgroup path.");
1099                 goto finish;
1100         }
1101
1102         k = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, newcg, 0);
1103         if (k < 0)  {
1104                 log_error("Failed to create cgroup: %s", strerror(-k));
1105                 goto finish;
1106         }
1107
1108         STRV_FOREACH(controller, arg_controllers) {
1109                 k = cg_create_and_attach(*controller, newcg, 0);
1110                 if (k < 0)
1111                         log_warning("Failed to create cgroup in controller %s: %s", *controller, strerror(-k));
1112         }
1113
1114         master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
1115         if (master < 0) {
1116                 log_error("Failed to acquire pseudo tty: %m");
1117                 goto finish;
1118         }
1119
1120         console = ptsname(master);
1121         if (!console) {
1122                 log_error("Failed to determine tty name: %m");
1123                 goto finish;
1124         }
1125
1126         log_info("Spawning namespace container on %s (console is %s).", arg_directory, console);
1127
1128         if (ioctl(STDIN_FILENO, TIOCGWINSZ, &ws) >= 0)
1129                 ioctl(master, TIOCSWINSZ, &ws);
1130
1131         if (unlockpt(master) < 0) {
1132                 log_error("Failed to unlock tty: %m");
1133                 goto finish;
1134         }
1135
1136         if (tcgetattr(STDIN_FILENO, &saved_attr) < 0) {
1137                 log_error("Failed to get terminal attributes: %m");
1138                 goto finish;
1139         }
1140
1141         saved_attr_valid = true;
1142
1143         raw_attr = saved_attr;
1144         cfmakeraw(&raw_attr);
1145         raw_attr.c_lflag &= ~ECHO;
1146
1147         if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_NONBLOCK|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
1148                 log_error("Failed to create kmsg socket pair");
1149                 goto finish;
1150         }
1151
1152         assert_se(sigemptyset(&mask) == 0);
1153         sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
1154         assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
1155
1156         for (;;) {
1157                 siginfo_t status;
1158
1159                 if (tcsetattr(STDIN_FILENO, TCSANOW, &raw_attr) < 0) {
1160                         log_error("Failed to set terminal attributes: %m");
1161                         goto finish;
1162                 }
1163
1164                 pid = syscall(__NR_clone, SIGCHLD|CLONE_NEWIPC|CLONE_NEWNS|CLONE_NEWPID|CLONE_NEWUTS|(arg_private_network ? CLONE_NEWNET : 0), NULL);
1165                 if (pid < 0) {
1166                         if (errno == EINVAL)
1167                                 log_error("clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
1168                         else
1169                                 log_error("clone() failed: %m");
1170
1171                         goto finish;
1172                 }
1173
1174                 if (pid == 0) {
1175                         /* child */
1176
1177                         const char *home = NULL;
1178                         uid_t uid = (uid_t) -1;
1179                         gid_t gid = (gid_t) -1;
1180                         const char *envp[] = {
1181                                 "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
1182                                 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
1183                                 NULL, /* TERM */
1184                                 NULL, /* HOME */
1185                                 NULL, /* USER */
1186                                 NULL, /* LOGNAME */
1187                                 NULL, /* container_uuid */
1188                                 NULL
1189                         };
1190
1191                         envp[2] = strv_find_prefix(environ, "TERM=");
1192
1193                         close_nointr_nofail(master);
1194
1195                         close_nointr(STDIN_FILENO);
1196                         close_nointr(STDOUT_FILENO);
1197                         close_nointr(STDERR_FILENO);
1198
1199                         close_all_fds(&kmsg_socket_pair[1], 1);
1200
1201                         reset_all_signal_handlers();
1202
1203                         assert_se(sigemptyset(&mask) == 0);
1204                         assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
1205
1206                         if (open_terminal(console, O_RDWR) != STDIN_FILENO ||
1207                             dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
1208                             dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO)
1209                                 goto child_fail;
1210
1211                         if (setsid() < 0) {
1212                                 log_error("setsid() failed: %m");
1213                                 goto child_fail;
1214                         }
1215
1216                         if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
1217                                 log_error("PR_SET_PDEATHSIG failed: %m");
1218                                 goto child_fail;
1219                         }
1220
1221                         /* Mark everything as slave, so that we still
1222                          * receive mounts from the real root, but don't
1223                          * propagate mounts to the real root. */
1224                         if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
1225                                 log_error("MS_SLAVE|MS_REC failed: %m");
1226                                 goto child_fail;
1227                         }
1228
1229                         /* Turn directory into bind mount */
1230                         if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REC, NULL) < 0) {
1231                                 log_error("Failed to make bind mount.");
1232                                 goto child_fail;
1233                         }
1234
1235                         if (arg_read_only)
1236                                 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY|MS_REC, NULL) < 0) {
1237                                         log_error("Failed to make read-only.");
1238                                         goto child_fail;
1239                                 }
1240
1241                         if (mount_all(arg_directory) < 0)
1242                                 goto child_fail;
1243
1244                         if (copy_devnodes(arg_directory) < 0)
1245                                 goto child_fail;
1246
1247                         dev_setup(arg_directory);
1248
1249                         if (setup_dev_console(arg_directory, console) < 0)
1250                                 goto child_fail;
1251
1252                         if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
1253                                 goto child_fail;
1254
1255                         close_nointr_nofail(kmsg_socket_pair[1]);
1256
1257                         if (setup_boot_id(arg_directory) < 0)
1258                                 goto child_fail;
1259
1260                         if (setup_timezone(arg_directory) < 0)
1261                                 goto child_fail;
1262
1263                         if (setup_resolv_conf(arg_directory) < 0)
1264                                 goto child_fail;
1265
1266                         if (setup_journal(arg_directory) < 0)
1267                                 goto child_fail;
1268
1269                         if (chdir(arg_directory) < 0) {
1270                                 log_error("chdir(%s) failed: %m", arg_directory);
1271                                 goto child_fail;
1272                         }
1273
1274                         if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) {
1275                                 log_error("mount(MS_MOVE) failed: %m");
1276                                 goto child_fail;
1277                         }
1278
1279                         if (chroot(".") < 0) {
1280                                 log_error("chroot() failed: %m");
1281                                 goto child_fail;
1282                         }
1283
1284                         if (chdir("/") < 0) {
1285                                 log_error("chdir() failed: %m");
1286                                 goto child_fail;
1287                         }
1288
1289                         umask(0022);
1290
1291                         loopback_setup();
1292
1293                         if (drop_capabilities() < 0) {
1294                                 log_error("drop_capabilities() failed: %m");
1295                                 goto child_fail;
1296                         }
1297
1298                         if (arg_user) {
1299
1300                                 if (get_user_creds((const char**)&arg_user, &uid, &gid, &home, NULL) < 0) {
1301                                         log_error("get_user_creds() failed: %m");
1302                                         goto child_fail;
1303                                 }
1304
1305                                 if (mkdir_parents_label(home, 0775) < 0) {
1306                                         log_error("mkdir_parents_label() failed: %m");
1307                                         goto child_fail;
1308                                 }
1309
1310                                 if (mkdir_safe_label(home, 0775, uid, gid) < 0) {
1311                                         log_error("mkdir_safe_label() failed: %m");
1312                                         goto child_fail;
1313                                 }
1314
1315                                 if (initgroups((const char*)arg_user, gid) < 0) {
1316                                         log_error("initgroups() failed: %m");
1317                                         goto child_fail;
1318                                 }
1319
1320                                 if (setresgid(gid, gid, gid) < 0) {
1321                                         log_error("setregid() failed: %m");
1322                                         goto child_fail;
1323                                 }
1324
1325                                 if (setresuid(uid, uid, uid) < 0) {
1326                                         log_error("setreuid() failed: %m");
1327                                         goto child_fail;
1328                                 }
1329                         }
1330
1331                         if ((asprintf((char**)(envp + 3), "HOME=%s", home ? home: "/root") < 0) ||
1332                             (asprintf((char**)(envp + 4), "USER=%s", arg_user ? arg_user : "root") < 0) ||
1333                             (asprintf((char**)(envp + 5), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
1334                                 log_oom();
1335                                 goto child_fail;
1336                         }
1337
1338                         if (arg_uuid) {
1339                                 if (asprintf((char**)(envp + 6), "container_uuid=%s", arg_uuid) < 0) {
1340                                         log_oom();
1341                                         goto child_fail;
1342                                 }
1343                         }
1344
1345                         setup_hostname();
1346
1347                         if (arg_boot) {
1348                                 char **a;
1349                                 size_t l;
1350
1351                                 /* Automatically search for the init system */
1352
1353                                 l = 1 + argc - optind;
1354                                 a = newa(char*, l + 1);
1355                                 memcpy(a + 1, argv + optind, l * sizeof(char*));
1356
1357                                 a[0] = (char*) "/usr/lib/systemd/systemd";
1358                                 execve(a[0], a, (char**) envp);
1359
1360                                 a[0] = (char*) "/lib/systemd/systemd";
1361                                 execve(a[0], a, (char**) envp);
1362
1363                                 a[0] = (char*) "/sbin/init";
1364                                 execve(a[0], a, (char**) envp);
1365                         } else if (argc > optind)
1366                                 execvpe(argv[optind], argv + optind, (char**) envp);
1367                         else {
1368                                 chdir(home ? home : "/root");
1369                                 execle("/bin/bash", "-bash", NULL, (char**) envp);
1370                         }
1371
1372                         log_error("execv() failed: %m");
1373
1374                 child_fail:
1375                         _exit(EXIT_FAILURE);
1376                 }
1377
1378                 if (process_pty(master, &mask) < 0)
1379                         goto finish;
1380
1381
1382                 if (saved_attr_valid)
1383                         tcsetattr(STDIN_FILENO, TCSANOW, &saved_attr);
1384
1385                 r = wait_for_terminate(pid, &status);
1386                 if (r < 0) {
1387                         r = EXIT_FAILURE;
1388                         break;
1389                 }
1390
1391                 if (status.si_code == CLD_EXITED) {
1392                         if (status.si_status != 0) {
1393                                 log_error("Container failed with error code %i.", status.si_status);
1394                                 r = status.si_status;
1395                                 break;
1396                         }
1397
1398                         log_debug("Container exited successfully.");
1399                         break;
1400                 } else if (status.si_code == CLD_KILLED &&
1401                            status.si_status == SIGINT) {
1402                         log_info("Container has been shut down.");
1403                         r = 0;
1404                         break;
1405                 } else if (status.si_code == CLD_KILLED &&
1406                            status.si_status == SIGHUP) {
1407                         log_info("Container is being rebooted.");
1408                         continue;
1409                 } else if (status.si_code == CLD_KILLED ||
1410                            status.si_code == CLD_DUMPED) {
1411
1412                         log_error("Container terminated by signal %s.", signal_to_string(status.si_status));
1413                         r = EXIT_FAILURE;
1414                         break;
1415                 } else {
1416                         log_error("Container failed due to unknown reason.");
1417                         r = EXIT_FAILURE;
1418                         break;
1419                 }
1420         }
1421
1422 finish:
1423         if (saved_attr_valid)
1424                 tcsetattr(STDIN_FILENO, TCSANOW, &saved_attr);
1425
1426         if (master >= 0)
1427                 close_nointr_nofail(master);
1428
1429         close_pipe(kmsg_socket_pair);
1430
1431         if (oldcg)
1432                 cg_attach(SYSTEMD_CGROUP_CONTROLLER, oldcg, 0);
1433
1434         if (newcg)
1435                 cg_kill_recursive_and_wait(SYSTEMD_CGROUP_CONTROLLER, newcg, true);
1436
1437         free(arg_directory);
1438         strv_free(arg_controllers);
1439         free(oldcg);
1440         free(newcg);
1441
1442         return r;
1443 }