chiark / gitweb /
b8962e9894480a0010326c2ba60800f9400bc1d7
[elogind.git] / src / nspawn / nspawn.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4   This file is part of systemd.
5
6   Copyright 2010 Lennart Poettering
7
8   systemd is free software; you can redistribute it and/or modify it
9   under the terms of the GNU Lesser General Public License as published by
10   the Free Software Foundation; either version 2.1 of the License, or
11   (at your option) any later version.
12
13   systemd is distributed in the hope that it will be useful, but
14   WITHOUT ANY WARRANTY; without even the implied warranty of
15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16   Lesser General Public License for more details.
17
18   You should have received a copy of the GNU Lesser General Public License
19   along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <signal.h>
23 #include <sched.h>
24 #include <unistd.h>
25 #include <sys/types.h>
26 #include <sys/syscall.h>
27 #include <sys/mount.h>
28 #include <sys/wait.h>
29 #include <stdlib.h>
30 #include <string.h>
31 #include <stdio.h>
32 #include <errno.h>
33 #include <sys/prctl.h>
34 #include <sys/capability.h>
35 #include <getopt.h>
36 #include <sys/poll.h>
37 #include <sys/epoll.h>
38 #include <termios.h>
39 #include <sys/signalfd.h>
40 #include <grp.h>
41 #include <linux/fs.h>
42 #include <sys/un.h>
43 #include <sys/socket.h>
44
45 #include <systemd/sd-daemon.h>
46
47 #include "log.h"
48 #include "util.h"
49 #include "mkdir.h"
50 #include "macro.h"
51 #include "audit.h"
52 #include "missing.h"
53 #include "cgroup-util.h"
54 #include "strv.h"
55 #include "path-util.h"
56 #include "loopback-setup.h"
57 #include "sd-id128.h"
58 #include "dev-setup.h"
59 #include "fdset.h"
60 #include "build.h"
61 #include "fileio.h"
62
/* How the container's journal directory is linked up with the host's.
 * Selected with --link-journal=MODE (or -j) in parse_argv(). */
typedef enum LinkJournal {
        LINK_NO    = 0,  /* --link-journal=no */
        LINK_AUTO  = 1,  /* --link-journal=auto, the initial value of arg_link_journal */
        LINK_HOST  = 2,  /* --link-journal=host */
        LINK_GUEST = 3   /* --link-journal=guest, also selected by -j */
} LinkJournal;
69
70 static char *arg_directory = NULL;
71 static char *arg_user = NULL;
72 static char **arg_controllers = NULL;
73 static char *arg_uuid = NULL;
74 static bool arg_private_network = false;
75 static bool arg_read_only = false;
76 static bool arg_boot = false;
77 static LinkJournal arg_link_journal = LINK_AUTO;
78 static uint64_t arg_retain =
79         (1ULL << CAP_CHOWN) |
80         (1ULL << CAP_DAC_OVERRIDE) |
81         (1ULL << CAP_DAC_READ_SEARCH) |
82         (1ULL << CAP_FOWNER) |
83         (1ULL << CAP_FSETID) |
84         (1ULL << CAP_IPC_OWNER) |
85         (1ULL << CAP_KILL) |
86         (1ULL << CAP_LEASE) |
87         (1ULL << CAP_LINUX_IMMUTABLE) |
88         (1ULL << CAP_NET_BIND_SERVICE) |
89         (1ULL << CAP_NET_BROADCAST) |
90         (1ULL << CAP_NET_RAW) |
91         (1ULL << CAP_SETGID) |
92         (1ULL << CAP_SETFCAP) |
93         (1ULL << CAP_SETPCAP) |
94         (1ULL << CAP_SETUID) |
95         (1ULL << CAP_SYS_ADMIN) |
96         (1ULL << CAP_SYS_CHROOT) |
97         (1ULL << CAP_SYS_NICE) |
98         (1ULL << CAP_SYS_PTRACE) |
99         (1ULL << CAP_SYS_TTY_CONFIG) |
100         (1ULL << CAP_SYS_RESOURCE) |
101         (1ULL << CAP_SYS_BOOT) |
102         (1ULL << CAP_AUDIT_WRITE) |
103         (1ULL << CAP_AUDIT_CONTROL);
104
/* Prints the command line usage summary to stdout.
 *
 * The text must be kept in sync with the option handling in parse_argv():
 * in particular -j selects LINK_GUEST there, i.e. --link-journal=guest.
 *
 * Always returns 0. */
static int help(void) {

        printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
               "Spawn a minimal namespace container for debugging, testing and building.\n\n"
               "  -h --help               Show this help\n"
               "     --version            Print version string\n"
               "  -D --directory=NAME     Root directory for the container\n"
               "  -b --boot               Boot up full system (i.e. invoke init)\n"
               "  -u --user=USER          Run the command under specified user or uid\n"
               "  -C --controllers=LIST   Put the container in specified comma-separated cgroup hierarchies\n"
               "     --uuid=UUID          Set a specific machine UUID for the container\n"
               "     --private-network    Disable network in container\n"
               "     --read-only          Mount the root directory read-only\n"
               "     --capability=CAP     In addition to the default, retain specified capability\n"
               "     --link-journal=MODE  Link up guest journal, one of no, auto, guest, host\n"
               "  -j                      Equivalent to --link-journal=guest\n",
               program_invocation_short_name);

        return 0;
}
125
126 static int parse_argv(int argc, char *argv[]) {
127
128         enum {
129                 ARG_VERSION = 0x100,
130                 ARG_PRIVATE_NETWORK,
131                 ARG_UUID,
132                 ARG_READ_ONLY,
133                 ARG_CAPABILITY,
134                 ARG_LINK_JOURNAL
135         };
136
137         static const struct option options[] = {
138                 { "help",            no_argument,       NULL, 'h'                 },
139                 { "version",         no_argument,       NULL, ARG_VERSION         },
140                 { "directory",       required_argument, NULL, 'D'                 },
141                 { "user",            required_argument, NULL, 'u'                 },
142                 { "controllers",     required_argument, NULL, 'C'                 },
143                 { "private-network", no_argument,       NULL, ARG_PRIVATE_NETWORK },
144                 { "boot",            no_argument,       NULL, 'b'                 },
145                 { "uuid",            required_argument, NULL, ARG_UUID            },
146                 { "read-only",       no_argument,       NULL, ARG_READ_ONLY       },
147                 { "capability",      required_argument, NULL, ARG_CAPABILITY      },
148                 { "link-journal",    required_argument, NULL, ARG_LINK_JOURNAL    },
149                 { NULL,              0,                 NULL, 0                   }
150         };
151
152         int c;
153
154         assert(argc >= 0);
155         assert(argv);
156
157         while ((c = getopt_long(argc, argv, "+hD:u:C:bj", options, NULL)) >= 0) {
158
159                 switch (c) {
160
161                 case 'h':
162                         help();
163                         return 0;
164
165                 case ARG_VERSION:
166                         puts(PACKAGE_STRING);
167                         puts(SYSTEMD_FEATURES);
168                         return 0;
169
170                 case 'D':
171                         free(arg_directory);
172                         arg_directory = canonicalize_file_name(optarg);
173                         if (!arg_directory) {
174                                 log_error("Failed to canonicalize root directory.");
175                                 return -ENOMEM;
176                         }
177
178                         break;
179
180                 case 'u':
181                         free(arg_user);
182                         if (!(arg_user = strdup(optarg))) {
183                                 log_error("Failed to duplicate user name.");
184                                 return -ENOMEM;
185                         }
186
187                         break;
188
189                 case 'C':
190                         strv_free(arg_controllers);
191                         arg_controllers = strv_split(optarg, ",");
192                         if (!arg_controllers) {
193                                 log_error("Failed to split controllers list.");
194                                 return -ENOMEM;
195                         }
196                         strv_uniq(arg_controllers);
197
198                         break;
199
200                 case ARG_PRIVATE_NETWORK:
201                         arg_private_network = true;
202                         break;
203
204                 case 'b':
205                         arg_boot = true;
206                         break;
207
208                 case ARG_UUID:
209                         arg_uuid = optarg;
210                         break;
211
212                 case ARG_READ_ONLY:
213                         arg_read_only = true;
214                         break;
215
216                 case ARG_CAPABILITY: {
217                         char *state, *word;
218                         size_t length;
219
220                         FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
221                                 cap_value_t cap;
222                                 char *t;
223
224                                 t = strndup(word, length);
225                                 if (!t)
226                                         return log_oom();
227
228                                 if (cap_from_name(t, &cap) < 0) {
229                                         log_error("Failed to parse capability %s.", t);
230                                         free(t);
231                                         return -EINVAL;
232                                 }
233
234                                 free(t);
235                                 arg_retain |= 1ULL << (uint64_t) cap;
236                         }
237
238                         break;
239                 }
240
241                 case 'j':
242                         arg_link_journal = LINK_GUEST;
243                         break;
244
245                 case ARG_LINK_JOURNAL:
246                         if (streq(optarg, "auto"))
247                                 arg_link_journal = LINK_AUTO;
248                         else if (streq(optarg, "no"))
249                                 arg_link_journal = LINK_NO;
250                         else if (streq(optarg, "guest"))
251                                 arg_link_journal = LINK_GUEST;
252                         else if (streq(optarg, "host"))
253                                 arg_link_journal = LINK_HOST;
254                         else {
255                                 log_error("Failed to parse link journal mode %s", optarg);
256                                 return -EINVAL;
257                         }
258
259                         break;
260
261                 case '?':
262                         return -EINVAL;
263
264                 default:
265                         log_error("Unknown option code %c", c);
266                         return -EINVAL;
267                 }
268         }
269
270         return 1;
271 }
272
273 static int mount_all(const char *dest) {
274
275         typedef struct MountPoint {
276                 const char *what;
277                 const char *where;
278                 const char *type;
279                 const char *options;
280                 unsigned long flags;
281                 bool fatal;
282         } MountPoint;
283
284         static const MountPoint mount_table[] = {
285                 { "proc",      "/proc",     "proc",  NULL,       MS_NOSUID|MS_NOEXEC|MS_NODEV, true  },
286                 { "/proc/sys", "/proc/sys", NULL,    NULL,       MS_BIND, true                       },   /* Bind mount first */
287                 { NULL,        "/proc/sys", NULL,    NULL,       MS_BIND|MS_RDONLY|MS_REMOUNT, true  },   /* Then, make it r/o */
288                 { "sysfs",     "/sys",      "sysfs", NULL,       MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true  },
289                 { "tmpfs",     "/dev",      "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME,     true  },
290                 { "/dev/pts",  "/dev/pts",  NULL,    NULL,       MS_BIND,                      true  },
291                 { "tmpfs",     "/dev/shm",  "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true  },
292                 { "tmpfs",     "/run",      "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true  },
293 #ifdef HAVE_SELINUX
294                 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND,                      false },  /* Bind mount first */
295                 { NULL,              "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, false },  /* Then, make it r/o */
296 #endif
297         };
298
299         unsigned k;
300         int r = 0;
301
302         for (k = 0; k < ELEMENTSOF(mount_table); k++) {
303                 char _cleanup_free_ *where = NULL;
304                 int t;
305
306                 if (asprintf(&where, "%s/%s", dest, mount_table[k].where) < 0) {
307                         log_oom();
308
309                         if (r == 0)
310                                 r = -ENOMEM;
311
312                         break;
313                 }
314
315                 t = path_is_mount_point(where, true);
316                 if (t < 0) {
317                         log_error("Failed to detect whether %s is a mount point: %s", where, strerror(-t));
318
319                         if (r == 0)
320                                 r = t;
321
322                         continue;
323                 }
324
325                 /* Skip this entry if it is not a remount. */
326                 if (mount_table[k].what && t > 0)
327                         continue;
328
329                 mkdir_p_label(where, 0755);
330
331                 if (mount(mount_table[k].what,
332                           where,
333                           mount_table[k].type,
334                           mount_table[k].flags,
335                           mount_table[k].options) < 0 &&
336                     mount_table[k].fatal) {
337
338                         log_error("mount(%s) failed: %m", where);
339
340                         if (r == 0)
341                                 r = -errno;
342                 }
343         }
344
345         return r;
346 }
347
348 static int setup_timezone(const char *dest) {
349         _cleanup_free_ char *where = NULL, *p = NULL, *q = NULL, *check = NULL, *what = NULL;
350         char *z, *y;
351         int r;
352
353         assert(dest);
354
355         /* Fix the timezone, if possible */
356         r = readlink_malloc("/etc/localtime", &p);
357         if (r < 0) {
358                 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
359                 return 0;
360         }
361
362         z = path_startswith(p, "../usr/share/zoneinfo/");
363         if (!z)
364                 z = path_startswith(p, "/usr/share/zoneinfo/");
365         if (!z) {
366                 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
367                 return 0;
368         }
369
370         where = strappend(dest, "/etc/localtime");
371         if (!where)
372                 return log_oom();
373
374         r = readlink_malloc(where, &q);
375         if (r >= 0) {
376                 y = path_startswith(q, "../usr/share/zoneinfo/");
377                 if (!y)
378                         y = path_startswith(q, "/usr/share/zoneinfo/");
379
380
381                 /* Already pointing to the right place? Then do nothing .. */
382                 if (y && streq(y, z))
383                         return 0;
384         }
385
386         check = strjoin(dest, "/usr/share/zoneinfo/", z, NULL);
387         if (!check)
388                 return log_oom();
389
390         if (access(check, F_OK) < 0) {
391                 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
392                 return 0;
393         }
394
395         what = strappend("../usr/share/zoneinfo/", z);
396         if (!what)
397                 return log_oom();
398
399         unlink(where);
400         if (symlink(what, where) < 0) {
401                 log_error("Failed to correct timezone of container: %m");
402                 return 0;
403         }
404
405         return 0;
406 }
407
408 static int setup_resolv_conf(const char *dest) {
409         char *where;
410
411         assert(dest);
412
413         if (arg_private_network)
414                 return 0;
415
416         /* Fix resolv.conf, if possible */
417         where = strappend(dest, "/etc/resolv.conf");
418         if (!where)
419                 return log_oom();
420
421         /* We don't really care for the results of this really. If it
422          * fails, it fails, but meh... */
423         if (mount("/etc/resolv.conf", where, "bind", MS_BIND, NULL) >= 0)
424                 mount("/etc/resolv.conf", where, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);
425
426         free(where);
427
428         return 0;
429 }
430
431 static int setup_boot_id(const char *dest) {
432         char _cleanup_free_ *from = NULL, *to = NULL;
433         sd_id128_t rnd;
434         char as_uuid[37];
435         int r;
436
437         assert(dest);
438
439         /* Generate a new randomized boot ID, so that each boot-up of
440          * the container gets a new one */
441
442         from = strappend(dest, "/dev/proc-sys-kernel-random-boot-id");
443         to = strappend(dest, "/proc/sys/kernel/random/boot_id");
444         if (!from || !to)
445                 return log_oom();
446
447         r = sd_id128_randomize(&rnd);
448         if (r < 0) {
449                 log_error("Failed to generate random boot id: %s", strerror(-r));
450                 return r;
451         }
452
453         snprintf(as_uuid, sizeof(as_uuid),
454                  "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
455                  SD_ID128_FORMAT_VAL(rnd));
456         char_array_0(as_uuid);
457
458         r = write_one_line_file(from, as_uuid);
459         if (r < 0) {
460                 log_error("Failed to write boot id: %s", strerror(-r));
461                 return r;
462         }
463
464         if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
465                 log_error("Failed to bind mount boot id: %m");
466                 r = -errno;
467         } else
468                 mount(from, to, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);
469
470         unlink(from);
471         return r;
472 }
473
474 static int copy_devnodes(const char *dest) {
475
476         static const char devnodes[] =
477                 "null\0"
478                 "zero\0"
479                 "full\0"
480                 "random\0"
481                 "urandom\0"
482                 "tty\0"
483                 "ptmx\0";
484
485         const char *d;
486         int r = 0;
487         mode_t _cleanup_umask_ u;
488
489         assert(dest);
490
491         u = umask(0000);
492
493         NULSTR_FOREACH(d, devnodes) {
494                 struct stat st;
495                 char _cleanup_free_ *from = NULL, *to = NULL;
496
497                 asprintf(&from, "/dev/%s", d);
498                 asprintf(&to, "%s/dev/%s", dest, d);
499
500                 if (!from || !to) {
501                         log_oom();
502
503                         if (r == 0)
504                                 r = -ENOMEM;
505
506                         break;
507                 }
508
509                 if (stat(from, &st) < 0) {
510
511                         if (errno != ENOENT) {
512                                 log_error("Failed to stat %s: %m", from);
513                                 if (r == 0)
514                                         r = -errno;
515                         }
516
517                 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
518
519                         log_error("%s is not a char or block device, cannot copy", from);
520                         if (r == 0)
521                                 r = -EIO;
522
523                 } else if (mknod(to, st.st_mode, st.st_rdev) < 0) {
524
525                         log_error("mknod(%s) failed: %m", dest);
526                         if (r == 0)
527                                 r = -errno;
528                 }
529         }
530
531         return r;
532 }
533
534 static int setup_dev_console(const char *dest, const char *console) {
535         struct stat st;
536         char _cleanup_free_ *to = NULL;
537         int r;
538         mode_t _cleanup_umask_ u;
539
540         assert(dest);
541         assert(console);
542
543         u = umask(0000);
544
545         if (stat(console, &st) < 0) {
546                 log_error("Failed to stat %s: %m", console);
547                 return -errno;
548
549         } else if (!S_ISCHR(st.st_mode)) {
550                 log_error("/dev/console is not a char device");
551                 return -EIO;
552         }
553
554         r = chmod_and_chown(console, 0600, 0, 0);
555         if (r < 0) {
556                 log_error("Failed to correct access mode for TTY: %s", strerror(-r));
557                 return r;
558         }
559
560         if (asprintf(&to, "%s/dev/console", dest) < 0)
561                 return log_oom();
562
563         /* We need to bind mount the right tty to /dev/console since
564          * ptys can only exist on pts file systems. To have something
565          * to bind mount things on we create a device node first, that
566          * has the right major/minor (note that the major minor
567          * doesn't actually matter here, since we mount it over
568          * anyway). */
569
570         if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0) {
571                 log_error("mknod() for /dev/console failed: %m");
572                 return -errno;
573         }
574
575         if (mount(console, to, "bind", MS_BIND, NULL) < 0) {
576                 log_error("Bind mount for /dev/console failed: %m");
577                 return -errno;
578         }
579
580         return 0;
581 }
582
583 static int setup_kmsg(const char *dest, int kmsg_socket) {
584         char _cleanup_free_ *from = NULL, *to = NULL;
585         int r, fd, k;
586         mode_t _cleanup_umask_ u;
587         union {
588                 struct cmsghdr cmsghdr;
589                 uint8_t buf[CMSG_SPACE(sizeof(int))];
590         } control;
591         struct msghdr mh;
592         struct cmsghdr *cmsg;
593
594         assert(dest);
595         assert(kmsg_socket >= 0);
596
597         u = umask(0000);
598
599         /* We create the kmsg FIFO as /dev/kmsg, but immediately
600          * delete it after bind mounting it to /proc/kmsg. While FIFOs
601          * on the reading side behave very similar to /proc/kmsg,
602          * their writing side behaves differently from /dev/kmsg in
603          * that writing blocks when nothing is reading. In order to
604          * avoid any problems with containers deadlocking due to this
605          * we simply make /dev/kmsg unavailable to the container. */
606         if (asprintf(&from, "%s/dev/kmsg", dest) < 0 ||
607             asprintf(&to, "%s/proc/kmsg", dest) < 0)
608                 return log_oom();
609
610         if (mkfifo(from, 0600) < 0) {
611                 log_error("mkfifo() for /dev/kmsg failed: %m");
612                 return -errno;
613         }
614
615         r = chmod_and_chown(from, 0600, 0, 0);
616         if (r < 0) {
617                 log_error("Failed to correct access mode for /dev/kmsg: %s", strerror(-r));
618                 return r;
619         }
620
621         if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
622                 log_error("Bind mount for /proc/kmsg failed: %m");
623                 return -errno;
624         }
625
626         fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
627         if (fd < 0) {
628                 log_error("Failed to open fifo: %m");
629                 return -errno;
630         }
631
632         zero(mh);
633         zero(control);
634
635         mh.msg_control = &control;
636         mh.msg_controllen = sizeof(control);
637
638         cmsg = CMSG_FIRSTHDR(&mh);
639         cmsg->cmsg_level = SOL_SOCKET;
640         cmsg->cmsg_type = SCM_RIGHTS;
641         cmsg->cmsg_len = CMSG_LEN(sizeof(int));
642         memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
643
644         mh.msg_controllen = cmsg->cmsg_len;
645
646         /* Store away the fd in the socket, so that it stays open as
647          * long as we run the child */
648         k = sendmsg(kmsg_socket, &mh, MSG_DONTWAIT|MSG_NOSIGNAL);
649         close_nointr_nofail(fd);
650
651         if (k < 0) {
652                 log_error("Failed to send FIFO fd: %m");
653                 return -errno;
654         }
655
656         /* And now make the FIFO unavailable as /dev/kmsg... */
657         unlink(from);
658         return 0;
659 }
660
661 static int setup_hostname(void) {
662         char *hn;
663         int r = 0;
664
665         hn = path_get_file_name(arg_directory);
666         if (hn) {
667                 hn = strdup(hn);
668                 if (!hn)
669                         return -ENOMEM;
670
671                 hostname_cleanup(hn);
672
673                 if (!isempty(hn))
674                         if (sethostname(hn, strlen(hn)) < 0)
675                                 r = -errno;
676
677                 free(hn);
678         }
679
680         return r;
681 }
682
683 static int setup_journal(const char *directory) {
684         sd_id128_t machine_id;
685         char _cleanup_free_ *p = NULL, *b = NULL, *q = NULL, *d = NULL;
686         char *id;
687         int r;
688
689         if (arg_link_journal == LINK_NO)
690                 return 0;
691
692         p = strappend(directory, "/etc/machine-id");
693         if (!p)
694                 return log_oom();
695
696         r = read_one_line_file(p, &b);
697         if (r == -ENOENT && arg_link_journal == LINK_AUTO)
698                 return 0;
699         else if (r < 0) {
700                 log_error("Failed to read machine ID from %s: %s", p, strerror(-r));
701                 return r;
702         }
703
704         id = strstrip(b);
705         if (isempty(id) && arg_link_journal == LINK_AUTO)
706                 return 0;
707
708         /* Verify validity */
709         r = sd_id128_from_string(id, &machine_id);
710         if (r < 0) {
711                 log_error("Failed to parse machine ID from %s: %s", p, strerror(-r));
712                 return r;
713         }
714
715         free(p);
716         p = strappend("/var/log/journal/", id);
717         q = strjoin(directory, "/var/log/journal/", id, NULL);
718         if (!p || !q)
719                 return log_oom();
720
721         if (path_is_mount_point(p, false) > 0) {
722                 if (arg_link_journal != LINK_AUTO) {
723                         log_error("%s: already a mount point, refusing to use for journal", p);
724                         return -EEXIST;
725                 }
726
727                 return 0;
728         }
729
730         if (path_is_mount_point(q, false) > 0) {
731                 if (arg_link_journal != LINK_AUTO) {
732                         log_error("%s: already a mount point, refusing to use for journal", q);
733                         return -EEXIST;
734                 }
735
736                 return 0;
737         }
738
739         r = readlink_and_make_absolute(p, &d);
740         if (r >= 0) {
741                 if ((arg_link_journal == LINK_GUEST ||
742                      arg_link_journal == LINK_AUTO) &&
743                     path_equal(d, q)) {
744
745                         r = mkdir_p(q, 0755);
746                         if (r < 0)
747                                 log_warning("failed to create directory %s: %m", q);
748                         return 0;
749                 }
750
751                 if (unlink(p) < 0) {
752                         log_error("Failed to remove symlink %s: %m", p);
753                         return -errno;
754                 }
755         } else if (r == -EINVAL) {
756
757                 if (arg_link_journal == LINK_GUEST &&
758                     rmdir(p) < 0) {
759
760                         if (errno == ENOTDIR) {
761                                 log_error("%s already exists and is neither a symlink nor a directory", p);
762                                 return r;
763                         } else {
764                                 log_error("Failed to remove %s: %m", p);
765                                 return -errno;
766                         }
767                 }
768         } else if (r != -ENOENT) {
769                 log_error("readlink(%s) failed: %m", p);
770                 return r;
771         }
772
773         if (arg_link_journal == LINK_GUEST) {
774
775                 if (symlink(q, p) < 0) {
776                         log_error("Failed to symlink %s to %s: %m", q, p);
777                         return -errno;
778                 }
779
780                 r = mkdir_p(q, 0755);
781                 if (r < 0)
782                         log_warning("failed to create directory %s: %m", q);
783                 return 0;
784         }
785
786         if (arg_link_journal == LINK_HOST) {
787                 r = mkdir_p(p, 0755);
788                 if (r < 0) {
789                         log_error("Failed to create %s: %m", p);
790                         return r;
791                 }
792
793         } else if (access(p, F_OK) < 0)
794                 return 0;
795
796         if (dir_is_empty(q) == 0) {
797                 log_error("%s not empty.", q);
798                 return -ENOTEMPTY;
799         }
800
801         r = mkdir_p(q, 0755);
802         if (r < 0) {
803                 log_error("Failed to create %s: %m", q);
804                 return r;
805         }
806
807         if (mount(p, q, "bind", MS_BIND, NULL) < 0) {
808                 log_error("Failed to bind mount journal from host into guest: %m");
809                 return -errno;
810         }
811
812         return 0;
813 }
814
815 static int drop_capabilities(void) {
816         return capability_bounding_set_drop(~arg_retain, false);
817 }
818
/* Checks whether 'path' looks like the root of an OS tree, using the
 * existence of <path>/bin/sh as the indicator.
 *
 * Returns 1 if the file exists, 0 if it cannot be accessed, and -ENOMEM if
 * the path string could not be allocated. */
static int is_os_tree(const char *path) {
        char *shell = NULL;
        int found;

        /* We use /bin/sh as flag file if something is an OS */
        if (asprintf(&shell, "%s/bin/sh", path) < 0)
                return -ENOMEM;

        found = access(shell, F_OK) >= 0;
        free(shell);

        return found;
}
832
833 static int process_pty(int master, pid_t pid, sigset_t *mask) {
834
835         char in_buffer[LINE_MAX], out_buffer[LINE_MAX];
836         size_t in_buffer_full = 0, out_buffer_full = 0;
837         struct epoll_event stdin_ev, stdout_ev, master_ev, signal_ev;
838         bool stdin_readable = false, stdout_writable = false, master_readable = false, master_writable = false;
839         int ep = -1, signal_fd = -1, r;
840         bool tried_orderly_shutdown = false;
841
842         assert(master >= 0);
843         assert(pid > 0);
844         assert(mask);
845
846         fd_nonblock(STDIN_FILENO, 1);
847         fd_nonblock(STDOUT_FILENO, 1);
848         fd_nonblock(master, 1);
849
850         signal_fd = signalfd(-1, mask, SFD_NONBLOCK|SFD_CLOEXEC);
851         if (signal_fd < 0) {
852                 log_error("signalfd(): %m");
853                 r = -errno;
854                 goto finish;
855         }
856
857         ep = epoll_create1(EPOLL_CLOEXEC);
858         if (ep < 0) {
859                 log_error("Failed to create epoll: %m");
860                 r = -errno;
861                 goto finish;
862         }
863
864         /* We read from STDIN only if this is actually a TTY,
865          * otherwise we assume non-interactivity. */
866         if (isatty(STDIN_FILENO)) {
867                 zero(stdin_ev);
868                 stdin_ev.events = EPOLLIN|EPOLLET;
869                 stdin_ev.data.fd = STDIN_FILENO;
870
871                 if (epoll_ctl(ep, EPOLL_CTL_ADD, STDIN_FILENO, &stdin_ev) < 0) {
872                         log_error("Failed to register STDIN in epoll: %m");
873                         r = -errno;
874                         goto finish;
875                 }
876         }
877
878         zero(stdout_ev);
879         stdout_ev.events = EPOLLOUT|EPOLLET;
880         stdout_ev.data.fd = STDOUT_FILENO;
881
882         zero(master_ev);
883         master_ev.events = EPOLLIN|EPOLLOUT|EPOLLET;
884         master_ev.data.fd = master;
885
886         zero(signal_ev);
887         signal_ev.events = EPOLLIN;
888         signal_ev.data.fd = signal_fd;
889
890         if (epoll_ctl(ep, EPOLL_CTL_ADD, STDOUT_FILENO, &stdout_ev) < 0) {
891                 if (errno != EPERM) {
892                         log_error("Failed to register stdout in epoll: %m");
893                         r = -errno;
894                         goto finish;
895                 }
896                 /* stdout without epoll support. Likely redirected to regular file. */
897                 stdout_writable = true;
898         }
899
900         if (epoll_ctl(ep, EPOLL_CTL_ADD, master, &master_ev) < 0 ||
901             epoll_ctl(ep, EPOLL_CTL_ADD, signal_fd, &signal_ev) < 0) {
902                 log_error("Failed to register fds in epoll: %m");
903                 r = -errno;
904                 goto finish;
905         }
906
907         for (;;) {
908                 struct epoll_event ev[16];
909                 ssize_t k;
910                 int i, nfds;
911
912                 nfds = epoll_wait(ep, ev, ELEMENTSOF(ev), -1);
913                 if (nfds < 0) {
914
915                         if (errno == EINTR || errno == EAGAIN)
916                                 continue;
917
918                         log_error("epoll_wait(): %m");
919                         r = -errno;
920                         goto finish;
921                 }
922
923                 assert(nfds >= 1);
924
925                 for (i = 0; i < nfds; i++) {
926                         if (ev[i].data.fd == STDIN_FILENO) {
927
928                                 if (ev[i].events & (EPOLLIN|EPOLLHUP))
929                                         stdin_readable = true;
930
931                         } else if (ev[i].data.fd == STDOUT_FILENO) {
932
933                                 if (ev[i].events & (EPOLLOUT|EPOLLHUP))
934                                         stdout_writable = true;
935
936                         } else if (ev[i].data.fd == master) {
937
938                                 if (ev[i].events & (EPOLLIN|EPOLLHUP))
939                                         master_readable = true;
940
941                                 if (ev[i].events & (EPOLLOUT|EPOLLHUP))
942                                         master_writable = true;
943
944                         } else if (ev[i].data.fd == signal_fd) {
945                                 struct signalfd_siginfo sfsi;
946                                 ssize_t n;
947
948                                 n = read(signal_fd, &sfsi, sizeof(sfsi));
949                                 if (n != sizeof(sfsi)) {
950
951                                         if (n >= 0) {
952                                                 log_error("Failed to read from signalfd: invalid block size");
953                                                 r = -EIO;
954                                                 goto finish;
955                                         }
956
957                                         if (errno != EINTR && errno != EAGAIN) {
958                                                 log_error("Failed to read from signalfd: %m");
959                                                 r = -errno;
960                                                 goto finish;
961                                         }
962                                 } else {
963
964                                         if (sfsi.ssi_signo == SIGWINCH) {
965                                                 struct winsize ws;
966
967                                                 /* The window size changed, let's forward that. */
968                                                 if (ioctl(STDIN_FILENO, TIOCGWINSZ, &ws) >= 0)
969                                                         ioctl(master, TIOCSWINSZ, &ws);
970                                         } else if (sfsi.ssi_signo == SIGTERM && arg_boot && !tried_orderly_shutdown) {
971
972                                                 log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
973
974                                                 /* This only works for systemd... */
975                                                 tried_orderly_shutdown = true;
976                                                 kill(pid, SIGRTMIN+3);
977
978                                         } else {
979                                                 r = 0;
980                                                 goto finish;
981                                         }
982                                 }
983                         }
984                 }
985
986                 while ((stdin_readable && in_buffer_full <= 0) ||
987                        (master_writable && in_buffer_full > 0) ||
988                        (master_readable && out_buffer_full <= 0) ||
989                        (stdout_writable && out_buffer_full > 0)) {
990
991                         if (stdin_readable && in_buffer_full < LINE_MAX) {
992
993                                 k = read(STDIN_FILENO, in_buffer + in_buffer_full, LINE_MAX - in_buffer_full);
994                                 if (k < 0) {
995
996                                         if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
997                                                 stdin_readable = false;
998                                         else {
999                                                 log_error("read(): %m");
1000                                                 r = -errno;
1001                                                 goto finish;
1002                                         }
1003                                 } else
1004                                         in_buffer_full += (size_t) k;
1005                         }
1006
1007                         if (master_writable && in_buffer_full > 0) {
1008
1009                                 k = write(master, in_buffer, in_buffer_full);
1010                                 if (k < 0) {
1011
1012                                         if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
1013                                                 master_writable = false;
1014                                         else {
1015                                                 log_error("write(): %m");
1016                                                 r = -errno;
1017                                                 goto finish;
1018                                         }
1019
1020                                 } else {
1021                                         assert(in_buffer_full >= (size_t) k);
1022                                         memmove(in_buffer, in_buffer + k, in_buffer_full - k);
1023                                         in_buffer_full -= k;
1024                                 }
1025                         }
1026
1027                         if (master_readable && out_buffer_full < LINE_MAX) {
1028
1029                                 k = read(master, out_buffer + out_buffer_full, LINE_MAX - out_buffer_full);
1030                                 if (k < 0) {
1031
1032                                         if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
1033                                                 master_readable = false;
1034                                         else {
1035                                                 log_error("read(): %m");
1036                                                 r = -errno;
1037                                                 goto finish;
1038                                         }
1039                                 }  else
1040                                         out_buffer_full += (size_t) k;
1041                         }
1042
1043                         if (stdout_writable && out_buffer_full > 0) {
1044
1045                                 k = write(STDOUT_FILENO, out_buffer, out_buffer_full);
1046                                 if (k < 0) {
1047
1048                                         if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
1049                                                 stdout_writable = false;
1050                                         else {
1051                                                 log_error("write(): %m");
1052                                                 r = -errno;
1053                                                 goto finish;
1054                                         }
1055
1056                                 } else {
1057                                         assert(out_buffer_full >= (size_t) k);
1058                                         memmove(out_buffer, out_buffer + k, out_buffer_full - k);
1059                                         out_buffer_full -= k;
1060                                 }
1061                         }
1062                 }
1063         }
1064
1065 finish:
1066         if (ep >= 0)
1067                 close_nointr_nofail(ep);
1068
1069         if (signal_fd >= 0)
1070                 close_nointr_nofail(signal_fd);
1071
1072         return r;
1073 }
1074
1075 int main(int argc, char *argv[]) {
1076         pid_t pid = 0;
1077         int r = EXIT_FAILURE, k;
1078         char *oldcg = NULL, *newcg = NULL;
1079         char **controller = NULL;
1080         int master = -1, n_fd_passed;
1081         const char *console = NULL;
1082         struct termios saved_attr, raw_attr;
1083         sigset_t mask;
1084         bool saved_attr_valid = false;
1085         struct winsize ws;
1086         int kmsg_socket_pair[2] = { -1, -1 };
1087         FDSet *fds = NULL;
1088
1089         log_parse_environment();
1090         log_open();
1091
1092         r = parse_argv(argc, argv);
1093         if (r <= 0)
1094                 goto finish;
1095
1096         if (arg_directory) {
1097                 char *p;
1098
1099                 p = path_make_absolute_cwd(arg_directory);
1100                 free(arg_directory);
1101                 arg_directory = p;
1102         } else
1103                 arg_directory = get_current_dir_name();
1104
1105         if (!arg_directory) {
1106                 log_error("Failed to determine path");
1107                 goto finish;
1108         }
1109
1110         path_kill_slashes(arg_directory);
1111
1112         if (geteuid() != 0) {
1113                 log_error("Need to be root.");
1114                 goto finish;
1115         }
1116
1117         if (sd_booted() <= 0) {
1118                 log_error("Not running on a systemd system.");
1119                 goto finish;
1120         }
1121
1122         if (path_equal(arg_directory, "/")) {
1123                 log_error("Spawning container on root directory not supported.");
1124                 goto finish;
1125         }
1126
1127         if (is_os_tree(arg_directory) <= 0) {
1128                 log_error("Directory %s doesn't look like an OS root directory. Refusing.", arg_directory);
1129                 goto finish;
1130         }
1131
1132         log_close();
1133         n_fd_passed = sd_listen_fds(false);
1134         if (n_fd_passed > 0) {
1135                 k = fdset_new_listen_fds(&fds, false);
1136                 if (k < 0) {
1137                         log_error("Failed to collect file descriptors: %s", strerror(-k));
1138                         goto finish;
1139                 }
1140         }
1141         fdset_close_others(fds);
1142         log_open();
1143
1144         k = cg_get_by_pid(SYSTEMD_CGROUP_CONTROLLER, 0, &oldcg);
1145         if (k < 0) {
1146                 log_error("Failed to determine current cgroup: %s", strerror(-k));
1147                 goto finish;
1148         }
1149
1150         if (asprintf(&newcg, "%s/nspawn-%lu", oldcg, (unsigned long) getpid()) < 0) {
1151                 log_error("Failed to allocate cgroup path.");
1152                 goto finish;
1153         }
1154
1155         k = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, newcg, 0);
1156         if (k < 0)  {
1157                 log_error("Failed to create cgroup: %s", strerror(-k));
1158                 goto finish;
1159         }
1160
1161         STRV_FOREACH(controller, arg_controllers) {
1162                 k = cg_create_and_attach(*controller, newcg, 0);
1163                 if (k < 0)
1164                         log_warning("Failed to create cgroup in controller %s: %s", *controller, strerror(-k));
1165         }
1166
1167         master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
1168         if (master < 0) {
1169                 log_error("Failed to acquire pseudo tty: %m");
1170                 goto finish;
1171         }
1172
1173         console = ptsname(master);
1174         if (!console) {
1175                 log_error("Failed to determine tty name: %m");
1176                 goto finish;
1177         }
1178
1179         log_info("Spawning namespace container on %s (console is %s).", arg_directory, console);
1180
1181         if (ioctl(STDIN_FILENO, TIOCGWINSZ, &ws) >= 0)
1182                 ioctl(master, TIOCSWINSZ, &ws);
1183
1184         if (unlockpt(master) < 0) {
1185                 log_error("Failed to unlock tty: %m");
1186                 goto finish;
1187         }
1188
1189         if (tcgetattr(STDIN_FILENO, &saved_attr) >= 0) {
1190                 saved_attr_valid = true;
1191
1192                 raw_attr = saved_attr;
1193                 cfmakeraw(&raw_attr);
1194                 raw_attr.c_lflag &= ~ECHO;
1195         }
1196
1197         if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_NONBLOCK|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
1198                 log_error("Failed to create kmsg socket pair");
1199                 goto finish;
1200         }
1201
1202         assert_se(sigemptyset(&mask) == 0);
1203         sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
1204         assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
1205
1206         for (;;) {
1207                 siginfo_t status;
1208                 int pipefd[2];
1209
1210                 if(pipe2(pipefd, O_NONBLOCK|O_CLOEXEC) < 0) {
1211                         log_error("pipe2(): %m");
1212                         goto finish;
1213                 }
1214
1215                 pid = syscall(__NR_clone, SIGCHLD|CLONE_NEWIPC|CLONE_NEWNS|CLONE_NEWPID|CLONE_NEWUTS|(arg_private_network ? CLONE_NEWNET : 0), NULL);
1216                 if (pid < 0) {
1217                         if (errno == EINVAL)
1218                                 log_error("clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
1219                         else
1220                                 log_error("clone() failed: %m");
1221
1222                         goto finish;
1223                 }
1224
1225                 if (pid == 0) {
1226                         /* child */
1227                         const char *home = NULL;
1228                         uid_t uid = (uid_t) -1;
1229                         gid_t gid = (gid_t) -1;
1230                         unsigned n_env = 0;
1231                         const char *envp[] = {
1232                                 "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
1233                                 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
1234                                 NULL, /* TERM */
1235                                 NULL, /* HOME */
1236                                 NULL, /* USER */
1237                                 NULL, /* LOGNAME */
1238                                 NULL, /* container_uuid */
1239                                 NULL, /* LISTEN_FDS */
1240                                 NULL, /* LISTEN_PID */
1241                                 NULL
1242                         };
1243
1244                         envp[2] = strv_find_prefix(environ, "TERM=");
1245                         n_env = 3;
1246
1247                         close_nointr_nofail(pipefd[1]);
1248                         fd_wait_for_event(pipefd[0], POLLHUP, -1);
1249                         close_nointr_nofail(pipefd[0]);
1250
1251                         close_nointr_nofail(master);
1252                         master = -1;
1253
1254                         if (saved_attr_valid) {
1255                                 if (tcsetattr(STDIN_FILENO, TCSANOW, &raw_attr) < 0) {
1256                                         log_error("Failed to set terminal attributes: %m");
1257                                         goto child_fail;
1258                                 }
1259                         }
1260
1261                         close_nointr(STDIN_FILENO);
1262                         close_nointr(STDOUT_FILENO);
1263                         close_nointr(STDERR_FILENO);
1264
1265                         close_nointr_nofail(kmsg_socket_pair[0]);
1266                         kmsg_socket_pair[0] = -1;
1267
1268                         reset_all_signal_handlers();
1269
1270                         assert_se(sigemptyset(&mask) == 0);
1271                         assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
1272
1273                         k = open_terminal(console, O_RDWR);
1274                         if (k != STDIN_FILENO) {
1275                                 if (k >= 0) {
1276                                         close_nointr_nofail(k);
1277                                         k = -EINVAL;
1278                                 }
1279
1280                                 log_error("Failed to open console: %s", strerror(-k));
1281                                 goto child_fail;
1282                         }
1283
1284                         if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
1285                             dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO) {
1286                                 log_error("Failed to duplicate console: %m");
1287                                 goto child_fail;
1288                         }
1289
1290                         if (setsid() < 0) {
1291                                 log_error("setsid() failed: %m");
1292                                 goto child_fail;
1293                         }
1294
1295                         if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
1296                                 log_error("PR_SET_PDEATHSIG failed: %m");
1297                                 goto child_fail;
1298                         }
1299
1300                         /* Mark everything as slave, so that we still
1301                          * receive mounts from the real root, but don't
1302                          * propagate mounts to the real root. */
1303                         if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
1304                                 log_error("MS_SLAVE|MS_REC failed: %m");
1305                                 goto child_fail;
1306                         }
1307
1308                         /* Turn directory into bind mount */
1309                         if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REC, NULL) < 0) {
1310                                 log_error("Failed to make bind mount.");
1311                                 goto child_fail;
1312                         }
1313
1314                         if (arg_read_only)
1315                                 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY|MS_REC, NULL) < 0) {
1316                                         log_error("Failed to make read-only.");
1317                                         goto child_fail;
1318                                 }
1319
1320                         if (mount_all(arg_directory) < 0)
1321                                 goto child_fail;
1322
1323                         if (copy_devnodes(arg_directory) < 0)
1324                                 goto child_fail;
1325
1326                         dev_setup(arg_directory);
1327
1328                         if (setup_dev_console(arg_directory, console) < 0)
1329                                 goto child_fail;
1330
1331                         if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
1332                                 goto child_fail;
1333
1334                         close_nointr_nofail(kmsg_socket_pair[1]);
1335                         kmsg_socket_pair[1] = -1;
1336
1337                         if (setup_boot_id(arg_directory) < 0)
1338                                 goto child_fail;
1339
1340                         if (setup_timezone(arg_directory) < 0)
1341                                 goto child_fail;
1342
1343                         if (setup_resolv_conf(arg_directory) < 0)
1344                                 goto child_fail;
1345
1346                         if (setup_journal(arg_directory) < 0)
1347                                 goto child_fail;
1348
1349                         if (chdir(arg_directory) < 0) {
1350                                 log_error("chdir(%s) failed: %m", arg_directory);
1351                                 goto child_fail;
1352                         }
1353
1354                         if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) {
1355                                 log_error("mount(MS_MOVE) failed: %m");
1356                                 goto child_fail;
1357                         }
1358
1359                         if (chroot(".") < 0) {
1360                                 log_error("chroot() failed: %m");
1361                                 goto child_fail;
1362                         }
1363
1364                         if (chdir("/") < 0) {
1365                                 log_error("chdir() failed: %m");
1366                                 goto child_fail;
1367                         }
1368
1369                         umask(0022);
1370
1371                         loopback_setup();
1372
1373                         if (drop_capabilities() < 0) {
1374                                 log_error("drop_capabilities() failed: %m");
1375                                 goto child_fail;
1376                         }
1377
1378                         if (arg_user) {
1379
1380                                 /* Note that this resolves user names
1381                                  * inside the container, and hence
1382                                  * accesses the NSS modules from the
1383                                  * container and not the host. This is
1384                                  * a bit weird... */
1385
1386                                 if (get_user_creds((const char**)&arg_user, &uid, &gid, &home, NULL) < 0) {
1387                                         log_error("get_user_creds() failed: %m");
1388                                         goto child_fail;
1389                                 }
1390
1391                                 if (mkdir_parents_label(home, 0775) < 0) {
1392                                         log_error("mkdir_parents_label() failed: %m");
1393                                         goto child_fail;
1394                                 }
1395
1396                                 if (mkdir_safe_label(home, 0775, uid, gid) < 0) {
1397                                         log_error("mkdir_safe_label() failed: %m");
1398                                         goto child_fail;
1399                                 }
1400
1401                                 if (initgroups((const char*)arg_user, gid) < 0) {
1402                                         log_error("initgroups() failed: %m");
1403                                         goto child_fail;
1404                                 }
1405
1406                                 if (setresgid(gid, gid, gid) < 0) {
1407                                         log_error("setregid() failed: %m");
1408                                         goto child_fail;
1409                                 }
1410
1411                                 if (setresuid(uid, uid, uid) < 0) {
1412                                         log_error("setreuid() failed: %m");
1413                                         goto child_fail;
1414                                 }
1415                         } else {
1416                                 /* Reset everything fully to 0, just in case */
1417
1418                                 if (setgroups(0, NULL) < 0) {
1419                                         log_error("setgroups() failed: %m");
1420                                         goto child_fail;
1421                                 }
1422
1423                                 if (setresgid(0, 0, 0) < 0) {
1424                                         log_error("setregid() failed: %m");
1425                                         goto child_fail;
1426                                 }
1427
1428                                 if (setresuid(0, 0, 0) < 0) {
1429                                         log_error("setreuid() failed: %m");
1430                                         goto child_fail;
1431                                 }
1432                         }
1433
1434                         if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
1435                             (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
1436                             (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
1437                                 log_oom();
1438                                 goto child_fail;
1439                         }
1440
1441                         if (arg_uuid) {
1442                                 if (asprintf((char**)(envp + n_env++), "container_uuid=%s", arg_uuid) < 0) {
1443                                         log_oom();
1444                                         goto child_fail;
1445                                 }
1446                         }
1447
1448                         if (fdset_size(fds) > 0) {
1449                                 k = fdset_cloexec(fds, false);
1450                                 if (k < 0) {
1451                                         log_error("Failed to unset O_CLOEXEC for file descriptors.");
1452                                         goto child_fail;
1453                                 }
1454
1455                                 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", n_fd_passed) < 0) ||
1456                                     (asprintf((char **)(envp + n_env++), "LISTEN_PID=%lu", (unsigned long) getpid()) < 0)) {
1457                                         log_oom();
1458                                         goto child_fail;
1459                                 }
1460                         }
1461
1462                         setup_hostname();
1463
1464                         if (arg_boot) {
1465                                 char **a;
1466                                 size_t l;
1467
1468                                 /* Automatically search for the init system */
1469
1470                                 l = 1 + argc - optind;
1471                                 a = newa(char*, l + 1);
1472                                 memcpy(a + 1, argv + optind, l * sizeof(char*));
1473
1474                                 a[0] = (char*) "/usr/lib/systemd/systemd";
1475                                 execve(a[0], a, (char**) envp);
1476
1477                                 a[0] = (char*) "/lib/systemd/systemd";
1478                                 execve(a[0], a, (char**) envp);
1479
1480                                 a[0] = (char*) "/sbin/init";
1481                                 execve(a[0], a, (char**) envp);
1482                         } else if (argc > optind)
1483                                 execvpe(argv[optind], argv + optind, (char**) envp);
1484                         else {
1485                                 chdir(home ? home : "/root");
1486                                 execle("/bin/bash", "-bash", NULL, (char**) envp);
1487                         }
1488
1489                         log_error("execv() failed: %m");
1490
1491                 child_fail:
1492                         _exit(EXIT_FAILURE);
1493                 }
1494
1495                 log_info("Init process in the container running as PID %d", pid);
1496                 close_nointr_nofail(pipefd[0]);
1497                 close_nointr_nofail(pipefd[1]);
1498
1499                 fdset_free(fds);
1500                 fds = NULL;
1501
1502                 if (process_pty(master, pid, &mask) < 0)
1503                         goto finish;
1504
1505                 if (saved_attr_valid)
1506                         tcsetattr(STDIN_FILENO, TCSANOW, &saved_attr);
1507
1508                 r = wait_for_terminate(pid, &status);
1509                 if (r < 0) {
1510                         r = EXIT_FAILURE;
1511                         break;
1512                 }
1513
1514                 if (status.si_code == CLD_EXITED) {
1515                         if (status.si_status != 0) {
1516                                 log_error("Container failed with error code %i.", status.si_status);
1517                                 r = status.si_status;
1518                                 break;
1519                         }
1520
1521                         log_debug("Container exited successfully.");
1522                         break;
1523                 } else if (status.si_code == CLD_KILLED &&
1524                            status.si_status == SIGINT) {
1525                         log_info("Container has been shut down.");
1526                         r = 0;
1527                         break;
1528                 } else if (status.si_code == CLD_KILLED &&
1529                            status.si_status == SIGHUP) {
1530                         log_info("Container is being rebooted.");
1531                         continue;
1532                 } else if (status.si_code == CLD_KILLED ||
1533                            status.si_code == CLD_DUMPED) {
1534
1535                         log_error("Container terminated by signal %s.", signal_to_string(status.si_status));
1536                         r = EXIT_FAILURE;
1537                         break;
1538                 } else {
1539                         log_error("Container failed due to unknown reason.");
1540                         r = EXIT_FAILURE;
1541                         break;
1542                 }
1543         }
1544
1545 finish:
1546         if (saved_attr_valid)
1547                 tcsetattr(STDIN_FILENO, TCSANOW, &saved_attr);
1548
1549         if (master >= 0)
1550                 close_nointr_nofail(master);
1551
1552         close_pipe(kmsg_socket_pair);
1553
1554         if (oldcg)
1555                 cg_attach(SYSTEMD_CGROUP_CONTROLLER, oldcg, 0);
1556
1557         if (newcg)
1558                 cg_kill_recursive_and_wait(SYSTEMD_CGROUP_CONTROLLER, newcg, true);
1559
1560         free(arg_directory);
1561         strv_free(arg_controllers);
1562         free(oldcg);
1563         free(newcg);
1564
1565         fdset_free(fds);
1566
1567         return r;
1568 }