chiark / gitweb /
nspawn: no need to subscribe to netlink messages if we just want to execute one operation
[elogind.git] / src / nspawn / nspawn.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4   This file is part of systemd.
5
6   Copyright 2010 Lennart Poettering
7
8   systemd is free software; you can redistribute it and/or modify it
9   under the terms of the GNU Lesser General Public License as published by
10   the Free Software Foundation; either version 2.1 of the License, or
11   (at your option) any later version.
12
13   systemd is distributed in the hope that it will be useful, but
14   WITHOUT ANY WARRANTY; without even the implied warranty of
15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16   Lesser General Public License for more details.
17
18   You should have received a copy of the GNU Lesser General Public License
19   along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <signal.h>
23 #include <sched.h>
24 #include <unistd.h>
25 #include <sys/types.h>
26 #include <sys/syscall.h>
27 #include <sys/mount.h>
28 #include <sys/wait.h>
29 #include <stdlib.h>
30 #include <string.h>
31 #include <stdio.h>
32 #include <errno.h>
33 #include <sys/prctl.h>
34 #include <sys/capability.h>
35 #include <getopt.h>
36 #include <termios.h>
37 #include <sys/signalfd.h>
38 #include <grp.h>
39 #include <linux/fs.h>
40 #include <sys/un.h>
41 #include <sys/socket.h>
42 #include <linux/netlink.h>
43 #include <linux/rtnetlink.h>
44 #include <sys/eventfd.h>
45 #include <net/if.h>
46
47 #ifdef HAVE_SELINUX
48 #include <selinux/selinux.h>
49 #endif
50
51 #include "sd-daemon.h"
52 #include "sd-bus.h"
53 #include "sd-id128.h"
54 #include "sd-rtnl.h"
55 #include "log.h"
56 #include "util.h"
57 #include "mkdir.h"
58 #include "macro.h"
59 #include "audit.h"
60 #include "missing.h"
61 #include "cgroup-util.h"
62 #include "strv.h"
63 #include "path-util.h"
64 #include "loopback-setup.h"
65 #include "dev-setup.h"
66 #include "fdset.h"
67 #include "build.h"
68 #include "fileio.h"
69 #include "bus-util.h"
70 #include "bus-error.h"
71 #include "ptyfwd.h"
72 #include "bus-kernel.h"
73 #include "env-util.h"
74 #include "def.h"
75 #include "rtnl-util.h"
76
/* Journal linking mode, selected with --link-journal= (or -j). The
 * enumerators correspond to the "no", "auto", "host" and "guest"
 * keywords accepted on the command line. */
typedef enum LinkJournal {
        LINK_NO    = 0,
        LINK_AUTO  = 1,
        LINK_HOST  = 2,
        LINK_GUEST = 3
} LinkJournal;
83
84 static char *arg_directory = NULL;
85 static char *arg_user = NULL;
86 static sd_id128_t arg_uuid = {};
87 static char *arg_machine = NULL;
88 static char *arg_selinux_context = NULL;
89 static char *arg_selinux_apifs_context = NULL;
90 static const char *arg_slice = NULL;
91 static bool arg_private_network = false;
92 static bool arg_read_only = false;
93 static bool arg_boot = false;
94 static LinkJournal arg_link_journal = LINK_AUTO;
95 static uint64_t arg_retain =
96         (1ULL << CAP_CHOWN) |
97         (1ULL << CAP_DAC_OVERRIDE) |
98         (1ULL << CAP_DAC_READ_SEARCH) |
99         (1ULL << CAP_FOWNER) |
100         (1ULL << CAP_FSETID) |
101         (1ULL << CAP_IPC_OWNER) |
102         (1ULL << CAP_KILL) |
103         (1ULL << CAP_LEASE) |
104         (1ULL << CAP_LINUX_IMMUTABLE) |
105         (1ULL << CAP_NET_BIND_SERVICE) |
106         (1ULL << CAP_NET_BROADCAST) |
107         (1ULL << CAP_NET_RAW) |
108         (1ULL << CAP_SETGID) |
109         (1ULL << CAP_SETFCAP) |
110         (1ULL << CAP_SETPCAP) |
111         (1ULL << CAP_SETUID) |
112         (1ULL << CAP_SYS_ADMIN) |
113         (1ULL << CAP_SYS_CHROOT) |
114         (1ULL << CAP_SYS_NICE) |
115         (1ULL << CAP_SYS_PTRACE) |
116         (1ULL << CAP_SYS_TTY_CONFIG) |
117         (1ULL << CAP_SYS_RESOURCE) |
118         (1ULL << CAP_SYS_BOOT) |
119         (1ULL << CAP_AUDIT_WRITE) |
120         (1ULL << CAP_AUDIT_CONTROL) |
121         (1ULL << CAP_MKNOD);
122 static char **arg_bind = NULL;
123 static char **arg_bind_ro = NULL;
124 static char **arg_setenv = NULL;
125 static bool arg_quiet = false;
126 static bool arg_share_system = false;
127 static bool arg_register = true;
128 static bool arg_keep_unit = false;
129 static char **arg_network_interfaces = NULL;
130
/* Prints the command line synopsis and the list of supported options to
 * stdout. Always returns 0. */
static int help(void) {

        /* Note: "-j" is documented as "--link-journal=guest" because that
         * is what parse_argv() actually selects for it (LINK_GUEST). */
        printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
               "Spawn a minimal namespace container for debugging, testing and building.\n\n"
               "  -h --help                 Show this help\n"
               "     --version              Print version string\n"
               "  -D --directory=NAME       Root directory for the container\n"
               "  -b --boot                 Boot up full system (i.e. invoke init)\n"
               "  -u --user=USER            Run the command under specified user or uid\n"
               "     --uuid=UUID            Set a specific machine UUID for the container\n"
               "  -M --machine=NAME         Set the machine name for the container\n"
               "  -S --slice=SLICE          Place the container in the specified slice\n"
               "  -Z --selinux-context=SECLABEL\n"
               "                            Set the SELinux security context to be used by\n"
               "                            processes in the container\n"
               "  -L --selinux-apifs-context=SECLABEL\n"
               "                            Set the SELinux security context to be used by\n"
               "                            API/tmpfs file systems in the container\n"
               "     --private-network      Disable network in container\n"
               "     --network-interface=INTERFACE\n"
               "                            Assign an existing network interface to the container\n"
               "     --share-system         Share system namespaces with host\n"
               "     --read-only            Mount the root directory read-only\n"
               "     --capability=CAP       In addition to the default, retain specified\n"
               "                            capability\n"
               "     --drop-capability=CAP  Drop the specified capability from the default set\n"
               "     --link-journal=MODE    Link up guest journal, one of no, auto, guest, host\n"
               "  -j                        Equivalent to --link-journal=guest\n"
               "     --bind=PATH[:PATH]     Bind mount a file or directory from the host into\n"
               "                            the container\n"
               "     --bind-ro=PATH[:PATH]  Similar, but creates a read-only bind mount\n"
               "     --setenv=NAME=VALUE    Pass an environment variable to PID 1\n"
               "     --register=BOOLEAN     Register container as machine\n"
               "     --keep-unit            Do not register a scope for the machine, reuse\n"
               "                            the service unit nspawn is running in\n"
               "  -q --quiet                Do not show status information\n",
               program_invocation_short_name);

        return 0;
}
171
172 static int parse_argv(int argc, char *argv[]) {
173
174         enum {
175                 ARG_VERSION = 0x100,
176                 ARG_PRIVATE_NETWORK,
177                 ARG_UUID,
178                 ARG_READ_ONLY,
179                 ARG_CAPABILITY,
180                 ARG_DROP_CAPABILITY,
181                 ARG_LINK_JOURNAL,
182                 ARG_BIND,
183                 ARG_BIND_RO,
184                 ARG_SETENV,
185                 ARG_SHARE_SYSTEM,
186                 ARG_REGISTER,
187                 ARG_KEEP_UNIT,
188                 ARG_NETWORK_INTERFACE
189         };
190
191         static const struct option options[] = {
192                 { "help",                  no_argument,       NULL, 'h'                   },
193                 { "version",               no_argument,       NULL, ARG_VERSION           },
194                 { "directory",             required_argument, NULL, 'D'                   },
195                 { "user",                  required_argument, NULL, 'u'                   },
196                 { "private-network",       no_argument,       NULL, ARG_PRIVATE_NETWORK   },
197                 { "boot",                  no_argument,       NULL, 'b'                   },
198                 { "uuid",                  required_argument, NULL, ARG_UUID              },
199                 { "read-only",             no_argument,       NULL, ARG_READ_ONLY         },
200                 { "capability",            required_argument, NULL, ARG_CAPABILITY        },
201                 { "drop-capability",       required_argument, NULL, ARG_DROP_CAPABILITY   },
202                 { "link-journal",          required_argument, NULL, ARG_LINK_JOURNAL      },
203                 { "bind",                  required_argument, NULL, ARG_BIND              },
204                 { "bind-ro",               required_argument, NULL, ARG_BIND_RO           },
205                 { "machine",               required_argument, NULL, 'M'                   },
206                 { "slice",                 required_argument, NULL, 'S'                   },
207                 { "setenv",                required_argument, NULL, ARG_SETENV            },
208                 { "selinux-context",       required_argument, NULL, 'Z'                   },
209                 { "selinux-apifs-context", required_argument, NULL, 'L'                   },
210                 { "quiet",                 no_argument,       NULL, 'q'                   },
211                 { "share-system",          no_argument,       NULL, ARG_SHARE_SYSTEM      },
212                 { "register",              required_argument, NULL, ARG_REGISTER          },
213                 { "keep-unit",             no_argument,       NULL, ARG_KEEP_UNIT         },
214                 { "network-interface",     required_argument, NULL, ARG_NETWORK_INTERFACE },
215                 {}
216         };
217
218         int c, r;
219         uint64_t plus = 0, minus = 0;
220
221         assert(argc >= 0);
222         assert(argv);
223
224         while ((c = getopt_long(argc, argv, "+hD:u:bL:M:jS:Z:q", options, NULL)) >= 0) {
225
226                 switch (c) {
227
228                 case 'h':
229                         return help();
230
231                 case ARG_VERSION:
232                         puts(PACKAGE_STRING);
233                         puts(SYSTEMD_FEATURES);
234                         return 0;
235
236                 case 'D':
237                         free(arg_directory);
238                         arg_directory = canonicalize_file_name(optarg);
239                         if (!arg_directory) {
240                                 log_error("Invalid root directory: %m");
241                                 return -ENOMEM;
242                         }
243
244                         break;
245
246                 case 'u':
247                         free(arg_user);
248                         arg_user = strdup(optarg);
249                         if (!arg_user)
250                                 return log_oom();
251
252                         break;
253
254                 case ARG_NETWORK_INTERFACE:
255                         if (strv_push(&arg_network_interfaces, optarg) < 0)
256                                 return log_oom();
257
258                         /* fall through */
259
260                 case ARG_PRIVATE_NETWORK:
261                         arg_private_network = true;
262                         break;
263
264                 case 'b':
265                         arg_boot = true;
266                         break;
267
268                 case ARG_UUID:
269                         r = sd_id128_from_string(optarg, &arg_uuid);
270                         if (r < 0) {
271                                 log_error("Invalid UUID: %s", optarg);
272                                 return r;
273                         }
274                         break;
275
276                 case 'S':
277                         arg_slice = strdup(optarg);
278                         if (!arg_slice)
279                                 return log_oom();
280
281                         break;
282
283                 case 'M':
284                         if (isempty(optarg)) {
285                                 free(arg_machine);
286                                 arg_machine = NULL;
287                         } else {
288
289                                 if (!hostname_is_valid(optarg)) {
290                                         log_error("Invalid machine name: %s", optarg);
291                                         return -EINVAL;
292                                 }
293
294                                 free(arg_machine);
295                                 arg_machine = strdup(optarg);
296                                 if (!arg_machine)
297                                         return log_oom();
298
299                                 break;
300                         }
301
302                 case 'Z':
303                         arg_selinux_context = optarg;
304                         break;
305
306                 case 'L':
307                         arg_selinux_apifs_context = optarg;
308                         break;
309
310                 case ARG_READ_ONLY:
311                         arg_read_only = true;
312                         break;
313
314                 case ARG_CAPABILITY:
315                 case ARG_DROP_CAPABILITY: {
316                         char *state, *word;
317                         size_t length;
318
319                         FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
320                                 _cleanup_free_ char *t;
321                                 cap_value_t cap;
322
323                                 t = strndup(word, length);
324                                 if (!t)
325                                         return log_oom();
326
327                                 if (streq(t, "all")) {
328                                         if (c == ARG_CAPABILITY)
329                                                 plus = (uint64_t) -1;
330                                         else
331                                                 minus = (uint64_t) -1;
332                                 } else {
333                                         if (cap_from_name(t, &cap) < 0) {
334                                                 log_error("Failed to parse capability %s.", t);
335                                                 return -EINVAL;
336                                         }
337
338                                         if (c == ARG_CAPABILITY)
339                                                 plus |= 1ULL << (uint64_t) cap;
340                                         else
341                                                 minus |= 1ULL << (uint64_t) cap;
342                                 }
343                         }
344
345                         break;
346                 }
347
348                 case 'j':
349                         arg_link_journal = LINK_GUEST;
350                         break;
351
352                 case ARG_LINK_JOURNAL:
353                         if (streq(optarg, "auto"))
354                                 arg_link_journal = LINK_AUTO;
355                         else if (streq(optarg, "no"))
356                                 arg_link_journal = LINK_NO;
357                         else if (streq(optarg, "guest"))
358                                 arg_link_journal = LINK_GUEST;
359                         else if (streq(optarg, "host"))
360                                 arg_link_journal = LINK_HOST;
361                         else {
362                                 log_error("Failed to parse link journal mode %s", optarg);
363                                 return -EINVAL;
364                         }
365
366                         break;
367
368                 case ARG_BIND:
369                 case ARG_BIND_RO: {
370                         _cleanup_free_ char *a = NULL, *b = NULL;
371                         char *e;
372                         char ***x;
373
374                         x = c == ARG_BIND ? &arg_bind : &arg_bind_ro;
375
376                         e = strchr(optarg, ':');
377                         if (e) {
378                                 a = strndup(optarg, e - optarg);
379                                 b = strdup(e + 1);
380                         } else {
381                                 a = strdup(optarg);
382                                 b = strdup(optarg);
383                         }
384
385                         if (!a || !b)
386                                 return log_oom();
387
388                         if (!path_is_absolute(a) || !path_is_absolute(b)) {
389                                 log_error("Invalid bind mount specification: %s", optarg);
390                                 return -EINVAL;
391                         }
392
393                         r = strv_extend(x, a);
394                         if (r < 0)
395                                 return log_oom();
396
397                         r = strv_extend(x, b);
398                         if (r < 0)
399                                 return log_oom();
400
401                         break;
402                 }
403
404                 case ARG_SETENV: {
405                         char **n;
406
407                         if (!env_assignment_is_valid(optarg)) {
408                                 log_error("Environment variable assignment '%s' is not valid.", optarg);
409                                 return -EINVAL;
410                         }
411
412                         n = strv_env_set(arg_setenv, optarg);
413                         if (!n)
414                                 return log_oom();
415
416                         strv_free(arg_setenv);
417                         arg_setenv = n;
418                         break;
419                 }
420
421                 case 'q':
422                         arg_quiet = true;
423                         break;
424
425                 case ARG_SHARE_SYSTEM:
426                         arg_share_system = true;
427                         break;
428
429                 case ARG_REGISTER:
430                         r = parse_boolean(optarg);
431                         if (r < 0) {
432                                 log_error("Failed to parse --register= argument: %s", optarg);
433                                 return r;
434                         }
435
436                         arg_register = r;
437                         break;
438
439                 case ARG_KEEP_UNIT:
440                         arg_keep_unit = true;
441                         break;
442
443                 case '?':
444                         return -EINVAL;
445
446                 default:
447                         assert_not_reached("Unhandled option");
448                 }
449         }
450
451         if (arg_share_system)
452                 arg_register = false;
453
454         if (arg_boot && arg_share_system) {
455                 log_error("--boot and --share-system may not be combined.");
456                 return -EINVAL;
457         }
458
459         if (arg_keep_unit && cg_pid_get_owner_uid(0, NULL) >= 0) {
460                 log_error("--keep-unit may not be used when invoked from a user session.");
461                 return -EINVAL;
462         }
463
464         arg_retain = (arg_retain | plus | (arg_private_network ? 1ULL << CAP_NET_ADMIN : 0)) & ~minus;
465
466         return 1;
467 }
468
469 static int mount_all(const char *dest) {
470
471         typedef struct MountPoint {
472                 const char *what;
473                 const char *where;
474                 const char *type;
475                 const char *options;
476                 unsigned long flags;
477                 bool fatal;
478         } MountPoint;
479
480         static const MountPoint mount_table[] = {
481                 { "proc",      "/proc",     "proc",  NULL,       MS_NOSUID|MS_NOEXEC|MS_NODEV, true  },
482                 { "/proc/sys", "/proc/sys", NULL,    NULL,       MS_BIND, true                       },   /* Bind mount first */
483                 { NULL,        "/proc/sys", NULL,    NULL,       MS_BIND|MS_RDONLY|MS_REMOUNT, true  },   /* Then, make it r/o */
484                 { "sysfs",     "/sys",      "sysfs", NULL,       MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true  },
485                 { "tmpfs",     "/dev",      "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME,     true  },
486                 { "devpts",    "/dev/pts",  "devpts","newinstance,ptmxmode=0666,mode=620,gid=" STRINGIFY(TTY_GID), MS_NOSUID|MS_NOEXEC, true },
487                 { "tmpfs",     "/dev/shm",  "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true  },
488                 { "tmpfs",     "/run",      "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true  },
489 #ifdef HAVE_SELINUX
490                 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND,                      false },  /* Bind mount first */
491                 { NULL,              "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, false },  /* Then, make it r/o */
492 #endif
493         };
494
495         unsigned k;
496         int r = 0;
497
498         for (k = 0; k < ELEMENTSOF(mount_table); k++) {
499                 _cleanup_free_ char *where = NULL;
500 #ifdef HAVE_SELINUX
501                 _cleanup_free_ char *options = NULL;
502 #endif
503                 const char *o;
504                 int t;
505
506                 where = strjoin(dest, "/", mount_table[k].where, NULL);
507                 if (!where)
508                         return log_oom();
509
510                 t = path_is_mount_point(where, true);
511                 if (t < 0) {
512                         log_error("Failed to detect whether %s is a mount point: %s", where, strerror(-t));
513
514                         if (r == 0)
515                                 r = t;
516
517                         continue;
518                 }
519
520                 /* Skip this entry if it is not a remount. */
521                 if (mount_table[k].what && t > 0)
522                         continue;
523
524                 mkdir_p(where, 0755);
525
526 #ifdef HAVE_SELINUX
527                 if (arg_selinux_apifs_context &&
528                     (streq_ptr(mount_table[k].what, "tmpfs") || streq_ptr(mount_table[k].what, "devpts"))) {
529                         options = strjoin(mount_table[k].options, ",context=\"", arg_selinux_apifs_context, "\"", NULL);
530                         if (!options)
531                                 return log_oom();
532
533                         o = options;
534                 } else
535 #endif
536                         o = mount_table[k].options;
537
538
539                 if (mount(mount_table[k].what,
540                           where,
541                           mount_table[k].type,
542                           mount_table[k].flags,
543                           o) < 0 &&
544                     mount_table[k].fatal) {
545
546                         log_error("mount(%s) failed: %m", where);
547
548                         if (r == 0)
549                                 r = -errno;
550                 }
551         }
552
553         return r;
554 }
555
556 static int mount_binds(const char *dest, char **l, unsigned long flags) {
557         char **x, **y;
558
559         STRV_FOREACH_PAIR(x, y, l) {
560                 char *where;
561                 struct stat source_st, dest_st;
562                 int r;
563
564                 if (stat(*x, &source_st) < 0) {
565                         log_error("failed to stat %s: %m", *x);
566                         return -errno;
567                 }
568
569                 where = strappenda(dest, *y);
570                 r = stat(where, &dest_st);
571                 if (r == 0) {
572                         if ((source_st.st_mode & S_IFMT) != (dest_st.st_mode & S_IFMT)) {
573                                 log_error("The file types of %s and %s do not match. Refusing bind mount",
574                                                 *x, where);
575                                 return -EINVAL;
576                         }
577                 } else if (errno == ENOENT) {
578                         r = mkdir_parents_label(where, 0755);
579                         if (r < 0) {
580                                 log_error("Failed to bind mount %s: %s", *x, strerror(-r));
581                                 return r;
582                         }
583                 } else {
584                         log_error("Failed to bind mount %s: %s", *x, strerror(errno));
585                         return -errno;
586                 }
587                 /* Create the mount point, but be conservative -- refuse to create block
588                 * and char devices. */
589                 if (S_ISDIR(source_st.st_mode))
590                         mkdir_label(where, 0755);
591                 else if (S_ISFIFO(source_st.st_mode))
592                         mkfifo(where, 0644);
593                 else if (S_ISSOCK(source_st.st_mode))
594                         mknod(where, 0644 | S_IFSOCK, 0);
595                 else if (S_ISREG(source_st.st_mode))
596                         touch(where);
597                 else {
598                         log_error("Refusing to create mountpoint for file: %s", *x);
599                         return -ENOTSUP;
600                 }
601
602                 if (mount(*x, where, "bind", MS_BIND, NULL) < 0) {
603                         log_error("mount(%s) failed: %m", where);
604                         return -errno;
605                 }
606
607                 if (flags && mount(NULL, where, NULL, MS_REMOUNT|MS_BIND|flags, NULL) < 0) {
608                         log_error("mount(%s) failed: %m", where);
609                         return -errno;
610                 }
611         }
612
613         return 0;
614 }
615
616 static int setup_timezone(const char *dest) {
617         _cleanup_free_ char *where = NULL, *p = NULL, *q = NULL, *check = NULL, *what = NULL;
618         char *z, *y;
619         int r;
620
621         assert(dest);
622
623         /* Fix the timezone, if possible */
624         r = readlink_malloc("/etc/localtime", &p);
625         if (r < 0) {
626                 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
627                 return 0;
628         }
629
630         z = path_startswith(p, "../usr/share/zoneinfo/");
631         if (!z)
632                 z = path_startswith(p, "/usr/share/zoneinfo/");
633         if (!z) {
634                 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
635                 return 0;
636         }
637
638         where = strappend(dest, "/etc/localtime");
639         if (!where)
640                 return log_oom();
641
642         r = readlink_malloc(where, &q);
643         if (r >= 0) {
644                 y = path_startswith(q, "../usr/share/zoneinfo/");
645                 if (!y)
646                         y = path_startswith(q, "/usr/share/zoneinfo/");
647
648
649                 /* Already pointing to the right place? Then do nothing .. */
650                 if (y && streq(y, z))
651                         return 0;
652         }
653
654         check = strjoin(dest, "/usr/share/zoneinfo/", z, NULL);
655         if (!check)
656                 return log_oom();
657
658         if (access(check, F_OK) < 0) {
659                 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
660                 return 0;
661         }
662
663         what = strappend("../usr/share/zoneinfo/", z);
664         if (!what)
665                 return log_oom();
666
667         unlink(where);
668         if (symlink(what, where) < 0) {
669                 log_error("Failed to correct timezone of container: %m");
670                 return 0;
671         }
672
673         return 0;
674 }
675
676 static int setup_resolv_conf(const char *dest) {
677         char _cleanup_free_ *where = NULL;
678
679         assert(dest);
680
681         if (arg_private_network)
682                 return 0;
683
684         /* Fix resolv.conf, if possible */
685         where = strappend(dest, "/etc/resolv.conf");
686         if (!where)
687                 return log_oom();
688
689         /* We don't really care for the results of this really. If it
690          * fails, it fails, but meh... */
691         copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW);
692
693         return 0;
694 }
695
696 static int setup_boot_id(const char *dest) {
697         _cleanup_free_ char *from = NULL, *to = NULL;
698         sd_id128_t rnd;
699         char as_uuid[37];
700         int r;
701
702         assert(dest);
703
704         if (arg_share_system)
705                 return 0;
706
707         /* Generate a new randomized boot ID, so that each boot-up of
708          * the container gets a new one */
709
710         from = strappend(dest, "/dev/proc-sys-kernel-random-boot-id");
711         to = strappend(dest, "/proc/sys/kernel/random/boot_id");
712         if (!from || !to)
713                 return log_oom();
714
715         r = sd_id128_randomize(&rnd);
716         if (r < 0) {
717                 log_error("Failed to generate random boot id: %s", strerror(-r));
718                 return r;
719         }
720
721         snprintf(as_uuid, sizeof(as_uuid),
722                  "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
723                  SD_ID128_FORMAT_VAL(rnd));
724         char_array_0(as_uuid);
725
726         r = write_string_file(from, as_uuid);
727         if (r < 0) {
728                 log_error("Failed to write boot id: %s", strerror(-r));
729                 return r;
730         }
731
732         if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
733                 log_error("Failed to bind mount boot id: %m");
734                 r = -errno;
735         } else if (mount(from, to, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL))
736                 log_warning("Failed to make boot id read-only: %m");
737
738         unlink(from);
739         return r;
740 }
741
742 static int copy_devnodes(const char *dest) {
743
744         static const char devnodes[] =
745                 "null\0"
746                 "zero\0"
747                 "full\0"
748                 "random\0"
749                 "urandom\0"
750                 "tty\0";
751
752         const char *d;
753         int r = 0;
754         _cleanup_umask_ mode_t u;
755
756         assert(dest);
757
758         u = umask(0000);
759
760         NULSTR_FOREACH(d, devnodes) {
761                 _cleanup_free_ char *from = NULL, *to = NULL;
762                 struct stat st;
763
764                 from = strappend("/dev/", d);
765                 to = strjoin(dest, "/dev/", d, NULL);
766                 if (!from || !to)
767                         return log_oom();
768
769                 if (stat(from, &st) < 0) {
770
771                         if (errno != ENOENT) {
772                                 log_error("Failed to stat %s: %m", from);
773                                 return -errno;
774                         }
775
776                 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
777
778                         log_error("%s is not a char or block device, cannot copy", from);
779                         return -EIO;
780
781                 } else if (mknod(to, st.st_mode, st.st_rdev) < 0) {
782
783                         log_error("mknod(%s) failed: %m", dest);
784                         return  -errno;
785                 }
786         }
787
788         return r;
789 }
790
791 static int setup_ptmx(const char *dest) {
792         _cleanup_free_ char *p = NULL;
793
794         p = strappend(dest, "/dev/ptmx");
795         if (!p)
796                 return log_oom();
797
798         if (symlink("pts/ptmx", p) < 0) {
799                 log_error("Failed to create /dev/ptmx symlink: %m");
800                 return -errno;
801         }
802
803         return 0;
804 }
805
806 static int setup_dev_console(const char *dest, const char *console) {
807         struct stat st;
808         _cleanup_free_ char *to = NULL;
809         int r;
810         _cleanup_umask_ mode_t u;
811
812         assert(dest);
813         assert(console);
814
815         u = umask(0000);
816
817         if (stat(console, &st) < 0) {
818                 log_error("Failed to stat %s: %m", console);
819                 return -errno;
820
821         } else if (!S_ISCHR(st.st_mode)) {
822                 log_error("/dev/console is not a char device");
823                 return -EIO;
824         }
825
826         r = chmod_and_chown(console, 0600, 0, 0);
827         if (r < 0) {
828                 log_error("Failed to correct access mode for TTY: %s", strerror(-r));
829                 return r;
830         }
831
832         if (asprintf(&to, "%s/dev/console", dest) < 0)
833                 return log_oom();
834
835         /* We need to bind mount the right tty to /dev/console since
836          * ptys can only exist on pts file systems. To have something
837          * to bind mount things on we create a device node first, that
838          * has the right major/minor (note that the major minor
839          * doesn't actually matter here, since we mount it over
840          * anyway). */
841
842         if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0) {
843                 log_error("mknod() for /dev/console failed: %m");
844                 return -errno;
845         }
846
847         if (mount(console, to, "bind", MS_BIND, NULL) < 0) {
848                 log_error("Bind mount for /dev/console failed: %m");
849                 return -errno;
850         }
851
852         return 0;
853 }
854
855 static int setup_kmsg(const char *dest, int kmsg_socket) {
856         _cleanup_free_ char *from = NULL, *to = NULL;
857         int r, fd, k;
858         _cleanup_umask_ mode_t u;
859         union {
860                 struct cmsghdr cmsghdr;
861                 uint8_t buf[CMSG_SPACE(sizeof(int))];
862         } control = {};
863         struct msghdr mh = {
864                 .msg_control = &control,
865                 .msg_controllen = sizeof(control),
866         };
867         struct cmsghdr *cmsg;
868
869         assert(dest);
870         assert(kmsg_socket >= 0);
871
872         u = umask(0000);
873
874         /* We create the kmsg FIFO as /dev/kmsg, but immediately
875          * delete it after bind mounting it to /proc/kmsg. While FIFOs
876          * on the reading side behave very similar to /proc/kmsg,
877          * their writing side behaves differently from /dev/kmsg in
878          * that writing blocks when nothing is reading. In order to
879          * avoid any problems with containers deadlocking due to this
880          * we simply make /dev/kmsg unavailable to the container. */
881         if (asprintf(&from, "%s/dev/kmsg", dest) < 0 ||
882             asprintf(&to, "%s/proc/kmsg", dest) < 0)
883                 return log_oom();
884
885         if (mkfifo(from, 0600) < 0) {
886                 log_error("mkfifo() for /dev/kmsg failed: %m");
887                 return -errno;
888         }
889
890         r = chmod_and_chown(from, 0600, 0, 0);
891         if (r < 0) {
892                 log_error("Failed to correct access mode for /dev/kmsg: %s", strerror(-r));
893                 return r;
894         }
895
896         if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
897                 log_error("Bind mount for /proc/kmsg failed: %m");
898                 return -errno;
899         }
900
901         fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
902         if (fd < 0) {
903                 log_error("Failed to open fifo: %m");
904                 return -errno;
905         }
906
907         cmsg = CMSG_FIRSTHDR(&mh);
908         cmsg->cmsg_level = SOL_SOCKET;
909         cmsg->cmsg_type = SCM_RIGHTS;
910         cmsg->cmsg_len = CMSG_LEN(sizeof(int));
911         memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
912
913         mh.msg_controllen = cmsg->cmsg_len;
914
915         /* Store away the fd in the socket, so that it stays open as
916          * long as we run the child */
917         k = sendmsg(kmsg_socket, &mh, MSG_DONTWAIT|MSG_NOSIGNAL);
918         close_nointr_nofail(fd);
919
920         if (k < 0) {
921                 log_error("Failed to send FIFO fd: %m");
922                 return -errno;
923         }
924
925         /* And now make the FIFO unavailable as /dev/kmsg... */
926         unlink(from);
927         return 0;
928 }
929
930 static int setup_hostname(void) {
931
932         if (arg_share_system)
933                 return 0;
934
935         if (sethostname(arg_machine, strlen(arg_machine)) < 0)
936                 return -errno;
937
938         return 0;
939 }
940
941 static int setup_journal(const char *directory) {
942         sd_id128_t machine_id, this_id;
943         _cleanup_free_ char *p = NULL, *b = NULL, *q = NULL, *d = NULL;
944         char *id;
945         int r;
946
947         p = strappend(directory, "/etc/machine-id");
948         if (!p)
949                 return log_oom();
950
951         r = read_one_line_file(p, &b);
952         if (r == -ENOENT && arg_link_journal == LINK_AUTO)
953                 return 0;
954         else if (r < 0) {
955                 log_error("Failed to read machine ID from %s: %s", p, strerror(-r));
956                 return r;
957         }
958
959         id = strstrip(b);
960         if (isempty(id) && arg_link_journal == LINK_AUTO)
961                 return 0;
962
963         /* Verify validity */
964         r = sd_id128_from_string(id, &machine_id);
965         if (r < 0) {
966                 log_error("Failed to parse machine ID from %s: %s", p, strerror(-r));
967                 return r;
968         }
969
970         r = sd_id128_get_machine(&this_id);
971         if (r < 0) {
972                 log_error("Failed to retrieve machine ID: %s", strerror(-r));
973                 return r;
974         }
975
976         if (sd_id128_equal(machine_id, this_id)) {
977                 log_full(arg_link_journal == LINK_AUTO ? LOG_WARNING : LOG_ERR,
978                          "Host and machine ids are equal (%s): refusing to link journals", id);
979                 if (arg_link_journal == LINK_AUTO)
980                         return 0;
981                 return
982                         -EEXIST;
983         }
984
985         if (arg_link_journal == LINK_NO)
986                 return 0;
987
988         free(p);
989         p = strappend("/var/log/journal/", id);
990         q = strjoin(directory, "/var/log/journal/", id, NULL);
991         if (!p || !q)
992                 return log_oom();
993
994         if (path_is_mount_point(p, false) > 0) {
995                 if (arg_link_journal != LINK_AUTO) {
996                         log_error("%s: already a mount point, refusing to use for journal", p);
997                         return -EEXIST;
998                 }
999
1000                 return 0;
1001         }
1002
1003         if (path_is_mount_point(q, false) > 0) {
1004                 if (arg_link_journal != LINK_AUTO) {
1005                         log_error("%s: already a mount point, refusing to use for journal", q);
1006                         return -EEXIST;
1007                 }
1008
1009                 return 0;
1010         }
1011
1012         r = readlink_and_make_absolute(p, &d);
1013         if (r >= 0) {
1014                 if ((arg_link_journal == LINK_GUEST ||
1015                      arg_link_journal == LINK_AUTO) &&
1016                     path_equal(d, q)) {
1017
1018                         r = mkdir_p(q, 0755);
1019                         if (r < 0)
1020                                 log_warning("failed to create directory %s: %m", q);
1021                         return 0;
1022                 }
1023
1024                 if (unlink(p) < 0) {
1025                         log_error("Failed to remove symlink %s: %m", p);
1026                         return -errno;
1027                 }
1028         } else if (r == -EINVAL) {
1029
1030                 if (arg_link_journal == LINK_GUEST &&
1031                     rmdir(p) < 0) {
1032
1033                         if (errno == ENOTDIR) {
1034                                 log_error("%s already exists and is neither a symlink nor a directory", p);
1035                                 return r;
1036                         } else {
1037                                 log_error("Failed to remove %s: %m", p);
1038                                 return -errno;
1039                         }
1040                 }
1041         } else if (r != -ENOENT) {
1042                 log_error("readlink(%s) failed: %m", p);
1043                 return r;
1044         }
1045
1046         if (arg_link_journal == LINK_GUEST) {
1047
1048                 if (symlink(q, p) < 0) {
1049                         log_error("Failed to symlink %s to %s: %m", q, p);
1050                         return -errno;
1051                 }
1052
1053                 r = mkdir_p(q, 0755);
1054                 if (r < 0)
1055                         log_warning("failed to create directory %s: %m", q);
1056                 return 0;
1057         }
1058
1059         if (arg_link_journal == LINK_HOST) {
1060                 r = mkdir_p(p, 0755);
1061                 if (r < 0) {
1062                         log_error("Failed to create %s: %m", p);
1063                         return r;
1064                 }
1065
1066         } else if (access(p, F_OK) < 0)
1067                 return 0;
1068
1069         if (dir_is_empty(q) == 0) {
1070                 log_error("%s not empty.", q);
1071                 return -ENOTEMPTY;
1072         }
1073
1074         r = mkdir_p(q, 0755);
1075         if (r < 0) {
1076                 log_error("Failed to create %s: %m", q);
1077                 return r;
1078         }
1079
1080         if (mount(p, q, "bind", MS_BIND, NULL) < 0) {
1081                 log_error("Failed to bind mount journal from host into guest: %m");
1082                 return -errno;
1083         }
1084
1085         return 0;
1086 }
1087
1088 static int setup_kdbus(const char *dest, const char *path) {
1089         const char *p;
1090
1091         if (!path)
1092                 return 0;
1093
1094         p = strappenda(dest, "/dev/kdbus");
1095         if (mkdir(p, 0755) < 0) {
1096                 log_error("Failed to create kdbus path: %m");
1097                 return  -errno;
1098         }
1099
1100         if (mount(path, p, "bind", MS_BIND, NULL) < 0) {
1101                 log_error("Failed to mount kdbus domain path: %m");
1102                 return -errno;
1103         }
1104
1105         return 0;
1106 }
1107
1108 static int drop_capabilities(void) {
1109         return capability_bounding_set_drop(~arg_retain, false);
1110 }
1111
1112 static int register_machine(pid_t pid) {
1113         _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1114         _cleanup_bus_unref_ sd_bus *bus = NULL;
1115         int r;
1116
1117         if (!arg_register)
1118                 return 0;
1119
1120         r = sd_bus_default_system(&bus);
1121         if (r < 0) {
1122                 log_error("Failed to open system bus: %s", strerror(-r));
1123                 return r;
1124         }
1125
1126         if (arg_keep_unit) {
1127                 r = sd_bus_call_method(
1128                                 bus,
1129                                 "org.freedesktop.machine1",
1130                                 "/org/freedesktop/machine1",
1131                                 "org.freedesktop.machine1.Manager",
1132                                 "RegisterMachine",
1133                                 &error,
1134                                 NULL,
1135                                 "sayssus",
1136                                 arg_machine,
1137                                 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1138                                 "nspawn",
1139                                 "container",
1140                                 (uint32_t) pid,
1141                                 strempty(arg_directory));
1142         } else {
1143                 r = sd_bus_call_method(
1144                                 bus,
1145                                 "org.freedesktop.machine1",
1146                                 "/org/freedesktop/machine1",
1147                                 "org.freedesktop.machine1.Manager",
1148                                 "CreateMachine",
1149                                 &error,
1150                                 NULL,
1151                                 "sayssusa(sv)",
1152                                 arg_machine,
1153                                 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1154                                 "nspawn",
1155                                 "container",
1156                                 (uint32_t) pid,
1157                                 strempty(arg_directory),
1158                                 !isempty(arg_slice), "Slice", "s", arg_slice);
1159         }
1160
1161         if (r < 0) {
1162                 log_error("Failed to register machine: %s", bus_error_message(&error, r));
1163                 return r;
1164         }
1165
1166         return 0;
1167 }
1168
1169 static int terminate_machine(pid_t pid) {
1170         _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1171         _cleanup_bus_message_unref_ sd_bus_message *reply = NULL;
1172         _cleanup_bus_unref_ sd_bus *bus = NULL;
1173         const char *path;
1174         int r;
1175
1176         if (!arg_register)
1177                 return 0;
1178
1179         r = sd_bus_default_system(&bus);
1180         if (r < 0) {
1181                 log_error("Failed to open system bus: %s", strerror(-r));
1182                 return r;
1183         }
1184
1185         r = sd_bus_call_method(
1186                         bus,
1187                         "org.freedesktop.machine1",
1188                         "/org/freedesktop/machine1",
1189                         "org.freedesktop.machine1.Manager",
1190                         "GetMachineByPID",
1191                         &error,
1192                         &reply,
1193                         "u",
1194                         (uint32_t) pid);
1195         if (r < 0) {
1196                 /* Note that the machine might already have been
1197                  * cleaned up automatically, hence don't consider it a
1198                  * failure if we cannot get the machine object. */
1199                 log_debug("Failed to get machine: %s", bus_error_message(&error, r));
1200                 return 0;
1201         }
1202
1203         r = sd_bus_message_read(reply, "o", &path);
1204         if (r < 0)
1205                 return bus_log_parse_error(r);
1206
1207         r = sd_bus_call_method(
1208                         bus,
1209                         "org.freedesktop.machine1",
1210                         path,
1211                         "org.freedesktop.machine1.Machine",
1212                         "Terminate",
1213                         &error,
1214                         NULL,
1215                         NULL);
1216         if (r < 0) {
1217                 log_debug("Failed to terminate machine: %s", bus_error_message(&error, r));
1218                 return 0;
1219         }
1220
1221         return 0;
1222 }
1223
1224 static int reset_audit_loginuid(void) {
1225         _cleanup_free_ char *p = NULL;
1226         int r;
1227
1228         if (arg_share_system)
1229                 return 0;
1230
1231         r = read_one_line_file("/proc/self/loginuid", &p);
1232         if (r == -EEXIST)
1233                 return 0;
1234         if (r < 0) {
1235                 log_error("Failed to read /proc/self/loginuid: %s", strerror(-r));
1236                 return r;
1237         }
1238
1239         /* Already reset? */
1240         if (streq(p, "4294967295"))
1241                 return 0;
1242
1243         r = write_string_file("/proc/self/loginuid", "4294967295");
1244         if (r < 0) {
1245                 log_error("Failed to reset audit login UID. This probably means that your kernel is too\n"
1246                           "old and you have audit enabled. Note that the auditing subsystem is known to\n"
1247                           "be incompatible with containers on old kernels. Please make sure to upgrade\n"
1248                           "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
1249                           "using systemd-nspawn. Sleeping for 5s... (%s)\n", strerror(-r));
1250
1251                 sleep(5);
1252         }
1253
1254         return 0;
1255 }
1256
1257 static int move_network_interfaces(pid_t pid) {
1258         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1259         char **i;
1260         int r;
1261
1262         if (!arg_private_network)
1263                 return 0;
1264
1265         if (strv_isempty(arg_network_interfaces))
1266                 return 0;
1267
1268         r = sd_rtnl_open(0, &rtnl);
1269         if (r < 0) {
1270                 log_error("Failed to connect to netlink: %s", strerror(-r));
1271                 return r;
1272         }
1273
1274         STRV_FOREACH(i, arg_network_interfaces) {
1275                 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1276                 int ifi;
1277
1278                 ifi = (int) if_nametoindex(*i);
1279                 if (ifi <= 0) {
1280                         log_error("Failed to resolve interface %s: %m", *i);
1281                         return -errno;
1282                 }
1283
1284                 r = sd_rtnl_message_new_link(RTM_NEWLINK, ifi, &m);
1285                 if (r < 0) {
1286                         log_error("Failed to allocate netlink message: %s", strerror(-r));
1287                         return r;
1288                 }
1289
1290                 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
1291                 if (r < 0) {
1292                         log_error("Failed to append namespace PID to netlink message: %s", strerror(-r));
1293                         return r;
1294                 }
1295
1296                 r = sd_rtnl_call(rtnl, m, 0, NULL);
1297                 if (r < 0) {
1298                         log_error("Failed to move interface %s to namespace: %s", *i, strerror(-r));
1299                         return r;
1300                 }
1301         }
1302
1303         return 0;
1304 }
1305
1306 int main(int argc, char *argv[]) {
1307         pid_t pid = 0;
1308         int r = EXIT_FAILURE, k;
1309         _cleanup_close_ int master = -1, kdbus_fd = -1, sync_fd = -1;
1310         int n_fd_passed;
1311         const char *console = NULL;
1312         sigset_t mask;
1313         _cleanup_close_pipe_ int kmsg_socket_pair[2] = { -1, -1 };
1314         _cleanup_fdset_free_ FDSet *fds = NULL;
1315         _cleanup_free_ char *kdbus_domain = NULL;
1316
1317         log_parse_environment();
1318         log_open();
1319
1320         k = parse_argv(argc, argv);
1321         if (k < 0)
1322                 goto finish;
1323         else if (k == 0) {
1324                 r = EXIT_SUCCESS;
1325                 goto finish;
1326         }
1327
1328         if (arg_directory) {
1329                 char *p;
1330
1331                 p = path_make_absolute_cwd(arg_directory);
1332                 free(arg_directory);
1333                 arg_directory = p;
1334         } else
1335                 arg_directory = get_current_dir_name();
1336
1337         if (!arg_directory) {
1338                 log_error("Failed to determine path, please use -D.");
1339                 goto finish;
1340         }
1341
1342         path_kill_slashes(arg_directory);
1343
1344         if (!arg_machine) {
1345                 arg_machine = strdup(basename(arg_directory));
1346                 if (!arg_machine) {
1347                         log_oom();
1348                         goto finish;
1349                 }
1350
1351                 hostname_cleanup(arg_machine, false);
1352                 if (isempty(arg_machine)) {
1353                         log_error("Failed to determine machine name automatically, please use -M.");
1354                         goto finish;
1355                 }
1356         }
1357
1358         if (geteuid() != 0) {
1359                 log_error("Need to be root.");
1360                 goto finish;
1361         }
1362
1363         if (sd_booted() <= 0) {
1364                 log_error("Not running on a systemd system.");
1365                 goto finish;
1366         }
1367
1368         if (path_equal(arg_directory, "/")) {
1369                 log_error("Spawning container on root directory not supported.");
1370                 goto finish;
1371         }
1372
1373         if (arg_boot && path_is_os_tree(arg_directory) <= 0) {
1374                 log_error("Directory %s doesn't look like an OS root directory (/etc/os-release is missing). Refusing.", arg_directory);
1375                 goto finish;
1376         }
1377
1378         log_close();
1379         n_fd_passed = sd_listen_fds(false);
1380         if (n_fd_passed > 0) {
1381                 k = fdset_new_listen_fds(&fds, false);
1382                 if (k < 0) {
1383                         log_error("Failed to collect file descriptors: %s", strerror(-k));
1384                         goto finish;
1385                 }
1386         }
1387         fdset_close_others(fds);
1388         log_open();
1389
1390         master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
1391         if (master < 0) {
1392                 log_error("Failed to acquire pseudo tty: %m");
1393                 goto finish;
1394         }
1395
1396         console = ptsname(master);
1397         if (!console) {
1398                 log_error("Failed to determine tty name: %m");
1399                 goto finish;
1400         }
1401
1402         if (!arg_quiet)
1403                 log_info("Spawning container %s on %s. Press ^] three times within 1s to abort execution.", arg_machine, arg_directory);
1404
1405         if (unlockpt(master) < 0) {
1406                 log_error("Failed to unlock tty: %m");
1407                 goto finish;
1408         }
1409
1410
1411         if (access("/dev/kdbus/control", F_OK) >= 0) {
1412
1413                 if (arg_share_system) {
1414                         kdbus_domain = strdup("/dev/kdbus");
1415                         if (!kdbus_domain) {
1416                                 log_oom();
1417                                 goto finish;
1418                         }
1419                 } else {
1420                         const char *ns;
1421
1422                         ns = strappenda("machine-", arg_machine);
1423                         kdbus_fd = bus_kernel_create_domain(ns, &kdbus_domain);
1424                         if (r < 0)
1425                                 log_debug("Failed to create kdbus domain: %s", strerror(-r));
1426                         else
1427                                 log_debug("Successfully created kdbus domain as %s", kdbus_domain);
1428                 }
1429         }
1430
1431         if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_NONBLOCK|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
1432                 log_error("Failed to create kmsg socket pair: %m");
1433                 goto finish;
1434         }
1435
1436         sd_notify(0, "READY=1");
1437
1438         assert_se(sigemptyset(&mask) == 0);
1439         sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
1440         assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
1441
1442         for (;;) {
1443                 siginfo_t status;
1444
1445                 sync_fd = eventfd(0, EFD_CLOEXEC);
1446                 if (sync_fd < 0) {
1447                         log_error("Failed to create event fd: %m");
1448                         goto finish;
1449                 }
1450
1451                 pid = syscall(__NR_clone,
1452                               SIGCHLD|CLONE_NEWNS|
1453                               (arg_share_system ? 0 : CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS)|
1454                               (arg_private_network ? CLONE_NEWNET : 0), NULL);
1455                 if (pid < 0) {
1456                         if (errno == EINVAL)
1457                                 log_error("clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
1458                         else
1459                                 log_error("clone() failed: %m");
1460
1461                         goto finish;
1462                 }
1463
1464                 if (pid == 0) {
1465                         /* child */
1466                         const char *home = NULL;
1467                         uid_t uid = (uid_t) -1;
1468                         gid_t gid = (gid_t) -1;
1469                         unsigned n_env = 2;
1470                         const char *envp[] = {
1471                                 "PATH=" DEFAULT_PATH_SPLIT_USR,
1472                                 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
1473                                 NULL, /* TERM */
1474                                 NULL, /* HOME */
1475                                 NULL, /* USER */
1476                                 NULL, /* LOGNAME */
1477                                 NULL, /* container_uuid */
1478                                 NULL, /* LISTEN_FDS */
1479                                 NULL, /* LISTEN_PID */
1480                                 NULL
1481                         };
1482                         char **env_use;
1483                         eventfd_t x;
1484
1485                         envp[n_env] = strv_find_prefix(environ, "TERM=");
1486                         if (envp[n_env])
1487                                 n_env ++;
1488
1489                         close_nointr_nofail(master);
1490                         master = -1;
1491
1492                         close_nointr(STDIN_FILENO);
1493                         close_nointr(STDOUT_FILENO);
1494                         close_nointr(STDERR_FILENO);
1495
1496                         close_nointr_nofail(kmsg_socket_pair[0]);
1497                         kmsg_socket_pair[0] = -1;
1498
1499                         reset_all_signal_handlers();
1500
1501                         assert_se(sigemptyset(&mask) == 0);
1502                         assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
1503
1504                         k = open_terminal(console, O_RDWR);
1505                         if (k != STDIN_FILENO) {
1506                                 if (k >= 0) {
1507                                         close_nointr_nofail(k);
1508                                         k = -EINVAL;
1509                                 }
1510
1511                                 log_error("Failed to open console: %s", strerror(-k));
1512                                 goto child_fail;
1513                         }
1514
1515                         if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
1516                             dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO) {
1517                                 log_error("Failed to duplicate console: %m");
1518                                 goto child_fail;
1519                         }
1520
1521                         if (setsid() < 0) {
1522                                 log_error("setsid() failed: %m");
1523                                 goto child_fail;
1524                         }
1525
1526                         if (reset_audit_loginuid() < 0)
1527                                 goto child_fail;
1528
1529                         if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
1530                                 log_error("PR_SET_PDEATHSIG failed: %m");
1531                                 goto child_fail;
1532                         }
1533
1534                         /* Mark everything as slave, so that we still
1535                          * receive mounts from the real root, but don't
1536                          * propagate mounts to the real root. */
1537                         if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
1538                                 log_error("MS_SLAVE|MS_REC failed: %m");
1539                                 goto child_fail;
1540                         }
1541
1542                         /* Turn directory into bind mount */
1543                         if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REC, NULL) < 0) {
1544                                 log_error("Failed to make bind mount.");
1545                                 goto child_fail;
1546                         }
1547
1548                         if (arg_read_only)
1549                                 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY|MS_REC, NULL) < 0) {
1550                                         log_error("Failed to make read-only.");
1551                                         goto child_fail;
1552                                 }
1553
1554                         if (mount_all(arg_directory) < 0)
1555                                 goto child_fail;
1556
1557                         if (copy_devnodes(arg_directory) < 0)
1558                                 goto child_fail;
1559
1560                         if (setup_ptmx(arg_directory) < 0)
1561                                 goto child_fail;
1562
1563                         dev_setup(arg_directory);
1564
1565                         if (setup_dev_console(arg_directory, console) < 0)
1566                                 goto child_fail;
1567
1568                         if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
1569                                 goto child_fail;
1570
1571                         close_nointr_nofail(kmsg_socket_pair[1]);
1572                         kmsg_socket_pair[1] = -1;
1573
1574                         if (setup_boot_id(arg_directory) < 0)
1575                                 goto child_fail;
1576
1577                         if (setup_timezone(arg_directory) < 0)
1578                                 goto child_fail;
1579
1580                         if (setup_resolv_conf(arg_directory) < 0)
1581                                 goto child_fail;
1582
1583                         if (setup_journal(arg_directory) < 0)
1584                                 goto child_fail;
1585
1586                         if (mount_binds(arg_directory, arg_bind, 0) < 0)
1587                                 goto child_fail;
1588
1589                         if (mount_binds(arg_directory, arg_bind_ro, MS_RDONLY) < 0)
1590                                 goto child_fail;
1591
1592                         if (setup_kdbus(arg_directory, kdbus_domain) < 0)
1593                                 goto child_fail;
1594
1595                         if (chdir(arg_directory) < 0) {
1596                                 log_error("chdir(%s) failed: %m", arg_directory);
1597                                 goto child_fail;
1598                         }
1599
1600                         if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) {
1601                                 log_error("mount(MS_MOVE) failed: %m");
1602                                 goto child_fail;
1603                         }
1604
1605                         if (chroot(".") < 0) {
1606                                 log_error("chroot() failed: %m");
1607                                 goto child_fail;
1608                         }
1609
1610                         if (chdir("/") < 0) {
1611                                 log_error("chdir() failed: %m");
1612                                 goto child_fail;
1613                         }
1614
1615                         umask(0022);
1616
1617                         if (arg_private_network)
1618                                 loopback_setup();
1619
1620                         if (drop_capabilities() < 0) {
1621                                 log_error("drop_capabilities() failed: %m");
1622                                 goto child_fail;
1623                         }
1624
1625                         if (arg_user) {
1626
1627                                 /* Note that this resolves user names
1628                                  * inside the container, and hence
1629                                  * accesses the NSS modules from the
1630                                  * container and not the host. This is
1631                                  * a bit weird... */
1632
1633                                 if (get_user_creds((const char**)&arg_user, &uid, &gid, &home, NULL) < 0) {
1634                                         log_error("get_user_creds() failed: %m");
1635                                         goto child_fail;
1636                                 }
1637
1638                                 if (mkdir_parents_label(home, 0775) < 0) {
1639                                         log_error("mkdir_parents_label() failed: %m");
1640                                         goto child_fail;
1641                                 }
1642
1643                                 if (mkdir_safe_label(home, 0775, uid, gid) < 0) {
1644                                         log_error("mkdir_safe_label() failed: %m");
1645                                         goto child_fail;
1646                                 }
1647
1648                                 if (initgroups((const char*)arg_user, gid) < 0) {
1649                                         log_error("initgroups() failed: %m");
1650                                         goto child_fail;
1651                                 }
1652
1653                                 if (setresgid(gid, gid, gid) < 0) {
1654                                         log_error("setregid() failed: %m");
1655                                         goto child_fail;
1656                                 }
1657
1658                                 if (setresuid(uid, uid, uid) < 0) {
1659                                         log_error("setreuid() failed: %m");
1660                                         goto child_fail;
1661                                 }
1662                         } else {
1663                                 /* Reset everything fully to 0, just in case */
1664
1665                                 if (setgroups(0, NULL) < 0) {
1666                                         log_error("setgroups() failed: %m");
1667                                         goto child_fail;
1668                                 }
1669
1670                                 if (setresgid(0, 0, 0) < 0) {
1671                                         log_error("setregid() failed: %m");
1672                                         goto child_fail;
1673                                 }
1674
1675                                 if (setresuid(0, 0, 0) < 0) {
1676                                         log_error("setreuid() failed: %m");
1677                                         goto child_fail;
1678                                 }
1679                         }
1680
1681                         if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
1682                             (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
1683                             (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
1684                                 log_oom();
1685                                 goto child_fail;
1686                         }
1687
1688                         if (!sd_id128_equal(arg_uuid, SD_ID128_NULL)) {
1689                                 if (asprintf((char**)(envp + n_env++), "container_uuid=" SD_ID128_FORMAT_STR, SD_ID128_FORMAT_VAL(arg_uuid)) < 0) {
1690                                         log_oom();
1691                                         goto child_fail;
1692                                 }
1693                         }
1694
1695                         if (fdset_size(fds) > 0) {
1696                                 k = fdset_cloexec(fds, false);
1697                                 if (k < 0) {
1698                                         log_error("Failed to unset O_CLOEXEC for file descriptors.");
1699                                         goto child_fail;
1700                                 }
1701
1702                                 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", n_fd_passed) < 0) ||
1703                                     (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0)) {
1704                                         log_oom();
1705                                         goto child_fail;
1706                                 }
1707                         }
1708
1709                         setup_hostname();
1710
1711                         eventfd_read(sync_fd, &x);
1712                         close_nointr_nofail(sync_fd);
1713                         sync_fd = -1;
1714
1715                         if (!strv_isempty(arg_setenv)) {
1716                                 char **n;
1717
1718                                 n = strv_env_merge(2, envp, arg_setenv);
1719                                 if (!n) {
1720                                         log_oom();
1721                                         goto child_fail;
1722                                 }
1723
1724                                 env_use = n;
1725                         } else
1726                                 env_use = (char**) envp;
1727
1728 #ifdef HAVE_SELINUX
1729                         if (arg_selinux_context)
1730                                 if (setexeccon(arg_selinux_context) < 0)
1731                                         log_error("setexeccon(\"%s\") failed: %m", arg_selinux_context);
1732 #endif
1733                         if (arg_boot) {
1734                                 char **a;
1735                                 size_t l;
1736
1737                                 /* Automatically search for the init system */
1738
1739                                 l = 1 + argc - optind;
1740                                 a = newa(char*, l + 1);
1741                                 memcpy(a + 1, argv + optind, l * sizeof(char*));
1742
1743                                 a[0] = (char*) "/usr/lib/systemd/systemd";
1744                                 execve(a[0], a, env_use);
1745
1746                                 a[0] = (char*) "/lib/systemd/systemd";
1747                                 execve(a[0], a, env_use);
1748
1749                                 a[0] = (char*) "/sbin/init";
1750                                 execve(a[0], a, env_use);
1751                         } else if (argc > optind)
1752                                 execvpe(argv[optind], argv + optind, env_use);
1753                         else {
1754                                 chdir(home ? home : "/root");
1755                                 execle("/bin/bash", "-bash", NULL, env_use);
1756                         }
1757
1758                         log_error("execv() failed: %m");
1759
1760                 child_fail:
1761                         _exit(EXIT_FAILURE);
1762                 }
1763
1764                 fdset_free(fds);
1765                 fds = NULL;
1766
1767                 r = register_machine(pid);
1768                 if (r < 0)
1769                         goto finish;
1770
1771                 r = move_network_interfaces(pid);
1772                 if (r < 0)
1773                         goto finish;
1774
1775                 eventfd_write(sync_fd, 1);
1776                 close_nointr_nofail(sync_fd);
1777                 sync_fd = -1;
1778
1779                 k = process_pty(master, &mask, arg_boot ? pid : 0, SIGRTMIN+3);
1780                 if (k < 0) {
1781                         r = EXIT_FAILURE;
1782                         break;
1783                 }
1784
1785                 if (!arg_quiet)
1786                         putc('\n', stdout);
1787
1788                 /* Kill if it is not dead yet anyway */
1789                 terminate_machine(pid);
1790
1791                 /* Redundant, but better safe than sorry */
1792                 kill(pid, SIGKILL);
1793
1794                 k = wait_for_terminate(pid, &status);
1795                 pid = 0;
1796
1797                 if (k < 0) {
1798                         r = EXIT_FAILURE;
1799                         break;
1800                 }
1801
1802                 if (status.si_code == CLD_EXITED) {
1803                         r = status.si_status;
1804                         if (status.si_status != 0) {
1805                                 log_error("Container %s failed with error code %i.", arg_machine, status.si_status);
1806                                 break;
1807                         }
1808
1809                         if (!arg_quiet)
1810                                 log_debug("Container %s exited successfully.", arg_machine);
1811                         break;
1812                 } else if (status.si_code == CLD_KILLED &&
1813                            status.si_status == SIGINT) {
1814
1815                         if (!arg_quiet)
1816                                 log_info("Container %s has been shut down.", arg_machine);
1817                         r = 0;
1818                         break;
1819                 } else if (status.si_code == CLD_KILLED &&
1820                            status.si_status == SIGHUP) {
1821
1822                         if (!arg_quiet)
1823                                 log_info("Container %s is being rebooted.", arg_machine);
1824                         continue;
1825                 } else if (status.si_code == CLD_KILLED ||
1826                            status.si_code == CLD_DUMPED) {
1827
1828                         log_error("Container %s terminated by signal %s.", arg_machine, signal_to_string(status.si_status));
1829                         r = EXIT_FAILURE;
1830                         break;
1831                 } else {
1832                         log_error("Container %s failed due to unknown reason.", arg_machine);
1833                         r = EXIT_FAILURE;
1834                         break;
1835                 }
1836         }
1837
1838 finish:
1839         if (pid > 0)
1840                 kill(pid, SIGKILL);
1841
1842         free(arg_directory);
1843         free(arg_machine);
1844         free(arg_setenv);
1845         free(arg_network_interfaces);
1846
1847         return r;
1848 }