chiark / gitweb /
01b8c3203d2ccb3499f407e5328af746a8702b39
[elogind.git] / src / nspawn / nspawn.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4   This file is part of systemd.
5
6   Copyright 2010 Lennart Poettering
7
8   systemd is free software; you can redistribute it and/or modify it
9   under the terms of the GNU Lesser General Public License as published by
10   the Free Software Foundation; either version 2.1 of the License, or
11   (at your option) any later version.
12
13   systemd is distributed in the hope that it will be useful, but
14   WITHOUT ANY WARRANTY; without even the implied warranty of
15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16   Lesser General Public License for more details.
17
18   You should have received a copy of the GNU Lesser General Public License
19   along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <signal.h>
23 #include <sched.h>
24 #include <unistd.h>
25 #include <sys/types.h>
26 #include <sys/syscall.h>
27 #include <sys/mount.h>
28 #include <sys/wait.h>
29 #include <stdlib.h>
30 #include <string.h>
31 #include <stdio.h>
32 #include <errno.h>
33 #include <sys/prctl.h>
34 #include <sys/capability.h>
35 #include <getopt.h>
36 #include <termios.h>
37 #include <sys/signalfd.h>
38 #include <grp.h>
39 #include <linux/fs.h>
40 #include <sys/un.h>
41 #include <sys/socket.h>
42 #include <linux/netlink.h>
43 #include <net/if.h>
44 #include <linux/veth.h>
45 #include <sys/personality.h>
46 #include <linux/loop.h>
47
48 #ifdef HAVE_SELINUX
49 #include <selinux/selinux.h>
50 #endif
51
52 #ifdef HAVE_SECCOMP
53 #include <seccomp.h>
54 #endif
55
56 #ifdef HAVE_BLKID
57 #include <blkid/blkid.h>
58 #endif
59
60 #include "sd-daemon.h"
61 #include "sd-bus.h"
62 #include "sd-id128.h"
63 #include "sd-rtnl.h"
64 #include "log.h"
65 #include "util.h"
66 #include "mkdir.h"
67 #include "macro.h"
68 #include "audit.h"
69 #include "missing.h"
70 #include "cgroup-util.h"
71 #include "strv.h"
72 #include "path-util.h"
73 #include "loopback-setup.h"
74 #include "dev-setup.h"
75 #include "fdset.h"
76 #include "build.h"
77 #include "fileio.h"
78 #include "bus-util.h"
79 #include "bus-error.h"
80 #include "ptyfwd.h"
81 #include "bus-kernel.h"
82 #include "env-util.h"
83 #include "def.h"
84 #include "rtnl-util.h"
85 #include "udev-util.h"
86 #include "blkid-util.h"
87 #include "gpt.h"
88 #include "siphash24.h"
89 #include "copy.h"
90 #include "base-filesystem.h"
91 #include "barrier.h"
92 #include "event-util.h"
93 #include "cap-list.h"
94 #include "btrfs-util.h"
95
96 #ifdef HAVE_SECCOMP
97 #include "seccomp-util.h"
98 #endif
99
/* Final state of a container run.
 * NOTE(review): the code that produces and consumes these values lies outside
 * this excerpt; presumably TERMINATED means the container exited for good and
 * REBOOTED means it asked to be restarted -- confirm against the users. */
100 typedef enum ContainerStatus {
101         CONTAINER_TERMINATED,
102         CONTAINER_REBOOTED
103 } ContainerStatus;
104
/* Journal linking mode, selected in parse_argv() via --link-journal= or -j.
 * The option values "no", "auto", "host" and "guest" map to LINK_NO, LINK_AUTO,
 * LINK_HOST and LINK_GUEST; the "try-host"/"try-guest" variants select the same
 * enum value and additionally set arg_link_journal_try. */
105 typedef enum LinkJournal {
106         LINK_NO,
107         LINK_AUTO,
108         LINK_HOST,
109         LINK_GUEST
110 } LinkJournal;
111
/* Mode chosen by --volatile[=MODE] in parse_argv(): a bare --volatile or a
 * true boolean yields VOLATILE_YES, a false boolean yields VOLATILE_NO, and
 * the literal "state" yields VOLATILE_STATE. */
112 typedef enum Volatile {
113         VOLATILE_NO,
114         VOLATILE_YES,
115         VOLATILE_STATE,
116 } Volatile;
117
/*
 * Command-line settings. Every variable below is assigned in parse_argv();
 * the initializers are the defaults in effect when an option is not given.
 *
 * Ownership as seen in parse_argv(): the "const char *" settings are pointed
 * straight at optarg (not copied), while the plain "char *" strings are
 * replaced with newly allocated copies and the "char **" lists are grown via
 * the strv_*() helpers.
 */
118 static char *arg_directory = NULL;
119 static char *arg_template = NULL;
120 static char *arg_user = NULL;
121 static sd_id128_t arg_uuid = {};
122 static char *arg_machine = NULL;
123 static const char *arg_selinux_context = NULL;
124 static const char *arg_selinux_apifs_context = NULL;
125 static const char *arg_slice = NULL;
126 static bool arg_private_network = false;
127 static bool arg_read_only = false;
128 static bool arg_boot = false;
129 static bool arg_ephemeral = false;
130 static LinkJournal arg_link_journal = LINK_AUTO;
131 static bool arg_link_journal_try = false;
/* Bitmask of capabilities (bit N = capability number N). This is the default
 * set; parse_argv() ORs in --capability= values (plus CAP_NET_ADMIN when
 * private networking is enabled) and then masks out --drop-capability= values. */
132 static uint64_t arg_retain =
133         (1ULL << CAP_CHOWN) |
134         (1ULL << CAP_DAC_OVERRIDE) |
135         (1ULL << CAP_DAC_READ_SEARCH) |
136         (1ULL << CAP_FOWNER) |
137         (1ULL << CAP_FSETID) |
138         (1ULL << CAP_IPC_OWNER) |
139         (1ULL << CAP_KILL) |
140         (1ULL << CAP_LEASE) |
141         (1ULL << CAP_LINUX_IMMUTABLE) |
142         (1ULL << CAP_NET_BIND_SERVICE) |
143         (1ULL << CAP_NET_BROADCAST) |
144         (1ULL << CAP_NET_RAW) |
145         (1ULL << CAP_SETGID) |
146         (1ULL << CAP_SETFCAP) |
147         (1ULL << CAP_SETPCAP) |
148         (1ULL << CAP_SETUID) |
149         (1ULL << CAP_SYS_ADMIN) |
150         (1ULL << CAP_SYS_CHROOT) |
151         (1ULL << CAP_SYS_NICE) |
152         (1ULL << CAP_SYS_PTRACE) |
153         (1ULL << CAP_SYS_TTY_CONFIG) |
154         (1ULL << CAP_SYS_RESOURCE) |
155         (1ULL << CAP_SYS_BOOT) |
156         (1ULL << CAP_AUDIT_WRITE) |
157         (1ULL << CAP_AUDIT_CONTROL) |
158         (1ULL << CAP_MKNOD);
/* arg_bind, arg_bind_ro and arg_tmpfs are flat lists of pairs: parse_argv()
 * appends two entries per option (source and destination path for the bind
 * lists, mount path and option string for arg_tmpfs). */
159 static char **arg_bind = NULL;
160 static char **arg_bind_ro = NULL;
161 static char **arg_tmpfs = NULL;
162 static char **arg_setenv = NULL;
163 static bool arg_quiet = false;
164 static bool arg_share_system = false;
/* Registration defaults to on; parse_argv() forces it off with --share-system. */
165 static bool arg_register = true;
166 static bool arg_keep_unit = false;
167 static char **arg_network_interfaces = NULL;
168 static char **arg_network_macvlan = NULL;
169 static bool arg_network_veth = false;
170 static const char *arg_network_bridge = NULL;
/* 0xffffffffLU is the value parse_argv() rejects as "unknown or unsupported"
 * when returned by personality_from_string(); as a default it presumably
 * doubles as "no personality requested" -- confirm against the users. */
171 static unsigned long arg_personality = 0xffffffffLU;
172 static char *arg_image = NULL;
173 static Volatile arg_volatile = VOLATILE_NO;
174
175 static void help(void) {
176         printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
177                "Spawn a minimal namespace container for debugging, testing and building.\n\n"
178                "  -h --help                 Show this help\n"
179                "     --version              Print version string\n"
180                "  -q --quiet                Do not show status information\n"
181                "  -D --directory=PATH       Root directory for the container\n"
182                "     --template=PATH        Initialize root directory from template directory,\n"
183                "                            if missing\n"
184                "  -x --ephemeral            Run container with snapshot of root directory, and\n"
185                "                            remove it after exit\n"
186                "  -i --image=PATH           File system device or disk image for the container\n"
187                "  -b --boot                 Boot up full system (i.e. invoke init)\n"
188                "  -u --user=USER            Run the command under specified user or uid\n"
189                "  -M --machine=NAME         Set the machine name for the container\n"
190                "     --uuid=UUID            Set a specific machine UUID for the container\n"
191                "  -S --slice=SLICE          Place the container in the specified slice\n"
192                "     --private-network      Disable network in container\n"
193                "     --network-interface=INTERFACE\n"
194                "                            Assign an existing network interface to the\n"
195                "                            container\n"
196                "     --network-macvlan=INTERFACE\n"
197                "                            Create a macvlan network interface based on an\n"
198                "                            existing network interface to the container\n"
199                "     --network-veth         Add a virtual ethernet connection between host\n"
200                "                            and container\n"
201                "     --network-bridge=INTERFACE\n"
202                "                            Add a virtual ethernet connection between host\n"
203                "                            and container and add it to an existing bridge on\n"
204                "                            the host\n"
205                "  -Z --selinux-context=SECLABEL\n"
206                "                            Set the SELinux security context to be used by\n"
207                "                            processes in the container\n"
208                "  -L --selinux-apifs-context=SECLABEL\n"
209                "                            Set the SELinux security context to be used by\n"
210                "                            API/tmpfs file systems in the container\n"
211                "     --capability=CAP       In addition to the default, retain specified\n"
212                "                            capability\n"
213                "     --drop-capability=CAP  Drop the specified capability from the default set\n"
214                "     --link-journal=MODE    Link up guest journal, one of no, auto, guest, host,\n"
215                "                            try-guest, try-host\n"
216                "  -j                        Equivalent to --link-journal=try-guest\n"
217                "     --read-only            Mount the root directory read-only\n"
218                "     --bind=PATH[:PATH]     Bind mount a file or directory from the host into\n"
219                "                            the container\n"
220                "     --bind-ro=PATH[:PATH]  Similar, but creates a read-only bind mount\n"
221                "     --tmpfs=PATH:[OPTIONS] Mount an empty tmpfs to the specified directory\n"
222                "     --setenv=NAME=VALUE    Pass an environment variable to PID 1\n"
223                "     --share-system         Share system namespaces with host\n"
224                "     --register=BOOLEAN     Register container as machine\n"
225                "     --keep-unit            Do not register a scope for the machine, reuse\n"
226                "                            the service unit nspawn is running in\n"
227                "     --volatile[=MODE]      Run the system in volatile mode\n",
228                program_invocation_short_name);
229 }
230
/*
 * Store a cleaned-up copy of `path` in `*b`, releasing whatever `*b` held.
 *
 * The path is resolved with canonicalize_file_name(). If that fails because
 * the path does not exist (ENOENT), the path is instead made absolute
 * relative to the current working directory. The result is then passed
 * through path_kill_slashes() before being stored.
 *
 * Returns 0 on success, -errno if canonicalization fails for any reason other
 * than ENOENT, or -ENOMEM if the fallback cannot produce a path. `*b` is left
 * untouched on failure.
 */
static int set_sanitized_path(char **b, const char *path) {
        char *resolved;

        assert(b);
        assert(path);

        resolved = canonicalize_file_name(path);

        /* Anything but "does not exist" is a hard error. */
        if (!resolved && errno != ENOENT)
                return -errno;

        if (!resolved) {
                /* Nonexistent path: fall back to a purely lexical absolute path. */
                resolved = path_make_absolute_cwd(path);
                if (!resolved)
                        return -ENOMEM;
        }

        free(*b);
        *b = path_kill_slashes(resolved);

        return 0;
}
251
252 static int parse_argv(int argc, char *argv[]) {
253
254         enum {
255                 ARG_VERSION = 0x100,
256                 ARG_PRIVATE_NETWORK,
257                 ARG_UUID,
258                 ARG_READ_ONLY,
259                 ARG_CAPABILITY,
260                 ARG_DROP_CAPABILITY,
261                 ARG_LINK_JOURNAL,
262                 ARG_BIND,
263                 ARG_BIND_RO,
264                 ARG_TMPFS,
265                 ARG_SETENV,
266                 ARG_SHARE_SYSTEM,
267                 ARG_REGISTER,
268                 ARG_KEEP_UNIT,
269                 ARG_NETWORK_INTERFACE,
270                 ARG_NETWORK_MACVLAN,
271                 ARG_NETWORK_VETH,
272                 ARG_NETWORK_BRIDGE,
273                 ARG_PERSONALITY,
274                 ARG_VOLATILE,
275                 ARG_TEMPLATE,
276         };
277
278         static const struct option options[] = {
279                 { "help",                  no_argument,       NULL, 'h'                   },
280                 { "version",               no_argument,       NULL, ARG_VERSION           },
281                 { "directory",             required_argument, NULL, 'D'                   },
282                 { "template",              required_argument, NULL, ARG_TEMPLATE          },
283                 { "ephemeral",             no_argument,       NULL, 'x'                   },
284                 { "user",                  required_argument, NULL, 'u'                   },
285                 { "private-network",       no_argument,       NULL, ARG_PRIVATE_NETWORK   },
286                 { "boot",                  no_argument,       NULL, 'b'                   },
287                 { "uuid",                  required_argument, NULL, ARG_UUID              },
288                 { "read-only",             no_argument,       NULL, ARG_READ_ONLY         },
289                 { "capability",            required_argument, NULL, ARG_CAPABILITY        },
290                 { "drop-capability",       required_argument, NULL, ARG_DROP_CAPABILITY   },
291                 { "link-journal",          required_argument, NULL, ARG_LINK_JOURNAL      },
292                 { "bind",                  required_argument, NULL, ARG_BIND              },
293                 { "bind-ro",               required_argument, NULL, ARG_BIND_RO           },
294                 { "tmpfs",                 required_argument, NULL, ARG_TMPFS             },
295                 { "machine",               required_argument, NULL, 'M'                   },
296                 { "slice",                 required_argument, NULL, 'S'                   },
297                 { "setenv",                required_argument, NULL, ARG_SETENV            },
298                 { "selinux-context",       required_argument, NULL, 'Z'                   },
299                 { "selinux-apifs-context", required_argument, NULL, 'L'                   },
300                 { "quiet",                 no_argument,       NULL, 'q'                   },
301                 { "share-system",          no_argument,       NULL, ARG_SHARE_SYSTEM      },
302                 { "register",              required_argument, NULL, ARG_REGISTER          },
303                 { "keep-unit",             no_argument,       NULL, ARG_KEEP_UNIT         },
304                 { "network-interface",     required_argument, NULL, ARG_NETWORK_INTERFACE },
305                 { "network-macvlan",       required_argument, NULL, ARG_NETWORK_MACVLAN   },
306                 { "network-veth",          no_argument,       NULL, ARG_NETWORK_VETH      },
307                 { "network-bridge",        required_argument, NULL, ARG_NETWORK_BRIDGE    },
308                 { "personality",           required_argument, NULL, ARG_PERSONALITY       },
309                 { "image",                 required_argument, NULL, 'i'                   },
310                 { "volatile",              optional_argument, NULL, ARG_VOLATILE          },
311                 {}
312         };
313
314         int c, r;
315         uint64_t plus = 0, minus = 0;
316
317         assert(argc >= 0);
318         assert(argv);
319
320         while ((c = getopt_long(argc, argv, "+hD:u:bL:M:jS:Z:qi:x", options, NULL)) >= 0)
321
322                 switch (c) {
323
324                 case 'h':
325                         help();
326                         return 0;
327
328                 case ARG_VERSION:
329                         puts(PACKAGE_STRING);
330                         puts(SYSTEMD_FEATURES);
331                         return 0;
332
333                 case 'D':
334                         r = set_sanitized_path(&arg_directory, optarg);
335                         if (r < 0)
336                                 return log_error_errno(r, "Invalid root directory: %m");
337
338                         break;
339
340                 case ARG_TEMPLATE:
341                         r = set_sanitized_path(&arg_template, optarg);
342                         if (r < 0)
343                                 return log_error_errno(r, "Invalid template directory: %m");
344
345                         break;
346
347                 case 'i':
348                         r = set_sanitized_path(&arg_image, optarg);
349                         if (r < 0)
350                                 return log_error_errno(r, "Invalid image path: %m");
351
352                         break;
353
354                 case 'x':
355                         arg_ephemeral = true;
356                         break;
357
358                 case 'u':
359                         free(arg_user);
360                         arg_user = strdup(optarg);
361                         if (!arg_user)
362                                 return log_oom();
363
364                         break;
365
366                 case ARG_NETWORK_BRIDGE:
367                         arg_network_bridge = optarg;
368
369                         /* fall through */
370
371                 case ARG_NETWORK_VETH:
372                         arg_network_veth = true;
373                         arg_private_network = true;
374                         break;
375
376                 case ARG_NETWORK_INTERFACE:
377                         if (strv_extend(&arg_network_interfaces, optarg) < 0)
378                                 return log_oom();
379
380                         arg_private_network = true;
381                         break;
382
383                 case ARG_NETWORK_MACVLAN:
384                         if (strv_extend(&arg_network_macvlan, optarg) < 0)
385                                 return log_oom();
386
387                         /* fall through */
388
389                 case ARG_PRIVATE_NETWORK:
390                         arg_private_network = true;
391                         break;
392
393                 case 'b':
394                         arg_boot = true;
395                         break;
396
397                 case ARG_UUID:
398                         r = sd_id128_from_string(optarg, &arg_uuid);
399                         if (r < 0) {
400                                 log_error("Invalid UUID: %s", optarg);
401                                 return r;
402                         }
403                         break;
404
405                 case 'S':
406                         arg_slice = optarg;
407                         break;
408
409                 case 'M':
410                         if (isempty(optarg)) {
411                                 free(arg_machine);
412                                 arg_machine = NULL;
413                         } else {
414                                 if (!machine_name_is_valid(optarg)) {
415                                         log_error("Invalid machine name: %s", optarg);
416                                         return -EINVAL;
417                                 }
418
419                                 r = free_and_strdup(&arg_machine, optarg);
420                                 if (r < 0)
421                                         return log_oom();
422
423                                 break;
424                         }
425
426                 case 'Z':
427                         arg_selinux_context = optarg;
428                         break;
429
430                 case 'L':
431                         arg_selinux_apifs_context = optarg;
432                         break;
433
434                 case ARG_READ_ONLY:
435                         arg_read_only = true;
436                         break;
437
438                 case ARG_CAPABILITY:
439                 case ARG_DROP_CAPABILITY: {
440                         const char *state, *word;
441                         size_t length;
442
443                         FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
444                                 _cleanup_free_ char *t;
445
446                                 t = strndup(word, length);
447                                 if (!t)
448                                         return log_oom();
449
450                                 if (streq(t, "all")) {
451                                         if (c == ARG_CAPABILITY)
452                                                 plus = (uint64_t) -1;
453                                         else
454                                                 minus = (uint64_t) -1;
455                                 } else {
456                                         int cap;
457
458                                         cap = capability_from_name(t);
459                                         if (cap < 0) {
460                                                 log_error("Failed to parse capability %s.", t);
461                                                 return -EINVAL;
462                                         }
463
464                                         if (c == ARG_CAPABILITY)
465                                                 plus |= 1ULL << (uint64_t) cap;
466                                         else
467                                                 minus |= 1ULL << (uint64_t) cap;
468                                 }
469                         }
470
471                         break;
472                 }
473
474                 case 'j':
475                         arg_link_journal = LINK_GUEST;
476                         arg_link_journal_try = true;
477                         break;
478
479                 case ARG_LINK_JOURNAL:
480                         if (streq(optarg, "auto")) {
481                                 arg_link_journal = LINK_AUTO;
482                                 arg_link_journal_try = false;
483                         } else if (streq(optarg, "no")) {
484                                 arg_link_journal = LINK_NO;
485                                 arg_link_journal_try = false;
486                         } else if (streq(optarg, "guest")) {
487                                 arg_link_journal = LINK_GUEST;
488                                 arg_link_journal_try = false;
489                         } else if (streq(optarg, "host")) {
490                                 arg_link_journal = LINK_HOST;
491                                 arg_link_journal_try = false;
492                         } else if (streq(optarg, "try-guest")) {
493                                 arg_link_journal = LINK_GUEST;
494                                 arg_link_journal_try = true;
495                         } else if (streq(optarg, "try-host")) {
496                                 arg_link_journal = LINK_HOST;
497                                 arg_link_journal_try = true;
498                         } else {
499                                 log_error("Failed to parse link journal mode %s", optarg);
500                                 return -EINVAL;
501                         }
502
503                         break;
504
505                 case ARG_BIND:
506                 case ARG_BIND_RO: {
507                         _cleanup_free_ char *a = NULL, *b = NULL;
508                         char *e;
509                         char ***x;
510
511                         x = c == ARG_BIND ? &arg_bind : &arg_bind_ro;
512
513                         e = strchr(optarg, ':');
514                         if (e) {
515                                 a = strndup(optarg, e - optarg);
516                                 b = strdup(e + 1);
517                         } else {
518                                 a = strdup(optarg);
519                                 b = strdup(optarg);
520                         }
521
522                         if (!a || !b)
523                                 return log_oom();
524
525                         if (!path_is_absolute(a) || !path_is_absolute(b)) {
526                                 log_error("Invalid bind mount specification: %s", optarg);
527                                 return -EINVAL;
528                         }
529
530                         r = strv_extend(x, a);
531                         if (r < 0)
532                                 return log_oom();
533
534                         r = strv_extend(x, b);
535                         if (r < 0)
536                                 return log_oom();
537
538                         break;
539                 }
540
541                 case ARG_TMPFS: {
542                         _cleanup_free_ char *a = NULL, *b = NULL;
543                         char *e;
544
545                         e = strchr(optarg, ':');
546                         if (e) {
547                                 a = strndup(optarg, e - optarg);
548                                 b = strdup(e + 1);
549                         } else {
550                                 a = strdup(optarg);
551                                 b = strdup("mode=0755");
552                         }
553
554                         if (!a || !b)
555                                 return log_oom();
556
557                         if (!path_is_absolute(a)) {
558                                 log_error("Invalid tmpfs specification: %s", optarg);
559                                 return -EINVAL;
560                         }
561
562                         r = strv_push(&arg_tmpfs, a);
563                         if (r < 0)
564                                 return log_oom();
565
566                         a = NULL;
567
568                         r = strv_push(&arg_tmpfs, b);
569                         if (r < 0)
570                                 return log_oom();
571
572                         b = NULL;
573
574                         break;
575                 }
576
577                 case ARG_SETENV: {
578                         char **n;
579
580                         if (!env_assignment_is_valid(optarg)) {
581                                 log_error("Environment variable assignment '%s' is not valid.", optarg);
582                                 return -EINVAL;
583                         }
584
585                         n = strv_env_set(arg_setenv, optarg);
586                         if (!n)
587                                 return log_oom();
588
589                         strv_free(arg_setenv);
590                         arg_setenv = n;
591                         break;
592                 }
593
594                 case 'q':
595                         arg_quiet = true;
596                         break;
597
598                 case ARG_SHARE_SYSTEM:
599                         arg_share_system = true;
600                         break;
601
602                 case ARG_REGISTER:
603                         r = parse_boolean(optarg);
604                         if (r < 0) {
605                                 log_error("Failed to parse --register= argument: %s", optarg);
606                                 return r;
607                         }
608
609                         arg_register = r;
610                         break;
611
612                 case ARG_KEEP_UNIT:
613                         arg_keep_unit = true;
614                         break;
615
616                 case ARG_PERSONALITY:
617
618                         arg_personality = personality_from_string(optarg);
619                         if (arg_personality == 0xffffffffLU) {
620                                 log_error("Unknown or unsupported personality '%s'.", optarg);
621                                 return -EINVAL;
622                         }
623
624                         break;
625
626                 case ARG_VOLATILE:
627
628                         if (!optarg)
629                                 arg_volatile = VOLATILE_YES;
630                         else {
631                                 r = parse_boolean(optarg);
632                                 if (r < 0) {
633                                         if (streq(optarg, "state"))
634                                                 arg_volatile = VOLATILE_STATE;
635                                         else {
636                                                 log_error("Failed to parse --volatile= argument: %s", optarg);
637                                                 return r;
638                                         }
639                                 } else
640                                         arg_volatile = r ? VOLATILE_YES : VOLATILE_NO;
641                         }
642
643                         break;
644
645                 case '?':
646                         return -EINVAL;
647
648                 default:
649                         assert_not_reached("Unhandled option");
650                 }
651
652         if (arg_share_system)
653                 arg_register = false;
654
655         if (arg_boot && arg_share_system) {
656                 log_error("--boot and --share-system may not be combined.");
657                 return -EINVAL;
658         }
659
660         if (arg_keep_unit && cg_pid_get_owner_uid(0, NULL) >= 0) {
661                 log_error("--keep-unit may not be used when invoked from a user session.");
662                 return -EINVAL;
663         }
664
665         if (arg_directory && arg_image) {
666                 log_error("--directory= and --image= may not be combined.");
667                 return -EINVAL;
668         }
669
670         if (arg_template && arg_image) {
671                 log_error("--template= and --image= may not be combined.");
672                 return -EINVAL;
673         }
674
675         if (arg_template && !(arg_directory || arg_machine)) {
676                 log_error("--template= needs --directory= or --machine=.");
677                 return -EINVAL;
678         }
679
680         if (arg_ephemeral && arg_template) {
681                 log_error("--ephemeral and --template= may not be combined.");
682                 return -EINVAL;
683         }
684
685         if (arg_ephemeral && arg_image) {
686                 log_error("--ephemeral and --image= may not be combined.");
687                 return -EINVAL;
688         }
689
690         if (arg_ephemeral && !IN_SET(arg_link_journal, LINK_NO, LINK_AUTO)) {
691                 log_error("--ephemeral and --link-journal= may not be combined.");
692                 return -EINVAL;
693         }
694
695         if (arg_volatile != VOLATILE_NO && arg_read_only) {
696                 log_error("Cannot combine --read-only with --volatile. Note that --volatile already implies a read-only base hierarchy.");
697                 return -EINVAL;
698         }
699
700         arg_retain = (arg_retain | plus | (arg_private_network ? 1ULL << CAP_NET_ADMIN : 0)) & ~minus;
701
702         return 1;
703 }
704
705 static int mount_all(const char *dest) {
706
707         typedef struct MountPoint {
708                 const char *what;
709                 const char *where;
710                 const char *type;
711                 const char *options;
712                 unsigned long flags;
713                 bool fatal;
714         } MountPoint;
715
716         static const MountPoint mount_table[] = {
717                 { "proc",      "/proc",     "proc",  NULL,        MS_NOSUID|MS_NOEXEC|MS_NODEV,           true  },
718                 { "/proc/sys", "/proc/sys", NULL,    NULL,        MS_BIND,                                true  },   /* Bind mount first */
719                 { NULL,        "/proc/sys", NULL,    NULL,        MS_BIND|MS_RDONLY|MS_REMOUNT,           true  },   /* Then, make it r/o */
720                 { "sysfs",     "/sys",      "sysfs", NULL,        MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true  },
721                 { "tmpfs",     "/dev",      "tmpfs", "mode=755",  MS_NOSUID|MS_STRICTATIME,               true  },
722                 { "devpts",    "/dev/pts",  "devpts","newinstance,ptmxmode=0666,mode=620,gid=" STRINGIFY(TTY_GID), MS_NOSUID|MS_NOEXEC, true },
723                 { "tmpfs",     "/dev/shm",  "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME,      true  },
724                 { "tmpfs",     "/run",      "tmpfs", "mode=755",  MS_NOSUID|MS_NODEV|MS_STRICTATIME,      true  },
725 #ifdef HAVE_SELINUX
726                 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND,                              false },  /* Bind mount first */
727                 { NULL,              "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT,         false },  /* Then, make it r/o */
728 #endif
729         };
730
731         unsigned k;
732         int r = 0;
733
734         for (k = 0; k < ELEMENTSOF(mount_table); k++) {
735                 _cleanup_free_ char *where = NULL;
736 #ifdef HAVE_SELINUX
737                 _cleanup_free_ char *options = NULL;
738 #endif
739                 const char *o;
740                 int t;
741
742                 where = strjoin(dest, "/", mount_table[k].where, NULL);
743                 if (!where)
744                         return log_oom();
745
746                 t = path_is_mount_point(where, true);
747                 if (t < 0) {
748                         log_error_errno(t, "Failed to detect whether %s is a mount point: %m", where);
749
750                         if (r == 0)
751                                 r = t;
752
753                         continue;
754                 }
755
756                 /* Skip this entry if it is not a remount. */
757                 if (mount_table[k].what && t > 0)
758                         continue;
759
760                 t = mkdir_p(where, 0755);
761                 if (t < 0) {
762                         if (mount_table[k].fatal) {
763                                log_error_errno(t, "Failed to create directory %s: %m", where);
764
765                                 if (r == 0)
766                                         r = t;
767                         } else
768                                log_warning_errno(t, "Failed to create directory %s: %m", where);
769
770                         continue;
771                 }
772
773 #ifdef HAVE_SELINUX
774                 if (arg_selinux_apifs_context &&
775                     (streq_ptr(mount_table[k].what, "tmpfs") || streq_ptr(mount_table[k].what, "devpts"))) {
776                         options = strjoin(mount_table[k].options, ",context=\"", arg_selinux_apifs_context, "\"", NULL);
777                         if (!options)
778                                 return log_oom();
779
780                         o = options;
781                 } else
782 #endif
783                         o = mount_table[k].options;
784
785
786                 if (mount(mount_table[k].what,
787                           where,
788                           mount_table[k].type,
789                           mount_table[k].flags,
790                           o) < 0) {
791
792                         if (mount_table[k].fatal) {
793                                 log_error_errno(errno, "mount(%s) failed: %m", where);
794
795                                 if (r == 0)
796                                         r = -errno;
797                         } else
798                                 log_warning_errno(errno, "mount(%s) failed: %m", where);
799                 }
800         }
801
802         return r;
803 }
804
805 static int mount_binds(const char *dest, char **l, bool ro) {
806         char **x, **y;
807
808         STRV_FOREACH_PAIR(x, y, l) {
809                 _cleanup_free_ char *where = NULL;
810                 struct stat source_st, dest_st;
811                 int r;
812
813                 if (stat(*x, &source_st) < 0)
814                         return log_error_errno(errno, "Failed to stat %s: %m", *x);
815
816                 where = strappend(dest, *y);
817                 if (!where)
818                         return log_oom();
819
820                 r = stat(where, &dest_st);
821                 if (r == 0) {
822                         if ((source_st.st_mode & S_IFMT) != (dest_st.st_mode & S_IFMT)) {
823                                 log_error("The file types of %s and %s do not match. Refusing bind mount", *x, where);
824                                 return -EINVAL;
825                         }
826                 } else if (errno == ENOENT) {
827                         r = mkdir_parents_label(where, 0755);
828                         if (r < 0)
829                                 return log_error_errno(r, "Failed to bind mount %s: %m", *x);
830                 } else {
831                         log_error_errno(errno, "Failed to bind mount %s: %m", *x);
832                         return -errno;
833                 }
834
835                 /* Create the mount point, but be conservative -- refuse to create block
836                  * and char devices. */
837                 if (S_ISDIR(source_st.st_mode)) {
838                         r = mkdir_label(where, 0755);
839                         if (r < 0 && errno != EEXIST)
840                                 return log_error_errno(r, "Failed to create mount point %s: %m", where);
841                 } else if (S_ISFIFO(source_st.st_mode)) {
842                         r = mkfifo(where, 0644);
843                         if (r < 0 && errno != EEXIST)
844                                 return log_error_errno(errno, "Failed to create mount point %s: %m", where);
845                 } else if (S_ISSOCK(source_st.st_mode)) {
846                         r = mknod(where, 0644 | S_IFSOCK, 0);
847                         if (r < 0 && errno != EEXIST)
848                                 return log_error_errno(errno, "Failed to create mount point %s: %m", where);
849                 } else if (S_ISREG(source_st.st_mode)) {
850                         r = touch(where);
851                         if (r < 0)
852                                 return log_error_errno(r, "Failed to create mount point %s: %m", where);
853                 } else {
854                         log_error("Refusing to create mountpoint for file: %s", *x);
855                         return -ENOTSUP;
856                 }
857
858                 if (mount(*x, where, "bind", MS_BIND, NULL) < 0)
859                         return log_error_errno(errno, "mount(%s) failed: %m", where);
860
861                 if (ro) {
862                         r = bind_remount_recursive(where, true);
863                         if (r < 0)
864                                 return log_error_errno(r, "Read-Only bind mount failed: %m");
865                 }
866         }
867
868         return 0;
869 }
870
871 static int mount_tmpfs(const char *dest) {
872         char **i, **o;
873
874         STRV_FOREACH_PAIR(i, o, arg_tmpfs) {
875                 _cleanup_free_ char *where = NULL;
876                 int r;
877
878                 where = strappend(dest, *i);
879                 if (!where)
880                         return log_oom();
881
882                 r = mkdir_label(where, 0755);
883                 if (r < 0 && r != -EEXIST)
884                         return log_error_errno(r, "Creating mount point for tmpfs %s failed: %m", where);
885
886                 if (mount("tmpfs", where, "tmpfs", MS_NODEV|MS_STRICTATIME, *o) < 0)
887                         return log_error_errno(errno, "tmpfs mount to %s failed: %m", where);
888         }
889
890         return 0;
891 }
892
893 static int setup_timezone(const char *dest) {
894         _cleanup_free_ char *where = NULL, *p = NULL, *q = NULL, *check = NULL, *what = NULL;
895         char *z, *y;
896         int r;
897
898         assert(dest);
899
900         /* Fix the timezone, if possible */
901         r = readlink_malloc("/etc/localtime", &p);
902         if (r < 0) {
903                 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
904                 return 0;
905         }
906
907         z = path_startswith(p, "../usr/share/zoneinfo/");
908         if (!z)
909                 z = path_startswith(p, "/usr/share/zoneinfo/");
910         if (!z) {
911                 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
912                 return 0;
913         }
914
915         where = strappend(dest, "/etc/localtime");
916         if (!where)
917                 return log_oom();
918
919         r = readlink_malloc(where, &q);
920         if (r >= 0) {
921                 y = path_startswith(q, "../usr/share/zoneinfo/");
922                 if (!y)
923                         y = path_startswith(q, "/usr/share/zoneinfo/");
924
925                 /* Already pointing to the right place? Then do nothing .. */
926                 if (y && streq(y, z))
927                         return 0;
928         }
929
930         check = strjoin(dest, "/usr/share/zoneinfo/", z, NULL);
931         if (!check)
932                 return log_oom();
933
934         if (access(check, F_OK) < 0) {
935                 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
936                 return 0;
937         }
938
939         what = strappend("../usr/share/zoneinfo/", z);
940         if (!what)
941                 return log_oom();
942
943         r = mkdir_parents(where, 0755);
944         if (r < 0) {
945                 log_error_errno(r, "Failed to create directory for timezone info %s in container: %m", where);
946
947                 return 0;
948         }
949
950         r = unlink(where);
951         if (r < 0 && errno != ENOENT) {
952                 log_error_errno(errno, "Failed to remove existing timezone info %s in container: %m", where);
953
954                 return 0;
955         }
956
957         if (symlink(what, where) < 0) {
958                 log_error_errno(errno, "Failed to correct timezone of container: %m");
959                 return 0;
960         }
961
962         return 0;
963 }
964
965 static int setup_resolv_conf(const char *dest) {
966         _cleanup_free_ char *where = NULL;
967         int r;
968
969         assert(dest);
970
971         if (arg_private_network)
972                 return 0;
973
974         /* Fix resolv.conf, if possible */
975         where = strappend(dest, "/etc/resolv.conf");
976         if (!where)
977                 return log_oom();
978
979         /* We don't really care for the results of this really. If it
980          * fails, it fails, but meh... */
981         r = mkdir_parents(where, 0755);
982         if (r < 0) {
983                 log_warning_errno(r, "Failed to create parent directory for resolv.conf %s: %m", where);
984
985                 return 0;
986         }
987
988         r = copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW, 0644);
989         if (r < 0) {
990                 log_warning_errno(r, "Failed to copy /etc/resolv.conf to %s: %m", where);
991
992                 return 0;
993         }
994
995         return 0;
996 }
997
998 static int setup_volatile_state(const char *directory) {
999         const char *p;
1000         int r;
1001
1002         assert(directory);
1003
1004         if (arg_volatile != VOLATILE_STATE)
1005                 return 0;
1006
1007         /* --volatile=state means we simply overmount /var
1008            with a tmpfs, and the rest read-only. */
1009
1010         r = bind_remount_recursive(directory, true);
1011         if (r < 0)
1012                 return log_error_errno(r, "Failed to remount %s read-only: %m", directory);
1013
1014         p = strappenda(directory, "/var");
1015         r = mkdir(p, 0755);
1016         if (r < 0 && errno != EEXIST)
1017                 return log_error_errno(errno, "Failed to create %s: %m", directory);
1018
1019         if (mount("tmpfs", p, "tmpfs", MS_STRICTATIME, "mode=755") < 0)
1020                 return log_error_errno(errno, "Failed to mount tmpfs to /var: %m");
1021
1022         return 0;
1023 }
1024
1025 static int setup_volatile(const char *directory) {
1026         bool tmpfs_mounted = false, bind_mounted = false;
1027         char template[] = "/tmp/nspawn-volatile-XXXXXX";
1028         const char *f, *t;
1029         int r;
1030
1031         assert(directory);
1032
1033         if (arg_volatile != VOLATILE_YES)
1034                 return 0;
1035
1036         /* --volatile=yes means we mount a tmpfs to the root dir, and
1037            the original /usr to use inside it, and that read-only. */
1038
1039         if (!mkdtemp(template))
1040                 return log_error_errno(errno, "Failed to create temporary directory: %m");
1041
1042         if (mount("tmpfs", template, "tmpfs", MS_STRICTATIME, "mode=755") < 0) {
1043                 log_error_errno(errno, "Failed to mount tmpfs for root directory: %m");
1044                 r = -errno;
1045                 goto fail;
1046         }
1047
1048         tmpfs_mounted = true;
1049
1050         f = strappenda(directory, "/usr");
1051         t = strappenda(template, "/usr");
1052
1053         r = mkdir(t, 0755);
1054         if (r < 0 && errno != EEXIST) {
1055                 log_error_errno(errno, "Failed to create %s: %m", t);
1056                 r = -errno;
1057                 goto fail;
1058         }
1059
1060         if (mount(f, t, "bind", MS_BIND|MS_REC, NULL) < 0) {
1061                 log_error_errno(errno, "Failed to create /usr bind mount: %m");
1062                 r = -errno;
1063                 goto fail;
1064         }
1065
1066         bind_mounted = true;
1067
1068         r = bind_remount_recursive(t, true);
1069         if (r < 0) {
1070                 log_error_errno(r, "Failed to remount %s read-only: %m", t);
1071                 goto fail;
1072         }
1073
1074         if (mount(template, directory, NULL, MS_MOVE, NULL) < 0) {
1075                 log_error_errno(errno, "Failed to move root mount: %m");
1076                 r = -errno;
1077                 goto fail;
1078         }
1079
1080         rmdir(template);
1081
1082         return 0;
1083
1084 fail:
1085         if (bind_mounted)
1086                 umount(t);
1087         if (tmpfs_mounted)
1088                 umount(template);
1089         rmdir(template);
1090         return r;
1091 }
1092
1093 static char* id128_format_as_uuid(sd_id128_t id, char s[37]) {
1094
1095         snprintf(s, 37,
1096                  "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
1097                  SD_ID128_FORMAT_VAL(id));
1098
1099         return s;
1100 }
1101
1102 static int setup_boot_id(const char *dest) {
1103         _cleanup_free_ char *from = NULL, *to = NULL;
1104         sd_id128_t rnd = {};
1105         char as_uuid[37];
1106         int r;
1107
1108         assert(dest);
1109
1110         if (arg_share_system)
1111                 return 0;
1112
1113         /* Generate a new randomized boot ID, so that each boot-up of
1114          * the container gets a new one */
1115
1116         from = strappend(dest, "/dev/proc-sys-kernel-random-boot-id");
1117         to = strappend(dest, "/proc/sys/kernel/random/boot_id");
1118         if (!from || !to)
1119                 return log_oom();
1120
1121         r = sd_id128_randomize(&rnd);
1122         if (r < 0)
1123                 return log_error_errno(r, "Failed to generate random boot id: %m");
1124
1125         id128_format_as_uuid(rnd, as_uuid);
1126
1127         r = write_string_file(from, as_uuid);
1128         if (r < 0)
1129                 return log_error_errno(r, "Failed to write boot id: %m");
1130
1131         if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
1132                 log_error_errno(errno, "Failed to bind mount boot id: %m");
1133                 r = -errno;
1134         } else if (mount(from, to, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL))
1135                 log_warning_errno(errno, "Failed to make boot id read-only: %m");
1136
1137         unlink(from);
1138         return r;
1139 }
1140
1141 static int copy_devnodes(const char *dest) {
1142
1143         static const char devnodes[] =
1144                 "null\0"
1145                 "zero\0"
1146                 "full\0"
1147                 "random\0"
1148                 "urandom\0"
1149                 "tty\0"
1150                 "net/tun\0";
1151
1152         const char *d;
1153         int r = 0;
1154         _cleanup_umask_ mode_t u;
1155
1156         assert(dest);
1157
1158         u = umask(0000);
1159
1160         NULSTR_FOREACH(d, devnodes) {
1161                 _cleanup_free_ char *from = NULL, *to = NULL;
1162                 struct stat st;
1163
1164                 from = strappend("/dev/", d);
1165                 to = strjoin(dest, "/dev/", d, NULL);
1166                 if (!from || !to)
1167                         return log_oom();
1168
1169                 if (stat(from, &st) < 0) {
1170
1171                         if (errno != ENOENT)
1172                                 return log_error_errno(errno, "Failed to stat %s: %m", from);
1173
1174                 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
1175
1176                         log_error("%s is not a char or block device, cannot copy", from);
1177                         return -EIO;
1178
1179                 } else {
1180                         r = mkdir_parents(to, 0775);
1181                         if (r < 0) {
1182                                 log_error_errno(r, "Failed to create parent directory of %s: %m", to);
1183                                 return -r;
1184                         }
1185
1186                         if (mknod(to, st.st_mode, st.st_rdev) < 0)
1187                                 return log_error_errno(errno, "mknod(%s) failed: %m", dest);
1188                 }
1189         }
1190
1191         return r;
1192 }
1193
1194 static int setup_ptmx(const char *dest) {
1195         _cleanup_free_ char *p = NULL;
1196
1197         p = strappend(dest, "/dev/ptmx");
1198         if (!p)
1199                 return log_oom();
1200
1201         if (symlink("pts/ptmx", p) < 0)
1202                 return log_error_errno(errno, "Failed to create /dev/ptmx symlink: %m");
1203
1204         return 0;
1205 }
1206
1207 static int setup_dev_console(const char *dest, const char *console) {
1208         _cleanup_umask_ mode_t u;
1209         const char *to;
1210         struct stat st;
1211         int r;
1212
1213         assert(dest);
1214         assert(console);
1215
1216         u = umask(0000);
1217
1218         if (stat("/dev/null", &st) < 0)
1219                 return log_error_errno(errno, "Failed to stat /dev/null: %m");
1220
1221         r = chmod_and_chown(console, 0600, 0, 0);
1222         if (r < 0)
1223                 return log_error_errno(r, "Failed to correct access mode for TTY: %m");
1224
1225         /* We need to bind mount the right tty to /dev/console since
1226          * ptys can only exist on pts file systems. To have something
1227          * to bind mount things on we create a device node first, and
1228          * use /dev/null for that since we the cgroups device policy
1229          * allows us to create that freely, while we cannot create
1230          * /dev/console. (Note that the major minor doesn't actually
1231          * matter here, since we mount it over anyway). */
1232
1233         to = strappenda(dest, "/dev/console");
1234         if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0)
1235                 return log_error_errno(errno, "mknod() for /dev/console failed: %m");
1236
1237         if (mount(console, to, "bind", MS_BIND, NULL) < 0)
1238                 return log_error_errno(errno, "Bind mount for /dev/console failed: %m");
1239
1240         return 0;
1241 }
1242
1243 static int setup_kmsg(const char *dest, int kmsg_socket) {
1244         _cleanup_free_ char *from = NULL, *to = NULL;
1245         int r, fd, k;
1246         _cleanup_umask_ mode_t u;
1247         union {
1248                 struct cmsghdr cmsghdr;
1249                 uint8_t buf[CMSG_SPACE(sizeof(int))];
1250         } control = {};
1251         struct msghdr mh = {
1252                 .msg_control = &control,
1253                 .msg_controllen = sizeof(control),
1254         };
1255         struct cmsghdr *cmsg;
1256
1257         assert(dest);
1258         assert(kmsg_socket >= 0);
1259
1260         u = umask(0000);
1261
1262         /* We create the kmsg FIFO as /dev/kmsg, but immediately
1263          * delete it after bind mounting it to /proc/kmsg. While FIFOs
1264          * on the reading side behave very similar to /proc/kmsg,
1265          * their writing side behaves differently from /dev/kmsg in
1266          * that writing blocks when nothing is reading. In order to
1267          * avoid any problems with containers deadlocking due to this
1268          * we simply make /dev/kmsg unavailable to the container. */
1269         if (asprintf(&from, "%s/dev/kmsg", dest) < 0 ||
1270             asprintf(&to, "%s/proc/kmsg", dest) < 0)
1271                 return log_oom();
1272
1273         if (mkfifo(from, 0600) < 0)
1274                 return log_error_errno(errno, "mkfifo() for /dev/kmsg failed: %m");
1275
1276         r = chmod_and_chown(from, 0600, 0, 0);
1277         if (r < 0)
1278                 return log_error_errno(r, "Failed to correct access mode for /dev/kmsg: %m");
1279
1280         if (mount(from, to, "bind", MS_BIND, NULL) < 0)
1281                 return log_error_errno(errno, "Bind mount for /proc/kmsg failed: %m");
1282
1283         fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
1284         if (fd < 0)
1285                 return log_error_errno(errno, "Failed to open fifo: %m");
1286
1287         cmsg = CMSG_FIRSTHDR(&mh);
1288         cmsg->cmsg_level = SOL_SOCKET;
1289         cmsg->cmsg_type = SCM_RIGHTS;
1290         cmsg->cmsg_len = CMSG_LEN(sizeof(int));
1291         memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
1292
1293         mh.msg_controllen = cmsg->cmsg_len;
1294
1295         /* Store away the fd in the socket, so that it stays open as
1296          * long as we run the child */
1297         k = sendmsg(kmsg_socket, &mh, MSG_DONTWAIT|MSG_NOSIGNAL);
1298         safe_close(fd);
1299
1300         if (k < 0)
1301                 return log_error_errno(errno, "Failed to send FIFO fd: %m");
1302
1303         /* And now make the FIFO unavailable as /dev/kmsg... */
1304         unlink(from);
1305         return 0;
1306 }
1307
1308 static int setup_hostname(void) {
1309
1310         if (arg_share_system)
1311                 return 0;
1312
1313         if (sethostname_idempotent(arg_machine) < 0)
1314                 return -errno;
1315
1316         return 0;
1317 }
1318
1319 static int setup_journal(const char *directory) {
1320         sd_id128_t machine_id, this_id;
1321         _cleanup_free_ char *p = NULL, *b = NULL, *q = NULL, *d = NULL;
1322         char *id;
1323         int r;
1324
1325         /* Don't link journals in ephemeral mode */
1326         if (arg_ephemeral)
1327                 return 0;
1328
1329         p = strappend(directory, "/etc/machine-id");
1330         if (!p)
1331                 return log_oom();
1332
1333         r = read_one_line_file(p, &b);
1334         if (r == -ENOENT && arg_link_journal == LINK_AUTO)
1335                 return 0;
1336         else if (r < 0)
1337                 return log_error_errno(r, "Failed to read machine ID from %s: %m", p);
1338
1339         id = strstrip(b);
1340         if (isempty(id) && arg_link_journal == LINK_AUTO)
1341                 return 0;
1342
1343         /* Verify validity */
1344         r = sd_id128_from_string(id, &machine_id);
1345         if (r < 0)
1346                 return log_error_errno(r, "Failed to parse machine ID from %s: %m", p);
1347
1348         r = sd_id128_get_machine(&this_id);
1349         if (r < 0)
1350                 return log_error_errno(r, "Failed to retrieve machine ID: %m");
1351
1352         if (sd_id128_equal(machine_id, this_id)) {
1353                 log_full(arg_link_journal == LINK_AUTO ? LOG_WARNING : LOG_ERR,
1354                          "Host and machine ids are equal (%s): refusing to link journals", id);
1355                 if (arg_link_journal == LINK_AUTO)
1356                         return 0;
1357                 return -EEXIST;
1358         }
1359
1360         if (arg_link_journal == LINK_NO)
1361                 return 0;
1362
1363         free(p);
1364         p = strappend("/var/log/journal/", id);
1365         q = strjoin(directory, "/var/log/journal/", id, NULL);
1366         if (!p || !q)
1367                 return log_oom();
1368
1369         if (path_is_mount_point(p, false) > 0) {
1370                 if (arg_link_journal != LINK_AUTO) {
1371                         log_error("%s: already a mount point, refusing to use for journal", p);
1372                         return -EEXIST;
1373                 }
1374
1375                 return 0;
1376         }
1377
1378         if (path_is_mount_point(q, false) > 0) {
1379                 if (arg_link_journal != LINK_AUTO) {
1380                         log_error("%s: already a mount point, refusing to use for journal", q);
1381                         return -EEXIST;
1382                 }
1383
1384                 return 0;
1385         }
1386
1387         r = readlink_and_make_absolute(p, &d);
1388         if (r >= 0) {
1389                 if ((arg_link_journal == LINK_GUEST ||
1390                      arg_link_journal == LINK_AUTO) &&
1391                     path_equal(d, q)) {
1392
1393                         r = mkdir_p(q, 0755);
1394                         if (r < 0)
1395                                 log_warning_errno(errno, "Failed to create directory %s: %m", q);
1396                         return 0;
1397                 }
1398
1399                 if (unlink(p) < 0)
1400                         return log_error_errno(errno, "Failed to remove symlink %s: %m", p);
1401         } else if (r == -EINVAL) {
1402
1403                 if (arg_link_journal == LINK_GUEST &&
1404                     rmdir(p) < 0) {
1405
1406                         if (errno == ENOTDIR) {
1407                                 log_error("%s already exists and is neither a symlink nor a directory", p);
1408                                 return r;
1409                         } else {
1410                                 log_error_errno(errno, "Failed to remove %s: %m", p);
1411                                 return -errno;
1412                         }
1413                 }
1414         } else if (r != -ENOENT) {
1415                 log_error_errno(errno, "readlink(%s) failed: %m", p);
1416                 return r;
1417         }
1418
1419         if (arg_link_journal == LINK_GUEST) {
1420
1421                 if (symlink(q, p) < 0) {
1422                         if (arg_link_journal_try) {
1423                                 log_debug_errno(errno, "Failed to symlink %s to %s, skipping journal setup: %m", q, p);
1424                                 return 0;
1425                         } else {
1426                                 log_error_errno(errno, "Failed to symlink %s to %s: %m", q, p);
1427                                 return -errno;
1428                         }
1429                 }
1430
1431                 r = mkdir_p(q, 0755);
1432                 if (r < 0)
1433                         log_warning_errno(errno, "Failed to create directory %s: %m", q);
1434                 return 0;
1435         }
1436
1437         if (arg_link_journal == LINK_HOST) {
1438                 /* don't create parents here -- if the host doesn't have
1439                  * permanent journal set up, don't force it here */
1440                 r = mkdir(p, 0755);
1441                 if (r < 0) {
1442                         if (arg_link_journal_try) {
1443                                 log_debug_errno(errno, "Failed to create %s, skipping journal setup: %m", p);
1444                                 return 0;
1445                         } else {
1446                                 log_error_errno(errno, "Failed to create %s: %m", p);
1447                                 return r;
1448                         }
1449                 }
1450
1451         } else if (access(p, F_OK) < 0)
1452                 return 0;
1453
1454         if (dir_is_empty(q) == 0)
1455                 log_warning("%s is not empty, proceeding anyway.", q);
1456
1457         r = mkdir_p(q, 0755);
1458         if (r < 0) {
1459                 log_error_errno(errno, "Failed to create %s: %m", q);
1460                 return r;
1461         }
1462
1463         if (mount(p, q, "bind", MS_BIND, NULL) < 0)
1464                 return log_error_errno(errno, "Failed to bind mount journal from host into guest: %m");
1465
1466         return 0;
1467 }
1468
1469 static int drop_capabilities(void) {
1470         return capability_bounding_set_drop(~arg_retain, false);
1471 }
1472
1473 static int register_machine(pid_t pid, int local_ifindex) {
1474         _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1475         _cleanup_bus_close_unref_ sd_bus *bus = NULL;
1476         int r;
1477
1478         if (!arg_register)
1479                 return 0;
1480
1481         r = sd_bus_default_system(&bus);
1482         if (r < 0)
1483                 return log_error_errno(r, "Failed to open system bus: %m");
1484
1485         if (arg_keep_unit) {
1486                 r = sd_bus_call_method(
1487                                 bus,
1488                                 "org.freedesktop.machine1",
1489                                 "/org/freedesktop/machine1",
1490                                 "org.freedesktop.machine1.Manager",
1491                                 "RegisterMachineWithNetwork",
1492                                 &error,
1493                                 NULL,
1494                                 "sayssusai",
1495                                 arg_machine,
1496                                 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1497                                 "nspawn",
1498                                 "container",
1499                                 (uint32_t) pid,
1500                                 strempty(arg_directory),
1501                                 local_ifindex > 0 ? 1 : 0, local_ifindex);
1502         } else {
1503                 _cleanup_bus_message_unref_ sd_bus_message *m = NULL;
1504
1505                 r = sd_bus_message_new_method_call(
1506                                 bus,
1507                                 &m,
1508                                 "org.freedesktop.machine1",
1509                                 "/org/freedesktop/machine1",
1510                                 "org.freedesktop.machine1.Manager",
1511                                 "CreateMachineWithNetwork");
1512                 if (r < 0)
1513                         return log_error_errno(r, "Failed to create message: %m");
1514
1515                 r = sd_bus_message_append(
1516                                 m,
1517                                 "sayssusai",
1518                                 arg_machine,
1519                                 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1520                                 "nspawn",
1521                                 "container",
1522                                 (uint32_t) pid,
1523                                 strempty(arg_directory),
1524                                 local_ifindex > 0 ? 1 : 0, local_ifindex);
1525                 if (r < 0)
1526                         return log_error_errno(r, "Failed to append message arguments: %m");
1527
1528                 r = sd_bus_message_open_container(m, 'a', "(sv)");
1529                 if (r < 0)
1530                         return log_error_errno(r, "Failed to open container: %m");
1531
1532                 if (!isempty(arg_slice)) {
1533                         r = sd_bus_message_append(m, "(sv)", "Slice", "s", arg_slice);
1534                         if (r < 0)
1535                                 return log_error_errno(r, "Failed to append slice: %m");
1536                 }
1537
1538                 r = sd_bus_message_append(m, "(sv)", "DevicePolicy", "s", "strict");
1539                 if (r < 0)
1540                         return log_error_errno(r, "Failed to add device policy: %m");
1541
1542                 r = sd_bus_message_append(m, "(sv)", "DeviceAllow", "a(ss)", 9,
1543                                           /* Allow the container to
1544                                            * access and create the API
1545                                            * device nodes, so that
1546                                            * PrivateDevices= in the
1547                                            * container can work
1548                                            * fine */
1549                                           "/dev/null", "rwm",
1550                                           "/dev/zero", "rwm",
1551                                           "/dev/full", "rwm",
1552                                           "/dev/random", "rwm",
1553                                           "/dev/urandom", "rwm",
1554                                           "/dev/tty", "rwm",
1555                                           "/dev/net/tun", "rwm",
1556                                           /* Allow the container
1557                                            * access to ptys. However,
1558                                            * do not permit the
1559                                            * container to ever create
1560                                            * these device nodes. */
1561                                           "/dev/pts/ptmx", "rw",
1562                                           "char-pts", "rw");
1563                 if (r < 0)
1564                         return log_error_errno(r, "Failed to add device whitelist: %m");
1565
1566                 r = sd_bus_message_close_container(m);
1567                 if (r < 0)
1568                         return log_error_errno(r, "Failed to close container: %m");
1569
1570                 r = sd_bus_call(bus, m, 0, &error, NULL);
1571         }
1572
1573         if (r < 0) {
1574                 log_error("Failed to register machine: %s", bus_error_message(&error, r));
1575                 return r;
1576         }
1577
1578         return 0;
1579 }
1580
1581 static int terminate_machine(pid_t pid) {
1582         _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1583         _cleanup_bus_message_unref_ sd_bus_message *reply = NULL;
1584         _cleanup_bus_close_unref_ sd_bus *bus = NULL;
1585         const char *path;
1586         int r;
1587
1588         if (!arg_register)
1589                 return 0;
1590
1591         r = sd_bus_default_system(&bus);
1592         if (r < 0)
1593                 return log_error_errno(r, "Failed to open system bus: %m");
1594
1595         r = sd_bus_call_method(
1596                         bus,
1597                         "org.freedesktop.machine1",
1598                         "/org/freedesktop/machine1",
1599                         "org.freedesktop.machine1.Manager",
1600                         "GetMachineByPID",
1601                         &error,
1602                         &reply,
1603                         "u",
1604                         (uint32_t) pid);
1605         if (r < 0) {
1606                 /* Note that the machine might already have been
1607                  * cleaned up automatically, hence don't consider it a
1608                  * failure if we cannot get the machine object. */
1609                 log_debug("Failed to get machine: %s", bus_error_message(&error, r));
1610                 return 0;
1611         }
1612
1613         r = sd_bus_message_read(reply, "o", &path);
1614         if (r < 0)
1615                 return bus_log_parse_error(r);
1616
1617         r = sd_bus_call_method(
1618                         bus,
1619                         "org.freedesktop.machine1",
1620                         path,
1621                         "org.freedesktop.machine1.Machine",
1622                         "Terminate",
1623                         &error,
1624                         NULL,
1625                         NULL);
1626         if (r < 0) {
1627                 log_debug("Failed to terminate machine: %s", bus_error_message(&error, r));
1628                 return 0;
1629         }
1630
1631         return 0;
1632 }
1633
1634 static int reset_audit_loginuid(void) {
1635         _cleanup_free_ char *p = NULL;
1636         int r;
1637
1638         if (arg_share_system)
1639                 return 0;
1640
1641         r = read_one_line_file("/proc/self/loginuid", &p);
1642         if (r == -ENOENT)
1643                 return 0;
1644         if (r < 0)
1645                 return log_error_errno(r, "Failed to read /proc/self/loginuid: %m");
1646
1647         /* Already reset? */
1648         if (streq(p, "4294967295"))
1649                 return 0;
1650
1651         r = write_string_file("/proc/self/loginuid", "4294967295");
1652         if (r < 0) {
1653                 log_error("Failed to reset audit login UID. This probably means that your kernel is too\n"
1654                           "old and you have audit enabled. Note that the auditing subsystem is known to\n"
1655                           "be incompatible with containers on old kernels. Please make sure to upgrade\n"
1656                           "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
1657                           "using systemd-nspawn. Sleeping for 5s... (%s)\n", strerror(-r));
1658
1659                 sleep(5);
1660         }
1661
1662         return 0;
1663 }
1664
/* Fixed 128-bit keys passed as "hash_key" to generate_mac(), which feeds
 * them to siphash24(). There is one key per kind of generated address:
 * the host side of the veth pair, the container side of the veth pair,
 * and macvlan interfaces. */
1665 #define HOST_HASH_KEY SD_ID128_MAKE(1a,37,6f,c7,46,ec,45,0b,ad,a3,d5,31,06,60,5d,b1)
1666 #define CONTAINER_HASH_KEY SD_ID128_MAKE(c3,c4,f9,19,b5,57,b2,1c,e6,cf,14,27,03,9c,ee,a2)
1667 #define MACVLAN_HASH_KEY SD_ID128_MAKE(00,13,6d,bc,66,83,44,81,bb,0c,f9,51,1f,24,a6,6f)
1668
1669 static int generate_mac(struct ether_addr *mac, sd_id128_t hash_key, uint64_t idx) {
1670         uint8_t result[8];
1671         size_t l, sz;
1672         uint8_t *v, *i;
1673         int r;
1674
1675         l = strlen(arg_machine);
1676         sz = sizeof(sd_id128_t) + l;
1677         if (idx > 0)
1678                 sz += sizeof(idx);
1679
1680         v = alloca(sz);
1681
1682         /* fetch some persistent data unique to the host */
1683         r = sd_id128_get_machine((sd_id128_t*) v);
1684         if (r < 0)
1685                 return r;
1686
1687         /* combine with some data unique (on this host) to this
1688          * container instance */
1689         i = mempcpy(v + sizeof(sd_id128_t), arg_machine, l);
1690         if (idx > 0) {
1691                 idx = htole64(idx);
1692                 memcpy(i, &idx, sizeof(idx));
1693         }
1694
1695         /* Let's hash the host machine ID plus the container name. We
1696          * use a fixed, but originally randomly created hash key here. */
1697         siphash24(result, v, sz, hash_key.bytes);
1698
1699         assert_cc(ETH_ALEN <= sizeof(result));
1700         memcpy(mac->ether_addr_octet, result, ETH_ALEN);
1701
1702         /* see eth_random_addr in the kernel */
1703         mac->ether_addr_octet[0] &= 0xfe;        /* clear multicast bit */
1704         mac->ether_addr_octet[0] |= 0x02;        /* set local assignment bit (IEEE802) */
1705
1706         return 0;
1707 }
1708
1709 static int setup_veth(pid_t pid, char iface_name[IFNAMSIZ], int *ifi) {
1710         _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1711         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1712         struct ether_addr mac_host, mac_container;
1713         int r, i;
1714
1715         if (!arg_private_network)
1716                 return 0;
1717
1718         if (!arg_network_veth)
1719                 return 0;
1720
1721         /* Use two different interface name prefixes depending whether
1722          * we are in bridge mode or not. */
1723         snprintf(iface_name, IFNAMSIZ - 1, "%s-%s",
1724                  arg_network_bridge ? "vb" : "ve", arg_machine);
1725
1726         r = generate_mac(&mac_container, CONTAINER_HASH_KEY, 0);
1727         if (r < 0)
1728                 return log_error_errno(r, "Failed to generate predictable MAC address for container side: %m");
1729
1730         r = generate_mac(&mac_host, HOST_HASH_KEY, 0);
1731         if (r < 0)
1732                 return log_error_errno(r, "Failed to generate predictable MAC address for host side: %m");
1733
1734         r = sd_rtnl_open(&rtnl, 0);
1735         if (r < 0)
1736                 return log_error_errno(r, "Failed to connect to netlink: %m");
1737
1738         r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
1739         if (r < 0)
1740                 return log_error_errno(r, "Failed to allocate netlink message: %m");
1741
1742         r = sd_rtnl_message_append_string(m, IFLA_IFNAME, iface_name);
1743         if (r < 0)
1744                 return log_error_errno(r, "Failed to add netlink interface name: %m");
1745
1746         r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac_host);
1747         if (r < 0)
1748                 return log_error_errno(r, "Failed to add netlink MAC address: %m");
1749
1750         r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
1751         if (r < 0)
1752                 return log_error_errno(r, "Failed to open netlink container: %m");
1753
1754         r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "veth");
1755         if (r < 0)
1756                 return log_error_errno(r, "Failed to open netlink container: %m");
1757
1758         r = sd_rtnl_message_open_container(m, VETH_INFO_PEER);
1759         if (r < 0)
1760                 return log_error_errno(r, "Failed to open netlink container: %m");
1761
1762         r = sd_rtnl_message_append_string(m, IFLA_IFNAME, "host0");
1763         if (r < 0)
1764                 return log_error_errno(r, "Failed to add netlink interface name: %m");
1765
1766         r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac_container);
1767         if (r < 0)
1768                 return log_error_errno(r, "Failed to add netlink MAC address: %m");
1769
1770         r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
1771         if (r < 0)
1772                 return log_error_errno(r, "Failed to add netlink namespace field: %m");
1773
1774         r = sd_rtnl_message_close_container(m);
1775         if (r < 0)
1776                 return log_error_errno(r, "Failed to close netlink container: %m");
1777
1778         r = sd_rtnl_message_close_container(m);
1779         if (r < 0)
1780                 return log_error_errno(r, "Failed to close netlink container: %m");
1781
1782         r = sd_rtnl_message_close_container(m);
1783         if (r < 0)
1784                 return log_error_errno(r, "Failed to close netlink container: %m");
1785
1786         r = sd_rtnl_call(rtnl, m, 0, NULL);
1787         if (r < 0)
1788                 return log_error_errno(r, "Failed to add new veth interfaces: %m");
1789
1790         i = (int) if_nametoindex(iface_name);
1791         if (i <= 0)
1792                 return log_error_errno(errno, "Failed to resolve interface %s: %m", iface_name);
1793
1794         *ifi = i;
1795
1796         return 0;
1797 }
1798
1799 static int setup_bridge(const char veth_name[], int *ifi) {
1800         _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1801         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1802         int r, bridge;
1803
1804         if (!arg_private_network)
1805                 return 0;
1806
1807         if (!arg_network_veth)
1808                 return 0;
1809
1810         if (!arg_network_bridge)
1811                 return 0;
1812
1813         bridge = (int) if_nametoindex(arg_network_bridge);
1814         if (bridge <= 0)
1815                 return log_error_errno(errno, "Failed to resolve interface %s: %m", arg_network_bridge);
1816
1817         *ifi = bridge;
1818
1819         r = sd_rtnl_open(&rtnl, 0);
1820         if (r < 0)
1821                 return log_error_errno(r, "Failed to connect to netlink: %m");
1822
1823         r = sd_rtnl_message_new_link(rtnl, &m, RTM_SETLINK, 0);
1824         if (r < 0)
1825                 return log_error_errno(r, "Failed to allocate netlink message: %m");
1826
1827         r = sd_rtnl_message_link_set_flags(m, IFF_UP, IFF_UP);
1828         if (r < 0)
1829                 return log_error_errno(r, "Failed to set IFF_UP flag: %m");
1830
1831         r = sd_rtnl_message_append_string(m, IFLA_IFNAME, veth_name);
1832         if (r < 0)
1833                 return log_error_errno(r, "Failed to add netlink interface name field: %m");
1834
1835         r = sd_rtnl_message_append_u32(m, IFLA_MASTER, bridge);
1836         if (r < 0)
1837                 return log_error_errno(r, "Failed to add netlink master field: %m");
1838
1839         r = sd_rtnl_call(rtnl, m, 0, NULL);
1840         if (r < 0)
1841                 return log_error_errno(r, "Failed to add veth interface to bridge: %m");
1842
1843         return 0;
1844 }
1845
1846 static int parse_interface(struct udev *udev, const char *name) {
1847         _cleanup_udev_device_unref_ struct udev_device *d = NULL;
1848         char ifi_str[2 + DECIMAL_STR_MAX(int)];
1849         int ifi;
1850
1851         ifi = (int) if_nametoindex(name);
1852         if (ifi <= 0)
1853                 return log_error_errno(errno, "Failed to resolve interface %s: %m", name);
1854
1855         sprintf(ifi_str, "n%i", ifi);
1856         d = udev_device_new_from_device_id(udev, ifi_str);
1857         if (!d)
1858                 return log_error_errno(errno, "Failed to get udev device for interface %s: %m", name);
1859
1860         if (udev_device_get_is_initialized(d) <= 0) {
1861                 log_error("Network interface %s is not initialized yet.", name);
1862                 return -EBUSY;
1863         }
1864
1865         return ifi;
1866 }
1867
1868 static int move_network_interfaces(pid_t pid) {
1869         _cleanup_udev_unref_ struct udev *udev = NULL;
1870         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1871         char **i;
1872         int r;
1873
1874         if (!arg_private_network)
1875                 return 0;
1876
1877         if (strv_isempty(arg_network_interfaces))
1878                 return 0;
1879
1880         r = sd_rtnl_open(&rtnl, 0);
1881         if (r < 0)
1882                 return log_error_errno(r, "Failed to connect to netlink: %m");
1883
1884         udev = udev_new();
1885         if (!udev) {
1886                 log_error("Failed to connect to udev.");
1887                 return -ENOMEM;
1888         }
1889
1890         STRV_FOREACH(i, arg_network_interfaces) {
1891                 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1892                 int ifi;
1893
1894                 ifi = parse_interface(udev, *i);
1895                 if (ifi < 0)
1896                         return ifi;
1897
1898                 r = sd_rtnl_message_new_link(rtnl, &m, RTM_SETLINK, ifi);
1899                 if (r < 0)
1900                         return log_error_errno(r, "Failed to allocate netlink message: %m");
1901
1902                 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
1903                 if (r < 0)
1904                         return log_error_errno(r, "Failed to append namespace PID to netlink message: %m");
1905
1906                 r = sd_rtnl_call(rtnl, m, 0, NULL);
1907                 if (r < 0)
1908                         return log_error_errno(r, "Failed to move interface %s to namespace: %m", *i);
1909         }
1910
1911         return 0;
1912 }
1913
1914 static int setup_macvlan(pid_t pid) {
1915         _cleanup_udev_unref_ struct udev *udev = NULL;
1916         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1917         unsigned idx = 0;
1918         char **i;
1919         int r;
1920
1921         if (!arg_private_network)
1922                 return 0;
1923
1924         if (strv_isempty(arg_network_macvlan))
1925                 return 0;
1926
1927         r = sd_rtnl_open(&rtnl, 0);
1928         if (r < 0)
1929                 return log_error_errno(r, "Failed to connect to netlink: %m");
1930
1931         udev = udev_new();
1932         if (!udev) {
1933                 log_error("Failed to connect to udev.");
1934                 return -ENOMEM;
1935         }
1936
1937         STRV_FOREACH(i, arg_network_macvlan) {
1938                 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1939                 _cleanup_free_ char *n = NULL;
1940                 struct ether_addr mac;
1941                 int ifi;
1942
1943                 ifi = parse_interface(udev, *i);
1944                 if (ifi < 0)
1945                         return ifi;
1946
1947                 r = generate_mac(&mac, MACVLAN_HASH_KEY, idx++);
1948                 if (r < 0)
1949                         return log_error_errno(r, "Failed to create MACVLAN MAC address: %m");
1950
1951                 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
1952                 if (r < 0)
1953                         return log_error_errno(r, "Failed to allocate netlink message: %m");
1954
1955                 r = sd_rtnl_message_append_u32(m, IFLA_LINK, ifi);
1956                 if (r < 0)
1957                         return log_error_errno(r, "Failed to add netlink interface index: %m");
1958
1959                 n = strappend("mv-", *i);
1960                 if (!n)
1961                         return log_oom();
1962
1963                 strshorten(n, IFNAMSIZ-1);
1964
1965                 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, n);
1966                 if (r < 0)
1967                         return log_error_errno(r, "Failed to add netlink interface name: %m");
1968
1969                 r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac);
1970                 if (r < 0)
1971                         return log_error_errno(r, "Failed to add netlink MAC address: %m");
1972
1973                 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
1974                 if (r < 0)
1975                         return log_error_errno(r, "Failed to add netlink namespace field: %m");
1976
1977                 r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
1978                 if (r < 0)
1979                         return log_error_errno(r, "Failed to open netlink container: %m");
1980
1981                 r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "macvlan");
1982                 if (r < 0)
1983                         return log_error_errno(r, "Failed to open netlink container: %m");
1984
1985                 r = sd_rtnl_message_append_u32(m, IFLA_MACVLAN_MODE, MACVLAN_MODE_BRIDGE);
1986                 if (r < 0)
1987                         return log_error_errno(r, "Failed to append macvlan mode: %m");
1988
1989                 r = sd_rtnl_message_close_container(m);
1990                 if (r < 0)
1991                         return log_error_errno(r, "Failed to close netlink container: %m");
1992
1993                 r = sd_rtnl_message_close_container(m);
1994                 if (r < 0)
1995                         return log_error_errno(r, "Failed to close netlink container: %m");
1996
1997                 r = sd_rtnl_call(rtnl, m, 0, NULL);
1998                 if (r < 0)
1999                         return log_error_errno(r, "Failed to add new macvlan interfaces: %m");
2000         }
2001
2002         return 0;
2003 }
2004
/* Installs a seccomp filter for the calling process. The filter allows
 * everything by default, makes a fixed list of system calls fail with
 * EPERM, and makes socket(AF_NETLINK, *, NETLINK_AUDIT) fail with
 * EAFNOSUPPORT.
 *
 * Returns 0 on success or a negative value on failure. When built without
 * HAVE_SECCOMP this is a no-op that returns 0. */
static int setup_seccomp(void) {

#ifdef HAVE_SECCOMP
        static const int blocked_syscalls[] = {
                SCMP_SYS(kexec_load),
                SCMP_SYS(open_by_handle_at),
                SCMP_SYS(init_module),
                SCMP_SYS(finit_module),
                SCMP_SYS(delete_module),
                SCMP_SYS(iopl),
                SCMP_SYS(ioperm),
                SCMP_SYS(swapon),
                SCMP_SYS(swapoff),
        };

        scmp_filter_ctx ctx;
        unsigned n;
        int r;

        ctx = seccomp_init(SCMP_ACT_ALLOW);
        if (!ctx)
                return log_oom();

        r = seccomp_add_secondary_archs(ctx);
        if (r < 0) {
                log_error_errno(r, "Failed to add secondary archs to seccomp filter: %m");
                goto release;
        }

        for (n = 0; n < ELEMENTSOF(blocked_syscalls); n++) {
                r = seccomp_rule_add(ctx, SCMP_ACT_ERRNO(EPERM), blocked_syscalls[n], 0);
                if (r >= 0)
                        continue;

                /* -EFAULT is taken to mean "unknown syscall" and is
                 * skipped; anything else is fatal. */
                if (r != -EFAULT) {
                        log_error_errno(r, "Failed to block syscall: %m");
                        goto release;
                }
        }

        /* Audit is broken in containers; much of the userspace audit
         * hookup fails when run inside one. We do not care and simply
         * turn off creation of audit sockets.
         *
         * This makes socket(AF_NETLINK, *, NETLINK_AUDIT) fail with
         * EAFNOSUPPORT, which audit userspace uses as the indication
         * that audit is disabled in the kernel. */
        r = seccomp_rule_add(
                        ctx,
                        SCMP_ACT_ERRNO(EAFNOSUPPORT),
                        SCMP_SYS(socket),
                        2,
                        SCMP_A0(SCMP_CMP_EQ, AF_NETLINK),
                        SCMP_A2(SCMP_CMP_EQ, NETLINK_AUDIT));
        if (r < 0) {
                log_error_errno(r, "Failed to add audit seccomp rule: %m");
                goto release;
        }

        /* Do not have the filter load set NO_NEW_PRIVS. */
        r = seccomp_attr_set(ctx, SCMP_FLTATR_CTL_NNP, 0);
        if (r < 0) {
                log_error_errno(r, "Failed to unset NO_NEW_PRIVS: %m");
                goto release;
        }

        r = seccomp_load(ctx);
        if (r < 0)
                log_error_errno(r, "Failed to install seccomp audit filter: %m");

release:
        /* The context is released on every path, success included. */
        seccomp_release(ctx);
        return r;
#else
        return 0;
#endif

}
2084
2085 static int setup_propagate(const char *root) {
2086         const char *p, *q;
2087
2088         (void) mkdir_p("/run/systemd/nspawn/", 0755);
2089         (void) mkdir_p("/run/systemd/nspawn/propagate", 0600);
2090         p = strappenda("/run/systemd/nspawn/propagate/", arg_machine);
2091         (void) mkdir_p(p, 0600);
2092
2093         q = strappenda(root, "/run/systemd/nspawn/incoming");
2094         mkdir_parents(q, 0755);
2095         mkdir_p(q, 0600);
2096
2097         if (mount(p, q, NULL, MS_BIND, NULL) < 0)
2098                 return log_error_errno(errno, "Failed to install propagation bind mount.");
2099
2100         if (mount(NULL, q, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY, NULL) < 0)
2101                 return log_error_errno(errno, "Failed to make propagation mount read-only");
2102
2103         return 0;
2104 }
2105
2106 static int setup_image(char **device_path, int *loop_nr) {
2107         struct loop_info64 info = {
2108                 .lo_flags = LO_FLAGS_AUTOCLEAR|LO_FLAGS_PARTSCAN
2109         };
2110         _cleanup_close_ int fd = -1, control = -1, loop = -1;
2111         _cleanup_free_ char* loopdev = NULL;
2112         struct stat st;
2113         int r, nr;
2114
2115         assert(device_path);
2116         assert(loop_nr);
2117         assert(arg_image);
2118
2119         fd = open(arg_image, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
2120         if (fd < 0)
2121                 return log_error_errno(errno, "Failed to open %s: %m", arg_image);
2122
2123         if (fstat(fd, &st) < 0)
2124                 return log_error_errno(errno, "Failed to stat %s: %m", arg_image);
2125
2126         if (S_ISBLK(st.st_mode)) {
2127                 char *p;
2128
2129                 p = strdup(arg_image);
2130                 if (!p)
2131                         return log_oom();
2132
2133                 *device_path = p;
2134
2135                 *loop_nr = -1;
2136
2137                 r = fd;
2138                 fd = -1;
2139
2140                 return r;
2141         }
2142
2143         if (!S_ISREG(st.st_mode)) {
2144                 log_error_errno(errno, "%s is not a regular file or block device: %m", arg_image);
2145                 return -EINVAL;
2146         }
2147
2148         control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
2149         if (control < 0)
2150                 return log_error_errno(errno, "Failed to open /dev/loop-control: %m");
2151
2152         nr = ioctl(control, LOOP_CTL_GET_FREE);
2153         if (nr < 0)
2154                 return log_error_errno(errno, "Failed to allocate loop device: %m");
2155
2156         if (asprintf(&loopdev, "/dev/loop%i", nr) < 0)
2157                 return log_oom();
2158
2159         loop = open(loopdev, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
2160         if (loop < 0)
2161                 return log_error_errno(errno, "Failed to open loop device %s: %m", loopdev);
2162
2163         if (ioctl(loop, LOOP_SET_FD, fd) < 0)
2164                 return log_error_errno(errno, "Failed to set loopback file descriptor on %s: %m", loopdev);
2165
2166         if (arg_read_only)
2167                 info.lo_flags |= LO_FLAGS_READ_ONLY;
2168
2169         if (ioctl(loop, LOOP_SET_STATUS64, &info) < 0)
2170                 return log_error_errno(errno, "Failed to set loopback settings on %s: %m", loopdev);
2171
2172         *device_path = loopdev;
2173         loopdev = NULL;
2174
2175         *loop_nr = nr;
2176
2177         r = loop;
2178         loop = -1;
2179
2180         return r;
2181 }
2182
2183 static int dissect_image(
2184                 int fd,
2185                 char **root_device, bool *root_device_rw,
2186                 char **home_device, bool *home_device_rw,
2187                 char **srv_device, bool *srv_device_rw,
2188                 bool *secondary) {
2189
2190 #ifdef HAVE_BLKID
2191         int home_nr = -1, srv_nr = -1;
2192 #ifdef GPT_ROOT_NATIVE
2193         int root_nr = -1;
2194 #endif
2195 #ifdef GPT_ROOT_SECONDARY
2196         int secondary_root_nr = -1;
2197 #endif
2198
2199         _cleanup_free_ char *home = NULL, *root = NULL, *secondary_root = NULL, *srv = NULL;
2200         _cleanup_udev_enumerate_unref_ struct udev_enumerate *e = NULL;
2201         _cleanup_udev_device_unref_ struct udev_device *d = NULL;
2202         _cleanup_blkid_free_probe_ blkid_probe b = NULL;
2203         _cleanup_udev_unref_ struct udev *udev = NULL;
2204         struct udev_list_entry *first, *item;
2205         bool home_rw = true, root_rw = true, secondary_root_rw = true, srv_rw = true;
2206         const char *pttype = NULL;
2207         blkid_partlist pl;
2208         struct stat st;
2209         int r;
2210
2211         assert(fd >= 0);
2212         assert(root_device);
2213         assert(home_device);
2214         assert(srv_device);
2215         assert(secondary);
2216         assert(arg_image);
2217
2218         b = blkid_new_probe();
2219         if (!b)
2220                 return log_oom();
2221
2222         errno = 0;
2223         r = blkid_probe_set_device(b, fd, 0, 0);
2224         if (r != 0) {
2225                 if (errno == 0)
2226                         return log_oom();
2227
2228                 log_error_errno(errno, "Failed to set device on blkid probe: %m");
2229                 return -errno;
2230         }
2231
2232         blkid_probe_enable_partitions(b, 1);
2233         blkid_probe_set_partitions_flags(b, BLKID_PARTS_ENTRY_DETAILS);
2234
2235         errno = 0;
2236         r = blkid_do_safeprobe(b);
2237         if (r == -2 || r == 1) {
2238                 log_error("Failed to identify any partition table on %s.\n"
2239                           "Note that the disk image needs to follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/ to be supported by systemd-nspawn.", arg_image);
2240                 return -EINVAL;
2241         } else if (r != 0) {
2242                 if (errno == 0)
2243                         errno = EIO;
2244                 log_error_errno(errno, "Failed to probe: %m");
2245                 return -errno;
2246         }
2247
2248         blkid_probe_lookup_value(b, "PTTYPE", &pttype, NULL);
2249         if (!streq_ptr(pttype, "gpt")) {
2250                 log_error("Image %s does not carry a GUID Partition Table.\n"
2251                           "Note that the disk image needs to follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/ to be supported by systemd-nspawn.", arg_image);
2252                 return -EINVAL;
2253         }
2254
2255         errno = 0;
2256         pl = blkid_probe_get_partitions(b);
2257         if (!pl) {
2258                 if (errno == 0)
2259                         return log_oom();
2260
2261                 log_error("Failed to list partitions of %s", arg_image);
2262                 return -errno;
2263         }
2264
2265         udev = udev_new();
2266         if (!udev)
2267                 return log_oom();
2268
2269         if (fstat(fd, &st) < 0)
2270                 return log_error_errno(errno, "Failed to stat block device: %m");
2271
2272         d = udev_device_new_from_devnum(udev, 'b', st.st_rdev);
2273         if (!d)
2274                 return log_oom();
2275
2276         e = udev_enumerate_new(udev);
2277         if (!e)
2278                 return log_oom();
2279
2280         r = udev_enumerate_add_match_parent(e, d);
2281         if (r < 0)
2282                 return log_oom();
2283
2284         r = udev_enumerate_scan_devices(e);
2285         if (r < 0)
2286                 return log_error_errno(r, "Failed to scan for partition devices of %s: %m", arg_image);
2287
2288         first = udev_enumerate_get_list_entry(e);
2289         udev_list_entry_foreach(item, first) {
2290                 _cleanup_udev_device_unref_ struct udev_device *q;
2291                 const char *stype, *node;
2292                 unsigned long long flags;
2293                 sd_id128_t type_id;
2294                 blkid_partition pp;
2295                 dev_t qn;
2296                 int nr;
2297
2298                 errno = 0;
2299                 q = udev_device_new_from_syspath(udev, udev_list_entry_get_name(item));
2300                 if (!q) {
2301                         if (!errno)
2302                                 errno = ENOMEM;
2303
2304                         log_error_errno(errno, "Failed to get partition device of %s: %m", arg_image);
2305                         return -errno;
2306                 }
2307
2308                 qn = udev_device_get_devnum(q);
2309                 if (major(qn) == 0)
2310                         continue;
2311
2312                 if (st.st_rdev == qn)
2313                         continue;
2314
2315                 node = udev_device_get_devnode(q);
2316                 if (!node)
2317                         continue;
2318
2319                 pp = blkid_partlist_devno_to_partition(pl, qn);
2320                 if (!pp)
2321                         continue;
2322
2323                 flags = blkid_partition_get_flags(pp);
2324                 if (flags & GPT_FLAG_NO_AUTO)
2325                         continue;
2326
2327                 nr = blkid_partition_get_partno(pp);
2328                 if (nr < 0)
2329                         continue;
2330
2331                 stype = blkid_partition_get_type_string(pp);
2332                 if (!stype)
2333                         continue;
2334
2335                 if (sd_id128_from_string(stype, &type_id) < 0)
2336                         continue;
2337
2338                 if (sd_id128_equal(type_id, GPT_HOME)) {
2339
2340                         if (home && nr >= home_nr)
2341                                 continue;
2342
2343                         home_nr = nr;
2344                         home_rw = !(flags & GPT_FLAG_READ_ONLY);
2345
2346                         free(home);
2347                         home = strdup(node);
2348                         if (!home)
2349                                 return log_oom();
2350                 } else if (sd_id128_equal(type_id, GPT_SRV)) {
2351
2352                         if (srv && nr >= srv_nr)
2353                                 continue;
2354
2355                         srv_nr = nr;
2356                         srv_rw = !(flags & GPT_FLAG_READ_ONLY);
2357
2358                         free(srv);
2359                         srv = strdup(node);
2360                         if (!srv)
2361                                 return log_oom();
2362                 }
2363 #ifdef GPT_ROOT_NATIVE
2364                 else if (sd_id128_equal(type_id, GPT_ROOT_NATIVE)) {
2365
2366                         if (root && nr >= root_nr)
2367                                 continue;
2368
2369                         root_nr = nr;
2370                         root_rw = !(flags & GPT_FLAG_READ_ONLY);
2371
2372                         free(root);
2373                         root = strdup(node);
2374                         if (!root)
2375                                 return log_oom();
2376                 }
2377 #endif
2378 #ifdef GPT_ROOT_SECONDARY
2379                 else if (sd_id128_equal(type_id, GPT_ROOT_SECONDARY)) {
2380
2381                         if (secondary_root && nr >= secondary_root_nr)
2382                                 continue;
2383
2384                         secondary_root_nr = nr;
2385                         secondary_root_rw = !(flags & GPT_FLAG_READ_ONLY);
2386
2387
2388                         free(secondary_root);
2389                         secondary_root = strdup(node);
2390                         if (!secondary_root)
2391                                 return log_oom();
2392                 }
2393 #endif
2394         }
2395
2396         if (!root && !secondary_root) {
2397                 log_error("Failed to identify root partition in disk image %s.\n"
2398                           "Note that the disk image needs to follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/ to be supported by systemd-nspawn.", arg_image);
2399                 return -EINVAL;
2400         }
2401
2402         if (root) {
2403                 *root_device = root;
2404                 root = NULL;
2405
2406                 *root_device_rw = root_rw;
2407                 *secondary = false;
2408         } else if (secondary_root) {
2409                 *root_device = secondary_root;
2410                 secondary_root = NULL;
2411
2412                 *root_device_rw = secondary_root_rw;
2413                 *secondary = true;
2414         }
2415
2416         if (home) {
2417                 *home_device = home;
2418                 home = NULL;
2419
2420                 *home_device_rw = home_rw;
2421         }
2422
2423         if (srv) {
2424                 *srv_device = srv;
2425                 srv = NULL;
2426
2427                 *srv_device_rw = srv_rw;
2428         }
2429
2430         return 0;
2431 #else
2432         log_error("--image= is not supported, compiled without blkid support.");
2433         return -ENOTSUP;
2434 #endif
2435 }
2436
/* Probes the block device "what" with libblkid to learn its file system
 * type and mounts it on "where", with "directory" appended to the mount
 * point when it is non-NULL. The mount is made read-only when "rw" is
 * false or when arg_read_only is set. LUKS volumes are refused.
 *
 * Returns 0 on success and a negative errno-style value on failure.
 * Without HAVE_BLKID the function always fails with -ENOTSUP. */
static int mount_device(const char *what, const char *where, const char *directory, bool rw) {
#ifdef HAVE_BLKID
        _cleanup_blkid_free_probe_ blkid_probe probe = NULL;
        const char *fstype, *target;
        unsigned long mount_flags = MS_NODEV;
        int r;

        assert(what);
        assert(where);

        /* The global read-only switch overrides the per-device setting */
        rw = rw && !arg_read_only;

        target = where;
        if (directory)
                target = strappenda(where, directory);

        /* libblkid does not reliably set errno, so clear it up front and
         * treat "still zero" as an allocation failure */
        errno = 0;
        probe = blkid_new_probe_from_filename(what);
        if (!probe) {
                if (errno == 0)
                        return log_oom();
                log_error_errno(errno, "Failed to allocate prober for %s: %m", what);
                return -errno;
        }

        /* We are only interested in the superblock type */
        blkid_probe_enable_superblocks(probe, 1);
        blkid_probe_set_superblocks_flags(probe, BLKID_SUBLKS_TYPE);

        errno = 0;
        r = blkid_do_safeprobe(probe);
        switch (r) {

        case 0:
                break;

        case -1:
        case 1:
                log_error("Cannot determine file system type of %s", what);
                return -EINVAL;

        default:
                if (errno == 0)
                        errno = EIO;
                log_error_errno(errno, "Failed to probe %s: %m", what);
                return -errno;
        }

        errno = 0;
        if (blkid_probe_lookup_value(probe, "TYPE", &fstype, NULL) < 0) {
                if (errno == 0)
                        errno = EINVAL;
                log_error("Failed to determine file system type of %s", what);
                return -errno;
        }

        if (streq(fstype, "crypto_LUKS")) {
                log_error("nspawn currently does not support LUKS disk images.");
                return -ENOTSUP;
        }

        if (!rw)
                mount_flags |= MS_RDONLY;

        if (mount(what, target, fstype, mount_flags, NULL) < 0)
                return log_error_errno(errno, "Failed to mount %s: %m", what);

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -ENOTSUP;
#endif
}
2500
/* Mounts the partitions found in a disk image below "where": the root
 * device on "where" itself, the home device on "<where>/home" and the
 * server data device on "<where>/srv". Any device that is NULL is
 * skipped. The *_rw flags are passed on to mount_device().
 *
 * Returns 0 on success, or the negative error of the first mount that
 * failed. */
static int mount_devices(
                const char *where,
                const char *root_device, bool root_device_rw,
                const char *home_device, bool home_device_rw,
                const char *srv_device, bool srv_device_rw) {
        int r;

        assert(where);

        /* Mount relative to the "where" argument rather than the global
         * arg_directory, so that the parameter is actually honoured. */

        if (root_device) {
                r = mount_device(root_device, where, NULL, root_device_rw);
                if (r < 0)
                        return log_error_errno(r, "Failed to mount root directory: %m");
        }

        if (home_device) {
                r = mount_device(home_device, where, "/home", home_device_rw);
                if (r < 0)
                        return log_error_errno(r, "Failed to mount home directory: %m");
        }

        if (srv_device) {
                r = mount_device(srv_device, where, "/srv", srv_device_rw);
                if (r < 0)
                        return log_error_errno(r, "Failed to mount server data directory: %m");
        }

        return 0;
}
2530
2531 static void loop_remove(int nr, int *image_fd) {
2532         _cleanup_close_ int control = -1;
2533         int r;
2534
2535         if (nr < 0)
2536                 return;
2537
2538         if (image_fd && *image_fd >= 0) {
2539                 r = ioctl(*image_fd, LOOP_CLR_FD);
2540                 if (r < 0)
2541                         log_warning_errno(errno, "Failed to close loop image: %m");
2542                 *image_fd = safe_close(*image_fd);
2543         }
2544
2545         control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
2546         if (control < 0) {
2547                 log_warning_errno(errno, "Failed to open /dev/loop-control: %m");
2548                 return;
2549         }
2550
2551         r = ioctl(control, LOOP_CTL_REMOVE, nr);
2552         if (r < 0)
2553                 log_warning_errno(errno, "Failed to remove loop %d: %m", nr);
2554 }
2555
2556 static int spawn_getent(const char *database, const char *key, pid_t *rpid) {
2557         int pipe_fds[2];
2558         pid_t pid;
2559
2560         assert(database);
2561         assert(key);
2562         assert(rpid);
2563
2564         if (pipe2(pipe_fds, O_CLOEXEC) < 0)
2565                 return log_error_errno(errno, "Failed to allocate pipe: %m");
2566
2567         pid = fork();
2568         if (pid < 0)
2569                 return log_error_errno(errno, "Failed to fork getent child: %m");
2570         else if (pid == 0) {
2571                 int nullfd;
2572                 char *empty_env = NULL;
2573
2574                 if (dup3(pipe_fds[1], STDOUT_FILENO, 0) < 0)
2575                         _exit(EXIT_FAILURE);
2576
2577                 if (pipe_fds[0] > 2)
2578                         safe_close(pipe_fds[0]);
2579                 if (pipe_fds[1] > 2)
2580                         safe_close(pipe_fds[1]);
2581
2582                 nullfd = open("/dev/null", O_RDWR);
2583                 if (nullfd < 0)
2584                         _exit(EXIT_FAILURE);
2585
2586                 if (dup3(nullfd, STDIN_FILENO, 0) < 0)
2587                         _exit(EXIT_FAILURE);
2588
2589                 if (dup3(nullfd, STDERR_FILENO, 0) < 0)
2590                         _exit(EXIT_FAILURE);
2591
2592                 if (nullfd > 2)
2593                         safe_close(nullfd);
2594
2595                 reset_all_signal_handlers();
2596                 close_all_fds(NULL, 0);
2597
2598                 execle("/usr/bin/getent", "getent", database, key, NULL, &empty_env);
2599                 execle("/bin/getent", "getent", database, key, NULL, &empty_env);
2600                 _exit(EXIT_FAILURE);
2601         }
2602
2603         pipe_fds[1] = safe_close(pipe_fds[1]);
2604
2605         *rpid = pid;
2606
2607         return pipe_fds[0];
2608 }
2609
2610 static int change_uid_gid(char **_home) {
2611         char line[LINE_MAX], *x, *u, *g, *h;
2612         const char *word, *state;
2613         _cleanup_free_ uid_t *uids = NULL;
2614         _cleanup_free_ char *home = NULL;
2615         _cleanup_fclose_ FILE *f = NULL;
2616         _cleanup_close_ int fd = -1;
2617         unsigned n_uids = 0;
2618         size_t sz = 0, l;
2619         uid_t uid;
2620         gid_t gid;
2621         pid_t pid;
2622         int r;
2623
2624         assert(_home);
2625
2626         if (!arg_user || streq(arg_user, "root") || streq(arg_user, "0")) {
2627                 /* Reset everything fully to 0, just in case */
2628
2629                 if (setgroups(0, NULL) < 0)
2630                         return log_error_errno(errno, "setgroups() failed: %m");
2631
2632                 if (setresgid(0, 0, 0) < 0)
2633                         return log_error_errno(errno, "setregid() failed: %m");
2634
2635                 if (setresuid(0, 0, 0) < 0)
2636                         return log_error_errno(errno, "setreuid() failed: %m");
2637
2638                 *_home = NULL;
2639                 return 0;
2640         }
2641
2642         /* First, get user credentials */
2643         fd = spawn_getent("passwd", arg_user, &pid);
2644         if (fd < 0)
2645                 return fd;
2646
2647         f = fdopen(fd, "r");
2648         if (!f)
2649                 return log_oom();
2650         fd = -1;
2651
2652         if (!fgets(line, sizeof(line), f)) {
2653
2654                 if (!ferror(f)) {
2655                         log_error("Failed to resolve user %s.", arg_user);
2656                         return -ESRCH;
2657                 }
2658
2659                 log_error_errno(errno, "Failed to read from getent: %m");
2660                 return -errno;
2661         }
2662
2663         truncate_nl(line);
2664
2665         wait_for_terminate_and_warn("getent passwd", pid, true);
2666
2667         x = strchr(line, ':');
2668         if (!x) {
2669                 log_error("/etc/passwd entry has invalid user field.");
2670                 return -EIO;
2671         }
2672
2673         u = strchr(x+1, ':');
2674         if (!u) {
2675                 log_error("/etc/passwd entry has invalid password field.");
2676                 return -EIO;
2677         }
2678
2679         u++;
2680         g = strchr(u, ':');
2681         if (!g) {
2682                 log_error("/etc/passwd entry has invalid UID field.");
2683                 return -EIO;
2684         }
2685
2686         *g = 0;
2687         g++;
2688         x = strchr(g, ':');
2689         if (!x) {
2690                 log_error("/etc/passwd entry has invalid GID field.");
2691                 return -EIO;
2692         }
2693
2694         *x = 0;
2695         h = strchr(x+1, ':');
2696         if (!h) {
2697                 log_error("/etc/passwd entry has invalid GECOS field.");
2698                 return -EIO;
2699         }
2700
2701         h++;
2702         x = strchr(h, ':');
2703         if (!x) {
2704                 log_error("/etc/passwd entry has invalid home directory field.");
2705                 return -EIO;
2706         }
2707
2708         *x = 0;
2709
2710         r = parse_uid(u, &uid);
2711         if (r < 0) {
2712                 log_error("Failed to parse UID of user.");
2713                 return -EIO;
2714         }
2715
2716         r = parse_gid(g, &gid);
2717         if (r < 0) {
2718                 log_error("Failed to parse GID of user.");
2719                 return -EIO;
2720         }
2721
2722         home = strdup(h);
2723         if (!home)
2724                 return log_oom();
2725
2726         /* Second, get group memberships */
2727         fd = spawn_getent("initgroups", arg_user, &pid);
2728         if (fd < 0)
2729                 return fd;
2730
2731         fclose(f);
2732         f = fdopen(fd, "r");
2733         if (!f)
2734                 return log_oom();
2735         fd = -1;
2736
2737         if (!fgets(line, sizeof(line), f)) {
2738                 if (!ferror(f)) {
2739                         log_error("Failed to resolve user %s.", arg_user);
2740                         return -ESRCH;
2741                 }
2742
2743                 log_error_errno(errno, "Failed to read from getent: %m");
2744                 return -errno;
2745         }
2746
2747         truncate_nl(line);
2748
2749         wait_for_terminate_and_warn("getent initgroups", pid, true);
2750
2751         /* Skip over the username and subsequent separator whitespace */
2752         x = line;
2753         x += strcspn(x, WHITESPACE);
2754         x += strspn(x, WHITESPACE);
2755
2756         FOREACH_WORD(word, l, x, state) {
2757                 char c[l+1];
2758
2759                 memcpy(c, word, l);
2760                 c[l] = 0;
2761
2762                 if (!GREEDY_REALLOC(uids, sz, n_uids+1))
2763                         return log_oom();
2764
2765                 r = parse_uid(c, &uids[n_uids++]);
2766                 if (r < 0) {
2767                         log_error("Failed to parse group data from getent.");
2768                         return -EIO;
2769                 }
2770         }
2771
2772         r = mkdir_parents(home, 0775);
2773         if (r < 0)
2774                 return log_error_errno(r, "Failed to make home root directory: %m");
2775
2776         r = mkdir_safe(home, 0755, uid, gid);
2777         if (r < 0 && r != -EEXIST)
2778                 return log_error_errno(r, "Failed to make home directory: %m");
2779
2780         fchown(STDIN_FILENO, uid, gid);
2781         fchown(STDOUT_FILENO, uid, gid);
2782         fchown(STDERR_FILENO, uid, gid);
2783
2784         if (setgroups(n_uids, uids) < 0)
2785                 return log_error_errno(errno, "Failed to set auxiliary groups: %m");
2786
2787         if (setresgid(gid, gid, gid) < 0)
2788                 return log_error_errno(errno, "setregid() failed: %m");
2789
2790         if (setresuid(uid, uid, uid) < 0)
2791                 return log_error_errno(errno, "setreuid() failed: %m");
2792
2793         if (_home) {
2794                 *_home = home;
2795                 home = NULL;
2796         }
2797
2798         return 0;
2799 }
2800
2801 /*
2802  * Return values:
2803  * < 0 : wait_for_terminate() failed to get the state of the
2804  *       container, the container was terminated by a signal, or
2805  *       failed for an unknown reason.  No change is made to the
2806  *       container argument.
2807  * > 0 : The program executed in the container terminated with an
2808  *       error.  The exit code of the program executed in the
2809  *       container is returned.  The container argument has been set
2810  *       to CONTAINER_TERMINATED.
2811  *   0 : The container is being rebooted, has been shut down or exited
2812  *       successfully.  The container argument has been set to either
2813  *       CONTAINER_TERMINATED or CONTAINER_REBOOTED.
2814  *
2815  * That is, success is indicated by a return value of zero, and an
2816  * error is indicated by a non-zero value.
2817  */
2818 static int wait_for_container(pid_t pid, ContainerStatus *container) {
2819         siginfo_t status;
2820         int r;
2821
2822         r = wait_for_terminate(pid, &status);
2823         if (r < 0)
2824                 return log_warning_errno(r, "Failed to wait for container: %m");
2825
2826         switch (status.si_code) {
2827
2828         case CLD_EXITED:
2829                 if (status.si_status == 0) {
2830                         log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s exited successfully.", arg_machine);
2831
2832                 } else
2833                         log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s failed with error code %i.", arg_machine, status.si_status);
2834
2835                 *container = CONTAINER_TERMINATED;
2836                 return status.si_status;
2837
2838         case CLD_KILLED:
2839                 if (status.si_status == SIGINT) {
2840
2841                         log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s has been shut down.", arg_machine);
2842                         *container = CONTAINER_TERMINATED;
2843                         return 0;
2844
2845                 } else if (status.si_status == SIGHUP) {
2846
2847                         log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s is being rebooted.", arg_machine);
2848                         *container = CONTAINER_REBOOTED;
2849                         return 0;
2850                 }
2851
2852                 /* CLD_KILLED fallthrough */
2853
2854         case CLD_DUMPED:
2855                 log_error("Container %s terminated by signal %s.", arg_machine, signal_to_string(status.si_status));
2856                 return -EIO;
2857
2858         default:
2859                 log_error("Container %s failed due to unknown reason.", arg_machine);
2860                 return -EIO;
2861         }
2862
2863         return r;
2864 }
2865
/* Signal handler that intentionally does nothing. main() installs it
 * for SIGCHLD so that the signal interrupts blocking calls instead of
 * being ignored. */
static void nop_handler(int sig) {
}
2867
2868 static int on_orderly_shutdown(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) {
2869         pid_t pid;
2870
2871         pid = PTR_TO_UINT32(userdata);
2872         if (pid > 0) {
2873                 if (kill(pid, SIGRTMIN+3) >= 0) {
2874                         log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
2875                         sd_event_source_set_userdata(s, NULL);
2876                         return 0;
2877                 }
2878         }
2879
2880         sd_event_exit(sd_event_source_get_event(s), 0);
2881         return 0;
2882 }
2883
2884 static int determine_names(void) {
2885
2886         if (!arg_image && !arg_directory) {
2887                 if (arg_machine)
2888                         arg_directory = strappend("/var/lib/container/", arg_machine);
2889                 else
2890                         arg_directory = get_current_dir_name();
2891
2892                 if (!arg_directory) {
2893                         log_error("Failed to determine path, please use -D.");
2894                         return -EINVAL;
2895                 }
2896         }
2897
2898         if (!arg_machine) {
2899                 if (arg_directory && path_equal(arg_directory, "/"))
2900                         arg_machine = gethostname_malloc();
2901                 else
2902                         arg_machine = strdup(basename(arg_image ?: arg_directory));
2903
2904                 if (!arg_machine)
2905                         return log_oom();
2906
2907                 hostname_cleanup(arg_machine, false);
2908                 if (!machine_name_is_valid(arg_machine)) {
2909                         log_error("Failed to determine machine name automatically, please use -M.");
2910                         return -EINVAL;
2911                 }
2912
2913                 if (arg_ephemeral) {
2914                         char *b;
2915
2916                         /* Add a random suffix when this is an
2917                          * ephemeral machine, so that we can run many
2918                          * instances at once without manually having
2919                          * to specify -M each time. */
2920
2921                         if (asprintf(&b, "%s-%016" PRIx64, arg_machine, random_u64()) < 0)
2922                                 return log_oom();
2923
2924                         free(arg_machine);
2925                         arg_machine = b;
2926                 }
2927         }
2928
2929         return 0;
2930 }
2931
2932 int main(int argc, char *argv[]) {
2933
2934         _cleanup_free_ char *device_path = NULL, *root_device = NULL, *home_device = NULL, *srv_device = NULL;
2935         bool root_device_rw = true, home_device_rw = true, srv_device_rw = true;
2936         _cleanup_close_ int master = -1, image_fd = -1;
2937         _cleanup_close_pair_ int kmsg_socket_pair[2] = { -1, -1 };
2938         _cleanup_fdset_free_ FDSet *fds = NULL;
2939         int r, n_fd_passed, loop_nr = -1;
2940         const char *console = NULL;
2941         char veth_name[IFNAMSIZ];
2942         bool secondary = false, remove_subvol = false;
2943         sigset_t mask, mask_chld;
2944         pid_t pid = 0;
2945         int ret = EXIT_SUCCESS;
2946
2947         log_parse_environment();
2948         log_open();
2949
2950         r = parse_argv(argc, argv);
2951         if (r <= 0)
2952                 goto finish;
2953
2954         r = determine_names();
2955         if (r < 0)
2956                 goto finish;
2957
2958         if (geteuid() != 0) {
2959                 log_error("Need to be root.");
2960                 r = -EPERM;
2961                 goto finish;
2962         }
2963
2964         if (sd_booted() <= 0) {
2965                 log_error("Not running on a systemd system.");
2966                 r = -EINVAL;
2967                 goto finish;
2968         }
2969
2970         log_close();
2971         n_fd_passed = sd_listen_fds(false);
2972         if (n_fd_passed > 0) {
2973                 r = fdset_new_listen_fds(&fds, false);
2974                 if (r < 0) {
2975                         log_error_errno(r, "Failed to collect file descriptors: %m");
2976                         goto finish;
2977                 }
2978         }
2979         fdset_close_others(fds);
2980         log_open();
2981
2982         if (arg_directory) {
2983                 assert(!arg_image);
2984
2985                 if (path_equal(arg_directory, "/") && !arg_ephemeral) {
2986                         log_error("Spawning container on root directory is not supported. Consider using --ephemeral.");
2987                         r = -EINVAL;
2988                         goto finish;
2989                 }
2990
2991                 if (arg_template) {
2992                         r = btrfs_subvol_snapshot(arg_template, arg_directory, arg_read_only, true);
2993                         if (r == -EEXIST) {
2994                                 if (!arg_quiet)
2995                                         log_info("Directory %s already exists, not populating from template %s.", arg_directory, arg_template);
2996                         } else if (r < 0) {
2997                                 log_error_errno(r, "Couldn't create snapshort %s from %s: %m", arg_directory, arg_template);
2998                                 goto finish;
2999                         } else {
3000                                 if (!arg_quiet)
3001                                         log_info("Populated %s from template %s.", arg_directory, arg_template);
3002                         }
3003
3004                 } else if (arg_ephemeral) {
3005                         char *np;
3006
3007                         /* If the specified path is a mount point we
3008                          * generate the new snapshot immediately
3009                          * inside it under a random name. However if
3010                          * the specified is not a mount point we
3011                          * create the new snapshot in the parent
3012                          * directory, just next to it. */
3013                         r = path_is_mount_point(arg_directory, false);
3014                         if (r < 0) {
3015                                 log_error_errno(r, "Failed to determine whether directory %s is mount point: %m", arg_directory);
3016                                 goto finish;
3017                         }
3018                         if (r > 0)
3019                                 r = tempfn_random_child(arg_directory, &np);
3020                         else
3021                                 r = tempfn_random(arg_directory, &np);
3022                         if (r < 0) {
3023                                 log_error_errno(r, "Failed to generate name for snapshot: %m");
3024                                 goto finish;
3025                         }
3026
3027                         r = btrfs_subvol_snapshot(arg_directory, np, arg_read_only, true);
3028                         if (r < 0) {
3029                                 free(np);
3030                                 log_error_errno(r, "Failed to create snapshot %s from %s: %m", np, arg_directory);
3031                                 goto finish;
3032                         }
3033
3034                         free(arg_directory);
3035                         arg_directory = np;
3036
3037                         remove_subvol = true;
3038                 }
3039
3040                 if (arg_boot) {
3041                         if (path_is_os_tree(arg_directory) <= 0) {
3042                                 log_error("Directory %s doesn't look like an OS root directory (os-release file is missing). Refusing.", arg_directory);
3043                                 r = -EINVAL;
3044                                 goto finish;
3045                         }
3046                 } else {
3047                         const char *p;
3048
3049                         p = strappenda(arg_directory,
3050                                        argc > optind && path_is_absolute(argv[optind]) ? argv[optind] : "/usr/bin/");
3051                         if (access(p, F_OK) < 0) {
3052                                 log_error("Directory %s lacks the binary to execute or doesn't look like a binary tree. Refusing.", arg_directory);
3053                                 r = -EINVAL;
3054                                 goto finish;
3055                         }
3056                 }
3057
3058         } else {
3059                 char template[] = "/tmp/nspawn-root-XXXXXX";
3060
3061                 assert(arg_image);
3062                 assert(!arg_template);
3063
3064                 if (!mkdtemp(template)) {
3065                         log_error_errno(errno, "Failed to create temporary directory: %m");
3066                         r = -errno;
3067                         goto finish;
3068                 }
3069
3070                 arg_directory = strdup(template);
3071                 if (!arg_directory) {
3072                         r = log_oom();
3073                         goto finish;
3074                 }
3075
3076                 image_fd = setup_image(&device_path, &loop_nr);
3077                 if (image_fd < 0) {
3078                         r = image_fd;
3079                         goto finish;
3080                 }
3081
3082                 r = dissect_image(image_fd,
3083                                   &root_device, &root_device_rw,
3084                                   &home_device, &home_device_rw,
3085                                   &srv_device, &srv_device_rw,
3086                                   &secondary);
3087                 if (r < 0)
3088                         goto finish;
3089         }
3090
3091         master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
3092         if (master < 0) {
3093                 r = log_error_errno(errno, "Failed to acquire pseudo tty: %m");
3094                 goto finish;
3095         }
3096
3097         console = ptsname(master);
3098         if (!console) {
3099                 r = log_error_errno(errno, "Failed to determine tty name: %m");
3100                 goto finish;
3101         }
3102
3103         if (!arg_quiet)
3104                 log_info("Spawning container %s on %s.\nPress ^] three times within 1s to kill container.",
3105                          arg_machine, arg_image ?: arg_directory);
3106
3107         if (unlockpt(master) < 0) {
3108                 r = log_error_errno(errno, "Failed to unlock tty: %m");
3109                 goto finish;
3110         }
3111
3112         if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_NONBLOCK|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
3113                 r = log_error_errno(errno, "Failed to create kmsg socket pair: %m");
3114                 goto finish;
3115         }
3116
3117         sd_notify(false,
3118                   "READY=1\n"
3119                   "STATUS=Container running.");
3120
3121         assert_se(sigemptyset(&mask) == 0);
3122         sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
3123         assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
3124
3125         assert_se(sigemptyset(&mask_chld) == 0);
3126         assert_se(sigaddset(&mask_chld, SIGCHLD) == 0);
3127
3128         for (;;) {
3129                 ContainerStatus container_status;
3130                 _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
3131                 struct sigaction sa = {
3132                         .sa_handler = nop_handler,
3133                         .sa_flags = SA_NOCLDSTOP,
3134                 };
3135
3136                 r = barrier_create(&barrier);
3137                 if (r < 0) {
3138                         log_error_errno(r, "Cannot initialize IPC barrier: %m");
3139                         goto finish;
3140                 }
3141
3142                 /* Child can be killed before execv(), so handle SIGCHLD
3143                  * in order to interrupt parent's blocking calls and
3144                  * give it a chance to call wait() and terminate. */
3145                 r = sigprocmask(SIG_UNBLOCK, &mask_chld, NULL);
3146                 if (r < 0) {
3147                         r = log_error_errno(errno, "Failed to change the signal mask: %m");
3148                         goto finish;
3149                 }
3150
3151                 r = sigaction(SIGCHLD, &sa, NULL);
3152                 if (r < 0) {
3153                         r = log_error_errno(errno, "Failed to install SIGCHLD handler: %m");
3154                         goto finish;
3155                 }
3156
3157                 pid = raw_clone(SIGCHLD|CLONE_NEWNS|
3158                                 (arg_share_system ? 0 : CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS)|
3159                                 (arg_private_network ? CLONE_NEWNET : 0), NULL);
3160                 if (pid < 0) {
3161                         if (errno == EINVAL)
3162                                 r = log_error_errno(errno, "clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
3163                         else
3164                                 r = log_error_errno(errno, "clone() failed: %m");
3165
3166                         goto finish;
3167                 }
3168
3169                 if (pid == 0) {
3170                         /* child */
3171                         _cleanup_free_ char *home = NULL;
3172                         unsigned n_env = 2;
3173                         const char *envp[] = {
3174                                 "PATH=" DEFAULT_PATH_SPLIT_USR,
3175                                 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
3176                                 NULL, /* TERM */
3177                                 NULL, /* HOME */
3178                                 NULL, /* USER */
3179                                 NULL, /* LOGNAME */
3180                                 NULL, /* container_uuid */
3181                                 NULL, /* LISTEN_FDS */
3182                                 NULL, /* LISTEN_PID */
3183                                 NULL
3184                         };
3185                         char **env_use;
3186
3187                         barrier_set_role(&barrier, BARRIER_CHILD);
3188
3189                         envp[n_env] = strv_find_prefix(environ, "TERM=");
3190                         if (envp[n_env])
3191                                 n_env ++;
3192
3193                         master = safe_close(master);
3194
3195                         close_nointr(STDIN_FILENO);
3196                         close_nointr(STDOUT_FILENO);
3197                         close_nointr(STDERR_FILENO);
3198
3199                         kmsg_socket_pair[0] = safe_close(kmsg_socket_pair[0]);
3200
3201                         reset_all_signal_handlers();
3202                         reset_signal_mask();
3203
3204                         r = open_terminal(console, O_RDWR);
3205                         if (r != STDIN_FILENO) {
3206                                 if (r >= 0) {
3207                                         safe_close(r);
3208                                         r = -EINVAL;
3209                                 }
3210
3211                                 log_error_errno(r, "Failed to open console: %m");
3212                                 _exit(EXIT_FAILURE);
3213                         }
3214
3215                         if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
3216                             dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO) {
3217                                 log_error_errno(errno, "Failed to duplicate console: %m");
3218                                 _exit(EXIT_FAILURE);
3219                         }
3220
3221                         if (setsid() < 0) {
3222                                 log_error_errno(errno, "setsid() failed: %m");
3223                                 _exit(EXIT_FAILURE);
3224                         }
3225
3226                         if (reset_audit_loginuid() < 0)
3227                                 _exit(EXIT_FAILURE);
3228
3229                         if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
3230                                 log_error_errno(errno, "PR_SET_PDEATHSIG failed: %m");
3231                                 _exit(EXIT_FAILURE);
3232                         }
3233
3234                         /* Mark everything as slave, so that we still
3235                          * receive mounts from the real root, but don't
3236                          * propagate mounts to the real root. */
3237                         if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
3238                                 log_error_errno(errno, "MS_SLAVE|MS_REC failed: %m");
3239                                 _exit(EXIT_FAILURE);
3240                         }
3241
3242                         if (mount_devices(arg_directory,
3243                                           root_device, root_device_rw,
3244                                           home_device, home_device_rw,
3245                                           srv_device, srv_device_rw) < 0)
3246                                 _exit(EXIT_FAILURE);
3247
3248                         /* Turn directory into bind mount */
3249                         if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REC, NULL) < 0) {
3250                                 log_error_errno(errno, "Failed to make bind mount: %m");
3251                                 _exit(EXIT_FAILURE);
3252                         }
3253
3254                         r = setup_volatile(arg_directory);
3255                         if (r < 0)
3256                                 _exit(EXIT_FAILURE);
3257
3258                         if (setup_volatile_state(arg_directory) < 0)
3259                                 _exit(EXIT_FAILURE);
3260
3261                         r = base_filesystem_create(arg_directory);
3262                         if (r < 0)
3263                                 _exit(EXIT_FAILURE);
3264
3265                         if (arg_read_only) {
3266                                 r = bind_remount_recursive(arg_directory, true);
3267                                 if (r < 0) {
3268                                         log_error_errno(r, "Failed to make tree read-only: %m");
3269                                         _exit(EXIT_FAILURE);
3270                                 }
3271                         }
3272
3273                         if (mount_all(arg_directory) < 0)
3274                                 _exit(EXIT_FAILURE);
3275
3276                         if (copy_devnodes(arg_directory) < 0)
3277                                 _exit(EXIT_FAILURE);
3278
3279                         if (setup_ptmx(arg_directory) < 0)
3280                                 _exit(EXIT_FAILURE);
3281
3282                         dev_setup(arg_directory);
3283
3284                         if (setup_propagate(arg_directory) < 0)
3285                                 _exit(EXIT_FAILURE);
3286
3287                         if (setup_seccomp() < 0)
3288                                 _exit(EXIT_FAILURE);
3289
3290                         if (setup_dev_console(arg_directory, console) < 0)
3291                                 _exit(EXIT_FAILURE);
3292
3293                         if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
3294                                 _exit(EXIT_FAILURE);
3295
3296                         kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
3297
3298                         if (setup_boot_id(arg_directory) < 0)
3299                                 _exit(EXIT_FAILURE);
3300
3301                         if (setup_timezone(arg_directory) < 0)
3302                                 _exit(EXIT_FAILURE);
3303
3304                         if (setup_resolv_conf(arg_directory) < 0)
3305                                 _exit(EXIT_FAILURE);
3306
3307                         if (setup_journal(arg_directory) < 0)
3308                                 _exit(EXIT_FAILURE);
3309
3310                         if (mount_binds(arg_directory, arg_bind, false) < 0)
3311                                 _exit(EXIT_FAILURE);
3312
3313                         if (mount_binds(arg_directory, arg_bind_ro, true) < 0)
3314                                 _exit(EXIT_FAILURE);
3315
3316                         if (mount_tmpfs(arg_directory) < 0)
3317                                 _exit(EXIT_FAILURE);
3318
3319                         /* Tell the parent that we are ready, and that
3320                          * it can cgroupify us so that we lack access
3321                          * to certain devices and resources. */
3322                         (void)barrier_place(&barrier);
3323
3324                         if (chdir(arg_directory) < 0) {
3325                                 log_error_errno(errno, "chdir(%s) failed: %m", arg_directory);
3326                                 _exit(EXIT_FAILURE);
3327                         }
3328
3329                         if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) {
3330                                 log_error_errno(errno, "mount(MS_MOVE) failed: %m");
3331                                 _exit(EXIT_FAILURE);
3332                         }
3333
3334                         if (chroot(".") < 0) {
3335                                 log_error_errno(errno, "chroot() failed: %m");
3336                                 _exit(EXIT_FAILURE);
3337                         }
3338
3339                         if (chdir("/") < 0) {
3340                                 log_error_errno(errno, "chdir() failed: %m");
3341                                 _exit(EXIT_FAILURE);
3342                         }
3343
3344                         umask(0022);
3345
3346                         if (arg_private_network)
3347                                 loopback_setup();
3348
3349                         if (drop_capabilities() < 0) {
3350                                 log_error_errno(errno, "drop_capabilities() failed: %m");
3351                                 _exit(EXIT_FAILURE);
3352                         }
3353
3354                         r = change_uid_gid(&home);
3355                         if (r < 0)
3356                                 _exit(EXIT_FAILURE);
3357
3358                         if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
3359                             (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
3360                             (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
3361                                 log_oom();
3362                                 _exit(EXIT_FAILURE);
3363                         }
3364
3365                         if (!sd_id128_equal(arg_uuid, SD_ID128_NULL)) {
3366                                 char as_uuid[37];
3367
3368                                 if (asprintf((char**)(envp + n_env++), "container_uuid=%s", id128_format_as_uuid(arg_uuid, as_uuid)) < 0) {
3369                                         log_oom();
3370                                         _exit(EXIT_FAILURE);
3371                                 }
3372                         }
3373
3374                         if (fdset_size(fds) > 0) {
3375                                 r = fdset_cloexec(fds, false);
3376                                 if (r < 0) {
3377                                         log_error_errno(r, "Failed to unset O_CLOEXEC for file descriptors.");
3378                                         _exit(EXIT_FAILURE);
3379                                 }
3380
3381                                 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", n_fd_passed) < 0) ||
3382                                     (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0)) {
3383                                         log_oom();
3384                                         _exit(EXIT_FAILURE);
3385                                 }
3386                         }
3387
3388                         setup_hostname();
3389
3390                         if (arg_personality != 0xffffffffLU) {
3391                                 if (personality(arg_personality) < 0) {
3392                                         log_error_errno(errno, "personality() failed: %m");
3393                                         _exit(EXIT_FAILURE);
3394                                 }
3395                         } else if (secondary) {
3396                                 if (personality(PER_LINUX32) < 0) {
3397                                         log_error_errno(errno, "personality() failed: %m");
3398                                         _exit(EXIT_FAILURE);
3399                                 }
3400                         }
3401
3402 #ifdef HAVE_SELINUX
3403                         if (arg_selinux_context)
3404                                 if (setexeccon((security_context_t) arg_selinux_context) < 0) {
3405                                         log_error_errno(errno, "setexeccon(\"%s\") failed: %m", arg_selinux_context);
3406                                         _exit(EXIT_FAILURE);
3407                                 }
3408 #endif
3409
3410                         if (!strv_isempty(arg_setenv)) {
3411                                 char **n;
3412
3413                                 n = strv_env_merge(2, envp, arg_setenv);
3414                                 if (!n) {
3415                                         log_oom();
3416                                         _exit(EXIT_FAILURE);
3417                                 }
3418
3419                                 env_use = n;
3420                         } else
3421                                 env_use = (char**) envp;
3422
3423                         /* Wait until the parent is ready with the setup, too... */
3424                         if (!barrier_place_and_sync(&barrier))
3425                                 _exit(EXIT_FAILURE);
3426
3427                         if (arg_boot) {
3428                                 char **a;
3429                                 size_t l;
3430
3431                                 /* Automatically search for the init system */
3432
3433                                 l = 1 + argc - optind;
3434                                 a = newa(char*, l + 1);
3435                                 memcpy(a + 1, argv + optind, l * sizeof(char*));
3436
3437                                 a[0] = (char*) "/usr/lib/systemd/systemd";
3438                                 execve(a[0], a, env_use);
3439
3440                                 a[0] = (char*) "/lib/systemd/systemd";
3441                                 execve(a[0], a, env_use);
3442
3443                                 a[0] = (char*) "/sbin/init";
3444                                 execve(a[0], a, env_use);
3445                         } else if (argc > optind)
3446                                 execvpe(argv[optind], argv + optind, env_use);
3447                         else {
3448                                 chdir(home ? home : "/root");
3449                                 execle("/bin/bash", "-bash", NULL, env_use);
3450                                 execle("/bin/sh", "-sh", NULL, env_use);
3451                         }
3452
3453                         log_error_errno(errno, "execv() failed: %m");
3454                         _exit(EXIT_FAILURE);
3455                 }
3456
3457                 barrier_set_role(&barrier, BARRIER_PARENT);
3458                 fdset_free(fds);
3459                 fds = NULL;
3460
3461                 /* wait for child-setup to be done */
3462                 if (barrier_place_and_sync(&barrier)) {
3463                         _cleanup_event_unref_ sd_event *event = NULL;
3464                         _cleanup_(pty_forward_freep) PTYForward *forward = NULL;
3465                         char last_char = 0;
3466                         int ifi = 0;
3467
3468                         r = move_network_interfaces(pid);
3469                         if (r < 0)
3470                                 goto finish;
3471
3472                         r = setup_veth(pid, veth_name, &ifi);
3473                         if (r < 0)
3474                                 goto finish;
3475
3476                         r = setup_bridge(veth_name, &ifi);
3477                         if (r < 0)
3478                                 goto finish;
3479
3480                         r = setup_macvlan(pid);
3481                         if (r < 0)
3482                                 goto finish;
3483
3484                         r = register_machine(pid, ifi);
3485                         if (r < 0)
3486                                 goto finish;
3487
3488                         /* Block SIGCHLD here, before notifying child.
3489                          * process_pty() will handle it with the other signals. */
3490                         r = sigprocmask(SIG_BLOCK, &mask_chld, NULL);
3491                         if (r < 0)
3492                                 goto finish;
3493
3494                         /* Reset signal to default */
3495                         r = default_signals(SIGCHLD, -1);
3496                         if (r < 0)
3497                                 goto finish;
3498
3499                         /* Notify the child that the parent is ready with all
3500                          * its setup, and that the child can now hand over
3501                          * control to the code to run inside the container. */
3502                         (void)barrier_place(&barrier);
3503
3504                         r = sd_event_new(&event);
3505                         if (r < 0) {
3506                                 log_error_errno(r, "Failed to get default event source: %m");
3507                                 goto finish;
3508                         }
3509
3510                         if (arg_boot) {
3511                                 /* Try to kill the init system on SIGINT or SIGTERM */
3512                                 sd_event_add_signal(event, NULL, SIGINT, on_orderly_shutdown, UINT32_TO_PTR(pid));
3513                                 sd_event_add_signal(event, NULL, SIGTERM, on_orderly_shutdown, UINT32_TO_PTR(pid));
3514                         } else {
3515                                 /* Immediately exit */
3516                                 sd_event_add_signal(event, NULL, SIGINT, NULL, NULL);
3517                                 sd_event_add_signal(event, NULL, SIGTERM, NULL, NULL);
3518                         }
3519
3520                         /* simply exit on sigchld */
3521                         sd_event_add_signal(event, NULL, SIGCHLD, NULL, NULL);
3522
3523                         r = pty_forward_new(event, master, true, &forward);
3524                         if (r < 0) {
3525                                 log_error_errno(r, "Failed to create PTY forwarder: %m");
3526                                 goto finish;
3527                         }
3528
3529                         r = sd_event_loop(event);
3530                         if (r < 0) {
3531                                 log_error_errno(r, "Failed to run event loop: %m");
3532                                 goto finish;
3533                         }
3534
3535                         pty_forward_last_char(forward, &last_char);
3536
3537                         forward = pty_forward_free(forward);
3538
3539                         if (!arg_quiet && last_char != '\n')
3540                                 putc('\n', stdout);
3541
3542                         /* Kill if it is not dead yet anyway */
3543                         terminate_machine(pid);
3544                 }
3545
3546                 /* Normally redundant, but better safe than sorry */
3547                 kill(pid, SIGKILL);
3548
3549                 r = wait_for_container(pid, &container_status);
3550                 pid = 0;
3551
3552                 if (r < 0)
3553                         /* We failed to wait for the container, or the
3554                          * container exited abnormally */
3555                         goto finish;
3556                 else if (r > 0 || container_status == CONTAINER_TERMINATED){
3557                         /* The container exited with a non-zero
3558                          * status, or with zero status and no reboot
3559                          * was requested. */
3560                         ret = r;
3561                         break;
3562                 }
3563
3564                 /* CONTAINER_REBOOTED, loop again */
3565
3566                 if (arg_keep_unit) {
3567                         /* Special handling if we are running as a
3568                          * service: instead of simply restarting the
3569                          * machine we want to restart the entire
3570                          * service, so let's inform systemd about this
3571                          * with the special exit code 133. The service
3572                          * file uses RestartForceExitStatus=133 so
3573                          * that this results in a full nspawn
3574                          * restart. This is necessary since we might
3575                          * have cgroup parameters set we want to have
3576                          * flushed out. */
3577                         ret = 133;
3578                         r = 0;
3579                         break;
3580                 }
3581         }
3582
3583 finish:
3584         sd_notify(false,
3585                   "STOPPING=1\n"
3586                   "STATUS=Terminating...");
3587
3588         loop_remove(loop_nr, &image_fd);
3589
3590         if (pid > 0)
3591                 kill(pid, SIGKILL);
3592
3593         if (remove_subvol && arg_directory) {
3594                 int k;
3595
3596                 k = btrfs_subvol_remove(arg_directory);
3597                 if (k < 0)
3598                         log_warning_errno(k, "Cannot remove subvolume '%s', ignoring: %m", arg_directory);
3599         }
3600
3601         if (arg_machine) {
3602                 const char *p;
3603
3604                 p = strappenda("/run/systemd/nspawn/propagate", arg_machine);
3605                 (void) rm_rf(p, false, true, false);
3606         }
3607
3608         free(arg_directory);
3609         free(arg_template);
3610         free(arg_image);
3611         free(arg_machine);
3612         free(arg_user);
3613         strv_free(arg_setenv);
3614         strv_free(arg_network_interfaces);
3615         strv_free(arg_network_macvlan);
3616         strv_free(arg_bind);
3617         strv_free(arg_bind_ro);
3618         strv_free(arg_tmpfs);
3619
3620         return r < 0 ? EXIT_FAILURE : ret;
3621 }