chiark / gitweb /
1bfc99d5fcaa37129fcd6e85304caa54e8fa959c
[elogind.git] / src / nspawn / nspawn.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4   This file is part of systemd.
5
6   Copyright 2010 Lennart Poettering
7
8   systemd is free software; you can redistribute it and/or modify it
9   under the terms of the GNU Lesser General Public License as published by
10   the Free Software Foundation; either version 2.1 of the License, or
11   (at your option) any later version.
12
13   systemd is distributed in the hope that it will be useful, but
14   WITHOUT ANY WARRANTY; without even the implied warranty of
15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16   Lesser General Public License for more details.
17
18   You should have received a copy of the GNU Lesser General Public License
19   along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <signal.h>
23 #include <sched.h>
24 #include <unistd.h>
25 #include <sys/types.h>
26 #include <sys/syscall.h>
27 #include <sys/mount.h>
28 #include <sys/wait.h>
29 #include <stdlib.h>
30 #include <string.h>
31 #include <stdio.h>
32 #include <errno.h>
33 #include <sys/prctl.h>
34 #include <sys/capability.h>
35 #include <getopt.h>
36 #include <termios.h>
37 #include <sys/signalfd.h>
38 #include <grp.h>
39 #include <linux/fs.h>
40 #include <sys/un.h>
41 #include <sys/socket.h>
42 #include <linux/netlink.h>
43 #include <net/if.h>
44 #include <linux/veth.h>
45 #include <sys/personality.h>
46 #include <linux/loop.h>
47
48 #ifdef HAVE_SELINUX
49 #include <selinux/selinux.h>
50 #endif
51
52 #ifdef HAVE_SECCOMP
53 #include <seccomp.h>
54 #endif
55
56 #ifdef HAVE_BLKID
57 #include <blkid/blkid.h>
58 #endif
59
60 #include "sd-daemon.h"
61 #include "sd-bus.h"
62 #include "sd-id128.h"
63 #include "sd-rtnl.h"
64 #include "log.h"
65 #include "util.h"
66 #include "mkdir.h"
67 #include "macro.h"
68 #include "audit.h"
69 #include "missing.h"
70 #include "cgroup-util.h"
71 #include "strv.h"
72 #include "path-util.h"
73 #include "loopback-setup.h"
74 #include "dev-setup.h"
75 #include "fdset.h"
76 #include "build.h"
77 #include "fileio.h"
78 #include "bus-util.h"
79 #include "bus-error.h"
80 #include "ptyfwd.h"
81 #include "bus-kernel.h"
82 #include "env-util.h"
83 #include "def.h"
84 #include "rtnl-util.h"
85 #include "udev-util.h"
86 #include "blkid-util.h"
87 #include "gpt.h"
88 #include "siphash24.h"
89 #include "copy.h"
90 #include "base-filesystem.h"
91 #include "barrier.h"
92 #include "event-util.h"
93 #include "cap-list.h"
94 #include "btrfs-util.h"
95
96 #ifdef HAVE_SECCOMP
97 #include "seccomp-util.h"
98 #endif
99
/* How a container run ended.
 * NOTE(review): the code that produces and consumes these values lies outside
 * this part of the file; presumably REBOOTED requests a restart of the
 * container — confirm against the callers. */
typedef enum ContainerStatus {
        CONTAINER_TERMINATED = 0,
        CONTAINER_REBOOTED   = 1
} ContainerStatus;
104
/* Journal linking mode selected with --link-journal= (or -j). The "try-"
 * variants of "guest" and "host" map to the same values and additionally set
 * arg_link_journal_try. */
typedef enum LinkJournal {
        LINK_NO    = 0,  /* --link-journal=no */
        LINK_AUTO  = 1,  /* --link-journal=auto; also the default */
        LINK_HOST  = 2,  /* --link-journal=host / try-host */
        LINK_GUEST = 3   /* --link-journal=guest / try-guest, and -j */
} LinkJournal;
111
/* Volatile mode selected with --volatile[=MODE]. */
typedef enum Volatile {
        VOLATILE_NO    = 0,  /* default, or --volatile=<false boolean> */
        VOLATILE_YES   = 1,  /* bare --volatile, or --volatile=<true boolean> */
        VOLATILE_STATE = 2,  /* --volatile=state */
} Volatile;
117
118 static char *arg_directory = NULL;
119 static char *arg_template = NULL;
120 static char *arg_user = NULL;
121 static sd_id128_t arg_uuid = {};
122 static char *arg_machine = NULL;
123 static const char *arg_selinux_context = NULL;
124 static const char *arg_selinux_apifs_context = NULL;
125 static const char *arg_slice = NULL;
126 static bool arg_private_network = false;
127 static bool arg_read_only = false;
128 static bool arg_boot = false;
129 static bool arg_ephemeral = false;
130 static LinkJournal arg_link_journal = LINK_AUTO;
131 static bool arg_link_journal_try = false;
132 static uint64_t arg_retain =
133         (1ULL << CAP_CHOWN) |
134         (1ULL << CAP_DAC_OVERRIDE) |
135         (1ULL << CAP_DAC_READ_SEARCH) |
136         (1ULL << CAP_FOWNER) |
137         (1ULL << CAP_FSETID) |
138         (1ULL << CAP_IPC_OWNER) |
139         (1ULL << CAP_KILL) |
140         (1ULL << CAP_LEASE) |
141         (1ULL << CAP_LINUX_IMMUTABLE) |
142         (1ULL << CAP_NET_BIND_SERVICE) |
143         (1ULL << CAP_NET_BROADCAST) |
144         (1ULL << CAP_NET_RAW) |
145         (1ULL << CAP_SETGID) |
146         (1ULL << CAP_SETFCAP) |
147         (1ULL << CAP_SETPCAP) |
148         (1ULL << CAP_SETUID) |
149         (1ULL << CAP_SYS_ADMIN) |
150         (1ULL << CAP_SYS_CHROOT) |
151         (1ULL << CAP_SYS_NICE) |
152         (1ULL << CAP_SYS_PTRACE) |
153         (1ULL << CAP_SYS_TTY_CONFIG) |
154         (1ULL << CAP_SYS_RESOURCE) |
155         (1ULL << CAP_SYS_BOOT) |
156         (1ULL << CAP_AUDIT_WRITE) |
157         (1ULL << CAP_AUDIT_CONTROL) |
158         (1ULL << CAP_MKNOD);
159 static char **arg_bind = NULL;
160 static char **arg_bind_ro = NULL;
161 static char **arg_tmpfs = NULL;
162 static char **arg_setenv = NULL;
163 static bool arg_quiet = false;
164 static bool arg_share_system = false;
165 static bool arg_register = true;
166 static bool arg_keep_unit = false;
167 static char **arg_network_interfaces = NULL;
168 static char **arg_network_macvlan = NULL;
169 static bool arg_network_veth = false;
170 static const char *arg_network_bridge = NULL;
171 static unsigned long arg_personality = 0xffffffffLU;
172 static char *arg_image = NULL;
173 static Volatile arg_volatile = VOLATILE_NO;
174
175 static void help(void) {
176         printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
177                "Spawn a minimal namespace container for debugging, testing and building.\n\n"
178                "  -h --help                 Show this help\n"
179                "     --version              Print version string\n"
180                "  -q --quiet                Do not show status information\n"
181                "  -D --directory=PATH       Root directory for the container\n"
182                "     --template=PATH        Initialize root directory from template directory,\n"
183                "                            if missing\n"
184                "  -x --ephemeral            Run container with snapshot of root directory, and\n"
185                "                            remove it after exit\n"
186                "  -i --image=PATH           File system device or disk image for the container\n"
187                "  -b --boot                 Boot up full system (i.e. invoke init)\n"
188                "  -u --user=USER            Run the command under specified user or uid\n"
189                "  -M --machine=NAME         Set the machine name for the container\n"
190                "     --uuid=UUID            Set a specific machine UUID for the container\n"
191                "  -S --slice=SLICE          Place the container in the specified slice\n"
192                "     --private-network      Disable network in container\n"
193                "     --network-interface=INTERFACE\n"
194                "                            Assign an existing network interface to the\n"
195                "                            container\n"
196                "     --network-macvlan=INTERFACE\n"
197                "                            Create a macvlan network interface based on an\n"
198                "                            existing network interface to the container\n"
199                "     --network-veth         Add a virtual ethernet connection between host\n"
200                "                            and container\n"
201                "     --network-bridge=INTERFACE\n"
202                "                            Add a virtual ethernet connection between host\n"
203                "                            and container and add it to an existing bridge on\n"
204                "                            the host\n"
205                "  -Z --selinux-context=SECLABEL\n"
206                "                            Set the SELinux security context to be used by\n"
207                "                            processes in the container\n"
208                "  -L --selinux-apifs-context=SECLABEL\n"
209                "                            Set the SELinux security context to be used by\n"
210                "                            API/tmpfs file systems in the container\n"
211                "     --capability=CAP       In addition to the default, retain specified\n"
212                "                            capability\n"
213                "     --drop-capability=CAP  Drop the specified capability from the default set\n"
214                "     --link-journal=MODE    Link up guest journal, one of no, auto, guest, host,\n"
215                "                            try-guest, try-host\n"
216                "  -j                        Equivalent to --link-journal=try-guest\n"
217                "     --read-only            Mount the root directory read-only\n"
218                "     --bind=PATH[:PATH]     Bind mount a file or directory from the host into\n"
219                "                            the container\n"
220                "     --bind-ro=PATH[:PATH]  Similar, but creates a read-only bind mount\n"
221                "     --tmpfs=PATH:[OPTIONS] Mount an empty tmpfs to the specified directory\n"
222                "     --setenv=NAME=VALUE    Pass an environment variable to PID 1\n"
223                "     --share-system         Share system namespaces with host\n"
224                "     --register=BOOLEAN     Register container as machine\n"
225                "     --keep-unit            Do not register a scope for the machine, reuse\n"
226                "                            the service unit nspawn is running in\n"
227                "     --volatile[=MODE]      Run the system in volatile mode\n",
228                program_invocation_short_name);
229 }
230
/*
 * Replace *b with a cleaned-up, absolute form of path.
 *
 * The path is first resolved with canonicalize_file_name(). If that fails
 * only because the path does not exist (ENOENT), the path is instead made
 * absolute relative to the current working directory. The result is passed
 * through path_kill_slashes() and stored in *b; the previous value of *b is
 * freed.
 *
 * Returns 0 on success, -errno if canonicalization fails for any reason other
 * than ENOENT, and -ENOMEM if path_make_absolute_cwd() returns NULL. On
 * failure *b is left untouched.
 */
static int set_sanitized_path(char **b, const char *path) {
        char *resolved;

        assert(b);
        assert(path);

        resolved = canonicalize_file_name(path);

        /* Any failure other than "does not exist" is reported to the caller. */
        if (!resolved && errno != ENOENT)
                return -errno;

        /* Non-existing path: fall back to a purely lexical absolute path. */
        if (!resolved) {
                resolved = path_make_absolute_cwd(path);
                if (!resolved)
                        return -ENOMEM;
        }

        free(*b);
        *b = path_kill_slashes(resolved);

        return 0;
}
251
252 static int parse_argv(int argc, char *argv[]) {
253
254         enum {
255                 ARG_VERSION = 0x100,
256                 ARG_PRIVATE_NETWORK,
257                 ARG_UUID,
258                 ARG_READ_ONLY,
259                 ARG_CAPABILITY,
260                 ARG_DROP_CAPABILITY,
261                 ARG_LINK_JOURNAL,
262                 ARG_BIND,
263                 ARG_BIND_RO,
264                 ARG_TMPFS,
265                 ARG_SETENV,
266                 ARG_SHARE_SYSTEM,
267                 ARG_REGISTER,
268                 ARG_KEEP_UNIT,
269                 ARG_NETWORK_INTERFACE,
270                 ARG_NETWORK_MACVLAN,
271                 ARG_NETWORK_VETH,
272                 ARG_NETWORK_BRIDGE,
273                 ARG_PERSONALITY,
274                 ARG_VOLATILE,
275                 ARG_TEMPLATE,
276         };
277
278         static const struct option options[] = {
279                 { "help",                  no_argument,       NULL, 'h'                   },
280                 { "version",               no_argument,       NULL, ARG_VERSION           },
281                 { "directory",             required_argument, NULL, 'D'                   },
282                 { "template",              required_argument, NULL, ARG_TEMPLATE          },
283                 { "ephemeral",             no_argument,       NULL, 'x'                   },
284                 { "user",                  required_argument, NULL, 'u'                   },
285                 { "private-network",       no_argument,       NULL, ARG_PRIVATE_NETWORK   },
286                 { "boot",                  no_argument,       NULL, 'b'                   },
287                 { "uuid",                  required_argument, NULL, ARG_UUID              },
288                 { "read-only",             no_argument,       NULL, ARG_READ_ONLY         },
289                 { "capability",            required_argument, NULL, ARG_CAPABILITY        },
290                 { "drop-capability",       required_argument, NULL, ARG_DROP_CAPABILITY   },
291                 { "link-journal",          required_argument, NULL, ARG_LINK_JOURNAL      },
292                 { "bind",                  required_argument, NULL, ARG_BIND              },
293                 { "bind-ro",               required_argument, NULL, ARG_BIND_RO           },
294                 { "tmpfs",                 required_argument, NULL, ARG_TMPFS             },
295                 { "machine",               required_argument, NULL, 'M'                   },
296                 { "slice",                 required_argument, NULL, 'S'                   },
297                 { "setenv",                required_argument, NULL, ARG_SETENV            },
298                 { "selinux-context",       required_argument, NULL, 'Z'                   },
299                 { "selinux-apifs-context", required_argument, NULL, 'L'                   },
300                 { "quiet",                 no_argument,       NULL, 'q'                   },
301                 { "share-system",          no_argument,       NULL, ARG_SHARE_SYSTEM      },
302                 { "register",              required_argument, NULL, ARG_REGISTER          },
303                 { "keep-unit",             no_argument,       NULL, ARG_KEEP_UNIT         },
304                 { "network-interface",     required_argument, NULL, ARG_NETWORK_INTERFACE },
305                 { "network-macvlan",       required_argument, NULL, ARG_NETWORK_MACVLAN   },
306                 { "network-veth",          no_argument,       NULL, ARG_NETWORK_VETH      },
307                 { "network-bridge",        required_argument, NULL, ARG_NETWORK_BRIDGE    },
308                 { "personality",           required_argument, NULL, ARG_PERSONALITY       },
309                 { "image",                 required_argument, NULL, 'i'                   },
310                 { "volatile",              optional_argument, NULL, ARG_VOLATILE          },
311                 {}
312         };
313
314         int c, r;
315         uint64_t plus = 0, minus = 0;
316
317         assert(argc >= 0);
318         assert(argv);
319
320         while ((c = getopt_long(argc, argv, "+hD:u:bL:M:jS:Z:qi:x", options, NULL)) >= 0)
321
322                 switch (c) {
323
324                 case 'h':
325                         help();
326                         return 0;
327
328                 case ARG_VERSION:
329                         puts(PACKAGE_STRING);
330                         puts(SYSTEMD_FEATURES);
331                         return 0;
332
333                 case 'D':
334                         r = set_sanitized_path(&arg_directory, optarg);
335                         if (r < 0)
336                                 return log_error_errno(r, "Invalid root directory: %m");
337
338                         break;
339
340                 case ARG_TEMPLATE:
341                         r = set_sanitized_path(&arg_template, optarg);
342                         if (r < 0)
343                                 return log_error_errno(r, "Invalid template directory: %m");
344
345                         break;
346
347                 case 'i':
348                         r = set_sanitized_path(&arg_image, optarg);
349                         if (r < 0)
350                                 return log_error_errno(r, "Invalid image path: %m");
351
352                         break;
353
354                 case 'x':
355                         arg_ephemeral = true;
356                         break;
357
358                 case 'u':
359                         free(arg_user);
360                         arg_user = strdup(optarg);
361                         if (!arg_user)
362                                 return log_oom();
363
364                         break;
365
366                 case ARG_NETWORK_BRIDGE:
367                         arg_network_bridge = optarg;
368
369                         /* fall through */
370
371                 case ARG_NETWORK_VETH:
372                         arg_network_veth = true;
373                         arg_private_network = true;
374                         break;
375
376                 case ARG_NETWORK_INTERFACE:
377                         if (strv_extend(&arg_network_interfaces, optarg) < 0)
378                                 return log_oom();
379
380                         arg_private_network = true;
381                         break;
382
383                 case ARG_NETWORK_MACVLAN:
384                         if (strv_extend(&arg_network_macvlan, optarg) < 0)
385                                 return log_oom();
386
387                         /* fall through */
388
389                 case ARG_PRIVATE_NETWORK:
390                         arg_private_network = true;
391                         break;
392
393                 case 'b':
394                         arg_boot = true;
395                         break;
396
397                 case ARG_UUID:
398                         r = sd_id128_from_string(optarg, &arg_uuid);
399                         if (r < 0) {
400                                 log_error("Invalid UUID: %s", optarg);
401                                 return r;
402                         }
403                         break;
404
405                 case 'S':
406                         arg_slice = optarg;
407                         break;
408
409                 case 'M':
410                         if (isempty(optarg)) {
411                                 free(arg_machine);
412                                 arg_machine = NULL;
413                         } else {
414                                 if (!machine_name_is_valid(optarg)) {
415                                         log_error("Invalid machine name: %s", optarg);
416                                         return -EINVAL;
417                                 }
418
419                                 r = free_and_strdup(&arg_machine, optarg);
420                                 if (r < 0)
421                                         return log_oom();
422
423                                 break;
424                         }
425
426                 case 'Z':
427                         arg_selinux_context = optarg;
428                         break;
429
430                 case 'L':
431                         arg_selinux_apifs_context = optarg;
432                         break;
433
434                 case ARG_READ_ONLY:
435                         arg_read_only = true;
436                         break;
437
438                 case ARG_CAPABILITY:
439                 case ARG_DROP_CAPABILITY: {
440                         const char *state, *word;
441                         size_t length;
442
443                         FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
444                                 _cleanup_free_ char *t;
445
446                                 t = strndup(word, length);
447                                 if (!t)
448                                         return log_oom();
449
450                                 if (streq(t, "all")) {
451                                         if (c == ARG_CAPABILITY)
452                                                 plus = (uint64_t) -1;
453                                         else
454                                                 minus = (uint64_t) -1;
455                                 } else {
456                                         int cap;
457
458                                         cap = capability_from_name(t);
459                                         if (cap < 0) {
460                                                 log_error("Failed to parse capability %s.", t);
461                                                 return -EINVAL;
462                                         }
463
464                                         if (c == ARG_CAPABILITY)
465                                                 plus |= 1ULL << (uint64_t) cap;
466                                         else
467                                                 minus |= 1ULL << (uint64_t) cap;
468                                 }
469                         }
470
471                         break;
472                 }
473
474                 case 'j':
475                         arg_link_journal = LINK_GUEST;
476                         arg_link_journal_try = true;
477                         break;
478
479                 case ARG_LINK_JOURNAL:
480                         if (streq(optarg, "auto"))
481                                 arg_link_journal = LINK_AUTO;
482                         else if (streq(optarg, "no"))
483                                 arg_link_journal = LINK_NO;
484                         else if (streq(optarg, "guest"))
485                                 arg_link_journal = LINK_GUEST;
486                         else if (streq(optarg, "host"))
487                                 arg_link_journal = LINK_HOST;
488                         else if (streq(optarg, "try-guest")) {
489                                 arg_link_journal = LINK_GUEST;
490                                 arg_link_journal_try = true;
491                         } else if (streq(optarg, "try-host")) {
492                                 arg_link_journal = LINK_HOST;
493                                 arg_link_journal_try = true;
494                         } else {
495                                 log_error("Failed to parse link journal mode %s", optarg);
496                                 return -EINVAL;
497                         }
498
499                         break;
500
501                 case ARG_BIND:
502                 case ARG_BIND_RO: {
503                         _cleanup_free_ char *a = NULL, *b = NULL;
504                         char *e;
505                         char ***x;
506
507                         x = c == ARG_BIND ? &arg_bind : &arg_bind_ro;
508
509                         e = strchr(optarg, ':');
510                         if (e) {
511                                 a = strndup(optarg, e - optarg);
512                                 b = strdup(e + 1);
513                         } else {
514                                 a = strdup(optarg);
515                                 b = strdup(optarg);
516                         }
517
518                         if (!a || !b)
519                                 return log_oom();
520
521                         if (!path_is_absolute(a) || !path_is_absolute(b)) {
522                                 log_error("Invalid bind mount specification: %s", optarg);
523                                 return -EINVAL;
524                         }
525
526                         r = strv_extend(x, a);
527                         if (r < 0)
528                                 return log_oom();
529
530                         r = strv_extend(x, b);
531                         if (r < 0)
532                                 return log_oom();
533
534                         break;
535                 }
536
537                 case ARG_TMPFS: {
538                         _cleanup_free_ char *a = NULL, *b = NULL;
539                         char *e;
540
541                         e = strchr(optarg, ':');
542                         if (e) {
543                                 a = strndup(optarg, e - optarg);
544                                 b = strdup(e + 1);
545                         } else {
546                                 a = strdup(optarg);
547                                 b = strdup("mode=0755");
548                         }
549
550                         if (!a || !b)
551                                 return log_oom();
552
553                         if (!path_is_absolute(a)) {
554                                 log_error("Invalid tmpfs specification: %s", optarg);
555                                 return -EINVAL;
556                         }
557
558                         r = strv_push(&arg_tmpfs, a);
559                         if (r < 0)
560                                 return log_oom();
561
562                         a = NULL;
563
564                         r = strv_push(&arg_tmpfs, b);
565                         if (r < 0)
566                                 return log_oom();
567
568                         b = NULL;
569
570                         break;
571                 }
572
573                 case ARG_SETENV: {
574                         char **n;
575
576                         if (!env_assignment_is_valid(optarg)) {
577                                 log_error("Environment variable assignment '%s' is not valid.", optarg);
578                                 return -EINVAL;
579                         }
580
581                         n = strv_env_set(arg_setenv, optarg);
582                         if (!n)
583                                 return log_oom();
584
585                         strv_free(arg_setenv);
586                         arg_setenv = n;
587                         break;
588                 }
589
590                 case 'q':
591                         arg_quiet = true;
592                         break;
593
594                 case ARG_SHARE_SYSTEM:
595                         arg_share_system = true;
596                         break;
597
598                 case ARG_REGISTER:
599                         r = parse_boolean(optarg);
600                         if (r < 0) {
601                                 log_error("Failed to parse --register= argument: %s", optarg);
602                                 return r;
603                         }
604
605                         arg_register = r;
606                         break;
607
608                 case ARG_KEEP_UNIT:
609                         arg_keep_unit = true;
610                         break;
611
612                 case ARG_PERSONALITY:
613
614                         arg_personality = personality_from_string(optarg);
615                         if (arg_personality == 0xffffffffLU) {
616                                 log_error("Unknown or unsupported personality '%s'.", optarg);
617                                 return -EINVAL;
618                         }
619
620                         break;
621
622                 case ARG_VOLATILE:
623
624                         if (!optarg)
625                                 arg_volatile = VOLATILE_YES;
626                         else {
627                                 r = parse_boolean(optarg);
628                                 if (r < 0) {
629                                         if (streq(optarg, "state"))
630                                                 arg_volatile = VOLATILE_STATE;
631                                         else {
632                                                 log_error("Failed to parse --volatile= argument: %s", optarg);
633                                                 return r;
634                                         }
635                                 } else
636                                         arg_volatile = r ? VOLATILE_YES : VOLATILE_NO;
637                         }
638
639                         break;
640
641                 case '?':
642                         return -EINVAL;
643
644                 default:
645                         assert_not_reached("Unhandled option");
646                 }
647
648         if (arg_share_system)
649                 arg_register = false;
650
651         if (arg_boot && arg_share_system) {
652                 log_error("--boot and --share-system may not be combined.");
653                 return -EINVAL;
654         }
655
656         if (arg_keep_unit && cg_pid_get_owner_uid(0, NULL) >= 0) {
657                 log_error("--keep-unit may not be used when invoked from a user session.");
658                 return -EINVAL;
659         }
660
661         if (arg_directory && arg_image) {
662                 log_error("--directory= and --image= may not be combined.");
663                 return -EINVAL;
664         }
665
666         if (arg_template && arg_image) {
667                 log_error("--template= and --image= may not be combined.");
668                 return -EINVAL;
669         }
670
671         if (arg_template && !(arg_directory || arg_machine)) {
672                 log_error("--template= needs --directory= or --machine=.");
673                 return -EINVAL;
674         }
675
676         if (arg_ephemeral && arg_template) {
677                 log_error("--ephemeral and --template= may not be combined.");
678                 return -EINVAL;
679         }
680
681         if (arg_ephemeral && arg_image) {
682                 log_error("--ephemeral and --image= may not be combined.");
683                 return -EINVAL;
684         }
685
686         if (arg_volatile != VOLATILE_NO && arg_read_only) {
687                 log_error("Cannot combine --read-only with --volatile. Note that --volatile already implies a read-only base hierarchy.");
688                 return -EINVAL;
689         }
690
691         arg_retain = (arg_retain | plus | (arg_private_network ? 1ULL << CAP_NET_ADMIN : 0)) & ~minus;
692
693         return 1;
694 }
695
696 static int mount_all(const char *dest) {
697
698         typedef struct MountPoint {
699                 const char *what;
700                 const char *where;
701                 const char *type;
702                 const char *options;
703                 unsigned long flags;
704                 bool fatal;
705         } MountPoint;
706
707         static const MountPoint mount_table[] = {
708                 { "proc",      "/proc",     "proc",  NULL,        MS_NOSUID|MS_NOEXEC|MS_NODEV,           true  },
709                 { "/proc/sys", "/proc/sys", NULL,    NULL,        MS_BIND,                                true  },   /* Bind mount first */
710                 { NULL,        "/proc/sys", NULL,    NULL,        MS_BIND|MS_RDONLY|MS_REMOUNT,           true  },   /* Then, make it r/o */
711                 { "sysfs",     "/sys",      "sysfs", NULL,        MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true  },
712                 { "tmpfs",     "/dev",      "tmpfs", "mode=755",  MS_NOSUID|MS_STRICTATIME,               true  },
713                 { "devpts",    "/dev/pts",  "devpts","newinstance,ptmxmode=0666,mode=620,gid=" STRINGIFY(TTY_GID), MS_NOSUID|MS_NOEXEC, true },
714                 { "tmpfs",     "/dev/shm",  "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME,      true  },
715                 { "tmpfs",     "/run",      "tmpfs", "mode=755",  MS_NOSUID|MS_NODEV|MS_STRICTATIME,      true  },
716 #ifdef HAVE_SELINUX
717                 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND,                              false },  /* Bind mount first */
718                 { NULL,              "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT,         false },  /* Then, make it r/o */
719 #endif
720         };
721
722         unsigned k;
723         int r = 0;
724
725         for (k = 0; k < ELEMENTSOF(mount_table); k++) {
726                 _cleanup_free_ char *where = NULL;
727 #ifdef HAVE_SELINUX
728                 _cleanup_free_ char *options = NULL;
729 #endif
730                 const char *o;
731                 int t;
732
733                 where = strjoin(dest, "/", mount_table[k].where, NULL);
734                 if (!where)
735                         return log_oom();
736
737                 t = path_is_mount_point(where, true);
738                 if (t < 0) {
739                         log_error_errno(t, "Failed to detect whether %s is a mount point: %m", where);
740
741                         if (r == 0)
742                                 r = t;
743
744                         continue;
745                 }
746
747                 /* Skip this entry if it is not a remount. */
748                 if (mount_table[k].what && t > 0)
749                         continue;
750
751                 t = mkdir_p(where, 0755);
752                 if (t < 0) {
753                         if (mount_table[k].fatal) {
754                                log_error_errno(t, "Failed to create directory %s: %m", where);
755
756                                 if (r == 0)
757                                         r = t;
758                         } else
759                                log_warning_errno(t, "Failed to create directory %s: %m", where);
760
761                         continue;
762                 }
763
764 #ifdef HAVE_SELINUX
765                 if (arg_selinux_apifs_context &&
766                     (streq_ptr(mount_table[k].what, "tmpfs") || streq_ptr(mount_table[k].what, "devpts"))) {
767                         options = strjoin(mount_table[k].options, ",context=\"", arg_selinux_apifs_context, "\"", NULL);
768                         if (!options)
769                                 return log_oom();
770
771                         o = options;
772                 } else
773 #endif
774                         o = mount_table[k].options;
775
776
777                 if (mount(mount_table[k].what,
778                           where,
779                           mount_table[k].type,
780                           mount_table[k].flags,
781                           o) < 0) {
782
783                         if (mount_table[k].fatal) {
784                                 log_error_errno(errno, "mount(%s) failed: %m", where);
785
786                                 if (r == 0)
787                                         r = -errno;
788                         } else
789                                 log_warning_errno(errno, "mount(%s) failed: %m", where);
790                 }
791         }
792
793         return r;
794 }
795
796 static int mount_binds(const char *dest, char **l, bool ro) {
797         char **x, **y;
798
799         STRV_FOREACH_PAIR(x, y, l) {
800                 _cleanup_free_ char *where = NULL;
801                 struct stat source_st, dest_st;
802                 int r;
803
804                 if (stat(*x, &source_st) < 0)
805                         return log_error_errno(errno, "Failed to stat %s: %m", *x);
806
807                 where = strappend(dest, *y);
808                 if (!where)
809                         return log_oom();
810
811                 r = stat(where, &dest_st);
812                 if (r == 0) {
813                         if ((source_st.st_mode & S_IFMT) != (dest_st.st_mode & S_IFMT)) {
814                                 log_error("The file types of %s and %s do not match. Refusing bind mount", *x, where);
815                                 return -EINVAL;
816                         }
817                 } else if (errno == ENOENT) {
818                         r = mkdir_parents_label(where, 0755);
819                         if (r < 0)
820                                 return log_error_errno(r, "Failed to bind mount %s: %m", *x);
821                 } else {
822                         log_error_errno(errno, "Failed to bind mount %s: %m", *x);
823                         return -errno;
824                 }
825
826                 /* Create the mount point, but be conservative -- refuse to create block
827                  * and char devices. */
828                 if (S_ISDIR(source_st.st_mode)) {
829                         r = mkdir_label(where, 0755);
830                         if (r < 0 && errno != EEXIST)
831                                 return log_error_errno(r, "Failed to create mount point %s: %m", where);
832                 } else if (S_ISFIFO(source_st.st_mode)) {
833                         r = mkfifo(where, 0644);
834                         if (r < 0 && errno != EEXIST)
835                                 return log_error_errno(errno, "Failed to create mount point %s: %m", where);
836                 } else if (S_ISSOCK(source_st.st_mode)) {
837                         r = mknod(where, 0644 | S_IFSOCK, 0);
838                         if (r < 0 && errno != EEXIST)
839                                 return log_error_errno(errno, "Failed to create mount point %s: %m", where);
840                 } else if (S_ISREG(source_st.st_mode)) {
841                         r = touch(where);
842                         if (r < 0)
843                                 return log_error_errno(r, "Failed to create mount point %s: %m", where);
844                 } else {
845                         log_error("Refusing to create mountpoint for file: %s", *x);
846                         return -ENOTSUP;
847                 }
848
849                 if (mount(*x, where, "bind", MS_BIND, NULL) < 0)
850                         return log_error_errno(errno, "mount(%s) failed: %m", where);
851
852                 if (ro) {
853                         r = bind_remount_recursive(where, true);
854                         if (r < 0)
855                                 return log_error_errno(r, "Read-Only bind mount failed: %m");
856                 }
857         }
858
859         return 0;
860 }
861
862 static int mount_tmpfs(const char *dest) {
863         char **i, **o;
864
865         STRV_FOREACH_PAIR(i, o, arg_tmpfs) {
866                 _cleanup_free_ char *where = NULL;
867                 int r;
868
869                 where = strappend(dest, *i);
870                 if (!where)
871                         return log_oom();
872
873                 r = mkdir_label(where, 0755);
874                 if (r < 0 && r != -EEXIST)
875                         return log_error_errno(r, "Creating mount point for tmpfs %s failed: %m", where);
876
877                 if (mount("tmpfs", where, "tmpfs", MS_NODEV|MS_STRICTATIME, *o) < 0)
878                         return log_error_errno(errno, "tmpfs mount to %s failed: %m", where);
879         }
880
881         return 0;
882 }
883
884 static int setup_timezone(const char *dest) {
885         _cleanup_free_ char *where = NULL, *p = NULL, *q = NULL, *check = NULL, *what = NULL;
886         char *z, *y;
887         int r;
888
889         assert(dest);
890
891         /* Fix the timezone, if possible */
892         r = readlink_malloc("/etc/localtime", &p);
893         if (r < 0) {
894                 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
895                 return 0;
896         }
897
898         z = path_startswith(p, "../usr/share/zoneinfo/");
899         if (!z)
900                 z = path_startswith(p, "/usr/share/zoneinfo/");
901         if (!z) {
902                 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
903                 return 0;
904         }
905
906         where = strappend(dest, "/etc/localtime");
907         if (!where)
908                 return log_oom();
909
910         r = readlink_malloc(where, &q);
911         if (r >= 0) {
912                 y = path_startswith(q, "../usr/share/zoneinfo/");
913                 if (!y)
914                         y = path_startswith(q, "/usr/share/zoneinfo/");
915
916                 /* Already pointing to the right place? Then do nothing .. */
917                 if (y && streq(y, z))
918                         return 0;
919         }
920
921         check = strjoin(dest, "/usr/share/zoneinfo/", z, NULL);
922         if (!check)
923                 return log_oom();
924
925         if (access(check, F_OK) < 0) {
926                 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
927                 return 0;
928         }
929
930         what = strappend("../usr/share/zoneinfo/", z);
931         if (!what)
932                 return log_oom();
933
934         r = mkdir_parents(where, 0755);
935         if (r < 0) {
936                 log_error_errno(r, "Failed to create directory for timezone info %s in container: %m", where);
937
938                 return 0;
939         }
940
941         r = unlink(where);
942         if (r < 0 && errno != ENOENT) {
943                 log_error_errno(errno, "Failed to remove existing timezone info %s in container: %m", where);
944
945                 return 0;
946         }
947
948         if (symlink(what, where) < 0) {
949                 log_error_errno(errno, "Failed to correct timezone of container: %m");
950                 return 0;
951         }
952
953         return 0;
954 }
955
956 static int setup_resolv_conf(const char *dest) {
957         _cleanup_free_ char *where = NULL;
958         int r;
959
960         assert(dest);
961
962         if (arg_private_network)
963                 return 0;
964
965         /* Fix resolv.conf, if possible */
966         where = strappend(dest, "/etc/resolv.conf");
967         if (!where)
968                 return log_oom();
969
970         /* We don't really care for the results of this really. If it
971          * fails, it fails, but meh... */
972         r = mkdir_parents(where, 0755);
973         if (r < 0) {
974                 log_warning_errno(r, "Failed to create parent directory for resolv.conf %s: %m", where);
975
976                 return 0;
977         }
978
979         r = copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW, 0644);
980         if (r < 0) {
981                 log_warning_errno(r, "Failed to copy /etc/resolv.conf to %s: %m", where);
982
983                 return 0;
984         }
985
986         return 0;
987 }
988
989 static int setup_volatile_state(const char *directory) {
990         const char *p;
991         int r;
992
993         assert(directory);
994
995         if (arg_volatile != VOLATILE_STATE)
996                 return 0;
997
998         /* --volatile=state means we simply overmount /var
999            with a tmpfs, and the rest read-only. */
1000
1001         r = bind_remount_recursive(directory, true);
1002         if (r < 0)
1003                 return log_error_errno(r, "Failed to remount %s read-only: %m", directory);
1004
1005         p = strappenda(directory, "/var");
1006         r = mkdir(p, 0755);
1007         if (r < 0 && errno != EEXIST)
1008                 return log_error_errno(errno, "Failed to create %s: %m", directory);
1009
1010         if (mount("tmpfs", p, "tmpfs", MS_STRICTATIME, "mode=755") < 0)
1011                 return log_error_errno(errno, "Failed to mount tmpfs to /var: %m");
1012
1013         return 0;
1014 }
1015
1016 static int setup_volatile(const char *directory) {
1017         bool tmpfs_mounted = false, bind_mounted = false;
1018         char template[] = "/tmp/nspawn-volatile-XXXXXX";
1019         const char *f, *t;
1020         int r;
1021
1022         assert(directory);
1023
1024         if (arg_volatile != VOLATILE_YES)
1025                 return 0;
1026
1027         /* --volatile=yes means we mount a tmpfs to the root dir, and
1028            the original /usr to use inside it, and that read-only. */
1029
1030         if (!mkdtemp(template))
1031                 return log_error_errno(errno, "Failed to create temporary directory: %m");
1032
1033         if (mount("tmpfs", template, "tmpfs", MS_STRICTATIME, "mode=755") < 0) {
1034                 log_error_errno(errno, "Failed to mount tmpfs for root directory: %m");
1035                 r = -errno;
1036                 goto fail;
1037         }
1038
1039         tmpfs_mounted = true;
1040
1041         f = strappenda(directory, "/usr");
1042         t = strappenda(template, "/usr");
1043
1044         r = mkdir(t, 0755);
1045         if (r < 0 && errno != EEXIST) {
1046                 log_error_errno(errno, "Failed to create %s: %m", t);
1047                 r = -errno;
1048                 goto fail;
1049         }
1050
1051         if (mount(f, t, "bind", MS_BIND|MS_REC, NULL) < 0) {
1052                 log_error_errno(errno, "Failed to create /usr bind mount: %m");
1053                 r = -errno;
1054                 goto fail;
1055         }
1056
1057         bind_mounted = true;
1058
1059         r = bind_remount_recursive(t, true);
1060         if (r < 0) {
1061                 log_error_errno(r, "Failed to remount %s read-only: %m", t);
1062                 goto fail;
1063         }
1064
1065         if (mount(template, directory, NULL, MS_MOVE, NULL) < 0) {
1066                 log_error_errno(errno, "Failed to move root mount: %m");
1067                 r = -errno;
1068                 goto fail;
1069         }
1070
1071         rmdir(template);
1072
1073         return 0;
1074
1075 fail:
1076         if (bind_mounted)
1077                 umount(t);
1078         if (tmpfs_mounted)
1079                 umount(template);
1080         rmdir(template);
1081         return r;
1082 }
1083
1084 static char* id128_format_as_uuid(sd_id128_t id, char s[37]) {
1085
1086         snprintf(s, 37,
1087                  "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
1088                  SD_ID128_FORMAT_VAL(id));
1089
1090         return s;
1091 }
1092
1093 static int setup_boot_id(const char *dest) {
1094         _cleanup_free_ char *from = NULL, *to = NULL;
1095         sd_id128_t rnd = {};
1096         char as_uuid[37];
1097         int r;
1098
1099         assert(dest);
1100
1101         if (arg_share_system)
1102                 return 0;
1103
1104         /* Generate a new randomized boot ID, so that each boot-up of
1105          * the container gets a new one */
1106
1107         from = strappend(dest, "/dev/proc-sys-kernel-random-boot-id");
1108         to = strappend(dest, "/proc/sys/kernel/random/boot_id");
1109         if (!from || !to)
1110                 return log_oom();
1111
1112         r = sd_id128_randomize(&rnd);
1113         if (r < 0)
1114                 return log_error_errno(r, "Failed to generate random boot id: %m");
1115
1116         id128_format_as_uuid(rnd, as_uuid);
1117
1118         r = write_string_file(from, as_uuid);
1119         if (r < 0)
1120                 return log_error_errno(r, "Failed to write boot id: %m");
1121
1122         if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
1123                 log_error_errno(errno, "Failed to bind mount boot id: %m");
1124                 r = -errno;
1125         } else if (mount(from, to, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL))
1126                 log_warning_errno(errno, "Failed to make boot id read-only: %m");
1127
1128         unlink(from);
1129         return r;
1130 }
1131
1132 static int copy_devnodes(const char *dest) {
1133
1134         static const char devnodes[] =
1135                 "null\0"
1136                 "zero\0"
1137                 "full\0"
1138                 "random\0"
1139                 "urandom\0"
1140                 "tty\0"
1141                 "net/tun\0";
1142
1143         const char *d;
1144         int r = 0;
1145         _cleanup_umask_ mode_t u;
1146
1147         assert(dest);
1148
1149         u = umask(0000);
1150
1151         NULSTR_FOREACH(d, devnodes) {
1152                 _cleanup_free_ char *from = NULL, *to = NULL;
1153                 struct stat st;
1154
1155                 from = strappend("/dev/", d);
1156                 to = strjoin(dest, "/dev/", d, NULL);
1157                 if (!from || !to)
1158                         return log_oom();
1159
1160                 if (stat(from, &st) < 0) {
1161
1162                         if (errno != ENOENT)
1163                                 return log_error_errno(errno, "Failed to stat %s: %m", from);
1164
1165                 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
1166
1167                         log_error("%s is not a char or block device, cannot copy", from);
1168                         return -EIO;
1169
1170                 } else {
1171                         r = mkdir_parents(to, 0775);
1172                         if (r < 0) {
1173                                 log_error_errno(r, "Failed to create parent directory of %s: %m", to);
1174                                 return -r;
1175                         }
1176
1177                         if (mknod(to, st.st_mode, st.st_rdev) < 0)
1178                                 return log_error_errno(errno, "mknod(%s) failed: %m", dest);
1179                 }
1180         }
1181
1182         return r;
1183 }
1184
1185 static int setup_ptmx(const char *dest) {
1186         _cleanup_free_ char *p = NULL;
1187
1188         p = strappend(dest, "/dev/ptmx");
1189         if (!p)
1190                 return log_oom();
1191
1192         if (symlink("pts/ptmx", p) < 0)
1193                 return log_error_errno(errno, "Failed to create /dev/ptmx symlink: %m");
1194
1195         return 0;
1196 }
1197
1198 static int setup_dev_console(const char *dest, const char *console) {
1199         _cleanup_umask_ mode_t u;
1200         const char *to;
1201         struct stat st;
1202         int r;
1203
1204         assert(dest);
1205         assert(console);
1206
1207         u = umask(0000);
1208
1209         if (stat("/dev/null", &st) < 0)
1210                 return log_error_errno(errno, "Failed to stat /dev/null: %m");
1211
1212         r = chmod_and_chown(console, 0600, 0, 0);
1213         if (r < 0)
1214                 return log_error_errno(r, "Failed to correct access mode for TTY: %m");
1215
1216         /* We need to bind mount the right tty to /dev/console since
1217          * ptys can only exist on pts file systems. To have something
1218          * to bind mount things on we create a device node first, and
1219          * use /dev/null for that since we the cgroups device policy
1220          * allows us to create that freely, while we cannot create
1221          * /dev/console. (Note that the major minor doesn't actually
1222          * matter here, since we mount it over anyway). */
1223
1224         to = strappenda(dest, "/dev/console");
1225         if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0)
1226                 return log_error_errno(errno, "mknod() for /dev/console failed: %m");
1227
1228         if (mount(console, to, "bind", MS_BIND, NULL) < 0)
1229                 return log_error_errno(errno, "Bind mount for /dev/console failed: %m");
1230
1231         return 0;
1232 }
1233
1234 static int setup_kmsg(const char *dest, int kmsg_socket) {
1235         _cleanup_free_ char *from = NULL, *to = NULL;
1236         int r, fd, k;
1237         _cleanup_umask_ mode_t u;
1238         union {
1239                 struct cmsghdr cmsghdr;
1240                 uint8_t buf[CMSG_SPACE(sizeof(int))];
1241         } control = {};
1242         struct msghdr mh = {
1243                 .msg_control = &control,
1244                 .msg_controllen = sizeof(control),
1245         };
1246         struct cmsghdr *cmsg;
1247
1248         assert(dest);
1249         assert(kmsg_socket >= 0);
1250
1251         u = umask(0000);
1252
1253         /* We create the kmsg FIFO as /dev/kmsg, but immediately
1254          * delete it after bind mounting it to /proc/kmsg. While FIFOs
1255          * on the reading side behave very similar to /proc/kmsg,
1256          * their writing side behaves differently from /dev/kmsg in
1257          * that writing blocks when nothing is reading. In order to
1258          * avoid any problems with containers deadlocking due to this
1259          * we simply make /dev/kmsg unavailable to the container. */
1260         if (asprintf(&from, "%s/dev/kmsg", dest) < 0 ||
1261             asprintf(&to, "%s/proc/kmsg", dest) < 0)
1262                 return log_oom();
1263
1264         if (mkfifo(from, 0600) < 0)
1265                 return log_error_errno(errno, "mkfifo() for /dev/kmsg failed: %m");
1266
1267         r = chmod_and_chown(from, 0600, 0, 0);
1268         if (r < 0)
1269                 return log_error_errno(r, "Failed to correct access mode for /dev/kmsg: %m");
1270
1271         if (mount(from, to, "bind", MS_BIND, NULL) < 0)
1272                 return log_error_errno(errno, "Bind mount for /proc/kmsg failed: %m");
1273
1274         fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
1275         if (fd < 0)
1276                 return log_error_errno(errno, "Failed to open fifo: %m");
1277
1278         cmsg = CMSG_FIRSTHDR(&mh);
1279         cmsg->cmsg_level = SOL_SOCKET;
1280         cmsg->cmsg_type = SCM_RIGHTS;
1281         cmsg->cmsg_len = CMSG_LEN(sizeof(int));
1282         memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
1283
1284         mh.msg_controllen = cmsg->cmsg_len;
1285
1286         /* Store away the fd in the socket, so that it stays open as
1287          * long as we run the child */
1288         k = sendmsg(kmsg_socket, &mh, MSG_DONTWAIT|MSG_NOSIGNAL);
1289         safe_close(fd);
1290
1291         if (k < 0)
1292                 return log_error_errno(errno, "Failed to send FIFO fd: %m");
1293
1294         /* And now make the FIFO unavailable as /dev/kmsg... */
1295         unlink(from);
1296         return 0;
1297 }
1298
1299 static int setup_hostname(void) {
1300
1301         if (arg_share_system)
1302                 return 0;
1303
1304         if (sethostname_idempotent(arg_machine) < 0)
1305                 return -errno;
1306
1307         return 0;
1308 }
1309
1310 static int setup_journal(const char *directory) {
1311         sd_id128_t machine_id, this_id;
1312         _cleanup_free_ char *p = NULL, *b = NULL, *q = NULL, *d = NULL;
1313         char *id;
1314         int r;
1315
1316         p = strappend(directory, "/etc/machine-id");
1317         if (!p)
1318                 return log_oom();
1319
1320         r = read_one_line_file(p, &b);
1321         if (r == -ENOENT && arg_link_journal == LINK_AUTO)
1322                 return 0;
1323         else if (r < 0)
1324                 return log_error_errno(r, "Failed to read machine ID from %s: %m", p);
1325
1326         id = strstrip(b);
1327         if (isempty(id) && arg_link_journal == LINK_AUTO)
1328                 return 0;
1329
1330         /* Verify validity */
1331         r = sd_id128_from_string(id, &machine_id);
1332         if (r < 0)
1333                 return log_error_errno(r, "Failed to parse machine ID from %s: %m", p);
1334
1335         r = sd_id128_get_machine(&this_id);
1336         if (r < 0)
1337                 return log_error_errno(r, "Failed to retrieve machine ID: %m");
1338
1339         if (sd_id128_equal(machine_id, this_id)) {
1340                 log_full(arg_link_journal == LINK_AUTO ? LOG_WARNING : LOG_ERR,
1341                          "Host and machine ids are equal (%s): refusing to link journals", id);
1342                 if (arg_link_journal == LINK_AUTO)
1343                         return 0;
1344                 return
1345                         -EEXIST;
1346         }
1347
1348         if (arg_link_journal == LINK_NO)
1349                 return 0;
1350
1351         free(p);
1352         p = strappend("/var/log/journal/", id);
1353         q = strjoin(directory, "/var/log/journal/", id, NULL);
1354         if (!p || !q)
1355                 return log_oom();
1356
1357         if (path_is_mount_point(p, false) > 0) {
1358                 if (arg_link_journal != LINK_AUTO) {
1359                         log_error("%s: already a mount point, refusing to use for journal", p);
1360                         return -EEXIST;
1361                 }
1362
1363                 return 0;
1364         }
1365
1366         if (path_is_mount_point(q, false) > 0) {
1367                 if (arg_link_journal != LINK_AUTO) {
1368                         log_error("%s: already a mount point, refusing to use for journal", q);
1369                         return -EEXIST;
1370                 }
1371
1372                 return 0;
1373         }
1374
1375         r = readlink_and_make_absolute(p, &d);
1376         if (r >= 0) {
1377                 if ((arg_link_journal == LINK_GUEST ||
1378                      arg_link_journal == LINK_AUTO) &&
1379                     path_equal(d, q)) {
1380
1381                         r = mkdir_p(q, 0755);
1382                         if (r < 0)
1383                                 log_warning_errno(errno, "Failed to create directory %s: %m", q);
1384                         return 0;
1385                 }
1386
1387                 if (unlink(p) < 0)
1388                         return log_error_errno(errno, "Failed to remove symlink %s: %m", p);
1389         } else if (r == -EINVAL) {
1390
1391                 if (arg_link_journal == LINK_GUEST &&
1392                     rmdir(p) < 0) {
1393
1394                         if (errno == ENOTDIR) {
1395                                 log_error("%s already exists and is neither a symlink nor a directory", p);
1396                                 return r;
1397                         } else {
1398                                 log_error_errno(errno, "Failed to remove %s: %m", p);
1399                                 return -errno;
1400                         }
1401                 }
1402         } else if (r != -ENOENT) {
1403                 log_error_errno(errno, "readlink(%s) failed: %m", p);
1404                 return r;
1405         }
1406
1407         if (arg_link_journal == LINK_GUEST) {
1408
1409                 if (symlink(q, p) < 0) {
1410                         if (arg_link_journal_try) {
1411                                 log_debug_errno(errno, "Failed to symlink %s to %s, skipping journal setup: %m", q, p);
1412                                 return 0;
1413                         } else {
1414                                 log_error_errno(errno, "Failed to symlink %s to %s: %m", q, p);
1415                                 return -errno;
1416                         }
1417                 }
1418
1419                 r = mkdir_p(q, 0755);
1420                 if (r < 0)
1421                         log_warning_errno(errno, "Failed to create directory %s: %m", q);
1422                 return 0;
1423         }
1424
1425         if (arg_link_journal == LINK_HOST) {
1426                 /* don't create parents here -- if the host doesn't have
1427                  * permanent journal set up, don't force it here */
1428                 r = mkdir(p, 0755);
1429                 if (r < 0) {
1430                         if (arg_link_journal_try) {
1431                                 log_debug_errno(errno, "Failed to create %s, skipping journal setup: %m", p);
1432                                 return 0;
1433                         } else {
1434                                 log_error_errno(errno, "Failed to create %s: %m", p);
1435                                 return r;
1436                         }
1437                 }
1438
1439         } else if (access(p, F_OK) < 0)
1440                 return 0;
1441
1442         if (dir_is_empty(q) == 0)
1443                 log_warning("%s is not empty, proceeding anyway.", q);
1444
1445         r = mkdir_p(q, 0755);
1446         if (r < 0) {
1447                 log_error_errno(errno, "Failed to create %s: %m", q);
1448                 return r;
1449         }
1450
1451         if (mount(p, q, "bind", MS_BIND, NULL) < 0)
1452                 return log_error_errno(errno, "Failed to bind mount journal from host into guest: %m");
1453
1454         return 0;
1455 }
1456
1457 static int drop_capabilities(void) {
1458         return capability_bounding_set_drop(~arg_retain, false);
1459 }
1460
1461 static int register_machine(pid_t pid, int local_ifindex) {
1462         _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1463         _cleanup_bus_close_unref_ sd_bus *bus = NULL;
1464         int r;
1465
1466         if (!arg_register)
1467                 return 0;
1468
1469         r = sd_bus_default_system(&bus);
1470         if (r < 0)
1471                 return log_error_errno(r, "Failed to open system bus: %m");
1472
1473         if (arg_keep_unit) {
1474                 r = sd_bus_call_method(
1475                                 bus,
1476                                 "org.freedesktop.machine1",
1477                                 "/org/freedesktop/machine1",
1478                                 "org.freedesktop.machine1.Manager",
1479                                 "RegisterMachineWithNetwork",
1480                                 &error,
1481                                 NULL,
1482                                 "sayssusai",
1483                                 arg_machine,
1484                                 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1485                                 "nspawn",
1486                                 "container",
1487                                 (uint32_t) pid,
1488                                 strempty(arg_directory),
1489                                 local_ifindex > 0 ? 1 : 0, local_ifindex);
1490         } else {
1491                 _cleanup_bus_message_unref_ sd_bus_message *m = NULL;
1492
1493                 r = sd_bus_message_new_method_call(
1494                                 bus,
1495                                 &m,
1496                                 "org.freedesktop.machine1",
1497                                 "/org/freedesktop/machine1",
1498                                 "org.freedesktop.machine1.Manager",
1499                                 "CreateMachineWithNetwork");
1500                 if (r < 0)
1501                         return log_error_errno(r, "Failed to create message: %m");
1502
1503                 r = sd_bus_message_append(
1504                                 m,
1505                                 "sayssusai",
1506                                 arg_machine,
1507                                 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1508                                 "nspawn",
1509                                 "container",
1510                                 (uint32_t) pid,
1511                                 strempty(arg_directory),
1512                                 local_ifindex > 0 ? 1 : 0, local_ifindex);
1513                 if (r < 0)
1514                         return log_error_errno(r, "Failed to append message arguments: %m");
1515
1516                 r = sd_bus_message_open_container(m, 'a', "(sv)");
1517                 if (r < 0)
1518                         return log_error_errno(r, "Failed to open container: %m");
1519
1520                 if (!isempty(arg_slice)) {
1521                         r = sd_bus_message_append(m, "(sv)", "Slice", "s", arg_slice);
1522                         if (r < 0)
1523                                 return log_error_errno(r, "Failed to append slice: %m");
1524                 }
1525
1526                 r = sd_bus_message_append(m, "(sv)", "DevicePolicy", "s", "strict");
1527                 if (r < 0)
1528                         return log_error_errno(r, "Failed to add device policy: %m");
1529
1530                 r = sd_bus_message_append(m, "(sv)", "DeviceAllow", "a(ss)", 9,
1531                                           /* Allow the container to
1532                                            * access and create the API
1533                                            * device nodes, so that
1534                                            * PrivateDevices= in the
1535                                            * container can work
1536                                            * fine */
1537                                           "/dev/null", "rwm",
1538                                           "/dev/zero", "rwm",
1539                                           "/dev/full", "rwm",
1540                                           "/dev/random", "rwm",
1541                                           "/dev/urandom", "rwm",
1542                                           "/dev/tty", "rwm",
1543                                           "/dev/net/tun", "rwm",
1544                                           /* Allow the container
1545                                            * access to ptys. However,
1546                                            * do not permit the
1547                                            * container to ever create
1548                                            * these device nodes. */
1549                                           "/dev/pts/ptmx", "rw",
1550                                           "char-pts", "rw");
1551                 if (r < 0)
1552                         return log_error_errno(r, "Failed to add device whitelist: %m");
1553
1554                 r = sd_bus_message_close_container(m);
1555                 if (r < 0)
1556                         return log_error_errno(r, "Failed to close container: %m");
1557
1558                 r = sd_bus_call(bus, m, 0, &error, NULL);
1559         }
1560
1561         if (r < 0) {
1562                 log_error("Failed to register machine: %s", bus_error_message(&error, r));
1563                 return r;
1564         }
1565
1566         return 0;
1567 }
1568
1569 static int terminate_machine(pid_t pid) {
1570         _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1571         _cleanup_bus_message_unref_ sd_bus_message *reply = NULL;
1572         _cleanup_bus_close_unref_ sd_bus *bus = NULL;
1573         const char *path;
1574         int r;
1575
1576         if (!arg_register)
1577                 return 0;
1578
1579         r = sd_bus_default_system(&bus);
1580         if (r < 0)
1581                 return log_error_errno(r, "Failed to open system bus: %m");
1582
1583         r = sd_bus_call_method(
1584                         bus,
1585                         "org.freedesktop.machine1",
1586                         "/org/freedesktop/machine1",
1587                         "org.freedesktop.machine1.Manager",
1588                         "GetMachineByPID",
1589                         &error,
1590                         &reply,
1591                         "u",
1592                         (uint32_t) pid);
1593         if (r < 0) {
1594                 /* Note that the machine might already have been
1595                  * cleaned up automatically, hence don't consider it a
1596                  * failure if we cannot get the machine object. */
1597                 log_debug("Failed to get machine: %s", bus_error_message(&error, r));
1598                 return 0;
1599         }
1600
1601         r = sd_bus_message_read(reply, "o", &path);
1602         if (r < 0)
1603                 return bus_log_parse_error(r);
1604
1605         r = sd_bus_call_method(
1606                         bus,
1607                         "org.freedesktop.machine1",
1608                         path,
1609                         "org.freedesktop.machine1.Machine",
1610                         "Terminate",
1611                         &error,
1612                         NULL,
1613                         NULL);
1614         if (r < 0) {
1615                 log_debug("Failed to terminate machine: %s", bus_error_message(&error, r));
1616                 return 0;
1617         }
1618
1619         return 0;
1620 }
1621
1622 static int reset_audit_loginuid(void) {
1623         _cleanup_free_ char *p = NULL;
1624         int r;
1625
1626         if (arg_share_system)
1627                 return 0;
1628
1629         r = read_one_line_file("/proc/self/loginuid", &p);
1630         if (r == -ENOENT)
1631                 return 0;
1632         if (r < 0)
1633                 return log_error_errno(r, "Failed to read /proc/self/loginuid: %m");
1634
1635         /* Already reset? */
1636         if (streq(p, "4294967295"))
1637                 return 0;
1638
1639         r = write_string_file("/proc/self/loginuid", "4294967295");
1640         if (r < 0) {
1641                 log_error("Failed to reset audit login UID. This probably means that your kernel is too\n"
1642                           "old and you have audit enabled. Note that the auditing subsystem is known to\n"
1643                           "be incompatible with containers on old kernels. Please make sure to upgrade\n"
1644                           "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
1645                           "using systemd-nspawn. Sleeping for 5s... (%s)\n", strerror(-r));
1646
1647                 sleep(5);
1648         }
1649
1650         return 0;
1651 }
1652
1653 #define HOST_HASH_KEY SD_ID128_MAKE(1a,37,6f,c7,46,ec,45,0b,ad,a3,d5,31,06,60,5d,b1)
1654 #define CONTAINER_HASH_KEY SD_ID128_MAKE(c3,c4,f9,19,b5,57,b2,1c,e6,cf,14,27,03,9c,ee,a2)
1655 #define MACVLAN_HASH_KEY SD_ID128_MAKE(00,13,6d,bc,66,83,44,81,bb,0c,f9,51,1f,24,a6,6f)
1656
1657 static int generate_mac(struct ether_addr *mac, sd_id128_t hash_key, uint64_t idx) {
1658         uint8_t result[8];
1659         size_t l, sz;
1660         uint8_t *v, *i;
1661         int r;
1662
1663         l = strlen(arg_machine);
1664         sz = sizeof(sd_id128_t) + l;
1665         if (idx > 0)
1666                 sz += sizeof(idx);
1667
1668         v = alloca(sz);
1669
1670         /* fetch some persistent data unique to the host */
1671         r = sd_id128_get_machine((sd_id128_t*) v);
1672         if (r < 0)
1673                 return r;
1674
1675         /* combine with some data unique (on this host) to this
1676          * container instance */
1677         i = mempcpy(v + sizeof(sd_id128_t), arg_machine, l);
1678         if (idx > 0) {
1679                 idx = htole64(idx);
1680                 memcpy(i, &idx, sizeof(idx));
1681         }
1682
1683         /* Let's hash the host machine ID plus the container name. We
1684          * use a fixed, but originally randomly created hash key here. */
1685         siphash24(result, v, sz, hash_key.bytes);
1686
1687         assert_cc(ETH_ALEN <= sizeof(result));
1688         memcpy(mac->ether_addr_octet, result, ETH_ALEN);
1689
1690         /* see eth_random_addr in the kernel */
1691         mac->ether_addr_octet[0] &= 0xfe;        /* clear multicast bit */
1692         mac->ether_addr_octet[0] |= 0x02;        /* set local assignment bit (IEEE802) */
1693
1694         return 0;
1695 }
1696
1697 static int setup_veth(pid_t pid, char iface_name[IFNAMSIZ], int *ifi) {
1698         _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1699         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1700         struct ether_addr mac_host, mac_container;
1701         int r, i;
1702
1703         if (!arg_private_network)
1704                 return 0;
1705
1706         if (!arg_network_veth)
1707                 return 0;
1708
1709         /* Use two different interface name prefixes depending whether
1710          * we are in bridge mode or not. */
1711         snprintf(iface_name, IFNAMSIZ - 1, "%s-%s",
1712                  arg_network_bridge ? "vb" : "ve", arg_machine);
1713
1714         r = generate_mac(&mac_container, CONTAINER_HASH_KEY, 0);
1715         if (r < 0)
1716                 return log_error_errno(r, "Failed to generate predictable MAC address for container side: %m");
1717
1718         r = generate_mac(&mac_host, HOST_HASH_KEY, 0);
1719         if (r < 0)
1720                 return log_error_errno(r, "Failed to generate predictable MAC address for host side: %m");
1721
1722         r = sd_rtnl_open(&rtnl, 0);
1723         if (r < 0)
1724                 return log_error_errno(r, "Failed to connect to netlink: %m");
1725
1726         r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
1727         if (r < 0)
1728                 return log_error_errno(r, "Failed to allocate netlink message: %m");
1729
1730         r = sd_rtnl_message_append_string(m, IFLA_IFNAME, iface_name);
1731         if (r < 0)
1732                 return log_error_errno(r, "Failed to add netlink interface name: %m");
1733
1734         r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac_host);
1735         if (r < 0)
1736                 return log_error_errno(r, "Failed to add netlink MAC address: %m");
1737
1738         r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
1739         if (r < 0)
1740                 return log_error_errno(r, "Failed to open netlink container: %m");
1741
1742         r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "veth");
1743         if (r < 0)
1744                 return log_error_errno(r, "Failed to open netlink container: %m");
1745
1746         r = sd_rtnl_message_open_container(m, VETH_INFO_PEER);
1747         if (r < 0)
1748                 return log_error_errno(r, "Failed to open netlink container: %m");
1749
1750         r = sd_rtnl_message_append_string(m, IFLA_IFNAME, "host0");
1751         if (r < 0)
1752                 return log_error_errno(r, "Failed to add netlink interface name: %m");
1753
1754         r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac_container);
1755         if (r < 0)
1756                 return log_error_errno(r, "Failed to add netlink MAC address: %m");
1757
1758         r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
1759         if (r < 0)
1760                 return log_error_errno(r, "Failed to add netlink namespace field: %m");
1761
1762         r = sd_rtnl_message_close_container(m);
1763         if (r < 0)
1764                 return log_error_errno(r, "Failed to close netlink container: %m");
1765
1766         r = sd_rtnl_message_close_container(m);
1767         if (r < 0)
1768                 return log_error_errno(r, "Failed to close netlink container: %m");
1769
1770         r = sd_rtnl_message_close_container(m);
1771         if (r < 0)
1772                 return log_error_errno(r, "Failed to close netlink container: %m");
1773
1774         r = sd_rtnl_call(rtnl, m, 0, NULL);
1775         if (r < 0)
1776                 return log_error_errno(r, "Failed to add new veth interfaces: %m");
1777
1778         i = (int) if_nametoindex(iface_name);
1779         if (i <= 0)
1780                 return log_error_errno(errno, "Failed to resolve interface %s: %m", iface_name);
1781
1782         *ifi = i;
1783
1784         return 0;
1785 }
1786
1787 static int setup_bridge(const char veth_name[], int *ifi) {
1788         _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1789         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1790         int r, bridge;
1791
1792         if (!arg_private_network)
1793                 return 0;
1794
1795         if (!arg_network_veth)
1796                 return 0;
1797
1798         if (!arg_network_bridge)
1799                 return 0;
1800
1801         bridge = (int) if_nametoindex(arg_network_bridge);
1802         if (bridge <= 0)
1803                 return log_error_errno(errno, "Failed to resolve interface %s: %m", arg_network_bridge);
1804
1805         *ifi = bridge;
1806
1807         r = sd_rtnl_open(&rtnl, 0);
1808         if (r < 0)
1809                 return log_error_errno(r, "Failed to connect to netlink: %m");
1810
1811         r = sd_rtnl_message_new_link(rtnl, &m, RTM_SETLINK, 0);
1812         if (r < 0)
1813                 return log_error_errno(r, "Failed to allocate netlink message: %m");
1814
1815         r = sd_rtnl_message_link_set_flags(m, IFF_UP, IFF_UP);
1816         if (r < 0)
1817                 return log_error_errno(r, "Failed to set IFF_UP flag: %m");
1818
1819         r = sd_rtnl_message_append_string(m, IFLA_IFNAME, veth_name);
1820         if (r < 0)
1821                 return log_error_errno(r, "Failed to add netlink interface name field: %m");
1822
1823         r = sd_rtnl_message_append_u32(m, IFLA_MASTER, bridge);
1824         if (r < 0)
1825                 return log_error_errno(r, "Failed to add netlink master field: %m");
1826
1827         r = sd_rtnl_call(rtnl, m, 0, NULL);
1828         if (r < 0)
1829                 return log_error_errno(r, "Failed to add veth interface to bridge: %m");
1830
1831         return 0;
1832 }
1833
1834 static int parse_interface(struct udev *udev, const char *name) {
1835         _cleanup_udev_device_unref_ struct udev_device *d = NULL;
1836         char ifi_str[2 + DECIMAL_STR_MAX(int)];
1837         int ifi;
1838
1839         ifi = (int) if_nametoindex(name);
1840         if (ifi <= 0)
1841                 return log_error_errno(errno, "Failed to resolve interface %s: %m", name);
1842
1843         sprintf(ifi_str, "n%i", ifi);
1844         d = udev_device_new_from_device_id(udev, ifi_str);
1845         if (!d)
1846                 return log_error_errno(errno, "Failed to get udev device for interface %s: %m", name);
1847
1848         if (udev_device_get_is_initialized(d) <= 0) {
1849                 log_error("Network interface %s is not initialized yet.", name);
1850                 return -EBUSY;
1851         }
1852
1853         return ifi;
1854 }
1855
1856 static int move_network_interfaces(pid_t pid) {
1857         _cleanup_udev_unref_ struct udev *udev = NULL;
1858         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1859         char **i;
1860         int r;
1861
1862         if (!arg_private_network)
1863                 return 0;
1864
1865         if (strv_isempty(arg_network_interfaces))
1866                 return 0;
1867
1868         r = sd_rtnl_open(&rtnl, 0);
1869         if (r < 0)
1870                 return log_error_errno(r, "Failed to connect to netlink: %m");
1871
1872         udev = udev_new();
1873         if (!udev) {
1874                 log_error("Failed to connect to udev.");
1875                 return -ENOMEM;
1876         }
1877
1878         STRV_FOREACH(i, arg_network_interfaces) {
1879                 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1880                 int ifi;
1881
1882                 ifi = parse_interface(udev, *i);
1883                 if (ifi < 0)
1884                         return ifi;
1885
1886                 r = sd_rtnl_message_new_link(rtnl, &m, RTM_SETLINK, ifi);
1887                 if (r < 0)
1888                         return log_error_errno(r, "Failed to allocate netlink message: %m");
1889
1890                 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
1891                 if (r < 0)
1892                         return log_error_errno(r, "Failed to append namespace PID to netlink message: %m");
1893
1894                 r = sd_rtnl_call(rtnl, m, 0, NULL);
1895                 if (r < 0)
1896                         return log_error_errno(r, "Failed to move interface %s to namespace: %m", *i);
1897         }
1898
1899         return 0;
1900 }
1901
1902 static int setup_macvlan(pid_t pid) {
1903         _cleanup_udev_unref_ struct udev *udev = NULL;
1904         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1905         unsigned idx = 0;
1906         char **i;
1907         int r;
1908
1909         if (!arg_private_network)
1910                 return 0;
1911
1912         if (strv_isempty(arg_network_macvlan))
1913                 return 0;
1914
1915         r = sd_rtnl_open(&rtnl, 0);
1916         if (r < 0)
1917                 return log_error_errno(r, "Failed to connect to netlink: %m");
1918
1919         udev = udev_new();
1920         if (!udev) {
1921                 log_error("Failed to connect to udev.");
1922                 return -ENOMEM;
1923         }
1924
1925         STRV_FOREACH(i, arg_network_macvlan) {
1926                 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1927                 _cleanup_free_ char *n = NULL;
1928                 struct ether_addr mac;
1929                 int ifi;
1930
1931                 ifi = parse_interface(udev, *i);
1932                 if (ifi < 0)
1933                         return ifi;
1934
1935                 r = generate_mac(&mac, MACVLAN_HASH_KEY, idx++);
1936                 if (r < 0)
1937                         return log_error_errno(r, "Failed to create MACVLAN MAC address: %m");
1938
1939                 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
1940                 if (r < 0)
1941                         return log_error_errno(r, "Failed to allocate netlink message: %m");
1942
1943                 r = sd_rtnl_message_append_u32(m, IFLA_LINK, ifi);
1944                 if (r < 0)
1945                         return log_error_errno(r, "Failed to add netlink interface index: %m");
1946
1947                 n = strappend("mv-", *i);
1948                 if (!n)
1949                         return log_oom();
1950
1951                 strshorten(n, IFNAMSIZ-1);
1952
1953                 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, n);
1954                 if (r < 0)
1955                         return log_error_errno(r, "Failed to add netlink interface name: %m");
1956
1957                 r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac);
1958                 if (r < 0)
1959                         return log_error_errno(r, "Failed to add netlink MAC address: %m");
1960
1961                 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
1962                 if (r < 0)
1963                         return log_error_errno(r, "Failed to add netlink namespace field: %m");
1964
1965                 r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
1966                 if (r < 0)
1967                         return log_error_errno(r, "Failed to open netlink container: %m");
1968
1969                 r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "macvlan");
1970                 if (r < 0)
1971                         return log_error_errno(r, "Failed to open netlink container: %m");
1972
1973                 r = sd_rtnl_message_append_u32(m, IFLA_MACVLAN_MODE, MACVLAN_MODE_BRIDGE);
1974                 if (r < 0)
1975                         return log_error_errno(r, "Failed to append macvlan mode: %m");
1976
1977                 r = sd_rtnl_message_close_container(m);
1978                 if (r < 0)
1979                         return log_error_errno(r, "Failed to close netlink container: %m");
1980
1981                 r = sd_rtnl_message_close_container(m);
1982                 if (r < 0)
1983                         return log_error_errno(r, "Failed to close netlink container: %m");
1984
1985                 r = sd_rtnl_call(rtnl, m, 0, NULL);
1986                 if (r < 0)
1987                         return log_error_errno(r, "Failed to add new macvlan interfaces: %m");
1988         }
1989
1990         return 0;
1991 }
1992
/* Installs the container's seccomp filter: a fixed set of system calls
 * fails with EPERM, and creating NETLINK_AUDIT sockets fails with
 * EAFNOSUPPORT. Everything else stays allowed.
 *
 * Returns the result of the last libseccomp call (negative on
 * failure, after logging). Without HAVE_SECCOMP this is a no-op that
 * returns 0. */
static int setup_seccomp(void) {

#ifdef HAVE_SECCOMP
        static const int blocked_syscalls[] = {
                SCMP_SYS(kexec_load),
                SCMP_SYS(open_by_handle_at),
                SCMP_SYS(init_module),
                SCMP_SYS(finit_module),
                SCMP_SYS(delete_module),
                SCMP_SYS(iopl),
                SCMP_SYS(ioperm),
                SCMP_SYS(swapon),
                SCMP_SYS(swapoff),
        };

        scmp_filter_ctx ctx;
        unsigned k;
        int r;

        ctx = seccomp_init(SCMP_ACT_ALLOW);
        if (!ctx)
                return log_oom();

        r = seccomp_add_secondary_archs(ctx);
        if (r < 0) {
                log_error_errno(r, "Failed to add secondary archs to seccomp filter: %m");
                goto finish;
        }

        for (k = 0; k < ELEMENTSOF(blocked_syscalls); k++) {
                r = seccomp_rule_add(ctx, SCMP_ACT_ERRNO(EPERM), blocked_syscalls[k], 0);

                /* -EFAULT means the syscall is unknown, which is
                 * fine: we simply move on to the next one. */
                if (r < 0 && r != -EFAULT) {
                        log_error_errno(r, "Failed to block syscall: %m");
                        goto finish;
                }
        }

        /* Audit does not work in containers, and much of the audit
         * userspace hookup fails when run inside of one. We don't
         * care, and simply turn off creation of audit sockets:
         * socket(AF_NETLINK, *, NETLINK_AUDIT) is made to fail with
         * EAFNOSUPPORT, which audit userspace takes as indication
         * that audit is disabled in the kernel. */
        r = seccomp_rule_add(ctx,
                             SCMP_ACT_ERRNO(EAFNOSUPPORT),
                             SCMP_SYS(socket),
                             2,
                             SCMP_A0(SCMP_CMP_EQ, AF_NETLINK),
                             SCMP_A2(SCMP_CMP_EQ, NETLINK_AUDIT));
        if (r < 0) {
                log_error_errno(r, "Failed to add audit seccomp rule: %m");
                goto finish;
        }

        /* Clear the filter attribute that controls NO_NEW_PRIVS. */
        r = seccomp_attr_set(ctx, SCMP_FLTATR_CTL_NNP, 0);
        if (r < 0) {
                log_error_errno(r, "Failed to unset NO_NEW_PRIVS: %m");
                goto finish;
        }

        r = seccomp_load(ctx);
        if (r < 0)
                log_error_errno(r, "Failed to install seccomp audit filter: %m");

finish:
        /* The filter context is released on every path. */
        seccomp_release(ctx);
        return r;
#else
        return 0;
#endif

}
2072
2073 static int setup_image(char **device_path, int *loop_nr) {
2074         struct loop_info64 info = {
2075                 .lo_flags = LO_FLAGS_AUTOCLEAR|LO_FLAGS_PARTSCAN
2076         };
2077         _cleanup_close_ int fd = -1, control = -1, loop = -1;
2078         _cleanup_free_ char* loopdev = NULL;
2079         struct stat st;
2080         int r, nr;
2081
2082         assert(device_path);
2083         assert(loop_nr);
2084         assert(arg_image);
2085
2086         fd = open(arg_image, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
2087         if (fd < 0)
2088                 return log_error_errno(errno, "Failed to open %s: %m", arg_image);
2089
2090         if (fstat(fd, &st) < 0)
2091                 return log_error_errno(errno, "Failed to stat %s: %m", arg_image);
2092
2093         if (S_ISBLK(st.st_mode)) {
2094                 char *p;
2095
2096                 p = strdup(arg_image);
2097                 if (!p)
2098                         return log_oom();
2099
2100                 *device_path = p;
2101
2102                 *loop_nr = -1;
2103
2104                 r = fd;
2105                 fd = -1;
2106
2107                 return r;
2108         }
2109
2110         if (!S_ISREG(st.st_mode)) {
2111                 log_error_errno(errno, "%s is not a regular file or block device: %m", arg_image);
2112                 return -EINVAL;
2113         }
2114
2115         control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
2116         if (control < 0)
2117                 return log_error_errno(errno, "Failed to open /dev/loop-control: %m");
2118
2119         nr = ioctl(control, LOOP_CTL_GET_FREE);
2120         if (nr < 0)
2121                 return log_error_errno(errno, "Failed to allocate loop device: %m");
2122
2123         if (asprintf(&loopdev, "/dev/loop%i", nr) < 0)
2124                 return log_oom();
2125
2126         loop = open(loopdev, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
2127         if (loop < 0)
2128                 return log_error_errno(errno, "Failed to open loop device %s: %m", loopdev);
2129
2130         if (ioctl(loop, LOOP_SET_FD, fd) < 0)
2131                 return log_error_errno(errno, "Failed to set loopback file descriptor on %s: %m", loopdev);
2132
2133         if (arg_read_only)
2134                 info.lo_flags |= LO_FLAGS_READ_ONLY;
2135
2136         if (ioctl(loop, LOOP_SET_STATUS64, &info) < 0)
2137                 return log_error_errno(errno, "Failed to set loopback settings on %s: %m", loopdev);
2138
2139         *device_path = loopdev;
2140         loopdev = NULL;
2141
2142         *loop_nr = nr;
2143
2144         r = loop;
2145         loop = -1;
2146
2147         return r;
2148 }
2149
/* Looks for the root, /home and /srv partitions in the GPT partition
 * table of the block device referred to by fd.
 *
 * Partitions are identified by their GPT type UUID. Partitions that
 * carry the GPT_FLAG_NO_AUTO flag are ignored, and if several
 * partitions of the same type exist the one with the lowest partition
 * number wins. The native root partition type is preferred over the
 * secondary one.
 *
 * On success 0 is returned. *root_device is set to a newly allocated
 * device node path, *root_device_rw tells whether the partition lacks
 * the GPT read-only flag, and *secondary tells whether the secondary
 * root partition type was used. *home_device/*home_device_rw and
 * *srv_device/*srv_device_rw are only written if such a partition was
 * found. The caller owns the returned strings.
 *
 * On failure a negative errno-style error is returned after logging;
 * -EINVAL if the image has no usable GPT or no root partition, and
 * -ENOTSUP if compiled without blkid support. */
static int dissect_image(
                int fd,
                char **root_device, bool *root_device_rw,
                char **home_device, bool *home_device_rw,
                char **srv_device, bool *srv_device_rw,
                bool *secondary) {

#ifdef HAVE_BLKID
        int home_nr = -1, srv_nr = -1;
#ifdef GPT_ROOT_NATIVE
        int root_nr = -1;
#endif
#ifdef GPT_ROOT_SECONDARY
        int secondary_root_nr = -1;
#endif

        _cleanup_free_ char *home = NULL, *root = NULL, *secondary_root = NULL, *srv = NULL;
        _cleanup_udev_enumerate_unref_ struct udev_enumerate *e = NULL;
        _cleanup_udev_device_unref_ struct udev_device *d = NULL;
        _cleanup_blkid_free_probe_ blkid_probe b = NULL;
        _cleanup_udev_unref_ struct udev *udev = NULL;
        struct udev_list_entry *first, *item;
        bool home_rw = true, root_rw = true, secondary_root_rw = true, srv_rw = true;
        const char *pttype = NULL;
        blkid_partlist pl;
        struct stat st;
        int r;

        assert(fd >= 0);
        assert(root_device);
        assert(root_device_rw);
        assert(home_device);
        assert(home_device_rw);
        assert(srv_device);
        assert(srv_device_rw);
        assert(secondary);
        assert(arg_image);

        b = blkid_new_probe();
        if (!b)
                return log_oom();

        /* libblkid does not reliably set errno, hence reset it first
         * and treat "no errno" as out-of-memory. */
        errno = 0;
        r = blkid_probe_set_device(b, fd, 0, 0);
        if (r != 0) {
                if (errno == 0)
                        return log_oom();

                return log_error_errno(errno, "Failed to set device on blkid probe: %m");
        }

        blkid_probe_enable_partitions(b, 1);
        blkid_probe_set_partitions_flags(b, BLKID_PARTS_ENTRY_DETAILS);

        errno = 0;
        r = blkid_do_safeprobe(b);
        if (r == -2 || r == 1) {
                log_error("Failed to identify any partition table on %s.\n"
                          "Note that the disk image needs to follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/ to be supported by systemd-nspawn.", arg_image);
                return -EINVAL;
        } else if (r != 0) {
                if (errno == 0)
                        errno = EIO;

                return log_error_errno(errno, "Failed to probe: %m");
        }

        blkid_probe_lookup_value(b, "PTTYPE", &pttype, NULL);
        if (!streq_ptr(pttype, "gpt")) {
                log_error("Image %s does not carry a GUID Partition Table.\n"
                          "Note that the disk image needs to follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/ to be supported by systemd-nspawn.", arg_image);
                return -EINVAL;
        }

        errno = 0;
        pl = blkid_probe_get_partitions(b);
        if (!pl) {
                if (errno == 0)
                        return log_oom();

                /* Pick up the error before logging, so that the
                 * return value does not depend on what the logging
                 * call does to errno. */
                r = -errno;
                log_error("Failed to list partitions of %s", arg_image);
                return r;
        }

        udev = udev_new();
        if (!udev)
                return log_oom();

        if (fstat(fd, &st) < 0)
                return log_error_errno(errno, "Failed to stat block device: %m");

        d = udev_device_new_from_devnum(udev, 'b', st.st_rdev);
        if (!d)
                return log_oom();

        /* Enumerate all devices below the whole-disk device, i.e. its
         * partitions. */
        e = udev_enumerate_new(udev);
        if (!e)
                return log_oom();

        r = udev_enumerate_add_match_parent(e, d);
        if (r < 0)
                return log_oom();

        r = udev_enumerate_scan_devices(e);
        if (r < 0)
                return log_error_errno(r, "Failed to scan for partition devices of %s: %m", arg_image);

        first = udev_enumerate_get_list_entry(e);
        udev_list_entry_foreach(item, first) {
                _cleanup_udev_device_unref_ struct udev_device *q = NULL;
                const char *stype, *node;
                unsigned long long flags;
                sd_id128_t type_id;
                blkid_partition pp;
                dev_t qn;
                int nr;

                errno = 0;
                q = udev_device_new_from_syspath(udev, udev_list_entry_get_name(item));
                if (!q) {
                        if (!errno)
                                errno = ENOMEM;

                        return log_error_errno(errno, "Failed to get partition device of %s: %m", arg_image);
                }

                qn = udev_device_get_devnum(q);
                if (major(qn) == 0)
                        continue;

                /* Skip the whole-disk device itself. */
                if (st.st_rdev == qn)
                        continue;

                node = udev_device_get_devnode(q);
                if (!node)
                        continue;

                pp = blkid_partlist_devno_to_partition(pl, qn);
                if (!pp)
                        continue;

                flags = blkid_partition_get_flags(pp);
                if (flags & GPT_FLAG_NO_AUTO)
                        continue;

                nr = blkid_partition_get_partno(pp);
                if (nr < 0)
                        continue;

                stype = blkid_partition_get_type_string(pp);
                if (!stype)
                        continue;

                if (sd_id128_from_string(stype, &type_id) < 0)
                        continue;

                if (sd_id128_equal(type_id, GPT_HOME)) {

                        /* Of multiple candidates the one with the
                         * lowest partition number wins. */
                        if (home && nr >= home_nr)
                                continue;

                        home_nr = nr;
                        home_rw = !(flags & GPT_FLAG_READ_ONLY);

                        free(home);
                        home = strdup(node);
                        if (!home)
                                return log_oom();
                } else if (sd_id128_equal(type_id, GPT_SRV)) {

                        if (srv && nr >= srv_nr)
                                continue;

                        srv_nr = nr;
                        srv_rw = !(flags & GPT_FLAG_READ_ONLY);

                        free(srv);
                        srv = strdup(node);
                        if (!srv)
                                return log_oom();
                }
#ifdef GPT_ROOT_NATIVE
                else if (sd_id128_equal(type_id, GPT_ROOT_NATIVE)) {

                        if (root && nr >= root_nr)
                                continue;

                        root_nr = nr;
                        root_rw = !(flags & GPT_FLAG_READ_ONLY);

                        free(root);
                        root = strdup(node);
                        if (!root)
                                return log_oom();
                }
#endif
#ifdef GPT_ROOT_SECONDARY
                else if (sd_id128_equal(type_id, GPT_ROOT_SECONDARY)) {

                        if (secondary_root && nr >= secondary_root_nr)
                                continue;

                        secondary_root_nr = nr;
                        secondary_root_rw = !(flags & GPT_FLAG_READ_ONLY);

                        free(secondary_root);
                        secondary_root = strdup(node);
                        if (!secondary_root)
                                return log_oom();
                }
#endif
        }

        if (!root && !secondary_root) {
                log_error("Failed to identify root partition in disk image %s.\n"
                          "Note that the disk image needs to follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/ to be supported by systemd-nspawn.", arg_image);
                return -EINVAL;
        }

        /* Pass ownership of the strings on to the caller, and reset
         * our own pointers so that the cleanup handlers leave them
         * alone. */
        if (root) {
                *root_device = root;
                root = NULL;

                *root_device_rw = root_rw;
                *secondary = false;
        } else if (secondary_root) {
                *root_device = secondary_root;
                secondary_root = NULL;

                *root_device_rw = secondary_root_rw;
                *secondary = true;
        }

        if (home) {
                *home_device = home;
                home = NULL;

                *home_device_rw = home_rw;
        }

        if (srv) {
                *srv_device = srv;
                srv = NULL;

                *srv_device_rw = srv_rw;
        }

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -ENOTSUP;
#endif
}
2403
/* Probes the block device 'what' with libblkid to learn its file system
 * type and mounts it with MS_NODEV on 'where' (or, if 'directory' is
 * non-NULL, on the concatenation of 'where' and 'directory').
 *
 * The mount is read-only when 'rw' is false or when arg_read_only is
 * set. Devices whose detected type is "crypto_LUKS" are refused with
 * -ENOTSUP.
 *
 * Returns 0 on success and a negative errno-style value on failure.
 * When built without HAVE_BLKID the function always fails with
 * -ENOTSUP. */
static int mount_device(const char *what, const char *where, const char *directory, bool rw) {
#ifdef HAVE_BLKID
        _cleanup_blkid_free_probe_ blkid_probe b = NULL;
        const char *fstype, *p;
        int r;

        assert(what);
        assert(where);

        /* A globally requested read-only container overrides the
         * per-partition read-write flag. */
        if (arg_read_only)
                rw = false;

        if (directory)
                p = strappenda(where, directory);
        else
                p = where;

        errno = 0;
        b = blkid_new_probe_from_filename(what);
        if (!b) {
                /* No errno set by the library is treated as allocation failure */
                if (errno == 0)
                        return log_oom();

                return log_error_errno(errno, "Failed to allocate prober for %s: %m", what);
        }

        blkid_probe_enable_superblocks(b, 1);
        blkid_probe_set_superblocks_flags(b, BLKID_SUBLKS_TYPE);

        errno = 0;
        r = blkid_do_safeprobe(b);
        if (r == -2 || r == 1) {
                /* 1: nothing was detected, -2: the result is ambivalent.
                 * In both cases there is no usable file system type. */
                log_error("Cannot determine file system type of %s", what);
                return -EINVAL;
        } else if (r != 0) {
                /* Anything else (-1) is a genuine probing error; fall
                 * back to EIO if the library left errno untouched. */
                r = errno != 0 ? errno : EIO;
                return log_error_errno(r, "Failed to probe %s: %m", what);
        }

        errno = 0;
        if (blkid_probe_lookup_value(b, "TYPE", &fstype, NULL) < 0) {
                /* Capture the error code before logging */
                r = errno != 0 ? -errno : -EINVAL;
                log_error("Failed to determine file system type of %s", what);
                return r;
        }

        if (streq(fstype, "crypto_LUKS")) {
                log_error("nspawn currently does not support LUKS disk images.");
                return -ENOTSUP;
        }

        if (mount(what, p, fstype, MS_NODEV|(rw ? 0 : MS_RDONLY), NULL) < 0)
                return log_error_errno(errno, "Failed to mount %s: %m", what);

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -ENOTSUP;
#endif
}
2467
/* Mounts the partitions found in a disk image below the directory
 * 'where': the root partition on 'where' itself, the home partition on
 * "<where>/home" and the server data partition on "<where>/srv". Each
 * device argument may be NULL, in which case that mount is skipped; the
 * matching *_rw flag selects a read-write or read-only mount.
 *
 * Returns 0 on success or the negative error of the first mount that
 * failed. */
static int mount_devices(
                const char *where,
                const char *root_device, bool root_device_rw,
                const char *home_device, bool home_device_rw,
                const char *srv_device, bool srv_device_rw) {
        int r;

        assert(where);

        /* Use the 'where' argument as mount target rather than reaching
         * for the global arg_directory, so the parameter is honoured. */

        if (root_device) {
                r = mount_device(root_device, where, NULL, root_device_rw);
                if (r < 0)
                        return log_error_errno(r, "Failed to mount root directory: %m");
        }

        if (home_device) {
                r = mount_device(home_device, where, "/home", home_device_rw);
                if (r < 0)
                        return log_error_errno(r, "Failed to mount home directory: %m");
        }

        if (srv_device) {
                r = mount_device(srv_device, where, "/srv", srv_device_rw);
                if (r < 0)
                        return log_error_errno(r, "Failed to mount server data directory: %m");
        }

        return 0;
}
2497
2498 static void loop_remove(int nr, int *image_fd) {
2499         _cleanup_close_ int control = -1;
2500         int r;
2501
2502         if (nr < 0)
2503                 return;
2504
2505         if (image_fd && *image_fd >= 0) {
2506                 r = ioctl(*image_fd, LOOP_CLR_FD);
2507                 if (r < 0)
2508                         log_warning_errno(errno, "Failed to close loop image: %m");
2509                 *image_fd = safe_close(*image_fd);
2510         }
2511
2512         control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
2513         if (control < 0) {
2514                 log_warning_errno(errno, "Failed to open /dev/loop-control: %m");
2515                 return;
2516         }
2517
2518         r = ioctl(control, LOOP_CTL_REMOVE, nr);
2519         if (r < 0)
2520                 log_warning_errno(errno, "Failed to remove loop %d: %m", nr);
2521 }
2522
/* Forks off a child that executes "getent <database> <key>" with an
 * empty environment, stdin and stderr connected to /dev/null and stdout
 * connected to a pipe.
 *
 * On success the read end of that pipe is returned (>= 0) and the PID
 * of the child is stored in *rpid; the caller owns both, i.e. has to
 * close the descriptor and reap the child. On failure a negative
 * errno-style value is returned and no descriptor is left open. */
static int spawn_getent(const char *database, const char *key, pid_t *rpid) {
        int pipe_fds[2];
        pid_t pid;
        int r;

        assert(database);
        assert(key);
        assert(rpid);

        if (pipe2(pipe_fds, O_CLOEXEC) < 0)
                return log_error_errno(errno, "Failed to allocate pipe: %m");

        pid = fork();
        if (pid < 0) {
                r = log_error_errno(errno, "Failed to fork getent child: %m");

                /* Don't leak the pipe if no child was created */
                safe_close(pipe_fds[0]);
                safe_close(pipe_fds[1]);
                return r;
        } else if (pid == 0) {
                int nullfd;
                char *empty_env = NULL;

                /* Child: route stdout into the pipe. dup3() without
                 * O_CLOEXEC makes the copy survive the exec below. */
                if (dup3(pipe_fds[1], STDOUT_FILENO, 0) < 0)
                        _exit(EXIT_FAILURE);

                if (pipe_fds[0] > 2)
                        safe_close(pipe_fds[0]);
                if (pipe_fds[1] > 2)
                        safe_close(pipe_fds[1]);

                nullfd = open("/dev/null", O_RDWR);
                if (nullfd < 0)
                        _exit(EXIT_FAILURE);

                if (dup3(nullfd, STDIN_FILENO, 0) < 0)
                        _exit(EXIT_FAILURE);

                if (dup3(nullfd, STDERR_FILENO, 0) < 0)
                        _exit(EXIT_FAILURE);

                if (nullfd > 2)
                        safe_close(nullfd);

                reset_all_signal_handlers();
                close_all_fds(NULL, 0);

                /* Try both common locations; execle() only returns on
                 * failure. The argument list terminator must be a typed
                 * null pointer since this is a variadic call. */
                execle("/usr/bin/getent", "getent", database, key, (char*) NULL, &empty_env);
                execle("/bin/getent", "getent", database, key, (char*) NULL, &empty_env);
                _exit(EXIT_FAILURE);
        }

        /* Parent: keep only the read end */
        pipe_fds[1] = safe_close(pipe_fds[1]);

        *rpid = pid;

        return pipe_fds[0];
}
2576
2577 static int change_uid_gid(char **_home) {
2578         char line[LINE_MAX], *x, *u, *g, *h;
2579         const char *word, *state;
2580         _cleanup_free_ uid_t *uids = NULL;
2581         _cleanup_free_ char *home = NULL;
2582         _cleanup_fclose_ FILE *f = NULL;
2583         _cleanup_close_ int fd = -1;
2584         unsigned n_uids = 0;
2585         size_t sz = 0, l;
2586         uid_t uid;
2587         gid_t gid;
2588         pid_t pid;
2589         int r;
2590
2591         assert(_home);
2592
2593         if (!arg_user || streq(arg_user, "root") || streq(arg_user, "0")) {
2594                 /* Reset everything fully to 0, just in case */
2595
2596                 if (setgroups(0, NULL) < 0)
2597                         return log_error_errno(errno, "setgroups() failed: %m");
2598
2599                 if (setresgid(0, 0, 0) < 0)
2600                         return log_error_errno(errno, "setregid() failed: %m");
2601
2602                 if (setresuid(0, 0, 0) < 0)
2603                         return log_error_errno(errno, "setreuid() failed: %m");
2604
2605                 *_home = NULL;
2606                 return 0;
2607         }
2608
2609         /* First, get user credentials */
2610         fd = spawn_getent("passwd", arg_user, &pid);
2611         if (fd < 0)
2612                 return fd;
2613
2614         f = fdopen(fd, "r");
2615         if (!f)
2616                 return log_oom();
2617         fd = -1;
2618
2619         if (!fgets(line, sizeof(line), f)) {
2620
2621                 if (!ferror(f)) {
2622                         log_error("Failed to resolve user %s.", arg_user);
2623                         return -ESRCH;
2624                 }
2625
2626                 log_error_errno(errno, "Failed to read from getent: %m");
2627                 return -errno;
2628         }
2629
2630         truncate_nl(line);
2631
2632         wait_for_terminate_and_warn("getent passwd", pid, true);
2633
2634         x = strchr(line, ':');
2635         if (!x) {
2636                 log_error("/etc/passwd entry has invalid user field.");
2637                 return -EIO;
2638         }
2639
2640         u = strchr(x+1, ':');
2641         if (!u) {
2642                 log_error("/etc/passwd entry has invalid password field.");
2643                 return -EIO;
2644         }
2645
2646         u++;
2647         g = strchr(u, ':');
2648         if (!g) {
2649                 log_error("/etc/passwd entry has invalid UID field.");
2650                 return -EIO;
2651         }
2652
2653         *g = 0;
2654         g++;
2655         x = strchr(g, ':');
2656         if (!x) {
2657                 log_error("/etc/passwd entry has invalid GID field.");
2658                 return -EIO;
2659         }
2660
2661         *x = 0;
2662         h = strchr(x+1, ':');
2663         if (!h) {
2664                 log_error("/etc/passwd entry has invalid GECOS field.");
2665                 return -EIO;
2666         }
2667
2668         h++;
2669         x = strchr(h, ':');
2670         if (!x) {
2671                 log_error("/etc/passwd entry has invalid home directory field.");
2672                 return -EIO;
2673         }
2674
2675         *x = 0;
2676
2677         r = parse_uid(u, &uid);
2678         if (r < 0) {
2679                 log_error("Failed to parse UID of user.");
2680                 return -EIO;
2681         }
2682
2683         r = parse_gid(g, &gid);
2684         if (r < 0) {
2685                 log_error("Failed to parse GID of user.");
2686                 return -EIO;
2687         }
2688
2689         home = strdup(h);
2690         if (!home)
2691                 return log_oom();
2692
2693         /* Second, get group memberships */
2694         fd = spawn_getent("initgroups", arg_user, &pid);
2695         if (fd < 0)
2696                 return fd;
2697
2698         fclose(f);
2699         f = fdopen(fd, "r");
2700         if (!f)
2701                 return log_oom();
2702         fd = -1;
2703
2704         if (!fgets(line, sizeof(line), f)) {
2705                 if (!ferror(f)) {
2706                         log_error("Failed to resolve user %s.", arg_user);
2707                         return -ESRCH;
2708                 }
2709
2710                 log_error_errno(errno, "Failed to read from getent: %m");
2711                 return -errno;
2712         }
2713
2714         truncate_nl(line);
2715
2716         wait_for_terminate_and_warn("getent initgroups", pid, true);
2717
2718         /* Skip over the username and subsequent separator whitespace */
2719         x = line;
2720         x += strcspn(x, WHITESPACE);
2721         x += strspn(x, WHITESPACE);
2722
2723         FOREACH_WORD(word, l, x, state) {
2724                 char c[l+1];
2725
2726                 memcpy(c, word, l);
2727                 c[l] = 0;
2728
2729                 if (!GREEDY_REALLOC(uids, sz, n_uids+1))
2730                         return log_oom();
2731
2732                 r = parse_uid(c, &uids[n_uids++]);
2733                 if (r < 0) {
2734                         log_error("Failed to parse group data from getent.");
2735                         return -EIO;
2736                 }
2737         }
2738
2739         r = mkdir_parents(home, 0775);
2740         if (r < 0)
2741                 return log_error_errno(r, "Failed to make home root directory: %m");
2742
2743         r = mkdir_safe(home, 0755, uid, gid);
2744         if (r < 0 && r != -EEXIST)
2745                 return log_error_errno(r, "Failed to make home directory: %m");
2746
2747         fchown(STDIN_FILENO, uid, gid);
2748         fchown(STDOUT_FILENO, uid, gid);
2749         fchown(STDERR_FILENO, uid, gid);
2750
2751         if (setgroups(n_uids, uids) < 0)
2752                 return log_error_errno(errno, "Failed to set auxiliary groups: %m");
2753
2754         if (setresgid(gid, gid, gid) < 0)
2755                 return log_error_errno(errno, "setregid() failed: %m");
2756
2757         if (setresuid(uid, uid, uid) < 0)
2758                 return log_error_errno(errno, "setreuid() failed: %m");
2759
2760         if (_home) {
2761                 *_home = home;
2762                 home = NULL;
2763         }
2764
2765         return 0;
2766 }
2767
2768 /*
2769  * Return values:
2770  * < 0 : wait_for_terminate() failed to get the state of the
2771  *       container, the container was terminated by a signal, or
2772  *       failed for an unknown reason.  No change is made to the
2773  *       container argument.
2774  * > 0 : The program executed in the container terminated with an
2775  *       error.  The exit code of the program executed in the
2776  *       container is returned.  The container argument has been set
2777  *       to CONTAINER_TERMINATED.
2778  *   0 : The container is being rebooted, has been shut down or exited
2779  *       successfully.  The container argument has been set to either
2780  *       CONTAINER_TERMINATED or CONTAINER_REBOOTED.
2781  *
2782  * That is, success is indicated by a return value of zero, and an
2783  * error is indicated by a non-zero value.
2784  */
2785 static int wait_for_container(pid_t pid, ContainerStatus *container) {
2786         siginfo_t status;
2787         int r;
2788
2789         r = wait_for_terminate(pid, &status);
2790         if (r < 0)
2791                 return log_warning_errno(r, "Failed to wait for container: %m");
2792
2793         switch (status.si_code) {
2794
2795         case CLD_EXITED:
2796                 if (status.si_status == 0) {
2797                         log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s exited successfully.", arg_machine);
2798
2799                 } else
2800                         log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s failed with error code %i.", arg_machine, status.si_status);
2801
2802                 *container = CONTAINER_TERMINATED;
2803                 return status.si_status;
2804
2805         case CLD_KILLED:
2806                 if (status.si_status == SIGINT) {
2807
2808                         log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s has been shut down.", arg_machine);
2809                         *container = CONTAINER_TERMINATED;
2810                         return 0;
2811
2812                 } else if (status.si_status == SIGHUP) {
2813
2814                         log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s is being rebooted.", arg_machine);
2815                         *container = CONTAINER_REBOOTED;
2816                         return 0;
2817                 }
2818
2819                 /* CLD_KILLED fallthrough */
2820
2821         case CLD_DUMPED:
2822                 log_error("Container %s terminated by signal %s.", arg_machine, signal_to_string(status.si_status));
2823                 return -EIO;
2824
2825         default:
2826                 log_error("Container %s failed due to unknown reason.", arg_machine);
2827                 return -EIO;
2828         }
2829
2830         return r;
2831 }
2832
/* Signal handler that intentionally does nothing. main() installs it
 * for SIGCHLD so that the signal interrupts blocking calls instead of
 * being ignored. */
static void nop_handler(int sig) {
        (void) sig;
}
2834
2835 static int on_orderly_shutdown(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) {
2836         pid_t pid;
2837
2838         pid = PTR_TO_UINT32(userdata);
2839         if (pid > 0) {
2840                 if (kill(pid, SIGRTMIN+3) >= 0) {
2841                         log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
2842                         sd_event_source_set_userdata(s, NULL);
2843                         return 0;
2844                 }
2845         }
2846
2847         sd_event_exit(sd_event_source_get_event(s), 0);
2848         return 0;
2849 }
2850
2851 static int determine_names(void) {
2852
2853         if (!arg_image && !arg_directory) {
2854                 if (arg_machine)
2855                         arg_directory = strappend("/var/lib/container/", arg_machine);
2856                 else
2857                         arg_directory = get_current_dir_name();
2858
2859                 if (!arg_directory) {
2860                         log_error("Failed to determine path, please use -D.");
2861                         return -EINVAL;
2862                 }
2863         }
2864
2865         if (!arg_machine) {
2866                 arg_machine = strdup(basename(arg_image ?: arg_directory));
2867                 if (!arg_machine)
2868                         return log_oom();
2869
2870                 hostname_cleanup(arg_machine, false);
2871                 if (!machine_name_is_valid(arg_machine)) {
2872                         log_error("Failed to determine machine name automatically, please use -M.");
2873                         return -EINVAL;
2874                 }
2875         }
2876
2877         return 0;
2878 }
2879
2880 int main(int argc, char *argv[]) {
2881
2882         _cleanup_free_ char *device_path = NULL, *root_device = NULL, *home_device = NULL, *srv_device = NULL;
2883         bool root_device_rw = true, home_device_rw = true, srv_device_rw = true;
2884         _cleanup_close_ int master = -1, image_fd = -1;
2885         _cleanup_close_pair_ int kmsg_socket_pair[2] = { -1, -1 };
2886         _cleanup_fdset_free_ FDSet *fds = NULL;
2887         int r, n_fd_passed, loop_nr = -1;
2888         const char *console = NULL;
2889         char veth_name[IFNAMSIZ];
2890         bool secondary = false, remove_subvol = false;
2891         sigset_t mask, mask_chld;
2892         pid_t pid = 0;
2893         int ret = EXIT_SUCCESS;
2894
2895         log_parse_environment();
2896         log_open();
2897
2898         r = parse_argv(argc, argv);
2899         if (r <= 0)
2900                 goto finish;
2901
2902         r = determine_names();
2903         if (r < 0)
2904                 goto finish;
2905
2906         if (geteuid() != 0) {
2907                 log_error("Need to be root.");
2908                 r = -EPERM;
2909                 goto finish;
2910         }
2911
2912         if (sd_booted() <= 0) {
2913                 log_error("Not running on a systemd system.");
2914                 r = -EINVAL;
2915                 goto finish;
2916         }
2917
2918         log_close();
2919         n_fd_passed = sd_listen_fds(false);
2920         if (n_fd_passed > 0) {
2921                 r = fdset_new_listen_fds(&fds, false);
2922                 if (r < 0) {
2923                         log_error_errno(r, "Failed to collect file descriptors: %m");
2924                         goto finish;
2925                 }
2926         }
2927         fdset_close_others(fds);
2928         log_open();
2929
2930         if (arg_directory) {
2931                 assert(!arg_image);
2932
2933                 if (path_equal(arg_directory, "/")) {
2934                         log_error("Spawning container on root directory not supported.");
2935                         r = -EINVAL;
2936                         goto finish;
2937                 }
2938
2939                 if (arg_template) {
2940                         r = btrfs_subvol_snapshot(arg_template, arg_directory, arg_read_only, true);
2941                         if (r == -EEXIST) {
2942                                 if (!arg_quiet)
2943                                         log_info("Directory %s already exists, not populating from template %s.", arg_directory, arg_template);
2944                         } else if (r < 0) {
2945                                 log_error_errno(r, "Couldn't create snapshort %s from %s: %m", arg_directory, arg_template);
2946                                 goto finish;
2947                         } else {
2948                                 if (!arg_quiet)
2949                                         log_info("Populated %s from template %s.", arg_directory, arg_template);
2950                         }
2951
2952                 } else if (arg_ephemeral) {
2953                         char *np;
2954
2955                         r = tempfn_random(arg_directory, &np);
2956                         if (r < 0) {
2957                                 log_error_errno(r, "Failed to generate name for snapshot: %m");
2958                                 goto finish;
2959                         }
2960
2961                         r = btrfs_subvol_snapshot(arg_directory, np, arg_read_only, true);
2962                         if (r < 0) {
2963                                 free(np);
2964                                 log_error_errno(r, "Failed to create snapshot %s from %s: %m", np, arg_directory);
2965                                 goto finish;
2966                         }
2967
2968                         free(arg_directory);
2969                         arg_directory = np;
2970
2971                         remove_subvol = true;
2972                 }
2973
2974                 if (arg_boot) {
2975                         if (path_is_os_tree(arg_directory) <= 0) {
2976                                 log_error("Directory %s doesn't look like an OS root directory (os-release file is missing). Refusing.", arg_directory);
2977                                 r = -EINVAL;
2978                                 goto finish;
2979                         }
2980                 } else {
2981                         const char *p;
2982
2983                         p = strappenda(arg_directory,
2984                                        argc > optind && path_is_absolute(argv[optind]) ? argv[optind] : "/usr/bin/");
2985                         if (access(p, F_OK) < 0) {
2986                                 log_error("Directory %s lacks the binary to execute or doesn't look like a binary tree. Refusing.", arg_directory);
2987                                 r = -EINVAL;
2988                                 goto finish;
2989                         }
2990                 }
2991
2992         } else {
2993                 char template[] = "/tmp/nspawn-root-XXXXXX";
2994
2995                 assert(arg_image);
2996                 assert(!arg_template);
2997
2998                 if (!mkdtemp(template)) {
2999                         log_error_errno(errno, "Failed to create temporary directory: %m");
3000                         r = -errno;
3001                         goto finish;
3002                 }
3003
3004                 arg_directory = strdup(template);
3005                 if (!arg_directory) {
3006                         r = log_oom();
3007                         goto finish;
3008                 }
3009
3010                 image_fd = setup_image(&device_path, &loop_nr);
3011                 if (image_fd < 0) {
3012                         r = image_fd;
3013                         goto finish;
3014                 }
3015
3016                 r = dissect_image(image_fd,
3017                                   &root_device, &root_device_rw,
3018                                   &home_device, &home_device_rw,
3019                                   &srv_device, &srv_device_rw,
3020                                   &secondary);
3021                 if (r < 0)
3022                         goto finish;
3023         }
3024
3025         master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
3026         if (master < 0) {
3027                 r = log_error_errno(errno, "Failed to acquire pseudo tty: %m");
3028                 goto finish;
3029         }
3030
3031         console = ptsname(master);
3032         if (!console) {
3033                 r = log_error_errno(errno, "Failed to determine tty name: %m");
3034                 goto finish;
3035         }
3036
3037         if (!arg_quiet)
3038                 log_info("Spawning container %s on %s.\nPress ^] three times within 1s to kill container.",
3039                          arg_machine, arg_image ?: arg_directory);
3040
3041         if (unlockpt(master) < 0) {
3042                 r = log_error_errno(errno, "Failed to unlock tty: %m");
3043                 goto finish;
3044         }
3045
3046         if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_NONBLOCK|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
3047                 r = log_error_errno(errno, "Failed to create kmsg socket pair: %m");
3048                 goto finish;
3049         }
3050
3051         sd_notify(false,
3052                   "READY=1\n"
3053                   "STATUS=Container running.");
3054
3055         assert_se(sigemptyset(&mask) == 0);
3056         sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
3057         assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
3058
3059         assert_se(sigemptyset(&mask_chld) == 0);
3060         assert_se(sigaddset(&mask_chld, SIGCHLD) == 0);
3061
3062         for (;;) {
3063                 ContainerStatus container_status;
3064                 _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
3065                 struct sigaction sa = {
3066                         .sa_handler = nop_handler,
3067                         .sa_flags = SA_NOCLDSTOP,
3068                 };
3069
3070                 r = barrier_create(&barrier);
3071                 if (r < 0) {
3072                         log_error_errno(r, "Cannot initialize IPC barrier: %m");
3073                         goto finish;
3074                 }
3075
3076                 /* Child can be killed before execv(), so handle SIGCHLD
3077                  * in order to interrupt parent's blocking calls and
3078                  * give it a chance to call wait() and terminate. */
3079                 r = sigprocmask(SIG_UNBLOCK, &mask_chld, NULL);
3080                 if (r < 0) {
3081                         r = log_error_errno(errno, "Failed to change the signal mask: %m");
3082                         goto finish;
3083                 }
3084
3085                 r = sigaction(SIGCHLD, &sa, NULL);
3086                 if (r < 0) {
3087                         r = log_error_errno(errno, "Failed to install SIGCHLD handler: %m");
3088                         goto finish;
3089                 }
3090
3091                 pid = syscall(__NR_clone, SIGCHLD|CLONE_NEWNS|
3092                                           (arg_share_system ? 0 : CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS)|
3093                                           (arg_private_network ? CLONE_NEWNET : 0), NULL);
3094                 if (pid < 0) {
3095                         if (errno == EINVAL)
3096                                 r = log_error_errno(errno, "clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
3097                         else
3098                                 r = log_error_errno(errno, "clone() failed: %m");
3099
3100                         goto finish;
3101                 }
3102
3103                 if (pid == 0) {
3104                         /* child */
3105                         _cleanup_free_ char *home = NULL;
3106                         unsigned n_env = 2;
3107                         const char *envp[] = {
3108                                 "PATH=" DEFAULT_PATH_SPLIT_USR,
3109                                 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
3110                                 NULL, /* TERM */
3111                                 NULL, /* HOME */
3112                                 NULL, /* USER */
3113                                 NULL, /* LOGNAME */
3114                                 NULL, /* container_uuid */
3115                                 NULL, /* LISTEN_FDS */
3116                                 NULL, /* LISTEN_PID */
3117                                 NULL
3118                         };
3119                         char **env_use;
3120
3121                         barrier_set_role(&barrier, BARRIER_CHILD);
3122
3123                         envp[n_env] = strv_find_prefix(environ, "TERM=");
3124                         if (envp[n_env])
3125                                 n_env ++;
3126
3127                         master = safe_close(master);
3128
3129                         close_nointr(STDIN_FILENO);
3130                         close_nointr(STDOUT_FILENO);
3131                         close_nointr(STDERR_FILENO);
3132
3133                         kmsg_socket_pair[0] = safe_close(kmsg_socket_pair[0]);
3134
3135                         reset_all_signal_handlers();
3136                         reset_signal_mask();
3137
3138                         r = open_terminal(console, O_RDWR);
3139                         if (r != STDIN_FILENO) {
3140                                 if (r >= 0) {
3141                                         safe_close(r);
3142                                         r = -EINVAL;
3143                                 }
3144
3145                                 log_error_errno(r, "Failed to open console: %m");
3146                                 _exit(EXIT_FAILURE);
3147                         }
3148
3149                         if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
3150                             dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO) {
3151                                 log_error_errno(errno, "Failed to duplicate console: %m");
3152                                 _exit(EXIT_FAILURE);
3153                         }
3154
3155                         if (setsid() < 0) {
3156                                 log_error_errno(errno, "setsid() failed: %m");
3157                                 _exit(EXIT_FAILURE);
3158                         }
3159
3160                         if (reset_audit_loginuid() < 0)
3161                                 _exit(EXIT_FAILURE);
3162
3163                         if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
3164                                 log_error_errno(errno, "PR_SET_PDEATHSIG failed: %m");
3165                                 _exit(EXIT_FAILURE);
3166                         }
3167
3168                         /* Mark everything as slave, so that we still
3169                          * receive mounts from the real root, but don't
3170                          * propagate mounts to the real root. */
3171                         if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
3172                                 log_error_errno(errno, "MS_SLAVE|MS_REC failed: %m");
3173                                 _exit(EXIT_FAILURE);
3174                         }
3175
3176                         if (mount_devices(arg_directory,
3177                                           root_device, root_device_rw,
3178                                           home_device, home_device_rw,
3179                                           srv_device, srv_device_rw) < 0)
3180                                 _exit(EXIT_FAILURE);
3181
3182                         /* Turn directory into bind mount */
3183                         if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REC, NULL) < 0) {
3184                                 log_error_errno(errno, "Failed to make bind mount: %m");
3185                                 _exit(EXIT_FAILURE);
3186                         }
3187
3188                         r = setup_volatile(arg_directory);
3189                         if (r < 0)
3190                                 _exit(EXIT_FAILURE);
3191
3192                         if (setup_volatile_state(arg_directory) < 0)
3193                                 _exit(EXIT_FAILURE);
3194
3195                         r = base_filesystem_create(arg_directory);
3196                         if (r < 0)
3197                                 _exit(EXIT_FAILURE);
3198
3199                         if (arg_read_only) {
3200                                 r = bind_remount_recursive(arg_directory, true);
3201                                 if (r < 0) {
3202                                         log_error_errno(r, "Failed to make tree read-only: %m");
3203                                         _exit(EXIT_FAILURE);
3204                                 }
3205                         }
3206
3207                         if (mount_all(arg_directory) < 0)
3208                                 _exit(EXIT_FAILURE);
3209
3210                         if (copy_devnodes(arg_directory) < 0)
3211                                 _exit(EXIT_FAILURE);
3212
3213                         if (setup_ptmx(arg_directory) < 0)
3214                                 _exit(EXIT_FAILURE);
3215
3216                         dev_setup(arg_directory);
3217
3218                         if (setup_seccomp() < 0)
3219                                 _exit(EXIT_FAILURE);
3220
3221                         if (setup_dev_console(arg_directory, console) < 0)
3222                                 _exit(EXIT_FAILURE);
3223
3224                         if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
3225                                 _exit(EXIT_FAILURE);
3226
3227                         kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
3228
3229                         if (setup_boot_id(arg_directory) < 0)
3230                                 _exit(EXIT_FAILURE);
3231
3232                         if (setup_timezone(arg_directory) < 0)
3233                                 _exit(EXIT_FAILURE);
3234
3235                         if (setup_resolv_conf(arg_directory) < 0)
3236                                 _exit(EXIT_FAILURE);
3237
3238                         if (setup_journal(arg_directory) < 0)
3239                                 _exit(EXIT_FAILURE);
3240
3241                         if (mount_binds(arg_directory, arg_bind, false) < 0)
3242                                 _exit(EXIT_FAILURE);
3243
3244                         if (mount_binds(arg_directory, arg_bind_ro, true) < 0)
3245                                 _exit(EXIT_FAILURE);
3246
3247                         if (mount_tmpfs(arg_directory) < 0)
3248                                 _exit(EXIT_FAILURE);
3249
3250                         /* Tell the parent that we are ready, and that
3251                          * it can cgroupify us so that we lack access
3252                          * to certain devices and resources. */
3253                         (void)barrier_place(&barrier);
3254
3255                         if (chdir(arg_directory) < 0) {
3256                                 log_error_errno(errno, "chdir(%s) failed: %m", arg_directory);
3257                                 _exit(EXIT_FAILURE);
3258                         }
3259
3260                         if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) {
3261                                 log_error_errno(errno, "mount(MS_MOVE) failed: %m");
3262                                 _exit(EXIT_FAILURE);
3263                         }
3264
3265                         if (chroot(".") < 0) {
3266                                 log_error_errno(errno, "chroot() failed: %m");
3267                                 _exit(EXIT_FAILURE);
3268                         }
3269
3270                         if (chdir("/") < 0) {
3271                                 log_error_errno(errno, "chdir() failed: %m");
3272                                 _exit(EXIT_FAILURE);
3273                         }
3274
3275                         umask(0022);
3276
3277                         if (arg_private_network)
3278                                 loopback_setup();
3279
3280                         if (drop_capabilities() < 0) {
3281                                 log_error_errno(errno, "drop_capabilities() failed: %m");
3282                                 _exit(EXIT_FAILURE);
3283                         }
3284
3285                         r = change_uid_gid(&home);
3286                         if (r < 0)
3287                                 _exit(EXIT_FAILURE);
3288
3289                         if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
3290                             (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
3291                             (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
3292                                 log_oom();
3293                                 _exit(EXIT_FAILURE);
3294                         }
3295
3296                         if (!sd_id128_equal(arg_uuid, SD_ID128_NULL)) {
3297                                 char as_uuid[37];
3298
3299                                 if (asprintf((char**)(envp + n_env++), "container_uuid=%s", id128_format_as_uuid(arg_uuid, as_uuid)) < 0) {
3300                                         log_oom();
3301                                         _exit(EXIT_FAILURE);
3302                                 }
3303                         }
3304
3305                         if (fdset_size(fds) > 0) {
3306                                 r = fdset_cloexec(fds, false);
3307                                 if (r < 0) {
3308                                         log_error_errno(r, "Failed to unset O_CLOEXEC for file descriptors.");
3309                                         _exit(EXIT_FAILURE);
3310                                 }
3311
3312                                 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", n_fd_passed) < 0) ||
3313                                     (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0)) {
3314                                         log_oom();
3315                                         _exit(EXIT_FAILURE);
3316                                 }
3317                         }
3318
3319                         setup_hostname();
3320
3321                         if (arg_personality != 0xffffffffLU) {
3322                                 if (personality(arg_personality) < 0) {
3323                                         log_error_errno(errno, "personality() failed: %m");
3324                                         _exit(EXIT_FAILURE);
3325                                 }
3326                         } else if (secondary) {
3327                                 if (personality(PER_LINUX32) < 0) {
3328                                         log_error_errno(errno, "personality() failed: %m");
3329                                         _exit(EXIT_FAILURE);
3330                                 }
3331                         }
3332
3333 #ifdef HAVE_SELINUX
3334                         if (arg_selinux_context)
3335                                 if (setexeccon((security_context_t) arg_selinux_context) < 0) {
3336                                         log_error_errno(errno, "setexeccon(\"%s\") failed: %m", arg_selinux_context);
3337                                         _exit(EXIT_FAILURE);
3338                                 }
3339 #endif
3340
3341                         if (!strv_isempty(arg_setenv)) {
3342                                 char **n;
3343
3344                                 n = strv_env_merge(2, envp, arg_setenv);
3345                                 if (!n) {
3346                                         log_oom();
3347                                         _exit(EXIT_FAILURE);
3348                                 }
3349
3350                                 env_use = n;
3351                         } else
3352                                 env_use = (char**) envp;
3353
3354                         /* Wait until the parent is ready with the setup, too... */
3355                         if (!barrier_place_and_sync(&barrier))
3356                                 _exit(EXIT_FAILURE);
3357
3358                         if (arg_boot) {
3359                                 char **a;
3360                                 size_t l;
3361
3362                                 /* Automatically search for the init system */
3363
3364                                 l = 1 + argc - optind;
3365                                 a = newa(char*, l + 1);
3366                                 memcpy(a + 1, argv + optind, l * sizeof(char*));
3367
3368                                 a[0] = (char*) "/usr/lib/systemd/systemd";
3369                                 execve(a[0], a, env_use);
3370
3371                                 a[0] = (char*) "/lib/systemd/systemd";
3372                                 execve(a[0], a, env_use);
3373
3374                                 a[0] = (char*) "/sbin/init";
3375                                 execve(a[0], a, env_use);
3376                         } else if (argc > optind)
3377                                 execvpe(argv[optind], argv + optind, env_use);
3378                         else {
3379                                 chdir(home ? home : "/root");
3380                                 execle("/bin/bash", "-bash", NULL, env_use);
3381                                 execle("/bin/sh", "-sh", NULL, env_use);
3382                         }
3383
3384                         log_error_errno(errno, "execv() failed: %m");
3385                         _exit(EXIT_FAILURE);
3386                 }
3387
3388                 barrier_set_role(&barrier, BARRIER_PARENT);
3389                 fdset_free(fds);
3390                 fds = NULL;
3391
3392                 /* wait for child-setup to be done */
3393                 if (barrier_place_and_sync(&barrier)) {
3394                         _cleanup_event_unref_ sd_event *event = NULL;
3395                         _cleanup_(pty_forward_freep) PTYForward *forward = NULL;
3396                         int ifi = 0;
3397
3398                         r = move_network_interfaces(pid);
3399                         if (r < 0)
3400                                 goto finish;
3401
3402                         r = setup_veth(pid, veth_name, &ifi);
3403                         if (r < 0)
3404                                 goto finish;
3405
3406                         r = setup_bridge(veth_name, &ifi);
3407                         if (r < 0)
3408                                 goto finish;
3409
3410                         r = setup_macvlan(pid);
3411                         if (r < 0)
3412                                 goto finish;
3413
3414                         r = register_machine(pid, ifi);
3415                         if (r < 0)
3416                                 goto finish;
3417
3418                         /* Block SIGCHLD here, before notifying child.
3419                          * process_pty() will handle it with the other signals. */
3420                         r = sigprocmask(SIG_BLOCK, &mask_chld, NULL);
3421                         if (r < 0)
3422                                 goto finish;
3423
3424                         /* Reset signal to default */
3425                         r = default_signals(SIGCHLD, -1);
3426                         if (r < 0)
3427                                 goto finish;
3428
3429                         /* Notify the child that the parent is ready with all
3430                          * its setup, and that the child can now hand over
3431                          * control to the code to run inside the container. */
3432                         (void)barrier_place(&barrier);
3433
3434                         r = sd_event_new(&event);
3435                         if (r < 0) {
3436                                 log_error_errno(r, "Failed to get default event source: %m");
3437                                 goto finish;
3438                         }
3439
3440                         if (arg_boot) {
3441                                 /* Try to kill the init system on SIGINT or SIGTERM */
3442                                 sd_event_add_signal(event, NULL, SIGINT, on_orderly_shutdown, UINT32_TO_PTR(pid));
3443                                 sd_event_add_signal(event, NULL, SIGTERM, on_orderly_shutdown, UINT32_TO_PTR(pid));
3444                         } else {
3445                                 /* Immediately exit */
3446                                 sd_event_add_signal(event, NULL, SIGINT, NULL, NULL);
3447                                 sd_event_add_signal(event, NULL, SIGTERM, NULL, NULL);
3448                         }
3449
3450                         /* simply exit on sigchld */
3451                         sd_event_add_signal(event, NULL, SIGCHLD, NULL, NULL);
3452
3453                         r = pty_forward_new(event, master, &forward);
3454                         if (r < 0) {
3455                                 log_error_errno(r, "Failed to create PTY forwarder: %m");
3456                                 goto finish;
3457                         }
3458
3459                         r = sd_event_loop(event);
3460                         if (r < 0) {
3461                                 log_error_errno(r, "Failed to run event loop: %m");
3462                                 goto finish;
3463                         }
3464
3465                         forward = pty_forward_free(forward);
3466
3467                         if (!arg_quiet)
3468                                 putc('\n', stdout);
3469
3470                         /* Kill if it is not dead yet anyway */
3471                         terminate_machine(pid);
3472                 }
3473
3474                 /* Normally redundant, but better safe than sorry */
3475                 kill(pid, SIGKILL);
3476
3477                 r = wait_for_container(pid, &container_status);
3478                 pid = 0;
3479
3480                 if (r < 0)
3481                         /* We failed to wait for the container, or the
3482                          * container exited abnormally */
3483                         goto finish;
3484                 else if (r > 0 || container_status == CONTAINER_TERMINATED){
3485                         /* The container exited with a non-zero
3486                          * status, or with zero status and no reboot
3487                          * was requested. */
3488                         ret = r;
3489                         break;
3490                 }
3491
3492                 /* CONTAINER_REBOOTED, loop again */
3493
3494                 if (arg_keep_unit) {
3495                         /* Special handling if we are running as a
3496                          * service: instead of simply restarting the
3497                          * machine we want to restart the entire
3498                          * service, so let's inform systemd about this
3499                          * with the special exit code 133. The service
3500                          * file uses RestartForceExitStatus=133 so
3501                          * that this results in a full nspawn
3502                          * restart. This is necessary since we might
3503                          * have cgroup parameters set we want to have
3504                          * flushed out. */
3505                         ret = 133;
3506                         r = 0;
3507                         break;
3508                 }
3509         }
3510
3511 finish:
3512         sd_notify(false,
3513                   "STOPPING=1\n"
3514                   "STATUS=Terminating...");
3515
3516         loop_remove(loop_nr, &image_fd);
3517
3518         if (pid > 0)
3519                 kill(pid, SIGKILL);
3520
3521         if (remove_subvol && arg_directory) {
3522                 int k;
3523
3524                 k = btrfs_subvol_remove(arg_directory);
3525                 if (k < 0)
3526                         log_warning_errno(k, "Cannot remove subvolume '%s', ignoring: %m", arg_directory);
3527         }
3528
3529         free(arg_directory);
3530         free(arg_template);
3531         free(arg_image);
3532         free(arg_machine);
3533         free(arg_user);
3534         strv_free(arg_setenv);
3535         strv_free(arg_network_interfaces);
3536         strv_free(arg_network_macvlan);
3537         strv_free(arg_bind);
3538         strv_free(arg_bind_ro);
3539         strv_free(arg_tmpfs);
3540
3541         return r < 0 ? EXIT_FAILURE : ret;
3542 }