chiark / gitweb /
nspawn: fallback on bind mount when mknod fails
[elogind.git] / src / nspawn / nspawn.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4   This file is part of systemd.
5
6   Copyright 2010 Lennart Poettering
7
8   systemd is free software; you can redistribute it and/or modify it
9   under the terms of the GNU Lesser General Public License as published by
10   the Free Software Foundation; either version 2.1 of the License, or
11   (at your option) any later version.
12
13   systemd is distributed in the hope that it will be useful, but
14   WITHOUT ANY WARRANTY; without even the implied warranty of
15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16   Lesser General Public License for more details.
17
18   You should have received a copy of the GNU Lesser General Public License
19   along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <signal.h>
23 #include <sched.h>
24 #include <unistd.h>
25 #include <sys/types.h>
26 #include <sys/mount.h>
27 #include <stdlib.h>
28 #include <string.h>
29 #include <stdio.h>
30 #include <errno.h>
31 #include <sys/prctl.h>
32 #include <getopt.h>
33 #include <grp.h>
34 #include <linux/fs.h>
35 #include <sys/socket.h>
36 #include <linux/netlink.h>
37 #include <net/if.h>
38 #include <linux/veth.h>
39 #include <sys/personality.h>
40 #include <linux/loop.h>
41 #include <sys/file.h>
42
43 #ifdef HAVE_SELINUX
44 #include <selinux/selinux.h>
45 #endif
46
47 #ifdef HAVE_SECCOMP
48 #include <seccomp.h>
49 #endif
50
51 #ifdef HAVE_BLKID
52 #include <blkid/blkid.h>
53 #endif
54
55 #include "sd-daemon.h"
56 #include "sd-bus.h"
57 #include "sd-id128.h"
58 #include "sd-rtnl.h"
59 #include "log.h"
60 #include "util.h"
61 #include "mkdir.h"
62 #include "macro.h"
63 #include "missing.h"
64 #include "cgroup-util.h"
65 #include "strv.h"
66 #include "path-util.h"
67 #include "loopback-setup.h"
68 #include "dev-setup.h"
69 #include "fdset.h"
70 #include "build.h"
71 #include "fileio.h"
72 #include "bus-util.h"
73 #include "bus-error.h"
74 #include "ptyfwd.h"
75 #include "env-util.h"
76 #include "rtnl-util.h"
77 #include "udev-util.h"
78 #include "blkid-util.h"
79 #include "gpt.h"
80 #include "siphash24.h"
81 #include "copy.h"
82 #include "base-filesystem.h"
83 #include "barrier.h"
84 #include "event-util.h"
85 #include "capability.h"
86 #include "cap-list.h"
87 #include "btrfs-util.h"
88 #include "machine-image.h"
89 #include "list.h"
90 #include "in-addr-util.h"
91 #include "fw-util.h"
92 #include "local-addresses.h"
93
94 #ifdef HAVE_SECCOMP
95 #include "seccomp-util.h"
96 #endif
97
/* One port forwarding rule, created for each -p/--port= option on the command
 * line. Entries are linked into the list headed by arg_expose_ports. */
typedef struct ExposePort {
        /* IPPROTO_TCP or IPPROTO_UDP; the option parser picks TCP when no
         * "tcp:"/"udp:" prefix is given. */
        int protocol;
        /* Port on the host side; the option parser rejects 0. */
        uint16_t host_port;
        /* Port on the container side; the option parser rejects 0. */
        uint16_t container_port;
        /* List linkage, used through the LIST_* macros with the name "ports". */
        LIST_FIELDS(struct ExposePort, ports);
} ExposePort;
104
/* Outcome of a container run.
 * NOTE(review): no user of this enum is visible in this part of the file;
 * presumably it distinguishes a final exit from a reboot request - confirm
 * against the code that waits for the container. */
typedef enum ContainerStatus {
        CONTAINER_TERMINATED,
        CONTAINER_REBOOTED
} ContainerStatus;
109
/* Journal linking mode selected with --link-journal= (or -j). The option
 * values "no", "auto", "host" and "guest" map onto the members below; the
 * "try-host"/"try-guest" spellings select LINK_HOST/LINK_GUEST and
 * additionally set arg_link_journal_try. */
typedef enum LinkJournal {
        LINK_NO,
        LINK_AUTO,
        LINK_HOST,
        LINK_GUEST
} LinkJournal;
116
/* Volatile mode selected with --volatile[=MODE]: the bare option or a true
 * boolean gives VOLATILE_YES, a false boolean gives VOLATILE_NO, and the
 * literal "state" gives VOLATILE_STATE. */
typedef enum Volatile {
        VOLATILE_NO,
        VOLATILE_YES,
        VOLATILE_STATE,
} Volatile;
122
/* Command line state. The initializers below are the defaults that apply when
 * the corresponding option is not given; most of these are assigned while
 * parse_argv() walks the options. */

/* Paths set through set_sanitized_path() (-D, --template=) and the -u user. */
static char *arg_directory = NULL;
static char *arg_template = NULL;
static char *arg_user = NULL;
/* Machine UUID from --uuid=; all-zero unless set. */
static sd_id128_t arg_uuid = {};
static char *arg_machine = NULL;
/* The following const char* settings point straight into argv (optarg) and
 * are therefore not owned by this file. */
static const char *arg_selinux_context = NULL;
static const char *arg_selinux_apifs_context = NULL;
static const char *arg_slice = NULL;
static bool arg_private_network = false;
static bool arg_read_only = false;
static bool arg_boot = false;
static bool arg_ephemeral = false;
static LinkJournal arg_link_journal = LINK_AUTO;
static bool arg_link_journal_try = false;
/* Default capability set, one bit per capability (1ULL << CAP_xxx).
 * --capability= and --drop-capability= are documented in help() as adding to
 * and removing from this default; the code that merges them in is not visible
 * in this part of the file. */
static uint64_t arg_retain =
        (1ULL << CAP_CHOWN) |
        (1ULL << CAP_DAC_OVERRIDE) |
        (1ULL << CAP_DAC_READ_SEARCH) |
        (1ULL << CAP_FOWNER) |
        (1ULL << CAP_FSETID) |
        (1ULL << CAP_IPC_OWNER) |
        (1ULL << CAP_KILL) |
        (1ULL << CAP_LEASE) |
        (1ULL << CAP_LINUX_IMMUTABLE) |
        (1ULL << CAP_NET_BIND_SERVICE) |
        (1ULL << CAP_NET_BROADCAST) |
        (1ULL << CAP_NET_RAW) |
        (1ULL << CAP_SETGID) |
        (1ULL << CAP_SETFCAP) |
        (1ULL << CAP_SETPCAP) |
        (1ULL << CAP_SETUID) |
        (1ULL << CAP_SYS_ADMIN) |
        (1ULL << CAP_SYS_CHROOT) |
        (1ULL << CAP_SYS_NICE) |
        (1ULL << CAP_SYS_PTRACE) |
        (1ULL << CAP_SYS_TTY_CONFIG) |
        (1ULL << CAP_SYS_RESOURCE) |
        (1ULL << CAP_SYS_BOOT) |
        (1ULL << CAP_AUDIT_WRITE) |
        (1ULL << CAP_AUDIT_CONTROL) |
        (1ULL << CAP_MKNOD);
/* arg_bind, arg_bind_ro and arg_tmpfs are string vectors holding pairs of
 * consecutive entries: source/destination for the bind mounts, and
 * path/mount-options for tmpfs. */
static char **arg_bind = NULL;
static char **arg_bind_ro = NULL;
static char **arg_tmpfs = NULL;
static char **arg_setenv = NULL;
static bool arg_quiet = false;
static bool arg_share_system = false;
static bool arg_register = true;
static bool arg_keep_unit = false;
/* Any of the network interface options also switches arg_private_network on. */
static char **arg_network_interfaces = NULL;
static char **arg_network_macvlan = NULL;
static char **arg_network_ipvlan = NULL;
static bool arg_network_veth = false;
static const char *arg_network_bridge = NULL;
/* 0xffffffffLU is the value parse_argv() treats as "unknown personality", so
 * the default here presumably means "not requested" - confirm at the use site. */
static unsigned long arg_personality = 0xffffffffLU;
static char *arg_image = NULL;
static Volatile arg_volatile = VOLATILE_NO;
/* Head of the list of ExposePort entries built from -p/--port=. */
static ExposePort *arg_expose_ports = NULL;
static char **arg_property = NULL;
/* UID base and number of UIDs parsed from --private-users[=UIDBASE[:NUIDS]]. */
static uid_t arg_uid_shift = UID_INVALID, arg_uid_range = 0x10000U;
static bool arg_userns = false;
static int arg_kill_signal = 0;
185
186 static void help(void) {
187         printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
188                "Spawn a minimal namespace container for debugging, testing and building.\n\n"
189                "  -h --help                 Show this help\n"
190                "     --version              Print version string\n"
191                "  -q --quiet                Do not show status information\n"
192                "  -D --directory=PATH       Root directory for the container\n"
193                "     --template=PATH        Initialize root directory from template directory,\n"
194                "                            if missing\n"
195                "  -x --ephemeral            Run container with snapshot of root directory, and\n"
196                "                            remove it after exit\n"
197                "  -i --image=PATH           File system device or disk image for the container\n"
198                "  -b --boot                 Boot up full system (i.e. invoke init)\n"
199                "  -u --user=USER            Run the command under specified user or uid\n"
200                "  -M --machine=NAME         Set the machine name for the container\n"
201                "     --uuid=UUID            Set a specific machine UUID for the container\n"
202                "  -S --slice=SLICE          Place the container in the specified slice\n"
203                "     --property=NAME=VALUE  Set scope unit property\n"
204                "     --private-network      Disable network in container\n"
205                "     --network-interface=INTERFACE\n"
206                "                            Assign an existing network interface to the\n"
207                "                            container\n"
208                "     --network-macvlan=INTERFACE\n"
209                "                            Create a macvlan network interface based on an\n"
210                "                            existing network interface to the container\n"
211                "     --network-ipvlan=INTERFACE\n"
212                "                            Create a ipvlan network interface based on an\n"
213                "                            existing network interface to the container\n"
214                "  -n --network-veth         Add a virtual ethernet connection between host\n"
215                "                            and container\n"
216                "     --network-bridge=INTERFACE\n"
217                "                            Add a virtual ethernet connection between host\n"
218                "                            and container and add it to an existing bridge on\n"
219                "                            the host\n"
220                "     --private-users[=UIDBASE[:NUIDS]]\n"
221                "                            Run within user namespace\n"
222                "  -p --port=[PROTOCOL:]HOSTPORT[:CONTAINERPORT]\n"
223                "                            Expose a container IP port on the host\n"
224                "  -Z --selinux-context=SECLABEL\n"
225                "                            Set the SELinux security context to be used by\n"
226                "                            processes in the container\n"
227                "  -L --selinux-apifs-context=SECLABEL\n"
228                "                            Set the SELinux security context to be used by\n"
229                "                            API/tmpfs file systems in the container\n"
230                "     --capability=CAP       In addition to the default, retain specified\n"
231                "                            capability\n"
232                "     --drop-capability=CAP  Drop the specified capability from the default set\n"
233                "     --kill-signal=SIGNAL   Select signal to use for shutting down PID 1\n"
234                "     --link-journal=MODE    Link up guest journal, one of no, auto, guest, host,\n"
235                "                            try-guest, try-host\n"
236                "  -j                        Equivalent to --link-journal=try-guest\n"
237                "     --read-only            Mount the root directory read-only\n"
238                "     --bind=PATH[:PATH]     Bind mount a file or directory from the host into\n"
239                "                            the container\n"
240                "     --bind-ro=PATH[:PATH]  Similar, but creates a read-only bind mount\n"
241                "     --tmpfs=PATH:[OPTIONS] Mount an empty tmpfs to the specified directory\n"
242                "     --setenv=NAME=VALUE    Pass an environment variable to PID 1\n"
243                "     --share-system         Share system namespaces with host\n"
244                "     --register=BOOLEAN     Register container as machine\n"
245                "     --keep-unit            Do not register a scope for the machine, reuse\n"
246                "                            the service unit nspawn is running in\n"
247                "     --volatile[=MODE]      Run the system in volatile mode\n"
248                , program_invocation_short_name);
249 }
250
/* Replaces the string stored in *b with a cleaned-up form of path.
 *
 * The path is first resolved with canonicalize_file_name(). If that fails
 * because the path does not exist (ENOENT), it is instead made absolute
 * relative to the current working directory, so not-yet-existing targets are
 * still accepted. The result is passed through path_kill_slashes() before
 * being stored (presumably collapsing redundant slashes - see path-util).
 *
 * The previous value of *b is freed; on failure *b is left untouched.
 *
 * Returns 0 on success, -errno if resolution fails for any reason other than
 * ENOENT, or -ENOMEM if the absolute-path fallback cannot be produced. */
static int set_sanitized_path(char **b, const char *path) {
        char *resolved;

        assert(b);
        assert(path);

        resolved = canonicalize_file_name(path);

        /* Anything but "does not exist" is a hard error. */
        if (!resolved && errno != ENOENT)
                return -errno;

        if (!resolved) {
                resolved = path_make_absolute_cwd(path);
                if (!resolved)
                        return -ENOMEM;
        }

        free(*b);
        *b = path_kill_slashes(resolved);

        return 0;
}
271
272 static int parse_argv(int argc, char *argv[]) {
273
274         enum {
275                 ARG_VERSION = 0x100,
276                 ARG_PRIVATE_NETWORK,
277                 ARG_UUID,
278                 ARG_READ_ONLY,
279                 ARG_CAPABILITY,
280                 ARG_DROP_CAPABILITY,
281                 ARG_LINK_JOURNAL,
282                 ARG_BIND,
283                 ARG_BIND_RO,
284                 ARG_TMPFS,
285                 ARG_SETENV,
286                 ARG_SHARE_SYSTEM,
287                 ARG_REGISTER,
288                 ARG_KEEP_UNIT,
289                 ARG_NETWORK_INTERFACE,
290                 ARG_NETWORK_MACVLAN,
291                 ARG_NETWORK_IPVLAN,
292                 ARG_NETWORK_BRIDGE,
293                 ARG_PERSONALITY,
294                 ARG_VOLATILE,
295                 ARG_TEMPLATE,
296                 ARG_PROPERTY,
297                 ARG_PRIVATE_USERS,
298                 ARG_KILL_SIGNAL,
299         };
300
301         static const struct option options[] = {
302                 { "help",                  no_argument,       NULL, 'h'                   },
303                 { "version",               no_argument,       NULL, ARG_VERSION           },
304                 { "directory",             required_argument, NULL, 'D'                   },
305                 { "template",              required_argument, NULL, ARG_TEMPLATE          },
306                 { "ephemeral",             no_argument,       NULL, 'x'                   },
307                 { "user",                  required_argument, NULL, 'u'                   },
308                 { "private-network",       no_argument,       NULL, ARG_PRIVATE_NETWORK   },
309                 { "boot",                  no_argument,       NULL, 'b'                   },
310                 { "uuid",                  required_argument, NULL, ARG_UUID              },
311                 { "read-only",             no_argument,       NULL, ARG_READ_ONLY         },
312                 { "capability",            required_argument, NULL, ARG_CAPABILITY        },
313                 { "drop-capability",       required_argument, NULL, ARG_DROP_CAPABILITY   },
314                 { "link-journal",          required_argument, NULL, ARG_LINK_JOURNAL      },
315                 { "bind",                  required_argument, NULL, ARG_BIND              },
316                 { "bind-ro",               required_argument, NULL, ARG_BIND_RO           },
317                 { "tmpfs",                 required_argument, NULL, ARG_TMPFS             },
318                 { "machine",               required_argument, NULL, 'M'                   },
319                 { "slice",                 required_argument, NULL, 'S'                   },
320                 { "setenv",                required_argument, NULL, ARG_SETENV            },
321                 { "selinux-context",       required_argument, NULL, 'Z'                   },
322                 { "selinux-apifs-context", required_argument, NULL, 'L'                   },
323                 { "quiet",                 no_argument,       NULL, 'q'                   },
324                 { "share-system",          no_argument,       NULL, ARG_SHARE_SYSTEM      },
325                 { "register",              required_argument, NULL, ARG_REGISTER          },
326                 { "keep-unit",             no_argument,       NULL, ARG_KEEP_UNIT         },
327                 { "network-interface",     required_argument, NULL, ARG_NETWORK_INTERFACE },
328                 { "network-macvlan",       required_argument, NULL, ARG_NETWORK_MACVLAN   },
329                 { "network-ipvlan",        required_argument, NULL, ARG_NETWORK_IPVLAN    },
330                 { "network-veth",          no_argument,       NULL, 'n'                   },
331                 { "network-bridge",        required_argument, NULL, ARG_NETWORK_BRIDGE    },
332                 { "personality",           required_argument, NULL, ARG_PERSONALITY       },
333                 { "image",                 required_argument, NULL, 'i'                   },
334                 { "volatile",              optional_argument, NULL, ARG_VOLATILE          },
335                 { "port",                  required_argument, NULL, 'p'                   },
336                 { "property",              required_argument, NULL, ARG_PROPERTY          },
337                 { "private-users",         optional_argument, NULL, ARG_PRIVATE_USERS     },
338                 { "kill-signal",           required_argument, NULL, ARG_KILL_SIGNAL       },
339                 {}
340         };
341
342         int c, r;
343         uint64_t plus = 0, minus = 0;
344
345         assert(argc >= 0);
346         assert(argv);
347
348         while ((c = getopt_long(argc, argv, "+hD:u:bL:M:jS:Z:qi:xp:n", options, NULL)) >= 0)
349
350                 switch (c) {
351
352                 case 'h':
353                         help();
354                         return 0;
355
356                 case ARG_VERSION:
357                         puts(PACKAGE_STRING);
358                         puts(SYSTEMD_FEATURES);
359                         return 0;
360
361                 case 'D':
362                         r = set_sanitized_path(&arg_directory, optarg);
363                         if (r < 0)
364                                 return log_error_errno(r, "Invalid root directory: %m");
365
366                         break;
367
368                 case ARG_TEMPLATE:
369                         r = set_sanitized_path(&arg_template, optarg);
370                         if (r < 0)
371                                 return log_error_errno(r, "Invalid template directory: %m");
372
373                         break;
374
375                 case 'i':
376                         r = set_sanitized_path(&arg_image, optarg);
377                         if (r < 0)
378                                 return log_error_errno(r, "Invalid image path: %m");
379
380                         break;
381
382                 case 'x':
383                         arg_ephemeral = true;
384                         break;
385
386                 case 'u':
387                         free(arg_user);
388                         arg_user = strdup(optarg);
389                         if (!arg_user)
390                                 return log_oom();
391
392                         break;
393
394                 case ARG_NETWORK_BRIDGE:
395                         arg_network_bridge = optarg;
396
397                         /* fall through */
398
399                 case 'n':
400                         arg_network_veth = true;
401                         arg_private_network = true;
402                         break;
403
404                 case ARG_NETWORK_INTERFACE:
405                         if (strv_extend(&arg_network_interfaces, optarg) < 0)
406                                 return log_oom();
407
408                         arg_private_network = true;
409                         break;
410
411                 case ARG_NETWORK_MACVLAN:
412                         if (strv_extend(&arg_network_macvlan, optarg) < 0)
413                                 return log_oom();
414
415                         arg_private_network = true;
416                         break;
417
418                 case ARG_NETWORK_IPVLAN:
419                         if (strv_extend(&arg_network_ipvlan, optarg) < 0)
420                                 return log_oom();
421
422                         /* fall through */
423
424                 case ARG_PRIVATE_NETWORK:
425                         arg_private_network = true;
426                         break;
427
428                 case 'b':
429                         arg_boot = true;
430                         break;
431
432                 case ARG_UUID:
433                         r = sd_id128_from_string(optarg, &arg_uuid);
434                         if (r < 0) {
435                                 log_error("Invalid UUID: %s", optarg);
436                                 return r;
437                         }
438                         break;
439
440                 case 'S':
441                         arg_slice = optarg;
442                         break;
443
444                 case 'M':
445                         if (isempty(optarg)) {
446                                 free(arg_machine);
447                                 arg_machine = NULL;
448                         } else {
449                                 if (!machine_name_is_valid(optarg)) {
450                                         log_error("Invalid machine name: %s", optarg);
451                                         return -EINVAL;
452                                 }
453
454                                 r = free_and_strdup(&arg_machine, optarg);
455                                 if (r < 0)
456                                         return log_oom();
457
458                                 break;
459                         }
460
461                 case 'Z':
462                         arg_selinux_context = optarg;
463                         break;
464
465                 case 'L':
466                         arg_selinux_apifs_context = optarg;
467                         break;
468
469                 case ARG_READ_ONLY:
470                         arg_read_only = true;
471                         break;
472
473                 case ARG_CAPABILITY:
474                 case ARG_DROP_CAPABILITY: {
475                         const char *state, *word;
476                         size_t length;
477
478                         FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
479                                 _cleanup_free_ char *t;
480
481                                 t = strndup(word, length);
482                                 if (!t)
483                                         return log_oom();
484
485                                 if (streq(t, "all")) {
486                                         if (c == ARG_CAPABILITY)
487                                                 plus = (uint64_t) -1;
488                                         else
489                                                 minus = (uint64_t) -1;
490                                 } else {
491                                         int cap;
492
493                                         cap = capability_from_name(t);
494                                         if (cap < 0) {
495                                                 log_error("Failed to parse capability %s.", t);
496                                                 return -EINVAL;
497                                         }
498
499                                         if (c == ARG_CAPABILITY)
500                                                 plus |= 1ULL << (uint64_t) cap;
501                                         else
502                                                 minus |= 1ULL << (uint64_t) cap;
503                                 }
504                         }
505
506                         break;
507                 }
508
509                 case 'j':
510                         arg_link_journal = LINK_GUEST;
511                         arg_link_journal_try = true;
512                         break;
513
514                 case ARG_LINK_JOURNAL:
515                         if (streq(optarg, "auto")) {
516                                 arg_link_journal = LINK_AUTO;
517                                 arg_link_journal_try = false;
518                         } else if (streq(optarg, "no")) {
519                                 arg_link_journal = LINK_NO;
520                                 arg_link_journal_try = false;
521                         } else if (streq(optarg, "guest")) {
522                                 arg_link_journal = LINK_GUEST;
523                                 arg_link_journal_try = false;
524                         } else if (streq(optarg, "host")) {
525                                 arg_link_journal = LINK_HOST;
526                                 arg_link_journal_try = false;
527                         } else if (streq(optarg, "try-guest")) {
528                                 arg_link_journal = LINK_GUEST;
529                                 arg_link_journal_try = true;
530                         } else if (streq(optarg, "try-host")) {
531                                 arg_link_journal = LINK_HOST;
532                                 arg_link_journal_try = true;
533                         } else {
534                                 log_error("Failed to parse link journal mode %s", optarg);
535                                 return -EINVAL;
536                         }
537
538                         break;
539
540                 case ARG_BIND:
541                 case ARG_BIND_RO: {
542                         _cleanup_free_ char *a = NULL, *b = NULL;
543                         char *e;
544                         char ***x;
545
546                         x = c == ARG_BIND ? &arg_bind : &arg_bind_ro;
547
548                         e = strchr(optarg, ':');
549                         if (e) {
550                                 a = strndup(optarg, e - optarg);
551                                 b = strdup(e + 1);
552                         } else {
553                                 a = strdup(optarg);
554                                 b = strdup(optarg);
555                         }
556
557                         if (!a || !b)
558                                 return log_oom();
559
560                         if (!path_is_absolute(a) || !path_is_absolute(b)) {
561                                 log_error("Invalid bind mount specification: %s", optarg);
562                                 return -EINVAL;
563                         }
564
565                         r = strv_extend(x, a);
566                         if (r < 0)
567                                 return log_oom();
568
569                         r = strv_extend(x, b);
570                         if (r < 0)
571                                 return log_oom();
572
573                         break;
574                 }
575
576                 case ARG_TMPFS: {
577                         _cleanup_free_ char *a = NULL, *b = NULL;
578                         char *e;
579
580                         e = strchr(optarg, ':');
581                         if (e) {
582                                 a = strndup(optarg, e - optarg);
583                                 b = strdup(e + 1);
584                         } else {
585                                 a = strdup(optarg);
586                                 b = strdup("mode=0755");
587                         }
588
589                         if (!a || !b)
590                                 return log_oom();
591
592                         if (!path_is_absolute(a)) {
593                                 log_error("Invalid tmpfs specification: %s", optarg);
594                                 return -EINVAL;
595                         }
596
597                         r = strv_push(&arg_tmpfs, a);
598                         if (r < 0)
599                                 return log_oom();
600
601                         a = NULL;
602
603                         r = strv_push(&arg_tmpfs, b);
604                         if (r < 0)
605                                 return log_oom();
606
607                         b = NULL;
608
609                         break;
610                 }
611
612                 case ARG_SETENV: {
613                         char **n;
614
615                         if (!env_assignment_is_valid(optarg)) {
616                                 log_error("Environment variable assignment '%s' is not valid.", optarg);
617                                 return -EINVAL;
618                         }
619
620                         n = strv_env_set(arg_setenv, optarg);
621                         if (!n)
622                                 return log_oom();
623
624                         strv_free(arg_setenv);
625                         arg_setenv = n;
626                         break;
627                 }
628
629                 case 'q':
630                         arg_quiet = true;
631                         break;
632
633                 case ARG_SHARE_SYSTEM:
634                         arg_share_system = true;
635                         break;
636
637                 case ARG_REGISTER:
638                         r = parse_boolean(optarg);
639                         if (r < 0) {
640                                 log_error("Failed to parse --register= argument: %s", optarg);
641                                 return r;
642                         }
643
644                         arg_register = r;
645                         break;
646
647                 case ARG_KEEP_UNIT:
648                         arg_keep_unit = true;
649                         break;
650
651                 case ARG_PERSONALITY:
652
653                         arg_personality = personality_from_string(optarg);
654                         if (arg_personality == 0xffffffffLU) {
655                                 log_error("Unknown or unsupported personality '%s'.", optarg);
656                                 return -EINVAL;
657                         }
658
659                         break;
660
661                 case ARG_VOLATILE:
662
663                         if (!optarg)
664                                 arg_volatile = VOLATILE_YES;
665                         else {
666                                 r = parse_boolean(optarg);
667                                 if (r < 0) {
668                                         if (streq(optarg, "state"))
669                                                 arg_volatile = VOLATILE_STATE;
670                                         else {
671                                                 log_error("Failed to parse --volatile= argument: %s", optarg);
672                                                 return r;
673                                         }
674                                 } else
675                                         arg_volatile = r ? VOLATILE_YES : VOLATILE_NO;
676                         }
677
678                         break;
679
680                 case 'p': {
681                         const char *split, *e;
682                         uint16_t container_port, host_port;
683                         int protocol;
684                         ExposePort *p;
685
686                         if ((e = startswith(optarg, "tcp:")))
687                                 protocol = IPPROTO_TCP;
688                         else if ((e = startswith(optarg, "udp:")))
689                                 protocol = IPPROTO_UDP;
690                         else {
691                                 e = optarg;
692                                 protocol = IPPROTO_TCP;
693                         }
694
695                         split = strchr(e, ':');
696                         if (split) {
697                                 char v[split - e + 1];
698
699                                 memcpy(v, e, split - e);
700                                 v[split - e] = 0;
701
702                                 r = safe_atou16(v, &host_port);
703                                 if (r < 0 || host_port <= 0) {
704                                         log_error("Failed to parse host port: %s", optarg);
705                                         return -EINVAL;
706                                 }
707
708                                 r = safe_atou16(split + 1, &container_port);
709                         } else {
710                                 r = safe_atou16(e, &container_port);
711                                 host_port = container_port;
712                         }
713
714                         if (r < 0 || container_port <= 0) {
715                                 log_error("Failed to parse host port: %s", optarg);
716                                 return -EINVAL;
717                         }
718
719                         LIST_FOREACH(ports, p, arg_expose_ports) {
720                                 if (p->protocol == protocol && p->host_port == host_port) {
721                                         log_error("Duplicate port specification: %s", optarg);
722                                         return -EINVAL;
723                                 }
724                         }
725
726                         p = new(ExposePort, 1);
727                         if (!p)
728                                 return log_oom();
729
730                         p->protocol = protocol;
731                         p->host_port = host_port;
732                         p->container_port = container_port;
733
734                         LIST_PREPEND(ports, arg_expose_ports, p);
735
736                         break;
737                 }
738
739                 case ARG_PROPERTY:
740                         if (strv_extend(&arg_property, optarg) < 0)
741                                 return log_oom();
742
743                         break;
744
745                 case ARG_PRIVATE_USERS:
746                         if (optarg) {
747                                 _cleanup_free_ char *buffer = NULL;
748                                 const char *range, *shift;
749
750                                 range = strchr(optarg, ':');
751                                 if (range) {
752                                         buffer = strndup(optarg, range - optarg);
753                                         if (!buffer)
754                                                 return log_oom();
755                                         shift = buffer;
756
757                                         range++;
758                                         if (safe_atou32(range, &arg_uid_range) < 0 || arg_uid_range <= 0) {
759                                                 log_error("Failed to parse UID range: %s", range);
760                                                 return -EINVAL;
761                                         }
762                                 } else
763                                         shift = optarg;
764
765                                 if (parse_uid(shift, &arg_uid_shift) < 0) {
766                                         log_error("Failed to parse UID: %s", optarg);
767                                         return -EINVAL;
768                                 }
769                         }
770
771                         arg_userns = true;
772                         break;
773
774                 case ARG_KILL_SIGNAL:
775                         arg_kill_signal = signal_from_string_try_harder(optarg);
776                         if (arg_kill_signal < 0) {
777                                 log_error("Cannot parse signal: %s", optarg);
778                                 return -EINVAL;
779                         }
780
781                         break;
782
783                 case '?':
784                         return -EINVAL;
785
786                 default:
787                         assert_not_reached("Unhandled option");
788                 }
789
790         if (arg_share_system)
791                 arg_register = false;
792
793         if (arg_boot && arg_share_system) {
794                 log_error("--boot and --share-system may not be combined.");
795                 return -EINVAL;
796         }
797
798         if (arg_keep_unit && cg_pid_get_owner_uid(0, NULL) >= 0) {
799                 log_error("--keep-unit may not be used when invoked from a user session.");
800                 return -EINVAL;
801         }
802
803         if (arg_directory && arg_image) {
804                 log_error("--directory= and --image= may not be combined.");
805                 return -EINVAL;
806         }
807
808         if (arg_template && arg_image) {
809                 log_error("--template= and --image= may not be combined.");
810                 return -EINVAL;
811         }
812
813         if (arg_template && !(arg_directory || arg_machine)) {
814                 log_error("--template= needs --directory= or --machine=.");
815                 return -EINVAL;
816         }
817
818         if (arg_ephemeral && arg_template) {
819                 log_error("--ephemeral and --template= may not be combined.");
820                 return -EINVAL;
821         }
822
823         if (arg_ephemeral && arg_image) {
824                 log_error("--ephemeral and --image= may not be combined.");
825                 return -EINVAL;
826         }
827
828         if (arg_ephemeral && !IN_SET(arg_link_journal, LINK_NO, LINK_AUTO)) {
829                 log_error("--ephemeral and --link-journal= may not be combined.");
830                 return -EINVAL;
831         }
832
833         if (arg_volatile != VOLATILE_NO && arg_read_only) {
834                 log_error("Cannot combine --read-only with --volatile. Note that --volatile already implies a read-only base hierarchy.");
835                 return -EINVAL;
836         }
837
838         if (arg_expose_ports && !arg_private_network) {
839                 log_error("Cannot use --port= without private networking.");
840                 return -EINVAL;
841         }
842
843         arg_retain = (arg_retain | plus | (arg_private_network ? 1ULL << CAP_NET_ADMIN : 0)) & ~minus;
844
845         if (arg_boot && arg_kill_signal <= 0)
846                 arg_kill_signal = SIGRTMIN+3;
847
848         return 1;
849 }
850
851 static int mount_all(const char *dest) {
852
853         typedef struct MountPoint {
854                 const char *what;
855                 const char *where;
856                 const char *type;
857                 const char *options;
858                 unsigned long flags;
859                 bool fatal;
860         } MountPoint;
861
862         static const MountPoint mount_table[] = {
863                 { "proc",      "/proc",     "proc",  NULL,        MS_NOSUID|MS_NOEXEC|MS_NODEV,           true  },
864                 { "/proc/sys", "/proc/sys", NULL,    NULL,        MS_BIND,                                true  },   /* Bind mount first */
865                 { NULL,        "/proc/sys", NULL,    NULL,        MS_BIND|MS_RDONLY|MS_REMOUNT,           true  },   /* Then, make it r/o */
866                 { "sysfs",     "/sys",      "sysfs", NULL,        MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true  },
867                 { "tmpfs",     "/dev",      "tmpfs", "mode=755",  MS_NOSUID|MS_STRICTATIME,               true  },
868                 { "devpts",    "/dev/pts",  "devpts","newinstance,ptmxmode=0666,mode=620,gid=" STRINGIFY(TTY_GID), MS_NOSUID|MS_NOEXEC, true },
869                 { "tmpfs",     "/dev/shm",  "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME,      true  },
870                 { "tmpfs",     "/run",      "tmpfs", "mode=755",  MS_NOSUID|MS_NODEV|MS_STRICTATIME,      true  },
871                 { "tmpfs",     "/tmp",      "tmpfs", "mode=1777", MS_STRICTATIME,                         true  },
872 #ifdef HAVE_SELINUX
873                 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND,                              false },  /* Bind mount first */
874                 { NULL,              "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT,         false },  /* Then, make it r/o */
875 #endif
876         };
877
878         unsigned k;
879         int r = 0;
880
881         for (k = 0; k < ELEMENTSOF(mount_table); k++) {
882                 _cleanup_free_ char *where = NULL, *options = NULL;
883                 const char *o;
884                 int t;
885
886                 where = strjoin(dest, "/", mount_table[k].where, NULL);
887                 if (!where)
888                         return log_oom();
889
890                 t = path_is_mount_point(where, true);
891                 if (t < 0) {
892                         log_error_errno(t, "Failed to detect whether %s is a mount point: %m", where);
893
894                         if (r == 0)
895                                 r = t;
896
897                         continue;
898                 }
899
900                 /* Skip this entry if it is not a remount. */
901                 if (mount_table[k].what && t > 0)
902                         continue;
903
904                 t = mkdir_p(where, 0755);
905                 if (t < 0) {
906                         if (mount_table[k].fatal) {
907                                log_error_errno(t, "Failed to create directory %s: %m", where);
908
909                                 if (r == 0)
910                                         r = t;
911                         } else
912                                log_warning_errno(t, "Failed to create directory %s: %m", where);
913
914                         continue;
915                 }
916
917 #ifdef HAVE_SELINUX
918                 if (arg_selinux_apifs_context &&
919                     (streq_ptr(mount_table[k].what, "tmpfs") || streq_ptr(mount_table[k].what, "devpts"))) {
920                         options = strjoin(mount_table[k].options, ",context=\"", arg_selinux_apifs_context, "\"", NULL);
921                         if (!options)
922                                 return log_oom();
923
924                         o = options;
925                 } else
926 #endif
927                         o = mount_table[k].options;
928
929                 if (arg_userns && arg_uid_shift != UID_INVALID && streq_ptr(mount_table[k].type, "tmpfs")) {
930                         char *uid_options = NULL;
931
932                         if (o)
933                                 asprintf(&uid_options, "%s,uid=" UID_FMT ",gid=" UID_FMT, o, arg_uid_shift, arg_uid_shift);
934                         else
935                                 asprintf(&uid_options, "uid=" UID_FMT ",gid=" UID_FMT, arg_uid_shift, arg_uid_shift);
936                         if (!uid_options)
937                                 return log_oom();
938
939                         free(options);
940                         o = options = uid_options;
941                 }
942
943                 if (mount(mount_table[k].what,
944                           where,
945                           mount_table[k].type,
946                           mount_table[k].flags,
947                           o) < 0) {
948
949                         if (mount_table[k].fatal) {
950                                 log_error_errno(errno, "mount(%s) failed: %m", where);
951
952                                 if (r == 0)
953                                         r = -errno;
954                         } else
955                                 log_warning_errno(errno, "mount(%s) failed: %m", where);
956                 }
957         }
958
959         return r;
960 }
961
962 static int mount_binds(const char *dest, char **l, bool ro) {
963         char **x, **y;
964
965         STRV_FOREACH_PAIR(x, y, l) {
966                 _cleanup_free_ char *where = NULL;
967                 struct stat source_st, dest_st;
968                 int r;
969
970                 if (stat(*x, &source_st) < 0)
971                         return log_error_errno(errno, "Failed to stat %s: %m", *x);
972
973                 where = strappend(dest, *y);
974                 if (!where)
975                         return log_oom();
976
977                 r = stat(where, &dest_st);
978                 if (r == 0) {
979                         if (S_ISDIR(source_st.st_mode) && !S_ISDIR(dest_st.st_mode)) {
980                                 log_error("Cannot bind mount directory %s on file %s.", *x, where);
981                                 return -EINVAL;
982                         }
983                         if (!S_ISDIR(source_st.st_mode) && S_ISDIR(dest_st.st_mode)) {
984                                 log_error("Cannot bind mount file %s on directory %s.", *x, where);
985                                 return -EINVAL;
986                         }
987                 } else if (errno == ENOENT) {
988                         r = mkdir_parents_label(where, 0755);
989                         if (r < 0)
990                                 return log_error_errno(r, "Failed to bind mount %s: %m", *x);
991                 } else {
992                         log_error_errno(errno, "Failed to bind mount %s: %m", *x);
993                         return -errno;
994                 }
995
996                 /* Create the mount point. Any non-directory file can be
997                  * mounted on any non-directory file (regular, fifo, socket,
998                  * char, block).
999                  */
1000                 if (S_ISDIR(source_st.st_mode)) {
1001                         r = mkdir_label(where, 0755);
1002                         if (r < 0 && errno != EEXIST)
1003                                 return log_error_errno(r, "Failed to create mount point %s: %m", where);
1004                 } else {
1005                         r = touch(where);
1006                         if (r < 0)
1007                                 return log_error_errno(r, "Failed to create mount point %s: %m", where);
1008                 }
1009
1010                 if (mount(*x, where, NULL, MS_BIND, NULL) < 0)
1011                         return log_error_errno(errno, "mount(%s) failed: %m", where);
1012
1013                 if (ro) {
1014                         r = bind_remount_recursive(where, true);
1015                         if (r < 0)
1016                                 return log_error_errno(r, "Read-Only bind mount failed: %m");
1017                 }
1018         }
1019
1020         return 0;
1021 }
1022
1023 static int mount_cgroup_hierarchy(const char *dest, const char *controller, const char *hierarchy, bool read_only) {
1024         char *to;
1025         int r;
1026
1027         to = strjoina(dest, "/sys/fs/cgroup/", hierarchy);
1028
1029         r = path_is_mount_point(to, false);
1030         if (r < 0)
1031                 return log_error_errno(r, "Failed to determine if %s is mounted already: %m", to);
1032         if (r > 0)
1033                 return 0;
1034
1035         mkdir_p(to, 0755);
1036
1037         /* The superblock mount options of the mount point need to be
1038          * identical to the hosts', and hence writable... */
1039         if (mount("cgroup", to, "cgroup", MS_NOSUID|MS_NOEXEC|MS_NODEV, controller) < 0)
1040                 return log_error_errno(errno, "Failed to mount to %s: %m", to);
1041
1042         /* ... hence let's only make the bind mount read-only, not the
1043          * superblock. */
1044         if (read_only) {
1045                 if (mount(NULL, to, NULL, MS_BIND|MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_RDONLY, NULL) < 0)
1046                         return log_error_errno(errno, "Failed to remount %s read-only: %m", to);
1047         }
1048         return 1;
1049 }
1050
/* Populates <dest>/sys/fs/cgroup for the container: a tmpfs is mounted
 * there, then one hierarchy per kernel controller (read-only), then the
 * "systemd" hierarchy. Our own cgroup inside the systemd hierarchy is
 * turned into a bind mount before the rest of that hierarchy and finally
 * the tmpfs itself are remounted read-only.
 *
 * Returns 0 on success, a negative errno-style value on failure. */
static int mount_cgroup(const char *dest) {
        _cleanup_set_free_free_ Set *controllers = NULL;
        _cleanup_free_ char *own_cgroup_path = NULL;
        const char *cgroup_root, *systemd_root, *systemd_own;
        int r;

        controllers = set_new(&string_hash_ops);
        if (!controllers)
                return log_oom();

        r = cg_kernel_controllers(controllers);
        if (r < 0)
                return log_error_errno(r, "Failed to determine cgroup controllers: %m");

        r = cg_pid_get_path(NULL, 0, &own_cgroup_path);
        if (r < 0)
                return log_error_errno(r, "Failed to determine our own cgroup path: %m");

        /* The tmpfs is mounted writable first so that the mount points
         * and symlinks below can be created in it. */
        cgroup_root = strjoina(dest, "/sys/fs/cgroup");
        if (mount("tmpfs", cgroup_root, "tmpfs", MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME, "mode=755") < 0)
                return log_error_errno(errno, "Failed to mount tmpfs to /sys/fs/cgroup: %m");

        /* Drain the controller set one entry at a time; each stolen string
         * is owned by this iteration and freed when it ends. */
        for (;;) {
                _cleanup_free_ char *controller = NULL, *origin = NULL, *combined = NULL;

                controller = set_steal_first(controllers);
                if (!controller)
                        break;

                /* Look at the host's /sys/fs/cgroup/<controller> to find
                 * out how the controller is laid out there. */
                origin = strappend("/sys/fs/cgroup/", controller);
                if (!origin)
                        return log_oom();

                r = readlink_malloc(origin, &combined);
                if (r == -EINVAL) {
                        /* Not a symbolic link, but directly a single cgroup hierarchy */

                        r = mount_cgroup_hierarchy(dest, controller, controller, true);
                        if (r < 0)
                                return r;

                } else if (r < 0)
                        return log_error_errno(r, "Failed to read link %s: %m", origin);
                else {
                        _cleanup_free_ char *target = NULL;

                        target = strjoin(dest, "/sys/fs/cgroup/", controller, NULL);
                        if (!target)
                                return log_oom();

                        /* A symbolic link, a combination of controllers in one hierarchy */

                        if (!filename_is_valid(combined)) {
                                log_warning("Ignoring invalid combined hierarchy %s.", combined);
                                continue;
                        }

                        /* mount_cgroup_hierarchy() returns 0 without doing
                         * anything if the combined hierarchy was already
                         * mounted for another controller. */
                        r = mount_cgroup_hierarchy(dest, combined, combined, true);
                        if (r < 0)
                                return r;

                        /* Recreate the symlink inside the container's tree. */
                        if (symlink(combined, target) < 0)
                                return log_error_errno(errno, "Failed to create symlink for combined hierarchy: %m");
                }
        }

        /* The systemd hierarchy is mounted writable at first (read_only is
         * false); it is made read-only further down. */
        r = mount_cgroup_hierarchy(dest, "name=systemd,xattr", "systemd", false);
        if (r < 0)
                return r;

        /* Make our own cgroup a (writable) bind mount */
        systemd_own = strjoina(dest, "/sys/fs/cgroup/systemd", own_cgroup_path);
        if (mount(systemd_own, systemd_own,  NULL, MS_BIND, NULL) < 0)
                return log_error_errno(errno, "Failed to turn %s into a bind mount: %m", own_cgroup_path);

        /* And then remount the systemd cgroup root read-only */
        systemd_root = strjoina(dest, "/sys/fs/cgroup/systemd");
        if (mount(NULL, systemd_root, NULL, MS_BIND|MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_RDONLY, NULL) < 0)
                return log_error_errno(errno, "Failed to mount cgroup root read-only: %m");

        /* Finally, make the tmpfs holding the mount points read-only too. */
        if (mount(NULL, cgroup_root, NULL, MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME|MS_RDONLY, "mode=755") < 0)
                return log_error_errno(errno, "Failed to remount %s read-only: %m", cgroup_root);

        return 0;
}
1136
1137 static int mount_tmpfs(const char *dest) {
1138         char **i, **o;
1139
1140         STRV_FOREACH_PAIR(i, o, arg_tmpfs) {
1141                 _cleanup_free_ char *where = NULL;
1142                 int r;
1143
1144                 where = strappend(dest, *i);
1145                 if (!where)
1146                         return log_oom();
1147
1148                 r = mkdir_label(where, 0755);
1149                 if (r < 0 && r != -EEXIST)
1150                         return log_error_errno(r, "Creating mount point for tmpfs %s failed: %m", where);
1151
1152                 if (mount("tmpfs", where, "tmpfs", MS_NODEV|MS_STRICTATIME, *o) < 0)
1153                         return log_error_errno(errno, "tmpfs mount to %s failed: %m", where);
1154         }
1155
1156         return 0;
1157 }
1158
1159 static int setup_timezone(const char *dest) {
1160         _cleanup_free_ char *where = NULL, *p = NULL, *q = NULL, *check = NULL, *what = NULL;
1161         char *z, *y;
1162         int r;
1163
1164         assert(dest);
1165
1166         /* Fix the timezone, if possible */
1167         r = readlink_malloc("/etc/localtime", &p);
1168         if (r < 0) {
1169                 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
1170                 return 0;
1171         }
1172
1173         z = path_startswith(p, "../usr/share/zoneinfo/");
1174         if (!z)
1175                 z = path_startswith(p, "/usr/share/zoneinfo/");
1176         if (!z) {
1177                 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
1178                 return 0;
1179         }
1180
1181         where = strappend(dest, "/etc/localtime");
1182         if (!where)
1183                 return log_oom();
1184
1185         r = readlink_malloc(where, &q);
1186         if (r >= 0) {
1187                 y = path_startswith(q, "../usr/share/zoneinfo/");
1188                 if (!y)
1189                         y = path_startswith(q, "/usr/share/zoneinfo/");
1190
1191                 /* Already pointing to the right place? Then do nothing .. */
1192                 if (y && streq(y, z))
1193                         return 0;
1194         }
1195
1196         check = strjoin(dest, "/usr/share/zoneinfo/", z, NULL);
1197         if (!check)
1198                 return log_oom();
1199
1200         if (access(check, F_OK) < 0) {
1201                 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
1202                 return 0;
1203         }
1204
1205         what = strappend("../usr/share/zoneinfo/", z);
1206         if (!what)
1207                 return log_oom();
1208
1209         r = mkdir_parents(where, 0755);
1210         if (r < 0) {
1211                 log_error_errno(r, "Failed to create directory for timezone info %s in container: %m", where);
1212
1213                 return 0;
1214         }
1215
1216         r = unlink(where);
1217         if (r < 0 && errno != ENOENT) {
1218                 log_error_errno(errno, "Failed to remove existing timezone info %s in container: %m", where);
1219
1220                 return 0;
1221         }
1222
1223         if (symlink(what, where) < 0) {
1224                 log_error_errno(errno, "Failed to correct timezone of container: %m");
1225                 return 0;
1226         }
1227
1228         return 0;
1229 }
1230
1231 static int setup_resolv_conf(const char *dest) {
1232         _cleanup_free_ char *where = NULL;
1233         int r;
1234
1235         assert(dest);
1236
1237         if (arg_private_network)
1238                 return 0;
1239
1240         /* Fix resolv.conf, if possible */
1241         where = strappend(dest, "/etc/resolv.conf");
1242         if (!where)
1243                 return log_oom();
1244
1245         /* We don't really care for the results of this really. If it
1246          * fails, it fails, but meh... */
1247         r = mkdir_parents(where, 0755);
1248         if (r < 0) {
1249                 log_warning_errno(r, "Failed to create parent directory for resolv.conf %s: %m", where);
1250
1251                 return 0;
1252         }
1253
1254         r = copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW, 0644, 0);
1255         if (r < 0) {
1256                 log_warning_errno(r, "Failed to copy /etc/resolv.conf to %s: %m", where);
1257
1258                 return 0;
1259         }
1260
1261         return 0;
1262 }
1263
1264 static int setup_volatile_state(const char *directory) {
1265         const char *p;
1266         int r;
1267
1268         assert(directory);
1269
1270         if (arg_volatile != VOLATILE_STATE)
1271                 return 0;
1272
1273         /* --volatile=state means we simply overmount /var
1274            with a tmpfs, and the rest read-only. */
1275
1276         r = bind_remount_recursive(directory, true);
1277         if (r < 0)
1278                 return log_error_errno(r, "Failed to remount %s read-only: %m", directory);
1279
1280         p = strjoina(directory, "/var");
1281         r = mkdir(p, 0755);
1282         if (r < 0 && errno != EEXIST)
1283                 return log_error_errno(errno, "Failed to create %s: %m", directory);
1284
1285         if (mount("tmpfs", p, "tmpfs", MS_STRICTATIME, "mode=755") < 0)
1286                 return log_error_errno(errno, "Failed to mount tmpfs to /var: %m");
1287
1288         return 0;
1289 }
1290
1291 static int setup_volatile(const char *directory) {
1292         bool tmpfs_mounted = false, bind_mounted = false;
1293         char template[] = "/tmp/nspawn-volatile-XXXXXX";
1294         const char *f, *t;
1295         int r;
1296
1297         assert(directory);
1298
1299         if (arg_volatile != VOLATILE_YES)
1300                 return 0;
1301
1302         /* --volatile=yes means we mount a tmpfs to the root dir, and
1303            the original /usr to use inside it, and that read-only. */
1304
1305         if (!mkdtemp(template))
1306                 return log_error_errno(errno, "Failed to create temporary directory: %m");
1307
1308         if (mount("tmpfs", template, "tmpfs", MS_STRICTATIME, "mode=755") < 0) {
1309                 log_error_errno(errno, "Failed to mount tmpfs for root directory: %m");
1310                 r = -errno;
1311                 goto fail;
1312         }
1313
1314         tmpfs_mounted = true;
1315
1316         f = strjoina(directory, "/usr");
1317         t = strjoina(template, "/usr");
1318
1319         r = mkdir(t, 0755);
1320         if (r < 0 && errno != EEXIST) {
1321                 log_error_errno(errno, "Failed to create %s: %m", t);
1322                 r = -errno;
1323                 goto fail;
1324         }
1325
1326         if (mount(f, t, NULL, MS_BIND|MS_REC, NULL) < 0) {
1327                 log_error_errno(errno, "Failed to create /usr bind mount: %m");
1328                 r = -errno;
1329                 goto fail;
1330         }
1331
1332         bind_mounted = true;
1333
1334         r = bind_remount_recursive(t, true);
1335         if (r < 0) {
1336                 log_error_errno(r, "Failed to remount %s read-only: %m", t);
1337                 goto fail;
1338         }
1339
1340         if (mount(template, directory, NULL, MS_MOVE, NULL) < 0) {
1341                 log_error_errno(errno, "Failed to move root mount: %m");
1342                 r = -errno;
1343                 goto fail;
1344         }
1345
1346         rmdir(template);
1347
1348         return 0;
1349
1350 fail:
1351         if (bind_mounted)
1352                 umount(t);
1353         if (tmpfs_mounted)
1354                 umount(template);
1355         rmdir(template);
1356         return r;
1357 }
1358
1359 static char* id128_format_as_uuid(sd_id128_t id, char s[37]) {
1360
1361         snprintf(s, 37,
1362                  "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
1363                  SD_ID128_FORMAT_VAL(id));
1364
1365         return s;
1366 }
1367
1368 static int setup_boot_id(const char *dest) {
1369         _cleanup_free_ char *from = NULL, *to = NULL;
1370         sd_id128_t rnd = {};
1371         char as_uuid[37];
1372         int r;
1373
1374         assert(dest);
1375
1376         if (arg_share_system)
1377                 return 0;
1378
1379         /* Generate a new randomized boot ID, so that each boot-up of
1380          * the container gets a new one */
1381
1382         from = strappend(dest, "/dev/proc-sys-kernel-random-boot-id");
1383         to = strappend(dest, "/proc/sys/kernel/random/boot_id");
1384         if (!from || !to)
1385                 return log_oom();
1386
1387         r = sd_id128_randomize(&rnd);
1388         if (r < 0)
1389                 return log_error_errno(r, "Failed to generate random boot id: %m");
1390
1391         id128_format_as_uuid(rnd, as_uuid);
1392
1393         r = write_string_file(from, as_uuid);
1394         if (r < 0)
1395                 return log_error_errno(r, "Failed to write boot id: %m");
1396
1397         if (mount(from, to, NULL, MS_BIND, NULL) < 0) {
1398                 log_error_errno(errno, "Failed to bind mount boot id: %m");
1399                 r = -errno;
1400         } else if (mount(from, to, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY, NULL))
1401                 log_warning_errno(errno, "Failed to make boot id read-only: %m");
1402
1403         unlink(from);
1404         return r;
1405 }
1406
1407 static int copy_devnodes(const char *dest) {
1408
1409         static const char devnodes[] =
1410                 "null\0"
1411                 "zero\0"
1412                 "full\0"
1413                 "random\0"
1414                 "urandom\0"
1415                 "tty\0"
1416                 "net/tun\0";
1417
1418         const char *d;
1419         int r = 0;
1420         _cleanup_umask_ mode_t u;
1421
1422         assert(dest);
1423
1424         u = umask(0000);
1425
1426         NULSTR_FOREACH(d, devnodes) {
1427                 _cleanup_free_ char *from = NULL, *to = NULL;
1428                 struct stat st;
1429
1430                 from = strappend("/dev/", d);
1431                 to = strjoin(dest, "/dev/", d, NULL);
1432                 if (!from || !to)
1433                         return log_oom();
1434
1435                 if (stat(from, &st) < 0) {
1436
1437                         if (errno != ENOENT)
1438                                 return log_error_errno(errno, "Failed to stat %s: %m", from);
1439
1440                 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
1441
1442                         log_error("%s is not a char or block device, cannot copy", from);
1443                         return -EIO;
1444
1445                 } else {
1446                         r = mkdir_parents(to, 0775);
1447                         if (r < 0) {
1448                                 log_error_errno(r, "Failed to create parent directory of %s: %m", to);
1449                                 return -r;
1450                         }
1451
1452                         if (mknod(to, st.st_mode, st.st_rdev) < 0) {
1453                                 if (errno != EPERM)
1454                                         return log_error_errno(errno, "mknod(%s) failed: %m", to);
1455
1456                                 /* Some systems abusively restrict mknod but
1457                                  * allow bind mounts. */
1458                                 r = touch(to);
1459                                 if (r < 0)
1460                                         return log_error_errno(r, "touch (%s) failed: %m", to);
1461                                 if (mount(from, to, NULL, MS_BIND, NULL) < 0)
1462                                         return log_error_errno(errno, "Both mknod and bind mount (%s) failed: %m", to);
1463                         }
1464
1465                         if (arg_userns && arg_uid_shift != UID_INVALID)
1466                                 if (lchown(to, arg_uid_shift, arg_uid_shift) < 0)
1467                                         return log_error_errno(errno, "chown() of device node %s failed: %m", to);
1468                 }
1469         }
1470
1471         return r;
1472 }
1473
1474 static int setup_ptmx(const char *dest) {
1475         _cleanup_free_ char *p = NULL;
1476
1477         p = strappend(dest, "/dev/ptmx");
1478         if (!p)
1479                 return log_oom();
1480
1481         if (symlink("pts/ptmx", p) < 0)
1482                 return log_error_errno(errno, "Failed to create /dev/ptmx symlink: %m");
1483
1484         if (arg_userns && arg_uid_shift != UID_INVALID)
1485                 if (lchown(p, arg_uid_shift, arg_uid_shift) < 0)
1486                         return log_error_errno(errno, "lchown() of symlink %s failed: %m", p);
1487
1488         return 0;
1489 }
1490
1491 static int setup_dev_console(const char *dest, const char *console) {
1492         _cleanup_umask_ mode_t u;
1493         const char *to;
1494         int r;
1495
1496         assert(dest);
1497         assert(console);
1498
1499         u = umask(0000);
1500
1501         r = chmod_and_chown(console, 0600, 0, 0);
1502         if (r < 0)
1503                 return log_error_errno(r, "Failed to correct access mode for TTY: %m");
1504
1505         /* We need to bind mount the right tty to /dev/console since
1506          * ptys can only exist on pts file systems. To have something
1507          * to bind mount things on we create a empty regular file. */
1508
1509         to = strjoina(dest, "/dev/console");
1510         r = touch(to);
1511         if (r < 0)
1512                 return log_error_errno(r, "touch() for /dev/console failed: %m");
1513
1514         if (mount(console, to, NULL, MS_BIND, NULL) < 0)
1515                 return log_error_errno(errno, "Bind mount for /dev/console failed: %m");
1516
1517         return 0;
1518 }
1519
1520 static int setup_kmsg(const char *dest, int kmsg_socket) {
1521         _cleanup_free_ char *from = NULL, *to = NULL;
1522         _cleanup_umask_ mode_t u;
1523         int r, fd, k;
1524         union {
1525                 struct cmsghdr cmsghdr;
1526                 uint8_t buf[CMSG_SPACE(sizeof(int))];
1527         } control = {};
1528         struct msghdr mh = {
1529                 .msg_control = &control,
1530                 .msg_controllen = sizeof(control),
1531         };
1532         struct cmsghdr *cmsg;
1533
1534         assert(dest);
1535         assert(kmsg_socket >= 0);
1536
1537         u = umask(0000);
1538
1539         /* We create the kmsg FIFO as /dev/kmsg, but immediately
1540          * delete it after bind mounting it to /proc/kmsg. While FIFOs
1541          * on the reading side behave very similar to /proc/kmsg,
1542          * their writing side behaves differently from /dev/kmsg in
1543          * that writing blocks when nothing is reading. In order to
1544          * avoid any problems with containers deadlocking due to this
1545          * we simply make /dev/kmsg unavailable to the container. */
1546         if (asprintf(&from, "%s/dev/kmsg", dest) < 0 ||
1547             asprintf(&to, "%s/proc/kmsg", dest) < 0)
1548                 return log_oom();
1549
1550         if (mkfifo(from, 0600) < 0)
1551                 return log_error_errno(errno, "mkfifo() for /dev/kmsg failed: %m");
1552
1553         r = chmod_and_chown(from, 0600, 0, 0);
1554         if (r < 0)
1555                 return log_error_errno(r, "Failed to correct access mode for /dev/kmsg: %m");
1556
1557         if (mount(from, to, NULL, MS_BIND, NULL) < 0)
1558                 return log_error_errno(errno, "Bind mount for /proc/kmsg failed: %m");
1559
1560         fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
1561         if (fd < 0)
1562                 return log_error_errno(errno, "Failed to open fifo: %m");
1563
1564         cmsg = CMSG_FIRSTHDR(&mh);
1565         cmsg->cmsg_level = SOL_SOCKET;
1566         cmsg->cmsg_type = SCM_RIGHTS;
1567         cmsg->cmsg_len = CMSG_LEN(sizeof(int));
1568         memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
1569
1570         mh.msg_controllen = cmsg->cmsg_len;
1571
1572         /* Store away the fd in the socket, so that it stays open as
1573          * long as we run the child */
1574         k = sendmsg(kmsg_socket, &mh, MSG_NOSIGNAL);
1575         safe_close(fd);
1576
1577         if (k < 0)
1578                 return log_error_errno(errno, "Failed to send FIFO fd: %m");
1579
1580         /* And now make the FIFO unavailable as /dev/kmsg... */
1581         unlink(from);
1582         return 0;
1583 }
1584
1585 static int send_rtnl(int send_fd) {
1586         union {
1587                 struct cmsghdr cmsghdr;
1588                 uint8_t buf[CMSG_SPACE(sizeof(int))];
1589         } control = {};
1590         struct msghdr mh = {
1591                 .msg_control = &control,
1592                 .msg_controllen = sizeof(control),
1593         };
1594         struct cmsghdr *cmsg;
1595         _cleanup_close_ int fd = -1;
1596         ssize_t k;
1597
1598         assert(send_fd >= 0);
1599
1600         if (!arg_expose_ports)
1601                 return 0;
1602
1603         fd = socket(PF_NETLINK, SOCK_RAW|SOCK_CLOEXEC|SOCK_NONBLOCK, NETLINK_ROUTE);
1604         if (fd < 0)
1605                 return log_error_errno(errno, "failed to allocate container netlink: %m");
1606
1607         cmsg = CMSG_FIRSTHDR(&mh);
1608         cmsg->cmsg_level = SOL_SOCKET;
1609         cmsg->cmsg_type = SCM_RIGHTS;
1610         cmsg->cmsg_len = CMSG_LEN(sizeof(int));
1611         memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
1612
1613         mh.msg_controllen = cmsg->cmsg_len;
1614
1615         /* Store away the fd in the socket, so that it stays open as
1616          * long as we run the child */
1617         k = sendmsg(send_fd, &mh, MSG_NOSIGNAL);
1618         if (k < 0)
1619                 return log_error_errno(errno, "Failed to send netlink fd: %m");
1620
1621         return 0;
1622 }
1623
1624 static int flush_ports(union in_addr_union *exposed) {
1625         ExposePort *p;
1626         int r, af = AF_INET;
1627
1628         assert(exposed);
1629
1630         if (!arg_expose_ports)
1631                 return 0;
1632
1633         if (in_addr_is_null(af, exposed))
1634                 return 0;
1635
1636         log_debug("Lost IP address.");
1637
1638         LIST_FOREACH(ports, p, arg_expose_ports) {
1639                 r = fw_add_local_dnat(false,
1640                                       af,
1641                                       p->protocol,
1642                                       NULL,
1643                                       NULL, 0,
1644                                       NULL, 0,
1645                                       p->host_port,
1646                                       exposed,
1647                                       p->container_port,
1648                                       NULL);
1649                 if (r < 0)
1650                         log_warning_errno(r, "Failed to modify firewall: %m");
1651         }
1652
1653         *exposed = IN_ADDR_NULL;
1654         return 0;
1655 }
1656
1657 static int expose_ports(sd_rtnl *rtnl, union in_addr_union *exposed) {
1658         _cleanup_free_ struct local_address *addresses = NULL;
1659         _cleanup_free_ char *pretty = NULL;
1660         union in_addr_union new_exposed;
1661         ExposePort *p;
1662         bool add;
1663         int af = AF_INET, r;
1664
1665         assert(exposed);
1666
1667         /* Invoked each time an address is added or removed inside the
1668          * container */
1669
1670         if (!arg_expose_ports)
1671                 return 0;
1672
1673         r = local_addresses(rtnl, 0, af, &addresses);
1674         if (r < 0)
1675                 return log_error_errno(r, "Failed to enumerate local addresses: %m");
1676
1677         add = r > 0 &&
1678                 addresses[0].family == af &&
1679                 addresses[0].scope < RT_SCOPE_LINK;
1680
1681         if (!add)
1682                 return flush_ports(exposed);
1683
1684         new_exposed = addresses[0].address;
1685         if (in_addr_equal(af, exposed, &new_exposed))
1686                 return 0;
1687
1688         in_addr_to_string(af, &new_exposed, &pretty);
1689         log_debug("New container IP is %s.", strna(pretty));
1690
1691         LIST_FOREACH(ports, p, arg_expose_ports) {
1692
1693                 r = fw_add_local_dnat(true,
1694                                       af,
1695                                       p->protocol,
1696                                       NULL,
1697                                       NULL, 0,
1698                                       NULL, 0,
1699                                       p->host_port,
1700                                       &new_exposed,
1701                                       p->container_port,
1702                                       in_addr_is_null(af, exposed) ? NULL : exposed);
1703                 if (r < 0)
1704                         log_warning_errno(r, "Failed to modify firewall: %m");
1705         }
1706
1707         *exposed = new_exposed;
1708         return 0;
1709 }
1710
1711 static int on_address_change(sd_rtnl *rtnl, sd_rtnl_message *m, void *userdata) {
1712         union in_addr_union *exposed = userdata;
1713
1714         assert(rtnl);
1715         assert(m);
1716         assert(exposed);
1717
1718         expose_ports(rtnl, exposed);
1719         return 0;
1720 }
1721
1722 static int watch_rtnl(sd_event *event, int recv_fd, union in_addr_union *exposed, sd_rtnl **ret) {
1723         union {
1724                 struct cmsghdr cmsghdr;
1725                 uint8_t buf[CMSG_SPACE(sizeof(int))];
1726         } control = {};
1727         struct msghdr mh = {
1728                 .msg_control = &control,
1729                 .msg_controllen = sizeof(control),
1730         };
1731         struct cmsghdr *cmsg;
1732         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1733         int fd, r;
1734         ssize_t k;
1735
1736         assert(event);
1737         assert(recv_fd >= 0);
1738         assert(ret);
1739
1740         if (!arg_expose_ports)
1741                 return 0;
1742
1743         k = recvmsg(recv_fd, &mh, MSG_NOSIGNAL);
1744         if (k < 0)
1745                 return log_error_errno(errno, "Failed to recv netlink fd: %m");
1746
1747         cmsg = CMSG_FIRSTHDR(&mh);
1748         assert(cmsg->cmsg_level == SOL_SOCKET);
1749         assert(cmsg->cmsg_type == SCM_RIGHTS);
1750         assert(cmsg->cmsg_len == CMSG_LEN(sizeof(int)));
1751         memcpy(&fd, CMSG_DATA(cmsg), sizeof(int));
1752
1753         r = sd_rtnl_open_fd(&rtnl, fd, 1, RTNLGRP_IPV4_IFADDR);
1754         if (r < 0) {
1755                 safe_close(fd);
1756                 return log_error_errno(r, "Failed to create rtnl object: %m");
1757         }
1758
1759         r = sd_rtnl_add_match(rtnl, RTM_NEWADDR, on_address_change, exposed);
1760         if (r < 0)
1761                 return log_error_errno(r, "Failed to subscribe to RTM_NEWADDR messages: %m");
1762
1763         r = sd_rtnl_add_match(rtnl, RTM_DELADDR, on_address_change, exposed);
1764         if (r < 0)
1765                 return log_error_errno(r, "Failed to subscribe to RTM_DELADDR messages: %m");
1766
1767         r = sd_rtnl_attach_event(rtnl, event, 0);
1768         if (r < 0)
1769                 return log_error_errno(r, "Failed to add to even loop: %m");
1770
1771         *ret = rtnl;
1772         rtnl = NULL;
1773
1774         return 0;
1775 }
1776
1777 static int setup_hostname(void) {
1778
1779         if (arg_share_system)
1780                 return 0;
1781
1782         if (sethostname_idempotent(arg_machine) < 0)
1783                 return -errno;
1784
1785         return 0;
1786 }
1787
1788 static int setup_journal(const char *directory) {
1789         sd_id128_t machine_id, this_id;
1790         _cleanup_free_ char *p = NULL, *b = NULL, *q = NULL, *d = NULL;
1791         char *id;
1792         int r;
1793
1794         /* Don't link journals in ephemeral mode */
1795         if (arg_ephemeral)
1796                 return 0;
1797
1798         p = strappend(directory, "/etc/machine-id");
1799         if (!p)
1800                 return log_oom();
1801
1802         r = read_one_line_file(p, &b);
1803         if (r == -ENOENT && arg_link_journal == LINK_AUTO)
1804                 return 0;
1805         else if (r < 0)
1806                 return log_error_errno(r, "Failed to read machine ID from %s: %m", p);
1807
1808         id = strstrip(b);
1809         if (isempty(id) && arg_link_journal == LINK_AUTO)
1810                 return 0;
1811
1812         /* Verify validity */
1813         r = sd_id128_from_string(id, &machine_id);
1814         if (r < 0)
1815                 return log_error_errno(r, "Failed to parse machine ID from %s: %m", p);
1816
1817         r = sd_id128_get_machine(&this_id);
1818         if (r < 0)
1819                 return log_error_errno(r, "Failed to retrieve machine ID: %m");
1820
1821         if (sd_id128_equal(machine_id, this_id)) {
1822                 log_full(arg_link_journal == LINK_AUTO ? LOG_WARNING : LOG_ERR,
1823                          "Host and machine ids are equal (%s): refusing to link journals", id);
1824                 if (arg_link_journal == LINK_AUTO)
1825                         return 0;
1826                 return -EEXIST;
1827         }
1828
1829         if (arg_link_journal == LINK_NO)
1830                 return 0;
1831
1832         free(p);
1833         p = strappend("/var/log/journal/", id);
1834         q = strjoin(directory, "/var/log/journal/", id, NULL);
1835         if (!p || !q)
1836                 return log_oom();
1837
1838         if (path_is_mount_point(p, false) > 0) {
1839                 if (arg_link_journal != LINK_AUTO) {
1840                         log_error("%s: already a mount point, refusing to use for journal", p);
1841                         return -EEXIST;
1842                 }
1843
1844                 return 0;
1845         }
1846
1847         if (path_is_mount_point(q, false) > 0) {
1848                 if (arg_link_journal != LINK_AUTO) {
1849                         log_error("%s: already a mount point, refusing to use for journal", q);
1850                         return -EEXIST;
1851                 }
1852
1853                 return 0;
1854         }
1855
1856         r = readlink_and_make_absolute(p, &d);
1857         if (r >= 0) {
1858                 if ((arg_link_journal == LINK_GUEST ||
1859                      arg_link_journal == LINK_AUTO) &&
1860                     path_equal(d, q)) {
1861
1862                         r = mkdir_p(q, 0755);
1863                         if (r < 0)
1864                                 log_warning_errno(errno, "Failed to create directory %s: %m", q);
1865                         return 0;
1866                 }
1867
1868                 if (unlink(p) < 0)
1869                         return log_error_errno(errno, "Failed to remove symlink %s: %m", p);
1870         } else if (r == -EINVAL) {
1871
1872                 if (arg_link_journal == LINK_GUEST &&
1873                     rmdir(p) < 0) {
1874
1875                         if (errno == ENOTDIR) {
1876                                 log_error("%s already exists and is neither a symlink nor a directory", p);
1877                                 return r;
1878                         } else {
1879                                 log_error_errno(errno, "Failed to remove %s: %m", p);
1880                                 return -errno;
1881                         }
1882                 }
1883         } else if (r != -ENOENT) {
1884                 log_error_errno(errno, "readlink(%s) failed: %m", p);
1885                 return r;
1886         }
1887
1888         if (arg_link_journal == LINK_GUEST) {
1889
1890                 if (symlink(q, p) < 0) {
1891                         if (arg_link_journal_try) {
1892                                 log_debug_errno(errno, "Failed to symlink %s to %s, skipping journal setup: %m", q, p);
1893                                 return 0;
1894                         } else {
1895                                 log_error_errno(errno, "Failed to symlink %s to %s: %m", q, p);
1896                                 return -errno;
1897                         }
1898                 }
1899
1900                 r = mkdir_p(q, 0755);
1901                 if (r < 0)
1902                         log_warning_errno(errno, "Failed to create directory %s: %m", q);
1903                 return 0;
1904         }
1905
1906         if (arg_link_journal == LINK_HOST) {
1907                 /* don't create parents here -- if the host doesn't have
1908                  * permanent journal set up, don't force it here */
1909                 r = mkdir(p, 0755);
1910                 if (r < 0) {
1911                         if (arg_link_journal_try) {
1912                                 log_debug_errno(errno, "Failed to create %s, skipping journal setup: %m", p);
1913                                 return 0;
1914                         } else {
1915                                 log_error_errno(errno, "Failed to create %s: %m", p);
1916                                 return r;
1917                         }
1918                 }
1919
1920         } else if (access(p, F_OK) < 0)
1921                 return 0;
1922
1923         if (dir_is_empty(q) == 0)
1924                 log_warning("%s is not empty, proceeding anyway.", q);
1925
1926         r = mkdir_p(q, 0755);
1927         if (r < 0) {
1928                 log_error_errno(errno, "Failed to create %s: %m", q);
1929                 return r;
1930         }
1931
1932         if (mount(p, q, NULL, MS_BIND, NULL) < 0)
1933                 return log_error_errno(errno, "Failed to bind mount journal from host into guest: %m");
1934
1935         return 0;
1936 }
1937
1938 static int drop_capabilities(void) {
1939         return capability_bounding_set_drop(~arg_retain, false);
1940 }
1941
1942 static int register_machine(pid_t pid, int local_ifindex) {
1943         _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1944         _cleanup_bus_close_unref_ sd_bus *bus = NULL;
1945         int r;
1946
1947         if (!arg_register)
1948                 return 0;
1949
1950         r = sd_bus_default_system(&bus);
1951         if (r < 0)
1952                 return log_error_errno(r, "Failed to open system bus: %m");
1953
1954         if (arg_keep_unit) {
1955                 r = sd_bus_call_method(
1956                                 bus,
1957                                 "org.freedesktop.machine1",
1958                                 "/org/freedesktop/machine1",
1959                                 "org.freedesktop.machine1.Manager",
1960                                 "RegisterMachineWithNetwork",
1961                                 &error,
1962                                 NULL,
1963                                 "sayssusai",
1964                                 arg_machine,
1965                                 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1966                                 "nspawn",
1967                                 "container",
1968                                 (uint32_t) pid,
1969                                 strempty(arg_directory),
1970                                 local_ifindex > 0 ? 1 : 0, local_ifindex);
1971         } else {
1972                 _cleanup_bus_message_unref_ sd_bus_message *m = NULL;
1973                 char **i;
1974
1975                 r = sd_bus_message_new_method_call(
1976                                 bus,
1977                                 &m,
1978                                 "org.freedesktop.machine1",
1979                                 "/org/freedesktop/machine1",
1980                                 "org.freedesktop.machine1.Manager",
1981                                 "CreateMachineWithNetwork");
1982                 if (r < 0)
1983                         return bus_log_create_error(r);
1984
1985                 r = sd_bus_message_append(
1986                                 m,
1987                                 "sayssusai",
1988                                 arg_machine,
1989                                 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1990                                 "nspawn",
1991                                 "container",
1992                                 (uint32_t) pid,
1993                                 strempty(arg_directory),
1994                                 local_ifindex > 0 ? 1 : 0, local_ifindex);
1995                 if (r < 0)
1996                         return bus_log_create_error(r);
1997
1998                 r = sd_bus_message_open_container(m, 'a', "(sv)");
1999                 if (r < 0)
2000                         return bus_log_create_error(r);
2001
2002                 if (!isempty(arg_slice)) {
2003                         r = sd_bus_message_append(m, "(sv)", "Slice", "s", arg_slice);
2004                         if (r < 0)
2005                                 return bus_log_create_error(r);
2006                 }
2007
2008                 r = sd_bus_message_append(m, "(sv)", "DevicePolicy", "s", "strict");
2009                 if (r < 0)
2010                         return bus_log_create_error(r);
2011
2012                 r = sd_bus_message_append(m, "(sv)", "DeviceAllow", "a(ss)", 9,
2013                                           /* Allow the container to
2014                                            * access and create the API
2015                                            * device nodes, so that
2016                                            * PrivateDevices= in the
2017                                            * container can work
2018                                            * fine */
2019                                           "/dev/null", "rwm",
2020                                           "/dev/zero", "rwm",
2021                                           "/dev/full", "rwm",
2022                                           "/dev/random", "rwm",
2023                                           "/dev/urandom", "rwm",
2024                                           "/dev/tty", "rwm",
2025                                           "/dev/net/tun", "rwm",
2026                                           /* Allow the container
2027                                            * access to ptys. However,
2028                                            * do not permit the
2029                                            * container to ever create
2030                                            * these device nodes. */
2031                                           "/dev/pts/ptmx", "rw",
2032                                           "char-pts", "rw");
2033                 if (r < 0)
2034                         return log_error_errno(r, "Failed to add device whitelist: %m");
2035
2036                 STRV_FOREACH(i, arg_property) {
2037                         r = sd_bus_message_open_container(m, 'r', "sv");
2038                         if (r < 0)
2039                                 return bus_log_create_error(r);
2040
2041                         r = bus_append_unit_property_assignment(m, *i);
2042                         if (r < 0)
2043                                 return r;
2044
2045                         r = sd_bus_message_close_container(m);
2046                         if (r < 0)
2047                                 return bus_log_create_error(r);
2048                 }
2049
2050                 r = sd_bus_message_close_container(m);
2051                 if (r < 0)
2052                         return bus_log_create_error(r);
2053
2054                 r = sd_bus_call(bus, m, 0, &error, NULL);
2055         }
2056
2057         if (r < 0) {
2058                 log_error("Failed to register machine: %s", bus_error_message(&error, r));
2059                 return r;
2060         }
2061
2062         return 0;
2063 }
2064
2065 static int terminate_machine(pid_t pid) {
2066         _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
2067         _cleanup_bus_message_unref_ sd_bus_message *reply = NULL;
2068         _cleanup_bus_close_unref_ sd_bus *bus = NULL;
2069         const char *path;
2070         int r;
2071
2072         if (!arg_register)
2073                 return 0;
2074
2075         r = sd_bus_default_system(&bus);
2076         if (r < 0)
2077                 return log_error_errno(r, "Failed to open system bus: %m");
2078
2079         r = sd_bus_call_method(
2080                         bus,
2081                         "org.freedesktop.machine1",
2082                         "/org/freedesktop/machine1",
2083                         "org.freedesktop.machine1.Manager",
2084                         "GetMachineByPID",
2085                         &error,
2086                         &reply,
2087                         "u",
2088                         (uint32_t) pid);
2089         if (r < 0) {
2090                 /* Note that the machine might already have been
2091                  * cleaned up automatically, hence don't consider it a
2092                  * failure if we cannot get the machine object. */
2093                 log_debug("Failed to get machine: %s", bus_error_message(&error, r));
2094                 return 0;
2095         }
2096
2097         r = sd_bus_message_read(reply, "o", &path);
2098         if (r < 0)
2099                 return bus_log_parse_error(r);
2100
2101         r = sd_bus_call_method(
2102                         bus,
2103                         "org.freedesktop.machine1",
2104                         path,
2105                         "org.freedesktop.machine1.Machine",
2106                         "Terminate",
2107                         &error,
2108                         NULL,
2109                         NULL);
2110         if (r < 0) {
2111                 log_debug("Failed to terminate machine: %s", bus_error_message(&error, r));
2112                 return 0;
2113         }
2114
2115         return 0;
2116 }
2117
2118 static int reset_audit_loginuid(void) {
2119         _cleanup_free_ char *p = NULL;
2120         int r;
2121
2122         if (arg_share_system)
2123                 return 0;
2124
2125         r = read_one_line_file("/proc/self/loginuid", &p);
2126         if (r == -ENOENT)
2127                 return 0;
2128         if (r < 0)
2129                 return log_error_errno(r, "Failed to read /proc/self/loginuid: %m");
2130
2131         /* Already reset? */
2132         if (streq(p, "4294967295"))
2133                 return 0;
2134
2135         r = write_string_file("/proc/self/loginuid", "4294967295");
2136         if (r < 0) {
2137                 log_error("Failed to reset audit login UID. This probably means that your kernel is too\n"
2138                           "old and you have audit enabled. Note that the auditing subsystem is known to\n"
2139                           "be incompatible with containers on old kernels. Please make sure to upgrade\n"
2140                           "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
2141                           "using systemd-nspawn. Sleeping for 5s... (%s)\n", strerror(-r));
2142
2143                 sleep(5);
2144         }
2145
2146         return 0;
2147 }
2148
2149 #define HOST_HASH_KEY SD_ID128_MAKE(1a,37,6f,c7,46,ec,45,0b,ad,a3,d5,31,06,60,5d,b1)
2150 #define CONTAINER_HASH_KEY SD_ID128_MAKE(c3,c4,f9,19,b5,57,b2,1c,e6,cf,14,27,03,9c,ee,a2)
2151 #define MACVLAN_HASH_KEY SD_ID128_MAKE(00,13,6d,bc,66,83,44,81,bb,0c,f9,51,1f,24,a6,6f)
2152
2153 static int generate_mac(struct ether_addr *mac, sd_id128_t hash_key, uint64_t idx) {
2154         uint8_t result[8];
2155         size_t l, sz;
2156         uint8_t *v, *i;
2157         int r;
2158
2159         l = strlen(arg_machine);
2160         sz = sizeof(sd_id128_t) + l;
2161         if (idx > 0)
2162                 sz += sizeof(idx);
2163
2164         v = alloca(sz);
2165
2166         /* fetch some persistent data unique to the host */
2167         r = sd_id128_get_machine((sd_id128_t*) v);
2168         if (r < 0)
2169                 return r;
2170
2171         /* combine with some data unique (on this host) to this
2172          * container instance */
2173         i = mempcpy(v + sizeof(sd_id128_t), arg_machine, l);
2174         if (idx > 0) {
2175                 idx = htole64(idx);
2176                 memcpy(i, &idx, sizeof(idx));
2177         }
2178
2179         /* Let's hash the host machine ID plus the container name. We
2180          * use a fixed, but originally randomly created hash key here. */
2181         siphash24(result, v, sz, hash_key.bytes);
2182
2183         assert_cc(ETH_ALEN <= sizeof(result));
2184         memcpy(mac->ether_addr_octet, result, ETH_ALEN);
2185
2186         /* see eth_random_addr in the kernel */
2187         mac->ether_addr_octet[0] &= 0xfe;        /* clear multicast bit */
2188         mac->ether_addr_octet[0] |= 0x02;        /* set local assignment bit (IEEE802) */
2189
2190         return 0;
2191 }
2192
2193 static int setup_veth(pid_t pid, char iface_name[IFNAMSIZ], int *ifi) {
2194         _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
2195         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
2196         struct ether_addr mac_host, mac_container;
2197         int r, i;
2198
2199         if (!arg_private_network)
2200                 return 0;
2201
2202         if (!arg_network_veth)
2203                 return 0;
2204
2205         /* Use two different interface name prefixes depending whether
2206          * we are in bridge mode or not. */
2207         snprintf(iface_name, IFNAMSIZ - 1, "%s-%s",
2208                  arg_network_bridge ? "vb" : "ve", arg_machine);
2209
2210         r = generate_mac(&mac_container, CONTAINER_HASH_KEY, 0);
2211         if (r < 0)
2212                 return log_error_errno(r, "Failed to generate predictable MAC address for container side: %m");
2213
2214         r = generate_mac(&mac_host, HOST_HASH_KEY, 0);
2215         if (r < 0)
2216                 return log_error_errno(r, "Failed to generate predictable MAC address for host side: %m");
2217
2218         r = sd_rtnl_open(&rtnl, 0);
2219         if (r < 0)
2220                 return log_error_errno(r, "Failed to connect to netlink: %m");
2221
2222         r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
2223         if (r < 0)
2224                 return log_error_errno(r, "Failed to allocate netlink message: %m");
2225
2226         r = sd_rtnl_message_append_string(m, IFLA_IFNAME, iface_name);
2227         if (r < 0)
2228                 return log_error_errno(r, "Failed to add netlink interface name: %m");
2229
2230         r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac_host);
2231         if (r < 0)
2232                 return log_error_errno(r, "Failed to add netlink MAC address: %m");
2233
2234         r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
2235         if (r < 0)
2236                 return log_error_errno(r, "Failed to open netlink container: %m");
2237
2238         r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "veth");
2239         if (r < 0)
2240                 return log_error_errno(r, "Failed to open netlink container: %m");
2241
2242         r = sd_rtnl_message_open_container(m, VETH_INFO_PEER);
2243         if (r < 0)
2244                 return log_error_errno(r, "Failed to open netlink container: %m");
2245
2246         r = sd_rtnl_message_append_string(m, IFLA_IFNAME, "host0");
2247         if (r < 0)
2248                 return log_error_errno(r, "Failed to add netlink interface name: %m");
2249
2250         r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac_container);
2251         if (r < 0)
2252                 return log_error_errno(r, "Failed to add netlink MAC address: %m");
2253
2254         r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
2255         if (r < 0)
2256                 return log_error_errno(r, "Failed to add netlink namespace field: %m");
2257
2258         r = sd_rtnl_message_close_container(m);
2259         if (r < 0)
2260                 return log_error_errno(r, "Failed to close netlink container: %m");
2261
2262         r = sd_rtnl_message_close_container(m);
2263         if (r < 0)
2264                 return log_error_errno(r, "Failed to close netlink container: %m");
2265
2266         r = sd_rtnl_message_close_container(m);
2267         if (r < 0)
2268                 return log_error_errno(r, "Failed to close netlink container: %m");
2269
2270         r = sd_rtnl_call(rtnl, m, 0, NULL);
2271         if (r < 0)
2272                 return log_error_errno(r, "Failed to add new veth interfaces: %m");
2273
2274         i = (int) if_nametoindex(iface_name);
2275         if (i <= 0)
2276                 return log_error_errno(errno, "Failed to resolve interface %s: %m", iface_name);
2277
2278         *ifi = i;
2279
2280         return 0;
2281 }
2282
2283 static int setup_bridge(const char veth_name[], int *ifi) {
2284         _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
2285         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
2286         int r, bridge;
2287
2288         if (!arg_private_network)
2289                 return 0;
2290
2291         if (!arg_network_veth)
2292                 return 0;
2293
2294         if (!arg_network_bridge)
2295                 return 0;
2296
2297         bridge = (int) if_nametoindex(arg_network_bridge);
2298         if (bridge <= 0)
2299                 return log_error_errno(errno, "Failed to resolve interface %s: %m", arg_network_bridge);
2300
2301         *ifi = bridge;
2302
2303         r = sd_rtnl_open(&rtnl, 0);
2304         if (r < 0)
2305                 return log_error_errno(r, "Failed to connect to netlink: %m");
2306
2307         r = sd_rtnl_message_new_link(rtnl, &m, RTM_SETLINK, 0);
2308         if (r < 0)
2309                 return log_error_errno(r, "Failed to allocate netlink message: %m");
2310
2311         r = sd_rtnl_message_link_set_flags(m, IFF_UP, IFF_UP);
2312         if (r < 0)
2313                 return log_error_errno(r, "Failed to set IFF_UP flag: %m");
2314
2315         r = sd_rtnl_message_append_string(m, IFLA_IFNAME, veth_name);
2316         if (r < 0)
2317                 return log_error_errno(r, "Failed to add netlink interface name field: %m");
2318
2319         r = sd_rtnl_message_append_u32(m, IFLA_MASTER, bridge);
2320         if (r < 0)
2321                 return log_error_errno(r, "Failed to add netlink master field: %m");
2322
2323         r = sd_rtnl_call(rtnl, m, 0, NULL);
2324         if (r < 0)
2325                 return log_error_errno(r, "Failed to add veth interface to bridge: %m");
2326
2327         return 0;
2328 }
2329
2330 static int parse_interface(struct udev *udev, const char *name) {
2331         _cleanup_udev_device_unref_ struct udev_device *d = NULL;
2332         char ifi_str[2 + DECIMAL_STR_MAX(int)];
2333         int ifi;
2334
2335         ifi = (int) if_nametoindex(name);
2336         if (ifi <= 0)
2337                 return log_error_errno(errno, "Failed to resolve interface %s: %m", name);
2338
2339         sprintf(ifi_str, "n%i", ifi);
2340         d = udev_device_new_from_device_id(udev, ifi_str);
2341         if (!d)
2342                 return log_error_errno(errno, "Failed to get udev device for interface %s: %m", name);
2343
2344         if (udev_device_get_is_initialized(d) <= 0) {
2345                 log_error("Network interface %s is not initialized yet.", name);
2346                 return -EBUSY;
2347         }
2348
2349         return ifi;
2350 }
2351
2352 static int move_network_interfaces(pid_t pid) {
2353         _cleanup_udev_unref_ struct udev *udev = NULL;
2354         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
2355         char **i;
2356         int r;
2357
2358         if (!arg_private_network)
2359                 return 0;
2360
2361         if (strv_isempty(arg_network_interfaces))
2362                 return 0;
2363
2364         r = sd_rtnl_open(&rtnl, 0);
2365         if (r < 0)
2366                 return log_error_errno(r, "Failed to connect to netlink: %m");
2367
2368         udev = udev_new();
2369         if (!udev) {
2370                 log_error("Failed to connect to udev.");
2371                 return -ENOMEM;
2372         }
2373
2374         STRV_FOREACH(i, arg_network_interfaces) {
2375                 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
2376                 int ifi;
2377
2378                 ifi = parse_interface(udev, *i);
2379                 if (ifi < 0)
2380                         return ifi;
2381
2382                 r = sd_rtnl_message_new_link(rtnl, &m, RTM_SETLINK, ifi);
2383                 if (r < 0)
2384                         return log_error_errno(r, "Failed to allocate netlink message: %m");
2385
2386                 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
2387                 if (r < 0)
2388                         return log_error_errno(r, "Failed to append namespace PID to netlink message: %m");
2389
2390                 r = sd_rtnl_call(rtnl, m, 0, NULL);
2391                 if (r < 0)
2392                         return log_error_errno(r, "Failed to move interface %s to namespace: %m", *i);
2393         }
2394
2395         return 0;
2396 }
2397
2398 static int setup_macvlan(pid_t pid) {
2399         _cleanup_udev_unref_ struct udev *udev = NULL;
2400         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
2401         unsigned idx = 0;
2402         char **i;
2403         int r;
2404
2405         if (!arg_private_network)
2406                 return 0;
2407
2408         if (strv_isempty(arg_network_macvlan))
2409                 return 0;
2410
2411         r = sd_rtnl_open(&rtnl, 0);
2412         if (r < 0)
2413                 return log_error_errno(r, "Failed to connect to netlink: %m");
2414
2415         udev = udev_new();
2416         if (!udev) {
2417                 log_error("Failed to connect to udev.");
2418                 return -ENOMEM;
2419         }
2420
2421         STRV_FOREACH(i, arg_network_macvlan) {
2422                 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
2423                 _cleanup_free_ char *n = NULL;
2424                 struct ether_addr mac;
2425                 int ifi;
2426
2427                 ifi = parse_interface(udev, *i);
2428                 if (ifi < 0)
2429                         return ifi;
2430
2431                 r = generate_mac(&mac, MACVLAN_HASH_KEY, idx++);
2432                 if (r < 0)
2433                         return log_error_errno(r, "Failed to create MACVLAN MAC address: %m");
2434
2435                 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
2436                 if (r < 0)
2437                         return log_error_errno(r, "Failed to allocate netlink message: %m");
2438
2439                 r = sd_rtnl_message_append_u32(m, IFLA_LINK, ifi);
2440                 if (r < 0)
2441                         return log_error_errno(r, "Failed to add netlink interface index: %m");
2442
2443                 n = strappend("mv-", *i);
2444                 if (!n)
2445                         return log_oom();
2446
2447                 strshorten(n, IFNAMSIZ-1);
2448
2449                 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, n);
2450                 if (r < 0)
2451                         return log_error_errno(r, "Failed to add netlink interface name: %m");
2452
2453                 r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac);
2454                 if (r < 0)
2455                         return log_error_errno(r, "Failed to add netlink MAC address: %m");
2456
2457                 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
2458                 if (r < 0)
2459                         return log_error_errno(r, "Failed to add netlink namespace field: %m");
2460
2461                 r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
2462                 if (r < 0)
2463                         return log_error_errno(r, "Failed to open netlink container: %m");
2464
2465                 r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "macvlan");
2466                 if (r < 0)
2467                         return log_error_errno(r, "Failed to open netlink container: %m");
2468
2469                 r = sd_rtnl_message_append_u32(m, IFLA_MACVLAN_MODE, MACVLAN_MODE_BRIDGE);
2470                 if (r < 0)
2471                         return log_error_errno(r, "Failed to append macvlan mode: %m");
2472
2473                 r = sd_rtnl_message_close_container(m);
2474                 if (r < 0)
2475                         return log_error_errno(r, "Failed to close netlink container: %m");
2476
2477                 r = sd_rtnl_message_close_container(m);
2478                 if (r < 0)
2479                         return log_error_errno(r, "Failed to close netlink container: %m");
2480
2481                 r = sd_rtnl_call(rtnl, m, 0, NULL);
2482                 if (r < 0)
2483                         return log_error_errno(r, "Failed to add new macvlan interfaces: %m");
2484         }
2485
2486         return 0;
2487 }
2488
2489 static int setup_ipvlan(pid_t pid) {
2490         _cleanup_udev_unref_ struct udev *udev = NULL;
2491         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
2492         char **i;
2493         int r;
2494
2495         if (!arg_private_network)
2496                 return 0;
2497
2498         if (strv_isempty(arg_network_ipvlan))
2499                 return 0;
2500
2501         r = sd_rtnl_open(&rtnl, 0);
2502         if (r < 0)
2503                 return log_error_errno(r, "Failed to connect to netlink: %m");
2504
2505         udev = udev_new();
2506         if (!udev) {
2507                 log_error("Failed to connect to udev.");
2508                 return -ENOMEM;
2509         }
2510
2511         STRV_FOREACH(i, arg_network_ipvlan) {
2512                 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
2513                 _cleanup_free_ char *n = NULL;
2514                 int ifi;
2515
2516                 ifi = parse_interface(udev, *i);
2517                 if (ifi < 0)
2518                         return ifi;
2519
2520                 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
2521                 if (r < 0)
2522                         return log_error_errno(r, "Failed to allocate netlink message: %m");
2523
2524                 r = sd_rtnl_message_append_u32(m, IFLA_LINK, ifi);
2525                 if (r < 0)
2526                         return log_error_errno(r, "Failed to add netlink interface index: %m");
2527
2528                 n = strappend("iv-", *i);
2529                 if (!n)
2530                         return log_oom();
2531
2532                 strshorten(n, IFNAMSIZ-1);
2533
2534                 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, n);
2535                 if (r < 0)
2536                         return log_error_errno(r, "Failed to add netlink interface name: %m");
2537
2538                 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
2539                 if (r < 0)
2540                         return log_error_errno(r, "Failed to add netlink namespace field: %m");
2541
2542                 r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
2543                 if (r < 0)
2544                         return log_error_errno(r, "Failed to open netlink container: %m");
2545
2546                 r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "ipvlan");
2547                 if (r < 0)
2548                         return log_error_errno(r, "Failed to open netlink container: %m");
2549
2550                 r = sd_rtnl_message_append_u16(m, IFLA_IPVLAN_MODE, IPVLAN_MODE_L2);
2551                 if (r < 0)
2552                         return log_error_errno(r, "Failed to add ipvlan mode: %m");
2553
2554                 r = sd_rtnl_message_close_container(m);
2555                 if (r < 0)
2556                         return log_error_errno(r, "Failed to close netlink container: %m");
2557
2558                 r = sd_rtnl_message_close_container(m);
2559                 if (r < 0)
2560                         return log_error_errno(r, "Failed to close netlink container: %m");
2561
2562                 r = sd_rtnl_call(rtnl, m, 0, NULL);
2563                 if (r < 0)
2564                         return log_error_errno(r, "Failed to add new ipvlan interfaces: %m");
2565         }
2566
2567         return 0;
2568 }
2569
/* Builds and loads a seccomp filter that allows everything by default,
 * makes a fixed set of syscalls fail with EPERM unless the capability
 * paired with them is set in arg_retain, and makes
 * socket(AF_NETLINK, *, NETLINK_AUDIT) fail with EAFNOSUPPORT.
 * Returns the result of the last libseccomp call (negative on
 * failure). Without HAVE_SECCOMP this is a no-op returning 0. */
static int setup_seccomp(void) {

#ifdef HAVE_SECCOMP
        /* Syscalls to refuse, each paired with the capability that
         * exempts it from the filter when retained */
        static const struct {
                uint64_t capability;
                int syscall_num;
        } blocked[] = {
                { CAP_SYS_RAWIO,  SCMP_SYS(iopl)},
                { CAP_SYS_RAWIO,  SCMP_SYS(ioperm)},
                { CAP_SYS_BOOT,   SCMP_SYS(kexec_load)},
                { CAP_SYS_ADMIN,  SCMP_SYS(swapon)},
                { CAP_SYS_ADMIN,  SCMP_SYS(swapoff)},
                { CAP_SYS_ADMIN,  SCMP_SYS(open_by_handle_at)},
                { CAP_SYS_MODULE, SCMP_SYS(init_module)},
                { CAP_SYS_MODULE, SCMP_SYS(finit_module)},
                { CAP_SYS_MODULE, SCMP_SYS(delete_module)},
        };

        scmp_filter_ctx ctx;
        unsigned idx;
        int rc;

        ctx = seccomp_init(SCMP_ACT_ALLOW);
        if (!ctx)
                return log_oom();

        rc = seccomp_add_secondary_archs(ctx);
        if (rc < 0) {
                log_error_errno(rc, "Failed to add secondary archs to seccomp filter: %m");
                goto finish;
        }

        for (idx = 0; idx < ELEMENTSOF(blocked); idx++) {
                /* Capability retained: leave the syscall alone */
                if (arg_retain & (1ULL << blocked[idx].capability))
                        continue;

                rc = seccomp_rule_add(ctx, SCMP_ACT_ERRNO(EPERM), blocked[idx].syscall_num, 0);
                if (rc < 0 && rc != -EFAULT) {
                        log_error_errno(rc, "Failed to block syscall: %m");
                        goto finish;
                }

                /* -EFAULT is treated as "unknown syscall" and skipped */
        }

        /* Audit does not work inside containers, and much of the
         * userspace audit hookup fails there. Simply refuse the
         * creation of audit sockets: socket(AF_NETLINK, *,
         * NETLINK_AUDIT) then fails with EAFNOSUPPORT, which audit
         * userspace takes as "audit is disabled in the kernel". */
        rc = seccomp_rule_add(
                        ctx,
                        SCMP_ACT_ERRNO(EAFNOSUPPORT),
                        SCMP_SYS(socket),
                        2,
                        SCMP_A0(SCMP_CMP_EQ, AF_NETLINK),
                        SCMP_A2(SCMP_CMP_EQ, NETLINK_AUDIT));
        if (rc < 0) {
                log_error_errno(rc, "Failed to add audit seccomp rule: %m");
                goto finish;
        }

        rc = seccomp_attr_set(ctx, SCMP_FLTATR_CTL_NNP, 0);
        if (rc < 0) {
                log_error_errno(rc, "Failed to unset NO_NEW_PRIVS: %m");
                goto finish;
        }

        rc = seccomp_load(ctx);
        if (rc < 0)
                log_error_errno(rc, "Failed to install seccomp audit filter: %m");

finish:
        /* The filter context is released on every path */
        seccomp_release(ctx);
        return rc;
#else
        return 0;
#endif

}
2656
2657 static int setup_propagate(const char *root) {
2658         const char *p, *q;
2659
2660         (void) mkdir_p("/run/systemd/nspawn/", 0755);
2661         (void) mkdir_p("/run/systemd/nspawn/propagate", 0600);
2662         p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
2663         (void) mkdir_p(p, 0600);
2664
2665         q = strjoina(root, "/run/systemd/nspawn/incoming");
2666         mkdir_parents(q, 0755);
2667         mkdir_p(q, 0600);
2668
2669         if (mount(p, q, NULL, MS_BIND, NULL) < 0)
2670                 return log_error_errno(errno, "Failed to install propagation bind mount.");
2671
2672         if (mount(NULL, q, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY, NULL) < 0)
2673                 return log_error_errno(errno, "Failed to make propagation mount read-only");
2674
2675         return 0;
2676 }
2677
2678 static int setup_image(char **device_path, int *loop_nr) {
2679         struct loop_info64 info = {
2680                 .lo_flags = LO_FLAGS_AUTOCLEAR|LO_FLAGS_PARTSCAN
2681         };
2682         _cleanup_close_ int fd = -1, control = -1, loop = -1;
2683         _cleanup_free_ char* loopdev = NULL;
2684         struct stat st;
2685         int r, nr;
2686
2687         assert(device_path);
2688         assert(loop_nr);
2689         assert(arg_image);
2690
2691         fd = open(arg_image, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
2692         if (fd < 0)
2693                 return log_error_errno(errno, "Failed to open %s: %m", arg_image);
2694
2695         if (fstat(fd, &st) < 0)
2696                 return log_error_errno(errno, "Failed to stat %s: %m", arg_image);
2697
2698         if (S_ISBLK(st.st_mode)) {
2699                 char *p;
2700
2701                 p = strdup(arg_image);
2702                 if (!p)
2703                         return log_oom();
2704
2705                 *device_path = p;
2706
2707                 *loop_nr = -1;
2708
2709                 r = fd;
2710                 fd = -1;
2711
2712                 return r;
2713         }
2714
2715         if (!S_ISREG(st.st_mode)) {
2716                 log_error_errno(errno, "%s is not a regular file or block device: %m", arg_image);
2717                 return -EINVAL;
2718         }
2719
2720         control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
2721         if (control < 0)
2722                 return log_error_errno(errno, "Failed to open /dev/loop-control: %m");
2723
2724         nr = ioctl(control, LOOP_CTL_GET_FREE);
2725         if (nr < 0)
2726                 return log_error_errno(errno, "Failed to allocate loop device: %m");
2727
2728         if (asprintf(&loopdev, "/dev/loop%i", nr) < 0)
2729                 return log_oom();
2730
2731         loop = open(loopdev, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
2732         if (loop < 0)
2733                 return log_error_errno(errno, "Failed to open loop device %s: %m", loopdev);
2734
2735         if (ioctl(loop, LOOP_SET_FD, fd) < 0)
2736                 return log_error_errno(errno, "Failed to set loopback file descriptor on %s: %m", loopdev);
2737
2738         if (arg_read_only)
2739                 info.lo_flags |= LO_FLAGS_READ_ONLY;
2740
2741         if (ioctl(loop, LOOP_SET_STATUS64, &info) < 0)
2742                 return log_error_errno(errno, "Failed to set loopback settings on %s: %m", loopdev);
2743
2744         *device_path = loopdev;
2745         loopdev = NULL;
2746
2747         *loop_nr = nr;
2748
2749         r = loop;
2750         loop = -1;
2751
2752         return r;
2753 }
2754
/* Explanatory text describing the partition layouts that are accepted
 * for disk images. It is concatenated onto the log_error() format
 * strings used by dissect_image() when an image cannot be used. */
#define PARTITION_TABLE_BLURB \
        "Note that the disk image needs to either contain only a single MBR partition of\n" \
        "type 0x83 that is marked bootable, or a single GPT partition of type " \
        "0FC63DAF-8483-4772-8E79-3D69D8477DE4 or follow\n" \
        "    http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/\n" \
        "to be bootable with systemd-nspawn."
2761
/* Probes the partition table of the block device open on fd and picks
 * the partitions to use.
 *
 * On GPT images, partitions are matched by type UUID (GPT_HOME,
 * GPT_SRV, GPT_ROOT_NATIVE, GPT_ROOT_SECONDARY, GPT_LINUX_GENERIC);
 * partitions flagged GPT_FLAG_NO_AUTO are skipped, and among several
 * partitions of the same specific type the one with the lowest
 * partition number wins. On MBR images, only partitions of type 0x83
 * whose flags equal 0x80 (bootable) are considered, as generic
 * candidates.
 *
 * The root device is chosen in this order: native root, secondary
 * root, a generic Linux partition (only if exactly one was found).
 *
 * On success returns 0 and stores newly allocated device node paths
 * in *root_device (always) and in *home_device / *srv_device (only if
 * such a partition was found), together with the matching *_rw flags;
 * *secondary tells whether the secondary root partition was picked.
 * Returns a negative errno-style value on failure. Without HAVE_BLKID
 * the function always fails with -EOPNOTSUPP. */
static int dissect_image(
                int fd,
                char **root_device, bool *root_device_rw,
                char **home_device, bool *home_device_rw,
                char **srv_device, bool *srv_device_rw,
                bool *secondary) {

#ifdef HAVE_BLKID
        int home_nr = -1, srv_nr = -1;
#ifdef GPT_ROOT_NATIVE
        int root_nr = -1;
#endif
#ifdef GPT_ROOT_SECONDARY
        int secondary_root_nr = -1;
#endif
        _cleanup_free_ char *home = NULL, *root = NULL, *secondary_root = NULL, *srv = NULL, *generic = NULL;
        _cleanup_udev_enumerate_unref_ struct udev_enumerate *e = NULL;
        _cleanup_udev_device_unref_ struct udev_device *d = NULL;
        _cleanup_blkid_free_probe_ blkid_probe b = NULL;
        _cleanup_udev_unref_ struct udev *udev = NULL;
        struct udev_list_entry *first, *item;
        bool home_rw = true, root_rw = true, secondary_root_rw = true, srv_rw = true, generic_rw = true;
        bool is_gpt, is_mbr, multiple_generic = false;
        const char *pttype = NULL;
        blkid_partlist pl;
        struct stat st;
        unsigned i;
        int r;

        assert(fd >= 0);
        assert(root_device);
        assert(home_device);
        assert(srv_device);
        assert(secondary);
        assert(arg_image);

        b = blkid_new_probe();
        if (!b)
                return log_oom();

        /* errno is cleared before each blkid call so that a failure
         * which leaves errno untouched can be told apart afterwards */
        errno = 0;
        r = blkid_probe_set_device(b, fd, 0, 0);
        if (r != 0) {
                if (errno == 0)
                        return log_oom();

                log_error_errno(errno, "Failed to set device on blkid probe: %m");
                return -errno;
        }

        blkid_probe_enable_partitions(b, 1);
        blkid_probe_set_partitions_flags(b, BLKID_PARTS_ENTRY_DETAILS);

        errno = 0;
        r = blkid_do_safeprobe(b);
        if (r == -2 || r == 1) {
                log_error("Failed to identify any partition table on\n"
                          "    %s\n"
                          PARTITION_TABLE_BLURB, arg_image);
                return -EINVAL;
        } else if (r != 0) {
                if (errno == 0)
                        errno = EIO;
                log_error_errno(errno, "Failed to probe: %m");
                return -errno;
        }

        (void) blkid_probe_lookup_value(b, "PTTYPE", &pttype, NULL);

        is_gpt = streq_ptr(pttype, "gpt");
        is_mbr = streq_ptr(pttype, "dos");

        if (!is_gpt && !is_mbr) {
                log_error("No GPT or MBR partition table discovered on\n"
                          "    %s\n"
                          PARTITION_TABLE_BLURB, arg_image);
                return -EINVAL;
        }

        errno = 0;
        pl = blkid_probe_get_partitions(b);
        if (!pl) {
                if (errno == 0)
                        return log_oom();

                log_error("Failed to list partitions of %s", arg_image);
                return -errno;
        }

        udev = udev_new();
        if (!udev)
                return log_oom();

        if (fstat(fd, &st) < 0)
                return log_error_errno(errno, "Failed to stat block device: %m");

        d = udev_device_new_from_devnum(udev, 'b', st.st_rdev);
        if (!d)
                return log_oom();

        /* Wait (up to 10 rounds) until the kernel exposes as many
         * partition devices as blkid found in the partition table */
        for (i = 0;; i++) {
                int n, m;

                if (i >= 10) {
                        log_error("Kernel partitions never appeared.");
                        return -ENXIO;
                }

                e = udev_enumerate_new(udev);
                if (!e)
                        return log_oom();

                r = udev_enumerate_add_match_parent(e, d);
                if (r < 0)
                        return log_oom();

                r = udev_enumerate_scan_devices(e);
                if (r < 0)
                        return log_error_errno(r, "Failed to scan for partition devices of %s: %m", arg_image);

                /* Count the partitions enumerated by the kernel */
                n = 0;
                first = udev_enumerate_get_list_entry(e);
                udev_list_entry_foreach(item, first)
                        n++;

                /* Count the partitions enumerated by blkid. The loop
                 * ends once the udev count equals this plus one. */
                m = blkid_partlist_numof_partitions(pl);
                if (n == m + 1)
                        break;
                if (n > m + 1) {
                        log_error("blkid and kernel partition list do not match.");
                        return -EIO;
                }
                if (n < m + 1) {
                        unsigned j;

                        /* The kernel has probed fewer partitions than
                         * blkid? Maybe the kernel prober is still
                         * running or it got EBUSY because udev
                         * already opened the device. Let's reprobe
                         * the device, which is a synchronous call
                         * that waits until probing is complete. */

                        for (j = 0; j < 20; j++) {

                                r = ioctl(fd, BLKRRPART, 0);
                                if (r < 0)
                                        r = -errno;
                                if (r >= 0 || r != -EBUSY)
                                        break;

                                /* If something else has the device
                                 * open, such as an udev rule, the
                                 * ioctl will return EBUSY. Since
                                 * there's no way to wait until it
                                 * isn't busy anymore, let's just wait
                                 * a bit, and try again.
                                 *
                                 * This is really something they
                                 * should fix in the kernel! */

                                usleep(50 * USEC_PER_MSEC);
                        }

                        if (r < 0)
                                return log_error_errno(r, "Failed to reread partition table: %m");
                }

                e = udev_enumerate_unref(e);
        }

        /* Classify each partition device found by udev */
        first = udev_enumerate_get_list_entry(e);
        udev_list_entry_foreach(item, first) {
                _cleanup_udev_device_unref_ struct udev_device *q;
                const char *node;
                unsigned long long flags;
                blkid_partition pp;
                dev_t qn;
                int nr;

                errno = 0;
                q = udev_device_new_from_syspath(udev, udev_list_entry_get_name(item));
                if (!q) {
                        if (!errno)
                                errno = ENOMEM;

                        log_error_errno(errno, "Failed to get partition device of %s: %m", arg_image);
                        return -errno;
                }

                qn = udev_device_get_devnum(q);
                if (major(qn) == 0)
                        continue;

                /* Skip the whole-disk device itself */
                if (st.st_rdev == qn)
                        continue;

                node = udev_device_get_devnode(q);
                if (!node)
                        continue;

                pp = blkid_partlist_devno_to_partition(pl, qn);
                if (!pp)
                        continue;

                flags = blkid_partition_get_flags(pp);

                nr = blkid_partition_get_partno(pp);
                if (nr < 0)
                        continue;

                if (is_gpt) {
                        sd_id128_t type_id;
                        const char *stype;

                        if (flags & GPT_FLAG_NO_AUTO)
                                continue;

                        stype = blkid_partition_get_type_string(pp);
                        if (!stype)
                                continue;

                        if (sd_id128_from_string(stype, &type_id) < 0)
                                continue;

                        if (sd_id128_equal(type_id, GPT_HOME)) {

                                /* Prefer the lowest-numbered partition of each type */
                                if (home && nr >= home_nr)
                                        continue;

                                home_nr = nr;
                                home_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&home, node);
                                if (r < 0)
                                        return log_oom();

                        } else if (sd_id128_equal(type_id, GPT_SRV)) {

                                if (srv && nr >= srv_nr)
                                        continue;

                                srv_nr = nr;
                                srv_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&srv, node);
                                if (r < 0)
                                        return log_oom();
                        }
#ifdef GPT_ROOT_NATIVE
                        else if (sd_id128_equal(type_id, GPT_ROOT_NATIVE)) {

                                if (root && nr >= root_nr)
                                        continue;

                                root_nr = nr;
                                root_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&root, node);
                                if (r < 0)
                                        return log_oom();
                        }
#endif
#ifdef GPT_ROOT_SECONDARY
                        else if (sd_id128_equal(type_id, GPT_ROOT_SECONDARY)) {

                                if (secondary_root && nr >= secondary_root_nr)
                                        continue;

                                secondary_root_nr = nr;
                                secondary_root_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&secondary_root, node);
                                if (r < 0)
                                        return log_oom();
                        }
#endif
                        else if (sd_id128_equal(type_id, GPT_LINUX_GENERIC)) {

                                if (generic)
                                        multiple_generic = true;
                                else {
                                        generic_rw = !(flags & GPT_FLAG_READ_ONLY);

                                        r = free_and_strdup(&generic, node);
                                        if (r < 0)
                                                return log_oom();
                                }
                        }

                } else if (is_mbr) {
                        int type;

                        if (flags != 0x80) /* Bootable flag */
                                continue;

                        type = blkid_partition_get_type(pp);
                        if (type != 0x83) /* Linux partition */
                                continue;

                        if (generic)
                                multiple_generic = true;
                        else {
                                generic_rw = true;

                                /* Record the candidate in "generic" (not
                                 * "root"), so that a second bootable Linux
                                 * partition is noticed by the check above
                                 * and rejected as ambiguous further down. */
                                r = free_and_strdup(&generic, node);
                                if (r < 0)
                                        return log_oom();
                        }
                }
        }

        if (root) {
                *root_device = root;
                root = NULL;

                *root_device_rw = root_rw;
                *secondary = false;
        } else if (secondary_root) {
                *root_device = secondary_root;
                secondary_root = NULL;

                *root_device_rw = secondary_root_rw;
                *secondary = true;
        } else if (generic) {

                /* There were no partitions with precise meanings
                 * around, but we found generic partitions. In this
                 * case, if there's only one, we can go ahead and boot
                 * it, otherwise we bail out, because we really cannot
                 * make any sense of it. */

                if (multiple_generic) {
                        log_error("Identified multiple bootable Linux partitions on\n"
                                  "    %s\n"
                                  PARTITION_TABLE_BLURB, arg_image);
                        return -EINVAL;
                }

                *root_device = generic;
                generic = NULL;

                *root_device_rw = generic_rw;
                *secondary = false;
        } else {
                log_error("Failed to identify root partition in disk image\n"
                          "    %s\n"
                          PARTITION_TABLE_BLURB, arg_image);
                return -EINVAL;
        }

        /* Ownership of the strings moves to the caller; clearing the
         * locals keeps the cleanup handlers from freeing them */
        if (home) {
                *home_device = home;
                home = NULL;

                *home_device_rw = home_rw;
        }

        if (srv) {
                *srv_device = srv;
                srv = NULL;

                *srv_device_rw = srv_rw;
        }

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -EOPNOTSUPP;
#endif
}
3134
/* Probes the file system type of block device 'what' with libblkid and
 * mounts it on 'where' (or on 'where' + 'directory' if 'directory' is
 * non-NULL).
 *
 * The mount is read-only if 'rw' is false or if arg_read_only is set.
 * LUKS containers are refused.
 *
 * Returns 0 on success, a negative errno-style value on failure. When
 * built without blkid support, always fails with -EOPNOTSUPP. */
static int mount_device(const char *what, const char *where, const char *directory, bool rw) {
#ifdef HAVE_BLKID
        _cleanup_blkid_free_probe_ blkid_probe b = NULL;
        const char *fstype, *p;
        int r;

        assert(what);
        assert(where);

        /* A global read-only request overrides the per-partition flag */
        if (arg_read_only)
                rw = false;

        if (directory)
                p = strjoina(where, directory);
        else
                p = where;

        /* libblkid does not reliably set errno, hence reset it first so
         * that we can tell "no errno" apart from a real error code */
        errno = 0;
        b = blkid_new_probe_from_filename(what);
        if (!b) {
                if (errno == 0)
                        return log_oom();

                return log_error_errno(errno, "Failed to allocate prober for %s: %m", what);
        }

        blkid_probe_enable_superblocks(b, 1);
        blkid_probe_set_superblocks_flags(b, BLKID_SUBLKS_TYPE);

        errno = 0;
        r = blkid_do_safeprobe(b);
        if (r == -2 || r == 1) {
                /* 1: nothing detected, -2: ambivalent result. Neither
                 * is an I/O error, we simply cannot tell the type. */
                log_error("Cannot determine file system type of %s", what);
                return -EINVAL;
        } else if (r != 0) {
                /* -1 (or anything else unexpected): actual probing error */
                if (errno == 0)
                        errno = EIO;

                return log_error_errno(errno, "Failed to probe %s: %m", what);
        }

        errno = 0;
        if (blkid_probe_lookup_value(b, "TYPE", &fstype, NULL) < 0) {
                if (errno == 0)
                        errno = EINVAL;
                log_error("Failed to determine file system type of %s", what);
                return -errno;
        }

        if (streq(fstype, "crypto_LUKS")) {
                log_error("nspawn currently does not support LUKS disk images.");
                return -EOPNOTSUPP;
        }

        if (mount(what, p, fstype, MS_NODEV|(rw ? 0 : MS_RDONLY), NULL) < 0)
                return log_error_errno(errno, "Failed to mount %s: %m", what);

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -EOPNOTSUPP;
#endif
}
3198
3199 static int mount_devices(
3200                 const char *where,
3201                 const char *root_device, bool root_device_rw,
3202                 const char *home_device, bool home_device_rw,
3203                 const char *srv_device, bool srv_device_rw) {
3204         int r;
3205
3206         assert(where);
3207
3208         if (root_device) {
3209                 r = mount_device(root_device, arg_directory, NULL, root_device_rw);
3210                 if (r < 0)
3211                         return log_error_errno(r, "Failed to mount root directory: %m");
3212         }
3213
3214         if (home_device) {
3215                 r = mount_device(home_device, arg_directory, "/home", home_device_rw);
3216                 if (r < 0)
3217                         return log_error_errno(r, "Failed to mount home directory: %m");
3218         }
3219
3220         if (srv_device) {
3221                 r = mount_device(srv_device, arg_directory, "/srv", srv_device_rw);
3222                 if (r < 0)
3223                         return log_error_errno(r, "Failed to mount server data directory: %m");
3224         }
3225
3226         return 0;
3227 }
3228
3229 static void loop_remove(int nr, int *image_fd) {
3230         _cleanup_close_ int control = -1;
3231         int r;
3232
3233         if (nr < 0)
3234                 return;
3235
3236         if (image_fd && *image_fd >= 0) {
3237                 r = ioctl(*image_fd, LOOP_CLR_FD);
3238                 if (r < 0)
3239                         log_debug_errno(errno, "Failed to close loop image: %m");
3240                 *image_fd = safe_close(*image_fd);
3241         }
3242
3243         control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
3244         if (control < 0) {
3245                 log_warning_errno(errno, "Failed to open /dev/loop-control: %m");
3246                 return;
3247         }
3248
3249         r = ioctl(control, LOOP_CTL_REMOVE, nr);
3250         if (r < 0)
3251                 log_debug_errno(errno, "Failed to remove loop %d: %m", nr);
3252 }
3253
3254 static int spawn_getent(const char *database, const char *key, pid_t *rpid) {
3255         int pipe_fds[2];
3256         pid_t pid;
3257
3258         assert(database);
3259         assert(key);
3260         assert(rpid);
3261
3262         if (pipe2(pipe_fds, O_CLOEXEC) < 0)
3263                 return log_error_errno(errno, "Failed to allocate pipe: %m");
3264
3265         pid = fork();
3266         if (pid < 0)
3267                 return log_error_errno(errno, "Failed to fork getent child: %m");
3268         else if (pid == 0) {
3269                 int nullfd;
3270                 char *empty_env = NULL;
3271
3272                 if (dup3(pipe_fds[1], STDOUT_FILENO, 0) < 0)
3273                         _exit(EXIT_FAILURE);
3274
3275                 if (pipe_fds[0] > 2)
3276                         safe_close(pipe_fds[0]);
3277                 if (pipe_fds[1] > 2)
3278                         safe_close(pipe_fds[1]);
3279
3280                 nullfd = open("/dev/null", O_RDWR);
3281                 if (nullfd < 0)
3282                         _exit(EXIT_FAILURE);
3283
3284                 if (dup3(nullfd, STDIN_FILENO, 0) < 0)
3285                         _exit(EXIT_FAILURE);
3286
3287                 if (dup3(nullfd, STDERR_FILENO, 0) < 0)
3288                         _exit(EXIT_FAILURE);
3289
3290                 if (nullfd > 2)
3291                         safe_close(nullfd);
3292
3293                 reset_all_signal_handlers();
3294                 close_all_fds(NULL, 0);
3295
3296                 execle("/usr/bin/getent", "getent", database, key, NULL, &empty_env);
3297                 execle("/bin/getent", "getent", database, key, NULL, &empty_env);
3298                 _exit(EXIT_FAILURE);
3299         }
3300
3301         pipe_fds[1] = safe_close(pipe_fds[1]);
3302
3303         *rpid = pid;
3304
3305         return pipe_fds[0];
3306 }
3307
3308 static int change_uid_gid(char **_home) {
3309         char line[LINE_MAX], *x, *u, *g, *h;
3310         const char *word, *state;
3311         _cleanup_free_ uid_t *uids = NULL;
3312         _cleanup_free_ char *home = NULL;
3313         _cleanup_fclose_ FILE *f = NULL;
3314         _cleanup_close_ int fd = -1;
3315         unsigned n_uids = 0;
3316         size_t sz = 0, l;
3317         uid_t uid;
3318         gid_t gid;
3319         pid_t pid;
3320         int r;
3321
3322         assert(_home);
3323
3324         if (!arg_user || streq(arg_user, "root") || streq(arg_user, "0")) {
3325                 /* Reset everything fully to 0, just in case */
3326
3327                 if (setgroups(0, NULL) < 0)
3328                         return log_error_errno(errno, "setgroups() failed: %m");
3329
3330                 if (setresgid(0, 0, 0) < 0)
3331                         return log_error_errno(errno, "setregid() failed: %m");
3332
3333                 if (setresuid(0, 0, 0) < 0)
3334                         return log_error_errno(errno, "setreuid() failed: %m");
3335
3336                 *_home = NULL;
3337                 return 0;
3338         }
3339
3340         /* First, get user credentials */
3341         fd = spawn_getent("passwd", arg_user, &pid);
3342         if (fd < 0)
3343                 return fd;
3344
3345         f = fdopen(fd, "r");
3346         if (!f)
3347                 return log_oom();
3348         fd = -1;
3349
3350         if (!fgets(line, sizeof(line), f)) {
3351
3352                 if (!ferror(f)) {
3353                         log_error("Failed to resolve user %s.", arg_user);
3354                         return -ESRCH;
3355                 }
3356
3357                 log_error_errno(errno, "Failed to read from getent: %m");
3358                 return -errno;
3359         }
3360
3361         truncate_nl(line);
3362
3363         wait_for_terminate_and_warn("getent passwd", pid, true);
3364
3365         x = strchr(line, ':');
3366         if (!x) {
3367                 log_error("/etc/passwd entry has invalid user field.");
3368                 return -EIO;
3369         }
3370
3371         u = strchr(x+1, ':');
3372         if (!u) {
3373                 log_error("/etc/passwd entry has invalid password field.");
3374                 return -EIO;
3375         }
3376
3377         u++;
3378         g = strchr(u, ':');
3379         if (!g) {
3380                 log_error("/etc/passwd entry has invalid UID field.");
3381                 return -EIO;
3382         }
3383
3384         *g = 0;
3385         g++;
3386         x = strchr(g, ':');
3387         if (!x) {
3388                 log_error("/etc/passwd entry has invalid GID field.");
3389                 return -EIO;
3390         }
3391
3392         *x = 0;
3393         h = strchr(x+1, ':');
3394         if (!h) {
3395                 log_error("/etc/passwd entry has invalid GECOS field.");
3396                 return -EIO;
3397         }
3398
3399         h++;
3400         x = strchr(h, ':');
3401         if (!x) {
3402                 log_error("/etc/passwd entry has invalid home directory field.");
3403                 return -EIO;
3404         }
3405
3406         *x = 0;
3407
3408         r = parse_uid(u, &uid);
3409         if (r < 0) {
3410                 log_error("Failed to parse UID of user.");
3411                 return -EIO;
3412         }
3413
3414         r = parse_gid(g, &gid);
3415         if (r < 0) {
3416                 log_error("Failed to parse GID of user.");
3417                 return -EIO;
3418         }
3419
3420         home = strdup(h);
3421         if (!home)
3422                 return log_oom();
3423
3424         /* Second, get group memberships */
3425         fd = spawn_getent("initgroups", arg_user, &pid);
3426         if (fd < 0)
3427                 return fd;
3428
3429         fclose(f);
3430         f = fdopen(fd, "r");
3431         if (!f)
3432                 return log_oom();
3433         fd = -1;
3434
3435         if (!fgets(line, sizeof(line), f)) {
3436                 if (!ferror(f)) {
3437                         log_error("Failed to resolve user %s.", arg_user);
3438                         return -ESRCH;
3439                 }
3440
3441                 log_error_errno(errno, "Failed to read from getent: %m");
3442                 return -errno;
3443         }
3444
3445         truncate_nl(line);
3446
3447         wait_for_terminate_and_warn("getent initgroups", pid, true);
3448
3449         /* Skip over the username and subsequent separator whitespace */
3450         x = line;
3451         x += strcspn(x, WHITESPACE);
3452         x += strspn(x, WHITESPACE);
3453
3454         FOREACH_WORD(word, l, x, state) {
3455                 char c[l+1];
3456
3457                 memcpy(c, word, l);
3458                 c[l] = 0;
3459
3460                 if (!GREEDY_REALLOC(uids, sz, n_uids+1))
3461                         return log_oom();
3462
3463                 r = parse_uid(c, &uids[n_uids++]);
3464                 if (r < 0) {
3465                         log_error("Failed to parse group data from getent.");
3466                         return -EIO;
3467                 }
3468         }
3469
3470         r = mkdir_parents(home, 0775);
3471         if (r < 0)
3472                 return log_error_errno(r, "Failed to make home root directory: %m");
3473
3474         r = mkdir_safe(home, 0755, uid, gid);
3475         if (r < 0 && r != -EEXIST)
3476                 return log_error_errno(r, "Failed to make home directory: %m");
3477
3478         fchown(STDIN_FILENO, uid, gid);
3479         fchown(STDOUT_FILENO, uid, gid);
3480         fchown(STDERR_FILENO, uid, gid);
3481
3482         if (setgroups(n_uids, uids) < 0)
3483                 return log_error_errno(errno, "Failed to set auxiliary groups: %m");
3484
3485         if (setresgid(gid, gid, gid) < 0)
3486                 return log_error_errno(errno, "setregid() failed: %m");
3487
3488         if (setresuid(uid, uid, uid) < 0)
3489                 return log_error_errno(errno, "setreuid() failed: %m");
3490
3491         if (_home) {
3492                 *_home = home;
3493                 home = NULL;
3494         }
3495
3496         return 0;
3497 }
3498
3499 /*
3500  * Return values:
3501  * < 0 : wait_for_terminate() failed to get the state of the
3502  *       container, the container was terminated by a signal, or
3503  *       failed for an unknown reason.  No change is made to the
3504  *       container argument.
3505  * > 0 : The program executed in the container terminated with an
3506  *       error.  The exit code of the program executed in the
3507  *       container is returned.  The container argument has been set
3508  *       to CONTAINER_TERMINATED.
3509  *   0 : The container is being rebooted, has been shut down or exited
3510  *       successfully.  The container argument has been set to either
3511  *       CONTAINER_TERMINATED or CONTAINER_REBOOTED.
3512  *
3513  * That is, success is indicated by a return value of zero, and an
3514  * error is indicated by a non-zero value.
3515  */
3516 static int wait_for_container(pid_t pid, ContainerStatus *container) {
3517         siginfo_t status;
3518         int r;
3519
3520         r = wait_for_terminate(pid, &status);
3521         if (r < 0)
3522                 return log_warning_errno(r, "Failed to wait for container: %m");
3523
3524         switch (status.si_code) {
3525
3526         case CLD_EXITED:
3527                 if (status.si_status == 0) {
3528                         log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s exited successfully.", arg_machine);
3529
3530                 } else
3531                         log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s failed with error code %i.", arg_machine, status.si_status);
3532
3533                 *container = CONTAINER_TERMINATED;
3534                 return status.si_status;
3535
3536         case CLD_KILLED:
3537                 if (status.si_status == SIGINT) {
3538
3539                         log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s has been shut down.", arg_machine);
3540                         *container = CONTAINER_TERMINATED;
3541                         return 0;
3542
3543                 } else if (status.si_status == SIGHUP) {
3544
3545                         log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s is being rebooted.", arg_machine);
3546                         *container = CONTAINER_REBOOTED;
3547                         return 0;
3548                 }
3549
3550                 /* CLD_KILLED fallthrough */
3551
3552         case CLD_DUMPED:
3553                 log_error("Container %s terminated by signal %s.", arg_machine, signal_to_string(status.si_status));
3554                 return -EIO;
3555
3556         default:
3557                 log_error("Container %s failed due to unknown reason.", arg_machine);
3558                 return -EIO;
3559         }
3560
3561         return r;
3562 }
3563
/* Signal handler that intentionally does nothing. main() installs it
 * for SIGCHLD. */
static void nop_handler(int sig) {
}
3565
3566 static int on_orderly_shutdown(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) {
3567         pid_t pid;
3568
3569         pid = PTR_TO_UINT32(userdata);
3570         if (pid > 0) {
3571                 if (kill(pid, arg_kill_signal) >= 0) {
3572                         log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
3573                         sd_event_source_set_userdata(s, NULL);
3574                         return 0;
3575                 }
3576         }
3577
3578         sd_event_exit(sd_event_source_get_event(s), 0);
3579         return 0;
3580 }
3581
3582 static int determine_names(void) {
3583         int r;
3584
3585         if (!arg_image && !arg_directory) {
3586                 if (arg_machine) {
3587                         _cleanup_(image_unrefp) Image *i = NULL;
3588
3589                         r = image_find(arg_machine, &i);
3590                         if (r < 0)
3591                                 return log_error_errno(r, "Failed to find image for machine '%s': %m", arg_machine);
3592                         else if (r == 0) {
3593                                 log_error("No image for machine '%s': %m", arg_machine);
3594                                 return -ENOENT;
3595                         }
3596
3597                         if (i->type == IMAGE_RAW)
3598                                 r = set_sanitized_path(&arg_image, i->path);
3599                         else
3600                                 r = set_sanitized_path(&arg_directory, i->path);
3601                         if (r < 0)
3602                                 return log_error_errno(r, "Invalid image directory: %m");
3603
3604                         arg_read_only = arg_read_only || i->read_only;
3605                 } else
3606                         arg_directory = get_current_dir_name();
3607
3608                 if (!arg_directory && !arg_machine) {
3609                         log_error("Failed to determine path, please use -D or -i.");
3610                         return -EINVAL;
3611                 }
3612         }
3613
3614         if (!arg_machine) {
3615                 if (arg_directory && path_equal(arg_directory, "/"))
3616                         arg_machine = gethostname_malloc();
3617                 else
3618                         arg_machine = strdup(basename(arg_image ?: arg_directory));
3619
3620                 if (!arg_machine)
3621                         return log_oom();
3622
3623                 hostname_cleanup(arg_machine, false);
3624                 if (!machine_name_is_valid(arg_machine)) {
3625                         log_error("Failed to determine machine name automatically, please use -M.");
3626                         return -EINVAL;
3627                 }
3628
3629                 if (arg_ephemeral) {
3630                         char *b;
3631
3632                         /* Add a random suffix when this is an
3633                          * ephemeral machine, so that we can run many
3634                          * instances at once without manually having
3635                          * to specify -M each time. */
3636
3637                         if (asprintf(&b, "%s-%016" PRIx64, arg_machine, random_u64()) < 0)
3638                                 return log_oom();
3639
3640                         free(arg_machine);
3641                         arg_machine = b;
3642                 }
3643         }
3644
3645         return 0;
3646 }
3647
3648 static int determine_uid_shift(void) {
3649         int r;
3650
3651         if (!arg_userns)
3652                 return 0;
3653
3654         if (arg_uid_shift == UID_INVALID) {
3655                 struct stat st;
3656
3657                 r = stat(arg_directory, &st);
3658                 if (r < 0)
3659                         return log_error_errno(errno, "Failed to determine UID base of %s: %m", arg_directory);
3660
3661                 arg_uid_shift = st.st_uid & UINT32_C(0xffff0000);
3662
3663                 if (arg_uid_shift != (st.st_gid & UINT32_C(0xffff0000))) {
3664                         log_error("UID and GID base of %s don't match.", arg_directory);
3665                         return -EINVAL;
3666                 }
3667
3668                 arg_uid_range = UINT32_C(0x10000);
3669         }
3670
3671         if (arg_uid_shift > (uid_t) -1 - arg_uid_range) {
3672                 log_error("UID base too high for UID range.");
3673                 return -EINVAL;
3674         }
3675
3676         log_info("Using user namespaces with base " UID_FMT " and range " UID_FMT ".", arg_uid_shift, arg_uid_range);
3677         return 0;
3678 }
3679
3680 int main(int argc, char *argv[]) {
3681
3682         _cleanup_free_ char *device_path = NULL, *root_device = NULL, *home_device = NULL, *srv_device = NULL, *console = NULL;
3683         bool root_device_rw = true, home_device_rw = true, srv_device_rw = true;
3684         _cleanup_close_ int master = -1, image_fd = -1;
3685         _cleanup_fdset_free_ FDSet *fds = NULL;
3686         int r, n_fd_passed, loop_nr = -1;
3687         char veth_name[IFNAMSIZ];
3688         bool secondary = false, remove_subvol = false;
3689         sigset_t mask, mask_chld;
3690         pid_t pid = 0;
3691         int ret = EXIT_SUCCESS;
3692         union in_addr_union exposed = {};
3693         _cleanup_release_lock_file_ LockFile tree_global_lock = LOCK_FILE_INIT, tree_local_lock = LOCK_FILE_INIT;
3694         bool interactive;
3695
3696         log_parse_environment();
3697         log_open();
3698
3699         r = parse_argv(argc, argv);
3700         if (r <= 0)
3701                 goto finish;
3702
3703         r = determine_names();
3704         if (r < 0)
3705                 goto finish;
3706
3707         if (geteuid() != 0) {
3708                 log_error("Need to be root.");
3709                 r = -EPERM;
3710                 goto finish;
3711         }
3712
3713         log_close();
3714         n_fd_passed = sd_listen_fds(false);
3715         if (n_fd_passed > 0) {
3716                 r = fdset_new_listen_fds(&fds, false);
3717                 if (r < 0) {
3718                         log_error_errno(r, "Failed to collect file descriptors: %m");
3719                         goto finish;
3720                 }
3721         }
3722         fdset_close_others(fds);
3723         log_open();
3724
3725         if (arg_directory) {
3726                 assert(!arg_image);
3727
3728                 if (path_equal(arg_directory, "/") && !arg_ephemeral) {
3729                         log_error("Spawning container on root directory is not supported. Consider using --ephemeral.");
3730                         r = -EINVAL;
3731                         goto finish;
3732                 }
3733
3734                 if (arg_ephemeral) {
3735                         _cleanup_free_ char *np = NULL;
3736
3737                         /* If the specified path is a mount point we
3738                          * generate the new snapshot immediately
3739                          * inside it under a random name. However if
3740                          * the specified is not a mount point we
3741                          * create the new snapshot in the parent
3742                          * directory, just next to it. */
3743                         r = path_is_mount_point(arg_directory, false);
3744                         if (r < 0) {
3745                                 log_error_errno(r, "Failed to determine whether directory %s is mount point: %m", arg_directory);
3746                                 goto finish;
3747                         }
3748                         if (r > 0)
3749                                 r = tempfn_random_child(arg_directory, &np);
3750                         else
3751                                 r = tempfn_random(arg_directory, &np);
3752                         if (r < 0) {
3753                                 log_error_errno(r, "Failed to generate name for snapshot: %m");
3754                                 goto finish;
3755                         }
3756
3757                         r = image_path_lock(np, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3758                         if (r < 0) {
3759                                 log_error_errno(r, "Failed to lock %s: %m", np);
3760                                 goto finish;
3761                         }
3762
3763                         r = btrfs_subvol_snapshot(arg_directory, np, arg_read_only, true);
3764                         if (r < 0) {
3765                                 log_error_errno(r, "Failed to create snapshot %s from %s: %m", np, arg_directory);
3766                                 goto finish;
3767                         }
3768
3769                         free(arg_directory);
3770                         arg_directory = np;
3771                         np = NULL;
3772
3773                         remove_subvol = true;
3774
3775                 } else {
3776                         r = image_path_lock(arg_directory, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3777                         if (r == -EBUSY) {
3778                                 log_error_errno(r, "Directory tree %s is currently busy.", arg_directory);
3779                                 goto finish;
3780                         }
3781                         if (r < 0) {
3782                                 log_error_errno(r, "Failed to lock %s: %m", arg_directory);
3783                                 return r;
3784                         }
3785
3786                         if (arg_template) {
3787                                 r = btrfs_subvol_snapshot(arg_template, arg_directory, arg_read_only, true);
3788                                 if (r == -EEXIST) {
3789                                         if (!arg_quiet)
3790                                                 log_info("Directory %s already exists, not populating from template %s.", arg_directory, arg_template);
3791                                 } else if (r < 0) {
3792                                         log_error_errno(r, "Couldn't create snapshot %s from %s: %m", arg_directory, arg_template);
3793                                         goto finish;
3794                                 } else {
3795                                         if (!arg_quiet)
3796                                                 log_info("Populated %s from template %s.", arg_directory, arg_template);
3797                                 }
3798                         }
3799                 }
3800
3801                 if (arg_boot) {
3802                         if (path_is_os_tree(arg_directory) <= 0) {
3803                                 log_error("Directory %s doesn't look like an OS root directory (os-release file is missing). Refusing.", arg_directory);
3804                                 r = -EINVAL;
3805                                 goto finish;
3806                         }
3807                 } else {
3808                         const char *p;
3809
3810                         p = strjoina(arg_directory,
3811                                        argc > optind && path_is_absolute(argv[optind]) ? argv[optind] : "/usr/bin/");
3812                         if (access(p, F_OK) < 0) {
3813                                 log_error("Directory %s lacks the binary to execute or doesn't look like a binary tree. Refusing.", arg_directory);
3814                                 r = -EINVAL;
3815                                 goto finish;
3816                         }
3817                 }
3818
3819         } else {
3820                 char template[] = "/tmp/nspawn-root-XXXXXX";
3821
3822                 assert(arg_image);
3823                 assert(!arg_template);
3824
3825                 r = image_path_lock(arg_image, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3826                 if (r == -EBUSY) {
3827                         r = log_error_errno(r, "Disk image %s is currently busy.", arg_image);
3828                         goto finish;
3829                 }
3830                 if (r < 0) {
3831                         r = log_error_errno(r, "Failed to create image lock: %m");
3832                         goto finish;
3833                 }
3834
3835                 if (!mkdtemp(template)) {
3836                         log_error_errno(errno, "Failed to create temporary directory: %m");
3837                         r = -errno;
3838                         goto finish;
3839                 }
3840
3841                 arg_directory = strdup(template);
3842                 if (!arg_directory) {
3843                         r = log_oom();
3844                         goto finish;
3845                 }
3846
3847                 image_fd = setup_image(&device_path, &loop_nr);
3848                 if (image_fd < 0) {
3849                         r = image_fd;
3850                         goto finish;
3851                 }
3852
3853                 r = dissect_image(image_fd,
3854                                   &root_device, &root_device_rw,
3855                                   &home_device, &home_device_rw,
3856                                   &srv_device, &srv_device_rw,
3857                                   &secondary);
3858                 if (r < 0)
3859                         goto finish;
3860         }
3861
3862         r = determine_uid_shift();
3863         if (r < 0)
3864                 goto finish;
3865
3866         interactive = isatty(STDIN_FILENO) > 0 && isatty(STDOUT_FILENO) > 0;
3867
3868         master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
3869         if (master < 0) {
3870                 r = log_error_errno(errno, "Failed to acquire pseudo tty: %m");
3871                 goto finish;
3872         }
3873
3874         r = ptsname_malloc(master, &console);
3875         if (r < 0) {
3876                 r = log_error_errno(r, "Failed to determine tty name: %m");
3877                 goto finish;
3878         }
3879
3880         if (unlockpt(master) < 0) {
3881                 r = log_error_errno(errno, "Failed to unlock tty: %m");
3882                 goto finish;
3883         }
3884
3885         if (!arg_quiet)
3886                 log_info("Spawning container %s on %s.\nPress ^] three times within 1s to kill container.",
3887                          arg_machine, arg_image ?: arg_directory);
3888
3889         assert_se(sigemptyset(&mask) == 0);
3890         sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
3891         assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
3892
3893         assert_se(sigemptyset(&mask_chld) == 0);
3894         assert_se(sigaddset(&mask_chld, SIGCHLD) == 0);
3895
3896         for (;;) {
3897                 _cleanup_close_pair_ int kmsg_socket_pair[2] = { -1, -1 }, rtnl_socket_pair[2] = { -1, -1 };
3898                 ContainerStatus container_status;
3899                 _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
3900                 struct sigaction sa = {
3901                         .sa_handler = nop_handler,
3902                         .sa_flags = SA_NOCLDSTOP,
3903                 };
3904
3905                 r = barrier_create(&barrier);
3906                 if (r < 0) {
3907                         log_error_errno(r, "Cannot initialize IPC barrier: %m");
3908                         goto finish;
3909                 }
3910
3911                 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
3912                         r = log_error_errno(errno, "Failed to create kmsg socket pair: %m");
3913                         goto finish;
3914                 }
3915
3916                 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, rtnl_socket_pair) < 0) {
3917                         r = log_error_errno(errno, "Failed to create rtnl socket pair: %m");
3918                         goto finish;
3919                 }
3920
3921                 /* Child can be killed before execv(), so handle SIGCHLD
3922                  * in order to interrupt parent's blocking calls and
3923                  * give it a chance to call wait() and terminate. */
3924                 r = sigprocmask(SIG_UNBLOCK, &mask_chld, NULL);
3925                 if (r < 0) {
3926                         r = log_error_errno(errno, "Failed to change the signal mask: %m");
3927                         goto finish;
3928                 }
3929
3930                 r = sigaction(SIGCHLD, &sa, NULL);
3931                 if (r < 0) {
3932                         r = log_error_errno(errno, "Failed to install SIGCHLD handler: %m");
3933                         goto finish;
3934                 }
3935
3936                 pid = raw_clone(SIGCHLD|CLONE_NEWNS|
3937                                 (arg_share_system ? 0 : CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS)|
3938                                 (arg_private_network ? CLONE_NEWNET : 0), NULL);
3939                 if (pid < 0) {
3940                         if (errno == EINVAL)
3941                                 r = log_error_errno(errno, "clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
3942                         else
3943                                 r = log_error_errno(errno, "clone() failed: %m");
3944
3945                         goto finish;
3946                 }
3947
3948                 if (pid == 0) {
3949                         /* child */
3950                         _cleanup_free_ char *home = NULL;
3951                         unsigned n_env = 2;
3952                         const char *envp[] = {
3953                                 "PATH=" DEFAULT_PATH_SPLIT_USR,
3954                                 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
3955                                 NULL, /* TERM */
3956                                 NULL, /* HOME */
3957                                 NULL, /* USER */
3958                                 NULL, /* LOGNAME */
3959                                 NULL, /* container_uuid */
3960                                 NULL, /* LISTEN_FDS */
3961                                 NULL, /* LISTEN_PID */
3962                                 NULL
3963                         };
3964                         char **env_use;
3965
3966                         barrier_set_role(&barrier, BARRIER_CHILD);
3967
3968                         envp[n_env] = strv_find_prefix(environ, "TERM=");
3969                         if (envp[n_env])
3970                                 n_env ++;
3971
3972                         master = safe_close(master);
3973
3974                         kmsg_socket_pair[0] = safe_close(kmsg_socket_pair[0]);
3975                         rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
3976
3977                         reset_all_signal_handlers();
3978                         reset_signal_mask();
3979
3980                         if (interactive) {
3981                                 close_nointr(STDIN_FILENO);
3982                                 close_nointr(STDOUT_FILENO);
3983                                 close_nointr(STDERR_FILENO);
3984
3985                                 r = open_terminal(console, O_RDWR);
3986                                 if (r != STDIN_FILENO) {
3987                                         if (r >= 0) {
3988                                                 safe_close(r);
3989                                                 r = -EINVAL;
3990                                         }
3991
3992                                         log_error_errno(r, "Failed to open console: %m");
3993                                         _exit(EXIT_FAILURE);
3994                                 }
3995
3996                                 if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
3997                                     dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO) {
3998                                         log_error_errno(errno, "Failed to duplicate console: %m");
3999                                         _exit(EXIT_FAILURE);
4000                                 }
4001                         }
4002
4003                         if (setsid() < 0) {
4004                                 log_error_errno(errno, "setsid() failed: %m");
4005                                 _exit(EXIT_FAILURE);
4006                         }
4007
4008                         if (reset_audit_loginuid() < 0)
4009                                 _exit(EXIT_FAILURE);
4010
4011                         if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
4012                                 log_error_errno(errno, "PR_SET_PDEATHSIG failed: %m");
4013                                 _exit(EXIT_FAILURE);
4014                         }
4015
4016                         if (arg_private_network)
4017                                 loopback_setup();
4018
4019                         /* Mark everything as slave, so that we still
4020                          * receive mounts from the real root, but don't
4021                          * propagate mounts to the real root. */
4022                         if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
4023                                 log_error_errno(errno, "MS_SLAVE|MS_REC failed: %m");
4024                                 _exit(EXIT_FAILURE);
4025                         }
4026
4027                         if (mount_devices(arg_directory,
4028                                           root_device, root_device_rw,
4029                                           home_device, home_device_rw,
4030                                           srv_device, srv_device_rw) < 0)
4031                                 _exit(EXIT_FAILURE);
4032
4033                         /* Turn directory into bind mount */
4034                         if (mount(arg_directory, arg_directory, NULL, MS_BIND|MS_REC, NULL) < 0) {
4035                                 log_error_errno(errno, "Failed to make bind mount: %m");
4036                                 _exit(EXIT_FAILURE);
4037                         }
4038
4039                         r = setup_volatile(arg_directory);
4040                         if (r < 0)
4041                                 _exit(EXIT_FAILURE);
4042
4043                         if (setup_volatile_state(arg_directory) < 0)
4044                                 _exit(EXIT_FAILURE);
4045
4046                         r = base_filesystem_create(arg_directory);
4047                         if (r < 0)
4048                                 _exit(EXIT_FAILURE);
4049
4050                         if (arg_read_only) {
4051                                 r = bind_remount_recursive(arg_directory, true);
4052                                 if (r < 0) {
4053                                         log_error_errno(r, "Failed to make tree read-only: %m");
4054                                         _exit(EXIT_FAILURE);
4055                                 }
4056                         }
4057
4058                         if (mount_all(arg_directory) < 0)
4059                                 _exit(EXIT_FAILURE);
4060
4061                         if (copy_devnodes(arg_directory) < 0)
4062                                 _exit(EXIT_FAILURE);
4063
4064                         if (setup_ptmx(arg_directory) < 0)
4065                                 _exit(EXIT_FAILURE);
4066
4067                         dev_setup(arg_directory);
4068
4069                         if (setup_propagate(arg_directory) < 0)
4070                                 _exit(EXIT_FAILURE);
4071
4072                         if (setup_seccomp() < 0)
4073                                 _exit(EXIT_FAILURE);
4074
4075                         if (setup_dev_console(arg_directory, console) < 0)
4076                                 _exit(EXIT_FAILURE);
4077
4078                         if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
4079                                 _exit(EXIT_FAILURE);
4080                         kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
4081
4082                         if (send_rtnl(rtnl_socket_pair[1]) < 0)
4083                                 _exit(EXIT_FAILURE);
4084                         rtnl_socket_pair[1] = safe_close(rtnl_socket_pair[1]);
4085
4086                         /* Tell the parent that we are ready, and that
4087                          * it can cgroupify us so that we lack access
4088                          * to certain devices and resources. */
4089                         (void) barrier_place(&barrier); /* #1 */
4090
4091                         if (setup_boot_id(arg_directory) < 0)
4092                                 _exit(EXIT_FAILURE);
4093
4094                         if (setup_timezone(arg_directory) < 0)
4095                                 _exit(EXIT_FAILURE);
4096
4097                         if (setup_resolv_conf(arg_directory) < 0)
4098                                 _exit(EXIT_FAILURE);
4099
4100                         if (setup_journal(arg_directory) < 0)
4101                                 _exit(EXIT_FAILURE);
4102
4103                         if (mount_binds(arg_directory, arg_bind, false) < 0)
4104                                 _exit(EXIT_FAILURE);
4105
4106                         if (mount_binds(arg_directory, arg_bind_ro, true) < 0)
4107                                 _exit(EXIT_FAILURE);
4108
4109                         if (mount_tmpfs(arg_directory) < 0)
4110                                 _exit(EXIT_FAILURE);
4111
4112                         /* Wait until we are cgroup-ified, so that we
4113                          * can mount the right cgroup path writable */
4114                         (void) barrier_place_and_sync(&barrier); /* #2 */
4115
4116                         if (mount_cgroup(arg_directory) < 0)
4117                                 _exit(EXIT_FAILURE);
4118
4119                         if (chdir(arg_directory) < 0) {
4120                                 log_error_errno(errno, "chdir(%s) failed: %m", arg_directory);
4121                                 _exit(EXIT_FAILURE);
4122                         }
4123
4124                         if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) {
4125                                 log_error_errno(errno, "mount(MS_MOVE) failed: %m");
4126                                 _exit(EXIT_FAILURE);
4127                         }
4128
4129                         if (chroot(".") < 0) {
4130                                 log_error_errno(errno, "chroot() failed: %m");
4131                                 _exit(EXIT_FAILURE);
4132                         }
4133
4134                         if (chdir("/") < 0) {
4135                                 log_error_errno(errno, "chdir() failed: %m");
4136                                 _exit(EXIT_FAILURE);
4137                         }
4138
4139                         if (arg_userns) {
4140                                 if (unshare(CLONE_NEWUSER) < 0) {
4141                                         log_error_errno(errno, "unshare(CLONE_NEWUSER) failed: %m");
4142                                         _exit(EXIT_FAILURE);
4143                                 }
4144
4145                                 /* Tell the parent, that it now can
4146                                  * write the UID map. */
4147                                 (void) barrier_place(&barrier); /* #3 */
4148
4149                                 /* Wait until the parent wrote the UID
4150                                  * map */
4151                                 (void) barrier_place_and_sync(&barrier); /* #4 */
4152                         }
4153
4154                         umask(0022);
4155
4156                         if (drop_capabilities() < 0) {
4157                                 log_error_errno(errno, "drop_capabilities() failed: %m");
4158                                 _exit(EXIT_FAILURE);
4159                         }
4160
4161                         setup_hostname();
4162
4163                         if (arg_personality != 0xffffffffLU) {
4164                                 if (personality(arg_personality) < 0) {
4165                                         log_error_errno(errno, "personality() failed: %m");
4166                                         _exit(EXIT_FAILURE);
4167                                 }
4168                         } else if (secondary) {
4169                                 if (personality(PER_LINUX32) < 0) {
4170                                         log_error_errno(errno, "personality() failed: %m");
4171                                         _exit(EXIT_FAILURE);
4172                                 }
4173                         }
4174
4175 #ifdef HAVE_SELINUX
4176                         if (arg_selinux_context)
4177                                 if (setexeccon((security_context_t) arg_selinux_context) < 0) {
4178                                         log_error_errno(errno, "setexeccon(\"%s\") failed: %m", arg_selinux_context);
4179                                         _exit(EXIT_FAILURE);
4180                                 }
4181 #endif
4182
4183                         r = change_uid_gid(&home);
4184                         if (r < 0)
4185                                 _exit(EXIT_FAILURE);
4186
4187                         if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
4188                             (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
4189                             (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
4190                                 log_oom();
4191                                 _exit(EXIT_FAILURE);
4192                         }
4193
4194                         if (!sd_id128_equal(arg_uuid, SD_ID128_NULL)) {
4195                                 char as_uuid[37];
4196
4197                                 if (asprintf((char**)(envp + n_env++), "container_uuid=%s", id128_format_as_uuid(arg_uuid, as_uuid)) < 0) {
4198                                         log_oom();
4199                                         _exit(EXIT_FAILURE);
4200                                 }
4201                         }
4202
4203                         if (fdset_size(fds) > 0) {
4204                                 r = fdset_cloexec(fds, false);
4205                                 if (r < 0) {
4206                                         log_error_errno(r, "Failed to unset O_CLOEXEC for file descriptors.");
4207                                         _exit(EXIT_FAILURE);
4208                                 }
4209
4210                                 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", n_fd_passed) < 0) ||
4211                                     (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0)) {
4212                                         log_oom();
4213                                         _exit(EXIT_FAILURE);
4214                                 }
4215                         }
4216
4217                         if (!strv_isempty(arg_setenv)) {
4218                                 char **n;
4219
4220                                 n = strv_env_merge(2, envp, arg_setenv);
4221                                 if (!n) {
4222                                         log_oom();
4223                                         _exit(EXIT_FAILURE);
4224                                 }
4225
4226                                 env_use = n;
4227                         } else
4228                                 env_use = (char**) envp;
4229
4230                         /* Let the parent know that we are ready and
4231                          * wait until the parent is ready with the
4232                          * setup, too... */
4233                         (void) barrier_place_and_sync(&barrier); /* #5 */
4234
4235                         if (arg_boot) {
4236                                 char **a;
4237                                 size_t l;
4238
4239                                 /* Automatically search for the init system */
4240
4241                                 l = 1 + argc - optind;
4242                                 a = newa(char*, l + 1);
4243                                 memcpy(a + 1, argv + optind, l * sizeof(char*));
4244
4245                                 a[0] = (char*) "/usr/lib/systemd/systemd";
4246                                 execve(a[0], a, env_use);
4247
4248                                 a[0] = (char*) "/lib/systemd/systemd";
4249                                 execve(a[0], a, env_use);
4250
4251                                 a[0] = (char*) "/sbin/init";
4252                                 execve(a[0], a, env_use);
4253                         } else if (argc > optind)
4254                                 execvpe(argv[optind], argv + optind, env_use);
4255                         else {
4256                                 chdir(home ? home : "/root");
4257                                 execle("/bin/bash", "-bash", NULL, env_use);
4258                                 execle("/bin/sh", "-sh", NULL, env_use);
4259                         }
4260
4261                         log_error_errno(errno, "execv() failed: %m");
4262                         _exit(EXIT_FAILURE);
4263                 }
4264
4265                 barrier_set_role(&barrier, BARRIER_PARENT);
4266                 fdset_free(fds);
4267                 fds = NULL;
4268
4269                 kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
4270                 rtnl_socket_pair[1] = safe_close(rtnl_socket_pair[1]);
4271
4272                 (void) barrier_place(&barrier); /* #1 */
4273
4274                 /* Wait for the most basic Child-setup to be done,
4275                  * before we add hardware to it, and place it in a
4276                  * cgroup. */
4277                 if (barrier_sync(&barrier)) { /* #1 */
4278                         int ifi = 0;
4279
4280                         r = move_network_interfaces(pid);
4281                         if (r < 0)
4282                                 goto finish;
4283
4284                         r = setup_veth(pid, veth_name, &ifi);
4285                         if (r < 0)
4286                                 goto finish;
4287
4288                         r = setup_bridge(veth_name, &ifi);
4289                         if (r < 0)
4290                                 goto finish;
4291
4292                         r = setup_macvlan(pid);
4293                         if (r < 0)
4294                                 goto finish;
4295
4296                         r = setup_ipvlan(pid);
4297                         if (r < 0)
4298                                 goto finish;
4299
4300                         r = register_machine(pid, ifi);
4301                         if (r < 0)
4302                                 goto finish;
4303
4304                         /* Notify the child that the parent is ready with all
4305                          * its setup, and that the child can now hand over
4306                          * control to the code to run inside the container. */
4307                         (void) barrier_place(&barrier); /* #2 */
4308
4309                         if (arg_userns) {
4310                                 char uid_map[strlen("/proc//uid_map") + DECIMAL_STR_MAX(uid_t) + 1], line[DECIMAL_STR_MAX(uid_t)*3+3+1];
4311
4312                                 (void) barrier_place_and_sync(&barrier); /* #3 */
4313
4314                                 xsprintf(uid_map, "/proc/" PID_FMT "/uid_map", pid);
4315                                 xsprintf(line, UID_FMT " " UID_FMT " " UID_FMT "\n", 0, arg_uid_shift, arg_uid_range);
4316                                 r = write_string_file(uid_map, line);
4317                                 if (r < 0) {
4318                                         log_error_errno(r, "Failed to write UID map: %m");
4319                                         goto finish;
4320                                 }
4321
4322                                 /* We always assign the same UID and GID ranges */
4323                                 xsprintf(uid_map, "/proc/" PID_FMT "/gid_map", pid);
4324                                 r = write_string_file(uid_map, line);
4325                                 if (r < 0) {
4326                                         log_error_errno(r, "Failed to write GID map: %m");
4327                                         goto finish;
4328                                 }
4329
4330                                 (void) barrier_place(&barrier); /* #4 */
4331                         }
4332
4333                         /* Block SIGCHLD here, before notifying child.
4334                          * process_pty() will handle it with the other signals. */
4335                         r = sigprocmask(SIG_BLOCK, &mask_chld, NULL);
4336                         if (r < 0)
4337                                 goto finish;
4338
4339                         /* Reset signal to default */
4340                         r = default_signals(SIGCHLD, -1);
4341                         if (r < 0)
4342                                 goto finish;
4343
4344                         /* Let the child know that we are ready and wait until the child is completely ready now. */
4345                         if (barrier_place_and_sync(&barrier)) { /* #5 */
4346                                 _cleanup_event_unref_ sd_event *event = NULL;
4347                                 _cleanup_(pty_forward_freep) PTYForward *forward = NULL;
4348                                 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
4349                                 char last_char = 0;
4350
4351                                 sd_notifyf(false,
4352                                            "READY=1\n"
4353                                            "STATUS=Container running.\n"
4354                                            "X_NSPAWN_LEADER_PID=" PID_FMT, pid);
4355
4356                                 r = sd_event_new(&event);
4357                                 if (r < 0) {
4358                                         log_error_errno(r, "Failed to get default event source: %m");
4359                                         goto finish;
4360                                 }
4361
4362                                 if (arg_kill_signal > 0) {
4363                                         /* Try to kill the init system on SIGINT or SIGTERM */
4364                                         sd_event_add_signal(event, NULL, SIGINT, on_orderly_shutdown, UINT32_TO_PTR(pid));
4365                                         sd_event_add_signal(event, NULL, SIGTERM, on_orderly_shutdown, UINT32_TO_PTR(pid));
4366                                 } else {
4367                                         /* Immediately exit */
4368                                         sd_event_add_signal(event, NULL, SIGINT, NULL, NULL);
4369                                         sd_event_add_signal(event, NULL, SIGTERM, NULL, NULL);
4370                                 }
4371
4372                                 /* simply exit on sigchld */
4373                                 sd_event_add_signal(event, NULL, SIGCHLD, NULL, NULL);
4374
4375                                 if (arg_expose_ports) {
4376                                         r = watch_rtnl(event, rtnl_socket_pair[0], &exposed, &rtnl);
4377                                         if (r < 0)
4378                                                 goto finish;
4379
4380                                         (void) expose_ports(rtnl, &exposed);
4381                                 }
4382
4383                                 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
4384
4385                                 r = pty_forward_new(event, master, true, !interactive, &forward);
4386                                 if (r < 0) {
4387                                         log_error_errno(r, "Failed to create PTY forwarder: %m");
4388                                         goto finish;
4389                                 }
4390
4391                                 r = sd_event_loop(event);
4392                                 if (r < 0) {
4393                                         log_error_errno(r, "Failed to run event loop: %m");
4394                                         goto finish;
4395                                 }
4396
4397                                 pty_forward_get_last_char(forward, &last_char);
4398
4399                                 forward = pty_forward_free(forward);
4400
4401                                 if (!arg_quiet && last_char != '\n')
4402                                         putc('\n', stdout);
4403
4404                                 /* Kill if it is not dead yet anyway */
4405                                 terminate_machine(pid);
4406                         }
4407                 }
4408
4409                 /* Normally redundant, but better safe than sorry */
4410                 kill(pid, SIGKILL);
4411
4412                 r = wait_for_container(pid, &container_status);
4413                 pid = 0;
4414
4415                 if (r < 0)
4416                         /* We failed to wait for the container, or the
4417                          * container exited abnormally */
4418                         goto finish;
4419                 else if (r > 0 || container_status == CONTAINER_TERMINATED){
4420                         /* The container exited with a non-zero
4421                          * status, or with zero status and no reboot
4422                          * was requested. */
4423                         ret = r;
4424                         break;
4425                 }
4426
4427                 /* CONTAINER_REBOOTED, loop again */
4428
4429                 if (arg_keep_unit) {
4430                         /* Special handling if we are running as a
4431                          * service: instead of simply restarting the
4432                          * machine we want to restart the entire
4433                          * service, so let's inform systemd about this
4434                          * with the special exit code 133. The service
4435                          * file uses RestartForceExitStatus=133 so
4436                          * that this results in a full nspawn
4437                          * restart. This is necessary since we might
4438                          * have cgroup parameters set we want to have
4439                          * flushed out. */
4440                         ret = 133;
4441                         r = 0;
4442                         break;
4443                 }
4444
4445                 flush_ports(&exposed);
4446         }
4447
4448 finish:
4449         sd_notify(false,
4450                   "STOPPING=1\n"
4451                   "STATUS=Terminating...");
4452
4453         loop_remove(loop_nr, &image_fd);
4454
4455         if (pid > 0)
4456                 kill(pid, SIGKILL);
4457
4458         if (remove_subvol && arg_directory) {
4459                 int k;
4460
4461                 k = btrfs_subvol_remove(arg_directory);
4462                 if (k < 0)
4463                         log_warning_errno(k, "Cannot remove subvolume '%s', ignoring: %m", arg_directory);
4464         }
4465
4466         if (arg_machine) {
4467                 const char *p;
4468
4469                 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
4470                 (void) rm_rf(p, false, true, false);
4471         }
4472
4473         free(arg_directory);
4474         free(arg_template);
4475         free(arg_image);
4476         free(arg_machine);
4477         free(arg_user);
4478         strv_free(arg_setenv);
4479         strv_free(arg_network_interfaces);
4480         strv_free(arg_network_macvlan);
4481         strv_free(arg_network_ipvlan);
4482         strv_free(arg_bind);
4483         strv_free(arg_bind_ro);
4484         strv_free(arg_tmpfs);
4485
4486         flush_ports(&exposed);
4487
4488         while (arg_expose_ports) {
4489                 ExposePort *p = arg_expose_ports;
4490                 LIST_REMOVE(ports, arg_expose_ports, p);
4491                 free(p);
4492         }
4493
4494         return r < 0 ? EXIT_FAILURE : ret;
4495 }