chiark / gitweb /
nspawn: fix whitespace and typo in partition table blurb
[elogind.git] / src / nspawn / nspawn.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4   This file is part of systemd.
5
6   Copyright 2010 Lennart Poettering
7
8   systemd is free software; you can redistribute it and/or modify it
9   under the terms of the GNU Lesser General Public License as published by
10   the Free Software Foundation; either version 2.1 of the License, or
11   (at your option) any later version.
12
13   systemd is distributed in the hope that it will be useful, but
14   WITHOUT ANY WARRANTY; without even the implied warranty of
15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16   Lesser General Public License for more details.
17
18   You should have received a copy of the GNU Lesser General Public License
19   along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <signal.h>
23 #include <sched.h>
24 #include <unistd.h>
25 #include <sys/types.h>
26 #include <sys/syscall.h>
27 #include <sys/mount.h>
28 #include <sys/wait.h>
29 #include <stdlib.h>
30 #include <string.h>
31 #include <stdio.h>
32 #include <errno.h>
33 #include <sys/prctl.h>
34 #include <getopt.h>
35 #include <termios.h>
36 #include <sys/signalfd.h>
37 #include <grp.h>
38 #include <linux/fs.h>
39 #include <sys/un.h>
40 #include <sys/socket.h>
41 #include <linux/netlink.h>
42 #include <net/if.h>
43 #include <linux/veth.h>
44 #include <sys/personality.h>
45 #include <linux/loop.h>
46 #include <poll.h>
47 #include <sys/file.h>
48
49 #ifdef HAVE_SELINUX
50 #include <selinux/selinux.h>
51 #endif
52
53 #ifdef HAVE_SECCOMP
54 #include <seccomp.h>
55 #endif
56
57 #ifdef HAVE_BLKID
58 #include <blkid/blkid.h>
59 #endif
60
61 #include "sd-daemon.h"
62 #include "sd-bus.h"
63 #include "sd-id128.h"
64 #include "sd-rtnl.h"
65 #include "log.h"
66 #include "util.h"
67 #include "mkdir.h"
68 #include "macro.h"
69 #include "audit.h"
70 #include "missing.h"
71 #include "cgroup-util.h"
72 #include "strv.h"
73 #include "path-util.h"
74 #include "loopback-setup.h"
75 #include "dev-setup.h"
76 #include "fdset.h"
77 #include "build.h"
78 #include "fileio.h"
79 #include "bus-util.h"
80 #include "bus-error.h"
81 #include "ptyfwd.h"
82 #include "bus-kernel.h"
83 #include "env-util.h"
84 #include "def.h"
85 #include "rtnl-util.h"
86 #include "udev-util.h"
87 #include "blkid-util.h"
88 #include "gpt.h"
89 #include "siphash24.h"
90 #include "copy.h"
91 #include "base-filesystem.h"
92 #include "barrier.h"
93 #include "event-util.h"
94 #include "capability.h"
95 #include "cap-list.h"
96 #include "btrfs-util.h"
97 #include "machine-image.h"
98 #include "list.h"
99 #include "in-addr-util.h"
100 #include "fw-util.h"
101 #include "local-addresses.h"
102
103 #ifdef HAVE_SECCOMP
104 #include "seccomp-util.h"
105 #endif
106
107 typedef struct ExposePort {
108         int protocol;
109         uint16_t host_port;
110         uint16_t container_port;
111         LIST_FIELDS(struct ExposePort, ports);
112 } ExposePort;
113
/* Final state of a container run. NOTE(review): the code consuming these
 * values is outside this excerpt; meaning inferred from the names only. */
enum ContainerStatus {
        CONTAINER_TERMINATED = 0,
        CONTAINER_REBOOTED = 1
};

typedef enum ContainerStatus ContainerStatus;
118
/* Journal linking mode, chosen with --link-journal= (or -j). The "try-"
 * variants of the switch pick LINK_GUEST/LINK_HOST here and additionally
 * set arg_link_journal_try. */
enum LinkJournal {
        LINK_NO = 0,    /* --link-journal=no */
        LINK_AUTO = 1,  /* --link-journal=auto, also the built-in default */
        LINK_HOST = 2,  /* --link-journal=host / try-host */
        LINK_GUEST = 3  /* --link-journal=guest / try-guest / -j */
};

typedef enum LinkJournal LinkJournal;
125
/* Volatile mode, chosen with --volatile[=MODE]. */
enum Volatile {
        VOLATILE_NO = 0,     /* default, or an explicit boolean "false" argument */
        VOLATILE_YES = 1,    /* switch given without argument, or boolean "true" */
        VOLATILE_STATE = 2,  /* --volatile=state */
};

typedef enum Volatile Volatile;
131
132 static char *arg_directory = NULL;
133 static char *arg_template = NULL;
134 static char *arg_user = NULL;
135 static sd_id128_t arg_uuid = {};
136 static char *arg_machine = NULL;
137 static const char *arg_selinux_context = NULL;
138 static const char *arg_selinux_apifs_context = NULL;
139 static const char *arg_slice = NULL;
140 static bool arg_private_network = false;
141 static bool arg_read_only = false;
142 static bool arg_boot = false;
143 static bool arg_ephemeral = false;
144 static LinkJournal arg_link_journal = LINK_AUTO;
145 static bool arg_link_journal_try = false;
146 static uint64_t arg_retain =
147         (1ULL << CAP_CHOWN) |
148         (1ULL << CAP_DAC_OVERRIDE) |
149         (1ULL << CAP_DAC_READ_SEARCH) |
150         (1ULL << CAP_FOWNER) |
151         (1ULL << CAP_FSETID) |
152         (1ULL << CAP_IPC_OWNER) |
153         (1ULL << CAP_KILL) |
154         (1ULL << CAP_LEASE) |
155         (1ULL << CAP_LINUX_IMMUTABLE) |
156         (1ULL << CAP_NET_BIND_SERVICE) |
157         (1ULL << CAP_NET_BROADCAST) |
158         (1ULL << CAP_NET_RAW) |
159         (1ULL << CAP_SETGID) |
160         (1ULL << CAP_SETFCAP) |
161         (1ULL << CAP_SETPCAP) |
162         (1ULL << CAP_SETUID) |
163         (1ULL << CAP_SYS_ADMIN) |
164         (1ULL << CAP_SYS_CHROOT) |
165         (1ULL << CAP_SYS_NICE) |
166         (1ULL << CAP_SYS_PTRACE) |
167         (1ULL << CAP_SYS_TTY_CONFIG) |
168         (1ULL << CAP_SYS_RESOURCE) |
169         (1ULL << CAP_SYS_BOOT) |
170         (1ULL << CAP_AUDIT_WRITE) |
171         (1ULL << CAP_AUDIT_CONTROL) |
172         (1ULL << CAP_MKNOD);
173 static char **arg_bind = NULL;
174 static char **arg_bind_ro = NULL;
175 static char **arg_tmpfs = NULL;
176 static char **arg_setenv = NULL;
177 static bool arg_quiet = false;
178 static bool arg_share_system = false;
179 static bool arg_register = true;
180 static bool arg_keep_unit = false;
181 static char **arg_network_interfaces = NULL;
182 static char **arg_network_macvlan = NULL;
183 static char **arg_network_ipvlan = NULL;
184 static bool arg_network_veth = false;
185 static const char *arg_network_bridge = NULL;
186 static unsigned long arg_personality = 0xffffffffLU;
187 static char *arg_image = NULL;
188 static Volatile arg_volatile = VOLATILE_NO;
189 static ExposePort *arg_expose_ports = NULL;
190 static char **arg_property = NULL;
191 static uid_t arg_uid_shift = UID_INVALID, arg_uid_range = 0x10000U;
192 static bool arg_userns = false;
193
/* Prints the usage summary to stdout. The only substitution is the program
 * name, taken from program_invocation_short_name.
 *
 * Keep this text in sync with the options[] table and the switch statement
 * in parse_argv(): every switch accepted there should be listed here with
 * the same argument syntax the parser accepts. */
static void help(void) {
        printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
               "Spawn a minimal namespace container for debugging, testing and building.\n\n"
               "  -h --help                 Show this help\n"
               "     --version              Print version string\n"
               "  -q --quiet                Do not show status information\n"
               "  -D --directory=PATH       Root directory for the container\n"
               "     --template=PATH        Initialize root directory from template directory,\n"
               "                            if missing\n"
               "  -x --ephemeral            Run container with snapshot of root directory, and\n"
               "                            remove it after exit\n"
               "  -i --image=PATH           File system device or disk image for the container\n"
               "  -b --boot                 Boot up full system (i.e. invoke init)\n"
               "  -u --user=USER            Run the command under specified user or uid\n"
               "  -M --machine=NAME         Set the machine name for the container\n"
               "     --uuid=UUID            Set a specific machine UUID for the container\n"
               "  -S --slice=SLICE          Place the container in the specified slice\n"
               "     --property=NAME=VALUE  Set scope unit property\n"
               "     --private-network      Disable network in container\n"
               "     --network-interface=INTERFACE\n"
               "                            Assign an existing network interface to the\n"
               "                            container\n"
               "     --network-macvlan=INTERFACE\n"
               "                            Create a macvlan network interface based on an\n"
               "                            existing network interface to the container\n"
               "     --network-ipvlan=INTERFACE\n"
               "                            Create an ipvlan network interface based on an\n"
               "                            existing network interface to the container\n"
               "  -n --network-veth         Add a virtual ethernet connection between host\n"
               "                            and container\n"
               "     --network-bridge=INTERFACE\n"
               "                            Add a virtual ethernet connection between host\n"
               "                            and container and add it to an existing bridge on\n"
               "                            the host\n"
               "     --private-users[=UIDBASE[:NUIDS]]\n"
               "                            Run within user namespace\n"
               "  -p --port=[PROTOCOL:]HOSTPORT[:CONTAINERPORT]\n"
               "                            Expose a container IP port on the host\n"
               "  -Z --selinux-context=SECLABEL\n"
               "                            Set the SELinux security context to be used by\n"
               "                            processes in the container\n"
               "  -L --selinux-apifs-context=SECLABEL\n"
               "                            Set the SELinux security context to be used by\n"
               "                            API/tmpfs file systems in the container\n"
               "     --capability=CAP       In addition to the default, retain specified\n"
               "                            capability\n"
               "     --drop-capability=CAP  Drop the specified capability from the default set\n"
               "     --link-journal=MODE    Link up guest journal, one of no, auto, guest, host,\n"
               "                            try-guest, try-host\n"
               "  -j                        Equivalent to --link-journal=try-guest\n"
               "     --read-only            Mount the root directory read-only\n"
               "     --bind=PATH[:PATH]     Bind mount a file or directory from the host into\n"
               "                            the container\n"
               "     --bind-ro=PATH[:PATH]  Similar, but creates a read-only bind mount\n"
               /* The colon is optional: without it the parser falls back to
                * its built-in default mount options. */
               "     --tmpfs=PATH[:OPTIONS] Mount an empty tmpfs to the specified directory\n"
               "     --setenv=NAME=VALUE    Pass an environment variable to PID 1\n"
               "     --share-system         Share system namespaces with host\n"
               "     --register=BOOLEAN     Register container as machine\n"
               "     --keep-unit            Do not register a scope for the machine, reuse\n"
               "                            the service unit nspawn is running in\n"
               /* Accepted by parse_argv() (ARG_PERSONALITY), so list it too */
               "     --personality=ARCH     Pick personality for this container\n"
               "     --volatile[=MODE]      Run the system in volatile mode\n"
               , program_invocation_short_name);
}
257
/* Replaces *b with a cleaned-up, absolute version of 'path'.
 *
 * The path is first resolved with canonicalize_file_name(). If that fails
 * because the path does not exist (ENOENT), it is instead made absolute
 * via path_make_absolute_cwd(). The resulting string is passed through
 * path_kill_slashes() and stored in *b; the previous value of *b is freed.
 *
 * Returns 0 on success, -errno if canonicalization failed for any reason
 * other than ENOENT, and -ENOMEM if the fallback returned NULL. On failure
 * *b is left untouched. */
static int set_sanitized_path(char **b, const char *path) {
        char *resolved;

        assert(b);
        assert(path);

        resolved = canonicalize_file_name(path);

        /* Any failure other than "does not exist" is reported as is */
        if (resolved == NULL && errno != ENOENT)
                return -errno;

        /* Missing paths are still accepted, just without symlink resolution */
        if (resolved == NULL) {
                resolved = path_make_absolute_cwd(path);
                if (resolved == NULL)
                        return -ENOMEM;
        }

        free(*b);
        *b = path_kill_slashes(resolved);

        return 0;
}
278
279 static int parse_argv(int argc, char *argv[]) {
280
281         enum {
282                 ARG_VERSION = 0x100,
283                 ARG_PRIVATE_NETWORK,
284                 ARG_UUID,
285                 ARG_READ_ONLY,
286                 ARG_CAPABILITY,
287                 ARG_DROP_CAPABILITY,
288                 ARG_LINK_JOURNAL,
289                 ARG_BIND,
290                 ARG_BIND_RO,
291                 ARG_TMPFS,
292                 ARG_SETENV,
293                 ARG_SHARE_SYSTEM,
294                 ARG_REGISTER,
295                 ARG_KEEP_UNIT,
296                 ARG_NETWORK_INTERFACE,
297                 ARG_NETWORK_MACVLAN,
298                 ARG_NETWORK_IPVLAN,
299                 ARG_NETWORK_BRIDGE,
300                 ARG_PERSONALITY,
301                 ARG_VOLATILE,
302                 ARG_TEMPLATE,
303                 ARG_PROPERTY,
304                 ARG_PRIVATE_USERS,
305         };
306
307         static const struct option options[] = {
308                 { "help",                  no_argument,       NULL, 'h'                   },
309                 { "version",               no_argument,       NULL, ARG_VERSION           },
310                 { "directory",             required_argument, NULL, 'D'                   },
311                 { "template",              required_argument, NULL, ARG_TEMPLATE          },
312                 { "ephemeral",             no_argument,       NULL, 'x'                   },
313                 { "user",                  required_argument, NULL, 'u'                   },
314                 { "private-network",       no_argument,       NULL, ARG_PRIVATE_NETWORK   },
315                 { "boot",                  no_argument,       NULL, 'b'                   },
316                 { "uuid",                  required_argument, NULL, ARG_UUID              },
317                 { "read-only",             no_argument,       NULL, ARG_READ_ONLY         },
318                 { "capability",            required_argument, NULL, ARG_CAPABILITY        },
319                 { "drop-capability",       required_argument, NULL, ARG_DROP_CAPABILITY   },
320                 { "link-journal",          required_argument, NULL, ARG_LINK_JOURNAL      },
321                 { "bind",                  required_argument, NULL, ARG_BIND              },
322                 { "bind-ro",               required_argument, NULL, ARG_BIND_RO           },
323                 { "tmpfs",                 required_argument, NULL, ARG_TMPFS             },
324                 { "machine",               required_argument, NULL, 'M'                   },
325                 { "slice",                 required_argument, NULL, 'S'                   },
326                 { "setenv",                required_argument, NULL, ARG_SETENV            },
327                 { "selinux-context",       required_argument, NULL, 'Z'                   },
328                 { "selinux-apifs-context", required_argument, NULL, 'L'                   },
329                 { "quiet",                 no_argument,       NULL, 'q'                   },
330                 { "share-system",          no_argument,       NULL, ARG_SHARE_SYSTEM      },
331                 { "register",              required_argument, NULL, ARG_REGISTER          },
332                 { "keep-unit",             no_argument,       NULL, ARG_KEEP_UNIT         },
333                 { "network-interface",     required_argument, NULL, ARG_NETWORK_INTERFACE },
334                 { "network-macvlan",       required_argument, NULL, ARG_NETWORK_MACVLAN   },
335                 { "network-ipvlan",        required_argument, NULL, ARG_NETWORK_IPVLAN    },
336                 { "network-veth",          no_argument,       NULL, 'n'                   },
337                 { "network-bridge",        required_argument, NULL, ARG_NETWORK_BRIDGE    },
338                 { "personality",           required_argument, NULL, ARG_PERSONALITY       },
339                 { "image",                 required_argument, NULL, 'i'                   },
340                 { "volatile",              optional_argument, NULL, ARG_VOLATILE          },
341                 { "port",                  required_argument, NULL, 'p'                   },
342                 { "property",              required_argument, NULL, ARG_PROPERTY          },
343                 { "private-users",         optional_argument, NULL, ARG_PRIVATE_USERS     },
344                 {}
345         };
346
347         int c, r;
348         uint64_t plus = 0, minus = 0;
349
350         assert(argc >= 0);
351         assert(argv);
352
353         while ((c = getopt_long(argc, argv, "+hD:u:bL:M:jS:Z:qi:xp:n", options, NULL)) >= 0)
354
355                 switch (c) {
356
357                 case 'h':
358                         help();
359                         return 0;
360
361                 case ARG_VERSION:
362                         puts(PACKAGE_STRING);
363                         puts(SYSTEMD_FEATURES);
364                         return 0;
365
366                 case 'D':
367                         r = set_sanitized_path(&arg_directory, optarg);
368                         if (r < 0)
369                                 return log_error_errno(r, "Invalid root directory: %m");
370
371                         break;
372
373                 case ARG_TEMPLATE:
374                         r = set_sanitized_path(&arg_template, optarg);
375                         if (r < 0)
376                                 return log_error_errno(r, "Invalid template directory: %m");
377
378                         break;
379
380                 case 'i':
381                         r = set_sanitized_path(&arg_image, optarg);
382                         if (r < 0)
383                                 return log_error_errno(r, "Invalid image path: %m");
384
385                         break;
386
387                 case 'x':
388                         arg_ephemeral = true;
389                         break;
390
391                 case 'u':
392                         free(arg_user);
393                         arg_user = strdup(optarg);
394                         if (!arg_user)
395                                 return log_oom();
396
397                         break;
398
399                 case ARG_NETWORK_BRIDGE:
400                         arg_network_bridge = optarg;
401
402                         /* fall through */
403
404                 case 'n':
405                         arg_network_veth = true;
406                         arg_private_network = true;
407                         break;
408
409                 case ARG_NETWORK_INTERFACE:
410                         if (strv_extend(&arg_network_interfaces, optarg) < 0)
411                                 return log_oom();
412
413                         arg_private_network = true;
414                         break;
415
416                 case ARG_NETWORK_MACVLAN:
417                         if (strv_extend(&arg_network_macvlan, optarg) < 0)
418                                 return log_oom();
419
420                         arg_private_network = true;
421                         break;
422
423                 case ARG_NETWORK_IPVLAN:
424                         if (strv_extend(&arg_network_ipvlan, optarg) < 0)
425                                 return log_oom();
426
427                         /* fall through */
428
429                 case ARG_PRIVATE_NETWORK:
430                         arg_private_network = true;
431                         break;
432
433                 case 'b':
434                         arg_boot = true;
435                         break;
436
437                 case ARG_UUID:
438                         r = sd_id128_from_string(optarg, &arg_uuid);
439                         if (r < 0) {
440                                 log_error("Invalid UUID: %s", optarg);
441                                 return r;
442                         }
443                         break;
444
445                 case 'S':
446                         arg_slice = optarg;
447                         break;
448
449                 case 'M':
450                         if (isempty(optarg)) {
451                                 free(arg_machine);
452                                 arg_machine = NULL;
453                         } else {
454                                 if (!machine_name_is_valid(optarg)) {
455                                         log_error("Invalid machine name: %s", optarg);
456                                         return -EINVAL;
457                                 }
458
459                                 r = free_and_strdup(&arg_machine, optarg);
460                                 if (r < 0)
461                                         return log_oom();
462
463                                 break;
464                         }
465
466                 case 'Z':
467                         arg_selinux_context = optarg;
468                         break;
469
470                 case 'L':
471                         arg_selinux_apifs_context = optarg;
472                         break;
473
474                 case ARG_READ_ONLY:
475                         arg_read_only = true;
476                         break;
477
478                 case ARG_CAPABILITY:
479                 case ARG_DROP_CAPABILITY: {
480                         const char *state, *word;
481                         size_t length;
482
483                         FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
484                                 _cleanup_free_ char *t;
485
486                                 t = strndup(word, length);
487                                 if (!t)
488                                         return log_oom();
489
490                                 if (streq(t, "all")) {
491                                         if (c == ARG_CAPABILITY)
492                                                 plus = (uint64_t) -1;
493                                         else
494                                                 minus = (uint64_t) -1;
495                                 } else {
496                                         int cap;
497
498                                         cap = capability_from_name(t);
499                                         if (cap < 0) {
500                                                 log_error("Failed to parse capability %s.", t);
501                                                 return -EINVAL;
502                                         }
503
504                                         if (c == ARG_CAPABILITY)
505                                                 plus |= 1ULL << (uint64_t) cap;
506                                         else
507                                                 minus |= 1ULL << (uint64_t) cap;
508                                 }
509                         }
510
511                         break;
512                 }
513
514                 case 'j':
515                         arg_link_journal = LINK_GUEST;
516                         arg_link_journal_try = true;
517                         break;
518
519                 case ARG_LINK_JOURNAL:
520                         if (streq(optarg, "auto")) {
521                                 arg_link_journal = LINK_AUTO;
522                                 arg_link_journal_try = false;
523                         } else if (streq(optarg, "no")) {
524                                 arg_link_journal = LINK_NO;
525                                 arg_link_journal_try = false;
526                         } else if (streq(optarg, "guest")) {
527                                 arg_link_journal = LINK_GUEST;
528                                 arg_link_journal_try = false;
529                         } else if (streq(optarg, "host")) {
530                                 arg_link_journal = LINK_HOST;
531                                 arg_link_journal_try = false;
532                         } else if (streq(optarg, "try-guest")) {
533                                 arg_link_journal = LINK_GUEST;
534                                 arg_link_journal_try = true;
535                         } else if (streq(optarg, "try-host")) {
536                                 arg_link_journal = LINK_HOST;
537                                 arg_link_journal_try = true;
538                         } else {
539                                 log_error("Failed to parse link journal mode %s", optarg);
540                                 return -EINVAL;
541                         }
542
543                         break;
544
545                 case ARG_BIND:
546                 case ARG_BIND_RO: {
547                         _cleanup_free_ char *a = NULL, *b = NULL;
548                         char *e;
549                         char ***x;
550
551                         x = c == ARG_BIND ? &arg_bind : &arg_bind_ro;
552
553                         e = strchr(optarg, ':');
554                         if (e) {
555                                 a = strndup(optarg, e - optarg);
556                                 b = strdup(e + 1);
557                         } else {
558                                 a = strdup(optarg);
559                                 b = strdup(optarg);
560                         }
561
562                         if (!a || !b)
563                                 return log_oom();
564
565                         if (!path_is_absolute(a) || !path_is_absolute(b)) {
566                                 log_error("Invalid bind mount specification: %s", optarg);
567                                 return -EINVAL;
568                         }
569
570                         r = strv_extend(x, a);
571                         if (r < 0)
572                                 return log_oom();
573
574                         r = strv_extend(x, b);
575                         if (r < 0)
576                                 return log_oom();
577
578                         break;
579                 }
580
581                 case ARG_TMPFS: {
582                         _cleanup_free_ char *a = NULL, *b = NULL;
583                         char *e;
584
585                         e = strchr(optarg, ':');
586                         if (e) {
587                                 a = strndup(optarg, e - optarg);
588                                 b = strdup(e + 1);
589                         } else {
590                                 a = strdup(optarg);
591                                 b = strdup("mode=0755");
592                         }
593
594                         if (!a || !b)
595                                 return log_oom();
596
597                         if (!path_is_absolute(a)) {
598                                 log_error("Invalid tmpfs specification: %s", optarg);
599                                 return -EINVAL;
600                         }
601
602                         r = strv_push(&arg_tmpfs, a);
603                         if (r < 0)
604                                 return log_oom();
605
606                         a = NULL;
607
608                         r = strv_push(&arg_tmpfs, b);
609                         if (r < 0)
610                                 return log_oom();
611
612                         b = NULL;
613
614                         break;
615                 }
616
617                 case ARG_SETENV: {
618                         char **n;
619
620                         if (!env_assignment_is_valid(optarg)) {
621                                 log_error("Environment variable assignment '%s' is not valid.", optarg);
622                                 return -EINVAL;
623                         }
624
625                         n = strv_env_set(arg_setenv, optarg);
626                         if (!n)
627                                 return log_oom();
628
629                         strv_free(arg_setenv);
630                         arg_setenv = n;
631                         break;
632                 }
633
634                 case 'q':
635                         arg_quiet = true;
636                         break;
637
638                 case ARG_SHARE_SYSTEM:
639                         arg_share_system = true;
640                         break;
641
642                 case ARG_REGISTER:
643                         r = parse_boolean(optarg);
644                         if (r < 0) {
645                                 log_error("Failed to parse --register= argument: %s", optarg);
646                                 return r;
647                         }
648
649                         arg_register = r;
650                         break;
651
652                 case ARG_KEEP_UNIT:
653                         arg_keep_unit = true;
654                         break;
655
656                 case ARG_PERSONALITY:
657
658                         arg_personality = personality_from_string(optarg);
659                         if (arg_personality == 0xffffffffLU) {
660                                 log_error("Unknown or unsupported personality '%s'.", optarg);
661                                 return -EINVAL;
662                         }
663
664                         break;
665
666                 case ARG_VOLATILE:
667
668                         if (!optarg)
669                                 arg_volatile = VOLATILE_YES;
670                         else {
671                                 r = parse_boolean(optarg);
672                                 if (r < 0) {
673                                         if (streq(optarg, "state"))
674                                                 arg_volatile = VOLATILE_STATE;
675                                         else {
676                                                 log_error("Failed to parse --volatile= argument: %s", optarg);
677                                                 return r;
678                                         }
679                                 } else
680                                         arg_volatile = r ? VOLATILE_YES : VOLATILE_NO;
681                         }
682
683                         break;
684
685                 case 'p': {
686                         const char *split, *e;
687                         uint16_t container_port, host_port;
688                         int protocol;
689                         ExposePort *p;
690
691                         if ((e = startswith(optarg, "tcp:")))
692                                 protocol = IPPROTO_TCP;
693                         else if ((e = startswith(optarg, "udp:")))
694                                 protocol = IPPROTO_UDP;
695                         else {
696                                 e = optarg;
697                                 protocol = IPPROTO_TCP;
698                         }
699
700                         split = strchr(e, ':');
701                         if (split) {
702                                 char v[split - e + 1];
703
704                                 memcpy(v, e, split - e);
705                                 v[split - e] = 0;
706
707                                 r = safe_atou16(v, &host_port);
708                                 if (r < 0 || host_port <= 0) {
709                                         log_error("Failed to parse host port: %s", optarg);
710                                         return -EINVAL;
711                                 }
712
713                                 r = safe_atou16(split + 1, &container_port);
714                         } else {
715                                 r = safe_atou16(e, &container_port);
716                                 host_port = container_port;
717                         }
718
719                         if (r < 0 || container_port <= 0) {
720                                 log_error("Failed to parse host port: %s", optarg);
721                                 return -EINVAL;
722                         }
723
724                         LIST_FOREACH(ports, p, arg_expose_ports) {
725                                 if (p->protocol == protocol && p->host_port == host_port) {
726                                         log_error("Duplicate port specification: %s", optarg);
727                                         return -EINVAL;
728                                 }
729                         }
730
731                         p = new(ExposePort, 1);
732                         if (!p)
733                                 return log_oom();
734
735                         p->protocol = protocol;
736                         p->host_port = host_port;
737                         p->container_port = container_port;
738
739                         LIST_PREPEND(ports, arg_expose_ports, p);
740
741                         break;
742                 }
743
744                 case ARG_PROPERTY:
745                         if (strv_extend(&arg_property, optarg) < 0)
746                                 return log_oom();
747
748                         break;
749
750                 case ARG_PRIVATE_USERS:
751                         if (optarg) {
752                                 _cleanup_free_ char *buffer = NULL;
753                                 const char *range, *shift;
754
755                                 range = strchr(optarg, ':');
756                                 if (range) {
757                                         buffer = strndup(optarg, range - optarg);
758                                         if (!buffer)
759                                                 return log_oom();
760                                         shift = buffer;
761
762                                         range++;
763                                         if (safe_atou32(range, &arg_uid_range) < 0 || arg_uid_range <= 0) {
764                                                 log_error("Failed to parse UID range: %s", range);
765                                                 return -EINVAL;
766                                         }
767                                 } else
768                                         shift = optarg;
769
770                                 if (parse_uid(shift, &arg_uid_shift) < 0) {
771                                         log_error("Failed to parse UID: %s", optarg);
772                                         return -EINVAL;
773                                 }
774                         }
775
776                         arg_userns = true;
777                         break;
778
779                 case '?':
780                         return -EINVAL;
781
782                 default:
783                         assert_not_reached("Unhandled option");
784                 }
785
786         if (arg_share_system)
787                 arg_register = false;
788
789         if (arg_boot && arg_share_system) {
790                 log_error("--boot and --share-system may not be combined.");
791                 return -EINVAL;
792         }
793
794         if (arg_keep_unit && cg_pid_get_owner_uid(0, NULL) >= 0) {
795                 log_error("--keep-unit may not be used when invoked from a user session.");
796                 return -EINVAL;
797         }
798
799         if (arg_directory && arg_image) {
800                 log_error("--directory= and --image= may not be combined.");
801                 return -EINVAL;
802         }
803
804         if (arg_template && arg_image) {
805                 log_error("--template= and --image= may not be combined.");
806                 return -EINVAL;
807         }
808
809         if (arg_template && !(arg_directory || arg_machine)) {
810                 log_error("--template= needs --directory= or --machine=.");
811                 return -EINVAL;
812         }
813
814         if (arg_ephemeral && arg_template) {
815                 log_error("--ephemeral and --template= may not be combined.");
816                 return -EINVAL;
817         }
818
819         if (arg_ephemeral && arg_image) {
820                 log_error("--ephemeral and --image= may not be combined.");
821                 return -EINVAL;
822         }
823
824         if (arg_ephemeral && !IN_SET(arg_link_journal, LINK_NO, LINK_AUTO)) {
825                 log_error("--ephemeral and --link-journal= may not be combined.");
826                 return -EINVAL;
827         }
828
829         if (arg_volatile != VOLATILE_NO && arg_read_only) {
830                 log_error("Cannot combine --read-only with --volatile. Note that --volatile already implies a read-only base hierarchy.");
831                 return -EINVAL;
832         }
833
834         if (arg_expose_ports && !arg_private_network) {
835                 log_error("Cannot use --port= without private networking.");
836                 return -EINVAL;
837         }
838
839         arg_retain = (arg_retain | plus | (arg_private_network ? 1ULL << CAP_NET_ADMIN : 0)) & ~minus;
840
841         return 1;
842 }
843
844 static int mount_all(const char *dest) {
845
846         typedef struct MountPoint {
847                 const char *what;
848                 const char *where;
849                 const char *type;
850                 const char *options;
851                 unsigned long flags;
852                 bool fatal;
853         } MountPoint;
854
855         static const MountPoint mount_table[] = {
856                 { "proc",      "/proc",     "proc",  NULL,        MS_NOSUID|MS_NOEXEC|MS_NODEV,           true  },
857                 { "/proc/sys", "/proc/sys", NULL,    NULL,        MS_BIND,                                true  },   /* Bind mount first */
858                 { NULL,        "/proc/sys", NULL,    NULL,        MS_BIND|MS_RDONLY|MS_REMOUNT,           true  },   /* Then, make it r/o */
859                 { "sysfs",     "/sys",      "sysfs", NULL,        MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true  },
860                 { "tmpfs",     "/dev",      "tmpfs", "mode=755",  MS_NOSUID|MS_STRICTATIME,               true  },
861                 { "devpts",    "/dev/pts",  "devpts","newinstance,ptmxmode=0666,mode=620,gid=" STRINGIFY(TTY_GID), MS_NOSUID|MS_NOEXEC, true },
862                 { "tmpfs",     "/dev/shm",  "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME,      true  },
863                 { "tmpfs",     "/run",      "tmpfs", "mode=755",  MS_NOSUID|MS_NODEV|MS_STRICTATIME,      true  },
864                 { "tmpfs",     "/tmp",      "tmpfs", "mode=1777", MS_STRICTATIME,                         true  },
865 #ifdef HAVE_SELINUX
866                 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND,                              false },  /* Bind mount first */
867                 { NULL,              "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT,         false },  /* Then, make it r/o */
868 #endif
869         };
870
871         unsigned k;
872         int r = 0;
873
874         for (k = 0; k < ELEMENTSOF(mount_table); k++) {
875                 _cleanup_free_ char *where = NULL, *options = NULL;
876                 const char *o;
877                 int t;
878
879                 where = strjoin(dest, "/", mount_table[k].where, NULL);
880                 if (!where)
881                         return log_oom();
882
883                 t = path_is_mount_point(where, true);
884                 if (t < 0) {
885                         log_error_errno(t, "Failed to detect whether %s is a mount point: %m", where);
886
887                         if (r == 0)
888                                 r = t;
889
890                         continue;
891                 }
892
893                 /* Skip this entry if it is not a remount. */
894                 if (mount_table[k].what && t > 0)
895                         continue;
896
897                 t = mkdir_p(where, 0755);
898                 if (t < 0) {
899                         if (mount_table[k].fatal) {
900                                log_error_errno(t, "Failed to create directory %s: %m", where);
901
902                                 if (r == 0)
903                                         r = t;
904                         } else
905                                log_warning_errno(t, "Failed to create directory %s: %m", where);
906
907                         continue;
908                 }
909
910 #ifdef HAVE_SELINUX
911                 if (arg_selinux_apifs_context &&
912                     (streq_ptr(mount_table[k].what, "tmpfs") || streq_ptr(mount_table[k].what, "devpts"))) {
913                         options = strjoin(mount_table[k].options, ",context=\"", arg_selinux_apifs_context, "\"", NULL);
914                         if (!options)
915                                 return log_oom();
916
917                         o = options;
918                 } else
919 #endif
920                         o = mount_table[k].options;
921
922                 if (arg_userns && arg_uid_shift != UID_INVALID && streq_ptr(mount_table[k].type, "tmpfs")) {
923                         char *uid_options = NULL;
924
925                         if (o)
926                                 asprintf(&uid_options, "%s,uid=" UID_FMT ",gid=" UID_FMT, o, arg_uid_shift, arg_uid_shift);
927                         else
928                                 asprintf(&uid_options, "uid=" UID_FMT ",gid=" UID_FMT, arg_uid_shift, arg_uid_shift);
929                         if (!uid_options)
930                                 return log_oom();
931
932                         free(options);
933                         o = options = uid_options;
934                 }
935
936                 if (mount(mount_table[k].what,
937                           where,
938                           mount_table[k].type,
939                           mount_table[k].flags,
940                           o) < 0) {
941
942                         if (mount_table[k].fatal) {
943                                 log_error_errno(errno, "mount(%s) failed: %m", where);
944
945                                 if (r == 0)
946                                         r = -errno;
947                         } else
948                                 log_warning_errno(errno, "mount(%s) failed: %m", where);
949                 }
950         }
951
952         return r;
953 }
954
955 static int mount_binds(const char *dest, char **l, bool ro) {
956         char **x, **y;
957
958         STRV_FOREACH_PAIR(x, y, l) {
959                 _cleanup_free_ char *where = NULL;
960                 struct stat source_st, dest_st;
961                 int r;
962
963                 if (stat(*x, &source_st) < 0)
964                         return log_error_errno(errno, "Failed to stat %s: %m", *x);
965
966                 where = strappend(dest, *y);
967                 if (!where)
968                         return log_oom();
969
970                 r = stat(where, &dest_st);
971                 if (r == 0) {
972                         if (S_ISDIR(source_st.st_mode) && !S_ISDIR(dest_st.st_mode)) {
973                                 log_error("Cannot bind mount directory %s on file %s.", *x, where);
974                                 return -EINVAL;
975                         }
976                         if (!S_ISDIR(source_st.st_mode) && S_ISDIR(dest_st.st_mode)) {
977                                 log_error("Cannot bind mount file %s on directory %s.", *x, where);
978                                 return -EINVAL;
979                         }
980                 } else if (errno == ENOENT) {
981                         r = mkdir_parents_label(where, 0755);
982                         if (r < 0)
983                                 return log_error_errno(r, "Failed to bind mount %s: %m", *x);
984                 } else {
985                         log_error_errno(errno, "Failed to bind mount %s: %m", *x);
986                         return -errno;
987                 }
988
989                 /* Create the mount point. Any non-directory file can be
990                  * mounted on any non-directory file (regular, fifo, socket,
991                  * char, block).
992                  */
993                 if (S_ISDIR(source_st.st_mode)) {
994                         r = mkdir_label(where, 0755);
995                         if (r < 0 && errno != EEXIST)
996                                 return log_error_errno(r, "Failed to create mount point %s: %m", where);
997                 } else {
998                         r = touch(where);
999                         if (r < 0)
1000                                 return log_error_errno(r, "Failed to create mount point %s: %m", where);
1001                 }
1002
1003                 if (mount(*x, where, "bind", MS_BIND, NULL) < 0)
1004                         return log_error_errno(errno, "mount(%s) failed: %m", where);
1005
1006                 if (ro) {
1007                         r = bind_remount_recursive(where, true);
1008                         if (r < 0)
1009                                 return log_error_errno(r, "Read-Only bind mount failed: %m");
1010                 }
1011         }
1012
1013         return 0;
1014 }
1015
1016 static int mount_cgroup_hierarchy(const char *dest, const char *controller, const char *hierarchy, bool read_only) {
1017         char *to;
1018         int r;
1019
1020         to = strjoina(dest, "/sys/fs/cgroup/", hierarchy);
1021
1022         r = path_is_mount_point(to, false);
1023         if (r < 0)
1024                 return log_error_errno(r, "Failed to determine if %s is mounted already: %m", to);
1025         if (r > 0)
1026                 return 0;
1027
1028         mkdir_p(to, 0755);
1029
1030         /* The superblock mount options of the mount point need to be
1031          * identical to the hosts', and hence writable... */
1032         if (mount("cgroup", to, "cgroup", MS_NOSUID|MS_NOEXEC|MS_NODEV, controller) < 0)
1033                 return log_error_errno(errno, "Failed to mount to %s: %m", to);
1034
1035         /* ... hence let's only make the bind mount read-only, not the
1036          * superblock. */
1037         if (read_only) {
1038                 if (mount(NULL, to, NULL, MS_BIND|MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_RDONLY, NULL) < 0)
1039                         return log_error_errno(errno, "Failed to remount %s read-only: %m", to);
1040         }
1041         return 1;
1042 }
1043
1044 static int mount_cgroup(const char *dest) {
1045         _cleanup_set_free_free_ Set *controllers = NULL;
1046         _cleanup_free_ char *own_cgroup_path = NULL;
1047         const char *cgroup_root, *systemd_root, *systemd_own;
1048         int r;
1049
1050         controllers = set_new(&string_hash_ops);
1051         if (!controllers)
1052                 return log_oom();
1053
1054         r = cg_kernel_controllers(controllers);
1055         if (r < 0)
1056                 return log_error_errno(r, "Failed to determine cgroup controllers: %m");
1057
1058         r = cg_pid_get_path(NULL, 0, &own_cgroup_path);
1059         if (r < 0)
1060                 return log_error_errno(r, "Failed to determine our own cgroup path: %m");
1061
1062         cgroup_root = strjoina(dest, "/sys/fs/cgroup");
1063         if (mount("tmpfs", cgroup_root, "tmpfs", MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME, "mode=755") < 0)
1064                 return log_error_errno(errno, "Failed to mount tmpfs to /sys/fs/cgroup: %m");
1065
1066         for (;;) {
1067                 _cleanup_free_ char *controller = NULL, *origin = NULL, *combined = NULL;
1068
1069                 controller = set_steal_first(controllers);
1070                 if (!controller)
1071                         break;
1072
1073                 origin = strappend("/sys/fs/cgroup/", controller);
1074                 if (!origin)
1075                         return log_oom();
1076
1077                 r = readlink_malloc(origin, &combined);
1078                 if (r == -EINVAL) {
1079                         /* Not a symbolic link, but directly a single cgroup hierarchy */
1080
1081                         r = mount_cgroup_hierarchy(dest, controller, controller, true);
1082                         if (r < 0)
1083                                 return r;
1084
1085                 } else if (r < 0)
1086                         return log_error_errno(r, "Failed to read link %s: %m", origin);
1087                 else {
1088                         _cleanup_free_ char *target = NULL;
1089
1090                         target = strjoin(dest, "/sys/fs/cgroup/", controller, NULL);
1091                         if (!target)
1092                                 return log_oom();
1093
1094                         /* A symbolic link, a combination of controllers in one hierarchy */
1095
1096                         if (!filename_is_valid(combined)) {
1097                                 log_warning("Ignoring invalid combined hierarchy %s.", combined);
1098                                 continue;
1099                         }
1100
1101                         r = mount_cgroup_hierarchy(dest, combined, combined, true);
1102                         if (r < 0)
1103                                 return r;
1104
1105                         if (symlink(combined, target) < 0)
1106                                 return log_error_errno(errno, "Failed to create symlink for combined hierarchy: %m");
1107                 }
1108         }
1109
1110         r = mount_cgroup_hierarchy(dest, "name=systemd,xattr", "systemd", false);
1111         if (r < 0)
1112                 return r;
1113
1114         /* Make our own cgroup a (writable) bind mount */
1115         systemd_own = strjoina(dest, "/sys/fs/cgroup/systemd", own_cgroup_path);
1116         if (mount(systemd_own, systemd_own,  NULL, MS_BIND, NULL) < 0)
1117                 return log_error_errno(errno, "Failed to turn %s into a bind mount: %m", own_cgroup_path);
1118
1119         /* And then remount the systemd cgroup root read-only */
1120         systemd_root = strjoina(dest, "/sys/fs/cgroup/systemd");
1121         if (mount(NULL, systemd_root, NULL, MS_BIND|MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_RDONLY, NULL) < 0)
1122                 return log_error_errno(errno, "Failed to mount cgroup root read-only: %m");
1123
1124         if (mount(NULL, cgroup_root, NULL, MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME|MS_RDONLY, "mode=755") < 0)
1125                 return log_error_errno(errno, "Failed to remount %s read-only: %m", cgroup_root);
1126
1127         return 0;
1128 }
1129
1130 static int mount_tmpfs(const char *dest) {
1131         char **i, **o;
1132
1133         STRV_FOREACH_PAIR(i, o, arg_tmpfs) {
1134                 _cleanup_free_ char *where = NULL;
1135                 int r;
1136
1137                 where = strappend(dest, *i);
1138                 if (!where)
1139                         return log_oom();
1140
1141                 r = mkdir_label(where, 0755);
1142                 if (r < 0 && r != -EEXIST)
1143                         return log_error_errno(r, "Creating mount point for tmpfs %s failed: %m", where);
1144
1145                 if (mount("tmpfs", where, "tmpfs", MS_NODEV|MS_STRICTATIME, *o) < 0)
1146                         return log_error_errno(errno, "tmpfs mount to %s failed: %m", where);
1147         }
1148
1149         return 0;
1150 }
1151
1152 static int setup_timezone(const char *dest) {
1153         _cleanup_free_ char *where = NULL, *p = NULL, *q = NULL, *check = NULL, *what = NULL;
1154         char *z, *y;
1155         int r;
1156
1157         assert(dest);
1158
1159         /* Fix the timezone, if possible */
1160         r = readlink_malloc("/etc/localtime", &p);
1161         if (r < 0) {
1162                 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
1163                 return 0;
1164         }
1165
1166         z = path_startswith(p, "../usr/share/zoneinfo/");
1167         if (!z)
1168                 z = path_startswith(p, "/usr/share/zoneinfo/");
1169         if (!z) {
1170                 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
1171                 return 0;
1172         }
1173
1174         where = strappend(dest, "/etc/localtime");
1175         if (!where)
1176                 return log_oom();
1177
1178         r = readlink_malloc(where, &q);
1179         if (r >= 0) {
1180                 y = path_startswith(q, "../usr/share/zoneinfo/");
1181                 if (!y)
1182                         y = path_startswith(q, "/usr/share/zoneinfo/");
1183
1184                 /* Already pointing to the right place? Then do nothing .. */
1185                 if (y && streq(y, z))
1186                         return 0;
1187         }
1188
1189         check = strjoin(dest, "/usr/share/zoneinfo/", z, NULL);
1190         if (!check)
1191                 return log_oom();
1192
1193         if (access(check, F_OK) < 0) {
1194                 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
1195                 return 0;
1196         }
1197
1198         what = strappend("../usr/share/zoneinfo/", z);
1199         if (!what)
1200                 return log_oom();
1201
1202         r = mkdir_parents(where, 0755);
1203         if (r < 0) {
1204                 log_error_errno(r, "Failed to create directory for timezone info %s in container: %m", where);
1205
1206                 return 0;
1207         }
1208
1209         r = unlink(where);
1210         if (r < 0 && errno != ENOENT) {
1211                 log_error_errno(errno, "Failed to remove existing timezone info %s in container: %m", where);
1212
1213                 return 0;
1214         }
1215
1216         if (symlink(what, where) < 0) {
1217                 log_error_errno(errno, "Failed to correct timezone of container: %m");
1218                 return 0;
1219         }
1220
1221         return 0;
1222 }
1223
1224 static int setup_resolv_conf(const char *dest) {
1225         _cleanup_free_ char *where = NULL;
1226         int r;
1227
1228         assert(dest);
1229
1230         if (arg_private_network)
1231                 return 0;
1232
1233         /* Fix resolv.conf, if possible */
1234         where = strappend(dest, "/etc/resolv.conf");
1235         if (!where)
1236                 return log_oom();
1237
1238         /* We don't really care for the results of this really. If it
1239          * fails, it fails, but meh... */
1240         r = mkdir_parents(where, 0755);
1241         if (r < 0) {
1242                 log_warning_errno(r, "Failed to create parent directory for resolv.conf %s: %m", where);
1243
1244                 return 0;
1245         }
1246
1247         r = copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW, 0644, 0);
1248         if (r < 0) {
1249                 log_warning_errno(r, "Failed to copy /etc/resolv.conf to %s: %m", where);
1250
1251                 return 0;
1252         }
1253
1254         return 0;
1255 }
1256
1257 static int setup_volatile_state(const char *directory) {
1258         const char *p;
1259         int r;
1260
1261         assert(directory);
1262
1263         if (arg_volatile != VOLATILE_STATE)
1264                 return 0;
1265
1266         /* --volatile=state means we simply overmount /var
1267            with a tmpfs, and the rest read-only. */
1268
1269         r = bind_remount_recursive(directory, true);
1270         if (r < 0)
1271                 return log_error_errno(r, "Failed to remount %s read-only: %m", directory);
1272
1273         p = strjoina(directory, "/var");
1274         r = mkdir(p, 0755);
1275         if (r < 0 && errno != EEXIST)
1276                 return log_error_errno(errno, "Failed to create %s: %m", directory);
1277
1278         if (mount("tmpfs", p, "tmpfs", MS_STRICTATIME, "mode=755") < 0)
1279                 return log_error_errno(errno, "Failed to mount tmpfs to /var: %m");
1280
1281         return 0;
1282 }
1283
1284 static int setup_volatile(const char *directory) {
1285         bool tmpfs_mounted = false, bind_mounted = false;
1286         char template[] = "/tmp/nspawn-volatile-XXXXXX";
1287         const char *f, *t;
1288         int r;
1289
1290         assert(directory);
1291
1292         if (arg_volatile != VOLATILE_YES)
1293                 return 0;
1294
1295         /* --volatile=yes means we mount a tmpfs to the root dir, and
1296            the original /usr to use inside it, and that read-only. */
1297
1298         if (!mkdtemp(template))
1299                 return log_error_errno(errno, "Failed to create temporary directory: %m");
1300
1301         if (mount("tmpfs", template, "tmpfs", MS_STRICTATIME, "mode=755") < 0) {
1302                 log_error_errno(errno, "Failed to mount tmpfs for root directory: %m");
1303                 r = -errno;
1304                 goto fail;
1305         }
1306
1307         tmpfs_mounted = true;
1308
1309         f = strjoina(directory, "/usr");
1310         t = strjoina(template, "/usr");
1311
1312         r = mkdir(t, 0755);
1313         if (r < 0 && errno != EEXIST) {
1314                 log_error_errno(errno, "Failed to create %s: %m", t);
1315                 r = -errno;
1316                 goto fail;
1317         }
1318
1319         if (mount(f, t, "bind", MS_BIND|MS_REC, NULL) < 0) {
1320                 log_error_errno(errno, "Failed to create /usr bind mount: %m");
1321                 r = -errno;
1322                 goto fail;
1323         }
1324
1325         bind_mounted = true;
1326
1327         r = bind_remount_recursive(t, true);
1328         if (r < 0) {
1329                 log_error_errno(r, "Failed to remount %s read-only: %m", t);
1330                 goto fail;
1331         }
1332
1333         if (mount(template, directory, NULL, MS_MOVE, NULL) < 0) {
1334                 log_error_errno(errno, "Failed to move root mount: %m");
1335                 r = -errno;
1336                 goto fail;
1337         }
1338
1339         rmdir(template);
1340
1341         return 0;
1342
1343 fail:
1344         if (bind_mounted)
1345                 umount(t);
1346         if (tmpfs_mounted)
1347                 umount(template);
1348         rmdir(template);
1349         return r;
1350 }
1351
1352 static char* id128_format_as_uuid(sd_id128_t id, char s[37]) {
1353
1354         snprintf(s, 37,
1355                  "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
1356                  SD_ID128_FORMAT_VAL(id));
1357
1358         return s;
1359 }
1360
1361 static int setup_boot_id(const char *dest) {
1362         _cleanup_free_ char *from = NULL, *to = NULL;
1363         sd_id128_t rnd = {};
1364         char as_uuid[37];
1365         int r;
1366
1367         assert(dest);
1368
1369         if (arg_share_system)
1370                 return 0;
1371
1372         /* Generate a new randomized boot ID, so that each boot-up of
1373          * the container gets a new one */
1374
1375         from = strappend(dest, "/dev/proc-sys-kernel-random-boot-id");
1376         to = strappend(dest, "/proc/sys/kernel/random/boot_id");
1377         if (!from || !to)
1378                 return log_oom();
1379
1380         r = sd_id128_randomize(&rnd);
1381         if (r < 0)
1382                 return log_error_errno(r, "Failed to generate random boot id: %m");
1383
1384         id128_format_as_uuid(rnd, as_uuid);
1385
1386         r = write_string_file(from, as_uuid);
1387         if (r < 0)
1388                 return log_error_errno(r, "Failed to write boot id: %m");
1389
1390         if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
1391                 log_error_errno(errno, "Failed to bind mount boot id: %m");
1392                 r = -errno;
1393         } else if (mount(from, to, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL))
1394                 log_warning_errno(errno, "Failed to make boot id read-only: %m");
1395
1396         unlink(from);
1397         return r;
1398 }
1399
1400 static int copy_devnodes(const char *dest) {
1401
1402         static const char devnodes[] =
1403                 "null\0"
1404                 "zero\0"
1405                 "full\0"
1406                 "random\0"
1407                 "urandom\0"
1408                 "tty\0"
1409                 "net/tun\0";
1410
1411         const char *d;
1412         int r = 0;
1413         _cleanup_umask_ mode_t u;
1414
1415         assert(dest);
1416
1417         u = umask(0000);
1418
1419         NULSTR_FOREACH(d, devnodes) {
1420                 _cleanup_free_ char *from = NULL, *to = NULL;
1421                 struct stat st;
1422
1423                 from = strappend("/dev/", d);
1424                 to = strjoin(dest, "/dev/", d, NULL);
1425                 if (!from || !to)
1426                         return log_oom();
1427
1428                 if (stat(from, &st) < 0) {
1429
1430                         if (errno != ENOENT)
1431                                 return log_error_errno(errno, "Failed to stat %s: %m", from);
1432
1433                 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
1434
1435                         log_error("%s is not a char or block device, cannot copy", from);
1436                         return -EIO;
1437
1438                 } else {
1439                         r = mkdir_parents(to, 0775);
1440                         if (r < 0) {
1441                                 log_error_errno(r, "Failed to create parent directory of %s: %m", to);
1442                                 return -r;
1443                         }
1444
1445                         if (mknod(to, st.st_mode, st.st_rdev) < 0)
1446                                 return log_error_errno(errno, "mknod(%s) failed: %m", to);
1447
1448                         if (arg_userns && arg_uid_shift != UID_INVALID)
1449                                 if (lchown(to, arg_uid_shift, arg_uid_shift) < 0)
1450                                         return log_error_errno(errno, "chown() of device node %s failed: %m", to);
1451                 }
1452         }
1453
1454         return r;
1455 }
1456
1457 static int setup_ptmx(const char *dest) {
1458         _cleanup_free_ char *p = NULL;
1459
1460         p = strappend(dest, "/dev/ptmx");
1461         if (!p)
1462                 return log_oom();
1463
1464         if (symlink("pts/ptmx", p) < 0)
1465                 return log_error_errno(errno, "Failed to create /dev/ptmx symlink: %m");
1466
1467         if (arg_userns && arg_uid_shift != UID_INVALID)
1468                 if (lchown(p, arg_uid_shift, arg_uid_shift) < 0)
1469                         return log_error_errno(errno, "lchown() of symlink %s failed: %m", p);
1470
1471         return 0;
1472 }
1473
1474 static int setup_dev_console(const char *dest, const char *console) {
1475         _cleanup_umask_ mode_t u;
1476         const char *to;
1477         struct stat st;
1478         int r;
1479
1480         assert(dest);
1481         assert(console);
1482
1483         u = umask(0000);
1484
1485         if (stat("/dev/null", &st) < 0)
1486                 return log_error_errno(errno, "Failed to stat /dev/null: %m");
1487
1488         r = chmod_and_chown(console, 0600, 0, 0);
1489         if (r < 0)
1490                 return log_error_errno(r, "Failed to correct access mode for TTY: %m");
1491
1492         /* We need to bind mount the right tty to /dev/console since
1493          * ptys can only exist on pts file systems. To have something
1494          * to bind mount things on we create a device node first, and
1495          * use /dev/null for that since we the cgroups device policy
1496          * allows us to create that freely, while we cannot create
1497          * /dev/console. (Note that the major minor doesn't actually
1498          * matter here, since we mount it over anyway). */
1499
1500         to = strjoina(dest, "/dev/console");
1501         if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0)
1502                 return log_error_errno(errno, "mknod() for /dev/console failed: %m");
1503
1504         if (mount(console, to, "bind", MS_BIND, NULL) < 0)
1505                 return log_error_errno(errno, "Bind mount for /dev/console failed: %m");
1506
1507         return 0;
1508 }
1509
1510 static int setup_kmsg(const char *dest, int kmsg_socket) {
1511         _cleanup_free_ char *from = NULL, *to = NULL;
1512         _cleanup_umask_ mode_t u;
1513         int r, fd, k;
1514         union {
1515                 struct cmsghdr cmsghdr;
1516                 uint8_t buf[CMSG_SPACE(sizeof(int))];
1517         } control = {};
1518         struct msghdr mh = {
1519                 .msg_control = &control,
1520                 .msg_controllen = sizeof(control),
1521         };
1522         struct cmsghdr *cmsg;
1523
1524         assert(dest);
1525         assert(kmsg_socket >= 0);
1526
1527         u = umask(0000);
1528
1529         /* We create the kmsg FIFO as /dev/kmsg, but immediately
1530          * delete it after bind mounting it to /proc/kmsg. While FIFOs
1531          * on the reading side behave very similar to /proc/kmsg,
1532          * their writing side behaves differently from /dev/kmsg in
1533          * that writing blocks when nothing is reading. In order to
1534          * avoid any problems with containers deadlocking due to this
1535          * we simply make /dev/kmsg unavailable to the container. */
1536         if (asprintf(&from, "%s/dev/kmsg", dest) < 0 ||
1537             asprintf(&to, "%s/proc/kmsg", dest) < 0)
1538                 return log_oom();
1539
1540         if (mkfifo(from, 0600) < 0)
1541                 return log_error_errno(errno, "mkfifo() for /dev/kmsg failed: %m");
1542
1543         r = chmod_and_chown(from, 0600, 0, 0);
1544         if (r < 0)
1545                 return log_error_errno(r, "Failed to correct access mode for /dev/kmsg: %m");
1546
1547         if (mount(from, to, "bind", MS_BIND, NULL) < 0)
1548                 return log_error_errno(errno, "Bind mount for /proc/kmsg failed: %m");
1549
1550         fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
1551         if (fd < 0)
1552                 return log_error_errno(errno, "Failed to open fifo: %m");
1553
1554         cmsg = CMSG_FIRSTHDR(&mh);
1555         cmsg->cmsg_level = SOL_SOCKET;
1556         cmsg->cmsg_type = SCM_RIGHTS;
1557         cmsg->cmsg_len = CMSG_LEN(sizeof(int));
1558         memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
1559
1560         mh.msg_controllen = cmsg->cmsg_len;
1561
1562         /* Store away the fd in the socket, so that it stays open as
1563          * long as we run the child */
1564         k = sendmsg(kmsg_socket, &mh, MSG_NOSIGNAL);
1565         safe_close(fd);
1566
1567         if (k < 0)
1568                 return log_error_errno(errno, "Failed to send FIFO fd: %m");
1569
1570         /* And now make the FIFO unavailable as /dev/kmsg... */
1571         unlink(from);
1572         return 0;
1573 }
1574
1575 static int send_rtnl(int send_fd) {
1576         union {
1577                 struct cmsghdr cmsghdr;
1578                 uint8_t buf[CMSG_SPACE(sizeof(int))];
1579         } control = {};
1580         struct msghdr mh = {
1581                 .msg_control = &control,
1582                 .msg_controllen = sizeof(control),
1583         };
1584         struct cmsghdr *cmsg;
1585         _cleanup_close_ int fd = -1;
1586         ssize_t k;
1587
1588         assert(send_fd >= 0);
1589
1590         if (!arg_expose_ports)
1591                 return 0;
1592
1593         fd = socket(PF_NETLINK, SOCK_RAW|SOCK_CLOEXEC|SOCK_NONBLOCK, NETLINK_ROUTE);
1594         if (fd < 0)
1595                 return log_error_errno(errno, "failed to allocate container netlink: %m");
1596
1597         cmsg = CMSG_FIRSTHDR(&mh);
1598         cmsg->cmsg_level = SOL_SOCKET;
1599         cmsg->cmsg_type = SCM_RIGHTS;
1600         cmsg->cmsg_len = CMSG_LEN(sizeof(int));
1601         memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
1602
1603         mh.msg_controllen = cmsg->cmsg_len;
1604
1605         /* Store away the fd in the socket, so that it stays open as
1606          * long as we run the child */
1607         k = sendmsg(send_fd, &mh, MSG_NOSIGNAL);
1608         if (k < 0)
1609                 return log_error_errno(errno, "Failed to send netlink fd: %m");
1610
1611         return 0;
1612 }
1613
1614 static int flush_ports(union in_addr_union *exposed) {
1615         ExposePort *p;
1616         int r, af = AF_INET;
1617
1618         assert(exposed);
1619
1620         if (!arg_expose_ports)
1621                 return 0;
1622
1623         if (in_addr_is_null(af, exposed))
1624                 return 0;
1625
1626         log_debug("Lost IP address.");
1627
1628         LIST_FOREACH(ports, p, arg_expose_ports) {
1629                 r = fw_add_local_dnat(false,
1630                                       af,
1631                                       p->protocol,
1632                                       NULL,
1633                                       NULL, 0,
1634                                       NULL, 0,
1635                                       p->host_port,
1636                                       exposed,
1637                                       p->container_port,
1638                                       NULL);
1639                 if (r < 0)
1640                         log_warning_errno(r, "Failed to modify firewall: %m");
1641         }
1642
1643         *exposed = IN_ADDR_NULL;
1644         return 0;
1645 }
1646
1647 static int expose_ports(sd_rtnl *rtnl, union in_addr_union *exposed) {
1648         _cleanup_free_ struct local_address *addresses = NULL;
1649         _cleanup_free_ char *pretty = NULL;
1650         union in_addr_union new_exposed;
1651         ExposePort *p;
1652         bool add;
1653         int af = AF_INET, r;
1654
1655         assert(exposed);
1656
1657         /* Invoked each time an address is added or removed inside the
1658          * container */
1659
1660         if (!arg_expose_ports)
1661                 return 0;
1662
1663         r = local_addresses(rtnl, 0, af, &addresses);
1664         if (r < 0)
1665                 return log_error_errno(r, "Failed to enumerate local addresses: %m");
1666
1667         add = r > 0 &&
1668                 addresses[0].family == af &&
1669                 addresses[0].scope < RT_SCOPE_LINK;
1670
1671         if (!add)
1672                 return flush_ports(exposed);
1673
1674         new_exposed = addresses[0].address;
1675         if (in_addr_equal(af, exposed, &new_exposed))
1676                 return 0;
1677
1678         in_addr_to_string(af, &new_exposed, &pretty);
1679         log_debug("New container IP is %s.", strna(pretty));
1680
1681         LIST_FOREACH(ports, p, arg_expose_ports) {
1682
1683                 r = fw_add_local_dnat(true,
1684                                       af,
1685                                       p->protocol,
1686                                       NULL,
1687                                       NULL, 0,
1688                                       NULL, 0,
1689                                       p->host_port,
1690                                       &new_exposed,
1691                                       p->container_port,
1692                                       in_addr_is_null(af, exposed) ? NULL : exposed);
1693                 if (r < 0)
1694                         log_warning_errno(r, "Failed to modify firewall: %m");
1695         }
1696
1697         *exposed = new_exposed;
1698         return 0;
1699 }
1700
1701 static int on_address_change(sd_rtnl *rtnl, sd_rtnl_message *m, void *userdata) {
1702         union in_addr_union *exposed = userdata;
1703
1704         assert(rtnl);
1705         assert(m);
1706         assert(exposed);
1707
1708         expose_ports(rtnl, exposed);
1709         return 0;
1710 }
1711
1712 static int watch_rtnl(sd_event *event, int recv_fd, union in_addr_union *exposed, sd_rtnl **ret) {
1713         union {
1714                 struct cmsghdr cmsghdr;
1715                 uint8_t buf[CMSG_SPACE(sizeof(int))];
1716         } control = {};
1717         struct msghdr mh = {
1718                 .msg_control = &control,
1719                 .msg_controllen = sizeof(control),
1720         };
1721         struct cmsghdr *cmsg;
1722         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1723         int fd, r;
1724         ssize_t k;
1725
1726         assert(event);
1727         assert(recv_fd >= 0);
1728         assert(ret);
1729
1730         if (!arg_expose_ports)
1731                 return 0;
1732
1733         k = recvmsg(recv_fd, &mh, MSG_NOSIGNAL);
1734         if (k < 0)
1735                 return log_error_errno(errno, "Failed to recv netlink fd: %m");
1736
1737         cmsg = CMSG_FIRSTHDR(&mh);
1738         assert(cmsg->cmsg_level == SOL_SOCKET);
1739         assert(cmsg->cmsg_type == SCM_RIGHTS);
1740         assert(cmsg->cmsg_len == CMSG_LEN(sizeof(int)));
1741         memcpy(&fd, CMSG_DATA(cmsg), sizeof(int));
1742
1743         r = sd_rtnl_open_fd(&rtnl, fd, 1, RTNLGRP_IPV4_IFADDR);
1744         if (r < 0) {
1745                 safe_close(fd);
1746                 return log_error_errno(r, "Failed to create rtnl object: %m");
1747         }
1748
1749         r = sd_rtnl_add_match(rtnl, RTM_NEWADDR, on_address_change, exposed);
1750         if (r < 0)
1751                 return log_error_errno(r, "Failed to subscribe to RTM_NEWADDR messages: %m");
1752
1753         r = sd_rtnl_add_match(rtnl, RTM_DELADDR, on_address_change, exposed);
1754         if (r < 0)
1755                 return log_error_errno(r, "Failed to subscribe to RTM_DELADDR messages: %m");
1756
1757         r = sd_rtnl_attach_event(rtnl, event, 0);
1758         if (r < 0)
1759                 return log_error_errno(r, "Failed to add to even loop: %m");
1760
1761         *ret = rtnl;
1762         rtnl = NULL;
1763
1764         return 0;
1765 }
1766
1767 static int setup_hostname(void) {
1768
1769         if (arg_share_system)
1770                 return 0;
1771
1772         if (sethostname_idempotent(arg_machine) < 0)
1773                 return -errno;
1774
1775         return 0;
1776 }
1777
1778 static int setup_journal(const char *directory) {
1779         sd_id128_t machine_id, this_id;
1780         _cleanup_free_ char *p = NULL, *b = NULL, *q = NULL, *d = NULL;
1781         char *id;
1782         int r;
1783
1784         /* Don't link journals in ephemeral mode */
1785         if (arg_ephemeral)
1786                 return 0;
1787
1788         p = strappend(directory, "/etc/machine-id");
1789         if (!p)
1790                 return log_oom();
1791
1792         r = read_one_line_file(p, &b);
1793         if (r == -ENOENT && arg_link_journal == LINK_AUTO)
1794                 return 0;
1795         else if (r < 0)
1796                 return log_error_errno(r, "Failed to read machine ID from %s: %m", p);
1797
1798         id = strstrip(b);
1799         if (isempty(id) && arg_link_journal == LINK_AUTO)
1800                 return 0;
1801
1802         /* Verify validity */
1803         r = sd_id128_from_string(id, &machine_id);
1804         if (r < 0)
1805                 return log_error_errno(r, "Failed to parse machine ID from %s: %m", p);
1806
1807         r = sd_id128_get_machine(&this_id);
1808         if (r < 0)
1809                 return log_error_errno(r, "Failed to retrieve machine ID: %m");
1810
1811         if (sd_id128_equal(machine_id, this_id)) {
1812                 log_full(arg_link_journal == LINK_AUTO ? LOG_WARNING : LOG_ERR,
1813                          "Host and machine ids are equal (%s): refusing to link journals", id);
1814                 if (arg_link_journal == LINK_AUTO)
1815                         return 0;
1816                 return -EEXIST;
1817         }
1818
1819         if (arg_link_journal == LINK_NO)
1820                 return 0;
1821
1822         free(p);
1823         p = strappend("/var/log/journal/", id);
1824         q = strjoin(directory, "/var/log/journal/", id, NULL);
1825         if (!p || !q)
1826                 return log_oom();
1827
1828         if (path_is_mount_point(p, false) > 0) {
1829                 if (arg_link_journal != LINK_AUTO) {
1830                         log_error("%s: already a mount point, refusing to use for journal", p);
1831                         return -EEXIST;
1832                 }
1833
1834                 return 0;
1835         }
1836
1837         if (path_is_mount_point(q, false) > 0) {
1838                 if (arg_link_journal != LINK_AUTO) {
1839                         log_error("%s: already a mount point, refusing to use for journal", q);
1840                         return -EEXIST;
1841                 }
1842
1843                 return 0;
1844         }
1845
1846         r = readlink_and_make_absolute(p, &d);
1847         if (r >= 0) {
1848                 if ((arg_link_journal == LINK_GUEST ||
1849                      arg_link_journal == LINK_AUTO) &&
1850                     path_equal(d, q)) {
1851
1852                         r = mkdir_p(q, 0755);
1853                         if (r < 0)
1854                                 log_warning_errno(errno, "Failed to create directory %s: %m", q);
1855                         return 0;
1856                 }
1857
1858                 if (unlink(p) < 0)
1859                         return log_error_errno(errno, "Failed to remove symlink %s: %m", p);
1860         } else if (r == -EINVAL) {
1861
1862                 if (arg_link_journal == LINK_GUEST &&
1863                     rmdir(p) < 0) {
1864
1865                         if (errno == ENOTDIR) {
1866                                 log_error("%s already exists and is neither a symlink nor a directory", p);
1867                                 return r;
1868                         } else {
1869                                 log_error_errno(errno, "Failed to remove %s: %m", p);
1870                                 return -errno;
1871                         }
1872                 }
1873         } else if (r != -ENOENT) {
1874                 log_error_errno(errno, "readlink(%s) failed: %m", p);
1875                 return r;
1876         }
1877
1878         if (arg_link_journal == LINK_GUEST) {
1879
1880                 if (symlink(q, p) < 0) {
1881                         if (arg_link_journal_try) {
1882                                 log_debug_errno(errno, "Failed to symlink %s to %s, skipping journal setup: %m", q, p);
1883                                 return 0;
1884                         } else {
1885                                 log_error_errno(errno, "Failed to symlink %s to %s: %m", q, p);
1886                                 return -errno;
1887                         }
1888                 }
1889
1890                 r = mkdir_p(q, 0755);
1891                 if (r < 0)
1892                         log_warning_errno(errno, "Failed to create directory %s: %m", q);
1893                 return 0;
1894         }
1895
1896         if (arg_link_journal == LINK_HOST) {
1897                 /* don't create parents here -- if the host doesn't have
1898                  * permanent journal set up, don't force it here */
1899                 r = mkdir(p, 0755);
1900                 if (r < 0) {
1901                         if (arg_link_journal_try) {
1902                                 log_debug_errno(errno, "Failed to create %s, skipping journal setup: %m", p);
1903                                 return 0;
1904                         } else {
1905                                 log_error_errno(errno, "Failed to create %s: %m", p);
1906                                 return r;
1907                         }
1908                 }
1909
1910         } else if (access(p, F_OK) < 0)
1911                 return 0;
1912
1913         if (dir_is_empty(q) == 0)
1914                 log_warning("%s is not empty, proceeding anyway.", q);
1915
1916         r = mkdir_p(q, 0755);
1917         if (r < 0) {
1918                 log_error_errno(errno, "Failed to create %s: %m", q);
1919                 return r;
1920         }
1921
1922         if (mount(p, q, "bind", MS_BIND, NULL) < 0)
1923                 return log_error_errno(errno, "Failed to bind mount journal from host into guest: %m");
1924
1925         return 0;
1926 }
1927
1928 static int drop_capabilities(void) {
1929         return capability_bounding_set_drop(~arg_retain, false);
1930 }
1931
1932 static int register_machine(pid_t pid, int local_ifindex) {
1933         _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1934         _cleanup_bus_close_unref_ sd_bus *bus = NULL;
1935         int r;
1936
1937         if (!arg_register)
1938                 return 0;
1939
1940         r = sd_bus_default_system(&bus);
1941         if (r < 0)
1942                 return log_error_errno(r, "Failed to open system bus: %m");
1943
1944         if (arg_keep_unit) {
1945                 r = sd_bus_call_method(
1946                                 bus,
1947                                 "org.freedesktop.machine1",
1948                                 "/org/freedesktop/machine1",
1949                                 "org.freedesktop.machine1.Manager",
1950                                 "RegisterMachineWithNetwork",
1951                                 &error,
1952                                 NULL,
1953                                 "sayssusai",
1954                                 arg_machine,
1955                                 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1956                                 "nspawn",
1957                                 "container",
1958                                 (uint32_t) pid,
1959                                 strempty(arg_directory),
1960                                 local_ifindex > 0 ? 1 : 0, local_ifindex);
1961         } else {
1962                 _cleanup_bus_message_unref_ sd_bus_message *m = NULL;
1963                 char **i;
1964
1965                 r = sd_bus_message_new_method_call(
1966                                 bus,
1967                                 &m,
1968                                 "org.freedesktop.machine1",
1969                                 "/org/freedesktop/machine1",
1970                                 "org.freedesktop.machine1.Manager",
1971                                 "CreateMachineWithNetwork");
1972                 if (r < 0)
1973                         return bus_log_create_error(r);
1974
1975                 r = sd_bus_message_append(
1976                                 m,
1977                                 "sayssusai",
1978                                 arg_machine,
1979                                 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1980                                 "nspawn",
1981                                 "container",
1982                                 (uint32_t) pid,
1983                                 strempty(arg_directory),
1984                                 local_ifindex > 0 ? 1 : 0, local_ifindex);
1985                 if (r < 0)
1986                         return bus_log_create_error(r);
1987
1988                 r = sd_bus_message_open_container(m, 'a', "(sv)");
1989                 if (r < 0)
1990                         return bus_log_create_error(r);
1991
1992                 if (!isempty(arg_slice)) {
1993                         r = sd_bus_message_append(m, "(sv)", "Slice", "s", arg_slice);
1994                         if (r < 0)
1995                                 return bus_log_create_error(r);
1996                 }
1997
1998                 r = sd_bus_message_append(m, "(sv)", "DevicePolicy", "s", "strict");
1999                 if (r < 0)
2000                         return bus_log_create_error(r);
2001
2002                 r = sd_bus_message_append(m, "(sv)", "DeviceAllow", "a(ss)", 9,
2003                                           /* Allow the container to
2004                                            * access and create the API
2005                                            * device nodes, so that
2006                                            * PrivateDevices= in the
2007                                            * container can work
2008                                            * fine */
2009                                           "/dev/null", "rwm",
2010                                           "/dev/zero", "rwm",
2011                                           "/dev/full", "rwm",
2012                                           "/dev/random", "rwm",
2013                                           "/dev/urandom", "rwm",
2014                                           "/dev/tty", "rwm",
2015                                           "/dev/net/tun", "rwm",
2016                                           /* Allow the container
2017                                            * access to ptys. However,
2018                                            * do not permit the
2019                                            * container to ever create
2020                                            * these device nodes. */
2021                                           "/dev/pts/ptmx", "rw",
2022                                           "char-pts", "rw");
2023                 if (r < 0)
2024                         return log_error_errno(r, "Failed to add device whitelist: %m");
2025
2026                 STRV_FOREACH(i, arg_property) {
2027                         r = sd_bus_message_open_container(m, 'r', "sv");
2028                         if (r < 0)
2029                                 return bus_log_create_error(r);
2030
2031                         r = bus_append_unit_property_assignment(m, *i);
2032                         if (r < 0)
2033                                 return r;
2034
2035                         r = sd_bus_message_close_container(m);
2036                         if (r < 0)
2037                                 return bus_log_create_error(r);
2038                 }
2039
2040                 r = sd_bus_message_close_container(m);
2041                 if (r < 0)
2042                         return bus_log_create_error(r);
2043
2044                 r = sd_bus_call(bus, m, 0, &error, NULL);
2045         }
2046
2047         if (r < 0) {
2048                 log_error("Failed to register machine: %s", bus_error_message(&error, r));
2049                 return r;
2050         }
2051
2052         return 0;
2053 }
2054
2055 static int terminate_machine(pid_t pid) {
2056         _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
2057         _cleanup_bus_message_unref_ sd_bus_message *reply = NULL;
2058         _cleanup_bus_close_unref_ sd_bus *bus = NULL;
2059         const char *path;
2060         int r;
2061
2062         if (!arg_register)
2063                 return 0;
2064
2065         r = sd_bus_default_system(&bus);
2066         if (r < 0)
2067                 return log_error_errno(r, "Failed to open system bus: %m");
2068
2069         r = sd_bus_call_method(
2070                         bus,
2071                         "org.freedesktop.machine1",
2072                         "/org/freedesktop/machine1",
2073                         "org.freedesktop.machine1.Manager",
2074                         "GetMachineByPID",
2075                         &error,
2076                         &reply,
2077                         "u",
2078                         (uint32_t) pid);
2079         if (r < 0) {
2080                 /* Note that the machine might already have been
2081                  * cleaned up automatically, hence don't consider it a
2082                  * failure if we cannot get the machine object. */
2083                 log_debug("Failed to get machine: %s", bus_error_message(&error, r));
2084                 return 0;
2085         }
2086
2087         r = sd_bus_message_read(reply, "o", &path);
2088         if (r < 0)
2089                 return bus_log_parse_error(r);
2090
2091         r = sd_bus_call_method(
2092                         bus,
2093                         "org.freedesktop.machine1",
2094                         path,
2095                         "org.freedesktop.machine1.Machine",
2096                         "Terminate",
2097                         &error,
2098                         NULL,
2099                         NULL);
2100         if (r < 0) {
2101                 log_debug("Failed to terminate machine: %s", bus_error_message(&error, r));
2102                 return 0;
2103         }
2104
2105         return 0;
2106 }
2107
2108 static int reset_audit_loginuid(void) {
2109         _cleanup_free_ char *p = NULL;
2110         int r;
2111
2112         if (arg_share_system)
2113                 return 0;
2114
2115         r = read_one_line_file("/proc/self/loginuid", &p);
2116         if (r == -ENOENT)
2117                 return 0;
2118         if (r < 0)
2119                 return log_error_errno(r, "Failed to read /proc/self/loginuid: %m");
2120
2121         /* Already reset? */
2122         if (streq(p, "4294967295"))
2123                 return 0;
2124
2125         r = write_string_file("/proc/self/loginuid", "4294967295");
2126         if (r < 0) {
2127                 log_error("Failed to reset audit login UID. This probably means that your kernel is too\n"
2128                           "old and you have audit enabled. Note that the auditing subsystem is known to\n"
2129                           "be incompatible with containers on old kernels. Please make sure to upgrade\n"
2130                           "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
2131                           "using systemd-nspawn. Sleeping for 5s... (%s)\n", strerror(-r));
2132
2133                 sleep(5);
2134         }
2135
2136         return 0;
2137 }
2138
2139 #define HOST_HASH_KEY SD_ID128_MAKE(1a,37,6f,c7,46,ec,45,0b,ad,a3,d5,31,06,60,5d,b1)
2140 #define CONTAINER_HASH_KEY SD_ID128_MAKE(c3,c4,f9,19,b5,57,b2,1c,e6,cf,14,27,03,9c,ee,a2)
2141 #define MACVLAN_HASH_KEY SD_ID128_MAKE(00,13,6d,bc,66,83,44,81,bb,0c,f9,51,1f,24,a6,6f)
2142
2143 static int generate_mac(struct ether_addr *mac, sd_id128_t hash_key, uint64_t idx) {
2144         uint8_t result[8];
2145         size_t l, sz;
2146         uint8_t *v, *i;
2147         int r;
2148
2149         l = strlen(arg_machine);
2150         sz = sizeof(sd_id128_t) + l;
2151         if (idx > 0)
2152                 sz += sizeof(idx);
2153
2154         v = alloca(sz);
2155
2156         /* fetch some persistent data unique to the host */
2157         r = sd_id128_get_machine((sd_id128_t*) v);
2158         if (r < 0)
2159                 return r;
2160
2161         /* combine with some data unique (on this host) to this
2162          * container instance */
2163         i = mempcpy(v + sizeof(sd_id128_t), arg_machine, l);
2164         if (idx > 0) {
2165                 idx = htole64(idx);
2166                 memcpy(i, &idx, sizeof(idx));
2167         }
2168
2169         /* Let's hash the host machine ID plus the container name. We
2170          * use a fixed, but originally randomly created hash key here. */
2171         siphash24(result, v, sz, hash_key.bytes);
2172
2173         assert_cc(ETH_ALEN <= sizeof(result));
2174         memcpy(mac->ether_addr_octet, result, ETH_ALEN);
2175
2176         /* see eth_random_addr in the kernel */
2177         mac->ether_addr_octet[0] &= 0xfe;        /* clear multicast bit */
2178         mac->ether_addr_octet[0] |= 0x02;        /* set local assignment bit (IEEE802) */
2179
2180         return 0;
2181 }
2182
2183 static int setup_veth(pid_t pid, char iface_name[IFNAMSIZ], int *ifi) {
2184         _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
2185         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
2186         struct ether_addr mac_host, mac_container;
2187         int r, i;
2188
2189         if (!arg_private_network)
2190                 return 0;
2191
2192         if (!arg_network_veth)
2193                 return 0;
2194
2195         /* Use two different interface name prefixes depending whether
2196          * we are in bridge mode or not. */
2197         snprintf(iface_name, IFNAMSIZ - 1, "%s-%s",
2198                  arg_network_bridge ? "vb" : "ve", arg_machine);
2199
2200         r = generate_mac(&mac_container, CONTAINER_HASH_KEY, 0);
2201         if (r < 0)
2202                 return log_error_errno(r, "Failed to generate predictable MAC address for container side: %m");
2203
2204         r = generate_mac(&mac_host, HOST_HASH_KEY, 0);
2205         if (r < 0)
2206                 return log_error_errno(r, "Failed to generate predictable MAC address for host side: %m");
2207
2208         r = sd_rtnl_open(&rtnl, 0);
2209         if (r < 0)
2210                 return log_error_errno(r, "Failed to connect to netlink: %m");
2211
2212         r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
2213         if (r < 0)
2214                 return log_error_errno(r, "Failed to allocate netlink message: %m");
2215
2216         r = sd_rtnl_message_append_string(m, IFLA_IFNAME, iface_name);
2217         if (r < 0)
2218                 return log_error_errno(r, "Failed to add netlink interface name: %m");
2219
2220         r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac_host);
2221         if (r < 0)
2222                 return log_error_errno(r, "Failed to add netlink MAC address: %m");
2223
2224         r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
2225         if (r < 0)
2226                 return log_error_errno(r, "Failed to open netlink container: %m");
2227
2228         r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "veth");
2229         if (r < 0)
2230                 return log_error_errno(r, "Failed to open netlink container: %m");
2231
2232         r = sd_rtnl_message_open_container(m, VETH_INFO_PEER);
2233         if (r < 0)
2234                 return log_error_errno(r, "Failed to open netlink container: %m");
2235
2236         r = sd_rtnl_message_append_string(m, IFLA_IFNAME, "host0");
2237         if (r < 0)
2238                 return log_error_errno(r, "Failed to add netlink interface name: %m");
2239
2240         r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac_container);
2241         if (r < 0)
2242                 return log_error_errno(r, "Failed to add netlink MAC address: %m");
2243
2244         r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
2245         if (r < 0)
2246                 return log_error_errno(r, "Failed to add netlink namespace field: %m");
2247
2248         r = sd_rtnl_message_close_container(m);
2249         if (r < 0)
2250                 return log_error_errno(r, "Failed to close netlink container: %m");
2251
2252         r = sd_rtnl_message_close_container(m);
2253         if (r < 0)
2254                 return log_error_errno(r, "Failed to close netlink container: %m");
2255
2256         r = sd_rtnl_message_close_container(m);
2257         if (r < 0)
2258                 return log_error_errno(r, "Failed to close netlink container: %m");
2259
2260         r = sd_rtnl_call(rtnl, m, 0, NULL);
2261         if (r < 0)
2262                 return log_error_errno(r, "Failed to add new veth interfaces: %m");
2263
2264         i = (int) if_nametoindex(iface_name);
2265         if (i <= 0)
2266                 return log_error_errno(errno, "Failed to resolve interface %s: %m", iface_name);
2267
2268         *ifi = i;
2269
2270         return 0;
2271 }
2272
2273 static int setup_bridge(const char veth_name[], int *ifi) {
2274         _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
2275         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
2276         int r, bridge;
2277
2278         if (!arg_private_network)
2279                 return 0;
2280
2281         if (!arg_network_veth)
2282                 return 0;
2283
2284         if (!arg_network_bridge)
2285                 return 0;
2286
2287         bridge = (int) if_nametoindex(arg_network_bridge);
2288         if (bridge <= 0)
2289                 return log_error_errno(errno, "Failed to resolve interface %s: %m", arg_network_bridge);
2290
2291         *ifi = bridge;
2292
2293         r = sd_rtnl_open(&rtnl, 0);
2294         if (r < 0)
2295                 return log_error_errno(r, "Failed to connect to netlink: %m");
2296
2297         r = sd_rtnl_message_new_link(rtnl, &m, RTM_SETLINK, 0);
2298         if (r < 0)
2299                 return log_error_errno(r, "Failed to allocate netlink message: %m");
2300
2301         r = sd_rtnl_message_link_set_flags(m, IFF_UP, IFF_UP);
2302         if (r < 0)
2303                 return log_error_errno(r, "Failed to set IFF_UP flag: %m");
2304
2305         r = sd_rtnl_message_append_string(m, IFLA_IFNAME, veth_name);
2306         if (r < 0)
2307                 return log_error_errno(r, "Failed to add netlink interface name field: %m");
2308
2309         r = sd_rtnl_message_append_u32(m, IFLA_MASTER, bridge);
2310         if (r < 0)
2311                 return log_error_errno(r, "Failed to add netlink master field: %m");
2312
2313         r = sd_rtnl_call(rtnl, m, 0, NULL);
2314         if (r < 0)
2315                 return log_error_errno(r, "Failed to add veth interface to bridge: %m");
2316
2317         return 0;
2318 }
2319
2320 static int parse_interface(struct udev *udev, const char *name) {
2321         _cleanup_udev_device_unref_ struct udev_device *d = NULL;
2322         char ifi_str[2 + DECIMAL_STR_MAX(int)];
2323         int ifi;
2324
2325         ifi = (int) if_nametoindex(name);
2326         if (ifi <= 0)
2327                 return log_error_errno(errno, "Failed to resolve interface %s: %m", name);
2328
2329         sprintf(ifi_str, "n%i", ifi);
2330         d = udev_device_new_from_device_id(udev, ifi_str);
2331         if (!d)
2332                 return log_error_errno(errno, "Failed to get udev device for interface %s: %m", name);
2333
2334         if (udev_device_get_is_initialized(d) <= 0) {
2335                 log_error("Network interface %s is not initialized yet.", name);
2336                 return -EBUSY;
2337         }
2338
2339         return ifi;
2340 }
2341
2342 static int move_network_interfaces(pid_t pid) {
2343         _cleanup_udev_unref_ struct udev *udev = NULL;
2344         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
2345         char **i;
2346         int r;
2347
2348         if (!arg_private_network)
2349                 return 0;
2350
2351         if (strv_isempty(arg_network_interfaces))
2352                 return 0;
2353
2354         r = sd_rtnl_open(&rtnl, 0);
2355         if (r < 0)
2356                 return log_error_errno(r, "Failed to connect to netlink: %m");
2357
2358         udev = udev_new();
2359         if (!udev) {
2360                 log_error("Failed to connect to udev.");
2361                 return -ENOMEM;
2362         }
2363
2364         STRV_FOREACH(i, arg_network_interfaces) {
2365                 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
2366                 int ifi;
2367
2368                 ifi = parse_interface(udev, *i);
2369                 if (ifi < 0)
2370                         return ifi;
2371
2372                 r = sd_rtnl_message_new_link(rtnl, &m, RTM_SETLINK, ifi);
2373                 if (r < 0)
2374                         return log_error_errno(r, "Failed to allocate netlink message: %m");
2375
2376                 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
2377                 if (r < 0)
2378                         return log_error_errno(r, "Failed to append namespace PID to netlink message: %m");
2379
2380                 r = sd_rtnl_call(rtnl, m, 0, NULL);
2381                 if (r < 0)
2382                         return log_error_errno(r, "Failed to move interface %s to namespace: %m", *i);
2383         }
2384
2385         return 0;
2386 }
2387
2388 static int setup_macvlan(pid_t pid) {
2389         _cleanup_udev_unref_ struct udev *udev = NULL;
2390         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
2391         unsigned idx = 0;
2392         char **i;
2393         int r;
2394
2395         if (!arg_private_network)
2396                 return 0;
2397
2398         if (strv_isempty(arg_network_macvlan))
2399                 return 0;
2400
2401         r = sd_rtnl_open(&rtnl, 0);
2402         if (r < 0)
2403                 return log_error_errno(r, "Failed to connect to netlink: %m");
2404
2405         udev = udev_new();
2406         if (!udev) {
2407                 log_error("Failed to connect to udev.");
2408                 return -ENOMEM;
2409         }
2410
2411         STRV_FOREACH(i, arg_network_macvlan) {
2412                 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
2413                 _cleanup_free_ char *n = NULL;
2414                 struct ether_addr mac;
2415                 int ifi;
2416
2417                 ifi = parse_interface(udev, *i);
2418                 if (ifi < 0)
2419                         return ifi;
2420
2421                 r = generate_mac(&mac, MACVLAN_HASH_KEY, idx++);
2422                 if (r < 0)
2423                         return log_error_errno(r, "Failed to create MACVLAN MAC address: %m");
2424
2425                 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
2426                 if (r < 0)
2427                         return log_error_errno(r, "Failed to allocate netlink message: %m");
2428
2429                 r = sd_rtnl_message_append_u32(m, IFLA_LINK, ifi);
2430                 if (r < 0)
2431                         return log_error_errno(r, "Failed to add netlink interface index: %m");
2432
2433                 n = strappend("mv-", *i);
2434                 if (!n)
2435                         return log_oom();
2436
2437                 strshorten(n, IFNAMSIZ-1);
2438
2439                 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, n);
2440                 if (r < 0)
2441                         return log_error_errno(r, "Failed to add netlink interface name: %m");
2442
2443                 r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac);
2444                 if (r < 0)
2445                         return log_error_errno(r, "Failed to add netlink MAC address: %m");
2446
2447                 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
2448                 if (r < 0)
2449                         return log_error_errno(r, "Failed to add netlink namespace field: %m");
2450
2451                 r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
2452                 if (r < 0)
2453                         return log_error_errno(r, "Failed to open netlink container: %m");
2454
2455                 r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "macvlan");
2456                 if (r < 0)
2457                         return log_error_errno(r, "Failed to open netlink container: %m");
2458
2459                 r = sd_rtnl_message_append_u32(m, IFLA_MACVLAN_MODE, MACVLAN_MODE_BRIDGE);
2460                 if (r < 0)
2461                         return log_error_errno(r, "Failed to append macvlan mode: %m");
2462
2463                 r = sd_rtnl_message_close_container(m);
2464                 if (r < 0)
2465                         return log_error_errno(r, "Failed to close netlink container: %m");
2466
2467                 r = sd_rtnl_message_close_container(m);
2468                 if (r < 0)
2469                         return log_error_errno(r, "Failed to close netlink container: %m");
2470
2471                 r = sd_rtnl_call(rtnl, m, 0, NULL);
2472                 if (r < 0)
2473                         return log_error_errno(r, "Failed to add new macvlan interfaces: %m");
2474         }
2475
2476         return 0;
2477 }
2478
2479 static int setup_ipvlan(pid_t pid) {
2480         _cleanup_udev_unref_ struct udev *udev = NULL;
2481         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
2482         char **i;
2483         int r;
2484
2485         if (!arg_private_network)
2486                 return 0;
2487
2488         if (strv_isempty(arg_network_ipvlan))
2489                 return 0;
2490
2491         r = sd_rtnl_open(&rtnl, 0);
2492         if (r < 0)
2493                 return log_error_errno(r, "Failed to connect to netlink: %m");
2494
2495         udev = udev_new();
2496         if (!udev) {
2497                 log_error("Failed to connect to udev.");
2498                 return -ENOMEM;
2499         }
2500
2501         STRV_FOREACH(i, arg_network_ipvlan) {
2502                 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
2503                 _cleanup_free_ char *n = NULL;
2504                 int ifi;
2505
2506                 ifi = parse_interface(udev, *i);
2507                 if (ifi < 0)
2508                         return ifi;
2509
2510                 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
2511                 if (r < 0)
2512                         return log_error_errno(r, "Failed to allocate netlink message: %m");
2513
2514                 r = sd_rtnl_message_append_u32(m, IFLA_LINK, ifi);
2515                 if (r < 0)
2516                         return log_error_errno(r, "Failed to add netlink interface index: %m");
2517
2518                 n = strappend("iv-", *i);
2519                 if (!n)
2520                         return log_oom();
2521
2522                 strshorten(n, IFNAMSIZ-1);
2523
2524                 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, n);
2525                 if (r < 0)
2526                         return log_error_errno(r, "Failed to add netlink interface name: %m");
2527
2528                 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
2529                 if (r < 0)
2530                         return log_error_errno(r, "Failed to add netlink namespace field: %m");
2531
2532                 r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
2533                 if (r < 0)
2534                         return log_error_errno(r, "Failed to open netlink container: %m");
2535
2536                 r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "ipvlan");
2537                 if (r < 0)
2538                         return log_error_errno(r, "Failed to open netlink container: %m");
2539
2540                 r = sd_rtnl_message_append_u16(m, IFLA_IPVLAN_MODE, IPVLAN_MODE_L2);
2541                 if (r < 0)
2542                         return log_error_errno(r, "Failed to add ipvlan mode: %m");
2543
2544                 r = sd_rtnl_message_close_container(m);
2545                 if (r < 0)
2546                         return log_error_errno(r, "Failed to close netlink container: %m");
2547
2548                 r = sd_rtnl_message_close_container(m);
2549                 if (r < 0)
2550                         return log_error_errno(r, "Failed to close netlink container: %m");
2551
2552                 r = sd_rtnl_call(rtnl, m, 0, NULL);
2553                 if (r < 0)
2554                         return log_error_errno(r, "Failed to add new ipvlan interfaces: %m");
2555         }
2556
2557         return 0;
2558 }
2559
/* Builds and loads a default-allow seccomp filter for the container: a fixed
 * set of syscalls fails with EPERM, the kernel module syscalls are added to
 * that set unless CAP_SYS_MODULE is retained, and creation of audit netlink
 * sockets fails with EAFNOSUPPORT. Returns 0 when built without seccomp
 * support; otherwise returns the result of the last libseccomp call (negative
 * on failure). */
static int setup_seccomp(void) {

#ifdef HAVE_SECCOMP
        /* Always refused inside the container */
        static const int always_blocked[] = {
                SCMP_SYS(kexec_load),
                SCMP_SYS(open_by_handle_at),
                SCMP_SYS(iopl),
                SCMP_SYS(ioperm),
                SCMP_SYS(swapon),
                SCMP_SYS(swapoff),
        };

        /* Refused only when CAP_SYS_MODULE is not among the retained capabilities */
        static const int kmod_blocked[] = {
                SCMP_SYS(init_module),
                SCMP_SYS(finit_module),
                SCMP_SYS(delete_module),
        };

        scmp_filter_ctx ctx;
        bool block_kmod;
        unsigned k;
        int r;

        ctx = seccomp_init(SCMP_ACT_ALLOW);
        if (!ctx)
                return log_oom();

        r = seccomp_add_secondary_archs(ctx);
        if (r < 0) {
                log_error_errno(r, "Failed to add secondary archs to seccomp filter: %m");
                goto finish;
        }

        for (k = 0; k < ELEMENTSOF(always_blocked); k++) {
                r = seccomp_rule_add(ctx, SCMP_ACT_ERRNO(EPERM), always_blocked[k], 0);

                /* -EFAULT means the syscall is unknown; that is tolerated and the entry skipped */
                if (r < 0 && r != -EFAULT) {
                        log_error_errno(r, "Failed to block syscall: %m");
                        goto finish;
                }
        }

        block_kmod = (arg_retain & (1ULL << CAP_SYS_MODULE)) == 0;

        for (k = 0; block_kmod && k < ELEMENTSOF(kmod_blocked); k++) {
                r = seccomp_rule_add(ctx, SCMP_ACT_ERRNO(EPERM), kmod_blocked[k], 0);

                /* As above: unknown syscalls are skipped */
                if (r < 0 && r != -EFAULT) {
                        log_error_errno(r, "Failed to block syscall: %m");
                        goto finish;
                }
        }

        /* Audit does not work in containers and much of the userspace audit
         * hookup fails when run inside one. We simply turn off creation of
         * audit sockets: socket(AF_NETLINK, *, NETLINK_AUDIT) then fails with
         * EAFNOSUPPORT, which audit userspace treats as "audit is disabled in
         * the kernel". */
        r = seccomp_rule_add(
                        ctx,
                        SCMP_ACT_ERRNO(EAFNOSUPPORT),
                        SCMP_SYS(socket),
                        2,
                        SCMP_A0(SCMP_CMP_EQ, AF_NETLINK),
                        SCMP_A2(SCMP_CMP_EQ, NETLINK_AUDIT));
        if (r < 0) {
                log_error_errno(r, "Failed to add audit seccomp rule: %m");
                goto finish;
        }

        r = seccomp_attr_set(ctx, SCMP_FLTATR_CTL_NNP, 0);
        if (r < 0) {
                log_error_errno(r, "Failed to unset NO_NEW_PRIVS: %m");
                goto finish;
        }

        r = seccomp_load(ctx);
        if (r < 0)
                log_error_errno(r, "Failed to install seccomp audit filter: %m");

finish:
        seccomp_release(ctx);
        return r;
#else
        return 0;
#endif

}
2656
2657 static int setup_propagate(const char *root) {
2658         const char *p, *q;
2659
2660         (void) mkdir_p("/run/systemd/nspawn/", 0755);
2661         (void) mkdir_p("/run/systemd/nspawn/propagate", 0600);
2662         p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
2663         (void) mkdir_p(p, 0600);
2664
2665         q = strjoina(root, "/run/systemd/nspawn/incoming");
2666         mkdir_parents(q, 0755);
2667         mkdir_p(q, 0600);
2668
2669         if (mount(p, q, NULL, MS_BIND, NULL) < 0)
2670                 return log_error_errno(errno, "Failed to install propagation bind mount.");
2671
2672         if (mount(NULL, q, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY, NULL) < 0)
2673                 return log_error_errno(errno, "Failed to make propagation mount read-only");
2674
2675         return 0;
2676 }
2677
2678 static int setup_image(char **device_path, int *loop_nr) {
2679         struct loop_info64 info = {
2680                 .lo_flags = LO_FLAGS_AUTOCLEAR|LO_FLAGS_PARTSCAN
2681         };
2682         _cleanup_close_ int fd = -1, control = -1, loop = -1;
2683         _cleanup_free_ char* loopdev = NULL;
2684         struct stat st;
2685         int r, nr;
2686
2687         assert(device_path);
2688         assert(loop_nr);
2689         assert(arg_image);
2690
2691         fd = open(arg_image, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
2692         if (fd < 0)
2693                 return log_error_errno(errno, "Failed to open %s: %m", arg_image);
2694
2695         if (fstat(fd, &st) < 0)
2696                 return log_error_errno(errno, "Failed to stat %s: %m", arg_image);
2697
2698         if (S_ISBLK(st.st_mode)) {
2699                 char *p;
2700
2701                 p = strdup(arg_image);
2702                 if (!p)
2703                         return log_oom();
2704
2705                 *device_path = p;
2706
2707                 *loop_nr = -1;
2708
2709                 r = fd;
2710                 fd = -1;
2711
2712                 return r;
2713         }
2714
2715         if (!S_ISREG(st.st_mode)) {
2716                 log_error_errno(errno, "%s is not a regular file or block device: %m", arg_image);
2717                 return -EINVAL;
2718         }
2719
2720         control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
2721         if (control < 0)
2722                 return log_error_errno(errno, "Failed to open /dev/loop-control: %m");
2723
2724         nr = ioctl(control, LOOP_CTL_GET_FREE);
2725         if (nr < 0)
2726                 return log_error_errno(errno, "Failed to allocate loop device: %m");
2727
2728         if (asprintf(&loopdev, "/dev/loop%i", nr) < 0)
2729                 return log_oom();
2730
2731         loop = open(loopdev, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
2732         if (loop < 0)
2733                 return log_error_errno(errno, "Failed to open loop device %s: %m", loopdev);
2734
2735         if (ioctl(loop, LOOP_SET_FD, fd) < 0)
2736                 return log_error_errno(errno, "Failed to set loopback file descriptor on %s: %m", loopdev);
2737
2738         if (arg_read_only)
2739                 info.lo_flags |= LO_FLAGS_READ_ONLY;
2740
2741         if (ioctl(loop, LOOP_SET_STATUS64, &info) < 0)
2742                 return log_error_errno(errno, "Failed to set loopback settings on %s: %m", loopdev);
2743
2744         *device_path = loopdev;
2745         loopdev = NULL;
2746
2747         *loop_nr = nr;
2748
2749         r = loop;
2750         loop = -1;
2751
2752         return r;
2753 }
2754
/* Explanatory text appended to image dissection error messages; it lists the
 * partition layouts that can be booted. */
#define PARTITION_TABLE_BLURB                                                                   \
        "Note that the disk image needs to either contain only a single MBR partition of\n"    \
        "type 0x83 that is marked bootable, "                                                   \
        "or a single GPT partition of type 0FC63DAF-8483-4772-8E79-3D69D8477DE4 or follow\n"   \
        "    http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/\n"     \
        "to be bootable with systemd-nspawn."
2761
/* Inspects the partition table of the block device open on fd and picks the
 * partitions to mount for the container.
 *
 * On GPT disks, partitions flagged GPT_FLAG_NO_AUTO are ignored and the rest
 * are classified by type UUID as home, srv, native root, secondary root or
 * generic Linux data; among several partitions of the same specific type the
 * one with the lowest partition number is used. On MBR disks, only a
 * partition of type 0x83 whose flags equal 0x80 (bootable) is considered, and
 * it is treated as a generic partition.
 *
 * The root device is chosen in this order: native root, secondary root, the
 * single generic partition. More than one generic candidate without a
 * specific root partition is an error.
 *
 * On success returns 0, stores a newly allocated device node path in
 * *root_device (caller frees) and whether it may be mounted writable in
 * *root_device_rw, and sets *secondary to true only if the secondary root
 * type was used. *home_device / *srv_device and their _rw flags are written
 * only if such partitions were found. On failure returns a negative
 * errno-style value; -ENOTSUP if built without blkid support. */
static int dissect_image(
                int fd,
                char **root_device, bool *root_device_rw,
                char **home_device, bool *home_device_rw,
                char **srv_device, bool *srv_device_rw,
                bool *secondary) {

#ifdef HAVE_BLKID
        int home_nr = -1, srv_nr = -1;
#ifdef GPT_ROOT_NATIVE
        int root_nr = -1;
#endif
#ifdef GPT_ROOT_SECONDARY
        int secondary_root_nr = -1;
#endif
        _cleanup_free_ char *home = NULL, *root = NULL, *secondary_root = NULL, *srv = NULL, *generic = NULL;
        _cleanup_udev_enumerate_unref_ struct udev_enumerate *e = NULL;
        _cleanup_udev_device_unref_ struct udev_device *d = NULL;
        _cleanup_blkid_free_probe_ blkid_probe b = NULL;
        _cleanup_udev_unref_ struct udev *udev = NULL;
        struct udev_list_entry *first, *item;
        bool home_rw = true, root_rw = true, secondary_root_rw = true, srv_rw = true, generic_rw = true;
        bool is_gpt, is_mbr, multiple_generic = false;
        const char *pttype = NULL;
        blkid_partlist pl;
        struct stat st;
        unsigned i;
        int r;

        assert(fd >= 0);
        assert(root_device);
        assert(home_device);
        assert(srv_device);
        assert(secondary);
        assert(arg_image);

        b = blkid_new_probe();
        if (!b)
                return log_oom();

        /* errno is cleared first so that a failure which leaves it untouched can be told apart */
        errno = 0;
        r = blkid_probe_set_device(b, fd, 0, 0);
        if (r != 0) {
                if (errno == 0)
                        return log_oom();

                return log_error_errno(errno, "Failed to set device on blkid probe: %m");
        }

        blkid_probe_enable_partitions(b, 1);
        blkid_probe_set_partitions_flags(b, BLKID_PARTS_ENTRY_DETAILS);

        errno = 0;
        r = blkid_do_safeprobe(b);
        if (r == -2 || r == 1) {
                log_error("Failed to identify any partition table on\n"
                          "    %s\n"
                          PARTITION_TABLE_BLURB, arg_image);
                return -EINVAL;
        } else if (r != 0) {
                if (errno == 0)
                        errno = EIO;

                return log_error_errno(errno, "Failed to probe: %m");
        }

        blkid_probe_lookup_value(b, "PTTYPE", &pttype, NULL);

        is_gpt = streq_ptr(pttype, "gpt");
        is_mbr = streq_ptr(pttype, "dos");

        if (!is_gpt && !is_mbr) {
                log_error("No GPT or MBR partition table discovered on\n"
                          "    %s\n"
                          PARTITION_TABLE_BLURB, arg_image);
                return -EINVAL;
        }

        errno = 0;
        pl = blkid_probe_get_partitions(b);
        if (!pl) {
                if (errno == 0)
                        return log_oom();

                return log_error_errno(errno, "Failed to list partitions of %s: %m", arg_image);
        }

        udev = udev_new();
        if (!udev)
                return log_oom();

        if (fstat(fd, &st) < 0)
                return log_error_errno(errno, "Failed to stat block device: %m");

        d = udev_device_new_from_devnum(udev, 'b', st.st_rdev);
        if (!d)
                return log_oom();

        /* Wait until udev lists as many child devices as blkid found
         * partitions, giving up after 10 rounds. */
        for (i = 0;; i++) {
                int n, m;

                if (i >= 10) {
                        log_error("Kernel partitions never appeared.");
                        return -ENXIO;
                }

                e = udev_enumerate_new(udev);
                if (!e)
                        return log_oom();

                r = udev_enumerate_add_match_parent(e, d);
                if (r < 0)
                        return log_oom();

                r = udev_enumerate_scan_devices(e);
                if (r < 0)
                        return log_error_errno(r, "Failed to scan for partition devices of %s: %m", arg_image);

                /* Count the partitions enumerated by the kernel */
                n = 0;
                first = udev_enumerate_get_list_entry(e);
                udev_list_entry_foreach(item, first)
                        n++;

                /* Count the partitions enumerated by blkid. The loop is
                 * left once the udev list has exactly one entry more
                 * than blkid has partitions. */
                m = blkid_partlist_numof_partitions(pl);
                if (n == m + 1)
                        break;
                if (n > m + 1) {
                        log_error("blkid and kernel partition list do not match.");
                        return -EIO;
                }
                if (n < m + 1) {
                        unsigned j;

                        /* The kernel has probed fewer partitions than
                         * blkid? Maybe the kernel prober is still
                         * running or it got EBUSY because udev
                         * already opened the device. Let's reprobe
                         * the device, which is a synchronous call
                         * that waits until probing is complete. */

                        for (j = 0; j < 20; j++) {

                                r = ioctl(fd, BLKRRPART, 0);
                                if (r < 0)
                                        r = -errno;
                                if (r >= 0 || r != -EBUSY)
                                        break;

                                /* If something else has the device
                                 * open, such as an udev rule, the
                                 * ioctl will return EBUSY. Since
                                 * there's no way to wait until it
                                 * isn't busy anymore, let's just wait
                                 * a bit, and try again.
                                 *
                                 * This is really something they
                                 * should fix in the kernel! */

                                usleep(50 * USEC_PER_MSEC);
                        }

                        if (r < 0)
                                return log_error_errno(r, "Failed to reread partition table: %m");
                }

                e = udev_enumerate_unref(e);
        }

        first = udev_enumerate_get_list_entry(e);
        udev_list_entry_foreach(item, first) {
                _cleanup_udev_device_unref_ struct udev_device *q = NULL;
                const char *node;
                unsigned long long flags;
                blkid_partition pp;
                dev_t qn;
                int nr;

                errno = 0;
                q = udev_device_new_from_syspath(udev, udev_list_entry_get_name(item));
                if (!q) {
                        if (!errno)
                                errno = ENOMEM;

                        return log_error_errno(errno, "Failed to get partition device of %s: %m", arg_image);
                }

                qn = udev_device_get_devnum(q);
                if (major(qn) == 0)
                        continue;

                /* Skip the entry for the whole disk itself */
                if (st.st_rdev == qn)
                        continue;

                node = udev_device_get_devnode(q);
                if (!node)
                        continue;

                pp = blkid_partlist_devno_to_partition(pl, qn);
                if (!pp)
                        continue;

                flags = blkid_partition_get_flags(pp);

                nr = blkid_partition_get_partno(pp);
                if (nr < 0)
                        continue;

                if (is_gpt) {
                        sd_id128_t type_id;
                        const char *stype;

                        if (flags & GPT_FLAG_NO_AUTO)
                                continue;

                        stype = blkid_partition_get_type_string(pp);
                        if (!stype)
                                continue;

                        if (sd_id128_from_string(stype, &type_id) < 0)
                                continue;

                        if (sd_id128_equal(type_id, GPT_HOME)) {

                                /* Keep the candidate with the lowest partition number */
                                if (home && nr >= home_nr)
                                        continue;

                                home_nr = nr;
                                home_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&home, node);
                                if (r < 0)
                                        return log_oom();

                        } else if (sd_id128_equal(type_id, GPT_SRV)) {

                                if (srv && nr >= srv_nr)
                                        continue;

                                srv_nr = nr;
                                srv_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&srv, node);
                                if (r < 0)
                                        return log_oom();
                        }
#ifdef GPT_ROOT_NATIVE
                        else if (sd_id128_equal(type_id, GPT_ROOT_NATIVE)) {

                                if (root && nr >= root_nr)
                                        continue;

                                root_nr = nr;
                                root_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&root, node);
                                if (r < 0)
                                        return log_oom();
                        }
#endif
#ifdef GPT_ROOT_SECONDARY
                        else if (sd_id128_equal(type_id, GPT_ROOT_SECONDARY)) {

                                if (secondary_root && nr >= secondary_root_nr)
                                        continue;

                                secondary_root_nr = nr;
                                secondary_root_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&secondary_root, node);
                                if (r < 0)
                                        return log_oom();
                        }
#endif
                        else if (sd_id128_equal(type_id, GPT_LINUX_GENERIC)) {

                                if (generic)
                                        multiple_generic = true;
                                else {
                                        generic_rw = !(flags & GPT_FLAG_READ_ONLY);

                                        r = free_and_strdup(&generic, node);
                                        if (r < 0)
                                                return log_oom();
                                }
                        }

                } else if (is_mbr) {
                        int type;

                        if (flags != 0x80) /* Bootable flag */
                                continue;

                        type = blkid_partition_get_type(pp);
                        if (type != 0x83) /* Linux partition */
                                continue;

                        if (generic)
                                multiple_generic = true;
                        else {
                                generic_rw = true;

                                /* Record the candidate as the generic
                                 * partition, so that a second bootable
                                 * Linux partition is detected above
                                 * and rejected below instead of
                                 * silently replacing this one. */
                                r = free_and_strdup(&generic, node);
                                if (r < 0)
                                        return log_oom();
                        }
                }
        }

        if (root) {
                *root_device = root;
                root = NULL;

                *root_device_rw = root_rw;
                *secondary = false;
        } else if (secondary_root) {
                *root_device = secondary_root;
                secondary_root = NULL;

                *root_device_rw = secondary_root_rw;
                *secondary = true;
        } else if (generic) {

                /* There were no partitions with precise meanings
                 * around, but we found generic partitions. In this
                 * case, if there's only one, we can go ahead and boot
                 * it, otherwise we bail out, because we really cannot
                 * make any sense of it. */

                if (multiple_generic) {
                        log_error("Identified multiple bootable Linux partitions on\n"
                                  "    %s\n"
                                  PARTITION_TABLE_BLURB, arg_image);
                        return -EINVAL;
                }

                *root_device = generic;
                generic = NULL;

                *root_device_rw = generic_rw;
                *secondary = false;
        } else {
                log_error("Failed to identify root partition in disk image\n"
                          "    %s\n"
                          PARTITION_TABLE_BLURB, arg_image);
                return -EINVAL;
        }

        if (home) {
                *home_device = home;
                home = NULL;

                *home_device_rw = home_rw;
        }

        if (srv) {
                *srv_device = srv;
                srv = NULL;

                *srv_device_rw = srv_rw;
        }

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -ENOTSUP;
#endif
}
3134
/* Probes the file system type of the block device 'what' with libblkid and
 * mounts it with that type.
 *
 *   what      - device node to probe and mount; must not be NULL
 *   where     - base mount point; must not be NULL
 *   directory - optional suffix joined onto 'where' (via strjoina()) to form
 *               the actual mount point; NULL mounts directly on 'where'
 *   rw        - request a writable mount; overridden to read-only whenever
 *               the global arg_read_only is set
 *
 * The mount always carries MS_NODEV. Devices whose detected type is
 * "crypto_LUKS" are refused with -ENOTSUP.
 *
 * Returns 0 on success and a negative errno-style value on failure. When
 * built without HAVE_BLKID the function always fails with -ENOTSUP. */
static int mount_device(const char *what, const char *where, const char *directory, bool rw) {
#ifdef HAVE_BLKID
        _cleanup_blkid_free_probe_ blkid_probe b = NULL;
        const char *fstype, *p;
        int r;

        assert(what);
        assert(where);

        if (arg_read_only)
                rw = false;

        if (directory)
                p = strjoina(where, directory);
        else
                p = where;

        /* errno is cleared first so that a failure which leaves it
         * untouched can be told apart and reported as out-of-memory. */
        errno = 0;
        b = blkid_new_probe_from_filename(what);
        if (!b) {
                if (errno == 0)
                        return log_oom();

                /* Return the logger's result rather than re-reading errno
                 * after the logging call, matching the mount() error path
                 * below. */
                return log_error_errno(errno, "Failed to allocate prober for %s: %m", what);
        }

        blkid_probe_enable_superblocks(b, 1);
        blkid_probe_set_superblocks_flags(b, BLKID_SUBLKS_TYPE);

        errno = 0;
        r = blkid_do_safeprobe(b);
        if (r == -1 || r == 1) {
                /* These two results are treated as "type could not be
                 * determined" rather than as an I/O style failure. */
                log_error("Cannot determine file system type of %s", what);
                return -EINVAL;
        } else if (r != 0) {
                /* Any other non-zero result: report errno, falling back to
                 * EIO if the library did not set one. */
                if (errno == 0)
                        errno = EIO;
                return log_error_errno(errno, "Failed to probe %s: %m", what);
        }

        errno = 0;
        if (blkid_probe_lookup_value(b, "TYPE", &fstype, NULL) < 0) {
                /* Pick the return code before logging so that the logging
                 * call cannot influence it; EINVAL is the fallback when
                 * errno was left at zero. */
                r = errno != 0 ? -errno : -EINVAL;
                log_error("Failed to determine file system type of %s", what);
                return r;
        }

        if (streq(fstype, "crypto_LUKS")) {
                log_error("nspawn currently does not support LUKS disk images.");
                return -ENOTSUP;
        }

        if (mount(what, p, fstype, MS_NODEV|(rw ? 0 : MS_RDONLY), NULL) < 0)
                return log_error_errno(errno, "Failed to mount %s: %m", what);

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -ENOTSUP;
#endif
}
3198
/* Mounts the partitions picked from a disk image below 'where'.
 *
 *   where        - directory that becomes the root of the mounted tree;
 *                  must not be NULL
 *   root_device  - device mounted directly on 'where'
 *   home_device  - device mounted on "/home" below 'where'
 *   srv_device   - device mounted on "/srv" below 'where'
 *   *_device_rw  - whether the respective device is requested writable
 *
 * Any device passed as NULL is skipped. Returns 0 on success, or the
 * negative error of the first mount_device() call that failed. */
static int mount_devices(
                const char *where,
                const char *root_device, bool root_device_rw,
                const char *home_device, bool home_device_rw,
                const char *srv_device, bool srv_device_rw) {
        int r;

        assert(where);

        /* Mount relative to the 'where' argument. Previously the parameter
         * was validated but ignored in favour of the arg_directory global. */

        if (root_device) {
                r = mount_device(root_device, where, NULL, root_device_rw);
                if (r < 0)
                        return log_error_errno(r, "Failed to mount root directory: %m");
        }

        if (home_device) {
                r = mount_device(home_device, where, "/home", home_device_rw);
                if (r < 0)
                        return log_error_errno(r, "Failed to mount home directory: %m");
        }

        if (srv_device) {
                r = mount_device(srv_device, where, "/srv", srv_device_rw);
                if (r < 0)
                        return log_error_errno(r, "Failed to mount server data directory: %m");
        }

        return 0;
}
3228
3229 static void loop_remove(int nr, int *image_fd) {
3230         _cleanup_close_ int control = -1;
3231         int r;
3232
3233         if (nr < 0)
3234                 return;
3235
3236         if (image_fd && *image_fd >= 0) {
3237                 r = ioctl(*image_fd, LOOP_CLR_FD);
3238                 if (r < 0)
3239                         log_debug_errno(errno, "Failed to close loop image: %m");
3240                 *image_fd = safe_close(*image_fd);
3241         }
3242
3243         control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
3244         if (control < 0) {
3245                 log_warning_errno(errno, "Failed to open /dev/loop-control: %m");
3246                 return;
3247         }
3248
3249         r = ioctl(control, LOOP_CTL_REMOVE, nr);
3250         if (r < 0)
3251                 log_debug_errno(errno, "Failed to remove loop %d: %m", nr);
3252 }
3253
3254 static int spawn_getent(const char *database, const char *key, pid_t *rpid) {
3255         int pipe_fds[2];
3256         pid_t pid;
3257
3258         assert(database);
3259         assert(key);
3260         assert(rpid);
3261
3262         if (pipe2(pipe_fds, O_CLOEXEC) < 0)
3263                 return log_error_errno(errno, "Failed to allocate pipe: %m");
3264
3265         pid = fork();
3266         if (pid < 0)
3267                 return log_error_errno(errno, "Failed to fork getent child: %m");
3268         else if (pid == 0) {
3269                 int nullfd;
3270                 char *empty_env = NULL;
3271
3272                 if (dup3(pipe_fds[1], STDOUT_FILENO, 0) < 0)
3273                         _exit(EXIT_FAILURE);
3274
3275                 if (pipe_fds[0] > 2)
3276                         safe_close(pipe_fds[0]);
3277                 if (pipe_fds[1] > 2)
3278                         safe_close(pipe_fds[1]);
3279
3280                 nullfd = open("/dev/null", O_RDWR);
3281                 if (nullfd < 0)
3282                         _exit(EXIT_FAILURE);
3283
3284                 if (dup3(nullfd, STDIN_FILENO, 0) < 0)
3285                         _exit(EXIT_FAILURE);
3286
3287                 if (dup3(nullfd, STDERR_FILENO, 0) < 0)
3288                         _exit(EXIT_FAILURE);
3289
3290                 if (nullfd > 2)
3291                         safe_close(nullfd);
3292
3293                 reset_all_signal_handlers();
3294                 close_all_fds(NULL, 0);
3295
3296                 execle("/usr/bin/getent", "getent", database, key, NULL, &empty_env);
3297                 execle("/bin/getent", "getent", database, key, NULL, &empty_env);
3298                 _exit(EXIT_FAILURE);
3299         }
3300
3301         pipe_fds[1] = safe_close(pipe_fds[1]);
3302
3303         *rpid = pid;
3304
3305         return pipe_fds[0];
3306 }
3307
3308 static int change_uid_gid(char **_home) {
3309         char line[LINE_MAX], *x, *u, *g, *h;
3310         const char *word, *state;
3311         _cleanup_free_ uid_t *uids = NULL;
3312         _cleanup_free_ char *home = NULL;
3313         _cleanup_fclose_ FILE *f = NULL;
3314         _cleanup_close_ int fd = -1;
3315         unsigned n_uids = 0;
3316         size_t sz = 0, l;
3317         uid_t uid;
3318         gid_t gid;
3319         pid_t pid;
3320         int r;
3321
3322         assert(_home);
3323
3324         if (!arg_user || streq(arg_user, "root") || streq(arg_user, "0")) {
3325                 /* Reset everything fully to 0, just in case */
3326
3327                 if (setgroups(0, NULL) < 0)
3328                         return log_error_errno(errno, "setgroups() failed: %m");
3329
3330                 if (setresgid(0, 0, 0) < 0)
3331                         return log_error_errno(errno, "setregid() failed: %m");
3332
3333                 if (setresuid(0, 0, 0) < 0)
3334                         return log_error_errno(errno, "setreuid() failed: %m");
3335
3336                 *_home = NULL;
3337                 return 0;
3338         }
3339
3340         /* First, get user credentials */
3341         fd = spawn_getent("passwd", arg_user, &pid);
3342         if (fd < 0)
3343                 return fd;
3344
3345         f = fdopen(fd, "r");
3346         if (!f)
3347                 return log_oom();
3348         fd = -1;
3349
3350         if (!fgets(line, sizeof(line), f)) {
3351
3352                 if (!ferror(f)) {
3353                         log_error("Failed to resolve user %s.", arg_user);
3354                         return -ESRCH;
3355                 }
3356
3357                 log_error_errno(errno, "Failed to read from getent: %m");
3358                 return -errno;
3359         }
3360
3361         truncate_nl(line);
3362
3363         wait_for_terminate_and_warn("getent passwd", pid, true);
3364
3365         x = strchr(line, ':');
3366         if (!x) {
3367                 log_error("/etc/passwd entry has invalid user field.");
3368                 return -EIO;
3369         }
3370
3371         u = strchr(x+1, ':');
3372         if (!u) {
3373                 log_error("/etc/passwd entry has invalid password field.");
3374                 return -EIO;
3375         }
3376
3377         u++;
3378         g = strchr(u, ':');
3379         if (!g) {
3380                 log_error("/etc/passwd entry has invalid UID field.");
3381                 return -EIO;
3382         }
3383
3384         *g = 0;
3385         g++;
3386         x = strchr(g, ':');
3387         if (!x) {
3388                 log_error("/etc/passwd entry has invalid GID field.");
3389                 return -EIO;
3390         }
3391
3392         *x = 0;
3393         h = strchr(x+1, ':');
3394         if (!h) {
3395                 log_error("/etc/passwd entry has invalid GECOS field.");
3396                 return -EIO;
3397         }
3398
3399         h++;
3400         x = strchr(h, ':');
3401         if (!x) {
3402                 log_error("/etc/passwd entry has invalid home directory field.");
3403                 return -EIO;
3404         }
3405
3406         *x = 0;
3407
3408         r = parse_uid(u, &uid);
3409         if (r < 0) {
3410                 log_error("Failed to parse UID of user.");
3411                 return -EIO;
3412         }
3413
3414         r = parse_gid(g, &gid);
3415         if (r < 0) {
3416                 log_error("Failed to parse GID of user.");
3417                 return -EIO;
3418         }
3419
3420         home = strdup(h);
3421         if (!home)
3422                 return log_oom();
3423
3424         /* Second, get group memberships */
3425         fd = spawn_getent("initgroups", arg_user, &pid);
3426         if (fd < 0)
3427                 return fd;
3428
3429         fclose(f);
3430         f = fdopen(fd, "r");
3431         if (!f)
3432                 return log_oom();
3433         fd = -1;
3434
3435         if (!fgets(line, sizeof(line), f)) {
3436                 if (!ferror(f)) {
3437                         log_error("Failed to resolve user %s.", arg_user);
3438                         return -ESRCH;
3439                 }
3440
3441                 log_error_errno(errno, "Failed to read from getent: %m");
3442                 return -errno;
3443         }
3444
3445         truncate_nl(line);
3446
3447         wait_for_terminate_and_warn("getent initgroups", pid, true);
3448
3449         /* Skip over the username and subsequent separator whitespace */
3450         x = line;
3451         x += strcspn(x, WHITESPACE);
3452         x += strspn(x, WHITESPACE);
3453
3454         FOREACH_WORD(word, l, x, state) {
3455                 char c[l+1];
3456
3457                 memcpy(c, word, l);
3458                 c[l] = 0;
3459
3460                 if (!GREEDY_REALLOC(uids, sz, n_uids+1))
3461                         return log_oom();
3462
3463                 r = parse_uid(c, &uids[n_uids++]);
3464                 if (r < 0) {
3465                         log_error("Failed to parse group data from getent.");
3466                         return -EIO;
3467                 }
3468         }
3469
3470         r = mkdir_parents(home, 0775);
3471         if (r < 0)
3472                 return log_error_errno(r, "Failed to make home root directory: %m");
3473
3474         r = mkdir_safe(home, 0755, uid, gid);
3475         if (r < 0 && r != -EEXIST)
3476                 return log_error_errno(r, "Failed to make home directory: %m");
3477
3478         fchown(STDIN_FILENO, uid, gid);
3479         fchown(STDOUT_FILENO, uid, gid);
3480         fchown(STDERR_FILENO, uid, gid);
3481
3482         if (setgroups(n_uids, uids) < 0)
3483                 return log_error_errno(errno, "Failed to set auxiliary groups: %m");
3484
3485         if (setresgid(gid, gid, gid) < 0)
3486                 return log_error_errno(errno, "setregid() failed: %m");
3487
3488         if (setresuid(uid, uid, uid) < 0)
3489                 return log_error_errno(errno, "setreuid() failed: %m");
3490
3491         if (_home) {
3492                 *_home = home;
3493                 home = NULL;
3494         }
3495
3496         return 0;
3497 }
3498
3499 /*
3500  * Return values:
3501  * < 0 : wait_for_terminate() failed to get the state of the
3502  *       container, the container was terminated by a signal, or
3503  *       failed for an unknown reason.  No change is made to the
3504  *       container argument.
3505  * > 0 : The program executed in the container terminated with an
3506  *       error.  The exit code of the program executed in the
3507  *       container is returned.  The container argument has been set
3508  *       to CONTAINER_TERMINATED.
3509  *   0 : The container is being rebooted, has been shut down or exited
3510  *       successfully.  The container argument has been set to either
3511  *       CONTAINER_TERMINATED or CONTAINER_REBOOTED.
3512  *
3513  * That is, success is indicated by a return value of zero, and an
3514  * error is indicated by a non-zero value.
3515  */
3516 static int wait_for_container(pid_t pid, ContainerStatus *container) {
3517         siginfo_t status;
3518         int r;
3519
3520         r = wait_for_terminate(pid, &status);
3521         if (r < 0)
3522                 return log_warning_errno(r, "Failed to wait for container: %m");
3523
3524         switch (status.si_code) {
3525
3526         case CLD_EXITED:
3527                 if (status.si_status == 0) {
3528                         log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s exited successfully.", arg_machine);
3529
3530                 } else
3531                         log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s failed with error code %i.", arg_machine, status.si_status);
3532
3533                 *container = CONTAINER_TERMINATED;
3534                 return status.si_status;
3535
3536         case CLD_KILLED:
3537                 if (status.si_status == SIGINT) {
3538
3539                         log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s has been shut down.", arg_machine);
3540                         *container = CONTAINER_TERMINATED;
3541                         return 0;
3542
3543                 } else if (status.si_status == SIGHUP) {
3544
3545                         log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s is being rebooted.", arg_machine);
3546                         *container = CONTAINER_REBOOTED;
3547                         return 0;
3548                 }
3549
3550                 /* CLD_KILLED fallthrough */
3551
3552         case CLD_DUMPED:
3553                 log_error("Container %s terminated by signal %s.", arg_machine, signal_to_string(status.si_status));
3554                 return -EIO;
3555
3556         default:
3557                 log_error("Container %s failed due to unknown reason.", arg_machine);
3558                 return -EIO;
3559         }
3560
3561         return r;
3562 }
3563
/* Signal handler that deliberately does nothing; the signal number is
 * ignored. */
static void nop_handler(int sig) {
        /* intentionally empty */
}
3565
3566 static int on_orderly_shutdown(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) {
3567         pid_t pid;
3568
3569         pid = PTR_TO_UINT32(userdata);
3570         if (pid > 0) {
3571                 if (kill(pid, SIGRTMIN+3) >= 0) {
3572                         log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
3573                         sd_event_source_set_userdata(s, NULL);
3574                         return 0;
3575                 }
3576         }
3577
3578         sd_event_exit(sd_event_source_get_event(s), 0);
3579         return 0;
3580 }
3581
3582 static int determine_names(void) {
3583         int r;
3584
3585         if (!arg_image && !arg_directory) {
3586                 if (arg_machine) {
3587                         _cleanup_(image_unrefp) Image *i = NULL;
3588
3589                         r = image_find(arg_machine, &i);
3590                         if (r < 0)
3591                                 return log_error_errno(r, "Failed to find image for machine '%s': %m", arg_machine);
3592                         else if (r == 0) {
3593                                 log_error("No image for machine '%s': %m", arg_machine);
3594                                 return -ENOENT;
3595                         }
3596
3597                         if (i->type == IMAGE_RAW)
3598                                 r = set_sanitized_path(&arg_image, i->path);
3599                         else
3600                                 r = set_sanitized_path(&arg_directory, i->path);
3601                         if (r < 0)
3602                                 return log_error_errno(r, "Invalid image directory: %m");
3603
3604                         arg_read_only = arg_read_only || i->read_only;
3605                 } else
3606                         arg_directory = get_current_dir_name();
3607
3608                 if (!arg_directory && !arg_machine) {
3609                         log_error("Failed to determine path, please use -D or -i.");
3610                         return -EINVAL;
3611                 }
3612         }
3613
3614         if (!arg_machine) {
3615                 if (arg_directory && path_equal(arg_directory, "/"))
3616                         arg_machine = gethostname_malloc();
3617                 else
3618                         arg_machine = strdup(basename(arg_image ?: arg_directory));
3619
3620                 if (!arg_machine)
3621                         return log_oom();
3622
3623                 hostname_cleanup(arg_machine, false);
3624                 if (!machine_name_is_valid(arg_machine)) {
3625                         log_error("Failed to determine machine name automatically, please use -M.");
3626                         return -EINVAL;
3627                 }
3628
3629                 if (arg_ephemeral) {
3630                         char *b;
3631
3632                         /* Add a random suffix when this is an
3633                          * ephemeral machine, so that we can run many
3634                          * instances at once without manually having
3635                          * to specify -M each time. */
3636
3637                         if (asprintf(&b, "%s-%016" PRIx64, arg_machine, random_u64()) < 0)
3638                                 return log_oom();
3639
3640                         free(arg_machine);
3641                         arg_machine = b;
3642                 }
3643         }
3644
3645         return 0;
3646 }
3647
3648 static int determine_uid_shift(void) {
3649         int r;
3650
3651         if (!arg_userns)
3652                 return 0;
3653
3654         if (arg_uid_shift == UID_INVALID) {
3655                 struct stat st;
3656
3657                 r = stat(arg_directory, &st);
3658                 if (r < 0)
3659                         return log_error_errno(errno, "Failed to determine UID base of %s: %m", arg_directory);
3660
3661                 arg_uid_shift = st.st_uid & UINT32_C(0xffff0000);
3662
3663                 if (arg_uid_shift != (st.st_gid & UINT32_C(0xffff0000))) {
3664                         log_error("UID and GID base of %s don't match.", arg_directory);
3665                         return -EINVAL;
3666                 }
3667
3668                 arg_uid_range = UINT32_C(0x10000);
3669         }
3670
3671         if (arg_uid_shift > (uid_t) -1 - arg_uid_range) {
3672                 log_error("UID base too high for UID range.");
3673                 return -EINVAL;
3674         }
3675
3676         log_info("Using user namespaces with base " UID_FMT " and range " UID_FMT ".", arg_uid_shift, arg_uid_range);
3677         return 0;
3678 }
3679
3680 int main(int argc, char *argv[]) {
3681
3682         _cleanup_free_ char *device_path = NULL, *root_device = NULL, *home_device = NULL, *srv_device = NULL, *console = NULL;
3683         bool root_device_rw = true, home_device_rw = true, srv_device_rw = true;
3684         _cleanup_close_ int master = -1, image_fd = -1;
3685         _cleanup_fdset_free_ FDSet *fds = NULL;
3686         int r, n_fd_passed, loop_nr = -1;
3687         char veth_name[IFNAMSIZ];
3688         bool secondary = false, remove_subvol = false;
3689         sigset_t mask, mask_chld;
3690         pid_t pid = 0;
3691         int ret = EXIT_SUCCESS;
3692         union in_addr_union exposed = {};
3693         _cleanup_release_lock_file_ LockFile tree_global_lock = LOCK_FILE_INIT, tree_local_lock = LOCK_FILE_INIT;
3694         bool interactive;
3695
3696         log_parse_environment();
3697         log_open();
3698
3699         r = parse_argv(argc, argv);
3700         if (r <= 0)
3701                 goto finish;
3702
3703         r = determine_names();
3704         if (r < 0)
3705                 goto finish;
3706
3707         if (geteuid() != 0) {
3708                 log_error("Need to be root.");
3709                 r = -EPERM;
3710                 goto finish;
3711         }
3712
3713         if (sd_booted() <= 0) {
3714                 log_error("Not running on a systemd system.");
3715                 r = -EINVAL;
3716                 goto finish;
3717         }
3718
3719         log_close();
3720         n_fd_passed = sd_listen_fds(false);
3721         if (n_fd_passed > 0) {
3722                 r = fdset_new_listen_fds(&fds, false);
3723                 if (r < 0) {
3724                         log_error_errno(r, "Failed to collect file descriptors: %m");
3725                         goto finish;
3726                 }
3727         }
3728         fdset_close_others(fds);
3729         log_open();
3730
3731         if (arg_directory) {
3732                 assert(!arg_image);
3733
3734                 if (path_equal(arg_directory, "/") && !arg_ephemeral) {
3735                         log_error("Spawning container on root directory is not supported. Consider using --ephemeral.");
3736                         r = -EINVAL;
3737                         goto finish;
3738                 }
3739
3740                 if (arg_ephemeral) {
3741                         char *np;
3742
3743                         /* If the specified path is a mount point we
3744                          * generate the new snapshot immediately
3745                          * inside it under a random name. However if
3746                          * the specified is not a mount point we
3747                          * create the new snapshot in the parent
3748                          * directory, just next to it. */
3749                         r = path_is_mount_point(arg_directory, false);
3750                         if (r < 0) {
3751                                 log_error_errno(r, "Failed to determine whether directory %s is mount point: %m", arg_directory);
3752                                 goto finish;
3753                         }
3754                         if (r > 0)
3755                                 r = tempfn_random_child(arg_directory, &np);
3756                         else
3757                                 r = tempfn_random(arg_directory, &np);
3758                         if (r < 0) {
3759                                 log_error_errno(r, "Failed to generate name for snapshot: %m");
3760                                 goto finish;
3761                         }
3762
3763                         r = image_path_lock(np, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3764                         if (r < 0) {
3765                                 log_error_errno(r, "Failed to lock %s: %m", np);
3766                                 goto finish;
3767                         }
3768
3769                         r = btrfs_subvol_snapshot(arg_directory, np, arg_read_only, true);
3770                         if (r < 0) {
3771                                 free(np);
3772                                 log_error_errno(r, "Failed to create snapshot %s from %s: %m", np, arg_directory);
3773                                 goto finish;
3774                         }
3775
3776                         free(arg_directory);
3777                         arg_directory = np;
3778
3779                         remove_subvol = true;
3780
3781                 } else {
3782                         r = image_path_lock(arg_directory, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3783                         if (r == -EBUSY) {
3784                                 log_error_errno(r, "Directory tree %s is currently busy.", arg_directory);
3785                                 goto finish;
3786                         }
3787                         if (r < 0) {
3788                                 log_error_errno(r, "Failed to lock %s: %m", arg_directory);
3789                                 return r;
3790                         }
3791
3792                         if (arg_template) {
3793                                 r = btrfs_subvol_snapshot(arg_template, arg_directory, arg_read_only, true);
3794                                 if (r == -EEXIST) {
3795                                         if (!arg_quiet)
3796                                                 log_info("Directory %s already exists, not populating from template %s.", arg_directory, arg_template);
3797                                 } else if (r < 0) {
3798                                         log_error_errno(r, "Couldn't create snapshot %s from %s: %m", arg_directory, arg_template);
3799                                         goto finish;
3800                                 } else {
3801                                         if (!arg_quiet)
3802                                                 log_info("Populated %s from template %s.", arg_directory, arg_template);
3803                                 }
3804                         }
3805                 }
3806
3807                 if (arg_boot) {
3808                         if (path_is_os_tree(arg_directory) <= 0) {
3809                                 log_error("Directory %s doesn't look like an OS root directory (os-release file is missing). Refusing.", arg_directory);
3810                                 r = -EINVAL;
3811                                 goto finish;
3812                         }
3813                 } else {
3814                         const char *p;
3815
3816                         p = strjoina(arg_directory,
3817                                        argc > optind && path_is_absolute(argv[optind]) ? argv[optind] : "/usr/bin/");
3818                         if (access(p, F_OK) < 0) {
3819                                 log_error("Directory %s lacks the binary to execute or doesn't look like a binary tree. Refusing.", arg_directory);
3820                                 r = -EINVAL;
3821                                 goto finish;
3822                         }
3823                 }
3824
3825         } else {
3826                 char template[] = "/tmp/nspawn-root-XXXXXX";
3827
3828                 assert(arg_image);
3829                 assert(!arg_template);
3830
3831                 r = image_path_lock(arg_image, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3832                 if (r == -EBUSY) {
3833                         r = log_error_errno(r, "Disk image %s is currently busy.", arg_image);
3834                         goto finish;
3835                 }
3836                 if (r < 0) {
3837                         r = log_error_errno(r, "Failed to create image lock: %m");
3838                         goto finish;
3839                 }
3840
3841                 if (!mkdtemp(template)) {
3842                         log_error_errno(errno, "Failed to create temporary directory: %m");
3843                         r = -errno;
3844                         goto finish;
3845                 }
3846
3847                 arg_directory = strdup(template);
3848                 if (!arg_directory) {
3849                         r = log_oom();
3850                         goto finish;
3851                 }
3852
3853                 image_fd = setup_image(&device_path, &loop_nr);
3854                 if (image_fd < 0) {
3855                         r = image_fd;
3856                         goto finish;
3857                 }
3858
3859                 r = dissect_image(image_fd,
3860                                   &root_device, &root_device_rw,
3861                                   &home_device, &home_device_rw,
3862                                   &srv_device, &srv_device_rw,
3863                                   &secondary);
3864                 if (r < 0)
3865                         goto finish;
3866         }
3867
3868         r = determine_uid_shift();
3869         if (r < 0)
3870                 goto finish;
3871
3872         interactive = isatty(STDIN_FILENO) > 0 && isatty(STDOUT_FILENO) > 0;
3873
3874         master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
3875         if (master < 0) {
3876                 r = log_error_errno(errno, "Failed to acquire pseudo tty: %m");
3877                 goto finish;
3878         }
3879
3880         r = ptsname_malloc(master, &console);
3881         if (r < 0) {
3882                 r = log_error_errno(r, "Failed to determine tty name: %m");
3883                 goto finish;
3884         }
3885
3886         if (unlockpt(master) < 0) {
3887                 r = log_error_errno(errno, "Failed to unlock tty: %m");
3888                 goto finish;
3889         }
3890
3891         if (!arg_quiet)
3892                 log_info("Spawning container %s on %s.\nPress ^] three times within 1s to kill container.",
3893                          arg_machine, arg_image ?: arg_directory);
3894
3895         assert_se(sigemptyset(&mask) == 0);
3896         sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
3897         assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
3898
3899         assert_se(sigemptyset(&mask_chld) == 0);
3900         assert_se(sigaddset(&mask_chld, SIGCHLD) == 0);
3901
3902         for (;;) {
3903                 _cleanup_close_pair_ int kmsg_socket_pair[2] = { -1, -1 }, rtnl_socket_pair[2] = { -1, -1 };
3904                 ContainerStatus container_status;
3905                 _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
3906                 struct sigaction sa = {
3907                         .sa_handler = nop_handler,
3908                         .sa_flags = SA_NOCLDSTOP,
3909                 };
3910
3911                 r = barrier_create(&barrier);
3912                 if (r < 0) {
3913                         log_error_errno(r, "Cannot initialize IPC barrier: %m");
3914                         goto finish;
3915                 }
3916
3917                 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
3918                         r = log_error_errno(errno, "Failed to create kmsg socket pair: %m");
3919                         goto finish;
3920                 }
3921
3922                 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, rtnl_socket_pair) < 0) {
3923                         r = log_error_errno(errno, "Failed to create rtnl socket pair: %m");
3924                         goto finish;
3925                 }
3926
3927                 /* Child can be killed before execv(), so handle SIGCHLD
3928                  * in order to interrupt parent's blocking calls and
3929                  * give it a chance to call wait() and terminate. */
3930                 r = sigprocmask(SIG_UNBLOCK, &mask_chld, NULL);
3931                 if (r < 0) {
3932                         r = log_error_errno(errno, "Failed to change the signal mask: %m");
3933                         goto finish;
3934                 }
3935
3936                 r = sigaction(SIGCHLD, &sa, NULL);
3937                 if (r < 0) {
3938                         r = log_error_errno(errno, "Failed to install SIGCHLD handler: %m");
3939                         goto finish;
3940                 }
3941
3942                 pid = raw_clone(SIGCHLD|CLONE_NEWNS|
3943                                 (arg_share_system ? 0 : CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS)|
3944                                 (arg_private_network ? CLONE_NEWNET : 0), NULL);
3945                 if (pid < 0) {
3946                         if (errno == EINVAL)
3947                                 r = log_error_errno(errno, "clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
3948                         else
3949                                 r = log_error_errno(errno, "clone() failed: %m");
3950
3951                         goto finish;
3952                 }
3953
3954                 if (pid == 0) {
3955                         /* child */
3956                         _cleanup_free_ char *home = NULL;
3957                         unsigned n_env = 2;
3958                         const char *envp[] = {
3959                                 "PATH=" DEFAULT_PATH_SPLIT_USR,
3960                                 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
3961                                 NULL, /* TERM */
3962                                 NULL, /* HOME */
3963                                 NULL, /* USER */
3964                                 NULL, /* LOGNAME */
3965                                 NULL, /* container_uuid */
3966                                 NULL, /* LISTEN_FDS */
3967                                 NULL, /* LISTEN_PID */
3968                                 NULL
3969                         };
3970                         char **env_use;
3971
3972                         barrier_set_role(&barrier, BARRIER_CHILD);
3973
3974                         envp[n_env] = strv_find_prefix(environ, "TERM=");
3975                         if (envp[n_env])
3976                                 n_env ++;
3977
3978                         master = safe_close(master);
3979
3980                         kmsg_socket_pair[0] = safe_close(kmsg_socket_pair[0]);
3981                         rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
3982
3983                         reset_all_signal_handlers();
3984                         reset_signal_mask();
3985
3986                         if (interactive) {
3987                                 close_nointr(STDIN_FILENO);
3988                                 close_nointr(STDOUT_FILENO);
3989                                 close_nointr(STDERR_FILENO);
3990
3991                                 r = open_terminal(console, O_RDWR);
3992                                 if (r != STDIN_FILENO) {
3993                                         if (r >= 0) {
3994                                                 safe_close(r);
3995                                                 r = -EINVAL;
3996                                         }
3997
3998                                         log_error_errno(r, "Failed to open console: %m");
3999                                         _exit(EXIT_FAILURE);
4000                                 }
4001
4002                                 if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
4003                                     dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO) {
4004                                         log_error_errno(errno, "Failed to duplicate console: %m");
4005                                         _exit(EXIT_FAILURE);
4006                                 }
4007                         }
4008
4009                         if (setsid() < 0) {
4010                                 log_error_errno(errno, "setsid() failed: %m");
4011                                 _exit(EXIT_FAILURE);
4012                         }
4013
4014                         if (reset_audit_loginuid() < 0)
4015                                 _exit(EXIT_FAILURE);
4016
4017                         if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
4018                                 log_error_errno(errno, "PR_SET_PDEATHSIG failed: %m");
4019                                 _exit(EXIT_FAILURE);
4020                         }
4021
4022                         if (arg_private_network)
4023                                 loopback_setup();
4024
4025                         /* Mark everything as slave, so that we still
4026                          * receive mounts from the real root, but don't
4027                          * propagate mounts to the real root. */
4028                         if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
4029                                 log_error_errno(errno, "MS_SLAVE|MS_REC failed: %m");
4030                                 _exit(EXIT_FAILURE);
4031                         }
4032
4033                         if (mount_devices(arg_directory,
4034                                           root_device, root_device_rw,
4035                                           home_device, home_device_rw,
4036                                           srv_device, srv_device_rw) < 0)
4037                                 _exit(EXIT_FAILURE);
4038
4039                         /* Turn directory into bind mount */
4040                         if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REC, NULL) < 0) {
4041                                 log_error_errno(errno, "Failed to make bind mount: %m");
4042                                 _exit(EXIT_FAILURE);
4043                         }
4044
4045                         r = setup_volatile(arg_directory);
4046                         if (r < 0)
4047                                 _exit(EXIT_FAILURE);
4048
4049                         if (setup_volatile_state(arg_directory) < 0)
4050                                 _exit(EXIT_FAILURE);
4051
4052                         r = base_filesystem_create(arg_directory);
4053                         if (r < 0)
4054                                 _exit(EXIT_FAILURE);
4055
4056                         if (arg_read_only) {
4057                                 r = bind_remount_recursive(arg_directory, true);
4058                                 if (r < 0) {
4059                                         log_error_errno(r, "Failed to make tree read-only: %m");
4060                                         _exit(EXIT_FAILURE);
4061                                 }
4062                         }
4063
4064                         if (mount_all(arg_directory) < 0)
4065                                 _exit(EXIT_FAILURE);
4066
4067                         if (copy_devnodes(arg_directory) < 0)
4068                                 _exit(EXIT_FAILURE);
4069
4070                         if (setup_ptmx(arg_directory) < 0)
4071                                 _exit(EXIT_FAILURE);
4072
4073                         dev_setup(arg_directory);
4074
4075                         if (setup_propagate(arg_directory) < 0)
4076                                 _exit(EXIT_FAILURE);
4077
4078                         if (setup_seccomp() < 0)
4079                                 _exit(EXIT_FAILURE);
4080
4081                         if (setup_dev_console(arg_directory, console) < 0)
4082                                 _exit(EXIT_FAILURE);
4083
4084                         if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
4085                                 _exit(EXIT_FAILURE);
4086                         kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
4087
4088                         if (send_rtnl(rtnl_socket_pair[1]) < 0)
4089                                 _exit(EXIT_FAILURE);
4090                         rtnl_socket_pair[1] = safe_close(rtnl_socket_pair[1]);
4091
4092                         /* Tell the parent that we are ready, and that
4093                          * it can cgroupify us so that we lack access
4094                          * to certain devices and resources. */
4095                         (void) barrier_place(&barrier); /* #1 */
4096
4097                         if (setup_boot_id(arg_directory) < 0)
4098                                 _exit(EXIT_FAILURE);
4099
4100                         if (setup_timezone(arg_directory) < 0)
4101                                 _exit(EXIT_FAILURE);
4102
4103                         if (setup_resolv_conf(arg_directory) < 0)
4104                                 _exit(EXIT_FAILURE);
4105
4106                         if (setup_journal(arg_directory) < 0)
4107                                 _exit(EXIT_FAILURE);
4108
4109                         if (mount_binds(arg_directory, arg_bind, false) < 0)
4110                                 _exit(EXIT_FAILURE);
4111
4112                         if (mount_binds(arg_directory, arg_bind_ro, true) < 0)
4113                                 _exit(EXIT_FAILURE);
4114
4115                         if (mount_tmpfs(arg_directory) < 0)
4116                                 _exit(EXIT_FAILURE);
4117
4118                         /* Wait until we are cgroup-ified, so that we
4119                          * can mount the right cgroup path writable */
4120                         (void) barrier_place_and_sync(&barrier); /* #2 */
4121
4122                         if (mount_cgroup(arg_directory) < 0)
4123                                 _exit(EXIT_FAILURE);
4124
4125                         if (chdir(arg_directory) < 0) {
4126                                 log_error_errno(errno, "chdir(%s) failed: %m", arg_directory);
4127                                 _exit(EXIT_FAILURE);
4128                         }
4129
4130                         if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) {
4131                                 log_error_errno(errno, "mount(MS_MOVE) failed: %m");
4132                                 _exit(EXIT_FAILURE);
4133                         }
4134
4135                         if (chroot(".") < 0) {
4136                                 log_error_errno(errno, "chroot() failed: %m");
4137                                 _exit(EXIT_FAILURE);
4138                         }
4139
4140                         if (chdir("/") < 0) {
4141                                 log_error_errno(errno, "chdir() failed: %m");
4142                                 _exit(EXIT_FAILURE);
4143                         }
4144
4145                         if (arg_userns) {
4146                                 if (unshare(CLONE_NEWUSER) < 0) {
4147                                         log_error_errno(errno, "unshare(CLONE_NEWUSER) failed: %m");
4148                                         _exit(EXIT_FAILURE);
4149                                 }
4150
4151                                 /* Tell the parent, that it now can
4152                                  * write the UID map. */
4153                                 (void) barrier_place(&barrier); /* #3 */
4154
4155                                 /* Wait until the parent wrote the UID
4156                                  * map */
4157                                 (void) barrier_place_and_sync(&barrier); /* #4 */
4158                         }
4159
4160                         umask(0022);
4161
4162                         if (drop_capabilities() < 0) {
4163                                 log_error_errno(errno, "drop_capabilities() failed: %m");
4164                                 _exit(EXIT_FAILURE);
4165                         }
4166
4167                         setup_hostname();
4168
4169                         if (arg_personality != 0xffffffffLU) {
4170                                 if (personality(arg_personality) < 0) {
4171                                         log_error_errno(errno, "personality() failed: %m");
4172                                         _exit(EXIT_FAILURE);
4173                                 }
4174                         } else if (secondary) {
4175                                 if (personality(PER_LINUX32) < 0) {
4176                                         log_error_errno(errno, "personality() failed: %m");
4177                                         _exit(EXIT_FAILURE);
4178                                 }
4179                         }
4180
4181 #ifdef HAVE_SELINUX
4182                         if (arg_selinux_context)
4183                                 if (setexeccon((security_context_t) arg_selinux_context) < 0) {
4184                                         log_error_errno(errno, "setexeccon(\"%s\") failed: %m", arg_selinux_context);
4185                                         _exit(EXIT_FAILURE);
4186                                 }
4187 #endif
4188
4189                         r = change_uid_gid(&home);
4190                         if (r < 0)
4191                                 _exit(EXIT_FAILURE);
4192
4193                         if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
4194                             (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
4195                             (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
4196                                 log_oom();
4197                                 _exit(EXIT_FAILURE);
4198                         }
4199
4200                         if (!sd_id128_equal(arg_uuid, SD_ID128_NULL)) {
4201                                 char as_uuid[37];
4202
4203                                 if (asprintf((char**)(envp + n_env++), "container_uuid=%s", id128_format_as_uuid(arg_uuid, as_uuid)) < 0) {
4204                                         log_oom();
4205                                         _exit(EXIT_FAILURE);
4206                                 }
4207                         }
4208
4209                         if (fdset_size(fds) > 0) {
4210                                 r = fdset_cloexec(fds, false);
4211                                 if (r < 0) {
4212                                         log_error_errno(r, "Failed to unset O_CLOEXEC for file descriptors.");
4213                                         _exit(EXIT_FAILURE);
4214                                 }
4215
4216                                 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", n_fd_passed) < 0) ||
4217                                     (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0)) {
4218                                         log_oom();
4219                                         _exit(EXIT_FAILURE);
4220                                 }
4221                         }
4222
4223                         if (!strv_isempty(arg_setenv)) {
4224                                 char **n;
4225
4226                                 n = strv_env_merge(2, envp, arg_setenv);
4227                                 if (!n) {
4228                                         log_oom();
4229                                         _exit(EXIT_FAILURE);
4230                                 }
4231
4232                                 env_use = n;
4233                         } else
4234                                 env_use = (char**) envp;
4235
4236                         /* Let the parent know that we are ready and
4237                          * wait until the parent is ready with the
4238                          * setup, too... */
4239                         (void) barrier_place_and_sync(&barrier); /* #5 */
4240
4241                         if (arg_boot) {
4242                                 char **a;
4243                                 size_t l;
4244
4245                                 /* Automatically search for the init system */
4246
4247                                 l = 1 + argc - optind;
4248                                 a = newa(char*, l + 1);
4249                                 memcpy(a + 1, argv + optind, l * sizeof(char*));
4250
4251                                 a[0] = (char*) "/usr/lib/systemd/systemd";
4252                                 execve(a[0], a, env_use);
4253
4254                                 a[0] = (char*) "/lib/systemd/systemd";
4255                                 execve(a[0], a, env_use);
4256
4257                                 a[0] = (char*) "/sbin/init";
4258                                 execve(a[0], a, env_use);
4259                         } else if (argc > optind)
4260                                 execvpe(argv[optind], argv + optind, env_use);
4261                         else {
4262                                 chdir(home ? home : "/root");
4263                                 execle("/bin/bash", "-bash", NULL, env_use);
4264                                 execle("/bin/sh", "-sh", NULL, env_use);
4265                         }
4266
4267                         log_error_errno(errno, "execv() failed: %m");
4268                         _exit(EXIT_FAILURE);
4269                 }
4270
4271                 barrier_set_role(&barrier, BARRIER_PARENT);
4272                 fdset_free(fds);
4273                 fds = NULL;
4274
4275                 kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
4276                 rtnl_socket_pair[1] = safe_close(rtnl_socket_pair[1]);
4277
4278                 (void) barrier_place(&barrier); /* #1 */
4279
4280                 /* Wait for the most basic Child-setup to be done,
4281                  * before we add hardware to it, and place it in a
4282                  * cgroup. */
4283                 if (barrier_sync(&barrier)) { /* #1 */
4284                         int ifi = 0;
4285
4286                         r = move_network_interfaces(pid);
4287                         if (r < 0)
4288                                 goto finish;
4289
4290                         r = setup_veth(pid, veth_name, &ifi);
4291                         if (r < 0)
4292                                 goto finish;
4293
4294                         r = setup_bridge(veth_name, &ifi);
4295                         if (r < 0)
4296                                 goto finish;
4297
4298                         r = setup_macvlan(pid);
4299                         if (r < 0)
4300                                 goto finish;
4301
4302                         r = setup_ipvlan(pid);
4303                         if (r < 0)
4304                                 goto finish;
4305
4306                         r = register_machine(pid, ifi);
4307                         if (r < 0)
4308                                 goto finish;
4309
4310                         /* Notify the child that the parent is ready with all
4311                          * its setup, and that the child can now hand over
4312                          * control to the code to run inside the container. */
4313                         (void) barrier_place(&barrier); /* #2 */
4314
4315                         if (arg_userns) {
4316                                 char uid_map[strlen("/proc//uid_map") + DECIMAL_STR_MAX(uid_t) + 1], line[DECIMAL_STR_MAX(uid_t)*3+3+1];
4317
4318                                 (void) barrier_place_and_sync(&barrier); /* #3 */
4319
4320                                 xsprintf(uid_map, "/proc/" PID_FMT "/uid_map", pid);
4321                                 xsprintf(line, UID_FMT " " UID_FMT " " UID_FMT "\n", 0, arg_uid_shift, arg_uid_range);
4322                                 r = write_string_file(uid_map, line);
4323                                 if (r < 0) {
4324                                         log_error_errno(r, "Failed to write UID map: %m");
4325                                         goto finish;
4326                                 }
4327
4328                                 /* We always assign the same UID and GID ranges */
4329                                 xsprintf(uid_map, "/proc/" PID_FMT "/gid_map", pid);
4330                                 r = write_string_file(uid_map, line);
4331                                 if (r < 0) {
4332                                         log_error_errno(r, "Failed to write GID map: %m");
4333                                         goto finish;
4334                                 }
4335
4336                                 (void) barrier_place(&barrier); /* #4 */
4337                         }
4338
4339                         /* Block SIGCHLD here, before notifying child.
4340                          * process_pty() will handle it with the other signals. */
4341                         r = sigprocmask(SIG_BLOCK, &mask_chld, NULL);
4342                         if (r < 0)
4343                                 goto finish;
4344
4345                         /* Reset signal to default */
4346                         r = default_signals(SIGCHLD, -1);
4347                         if (r < 0)
4348                                 goto finish;
4349
4350                         /* Let the child know that we are ready and wait until the child is completely ready now. */
4351                         if (barrier_place_and_sync(&barrier)) { /* #5 */
4352                                 _cleanup_event_unref_ sd_event *event = NULL;
4353                                 _cleanup_(pty_forward_freep) PTYForward *forward = NULL;
4354                                 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
4355                                 char last_char = 0;
4356
4357                                 sd_notifyf(false,
4358                                            "READY=1\n"
4359                                            "STATUS=Container running.\n"
4360                                            "X_NSPAWN_LEADER_PID=" PID_FMT, pid);
4361
4362                                 r = sd_event_new(&event);
4363                                 if (r < 0) {
4364                                         log_error_errno(r, "Failed to get default event source: %m");
4365                                         goto finish;
4366                                 }
4367
4368                                 if (arg_boot) {
4369                                         /* Try to kill the init system on SIGINT or SIGTERM */
4370                                         sd_event_add_signal(event, NULL, SIGINT, on_orderly_shutdown, UINT32_TO_PTR(pid));
4371                                         sd_event_add_signal(event, NULL, SIGTERM, on_orderly_shutdown, UINT32_TO_PTR(pid));
4372                                 } else {
4373                                         /* Immediately exit */
4374                                         sd_event_add_signal(event, NULL, SIGINT, NULL, NULL);
4375                                         sd_event_add_signal(event, NULL, SIGTERM, NULL, NULL);
4376                                 }
4377
4378                                 /* simply exit on sigchld */
4379                                 sd_event_add_signal(event, NULL, SIGCHLD, NULL, NULL);
4380
4381                                 if (arg_expose_ports) {
4382                                         r = watch_rtnl(event, rtnl_socket_pair[0], &exposed, &rtnl);
4383                                         if (r < 0)
4384                                                 goto finish;
4385
4386                                         (void) expose_ports(rtnl, &exposed);
4387                                 }
4388
4389                                 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
4390
4391                                 r = pty_forward_new(event, master, true, !interactive, &forward);
4392                                 if (r < 0) {
4393                                         log_error_errno(r, "Failed to create PTY forwarder: %m");
4394                                         goto finish;
4395                                 }
4396
4397                                 r = sd_event_loop(event);
4398                                 if (r < 0) {
4399                                         log_error_errno(r, "Failed to run event loop: %m");
4400                                         goto finish;
4401                                 }
4402
4403                                 pty_forward_get_last_char(forward, &last_char);
4404
4405                                 forward = pty_forward_free(forward);
4406
4407                                 if (!arg_quiet && last_char != '\n')
4408                                         putc('\n', stdout);
4409
4410                                 /* Kill if it is not dead yet anyway */
4411                                 terminate_machine(pid);
4412                         }
4413                 }
4414
4415                 /* Normally redundant, but better safe than sorry */
4416                 kill(pid, SIGKILL);
4417
4418                 r = wait_for_container(pid, &container_status);
4419                 pid = 0;
4420
4421                 if (r < 0)
4422                         /* We failed to wait for the container, or the
4423                          * container exited abnormally */
4424                         goto finish;
4425                 else if (r > 0 || container_status == CONTAINER_TERMINATED){
4426                         /* The container exited with a non-zero
4427                          * status, or with zero status and no reboot
4428                          * was requested. */
4429                         ret = r;
4430                         break;
4431                 }
4432
4433                 /* CONTAINER_REBOOTED, loop again */
4434
4435                 if (arg_keep_unit) {
4436                         /* Special handling if we are running as a
4437                          * service: instead of simply restarting the
4438                          * machine we want to restart the entire
4439                          * service, so let's inform systemd about this
4440                          * with the special exit code 133. The service
4441                          * file uses RestartForceExitStatus=133 so
4442                          * that this results in a full nspawn
4443                          * restart. This is necessary since we might
4444                          * have cgroup parameters set we want to have
4445                          * flushed out. */
4446                         ret = 133;
4447                         r = 0;
4448                         break;
4449                 }
4450
4451                 flush_ports(&exposed);
4452         }
4453
4454 finish:
4455         sd_notify(false,
4456                   "STOPPING=1\n"
4457                   "STATUS=Terminating...");
4458
4459         loop_remove(loop_nr, &image_fd);
4460
4461         if (pid > 0)
4462                 kill(pid, SIGKILL);
4463
4464         if (remove_subvol && arg_directory) {
4465                 int k;
4466
4467                 k = btrfs_subvol_remove(arg_directory);
4468                 if (k < 0)
4469                         log_warning_errno(k, "Cannot remove subvolume '%s', ignoring: %m", arg_directory);
4470         }
4471
4472         if (arg_machine) {
4473                 const char *p;
4474
4475                 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
4476                 (void) rm_rf(p, false, true, false);
4477         }
4478
4479         free(arg_directory);
4480         free(arg_template);
4481         free(arg_image);
4482         free(arg_machine);
4483         free(arg_user);
4484         strv_free(arg_setenv);
4485         strv_free(arg_network_interfaces);
4486         strv_free(arg_network_macvlan);
4487         strv_free(arg_network_ipvlan);
4488         strv_free(arg_bind);
4489         strv_free(arg_bind_ro);
4490         strv_free(arg_tmpfs);
4491
4492         flush_ports(&exposed);
4493
4494         while (arg_expose_ports) {
4495                 ExposePort *p = arg_expose_ports;
4496                 LIST_REMOVE(ports, arg_expose_ports, p);
4497                 free(p);
4498         }
4499
4500         return r < 0 ? EXIT_FAILURE : ret;
4501 }