chiark / gitweb /
nspawn: fix whitespace and typo in partition table blurb
[elogind.git] / src / nspawn / nspawn.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4   This file is part of systemd.
5
6   Copyright 2010 Lennart Poettering
7
8   systemd is free software; you can redistribute it and/or modify it
9   under the terms of the GNU Lesser General Public License as published by
10   the Free Software Foundation; either version 2.1 of the License, or
11   (at your option) any later version.
12
13   systemd is distributed in the hope that it will be useful, but
14   WITHOUT ANY WARRANTY; without even the implied warranty of
15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16   Lesser General Public License for more details.
17
18   You should have received a copy of the GNU Lesser General Public License
19   along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <signal.h>
23 #include <sched.h>
24 #include <unistd.h>
25 #include <sys/types.h>
26 #include <sys/syscall.h>
27 #include <sys/mount.h>
28 #include <sys/wait.h>
29 #include <stdlib.h>
30 #include <string.h>
31 #include <stdio.h>
32 #include <errno.h>
33 #include <sys/prctl.h>
34 #include <getopt.h>
35 #include <termios.h>
36 #include <sys/signalfd.h>
37 #include <grp.h>
38 #include <linux/fs.h>
39 #include <sys/un.h>
40 #include <sys/socket.h>
41 #include <linux/netlink.h>
42 #include <net/if.h>
43 #include <linux/veth.h>
44 #include <sys/personality.h>
45 #include <linux/loop.h>
46 #include <poll.h>
47 #include <sys/file.h>
48
49 #ifdef HAVE_SELINUX
50 #include <selinux/selinux.h>
51 #endif
52
53 #ifdef HAVE_SECCOMP
54 #include <seccomp.h>
55 #endif
56
57 #ifdef HAVE_BLKID
58 #include <blkid/blkid.h>
59 #endif
60
61 #include "sd-daemon.h"
62 #include "sd-bus.h"
63 #include "sd-id128.h"
64 #include "sd-rtnl.h"
65 #include "log.h"
66 #include "util.h"
67 #include "mkdir.h"
68 #include "macro.h"
69 #include "audit.h"
70 #include "missing.h"
71 #include "cgroup-util.h"
72 #include "strv.h"
73 #include "path-util.h"
74 #include "loopback-setup.h"
75 #include "dev-setup.h"
76 #include "fdset.h"
77 #include "build.h"
78 #include "fileio.h"
79 #include "bus-util.h"
80 #include "bus-error.h"
81 #include "ptyfwd.h"
82 #include "bus-kernel.h"
83 #include "env-util.h"
84 #include "def.h"
85 #include "rtnl-util.h"
86 #include "udev-util.h"
87 #include "blkid-util.h"
88 #include "gpt.h"
89 #include "siphash24.h"
90 #include "copy.h"
91 #include "base-filesystem.h"
92 #include "barrier.h"
93 #include "event-util.h"
94 #include "capability.h"
95 #include "cap-list.h"
96 #include "btrfs-util.h"
97 #include "machine-image.h"
98 #include "list.h"
99 #include "in-addr-util.h"
100 #include "fw-util.h"
101 #include "local-addresses.h"
102
103 #ifdef HAVE_SECCOMP
104 #include "seccomp-util.h"
105 #endif
106
107 typedef struct ExposePort {
108         int protocol;
109         uint16_t host_port;
110         uint16_t container_port;
111         LIST_FIELDS(struct ExposePort, ports);
112 } ExposePort;
113
/* NOTE(review): no user of this type is visible in this part of the file.
 * Going by the enumerator names it presumably distinguishes a container that
 * terminated for good from one that asked to be rebooted -- confirm against
 * the code that assigns it. */
typedef enum ContainerStatus {
        CONTAINER_TERMINATED = 0,
        CONTAINER_REBOOTED   = 1
} ContainerStatus;
118
/* Mode selected with --link-journal= (or -j). The "try-guest" and "try-host"
 * spellings select LINK_GUEST and LINK_HOST respectively and additionally set
 * the separate arg_link_journal_try flag. */
typedef enum LinkJournal {
        LINK_NO    = 0,  /* --link-journal=no */
        LINK_AUTO  = 1,  /* --link-journal=auto; also the built-in default */
        LINK_HOST  = 2,  /* --link-journal=host or try-host */
        LINK_GUEST = 3   /* --link-journal=guest or try-guest, and -j */
} LinkJournal;
125
/* Mode selected with --volatile[=MODE]. */
typedef enum Volatile {
        VOLATILE_NO    = 0,  /* default, or an explicit boolean false argument */
        VOLATILE_YES   = 1,  /* option given without argument, or boolean true */
        VOLATILE_STATE = 2,  /* --volatile=state */
} Volatile;
131
132 static char *arg_directory = NULL;
133 static char *arg_template = NULL;
134 static char *arg_user = NULL;
135 static sd_id128_t arg_uuid = {};
136 static char *arg_machine = NULL;
137 static const char *arg_selinux_context = NULL;
138 static const char *arg_selinux_apifs_context = NULL;
139 static const char *arg_slice = NULL;
140 static bool arg_private_network = false;
141 static bool arg_read_only = false;
142 static bool arg_boot = false;
143 static bool arg_ephemeral = false;
144 static LinkJournal arg_link_journal = LINK_AUTO;
145 static bool arg_link_journal_try = false;
146 static uint64_t arg_retain =
147         (1ULL << CAP_CHOWN) |
148         (1ULL << CAP_DAC_OVERRIDE) |
149         (1ULL << CAP_DAC_READ_SEARCH) |
150         (1ULL << CAP_FOWNER) |
151         (1ULL << CAP_FSETID) |
152         (1ULL << CAP_IPC_OWNER) |
153         (1ULL << CAP_KILL) |
154         (1ULL << CAP_LEASE) |
155         (1ULL << CAP_LINUX_IMMUTABLE) |
156         (1ULL << CAP_NET_BIND_SERVICE) |
157         (1ULL << CAP_NET_BROADCAST) |
158         (1ULL << CAP_NET_RAW) |
159         (1ULL << CAP_SETGID) |
160         (1ULL << CAP_SETFCAP) |
161         (1ULL << CAP_SETPCAP) |
162         (1ULL << CAP_SETUID) |
163         (1ULL << CAP_SYS_ADMIN) |
164         (1ULL << CAP_SYS_CHROOT) |
165         (1ULL << CAP_SYS_NICE) |
166         (1ULL << CAP_SYS_PTRACE) |
167         (1ULL << CAP_SYS_TTY_CONFIG) |
168         (1ULL << CAP_SYS_RESOURCE) |
169         (1ULL << CAP_SYS_BOOT) |
170         (1ULL << CAP_AUDIT_WRITE) |
171         (1ULL << CAP_AUDIT_CONTROL) |
172         (1ULL << CAP_MKNOD);
173 static char **arg_bind = NULL;
174 static char **arg_bind_ro = NULL;
175 static char **arg_tmpfs = NULL;
176 static char **arg_setenv = NULL;
177 static bool arg_quiet = false;
178 static bool arg_share_system = false;
179 static bool arg_register = true;
180 static bool arg_keep_unit = false;
181 static char **arg_network_interfaces = NULL;
182 static char **arg_network_macvlan = NULL;
183 static char **arg_network_ipvlan = NULL;
184 static bool arg_network_veth = false;
185 static const char *arg_network_bridge = NULL;
186 static unsigned long arg_personality = 0xffffffffLU;
187 static char *arg_image = NULL;
188 static Volatile arg_volatile = VOLATILE_NO;
189 static ExposePort *arg_expose_ports = NULL;
190 static char **arg_property = NULL;
191 static uid_t arg_uid_shift = UID_INVALID, arg_uid_range = 0x10000U;
192 static bool arg_userns = false;
193
/* Prints the usage synopsis and the list of supported command line options
 * to standard output. The program name shown in the synopsis is taken from
 * program_invocation_short_name.
 *
 * Keep this text in sync with the options[] table in parse_argv(): every
 * long option accepted there should have an entry here. */
static void help(void) {
        printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
               "Spawn a minimal namespace container for debugging, testing and building.\n\n"
               "  -h --help                 Show this help\n"
               "     --version              Print version string\n"
               "  -q --quiet                Do not show status information\n"
               "  -D --directory=PATH       Root directory for the container\n"
               "     --template=PATH        Initialize root directory from template directory,\n"
               "                            if missing\n"
               "  -x --ephemeral            Run container with snapshot of root directory, and\n"
               "                            remove it after exit\n"
               "  -i --image=PATH           File system device or disk image for the container\n"
               "  -b --boot                 Boot up full system (i.e. invoke init)\n"
               "  -u --user=USER            Run the command under specified user or uid\n"
               "  -M --machine=NAME         Set the machine name for the container\n"
               "     --uuid=UUID            Set a specific machine UUID for the container\n"
               "  -S --slice=SLICE          Place the container in the specified slice\n"
               "     --property=NAME=VALUE  Set scope unit property\n"
               "     --private-network      Disable network in container\n"
               "     --network-interface=INTERFACE\n"
               "                            Assign an existing network interface to the\n"
               "                            container\n"
               "     --network-macvlan=INTERFACE\n"
               "                            Create a macvlan network interface based on an\n"
               "                            existing network interface to the container\n"
               "     --network-ipvlan=INTERFACE\n"
               "                            Create an ipvlan network interface based on an\n"
               "                            existing network interface to the container\n"
               "  -n --network-veth         Add a virtual ethernet connection between host\n"
               "                            and container\n"
               "     --network-bridge=INTERFACE\n"
               "                            Add a virtual ethernet connection between host\n"
               "                            and container and add it to an existing bridge on\n"
               "                            the host\n"
               "     --private-users[=UIDBASE[:NUIDS]]\n"
               "                            Run within user namespace\n"
               "  -p --port=[PROTOCOL:]HOSTPORT[:CONTAINERPORT]\n"
               "                            Expose a container IP port on the host\n"
               "  -Z --selinux-context=SECLABEL\n"
               "                            Set the SELinux security context to be used by\n"
               "                            processes in the container\n"
               "  -L --selinux-apifs-context=SECLABEL\n"
               "                            Set the SELinux security context to be used by\n"
               "                            API/tmpfs file systems in the container\n"
               "     --capability=CAP       In addition to the default, retain specified\n"
               "                            capability\n"
               "     --drop-capability=CAP  Drop the specified capability from the default set\n"
               "     --link-journal=MODE    Link up guest journal, one of no, auto, guest, host,\n"
               "                            try-guest, try-host\n"
               "  -j                        Equivalent to --link-journal=try-guest\n"
               "     --read-only            Mount the root directory read-only\n"
               "     --bind=PATH[:PATH]     Bind mount a file or directory from the host into\n"
               "                            the container\n"
               "     --bind-ro=PATH[:PATH]  Similar, but creates a read-only bind mount\n"
               "     --tmpfs=PATH:[OPTIONS] Mount an empty tmpfs to the specified directory\n"
               "     --setenv=NAME=VALUE    Pass an environment variable to PID 1\n"
               "     --share-system         Share system namespaces with host\n"
               "     --register=BOOLEAN     Register container as machine\n"
               "     --keep-unit            Do not register a scope for the machine, reuse\n"
               "                            the service unit nspawn is running in\n"
               /* --personality= is accepted by parse_argv() and was missing here */
               "     --personality=ARCH     Pick personality for this container\n"
               "     --volatile[=MODE]      Run the system in volatile mode\n"
               , program_invocation_short_name);
}
257
/* Stores a cleaned-up version of "path" in *b, releasing whatever *b held
 * before.
 *
 * The path is resolved with canonicalize_file_name(). If that fails because
 * the path does not exist (ENOENT), the path is instead made absolute
 * relative to the current working directory. The result is then passed
 * through path_kill_slashes() before it is stored.
 *
 * Returns 0 on success, -ENOMEM if the fallback could not produce a path,
 * or the negated errno of canonicalize_file_name() for any failure other
 * than ENOENT. On failure *b is left untouched. */
static int set_sanitized_path(char **b, const char *path) {
        char *resolved;

        assert(b);
        assert(path);

        resolved = canonicalize_file_name(path);

        /* Any resolution failure other than "does not exist" is fatal */
        if (!resolved && errno != ENOENT)
                return -errno;

        if (!resolved) {
                /* Nonexistent path: fall back to a purely textual absolute path */
                resolved = path_make_absolute_cwd(path);
                if (!resolved)
                        return -ENOMEM;
        }

        free(*b);
        *b = path_kill_slashes(resolved);

        return 0;
}
278
279 static int parse_argv(int argc, char *argv[]) {
280
281         enum {
282                 ARG_VERSION = 0x100,
283                 ARG_PRIVATE_NETWORK,
284                 ARG_UUID,
285                 ARG_READ_ONLY,
286                 ARG_CAPABILITY,
287                 ARG_DROP_CAPABILITY,
288                 ARG_LINK_JOURNAL,
289                 ARG_BIND,
290                 ARG_BIND_RO,
291                 ARG_TMPFS,
292                 ARG_SETENV,
293                 ARG_SHARE_SYSTEM,
294                 ARG_REGISTER,
295                 ARG_KEEP_UNIT,
296                 ARG_NETWORK_INTERFACE,
297                 ARG_NETWORK_MACVLAN,
298                 ARG_NETWORK_IPVLAN,
299                 ARG_NETWORK_BRIDGE,
300                 ARG_PERSONALITY,
301                 ARG_VOLATILE,
302                 ARG_TEMPLATE,
303                 ARG_PROPERTY,
304                 ARG_PRIVATE_USERS,
305         };
306
307         static const struct option options[] = {
308                 { "help",                  no_argument,       NULL, 'h'                   },
309                 { "version",               no_argument,       NULL, ARG_VERSION           },
310                 { "directory",             required_argument, NULL, 'D'                   },
311                 { "template",              required_argument, NULL, ARG_TEMPLATE          },
312                 { "ephemeral",             no_argument,       NULL, 'x'                   },
313                 { "user",                  required_argument, NULL, 'u'                   },
314                 { "private-network",       no_argument,       NULL, ARG_PRIVATE_NETWORK   },
315                 { "boot",                  no_argument,       NULL, 'b'                   },
316                 { "uuid",                  required_argument, NULL, ARG_UUID              },
317                 { "read-only",             no_argument,       NULL, ARG_READ_ONLY         },
318                 { "capability",            required_argument, NULL, ARG_CAPABILITY        },
319                 { "drop-capability",       required_argument, NULL, ARG_DROP_CAPABILITY   },
320                 { "link-journal",          required_argument, NULL, ARG_LINK_JOURNAL      },
321                 { "bind",                  required_argument, NULL, ARG_BIND              },
322                 { "bind-ro",               required_argument, NULL, ARG_BIND_RO           },
323                 { "tmpfs",                 required_argument, NULL, ARG_TMPFS             },
324                 { "machine",               required_argument, NULL, 'M'                   },
325                 { "slice",                 required_argument, NULL, 'S'                   },
326                 { "setenv",                required_argument, NULL, ARG_SETENV            },
327                 { "selinux-context",       required_argument, NULL, 'Z'                   },
328                 { "selinux-apifs-context", required_argument, NULL, 'L'                   },
329                 { "quiet",                 no_argument,       NULL, 'q'                   },
330                 { "share-system",          no_argument,       NULL, ARG_SHARE_SYSTEM      },
331                 { "register",              required_argument, NULL, ARG_REGISTER          },
332                 { "keep-unit",             no_argument,       NULL, ARG_KEEP_UNIT         },
333                 { "network-interface",     required_argument, NULL, ARG_NETWORK_INTERFACE },
334                 { "network-macvlan",       required_argument, NULL, ARG_NETWORK_MACVLAN   },
335                 { "network-ipvlan",        required_argument, NULL, ARG_NETWORK_IPVLAN    },
336                 { "network-veth",          no_argument,       NULL, 'n'                   },
337                 { "network-bridge",        required_argument, NULL, ARG_NETWORK_BRIDGE    },
338                 { "personality",           required_argument, NULL, ARG_PERSONALITY       },
339                 { "image",                 required_argument, NULL, 'i'                   },
340                 { "volatile",              optional_argument, NULL, ARG_VOLATILE          },
341                 { "port",                  required_argument, NULL, 'p'                   },
342                 { "property",              required_argument, NULL, ARG_PROPERTY          },
343                 { "private-users",         optional_argument, NULL, ARG_PRIVATE_USERS     },
344                 {}
345         };
346
347         int c, r;
348         uint64_t plus = 0, minus = 0;
349
350         assert(argc >= 0);
351         assert(argv);
352
353         while ((c = getopt_long(argc, argv, "+hD:u:bL:M:jS:Z:qi:xp:n", options, NULL)) >= 0)
354
355                 switch (c) {
356
357                 case 'h':
358                         help();
359                         return 0;
360
361                 case ARG_VERSION:
362                         puts(PACKAGE_STRING);
363                         puts(SYSTEMD_FEATURES);
364                         return 0;
365
366                 case 'D':
367                         r = set_sanitized_path(&arg_directory, optarg);
368                         if (r < 0)
369                                 return log_error_errno(r, "Invalid root directory: %m");
370
371                         break;
372
373                 case ARG_TEMPLATE:
374                         r = set_sanitized_path(&arg_template, optarg);
375                         if (r < 0)
376                                 return log_error_errno(r, "Invalid template directory: %m");
377
378                         break;
379
380                 case 'i':
381                         r = set_sanitized_path(&arg_image, optarg);
382                         if (r < 0)
383                                 return log_error_errno(r, "Invalid image path: %m");
384
385                         break;
386
387                 case 'x':
388                         arg_ephemeral = true;
389                         break;
390
391                 case 'u':
392                         free(arg_user);
393                         arg_user = strdup(optarg);
394                         if (!arg_user)
395                                 return log_oom();
396
397                         break;
398
399                 case ARG_NETWORK_BRIDGE:
400                         arg_network_bridge = optarg;
401
402                         /* fall through */
403
404                 case 'n':
405                         arg_network_veth = true;
406                         arg_private_network = true;
407                         break;
408
409                 case ARG_NETWORK_INTERFACE:
410                         if (strv_extend(&arg_network_interfaces, optarg) < 0)
411                                 return log_oom();
412
413                         arg_private_network = true;
414                         break;
415
416                 case ARG_NETWORK_MACVLAN:
417                         if (strv_extend(&arg_network_macvlan, optarg) < 0)
418                                 return log_oom();
419
420                         arg_private_network = true;
421                         break;
422
423                 case ARG_NETWORK_IPVLAN:
424                         if (strv_extend(&arg_network_ipvlan, optarg) < 0)
425                                 return log_oom();
426
427                         /* fall through */
428
429                 case ARG_PRIVATE_NETWORK:
430                         arg_private_network = true;
431                         break;
432
433                 case 'b':
434                         arg_boot = true;
435                         break;
436
437                 case ARG_UUID:
438                         r = sd_id128_from_string(optarg, &arg_uuid);
439                         if (r < 0) {
440                                 log_error("Invalid UUID: %s", optarg);
441                                 return r;
442                         }
443                         break;
444
445                 case 'S':
446                         arg_slice = optarg;
447                         break;
448
449                 case 'M':
450                         if (isempty(optarg)) {
451                                 free(arg_machine);
452                                 arg_machine = NULL;
453                         } else {
454                                 if (!machine_name_is_valid(optarg)) {
455                                         log_error("Invalid machine name: %s", optarg);
456                                         return -EINVAL;
457                                 }
458
459                                 r = free_and_strdup(&arg_machine, optarg);
460                                 if (r < 0)
461                                         return log_oom();
462
463                                 break;
464                         }
465
466                 case 'Z':
467                         arg_selinux_context = optarg;
468                         break;
469
470                 case 'L':
471                         arg_selinux_apifs_context = optarg;
472                         break;
473
474                 case ARG_READ_ONLY:
475                         arg_read_only = true;
476                         break;
477
478                 case ARG_CAPABILITY:
479                 case ARG_DROP_CAPABILITY: {
480                         const char *state, *word;
481                         size_t length;
482
483                         FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
484                                 _cleanup_free_ char *t;
485
486                                 t = strndup(word, length);
487                                 if (!t)
488                                         return log_oom();
489
490                                 if (streq(t, "all")) {
491                                         if (c == ARG_CAPABILITY)
492                                                 plus = (uint64_t) -1;
493                                         else
494                                                 minus = (uint64_t) -1;
495                                 } else {
496                                         int cap;
497
498                                         cap = capability_from_name(t);
499                                         if (cap < 0) {
500                                                 log_error("Failed to parse capability %s.", t);
501                                                 return -EINVAL;
502                                         }
503
504                                         if (c == ARG_CAPABILITY)
505                                                 plus |= 1ULL << (uint64_t) cap;
506                                         else
507                                                 minus |= 1ULL << (uint64_t) cap;
508                                 }
509                         }
510
511                         break;
512                 }
513
514                 case 'j':
515                         arg_link_journal = LINK_GUEST;
516                         arg_link_journal_try = true;
517                         break;
518
519                 case ARG_LINK_JOURNAL:
520                         if (streq(optarg, "auto")) {
521                                 arg_link_journal = LINK_AUTO;
522                                 arg_link_journal_try = false;
523                         } else if (streq(optarg, "no")) {
524                                 arg_link_journal = LINK_NO;
525                                 arg_link_journal_try = false;
526                         } else if (streq(optarg, "guest")) {
527                                 arg_link_journal = LINK_GUEST;
528                                 arg_link_journal_try = false;
529                         } else if (streq(optarg, "host")) {
530                                 arg_link_journal = LINK_HOST;
531                                 arg_link_journal_try = false;
532                         } else if (streq(optarg, "try-guest")) {
533                                 arg_link_journal = LINK_GUEST;
534                                 arg_link_journal_try = true;
535                         } else if (streq(optarg, "try-host")) {
536                                 arg_link_journal = LINK_HOST;
537                                 arg_link_journal_try = true;
538                         } else {
539                                 log_error("Failed to parse link journal mode %s", optarg);
540                                 return -EINVAL;
541                         }
542
543                         break;
544
545                 case ARG_BIND:
546                 case ARG_BIND_RO: {
547                         _cleanup_free_ char *a = NULL, *b = NULL;
548                         char *e;
549                         char ***x;
550
551                         x = c == ARG_BIND ? &arg_bind : &arg_bind_ro;
552
553                         e = strchr(optarg, ':');
554                         if (e) {
555                                 a = strndup(optarg, e - optarg);
556                                 b = strdup(e + 1);
557                         } else {
558                                 a = strdup(optarg);
559                                 b = strdup(optarg);
560                         }
561
562                         if (!a || !b)
563                                 return log_oom();
564
565                         if (!path_is_absolute(a) || !path_is_absolute(b)) {
566                                 log_error("Invalid bind mount specification: %s", optarg);
567                                 return -EINVAL;
568                         }
569
570                         r = strv_extend(x, a);
571                         if (r < 0)
572                                 return log_oom();
573
574                         r = strv_extend(x, b);
575                         if (r < 0)
576                                 return log_oom();
577
578                         break;
579                 }
580
581                 case ARG_TMPFS: {
582                         _cleanup_free_ char *a = NULL, *b = NULL;
583                         char *e;
584
585                         e = strchr(optarg, ':');
586                         if (e) {
587                                 a = strndup(optarg, e - optarg);
588                                 b = strdup(e + 1);
589                         } else {
590                                 a = strdup(optarg);
591                                 b = strdup("mode=0755");
592                         }
593
594                         if (!a || !b)
595                                 return log_oom();
596
597                         if (!path_is_absolute(a)) {
598                                 log_error("Invalid tmpfs specification: %s", optarg);
599                                 return -EINVAL;
600                         }
601
602                         r = strv_push(&arg_tmpfs, a);
603                         if (r < 0)
604                                 return log_oom();
605
606                         a = NULL;
607
608                         r = strv_push(&arg_tmpfs, b);
609                         if (r < 0)
610                                 return log_oom();
611
612                         b = NULL;
613
614                         break;
615                 }
616
617                 case ARG_SETENV: {
618                         char **n;
619
620                         if (!env_assignment_is_valid(optarg)) {
621                                 log_error("Environment variable assignment '%s' is not valid.", optarg);
622                                 return -EINVAL;
623                         }
624
625                         n = strv_env_set(arg_setenv, optarg);
626                         if (!n)
627                                 return log_oom();
628
629                         strv_free(arg_setenv);
630                         arg_setenv = n;
631                         break;
632                 }
633
634                 case 'q':
635                         arg_quiet = true;
636                         break;
637
638                 case ARG_SHARE_SYSTEM:
639                         arg_share_system = true;
640                         break;
641
642                 case ARG_REGISTER:
643                         r = parse_boolean(optarg);
644                         if (r < 0) {
645                                 log_error("Failed to parse --register= argument: %s", optarg);
646                                 return r;
647                         }
648
649                         arg_register = r;
650                         break;
651
652                 case ARG_KEEP_UNIT:
653                         arg_keep_unit = true;
654                         break;
655
656                 case ARG_PERSONALITY:
657
658                         arg_personality = personality_from_string(optarg);
659                         if (arg_personality == 0xffffffffLU) {
660                                 log_error("Unknown or unsupported personality '%s'.", optarg);
661                                 return -EINVAL;
662                         }
663
664                         break;
665
666                 case ARG_VOLATILE:
667
668                         if (!optarg)
669                                 arg_volatile = VOLATILE_YES;
670                         else {
671                                 r = parse_boolean(optarg);
672                                 if (r < 0) {
673                                         if (streq(optarg, "state"))
674                                                 arg_volatile = VOLATILE_STATE;
675                                         else {
676                                                 log_error("Failed to parse --volatile= argument: %s", optarg);
677                                                 return r;
678                                         }
679                                 } else
680                                         arg_volatile = r ? VOLATILE_YES : VOLATILE_NO;
681                         }
682
683                         break;
684
685                 case 'p': {
686                         const char *split, *e;
687                         uint16_t container_port, host_port;
688                         int protocol;
689                         ExposePort *p;
690
691                         if ((e = startswith(optarg, "tcp:")))
692                                 protocol = IPPROTO_TCP;
693                         else if ((e = startswith(optarg, "udp:")))
694                                 protocol = IPPROTO_UDP;
695                         else {
696                                 e = optarg;
697                                 protocol = IPPROTO_TCP;
698                         }
699
700                         split = strchr(e, ':');
701                         if (split) {
702                                 char v[split - e + 1];
703
704                                 memcpy(v, e, split - e);
705                                 v[split - e] = 0;
706
707                                 r = safe_atou16(v, &host_port);
708                                 if (r < 0 || host_port <= 0) {
709                                         log_error("Failed to parse host port: %s", optarg);
710                                         return -EINVAL;
711                                 }
712
713                                 r = safe_atou16(split + 1, &container_port);
714                         } else {
715                                 r = safe_atou16(e, &container_port);
716                                 host_port = container_port;
717                         }
718
719                         if (r < 0 || container_port <= 0) {
720                                 log_error("Failed to parse host port: %s", optarg);
721                                 return -EINVAL;
722                         }
723
724                         LIST_FOREACH(ports, p, arg_expose_ports) {
725                                 if (p->protocol == protocol && p->host_port == host_port) {
726                                         log_error("Duplicate port specification: %s", optarg);
727                                         return -EINVAL;
728                                 }
729                         }
730
731                         p = new(ExposePort, 1);
732                         if (!p)
733                                 return log_oom();
734
735                         p->protocol = protocol;
736                         p->host_port = host_port;
737                         p->container_port = container_port;
738
739                         LIST_PREPEND(ports, arg_expose_ports, p);
740
741                         break;
742                 }
743
744                 case ARG_PROPERTY:
745                         if (strv_extend(&arg_property, optarg) < 0)
746                                 return log_oom();
747
748                         break;
749
750                 case ARG_PRIVATE_USERS:
751                         if (optarg) {
752                                 _cleanup_free_ char *buffer = NULL;
753                                 const char *range, *shift;
754
755                                 range = strchr(optarg, ':');
756                                 if (range) {
757                                         buffer = strndup(optarg, range - optarg);
758                                         if (!buffer)
759                                                 return log_oom();
760                                         shift = buffer;
761
762                                         range++;
763                                         if (safe_atou32(range, &arg_uid_range) < 0 || arg_uid_range <= 0) {
764                                                 log_error("Failed to parse UID range: %s", range);
765                                                 return -EINVAL;
766                                         }
767                                 } else
768                                         shift = optarg;
769
770                                 if (parse_uid(shift, &arg_uid_shift) < 0) {
771                                         log_error("Failed to parse UID: %s", optarg);
772                                         return -EINVAL;
773                                 }
774                         }
775
776                         arg_userns = true;
777                         break;
778
779                 case '?':
780                         return -EINVAL;
781
782                 default:
783                         assert_not_reached("Unhandled option");
784                 }
785
786         if (arg_share_system)
787                 arg_register = false;
788
789         if (arg_boot && arg_share_system) {
790                 log_error("--boot and --share-system may not be combined.");
791                 return -EINVAL;
792         }
793
794         if (arg_keep_unit && cg_pid_get_owner_uid(0, NULL) >= 0) {
795                 log_error("--keep-unit may not be used when invoked from a user session.");
796                 return -EINVAL;
797         }
798
799         if (arg_directory && arg_image) {
800                 log_error("--directory= and --image= may not be combined.");
801                 return -EINVAL;
802         }
803
804         if (arg_template && arg_image) {
805                 log_error("--template= and --image= may not be combined.");
806                 return -EINVAL;
807         }
808
809         if (arg_template && !(arg_directory || arg_machine)) {
810                 log_error("--template= needs --directory= or --machine=.");
811                 return -EINVAL;
812         }
813
814         if (arg_ephemeral && arg_template) {
815                 log_error("--ephemeral and --template= may not be combined.");
816                 return -EINVAL;
817         }
818
819         if (arg_ephemeral && arg_image) {
820                 log_error("--ephemeral and --image= may not be combined.");
821                 return -EINVAL;
822         }
823
824         if (arg_ephemeral && !IN_SET(arg_link_journal, LINK_NO, LINK_AUTO)) {
825                 log_error("--ephemeral and --link-journal= may not be combined.");
826                 return -EINVAL;
827         }
828
829         if (arg_volatile != VOLATILE_NO && arg_read_only) {
830                 log_error("Cannot combine --read-only with --volatile. Note that --volatile already implies a read-only base hierarchy.");
831                 return -EINVAL;
832         }
833
834         if (arg_expose_ports && !arg_private_network) {
835                 log_error("Cannot use --port= without private networking.");
836                 return -EINVAL;
837         }
838
839         arg_retain = (arg_retain | plus | (arg_private_network ? 1ULL << CAP_NET_ADMIN : 0)) & ~minus;
840
841         return 1;
842 }
843
844 static int mount_all(const char *dest) {
845
846         typedef struct MountPoint {
847                 const char *what;
848                 const char *where;
849                 const char *type;
850                 const char *options;
851                 unsigned long flags;
852                 bool fatal;
853         } MountPoint;
854
855         static const MountPoint mount_table[] = {
856                 { "proc",      "/proc",     "proc",  NULL,        MS_NOSUID|MS_NOEXEC|MS_NODEV,           true  },
857                 { "/proc/sys", "/proc/sys", NULL,    NULL,        MS_BIND,                                true  },   /* Bind mount first */
858                 { NULL,        "/proc/sys", NULL,    NULL,        MS_BIND|MS_RDONLY|MS_REMOUNT,           true  },   /* Then, make it r/o */
859                 { "sysfs",     "/sys",      "sysfs", NULL,        MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true  },
860                 { "tmpfs",     "/dev",      "tmpfs", "mode=755",  MS_NOSUID|MS_STRICTATIME,               true  },
861                 { "devpts",    "/dev/pts",  "devpts","newinstance,ptmxmode=0666,mode=620,gid=" STRINGIFY(TTY_GID), MS_NOSUID|MS_NOEXEC, true },
862                 { "tmpfs",     "/dev/shm",  "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME,      true  },
863                 { "tmpfs",     "/run",      "tmpfs", "mode=755",  MS_NOSUID|MS_NODEV|MS_STRICTATIME,      true  },
864                 { "tmpfs",     "/tmp",      "tmpfs", "mode=1777", MS_STRICTATIME,                         true  },
865 #ifdef HAVE_SELINUX
866                 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND,                              false },  /* Bind mount first */
867                 { NULL,              "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT,         false },  /* Then, make it r/o */
868 #endif
869         };
870
871         unsigned k;
872         int r = 0;
873
874         for (k = 0; k < ELEMENTSOF(mount_table); k++) {
875                 _cleanup_free_ char *where = NULL, *options = NULL;
876                 const char *o;
877                 int t;
878
879                 where = strjoin(dest, "/", mount_table[k].where, NULL);
880                 if (!where)
881                         return log_oom();
882
883                 t = path_is_mount_point(where, true);
884                 if (t < 0) {
885                         log_error_errno(t, "Failed to detect whether %s is a mount point: %m", where);
886
887                         if (r == 0)
888                                 r = t;
889
890                         continue;
891                 }
892
893                 /* Skip this entry if it is not a remount. */
894                 if (mount_table[k].what && t > 0)
895                         continue;
896
897                 t = mkdir_p(where, 0755);
898                 if (t < 0) {
899                         if (mount_table[k].fatal) {
900                                log_error_errno(t, "Failed to create directory %s: %m", where);
901
902                                 if (r == 0)
903                                         r = t;
904                         } else
905                                log_warning_errno(t, "Failed to create directory %s: %m", where);
906
907                         continue;
908                 }
909
910 #ifdef HAVE_SELINUX
911                 if (arg_selinux_apifs_context &&
912                     (streq_ptr(mount_table[k].what, "tmpfs") || streq_ptr(mount_table[k].what, "devpts"))) {
913                         options = strjoin(mount_table[k].options, ",context=\"", arg_selinux_apifs_context, "\"", NULL);
914                         if (!options)
915                                 return log_oom();
916
917                         o = options;
918                 } else
919 #endif
920                         o = mount_table[k].options;
921
922                 if (arg_userns && arg_uid_shift != UID_INVALID && streq_ptr(mount_table[k].type, "tmpfs")) {
923                         char *uid_options = NULL;
924
925                         if (o)
926                                 asprintf(&uid_options, "%s,uid=" UID_FMT ",gid=" UID_FMT, o, arg_uid_shift, arg_uid_shift);
927                         else
928                                 asprintf(&uid_options, "uid=" UID_FMT ",gid=" UID_FMT, arg_uid_shift, arg_uid_shift);
929                         if (!uid_options)
930                                 return log_oom();
931
932                         free(options);
933                         o = options = uid_options;
934                 }
935
936                 if (mount(mount_table[k].what,
937                           where,
938                           mount_table[k].type,
939                           mount_table[k].flags,
940                           o) < 0) {
941
942                         if (mount_table[k].fatal) {
943                                 log_error_errno(errno, "mount(%s) failed: %m", where);
944
945                                 if (r == 0)
946                                         r = -errno;
947                         } else
948                                 log_warning_errno(errno, "mount(%s) failed: %m", where);
949                 }
950         }
951
952         return r;
953 }
954
955 static int mount_binds(const char *dest, char **l, bool ro) {
956         char **x, **y;
957
958         STRV_FOREACH_PAIR(x, y, l) {
959                 _cleanup_free_ char *where = NULL;
960                 struct stat source_st, dest_st;
961                 int r;
962
963                 if (stat(*x, &source_st) < 0)
964                         return log_error_errno(errno, "Failed to stat %s: %m", *x);
965
966                 where = strappend(dest, *y);
967                 if (!where)
968                         return log_oom();
969
970                 r = stat(where, &dest_st);
971                 if (r == 0) {
972                         if (S_ISDIR(source_st.st_mode) && !S_ISDIR(dest_st.st_mode)) {
973                                 log_error("Cannot bind mount directory %s on file %s.", *x, where);
974                                 return -EINVAL;
975                         }
976                         if (!S_ISDIR(source_st.st_mode) && S_ISDIR(dest_st.st_mode)) {
977                                 log_error("Cannot bind mount file %s on directory %s.", *x, where);
978                                 return -EINVAL;
979                         }
980                 } else if (errno == ENOENT) {
981                         r = mkdir_parents_label(where, 0755);
982                         if (r < 0)
983                                 return log_error_errno(r, "Failed to bind mount %s: %m", *x);
984                 } else {
985                         log_error_errno(errno, "Failed to bind mount %s: %m", *x);
986                         return -errno;
987                 }
988
989                 /* Create the mount point. Any non-directory file can be
990                  * mounted on any non-directory file (regular, fifo, socket,
991                  * char, block).
992                  */
993                 if (S_ISDIR(source_st.st_mode)) {
994                         r = mkdir_label(where, 0755);
995                         if (r < 0 && errno != EEXIST)
996                                 return log_error_errno(r, "Failed to create mount point %s: %m", where);
997                 } else {
998                         r = touch(where);
999                         if (r < 0)
1000                                 return log_error_errno(r, "Failed to create mount point %s: %m", where);
1001                 }
1002
1003                 if (mount(*x, where, "bind", MS_BIND, NULL) < 0)
1004                         return log_error_errno(errno, "mount(%s) failed: %m", where);
1005
1006                 if (ro) {
1007                         r = bind_remount_recursive(where, true);
1008                         if (r < 0)
1009                                 return log_error_errno(r, "Read-Only bind mount failed: %m");
1010                 }
1011         }
1012
1013         return 0;
1014 }
1015
1016 static int mount_cgroup_hierarchy(const char *dest, const char *controller, const char *hierarchy, bool read_only) {
1017         char *to;
1018         int r;
1019
1020         to = strjoina(dest, "/sys/fs/cgroup/", hierarchy);
1021
1022         r = path_is_mount_point(to, false);
1023         if (r < 0)
1024                 return log_error_errno(r, "Failed to determine if %s is mounted already: %m", to);
1025         if (r > 0)
1026                 return 0;
1027
1028         mkdir_p(to, 0755);
1029
1030         /* The superblock mount options of the mount point need to be
1031          * identical to the hosts', and hence writable... */
1032         if (mount("cgroup", to, "cgroup", MS_NOSUID|MS_NOEXEC|MS_NODEV, controller) < 0)
1033                 return log_error_errno(errno, "Failed to mount to %s: %m", to);
1034
1035         /* ... hence let's only make the bind mount read-only, not the
1036          * superblock. */
1037         if (read_only) {
1038                 if (mount(NULL, to, NULL, MS_BIND|MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_RDONLY, NULL) < 0)
1039                         return log_error_errno(errno, "Failed to remount %s read-only: %m", to);
1040         }
1041         return 1;
1042 }
1043
1044 static int mount_cgroup(const char *dest) {
1045         _cleanup_set_free_free_ Set *controllers = NULL;
1046         _cleanup_free_ char *own_cgroup_path = NULL;
1047         const char *cgroup_root, *systemd_root, *systemd_own;
1048         int r;
1049
1050         controllers = set_new(&string_hash_ops);
1051         if (!controllers)
1052                 return log_oom();
1053
1054         r = cg_kernel_controllers(controllers);
1055         if (r < 0)
1056                 return log_error_errno(r, "Failed to determine cgroup controllers: %m");
1057
1058         r = cg_pid_get_path(NULL, 0, &own_cgroup_path);
1059         if (r < 0)
1060                 return log_error_errno(r, "Failed to determine our own cgroup path: %m");
1061
1062         cgroup_root = strjoina(dest, "/sys/fs/cgroup");
1063         if (mount("tmpfs", cgroup_root, "tmpfs", MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME, "mode=755") < 0)
1064                 return log_error_errno(errno, "Failed to mount tmpfs to /sys/fs/cgroup: %m");
1065
1066         for (;;) {
1067                 _cleanup_free_ char *controller = NULL, *origin = NULL, *combined = NULL;
1068
1069                 controller = set_steal_first(controllers);
1070                 if (!controller)
1071                         break;
1072
1073                 origin = strappend("/sys/fs/cgroup/", controller);
1074                 if (!origin)
1075                         return log_oom();
1076
1077                 r = readlink_malloc(origin, &combined);
1078                 if (r == -EINVAL) {
1079                         /* Not a symbolic link, but directly a single cgroup hierarchy */
1080
1081                         r = mount_cgroup_hierarchy(dest, controller, controller, true);
1082                         if (r < 0)
1083                                 return r;
1084
1085                 } else if (r < 0)
1086                         return log_error_errno(r, "Failed to read link %s: %m", origin);
1087                 else {
1088                         _cleanup_free_ char *target = NULL;
1089
1090                         target = strjoin(dest, "/sys/fs/cgroup/", controller, NULL);
1091                         if (!target)
1092                                 return log_oom();
1093
1094                         /* A symbolic link, a combination of controllers in one hierarchy */
1095
1096                         if (!filename_is_valid(combined)) {
1097                                 log_warning("Ignoring invalid combined hierarchy %s.", combined);
1098                                 continue;
1099                         }
1100
1101                         r = mount_cgroup_hierarchy(dest, combined, combined, true);
1102                         if (r < 0)
1103                                 return r;
1104
1105                         if (symlink(combined, target) < 0)
1106                                 return log_error_errno(errno, "Failed to create symlink for combined hierarchy: %m");
1107                 }
1108         }
1109
1110         r = mount_cgroup_hierarchy(dest, "name=systemd,xattr", "systemd", false);
1111         if (r < 0)
1112                 return r;
1113
1114         /* Make our own cgroup a (writable) bind mount */
1115         systemd_own = strjoina(dest, "/sys/fs/cgroup/systemd", own_cgroup_path);
1116         if (mount(systemd_own, systemd_own,  NULL, MS_BIND, NULL) < 0)
1117                 return log_error_errno(errno, "Failed to turn %s into a bind mount: %m", own_cgroup_path);
1118
1119         /* And then remount the systemd cgroup root read-only */
1120         systemd_root = strjoina(dest, "/sys/fs/cgroup/systemd");
1121         if (mount(NULL, systemd_root, NULL, MS_BIND|MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_RDONLY, NULL) < 0)
1122                 return log_error_errno(errno, "Failed to mount cgroup root read-only: %m");
1123
1124         if (mount(NULL, cgroup_root, NULL, MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME|MS_RDONLY, "mode=755") < 0)
1125                 return log_error_errno(errno, "Failed to remount %s read-only: %m", cgroup_root);
1126
1127         return 0;
1128 }
1129
1130 static int mount_tmpfs(const char *dest) {
1131         char **i, **o;
1132
1133         STRV_FOREACH_PAIR(i, o, arg_tmpfs) {
1134                 _cleanup_free_ char *where = NULL;
1135                 int r;
1136
1137                 where = strappend(dest, *i);
1138                 if (!where)
1139                         return log_oom();
1140
1141                 r = mkdir_label(where, 0755);
1142                 if (r < 0 && r != -EEXIST)
1143                         return log_error_errno(r, "Creating mount point for tmpfs %s failed: %m", where);
1144
1145                 if (mount("tmpfs", where, "tmpfs", MS_NODEV|MS_STRICTATIME, *o) < 0)
1146                         return log_error_errno(errno, "tmpfs mount to %s failed: %m", where);
1147         }
1148
1149         return 0;
1150 }
1151
1152 static int setup_timezone(const char *dest) {
1153         _cleanup_free_ char *where = NULL, *p = NULL, *q = NULL, *check = NULL, *what = NULL;
1154         char *z, *y;
1155         int r;
1156
1157         assert(dest);
1158
1159         /* Fix the timezone, if possible */
1160         r = readlink_malloc("/etc/localtime", &p);
1161         if (r < 0) {
1162                 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
1163                 return 0;
1164         }
1165
1166         z = path_startswith(p, "../usr/share/zoneinfo/");
1167         if (!z)
1168                 z = path_startswith(p, "/usr/share/zoneinfo/");
1169         if (!z) {
1170                 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
1171                 return 0;
1172         }
1173
1174         where = strappend(dest, "/etc/localtime");
1175         if (!where)
1176                 return log_oom();
1177
1178         r = readlink_malloc(where, &q);
1179         if (r >= 0) {
1180                 y = path_startswith(q, "../usr/share/zoneinfo/");
1181                 if (!y)
1182                         y = path_startswith(q, "/usr/share/zoneinfo/");
1183
1184                 /* Already pointing to the right place? Then do nothing .. */
1185                 if (y && streq(y, z))
1186                         return 0;
1187         }
1188
1189         check = strjoin(dest, "/usr/share/zoneinfo/", z, NULL);
1190         if (!check)
1191                 return log_oom();
1192
1193         if (access(check, F_OK) < 0) {
1194                 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
1195                 return 0;
1196         }
1197
1198         what = strappend("../usr/share/zoneinfo/", z);
1199         if (!what)
1200                 return log_oom();
1201
1202         r = mkdir_parents(where, 0755);
1203         if (r < 0) {
1204                 log_error_errno(r, "Failed to create directory for timezone info %s in container: %m", where);
1205
1206                 return 0;
1207         }
1208
1209         r = unlink(where);
1210         if (r < 0 && errno != ENOENT) {
1211                 log_error_errno(errno, "Failed to remove existing timezone info %s in container: %m", where);
1212
1213                 return 0;
1214         }
1215
1216         if (symlink(what, where) < 0) {
1217                 log_error_errno(errno, "Failed to correct timezone of container: %m");
1218                 return 0;
1219         }
1220
1221         return 0;
1222 }
1223
1224 static int setup_resolv_conf(const char *dest) {
1225         _cleanup_free_ char *where = NULL;
1226         int r;
1227
1228         assert(dest);
1229
1230         if (arg_private_network)
1231                 return 0;
1232
1233         /* Fix resolv.conf, if possible */
1234         where = strappend(dest, "/etc/resolv.conf");
1235         if (!where)
1236                 return log_oom();
1237
1238         /* We don't really care for the results of this really. If it
1239          * fails, it fails, but meh... */
1240         r = mkdir_parents(where, 0755);
1241         if (r < 0) {
1242                 log_warning_errno(r, "Failed to create parent directory for resolv.conf %s: %m", where);
1243
1244                 return 0;
1245         }
1246
1247         r = copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW, 0644, 0);
1248         if (r < 0) {
1249                 log_warning_errno(r, "Failed to copy /etc/resolv.conf to %s: %m", where);
1250
1251                 return 0;
1252         }
1253
1254         return 0;
1255 }
1256
1257 static int setup_volatile_state(const char *directory) {
1258         const char *p;
1259         int r;
1260
1261         assert(directory);
1262
1263         if (arg_volatile != VOLATILE_STATE)
1264                 return 0;
1265
1266         /* --volatile=state means we simply overmount /var
1267            with a tmpfs, and the rest read-only. */
1268
1269         r = bind_remount_recursive(directory, true);
1270         if (r < 0)
1271                 return log_error_errno(r, "Failed to remount %s read-only: %m", directory);
1272
1273         p = strjoina(directory, "/var");
1274         r = mkdir(p, 0755);
1275         if (r < 0 && errno != EEXIST)
1276                 return log_error_errno(errno, "Failed to create %s: %m", directory);
1277
1278         if (mount("tmpfs", p, "tmpfs", MS_STRICTATIME, "mode=755") < 0)
1279                 return log_error_errno(errno, "Failed to mount tmpfs to /var: %m");
1280
1281         return 0;
1282 }
1283
1284 static int setup_volatile(const char *directory) {
1285         bool tmpfs_mounted = false, bind_mounted = false;
1286         char template[] = "/tmp/nspawn-volatile-XXXXXX";
1287         const char *f, *t;
1288         int r;
1289
1290         assert(directory);
1291
1292         if (arg_volatile != VOLATILE_YES)
1293                 return 0;
1294
1295         /* --volatile=yes means we mount a tmpfs to the root dir, and
1296            the original /usr to use inside it, and that read-only. */
1297
1298         if (!mkdtemp(template))
1299                 return log_error_errno(errno, "Failed to create temporary directory: %m");
1300
1301         if (mount("tmpfs", template, "tmpfs", MS_STRICTATIME, "mode=755") < 0) {
1302                 log_error_errno(errno, "Failed to mount tmpfs for root directory: %m");
1303                 r = -errno;
1304                 goto fail;
1305         }
1306
1307         tmpfs_mounted = true;
1308
1309         f = strjoina(directory, "/usr");
1310         t = strjoina(template, "/usr");
1311
1312         r = mkdir(t, 0755);
1313         if (r < 0 && errno != EEXIST) {
1314                 log_error_errno(errno, "Failed to create %s: %m", t);
1315                 r = -errno;
1316                 goto fail;
1317         }
1318
1319         if (mount(f, t, "bind", MS_BIND|MS_REC, NULL) < 0) {
1320                 log_error_errno(errno, "Failed to create /usr bind mount: %m");
1321                 r = -errno;
1322                 goto fail;
1323         }
1324
1325         bind_mounted = true;
1326
1327         r = bind_remount_recursive(t, true);
1328         if (r < 0) {
1329                 log_error_errno(r, "Failed to remount %s read-only: %m", t);
1330                 goto fail;
1331         }
1332
1333         if (mount(template, directory, NULL, MS_MOVE, NULL) < 0) {
1334                 log_error_errno(errno, "Failed to move root mount: %m");
1335                 r = -errno;
1336                 goto fail;
1337         }
1338
1339         rmdir(template);
1340
1341         return 0;
1342
1343 fail:
1344         if (bind_mounted)
1345                 umount(t);
1346         if (tmpfs_mounted)
1347                 umount(template);
1348         rmdir(template);
1349         return r;
1350 }
1351
1352 static char* id128_format_as_uuid(sd_id128_t id, char s[37]) {
1353
1354         snprintf(s, 37,
1355                  "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
1356                  SD_ID128_FORMAT_VAL(id));
1357
1358         return s;
1359 }
1360
1361 static int setup_boot_id(const char *dest) {
1362         _cleanup_free_ char *from = NULL, *to = NULL;
1363         sd_id128_t rnd = {};
1364         char as_uuid[37];
1365         int r;
1366
1367         assert(dest);
1368
1369         if (arg_share_system)
1370                 return 0;
1371
1372         /* Generate a new randomized boot ID, so that each boot-up of
1373          * the container gets a new one */
1374
1375         from = strappend(dest, "/dev/proc-sys-kernel-random-boot-id");
1376         to = strappend(dest, "/proc/sys/kernel/random/boot_id");
1377         if (!from || !to)
1378                 return log_oom();
1379
1380         r = sd_id128_randomize(&rnd);
1381         if (r < 0)
1382                 return log_error_errno(r, "Failed to generate random boot id: %m");
1383
1384         id128_format_as_uuid(rnd, as_uuid);
1385
1386         r = write_string_file(from, as_uuid);
1387         if (r < 0)
1388                 return log_error_errno(r, "Failed to write boot id: %m");
1389
1390         if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
1391                 log_error_errno(errno, "Failed to bind mount boot id: %m");
1392                 r = -errno;
1393         } else if (mount(from, to, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL))
1394                 log_warning_errno(errno, "Failed to make boot id read-only: %m");
1395
1396         unlink(from);
1397         return r;
1398 }
1399
1400 static int copy_devnodes(const char *dest) {
1401
1402         static const char devnodes[] =
1403                 "null\0"
1404                 "zero\0"
1405                 "full\0"
1406                 "random\0"
1407                 "urandom\0"
1408                 "tty\0"
1409                 "net/tun\0";
1410
1411         const char *d;
1412         int r = 0;
1413         _cleanup_umask_ mode_t u;
1414
1415         assert(dest);
1416
1417         u = umask(0000);
1418
1419         NULSTR_FOREACH(d, devnodes) {
1420                 _cleanup_free_ char *from = NULL, *to = NULL;
1421                 struct stat st;
1422
1423                 from = strappend("/dev/", d);
1424                 to = strjoin(dest, "/dev/", d, NULL);
1425                 if (!from || !to)
1426                         return log_oom();
1427
1428                 if (stat(from, &st) < 0) {
1429
1430                         if (errno != ENOENT)
1431                                 return log_error_errno(errno, "Failed to stat %s: %m", from);
1432
1433                 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
1434
1435                         log_error("%s is not a char or block device, cannot copy", from);
1436                         return -EIO;
1437
1438                 } else {
1439                         r = mkdir_parents(to, 0775);
1440                         if (r < 0) {
1441                                 log_error_errno(r, "Failed to create parent directory of %s: %m", to);
1442                                 return -r;
1443                         }
1444
1445                         if (mknod(to, st.st_mode, st.st_rdev) < 0)
1446                                 return log_error_errno(errno, "mknod(%s) failed: %m", to);
1447
1448                         if (arg_userns && arg_uid_shift != UID_INVALID)
1449                                 if (lchown(to, arg_uid_shift, arg_uid_shift) < 0)
1450                                         return log_error_errno(errno, "chown() of device node %s failed: %m", to);
1451                 }
1452         }
1453
1454         return r;
1455 }
1456
1457 static int setup_ptmx(const char *dest) {
1458         _cleanup_free_ char *p = NULL;
1459
1460         p = strappend(dest, "/dev/ptmx");
1461         if (!p)
1462                 return log_oom();
1463
1464         if (symlink("pts/ptmx", p) < 0)
1465                 return log_error_errno(errno, "Failed to create /dev/ptmx symlink: %m");
1466
1467         if (arg_userns && arg_uid_shift != UID_INVALID)
1468                 if (lchown(p, arg_uid_shift, arg_uid_shift) < 0)
1469                         return log_error_errno(errno, "lchown() of symlink %s failed: %m", p);
1470
1471         return 0;
1472 }
1473
1474 static int setup_dev_console(const char *dest, const char *console) {
1475         _cleanup_umask_ mode_t u;
1476         const char *to;
1477         struct stat st;
1478         int r;
1479
1480         assert(dest);
1481         assert(console);
1482
1483         u = umask(0000);
1484
1485         if (stat("/dev/null", &st) < 0)
1486                 return log_error_errno(errno, "Failed to stat /dev/null: %m");
1487
1488         r = chmod_and_chown(console, 0600, 0, 0);
1489         if (r < 0)
1490                 return log_error_errno(r, "Failed to correct access mode for TTY: %m");
1491
1492         /* We need to bind mount the right tty to /dev/console since
1493          * ptys can only exist on pts file systems. To have something
1494          * to bind mount things on we create a device node first, and
1495          * use /dev/null for that since we the cgroups device policy
1496          * allows us to create that freely, while we cannot create
1497          * /dev/console. (Note that the major minor doesn't actually
1498          * matter here, since we mount it over anyway). */
1499
1500         to = strjoina(dest, "/dev/console");
1501         if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0)
1502                 return log_error_errno(errno, "mknod() for /dev/console failed: %m");
1503
1504         if (mount(console, to, "bind", MS_BIND, NULL) < 0)
1505                 return log_error_errno(errno, "Bind mount for /dev/console failed: %m");
1506
1507         return 0;
1508 }
1509
1510 static int setup_kmsg(const char *dest, int kmsg_socket) {
1511         _cleanup_free_ char *from = NULL, *to = NULL;
1512         _cleanup_umask_ mode_t u;
1513         int r, fd, k;
1514         union {
1515                 struct cmsghdr cmsghdr;
1516                 uint8_t buf[CMSG_SPACE(sizeof(int))];
1517         } control = {};
1518         struct msghdr mh = {
1519                 .msg_control = &control,
1520                 .msg_controllen = sizeof(control),
1521         };
1522         struct cmsghdr *cmsg;
1523
1524         assert(dest);
1525         assert(kmsg_socket >= 0);
1526
1527         u = umask(0000);
1528
1529         /* We create the kmsg FIFO as /dev/kmsg, but immediately
1530          * delete it after bind mounting it to /proc/kmsg. While FIFOs
1531          * on the reading side behave very similar to /proc/kmsg,
1532          * their writing side behaves differently from /dev/kmsg in
1533          * that writing blocks when nothing is reading. In order to
1534          * avoid any problems with containers deadlocking due to this
1535          * we simply make /dev/kmsg unavailable to the container. */
1536         if (asprintf(&from, "%s/dev/kmsg", dest) < 0 ||
1537             asprintf(&to, "%s/proc/kmsg", dest) < 0)
1538                 return log_oom();
1539
1540         if (mkfifo(from, 0600) < 0)
1541                 return log_error_errno(errno, "mkfifo() for /dev/kmsg failed: %m");
1542
1543         r = chmod_and_chown(from, 0600, 0, 0);
1544         if (r < 0)
1545                 return log_error_errno(r, "Failed to correct access mode for /dev/kmsg: %m");
1546
1547         if (mount(from, to, "bind", MS_BIND, NULL) < 0)
1548                 return log_error_errno(errno, "Bind mount for /proc/kmsg failed: %m");
1549
1550         fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
1551         if (fd < 0)
1552                 return log_error_errno(errno, "Failed to open fifo: %m");
1553
1554         cmsg = CMSG_FIRSTHDR(&mh);
1555         cmsg->cmsg_level = SOL_SOCKET;
1556         cmsg->cmsg_type = SCM_RIGHTS;
1557         cmsg->cmsg_len = CMSG_LEN(sizeof(int));
1558         memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
1559
1560         mh.msg_controllen = cmsg->cmsg_len;
1561
1562         /* Store away the fd in the socket, so that it stays open as
1563          * long as we run the child */
1564         k = sendmsg(kmsg_socket, &mh, MSG_NOSIGNAL);
1565         safe_close(fd);
1566
1567         if (k < 0)
1568                 return log_error_errno(errno, "Failed to send FIFO fd: %m");
1569
1570         /* And now make the FIFO unavailable as /dev/kmsg... */
1571         unlink(from);
1572         return 0;
1573 }
1574
1575 static int send_rtnl(int send_fd) {
1576         union {
1577                 struct cmsghdr cmsghdr;
1578                 uint8_t buf[CMSG_SPACE(sizeof(int))];
1579         } control = {};
1580         struct msghdr mh = {
1581                 .msg_control = &control,
1582                 .msg_controllen = sizeof(control),
1583         };
1584         struct cmsghdr *cmsg;
1585         _cleanup_close_ int fd = -1;
1586         ssize_t k;
1587
1588         assert(send_fd >= 0);
1589
1590         if (!arg_expose_ports)
1591                 return 0;
1592
1593         fd = socket(PF_NETLINK, SOCK_RAW|SOCK_CLOEXEC|SOCK_NONBLOCK, NETLINK_ROUTE);
1594         if (fd < 0)
1595                 return log_error_errno(errno, "failed to allocate container netlink: %m");
1596
1597         cmsg = CMSG_FIRSTHDR(&mh);
1598         cmsg->cmsg_level = SOL_SOCKET;
1599         cmsg->cmsg_type = SCM_RIGHTS;
1600         cmsg->cmsg_len = CMSG_LEN(sizeof(int));
1601         memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
1602
1603         mh.msg_controllen = cmsg->cmsg_len;
1604
1605         /* Store away the fd in the socket, so that it stays open as
1606          * long as we run the child */
1607         k = sendmsg(send_fd, &mh, MSG_NOSIGNAL);
1608         if (k < 0)
1609                 return log_error_errno(errno, "Failed to send netlink fd: %m");
1610
1611         return 0;
1612 }
1613
1614 static int flush_ports(union in_addr_union *exposed) {
1615         ExposePort *p;
1616         int r, af = AF_INET;
1617
1618         assert(exposed);
1619
1620         if (!arg_expose_ports)
1621                 return 0;
1622
1623         if (in_addr_is_null(af, exposed))
1624                 return 0;
1625
1626         log_debug("Lost IP address.");
1627
1628         LIST_FOREACH(ports, p, arg_expose_ports) {
1629                 r = fw_add_local_dnat(false,
1630                                       af,
1631                                       p->protocol,
1632                                       NULL,
1633                                       NULL, 0,
1634                                       NULL, 0,
1635                                       p->host_port,
1636                                       exposed,
1637                                       p->container_port,
1638                                       NULL);
1639                 if (r < 0)
1640                         log_warning_errno(r, "Failed to modify firewall: %m");
1641         }
1642
1643         *exposed = IN_ADDR_NULL;
1644         return 0;
1645 }
1646
1647 static int expose_ports(sd_rtnl *rtnl, union in_addr_union *exposed) {
1648         _cleanup_free_ struct local_address *addresses = NULL;
1649         _cleanup_free_ char *pretty = NULL;
1650         union in_addr_union new_exposed;
1651         ExposePort *p;
1652         bool add;
1653         int af = AF_INET, r;
1654
1655         assert(exposed);
1656
1657         /* Invoked each time an address is added or removed inside the
1658          * container */
1659
1660         if (!arg_expose_ports)
1661                 return 0;
1662
1663         r = local_addresses(rtnl, 0, af, &addresses);
1664         if (r < 0)
1665                 return log_error_errno(r, "Failed to enumerate local addresses: %m");
1666
1667         add = r > 0 &&
1668                 addresses[0].family == af &&
1669                 addresses[0].scope < RT_SCOPE_LINK;
1670
1671         if (!add)
1672                 return flush_ports(exposed);
1673
1674         new_exposed = addresses[0].address;
1675         if (in_addr_equal(af, exposed, &new_exposed))
1676                 return 0;
1677
1678         in_addr_to_string(af, &new_exposed, &pretty);
1679         log_debug("New container IP is %s.", strna(pretty));
1680
1681         LIST_FOREACH(ports, p, arg_expose_ports) {
1682
1683                 r = fw_add_local_dnat(true,
1684                                       af,
1685                                       p->protocol,
1686                                       NULL,
1687                                       NULL, 0,
1688                                       NULL, 0,
1689                                       p->host_port,
1690                                       &new_exposed,
1691                                       p->container_port,
1692                                       in_addr_is_null(af, exposed) ? NULL : exposed);
1693                 if (r < 0)
1694                         log_warning_errno(r, "Failed to modify firewall: %m");
1695         }
1696
1697         *exposed = new_exposed;
1698         return 0;
1699 }
1700
1701 static int on_address_change(sd_rtnl *rtnl, sd_rtnl_message *m, void *userdata) {
1702         union in_addr_union *exposed = userdata;
1703
1704         assert(rtnl);
1705         assert(m);
1706         assert(exposed);
1707
1708         expose_ports(rtnl, exposed);
1709         return 0;
1710 }
1711
1712 static int watch_rtnl(sd_event *event, int recv_fd, union in_addr_union *exposed, sd_rtnl **ret) {
1713         union {
1714                 struct cmsghdr cmsghdr;
1715                 uint8_t buf[CMSG_SPACE(sizeof(int))];
1716         } control = {};
1717         struct msghdr mh = {
1718                 .msg_control = &control,
1719                 .msg_controllen = sizeof(control),
1720         };
1721         struct cmsghdr *cmsg;
1722         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1723         int fd, r;
1724         ssize_t k;
1725
1726         assert(event);
1727         assert(recv_fd >= 0);
1728         assert(ret);
1729
1730         if (!arg_expose_ports)
1731                 return 0;
1732
1733         k = recvmsg(recv_fd, &mh, MSG_NOSIGNAL);
1734         if (k < 0)
1735                 return log_error_errno(errno, "Failed to recv netlink fd: %m");
1736
1737         cmsg = CMSG_FIRSTHDR(&mh);
1738         assert(cmsg->cmsg_level == SOL_SOCKET);
1739         assert(cmsg->cmsg_type == SCM_RIGHTS);
1740         assert(cmsg->cmsg_len == CMSG_LEN(sizeof(int)));
1741         memcpy(&fd, CMSG_DATA(cmsg), sizeof(int));
1742
1743         r = sd_rtnl_open_fd(&rtnl, fd, 1, RTNLGRP_IPV4_IFADDR);
1744         if (r < 0) {
1745                 safe_close(fd);
1746                 return log_error_errno(r, "Failed to create rtnl object: %m");
1747         }
1748
1749         r = sd_rtnl_add_match(rtnl, RTM_NEWADDR, on_address_change, exposed);
1750         if (r < 0)
1751                 return log_error_errno(r, "Failed to subscribe to RTM_NEWADDR messages: %m");
1752
1753         r = sd_rtnl_add_match(rtnl, RTM_DELADDR, on_address_change, exposed);
1754         if (r < 0)
1755                 return log_error_errno(r, "Failed to subscribe to RTM_DELADDR messages: %m");
1756
1757         r = sd_rtnl_attach_event(rtnl, event, 0);
1758         if (r < 0)
1759                 return log_error_errno(r, "Failed to add to even loop: %m");
1760
1761         *ret = rtnl;
1762         rtnl = NULL;
1763
1764         return 0;
1765 }
1766
1767 static int setup_hostname(void) {
1768
1769         if (arg_share_system)
1770                 return 0;
1771
1772         if (sethostname_idempotent(arg_machine) < 0)
1773                 return -errno;
1774
1775         return 0;
1776 }
1777
1778 static int setup_journal(const char *directory) {
1779         sd_id128_t machine_id, this_id;
1780         _cleanup_free_ char *p = NULL, *b = NULL, *q = NULL, *d = NULL;
1781         char *id;
1782         int r;
1783
1784         /* Don't link journals in ephemeral mode */
1785         if (arg_ephemeral)
1786                 return 0;
1787
1788         p = strappend(directory, "/etc/machine-id");
1789         if (!p)
1790                 return log_oom();
1791
1792         r = read_one_line_file(p, &b);
1793         if (r == -ENOENT && arg_link_journal == LINK_AUTO)
1794                 return 0;
1795         else if (r < 0)
1796                 return log_error_errno(r, "Failed to read machine ID from %s: %m", p);
1797
1798         id = strstrip(b);
1799         if (isempty(id) && arg_link_journal == LINK_AUTO)
1800                 return 0;
1801
1802         /* Verify validity */
1803         r = sd_id128_from_string(id, &machine_id);
1804         if (r < 0)
1805                 return log_error_errno(r, "Failed to parse machine ID from %s: %m", p);
1806
1807         r = sd_id128_get_machine(&this_id);
1808         if (r < 0)
1809                 return log_error_errno(r, "Failed to retrieve machine ID: %m");
1810
1811         if (sd_id128_equal(machine_id, this_id)) {
1812                 log_full(arg_link_journal == LINK_AUTO ? LOG_WARNING : LOG_ERR,
1813                          "Host and machine ids are equal (%s): refusing to link journals", id);
1814                 if (arg_link_journal == LINK_AUTO)
1815                         return 0;
1816                 return -EEXIST;
1817         }
1818
1819         if (arg_link_journal == LINK_NO)
1820                 return 0;
1821
1822         free(p);
1823         p = strappend("/var/log/journal/", id);
1824         q = strjoin(directory, "/var/log/journal/", id, NULL);
1825         if (!p || !q)
1826                 return log_oom();
1827
1828         if (path_is_mount_point(p, false) > 0) {
1829                 if (arg_link_journal != LINK_AUTO) {
1830                         log_error("%s: already a mount point, refusing to use for journal", p);
1831                         return -EEXIST;
1832                 }
1833
1834                 return 0;
1835         }
1836
1837         if (path_is_mount_point(q, false) > 0) {
1838                 if (arg_link_journal != LINK_AUTO) {
1839                         log_error("%s: already a mount point, refusing to use for journal", q);
1840                         return -EEXIST;
1841                 }
1842
1843                 return 0;
1844         }
1845
1846         r = readlink_and_make_absolute(p, &d);
1847         if (r >= 0) {
1848                 if ((arg_link_journal == LINK_GUEST ||
1849                      arg_link_journal == LINK_AUTO) &&
1850                     path_equal(d, q)) {
1851
1852                         r = mkdir_p(q, 0755);
1853                         if (r < 0)
1854                                 log_warning_errno(errno, "Failed to create directory %s: %m", q);
1855                         return 0;
1856                 }
1857
1858                 if (unlink(p) < 0)
1859                         return log_error_errno(errno, "Failed to remove symlink %s: %m", p);
1860         } else if (r == -EINVAL) {
1861
1862                 if (arg_link_journal == LINK_GUEST &&
1863                     rmdir(p) < 0) {
1864
1865                         if (errno == ENOTDIR) {
1866                                 log_error("%s already exists and is neither a symlink nor a directory", p);
1867                                 return r;
1868                         } else {
1869                                 log_error_errno(errno, "Failed to remove %s: %m", p);
1870                                 return -errno;
1871                         }
1872                 }
1873         } else if (r != -ENOENT) {
1874                 log_error_errno(errno, "readlink(%s) failed: %m", p);
1875                 return r;
1876         }
1877
1878         if (arg_link_journal == LINK_GUEST) {
1879
1880                 if (symlink(q, p) < 0) {
1881                         if (arg_link_journal_try) {
1882                                 log_debug_errno(errno, "Failed to symlink %s to %s, skipping journal setup: %m", q, p);
1883                                 return 0;
1884                         } else {
1885                                 log_error_errno(errno, "Failed to symlink %s to %s: %m", q, p);
1886                                 return -errno;
1887                         }
1888                 }
1889
1890                 r = mkdir_p(q, 0755);
1891                 if (r < 0)
1892                         log_warning_errno(errno, "Failed to create directory %s: %m", q);
1893                 return 0;
1894         }
1895
1896         if (arg_link_journal == LINK_HOST) {
1897                 /* don't create parents here -- if the host doesn't have
1898                  * permanent journal set up, don't force it here */
1899                 r = mkdir(p, 0755);
1900                 if (r < 0) {
1901                         if (arg_link_journal_try) {
1902                                 log_debug_errno(errno, "Failed to create %s, skipping journal setup: %m", p);
1903                                 return 0;
1904                         } else {
1905                                 log_error_errno(errno, "Failed to create %s: %m", p);
1906                                 return r;
1907                         }
1908                 }
1909
1910         } else if (access(p, F_OK) < 0)
1911                 return 0;
1912
1913         if (dir_is_empty(q) == 0)
1914                 log_warning("%s is not empty, proceeding anyway.", q);
1915
1916         r = mkdir_p(q, 0755);
1917         if (r < 0) {
1918                 log_error_errno(errno, "Failed to create %s: %m", q);
1919                 return r;
1920         }
1921
1922         if (mount(p, q, "bind", MS_BIND, NULL) < 0)
1923                 return log_error_errno(errno, "Failed to bind mount journal from host into guest: %m");
1924
1925         return 0;
1926 }
1927
/* Hands the complement of arg_retain to capability_bounding_set_drop()
 * and returns that function's result unchanged. */
static int drop_capabilities(void) {
        return capability_bounding_set_drop(~arg_retain, false);
}
1931
1932 static int register_machine(pid_t pid, int local_ifindex) {
1933         _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1934         _cleanup_bus_close_unref_ sd_bus *bus = NULL;
1935         int r;
1936
1937         if (!arg_register)
1938                 return 0;
1939
1940         r = sd_bus_default_system(&bus);
1941         if (r < 0)
1942                 return log_error_errno(r, "Failed to open system bus: %m");
1943
1944         if (arg_keep_unit) {
1945                 r = sd_bus_call_method(
1946                                 bus,
1947                                 "org.freedesktop.machine1",
1948                                 "/org/freedesktop/machine1",
1949                                 "org.freedesktop.machine1.Manager",
1950                                 "RegisterMachineWithNetwork",
1951                                 &error,
1952                                 NULL,
1953                                 "sayssusai",
1954                                 arg_machine,
1955                                 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1956                                 "nspawn",
1957                                 "container",
1958                                 (uint32_t) pid,
1959                                 strempty(arg_directory),
1960                                 local_ifindex > 0 ? 1 : 0, local_ifindex);
1961         } else {
1962                 _cleanup_bus_message_unref_ sd_bus_message *m = NULL;
1963                 char **i;
1964
1965                 r = sd_bus_message_new_method_call(
1966                                 bus,
1967                                 &m,
1968                                 "org.freedesktop.machine1",
1969                                 "/org/freedesktop/machine1",
1970                                 "org.freedesktop.machine1.Manager",
1971                                 "CreateMachineWithNetwork");
1972                 if (r < 0)
1973                         return bus_log_create_error(r);
1974
1975                 r = sd_bus_message_append(
1976                                 m,
1977                                 "sayssusai",
1978                                 arg_machine,
1979                                 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1980                                 "nspawn",
1981                                 "container",
1982                                 (uint32_t) pid,
1983                                 strempty(arg_directory),
1984                                 local_ifindex > 0 ? 1 : 0, local_ifindex);
1985                 if (r < 0)
1986                         return bus_log_create_error(r);
1987
1988                 r = sd_bus_message_open_container(m, 'a', "(sv)");
1989                 if (r < 0)
1990                         return bus_log_create_error(r);
1991
1992                 if (!isempty(arg_slice)) {
1993                         r = sd_bus_message_append(m, "(sv)", "Slice", "s", arg_slice);
1994                         if (r < 0)
1995                                 return bus_log_create_error(r);
1996                 }
1997
1998                 r = sd_bus_message_append(m, "(sv)", "DevicePolicy", "s", "strict");
1999                 if (r < 0)
2000                         return bus_log_create_error(r);
2001
2002                 r = sd_bus_message_append(m, "(sv)", "DeviceAllow", "a(ss)", 9,
2003                                           /* Allow the container to
2004                                            * access and create the API
2005                                            * device nodes, so that
2006                                            * PrivateDevices= in the
2007                                            * container can work
2008                                            * fine */
2009                                           "/dev/null", "rwm",
2010                                           "/dev/zero", "rwm",
2011                                           "/dev/full", "rwm",
2012                                           "/dev/random", "rwm",
2013                                           "/dev/urandom", "rwm",
2014                                           "/dev/tty", "rwm",
2015                                           "/dev/net/tun", "rwm",
2016                                           /* Allow the container
2017                                            * access to ptys. However,
2018                                            * do not permit the
2019                                            * container to ever create
2020                                            * these device nodes. */
2021                                           "/dev/pts/ptmx", "rw",
2022                                           "char-pts", "rw");
2023                 if (r < 0)
2024                         return log_error_errno(r, "Failed to add device whitelist: %m");
2025
2026                 STRV_FOREACH(i, arg_property) {
2027                         r = sd_bus_message_open_container(m, 'r', "sv");
2028                         if (r < 0)
2029                                 return bus_log_create_error(r);
2030
2031                         r = bus_append_unit_property_assignment(m, *i);
2032                         if (r < 0)
2033                                 return r;
2034
2035                         r = sd_bus_message_close_container(m);
2036                         if (r < 0)
2037                                 return bus_log_create_error(r);
2038                 }
2039
2040                 r = sd_bus_message_close_container(m);
2041                 if (r < 0)
2042                         return bus_log_create_error(r);
2043
2044                 r = sd_bus_call(bus, m, 0, &error, NULL);
2045         }
2046
2047         if (r < 0) {
2048                 log_error("Failed to register machine: %s", bus_error_message(&error, r));
2049                 return r;
2050         }
2051
2052         return 0;
2053 }
2054
2055 static int terminate_machine(pid_t pid) {
2056         _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
2057         _cleanup_bus_message_unref_ sd_bus_message *reply = NULL;
2058         _cleanup_bus_close_unref_ sd_bus *bus = NULL;
2059         const char *path;
2060         int r;
2061
2062         if (!arg_register)
2063                 return 0;
2064
2065         r = sd_bus_default_system(&bus);
2066         if (r < 0)
2067                 return log_error_errno(r, "Failed to open system bus: %m");
2068
2069         r = sd_bus_call_method(
2070                         bus,
2071                         "org.freedesktop.machine1",
2072                         "/org/freedesktop/machine1",
2073                         "org.freedesktop.machine1.Manager",
2074                         "GetMachineByPID",
2075                         &error,
2076                         &reply,
2077                         "u",
2078                         (uint32_t) pid);
2079         if (r < 0) {
2080                 /* Note that the machine might already have been
2081                  * cleaned up automatically, hence don't consider it a
2082                  * failure if we cannot get the machine object. */
2083                 log_debug("Failed to get machine: %s", bus_error_message(&error, r));
2084                 return 0;
2085         }
2086
2087         r = sd_bus_message_read(reply, "o", &path);
2088         if (r < 0)
2089                 return bus_log_parse_error(r);
2090
2091         r = sd_bus_call_method(
2092                         bus,
2093                         "org.freedesktop.machine1",
2094                         path,
2095                         "org.freedesktop.machine1.Machine",
2096                         "Terminate",
2097                         &error,
2098                         NULL,
2099                         NULL);
2100         if (r < 0) {
2101                 log_debug("Failed to terminate machine: %s", bus_error_message(&error, r));
2102                 return 0;
2103         }
2104
2105         return 0;
2106 }
2107
2108 static int reset_audit_loginuid(void) {
2109         _cleanup_free_ char *p = NULL;
2110         int r;
2111
2112         if (arg_share_system)
2113                 return 0;
2114
2115         r = read_one_line_file("/proc/self/loginuid", &p);
2116         if (r == -ENOENT)
2117                 return 0;
2118         if (r < 0)
2119                 return log_error_errno(r, "Failed to read /proc/self/loginuid: %m");
2120
2121         /* Already reset? */
2122         if (streq(p, "4294967295"))
2123                 return 0;
2124
2125         r = write_string_file("/proc/self/loginuid", "4294967295");
2126         if (r < 0) {
2127                 log_error("Failed to reset audit login UID. This probably means that your kernel is too\n"
2128                           "old and you have audit enabled. Note that the auditing subsystem is known to\n"
2129                           "be incompatible with containers on old kernels. Please make sure to upgrade\n"
2130                           "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
2131                           "using systemd-nspawn. Sleeping for 5s... (%s)\n", strerror(-r));
2132
2133                 sleep(5);
2134         }
2135
2136         return 0;
2137 }
2138
/* Fixed 128bit keys, passed as hash_key to generate_mac(). A distinct
 * key is defined per kind of interface (host side, container side,
 * macvlan). */
#define HOST_HASH_KEY SD_ID128_MAKE(1a,37,6f,c7,46,ec,45,0b,ad,a3,d5,31,06,60,5d,b1)
#define CONTAINER_HASH_KEY SD_ID128_MAKE(c3,c4,f9,19,b5,57,b2,1c,e6,cf,14,27,03,9c,ee,a2)
#define MACVLAN_HASH_KEY SD_ID128_MAKE(00,13,6d,bc,66,83,44,81,bb,0c,f9,51,1f,24,a6,6f)
2142
2143 static int generate_mac(struct ether_addr *mac, sd_id128_t hash_key, uint64_t idx) {
2144         uint8_t result[8];
2145         size_t l, sz;
2146         uint8_t *v, *i;
2147         int r;
2148
2149         l = strlen(arg_machine);
2150         sz = sizeof(sd_id128_t) + l;
2151         if (idx > 0)
2152                 sz += sizeof(idx);
2153
2154         v = alloca(sz);
2155
2156         /* fetch some persistent data unique to the host */
2157         r = sd_id128_get_machine((sd_id128_t*) v);
2158         if (r < 0)
2159                 return r;
2160
2161         /* combine with some data unique (on this host) to this
2162          * container instance */
2163         i = mempcpy(v + sizeof(sd_id128_t), arg_machine, l);
2164         if (idx > 0) {
2165                 idx = htole64(idx);
2166                 memcpy(i, &idx, sizeof(idx));
2167         }
2168
2169         /* Let's hash the host machine ID plus the container name. We
2170          * use a fixed, but originally randomly created hash key here. */
2171         siphash24(result, v, sz, hash_key.bytes);
2172
2173         assert_cc(ETH_ALEN <= sizeof(result));
2174         memcpy(mac->ether_addr_octet, result, ETH_ALEN);
2175
2176         /* see eth_random_addr in the kernel */
2177         mac->ether_addr_octet[0] &= 0xfe;        /* clear multicast bit */
2178         mac->ether_addr_octet[0] |= 0x02;        /* set local assignment bit (IEEE802) */
2179
2180         return 0;
2181 }
2182
2183 static int setup_veth(pid_t pid, char iface_name[IFNAMSIZ], int *ifi) {
2184         _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
2185         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
2186         struct ether_addr mac_host, mac_container;
2187         int r, i;
2188
2189         if (!arg_private_network)
2190                 return 0;
2191
2192         if (!arg_network_veth)
2193                 return 0;
2194
2195         /* Use two different interface name prefixes depending whether
2196          * we are in bridge mode or not. */
2197         snprintf(iface_name, IFNAMSIZ - 1, "%s-%s",
2198                  arg_network_bridge ? "vb" : "ve", arg_machine);
2199
2200         r = generate_mac(&mac_container, CONTAINER_HASH_KEY, 0);
2201         if (r < 0)
2202                 return log_error_errno(r, "Failed to generate predictable MAC address for container side: %m");
2203
2204         r = generate_mac(&mac_host, HOST_HASH_KEY, 0);
2205         if (r < 0)
2206                 return log_error_errno(r, "Failed to generate predictable MAC address for host side: %m");
2207
2208         r = sd_rtnl_open(&rtnl, 0);
2209         if (r < 0)
2210                 return log_error_errno(r, "Failed to connect to netlink: %m");
2211
2212         r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
2213         if (r < 0)
2214                 return log_error_errno(r, "Failed to allocate netlink message: %m");
2215
2216         r = sd_rtnl_message_append_string(m, IFLA_IFNAME, iface_name);
2217         if (r < 0)
2218                 return log_error_errno(r, "Failed to add netlink interface name: %m");
2219
2220         r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac_host);
2221         if (r < 0)
2222                 return log_error_errno(r, "Failed to add netlink MAC address: %m");
2223
2224         r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
2225         if (r < 0)
2226                 return log_error_errno(r, "Failed to open netlink container: %m");
2227
2228         r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "veth");
2229         if (r < 0)
2230                 return log_error_errno(r, "Failed to open netlink container: %m");
2231
2232         r = sd_rtnl_message_open_container(m, VETH_INFO_PEER);
2233         if (r < 0)
2234                 return log_error_errno(r, "Failed to open netlink container: %m");
2235
2236         r = sd_rtnl_message_append_string(m, IFLA_IFNAME, "host0");
2237         if (r < 0)
2238                 return log_error_errno(r, "Failed to add netlink interface name: %m");
2239
2240         r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac_container);
2241         if (r < 0)
2242                 return log_error_errno(r, "Failed to add netlink MAC address: %m");
2243
2244         r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
2245         if (r < 0)
2246                 return log_error_errno(r, "Failed to add netlink namespace field: %m");
2247
2248         r = sd_rtnl_message_close_container(m);
2249         if (r < 0)
2250                 return log_error_errno(r, "Failed to close netlink container: %m");
2251
2252         r = sd_rtnl_message_close_container(m);
2253         if (r < 0)
2254                 return log_error_errno(r, "Failed to close netlink container: %m");
2255
2256         r = sd_rtnl_message_close_container(m);
2257         if (r < 0)
2258                 return log_error_errno(r, "Failed to close netlink container: %m");
2259
2260         r = sd_rtnl_call(rtnl, m, 0, NULL);
2261         if (r < 0)
2262                 return log_error_errno(r, "Failed to add new veth interfaces: %m");
2263
2264         i = (int) if_nametoindex(iface_name);
2265         if (i <= 0)
2266                 return log_error_errno(errno, "Failed to resolve interface %s: %m", iface_name);
2267
2268         *ifi = i;
2269
2270         return 0;
2271 }
2272
2273 static int setup_bridge(const char veth_name[], int *ifi) {
2274         _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
2275         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
2276         int r, bridge;
2277
2278         if (!arg_private_network)
2279                 return 0;
2280
2281         if (!arg_network_veth)
2282                 return 0;
2283
2284         if (!arg_network_bridge)
2285                 return 0;
2286
2287         bridge = (int) if_nametoindex(arg_network_bridge);
2288         if (bridge <= 0)
2289                 return log_error_errno(errno, "Failed to resolve interface %s: %m", arg_network_bridge);
2290
2291         *ifi = bridge;
2292
2293         r = sd_rtnl_open(&rtnl, 0);
2294         if (r < 0)
2295                 return log_error_errno(r, "Failed to connect to netlink: %m");
2296
2297         r = sd_rtnl_message_new_link(rtnl, &m, RTM_SETLINK, 0);
2298         if (r < 0)
2299                 return log_error_errno(r, "Failed to allocate netlink message: %m");
2300
2301         r = sd_rtnl_message_link_set_flags(m, IFF_UP, IFF_UP);
2302         if (r < 0)
2303                 return log_error_errno(r, "Failed to set IFF_UP flag: %m");
2304
2305         r = sd_rtnl_message_append_string(m, IFLA_IFNAME, veth_name);
2306         if (r < 0)
2307                 return log_error_errno(r, "Failed to add netlink interface name field: %m");
2308
2309         r = sd_rtnl_message_append_u32(m, IFLA_MASTER, bridge);
2310         if (r < 0)
2311                 return log_error_errno(r, "Failed to add netlink master field: %m");
2312
2313         r = sd_rtnl_call(rtnl, m, 0, NULL);
2314         if (r < 0)
2315                 return log_error_errno(r, "Failed to add veth interface to bridge: %m");
2316
2317         return 0;
2318 }
2319
2320 static int parse_interface(struct udev *udev, const char *name) {
2321         _cleanup_udev_device_unref_ struct udev_device *d = NULL;
2322         char ifi_str[2 + DECIMAL_STR_MAX(int)];
2323         int ifi;
2324
2325         ifi = (int) if_nametoindex(name);
2326         if (ifi <= 0)
2327                 return log_error_errno(errno, "Failed to resolve interface %s: %m", name);
2328
2329         sprintf(ifi_str, "n%i", ifi);
2330         d = udev_device_new_from_device_id(udev, ifi_str);
2331         if (!d)
2332                 return log_error_errno(errno, "Failed to get udev device for interface %s: %m", name);
2333
2334         if (udev_device_get_is_initialized(d) <= 0) {
2335                 log_error("Network interface %s is not initialized yet.", name);
2336                 return -EBUSY;
2337         }
2338
2339         return ifi;
2340 }
2341
2342 static int move_network_interfaces(pid_t pid) {
2343         _cleanup_udev_unref_ struct udev *udev = NULL;
2344         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
2345         char **i;
2346         int r;
2347
2348         if (!arg_private_network)
2349                 return 0;
2350
2351         if (strv_isempty(arg_network_interfaces))
2352                 return 0;
2353
2354         r = sd_rtnl_open(&rtnl, 0);
2355         if (r < 0)
2356                 return log_error_errno(r, "Failed to connect to netlink: %m");
2357
2358         udev = udev_new();
2359         if (!udev) {
2360                 log_error("Failed to connect to udev.");
2361                 return -ENOMEM;
2362         }
2363
2364         STRV_FOREACH(i, arg_network_interfaces) {
2365                 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
2366                 int ifi;
2367
2368                 ifi = parse_interface(udev, *i);
2369                 if (ifi < 0)
2370                         return ifi;
2371
2372                 r = sd_rtnl_message_new_link(rtnl, &m, RTM_SETLINK, ifi);
2373                 if (r < 0)
2374                         return log_error_errno(r, "Failed to allocate netlink message: %m");
2375
2376                 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
2377                 if (r < 0)
2378                         return log_error_errno(r, "Failed to append namespace PID to netlink message: %m");
2379
2380                 r = sd_rtnl_call(rtnl, m, 0, NULL);
2381                 if (r < 0)
2382                         return log_error_errno(r, "Failed to move interface %s to namespace: %m", *i);
2383         }
2384
2385         return 0;
2386 }
2387
2388 static int setup_macvlan(pid_t pid) {
2389         _cleanup_udev_unref_ struct udev *udev = NULL;
2390         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
2391         unsigned idx = 0;
2392         char **i;
2393         int r;
2394
2395         if (!arg_private_network)
2396                 return 0;
2397
2398         if (strv_isempty(arg_network_macvlan))
2399                 return 0;
2400
2401         r = sd_rtnl_open(&rtnl, 0);
2402         if (r < 0)
2403                 return log_error_errno(r, "Failed to connect to netlink: %m");
2404
2405         udev = udev_new();
2406         if (!udev) {
2407                 log_error("Failed to connect to udev.");
2408                 return -ENOMEM;
2409         }
2410
2411         STRV_FOREACH(i, arg_network_macvlan) {
2412                 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
2413                 _cleanup_free_ char *n = NULL;
2414                 struct ether_addr mac;
2415                 int ifi;
2416
2417                 ifi = parse_interface(udev, *i);
2418                 if (ifi < 0)
2419                         return ifi;
2420
2421                 r = generate_mac(&mac, MACVLAN_HASH_KEY, idx++);
2422                 if (r < 0)
2423                         return log_error_errno(r, "Failed to create MACVLAN MAC address: %m");
2424
2425                 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
2426                 if (r < 0)
2427                         return log_error_errno(r, "Failed to allocate netlink message: %m");
2428
2429                 r = sd_rtnl_message_append_u32(m, IFLA_LINK, ifi);
2430                 if (r < 0)
2431                         return log_error_errno(r, "Failed to add netlink interface index: %m");
2432
2433                 n = strappend("mv-", *i);
2434                 if (!n)
2435                         return log_oom();
2436
2437                 strshorten(n, IFNAMSIZ-1);
2438
2439                 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, n);
2440                 if (r < 0)
2441                         return log_error_errno(r, "Failed to add netlink interface name: %m");
2442
2443                 r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac);
2444                 if (r < 0)
2445                         return log_error_errno(r, "Failed to add netlink MAC address: %m");
2446
2447                 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
2448                 if (r < 0)
2449                         return log_error_errno(r, "Failed to add netlink namespace field: %m");
2450
2451                 r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
2452                 if (r < 0)
2453                         return log_error_errno(r, "Failed to open netlink container: %m");
2454
2455                 r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "macvlan");
2456                 if (r < 0)
2457                         return log_error_errno(r, "Failed to open netlink container: %m");
2458
2459                 r = sd_rtnl_message_append_u32(m, IFLA_MACVLAN_MODE, MACVLAN_MODE_BRIDGE);
2460                 if (r < 0)
2461                         return log_error_errno(r, "Failed to append macvlan mode: %m");
2462
2463                 r = sd_rtnl_message_close_container(m);
2464                 if (r < 0)
2465                         return log_error_errno(r, "Failed to close netlink container: %m");
2466
2467                 r = sd_rtnl_message_close_container(m);
2468                 if (r < 0)
2469                         return log_error_errno(r, "Failed to close netlink container: %m");
2470
2471                 r = sd_rtnl_call(rtnl, m, 0, NULL);
2472                 if (r < 0)
2473                         return log_error_errno(r, "Failed to add new macvlan interfaces: %m");
2474         }
2475
2476         return 0;
2477 }
2478
2479 static int setup_ipvlan(pid_t pid) {
2480         _cleanup_udev_unref_ struct udev *udev = NULL;
2481         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
2482         char **i;
2483         int r;
2484
2485         if (!arg_private_network)
2486                 return 0;
2487
2488         if (strv_isempty(arg_network_ipvlan))
2489                 return 0;
2490
2491         r = sd_rtnl_open(&rtnl, 0);
2492         if (r < 0)
2493                 return log_error_errno(r, "Failed to connect to netlink: %m");
2494
2495         udev = udev_new();
2496         if (!udev) {
2497                 log_error("Failed to connect to udev.");
2498                 return -ENOMEM;
2499         }
2500
2501         STRV_FOREACH(i, arg_network_ipvlan) {
2502                 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
2503                 _cleanup_free_ char *n = NULL;
2504                 int ifi;
2505
2506                 ifi = parse_interface(udev, *i);
2507                 if (ifi < 0)
2508                         return ifi;
2509
2510                 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
2511                 if (r < 0)
2512                         return log_error_errno(r, "Failed to allocate netlink message: %m");
2513
2514                 r = sd_rtnl_message_append_u32(m, IFLA_LINK, ifi);
2515                 if (r < 0)
2516                         return log_error_errno(r, "Failed to add netlink interface index: %m");
2517
2518                 n = strappend("iv-", *i);
2519                 if (!n)
2520                         return log_oom();
2521
2522                 strshorten(n, IFNAMSIZ-1);
2523
2524                 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, n);
2525                 if (r < 0)
2526                         return log_error_errno(r, "Failed to add netlink interface name: %m");
2527
2528                 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
2529                 if (r < 0)
2530                         return log_error_errno(r, "Failed to add netlink namespace field: %m");
2531
2532                 r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
2533                 if (r < 0)
2534                         return log_error_errno(r, "Failed to open netlink container: %m");
2535
2536                 r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "ipvlan");
2537                 if (r < 0)
2538                         return log_error_errno(r, "Failed to open netlink container: %m");
2539
2540                 r = sd_rtnl_message_append_u16(m, IFLA_IPVLAN_MODE, IPVLAN_MODE_L2);
2541                 if (r < 0)
2542                         return log_error_errno(r, "Failed to add ipvlan mode: %m");
2543
2544                 r = sd_rtnl_message_close_container(m);
2545                 if (r < 0)
2546                         return log_error_errno(r, "Failed to close netlink container: %m");
2547
2548                 r = sd_rtnl_message_close_container(m);
2549                 if (r < 0)
2550                         return log_error_errno(r, "Failed to close netlink container: %m");
2551
2552                 r = sd_rtnl_call(rtnl, m, 0, NULL);
2553                 if (r < 0)
2554                         return log_error_errno(r, "Failed to add new ipvlan interfaces: %m");
2555         }
2556
2557         return 0;
2558 }
2559
/* Builds and loads a default-allow seccomp filter for the container payload.
 * A fixed set of syscalls is made to fail with EPERM, the kernel module
 * syscalls are added to that set unless CAP_SYS_MODULE is in arg_retain, and
 * creation of NETLINK_AUDIT sockets is made to fail with EAFNOSUPPORT.
 *
 * Returns 0 when built without HAVE_SECCOMP. Otherwise returns the result of
 * the last libseccomp call made (negative on failure), or log_oom() if the
 * filter context could not be allocated. */
static int setup_seccomp(void) {

#ifdef HAVE_SECCOMP
        static const int blacklist[] = {
                SCMP_SYS(kexec_load),
                SCMP_SYS(open_by_handle_at),
                SCMP_SYS(iopl),
                SCMP_SYS(ioperm),
                SCMP_SYS(swapon),
                SCMP_SYS(swapoff),
        };

        static const int kmod_blacklist[] = {
                SCMP_SYS(init_module),
                SCMP_SYS(finit_module),
                SCMP_SYS(delete_module),
        };

        scmp_filter_ctx ctx;
        unsigned k;
        int r;

        ctx = seccomp_init(SCMP_ACT_ALLOW);
        if (!ctx)
                return log_oom();

        r = seccomp_add_secondary_archs(ctx);
        if (r < 0) {
                log_error_errno(r, "Failed to add secondary archs to seccomp filter: %m");
                goto finish;
        }

        for (k = 0; k < ELEMENTSOF(blacklist); k++) {
                r = seccomp_rule_add(ctx, SCMP_ACT_ERRNO(EPERM), blacklist[k], 0);

                /* -EFAULT signals an unknown syscall; that one is simply skipped */
                if (r < 0 && r != -EFAULT) {
                        log_error_errno(r, "Failed to block syscall: %m");
                        goto finish;
                }
        }

        /* Unless CAP_SYS_MODULE is to be retained, the kernel module
         * syscalls get blocked as well */
        if ((arg_retain & (1ULL << CAP_SYS_MODULE)) == 0)
                for (k = 0; k < ELEMENTSOF(kmod_blacklist); k++) {
                        r = seccomp_rule_add(ctx, SCMP_ACT_ERRNO(EPERM), kmod_blacklist[k], 0);

                        /* Same as above: unknown syscalls are skipped */
                        if (r < 0 && r != -EFAULT) {
                                log_error_errno(r, "Failed to block syscall: %m");
                                goto finish;
                        }
                }

        /*
           Audit does not work properly inside containers, and much of
           the userspace audit hookup fails when run in one. We do not
           care about that and simply disable creation of audit sockets.

           With this rule socket(AF_NETLINK, *, NETLINK_AUDIT) fails
           with EAFNOSUPPORT, which audit userspace takes as the sign
           that audit is disabled in the kernel.
         */

        r = seccomp_rule_add(
                        ctx,
                        SCMP_ACT_ERRNO(EAFNOSUPPORT),
                        SCMP_SYS(socket),
                        2,
                        SCMP_A0(SCMP_CMP_EQ, AF_NETLINK),
                        SCMP_A2(SCMP_CMP_EQ, NETLINK_AUDIT));
        if (r < 0) {
                log_error_errno(r, "Failed to add audit seccomp rule: %m");
                goto finish;
        }

        /* Clear the filter's NNP attribute before loading it */
        r = seccomp_attr_set(ctx, SCMP_FLTATR_CTL_NNP, 0);
        if (r < 0) {
                log_error_errno(r, "Failed to unset NO_NEW_PRIVS: %m");
                goto finish;
        }

        r = seccomp_load(ctx);
        if (r < 0)
                log_error_errno(r, "Failed to install seccomp audit filter: %m");

finish:
        /* The filter context is released on every path, success or failure */
        seccomp_release(ctx);
        return r;
#else
        return 0;
#endif

}
2656
2657 static int setup_propagate(const char *root) {
2658         const char *p, *q;
2659
2660         (void) mkdir_p("/run/systemd/nspawn/", 0755);
2661         (void) mkdir_p("/run/systemd/nspawn/propagate", 0600);
2662         p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
2663         (void) mkdir_p(p, 0600);
2664
2665         q = strjoina(root, "/run/systemd/nspawn/incoming");
2666         mkdir_parents(q, 0755);
2667         mkdir_p(q, 0600);
2668
2669         if (mount(p, q, NULL, MS_BIND, NULL) < 0)
2670                 return log_error_errno(errno, "Failed to install propagation bind mount.");
2671
2672         if (mount(NULL, q, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY, NULL) < 0)
2673                 return log_error_errno(errno, "Failed to make propagation mount read-only");
2674
2675         return 0;
2676 }
2677
2678 static int setup_image(char **device_path, int *loop_nr) {
2679         struct loop_info64 info = {
2680                 .lo_flags = LO_FLAGS_AUTOCLEAR|LO_FLAGS_PARTSCAN
2681         };
2682         _cleanup_close_ int fd = -1, control = -1, loop = -1;
2683         _cleanup_free_ char* loopdev = NULL;
2684         struct stat st;
2685         int r, nr;
2686
2687         assert(device_path);
2688         assert(loop_nr);
2689         assert(arg_image);
2690
2691         fd = open(arg_image, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
2692         if (fd < 0)
2693                 return log_error_errno(errno, "Failed to open %s: %m", arg_image);
2694
2695         if (fstat(fd, &st) < 0)
2696                 return log_error_errno(errno, "Failed to stat %s: %m", arg_image);
2697
2698         if (S_ISBLK(st.st_mode)) {
2699                 char *p;
2700
2701                 p = strdup(arg_image);
2702                 if (!p)
2703                         return log_oom();
2704
2705                 *device_path = p;
2706
2707                 *loop_nr = -1;
2708
2709                 r = fd;
2710                 fd = -1;
2711
2712                 return r;
2713         }
2714
2715         if (!S_ISREG(st.st_mode)) {
2716                 log_error_errno(errno, "%s is not a regular file or block device: %m", arg_image);
2717                 return -EINVAL;
2718         }
2719
2720         control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
2721         if (control < 0)
2722                 return log_error_errno(errno, "Failed to open /dev/loop-control: %m");
2723
2724         nr = ioctl(control, LOOP_CTL_GET_FREE);
2725         if (nr < 0)
2726                 return log_error_errno(errno, "Failed to allocate loop device: %m");
2727
2728         if (asprintf(&loopdev, "/dev/loop%i", nr) < 0)
2729                 return log_oom();
2730
2731         loop = open(loopdev, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
2732         if (loop < 0)
2733                 return log_error_errno(errno, "Failed to open loop device %s: %m", loopdev);
2734
2735         if (ioctl(loop, LOOP_SET_FD, fd) < 0)
2736                 return log_error_errno(errno, "Failed to set loopback file descriptor on %s: %m", loopdev);
2737
2738         if (arg_read_only)
2739                 info.lo_flags |= LO_FLAGS_READ_ONLY;
2740
2741         if (ioctl(loop, LOOP_SET_STATUS64, &info) < 0)
2742                 return log_error_errno(errno, "Failed to set loopback settings on %s: %m", loopdev);
2743
2744         *device_path = loopdev;
2745         loopdev = NULL;
2746
2747         *loop_nr = nr;
2748
2749         r = loop;
2750         loop = -1;
2751
2752         return r;
2753 }
2754
/* Explanatory text describing which partition layouts a disk image must have.
 * It is pasted directly behind other string literals that are used as log
 * format strings, so it must not contain any '%' character. */
#define PARTITION_TABLE_BLURB                                                                     \
        "Note that the disk image needs to either contain only a single MBR partition of\n"      \
        "type 0x83 that is marked bootable, "                                                     \
        "or a single GPT partition of type 0FC63DAF-8483-4772-8E79-3D69D8477DE4 or follow\n"     \
        "    http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/\n"       \
        "to be bootable with systemd-nspawn."
2761
2762 static int dissect_image(
2763                 int fd,
2764                 char **root_device, bool *root_device_rw,
2765                 char **home_device, bool *home_device_rw,
2766                 char **srv_device, bool *srv_device_rw,
2767                 bool *secondary) {
2768
2769 #ifdef HAVE_BLKID
2770         int home_nr = -1, srv_nr = -1;
2771 #ifdef GPT_ROOT_NATIVE
2772         int root_nr = -1;
2773 #endif
2774 #ifdef GPT_ROOT_SECONDARY
2775         int secondary_root_nr = -1;
2776 #endif
2777         _cleanup_free_ char *home = NULL, *root = NULL, *secondary_root = NULL, *srv = NULL, *generic = NULL;
2778         _cleanup_udev_enumerate_unref_ struct udev_enumerate *e = NULL;
2779         _cleanup_udev_device_unref_ struct udev_device *d = NULL;
2780         _cleanup_blkid_free_probe_ blkid_probe b = NULL;
2781         _cleanup_udev_unref_ struct udev *udev = NULL;
2782         struct udev_list_entry *first, *item;
2783         bool home_rw = true, root_rw = true, secondary_root_rw = true, srv_rw = true, generic_rw = true;
2784         bool is_gpt, is_mbr, multiple_generic = false;
2785         const char *pttype = NULL;
2786         blkid_partlist pl;
2787         struct stat st;
2788         unsigned i;
2789         int r;
2790
2791         assert(fd >= 0);
2792         assert(root_device);
2793         assert(home_device);
2794         assert(srv_device);
2795         assert(secondary);
2796         assert(arg_image);
2797
2798         b = blkid_new_probe();
2799         if (!b)
2800                 return log_oom();
2801
2802         errno = 0;
2803         r = blkid_probe_set_device(b, fd, 0, 0);
2804         if (r != 0) {
2805                 if (errno == 0)
2806                         return log_oom();
2807
2808                 log_error_errno(errno, "Failed to set device on blkid probe: %m");
2809                 return -errno;
2810         }
2811
2812         blkid_probe_enable_partitions(b, 1);
2813         blkid_probe_set_partitions_flags(b, BLKID_PARTS_ENTRY_DETAILS);
2814
2815         errno = 0;
2816         r = blkid_do_safeprobe(b);
2817         if (r == -2 || r == 1) {
2818                 log_error("Failed to identify any partition table on\n"
2819                           "    %s\n"
2820                           PARTITION_TABLE_BLURB, arg_image);
2821                 return -EINVAL;
2822         } else if (r != 0) {
2823                 if (errno == 0)
2824                         errno = EIO;
2825                 log_error_errno(errno, "Failed to probe: %m");
2826                 return -errno;
2827         }
2828
2829         blkid_probe_lookup_value(b, "PTTYPE", &pttype, NULL);
2830
2831         is_gpt = streq_ptr(pttype, "gpt");
2832         is_mbr = streq_ptr(pttype, "dos");
2833
2834         if (!is_gpt && !is_mbr) {
2835                 log_error("No GPT or MBR partition table discovered on\n"
2836                           "    %s\n"
2837                           PARTITION_TABLE_BLURB, arg_image);
2838                 return -EINVAL;
2839         }
2840
2841         errno = 0;
2842         pl = blkid_probe_get_partitions(b);
2843         if (!pl) {
2844                 if (errno == 0)
2845                         return log_oom();
2846
2847                 log_error("Failed to list partitions of %s", arg_image);
2848                 return -errno;
2849         }
2850
2851         udev = udev_new();
2852         if (!udev)
2853                 return log_oom();
2854
2855         if (fstat(fd, &st) < 0)
2856                 return log_error_errno(errno, "Failed to stat block device: %m");
2857
2858         d = udev_device_new_from_devnum(udev, 'b', st.st_rdev);
2859         if (!d)
2860                 return log_oom();
2861
2862         for (i = 0;; i++) {
2863                 int n, m;
2864
2865                 if (i >= 10) {
2866                         log_error("Kernel partitions never appeared.");
2867                         return -ENXIO;
2868                 }
2869
2870                 e = udev_enumerate_new(udev);
2871                 if (!e)
2872                         return log_oom();
2873
2874                 r = udev_enumerate_add_match_parent(e, d);
2875                 if (r < 0)
2876                         return log_oom();
2877
2878                 r = udev_enumerate_scan_devices(e);
2879                 if (r < 0)
2880                         return log_error_errno(r, "Failed to scan for partition devices of %s: %m", arg_image);
2881
2882                 /* Count the partitions enumerated by the kernel */
2883                 n = 0;
2884                 first = udev_enumerate_get_list_entry(e);
2885                 udev_list_entry_foreach(item, first)
2886                         n++;
2887
2888                 /* Count the partitions enumerated by blkid */
2889                 m = blkid_partlist_numof_partitions(pl);
2890                 if (n == m + 1)
2891                         break;
2892                 if (n > m + 1) {
2893                         log_error("blkid and kernel partition list do not match.");
2894                         return -EIO;
2895                 }
2896                 if (n < m + 1) {
2897                         unsigned j;
2898
2899                         /* The kernel has probed fewer partitions than
2900                          * blkid? Maybe the kernel prober is still
2901                          * running or it got EBUSY because udev
2902                          * already opened the device. Let's reprobe
2903                          * the device, which is a synchronous call
2904                          * that waits until probing is complete. */
2905
2906                         for (j = 0; j < 20; j++) {
2907
2908                                 r = ioctl(fd, BLKRRPART, 0);
2909                                 if (r < 0)
2910                                         r = -errno;
2911                                 if (r >= 0 || r != -EBUSY)
2912                                         break;
2913
2914                                 /* If something else has the device
2915                                  * open, such as an udev rule, the
2916                                  * ioctl will return EBUSY. Since
2917                                  * there's no way to wait until it
2918                                  * isn't busy anymore, let's just wait
2919                                  * a bit, and try again.
2920                                  *
2921                                  * This is really something they
2922                                  * should fix in the kernel! */
2923
2924                                 usleep(50 * USEC_PER_MSEC);
2925                         }
2926
2927                         if (r < 0)
2928                                 return log_error_errno(r, "Failed to reread partition table: %m");
2929                 }
2930
2931                 e = udev_enumerate_unref(e);
2932         }
2933
2934         first = udev_enumerate_get_list_entry(e);
2935         udev_list_entry_foreach(item, first) {
2936                 _cleanup_udev_device_unref_ struct udev_device *q;
2937                 const char *node;
2938                 unsigned long long flags;
2939                 blkid_partition pp;
2940                 dev_t qn;
2941                 int nr;
2942
2943                 errno = 0;
2944                 q = udev_device_new_from_syspath(udev, udev_list_entry_get_name(item));
2945                 if (!q) {
2946                         if (!errno)
2947                                 errno = ENOMEM;
2948
2949                         log_error_errno(errno, "Failed to get partition device of %s: %m", arg_image);
2950                         return -errno;
2951                 }
2952
2953                 qn = udev_device_get_devnum(q);
2954                 if (major(qn) == 0)
2955                         continue;
2956
2957                 if (st.st_rdev == qn)
2958                         continue;
2959
2960                 node = udev_device_get_devnode(q);
2961                 if (!node)
2962                         continue;
2963
2964                 pp = blkid_partlist_devno_to_partition(pl, qn);
2965                 if (!pp)
2966                         continue;
2967
2968                 flags = blkid_partition_get_flags(pp);
2969
2970                 nr = blkid_partition_get_partno(pp);
2971                 if (nr < 0)
2972                         continue;
2973
2974                 if (is_gpt) {
2975                         sd_id128_t type_id;
2976                         const char *stype;
2977
2978                         if (flags & GPT_FLAG_NO_AUTO)
2979                                 continue;
2980
2981                         stype = blkid_partition_get_type_string(pp);
2982                         if (!stype)
2983                                 continue;
2984
2985                         if (sd_id128_from_string(stype, &type_id) < 0)
2986                                 continue;
2987
2988                         if (sd_id128_equal(type_id, GPT_HOME)) {
2989
2990                                 if (home && nr >= home_nr)
2991                                         continue;
2992
2993                                 home_nr = nr;
2994                                 home_rw = !(flags & GPT_FLAG_READ_ONLY);
2995
2996                                 r = free_and_strdup(&home, node);
2997                                 if (r < 0)
2998                                         return log_oom();
2999
3000                         } else if (sd_id128_equal(type_id, GPT_SRV)) {
3001
3002                                 if (srv && nr >= srv_nr)
3003                                         continue;
3004
3005                                 srv_nr = nr;
3006                                 srv_rw = !(flags & GPT_FLAG_READ_ONLY);
3007
3008                                 r = free_and_strdup(&srv, node);
3009                                 if (r < 0)
3010                                         return log_oom();
3011                         }
3012 #ifdef GPT_ROOT_NATIVE
3013                         else if (sd_id128_equal(type_id, GPT_ROOT_NATIVE)) {
3014
3015                                 if (root && nr >= root_nr)
3016                                         continue;
3017
3018                                 root_nr = nr;
3019                                 root_rw = !(flags & GPT_FLAG_READ_ONLY);
3020
3021                                 r = free_and_strdup(&root, node);
3022                                 if (r < 0)
3023                                         return log_oom();
3024                         }
3025 #endif
3026 #ifdef GPT_ROOT_SECONDARY
3027                         else if (sd_id128_equal(type_id, GPT_ROOT_SECONDARY)) {
3028
3029                                 if (secondary_root && nr >= secondary_root_nr)
3030                                         continue;
3031
3032                                 secondary_root_nr = nr;
3033                                 secondary_root_rw = !(flags & GPT_FLAG_READ_ONLY);
3034
3035                                 r = free_and_strdup(&secondary_root, node);
3036                                 if (r < 0)
3037                                         return log_oom();
3038                         }
3039 #endif
3040                         else if (sd_id128_equal(type_id, GPT_LINUX_GENERIC)) {
3041
3042                                 if (generic)
3043                                         multiple_generic = true;
3044                                 else {
3045                                         generic_rw = !(flags & GPT_FLAG_READ_ONLY);
3046
3047                                         r = free_and_strdup(&generic, node);
3048                                         if (r < 0)
3049                                                 return log_oom();
3050                                 }
3051                         }
3052
3053                 } else if (is_mbr) {
3054                         int type;
3055
3056                         if (flags != 0x80) /* Bootable flag */
3057                                 continue;
3058
3059                         type = blkid_partition_get_type(pp);
3060                         if (type != 0x83) /* Linux partition */
3061                                 continue;
3062
3063                         if (generic)
3064                                 multiple_generic = true;
3065                         else {
3066                                 generic_rw = true;
3067
3068