chiark / gitweb /
nspawn: add basic user namespacing support
[elogind.git] / src / nspawn / nspawn.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4   This file is part of systemd.
5
6   Copyright 2010 Lennart Poettering
7
8   systemd is free software; you can redistribute it and/or modify it
9   under the terms of the GNU Lesser General Public License as published by
10   the Free Software Foundation; either version 2.1 of the License, or
11   (at your option) any later version.
12
13   systemd is distributed in the hope that it will be useful, but
14   WITHOUT ANY WARRANTY; without even the implied warranty of
15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16   Lesser General Public License for more details.
17
18   You should have received a copy of the GNU Lesser General Public License
19   along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <signal.h>
23 #include <sched.h>
24 #include <unistd.h>
25 #include <sys/types.h>
26 #include <sys/syscall.h>
27 #include <sys/mount.h>
28 #include <sys/wait.h>
29 #include <stdlib.h>
30 #include <string.h>
31 #include <stdio.h>
32 #include <errno.h>
33 #include <sys/prctl.h>
34 #include <getopt.h>
35 #include <termios.h>
36 #include <sys/signalfd.h>
37 #include <grp.h>
38 #include <linux/fs.h>
39 #include <sys/un.h>
40 #include <sys/socket.h>
41 #include <linux/netlink.h>
42 #include <net/if.h>
43 #include <linux/veth.h>
44 #include <sys/personality.h>
45 #include <linux/loop.h>
46 #include <poll.h>
47 #include <sys/file.h>
48
49 #ifdef HAVE_SELINUX
50 #include <selinux/selinux.h>
51 #endif
52
53 #ifdef HAVE_SECCOMP
54 #include <seccomp.h>
55 #endif
56
57 #ifdef HAVE_BLKID
58 #include <blkid/blkid.h>
59 #endif
60
61 #include "sd-daemon.h"
62 #include "sd-bus.h"
63 #include "sd-id128.h"
64 #include "sd-rtnl.h"
65 #include "log.h"
66 #include "util.h"
67 #include "mkdir.h"
68 #include "macro.h"
69 #include "audit.h"
70 #include "missing.h"
71 #include "cgroup-util.h"
72 #include "strv.h"
73 #include "path-util.h"
74 #include "loopback-setup.h"
75 #include "dev-setup.h"
76 #include "fdset.h"
77 #include "build.h"
78 #include "fileio.h"
79 #include "bus-util.h"
80 #include "bus-error.h"
81 #include "ptyfwd.h"
82 #include "bus-kernel.h"
83 #include "env-util.h"
84 #include "def.h"
85 #include "rtnl-util.h"
86 #include "udev-util.h"
87 #include "blkid-util.h"
88 #include "gpt.h"
89 #include "siphash24.h"
90 #include "copy.h"
91 #include "base-filesystem.h"
92 #include "barrier.h"
93 #include "event-util.h"
94 #include "capability.h"
95 #include "cap-list.h"
96 #include "btrfs-util.h"
97 #include "machine-image.h"
98 #include "list.h"
99 #include "in-addr-util.h"
100 #include "fw-util.h"
101 #include "local-addresses.h"
102
103 #ifdef HAVE_SECCOMP
104 #include "seccomp-util.h"
105 #endif
106
107 typedef struct ExposePort {
108         int protocol;
109         uint16_t host_port;
110         uint16_t container_port;
111         LIST_FIELDS(struct ExposePort, ports);
112 } ExposePort;
113
/* Distinguishes a container that terminated from one that rebooted.
 * NOTE(review): the users of this type are outside this part of the file. */
typedef enum ContainerStatus {
        CONTAINER_TERMINATED = 0,
        CONTAINER_REBOOTED = 1
} ContainerStatus;
118
/* Modes selectable with --link-journal= (and -j). The "try-" variants of the
 * option map to the same values, with arg_link_journal_try set in addition. */
typedef enum LinkJournal {
        LINK_NO = 0,      /* "no" */
        LINK_AUTO = 1,    /* "auto", the default */
        LINK_HOST = 2,    /* "host" and "try-host" */
        LINK_GUEST = 3    /* "guest", "try-guest" and -j */
} LinkJournal;
125
/* Modes selectable with --volatile[=MODE] */
typedef enum Volatile {
        VOLATILE_NO = 0,     /* default, or an explicit false boolean */
        VOLATILE_YES = 1,    /* no argument given, or a true boolean */
        VOLATILE_STATE = 2   /* the literal argument "state" */
} Volatile;
131
132 static char *arg_directory = NULL;
133 static char *arg_template = NULL;
134 static char *arg_user = NULL;
135 static sd_id128_t arg_uuid = {};
136 static char *arg_machine = NULL;
137 static const char *arg_selinux_context = NULL;
138 static const char *arg_selinux_apifs_context = NULL;
139 static const char *arg_slice = NULL;
140 static bool arg_private_network = false;
141 static bool arg_read_only = false;
142 static bool arg_boot = false;
143 static bool arg_ephemeral = false;
144 static LinkJournal arg_link_journal = LINK_AUTO;
145 static bool arg_link_journal_try = false;
146 static uint64_t arg_retain =
147         (1ULL << CAP_CHOWN) |
148         (1ULL << CAP_DAC_OVERRIDE) |
149         (1ULL << CAP_DAC_READ_SEARCH) |
150         (1ULL << CAP_FOWNER) |
151         (1ULL << CAP_FSETID) |
152         (1ULL << CAP_IPC_OWNER) |
153         (1ULL << CAP_KILL) |
154         (1ULL << CAP_LEASE) |
155         (1ULL << CAP_LINUX_IMMUTABLE) |
156         (1ULL << CAP_NET_BIND_SERVICE) |
157         (1ULL << CAP_NET_BROADCAST) |
158         (1ULL << CAP_NET_RAW) |
159         (1ULL << CAP_SETGID) |
160         (1ULL << CAP_SETFCAP) |
161         (1ULL << CAP_SETPCAP) |
162         (1ULL << CAP_SETUID) |
163         (1ULL << CAP_SYS_ADMIN) |
164         (1ULL << CAP_SYS_CHROOT) |
165         (1ULL << CAP_SYS_NICE) |
166         (1ULL << CAP_SYS_PTRACE) |
167         (1ULL << CAP_SYS_TTY_CONFIG) |
168         (1ULL << CAP_SYS_RESOURCE) |
169         (1ULL << CAP_SYS_BOOT) |
170         (1ULL << CAP_AUDIT_WRITE) |
171         (1ULL << CAP_AUDIT_CONTROL) |
172         (1ULL << CAP_MKNOD);
173 static char **arg_bind = NULL;
174 static char **arg_bind_ro = NULL;
175 static char **arg_tmpfs = NULL;
176 static char **arg_setenv = NULL;
177 static bool arg_quiet = false;
178 static bool arg_share_system = false;
179 static bool arg_register = true;
180 static bool arg_keep_unit = false;
181 static char **arg_network_interfaces = NULL;
182 static char **arg_network_macvlan = NULL;
183 static char **arg_network_ipvlan = NULL;
184 static bool arg_network_veth = false;
185 static const char *arg_network_bridge = NULL;
186 static unsigned long arg_personality = 0xffffffffLU;
187 static char *arg_image = NULL;
188 static Volatile arg_volatile = VOLATILE_NO;
189 static ExposePort *arg_expose_ports = NULL;
190 static char **arg_property = NULL;
191 static uid_t arg_uid_shift = UID_INVALID, arg_uid_range = 0x10000U;
192 static bool arg_userns = false;
193
/* Prints the usage text to standard output. The only format argument is
 * program_invocation_short_name, which fills in the program name in the
 * first line.
 *
 * The text is kept in sync with the option table in parse_argv():
 *  - --tmpfs= is shown as PATH[:OPTIONS] because the parser accepts a bare
 *    path and then falls back to "mode=0755" for the options.
 *  - --personality= is listed because the parser accepts it. */
static void help(void) {
        printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
               "Spawn a minimal namespace container for debugging, testing and building.\n\n"
               "  -h --help                 Show this help\n"
               "     --version              Print version string\n"
               "  -q --quiet                Do not show status information\n"
               "  -D --directory=PATH       Root directory for the container\n"
               "     --template=PATH        Initialize root directory from template directory,\n"
               "                            if missing\n"
               "  -x --ephemeral            Run container with snapshot of root directory, and\n"
               "                            remove it after exit\n"
               "  -i --image=PATH           File system device or disk image for the container\n"
               "  -b --boot                 Boot up full system (i.e. invoke init)\n"
               "  -u --user=USER            Run the command under specified user or uid\n"
               "  -M --machine=NAME         Set the machine name for the container\n"
               "     --uuid=UUID            Set a specific machine UUID for the container\n"
               "  -S --slice=SLICE          Place the container in the specified slice\n"
               "     --property=NAME=VALUE  Set scope unit property\n"
               "     --private-network      Disable network in container\n"
               "     --network-interface=INTERFACE\n"
               "                            Assign an existing network interface to the\n"
               "                            container\n"
               "     --network-macvlan=INTERFACE\n"
               "                            Create a macvlan network interface based on an\n"
               "                            existing network interface to the container\n"
               "     --network-ipvlan=INTERFACE\n"
               "                            Create a ipvlan network interface based on an\n"
               "                            existing network interface to the container\n"
               "  -n --network-veth         Add a virtual ethernet connection between host\n"
               "                            and container\n"
               "     --network-bridge=INTERFACE\n"
               "                            Add a virtual ethernet connection between host\n"
               "                            and container and add it to an existing bridge on\n"
               "                            the host\n"
               "     --private-users[=UIDBASE[:NUIDS]]\n"
               "                            Run within user namespace\n"
               "  -p --port=[PROTOCOL:]HOSTPORT[:CONTAINERPORT]\n"
               "                            Expose a container IP port on the host\n"
               "  -Z --selinux-context=SECLABEL\n"
               "                            Set the SELinux security context to be used by\n"
               "                            processes in the container\n"
               "  -L --selinux-apifs-context=SECLABEL\n"
               "                            Set the SELinux security context to be used by\n"
               "                            API/tmpfs file systems in the container\n"
               "     --capability=CAP       In addition to the default, retain specified\n"
               "                            capability\n"
               "     --drop-capability=CAP  Drop the specified capability from the default set\n"
               "     --link-journal=MODE    Link up guest journal, one of no, auto, guest, host,\n"
               "                            try-guest, try-host\n"
               "  -j                        Equivalent to --link-journal=try-guest\n"
               "     --read-only            Mount the root directory read-only\n"
               "     --bind=PATH[:PATH]     Bind mount a file or directory from the host into\n"
               "                            the container\n"
               "     --bind-ro=PATH[:PATH]  Similar, but creates a read-only bind mount\n"
               "     --tmpfs=PATH[:OPTIONS] Mount an empty tmpfs to the specified directory\n"
               "     --setenv=NAME=VALUE    Pass an environment variable to PID 1\n"
               "     --share-system         Share system namespaces with host\n"
               "     --register=BOOLEAN     Register container as machine\n"
               "     --keep-unit            Do not register a scope for the machine, reuse\n"
               "                            the service unit nspawn is running in\n"
               "     --personality=ARCH     Pick personality for this container\n"
               "     --volatile[=MODE]      Run the system in volatile mode\n"
               , program_invocation_short_name);
}
257
/* Replaces the string stored in *b with a cleaned-up form of 'path'.
 *
 * The path is first resolved with canonicalize_file_name(). If that fails
 * because the path does not exist (ENOENT), the path is instead made absolute
 * relative to the current working directory. The result is passed through
 * path_kill_slashes() before being stored.
 *
 * On success the previous value of *b is freed and 0 is returned. On failure
 * *b is left untouched and a negative errno-style value is returned: -errno of
 * the failed canonicalization, or -ENOMEM if the fallback returned NULL. */
static int set_sanitized_path(char **b, const char *path) {
        char *resolved;

        assert(b);
        assert(path);

        resolved = canonicalize_file_name(path);

        /* Any failure other than "does not exist" is final */
        if (!resolved && errno != ENOENT)
                return -errno;

        if (!resolved) {
                /* Missing paths are still accepted, in absolute form */
                resolved = path_make_absolute_cwd(path);
                if (!resolved)
                        return -ENOMEM;
        }

        free(*b);
        *b = path_kill_slashes(resolved);

        return 0;
}
278
279 static int parse_argv(int argc, char *argv[]) {
280
281         enum {
282                 ARG_VERSION = 0x100,
283                 ARG_PRIVATE_NETWORK,
284                 ARG_UUID,
285                 ARG_READ_ONLY,
286                 ARG_CAPABILITY,
287                 ARG_DROP_CAPABILITY,
288                 ARG_LINK_JOURNAL,
289                 ARG_BIND,
290                 ARG_BIND_RO,
291                 ARG_TMPFS,
292                 ARG_SETENV,
293                 ARG_SHARE_SYSTEM,
294                 ARG_REGISTER,
295                 ARG_KEEP_UNIT,
296                 ARG_NETWORK_INTERFACE,
297                 ARG_NETWORK_MACVLAN,
298                 ARG_NETWORK_IPVLAN,
299                 ARG_NETWORK_BRIDGE,
300                 ARG_PERSONALITY,
301                 ARG_VOLATILE,
302                 ARG_TEMPLATE,
303                 ARG_PROPERTY,
304                 ARG_PRIVATE_USERS,
305         };
306
307         static const struct option options[] = {
308                 { "help",                  no_argument,       NULL, 'h'                   },
309                 { "version",               no_argument,       NULL, ARG_VERSION           },
310                 { "directory",             required_argument, NULL, 'D'                   },
311                 { "template",              required_argument, NULL, ARG_TEMPLATE          },
312                 { "ephemeral",             no_argument,       NULL, 'x'                   },
313                 { "user",                  required_argument, NULL, 'u'                   },
314                 { "private-network",       no_argument,       NULL, ARG_PRIVATE_NETWORK   },
315                 { "boot",                  no_argument,       NULL, 'b'                   },
316                 { "uuid",                  required_argument, NULL, ARG_UUID              },
317                 { "read-only",             no_argument,       NULL, ARG_READ_ONLY         },
318                 { "capability",            required_argument, NULL, ARG_CAPABILITY        },
319                 { "drop-capability",       required_argument, NULL, ARG_DROP_CAPABILITY   },
320                 { "link-journal",          required_argument, NULL, ARG_LINK_JOURNAL      },
321                 { "bind",                  required_argument, NULL, ARG_BIND              },
322                 { "bind-ro",               required_argument, NULL, ARG_BIND_RO           },
323                 { "tmpfs",                 required_argument, NULL, ARG_TMPFS             },
324                 { "machine",               required_argument, NULL, 'M'                   },
325                 { "slice",                 required_argument, NULL, 'S'                   },
326                 { "setenv",                required_argument, NULL, ARG_SETENV            },
327                 { "selinux-context",       required_argument, NULL, 'Z'                   },
328                 { "selinux-apifs-context", required_argument, NULL, 'L'                   },
329                 { "quiet",                 no_argument,       NULL, 'q'                   },
330                 { "share-system",          no_argument,       NULL, ARG_SHARE_SYSTEM      },
331                 { "register",              required_argument, NULL, ARG_REGISTER          },
332                 { "keep-unit",             no_argument,       NULL, ARG_KEEP_UNIT         },
333                 { "network-interface",     required_argument, NULL, ARG_NETWORK_INTERFACE },
334                 { "network-macvlan",       required_argument, NULL, ARG_NETWORK_MACVLAN   },
335                 { "network-ipvlan",        required_argument, NULL, ARG_NETWORK_IPVLAN    },
336                 { "network-veth",          no_argument,       NULL, 'n'                   },
337                 { "network-bridge",        required_argument, NULL, ARG_NETWORK_BRIDGE    },
338                 { "personality",           required_argument, NULL, ARG_PERSONALITY       },
339                 { "image",                 required_argument, NULL, 'i'                   },
340                 { "volatile",              optional_argument, NULL, ARG_VOLATILE          },
341                 { "port",                  required_argument, NULL, 'p'                   },
342                 { "property",              required_argument, NULL, ARG_PROPERTY          },
343                 { "private-users",         optional_argument, NULL, ARG_PRIVATE_USERS     },
344                 {}
345         };
346
347         int c, r;
348         uint64_t plus = 0, minus = 0;
349
350         assert(argc >= 0);
351         assert(argv);
352
353         while ((c = getopt_long(argc, argv, "+hD:u:bL:M:jS:Z:qi:xp:n", options, NULL)) >= 0)
354
355                 switch (c) {
356
357                 case 'h':
358                         help();
359                         return 0;
360
361                 case ARG_VERSION:
362                         puts(PACKAGE_STRING);
363                         puts(SYSTEMD_FEATURES);
364                         return 0;
365
366                 case 'D':
367                         r = set_sanitized_path(&arg_directory, optarg);
368                         if (r < 0)
369                                 return log_error_errno(r, "Invalid root directory: %m");
370
371                         break;
372
373                 case ARG_TEMPLATE:
374                         r = set_sanitized_path(&arg_template, optarg);
375                         if (r < 0)
376                                 return log_error_errno(r, "Invalid template directory: %m");
377
378                         break;
379
380                 case 'i':
381                         r = set_sanitized_path(&arg_image, optarg);
382                         if (r < 0)
383                                 return log_error_errno(r, "Invalid image path: %m");
384
385                         break;
386
387                 case 'x':
388                         arg_ephemeral = true;
389                         break;
390
391                 case 'u':
392                         free(arg_user);
393                         arg_user = strdup(optarg);
394                         if (!arg_user)
395                                 return log_oom();
396
397                         break;
398
399                 case ARG_NETWORK_BRIDGE:
400                         arg_network_bridge = optarg;
401
402                         /* fall through */
403
404                 case 'n':
405                         arg_network_veth = true;
406                         arg_private_network = true;
407                         break;
408
409                 case ARG_NETWORK_INTERFACE:
410                         if (strv_extend(&arg_network_interfaces, optarg) < 0)
411                                 return log_oom();
412
413                         arg_private_network = true;
414                         break;
415
416                 case ARG_NETWORK_MACVLAN:
417                         if (strv_extend(&arg_network_macvlan, optarg) < 0)
418                                 return log_oom();
419
420                         arg_private_network = true;
421                         break;
422
423                 case ARG_NETWORK_IPVLAN:
424                         if (strv_extend(&arg_network_ipvlan, optarg) < 0)
425                                 return log_oom();
426
427                         /* fall through */
428
429                 case ARG_PRIVATE_NETWORK:
430                         arg_private_network = true;
431                         break;
432
433                 case 'b':
434                         arg_boot = true;
435                         break;
436
437                 case ARG_UUID:
438                         r = sd_id128_from_string(optarg, &arg_uuid);
439                         if (r < 0) {
440                                 log_error("Invalid UUID: %s", optarg);
441                                 return r;
442                         }
443                         break;
444
445                 case 'S':
446                         arg_slice = optarg;
447                         break;
448
449                 case 'M':
450                         if (isempty(optarg)) {
451                                 free(arg_machine);
452                                 arg_machine = NULL;
453                         } else {
454                                 if (!machine_name_is_valid(optarg)) {
455                                         log_error("Invalid machine name: %s", optarg);
456                                         return -EINVAL;
457                                 }
458
459                                 r = free_and_strdup(&arg_machine, optarg);
460                                 if (r < 0)
461                                         return log_oom();
462
463                                 break;
464                         }
465
466                 case 'Z':
467                         arg_selinux_context = optarg;
468                         break;
469
470                 case 'L':
471                         arg_selinux_apifs_context = optarg;
472                         break;
473
474                 case ARG_READ_ONLY:
475                         arg_read_only = true;
476                         break;
477
478                 case ARG_CAPABILITY:
479                 case ARG_DROP_CAPABILITY: {
480                         const char *state, *word;
481                         size_t length;
482
483                         FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
484                                 _cleanup_free_ char *t;
485
486                                 t = strndup(word, length);
487                                 if (!t)
488                                         return log_oom();
489
490                                 if (streq(t, "all")) {
491                                         if (c == ARG_CAPABILITY)
492                                                 plus = (uint64_t) -1;
493                                         else
494                                                 minus = (uint64_t) -1;
495                                 } else {
496                                         int cap;
497
498                                         cap = capability_from_name(t);
499                                         if (cap < 0) {
500                                                 log_error("Failed to parse capability %s.", t);
501                                                 return -EINVAL;
502                                         }
503
504                                         if (c == ARG_CAPABILITY)
505                                                 plus |= 1ULL << (uint64_t) cap;
506                                         else
507                                                 minus |= 1ULL << (uint64_t) cap;
508                                 }
509                         }
510
511                         break;
512                 }
513
514                 case 'j':
515                         arg_link_journal = LINK_GUEST;
516                         arg_link_journal_try = true;
517                         break;
518
519                 case ARG_LINK_JOURNAL:
520                         if (streq(optarg, "auto")) {
521                                 arg_link_journal = LINK_AUTO;
522                                 arg_link_journal_try = false;
523                         } else if (streq(optarg, "no")) {
524                                 arg_link_journal = LINK_NO;
525                                 arg_link_journal_try = false;
526                         } else if (streq(optarg, "guest")) {
527                                 arg_link_journal = LINK_GUEST;
528                                 arg_link_journal_try = false;
529                         } else if (streq(optarg, "host")) {
530                                 arg_link_journal = LINK_HOST;
531                                 arg_link_journal_try = false;
532                         } else if (streq(optarg, "try-guest")) {
533                                 arg_link_journal = LINK_GUEST;
534                                 arg_link_journal_try = true;
535                         } else if (streq(optarg, "try-host")) {
536                                 arg_link_journal = LINK_HOST;
537                                 arg_link_journal_try = true;
538                         } else {
539                                 log_error("Failed to parse link journal mode %s", optarg);
540                                 return -EINVAL;
541                         }
542
543                         break;
544
545                 case ARG_BIND:
546                 case ARG_BIND_RO: {
547                         _cleanup_free_ char *a = NULL, *b = NULL;
548                         char *e;
549                         char ***x;
550
551                         x = c == ARG_BIND ? &arg_bind : &arg_bind_ro;
552
553                         e = strchr(optarg, ':');
554                         if (e) {
555                                 a = strndup(optarg, e - optarg);
556                                 b = strdup(e + 1);
557                         } else {
558                                 a = strdup(optarg);
559                                 b = strdup(optarg);
560                         }
561
562                         if (!a || !b)
563                                 return log_oom();
564
565                         if (!path_is_absolute(a) || !path_is_absolute(b)) {
566                                 log_error("Invalid bind mount specification: %s", optarg);
567                                 return -EINVAL;
568                         }
569
570                         r = strv_extend(x, a);
571                         if (r < 0)
572                                 return log_oom();
573
574                         r = strv_extend(x, b);
575                         if (r < 0)
576                                 return log_oom();
577
578                         break;
579                 }
580
581                 case ARG_TMPFS: {
582                         _cleanup_free_ char *a = NULL, *b = NULL;
583                         char *e;
584
585                         e = strchr(optarg, ':');
586                         if (e) {
587                                 a = strndup(optarg, e - optarg);
588                                 b = strdup(e + 1);
589                         } else {
590                                 a = strdup(optarg);
591                                 b = strdup("mode=0755");
592                         }
593
594                         if (!a || !b)
595                                 return log_oom();
596
597                         if (!path_is_absolute(a)) {
598                                 log_error("Invalid tmpfs specification: %s", optarg);
599                                 return -EINVAL;
600                         }
601
602                         r = strv_push(&arg_tmpfs, a);
603                         if (r < 0)
604                                 return log_oom();
605
606                         a = NULL;
607
608                         r = strv_push(&arg_tmpfs, b);
609                         if (r < 0)
610                                 return log_oom();
611
612                         b = NULL;
613
614                         break;
615                 }
616
617                 case ARG_SETENV: {
618                         char **n;
619
620                         if (!env_assignment_is_valid(optarg)) {
621                                 log_error("Environment variable assignment '%s' is not valid.", optarg);
622                                 return -EINVAL;
623                         }
624
625                         n = strv_env_set(arg_setenv, optarg);
626                         if (!n)
627                                 return log_oom();
628
629                         strv_free(arg_setenv);
630                         arg_setenv = n;
631                         break;
632                 }
633
634                 case 'q':
635                         arg_quiet = true;
636                         break;
637
638                 case ARG_SHARE_SYSTEM:
639                         arg_share_system = true;
640                         break;
641
642                 case ARG_REGISTER:
643                         r = parse_boolean(optarg);
644                         if (r < 0) {
645                                 log_error("Failed to parse --register= argument: %s", optarg);
646                                 return r;
647                         }
648
649                         arg_register = r;
650                         break;
651
652                 case ARG_KEEP_UNIT:
653                         arg_keep_unit = true;
654                         break;
655
656                 case ARG_PERSONALITY:
657
658                         arg_personality = personality_from_string(optarg);
659                         if (arg_personality == 0xffffffffLU) {
660                                 log_error("Unknown or unsupported personality '%s'.", optarg);
661                                 return -EINVAL;
662                         }
663
664                         break;
665
666                 case ARG_VOLATILE:
667
668                         if (!optarg)
669                                 arg_volatile = VOLATILE_YES;
670                         else {
671                                 r = parse_boolean(optarg);
672                                 if (r < 0) {
673                                         if (streq(optarg, "state"))
674                                                 arg_volatile = VOLATILE_STATE;
675                                         else {
676                                                 log_error("Failed to parse --volatile= argument: %s", optarg);
677                                                 return r;
678                                         }
679                                 } else
680                                         arg_volatile = r ? VOLATILE_YES : VOLATILE_NO;
681                         }
682
683                         break;
684
685                 case 'p': {
686                         const char *split, *e;
687                         uint16_t container_port, host_port;
688                         int protocol;
689                         ExposePort *p;
690
691                         if ((e = startswith(optarg, "tcp:")))
692                                 protocol = IPPROTO_TCP;
693                         else if ((e = startswith(optarg, "udp:")))
694                                 protocol = IPPROTO_UDP;
695                         else {
696                                 e = optarg;
697                                 protocol = IPPROTO_TCP;
698                         }
699
700                         split = strchr(e, ':');
701                         if (split) {
702                                 char v[split - e + 1];
703
704                                 memcpy(v, e, split - e);
705                                 v[split - e] = 0;
706
707                                 r = safe_atou16(v, &host_port);
708                                 if (r < 0 || host_port <= 0) {
709                                         log_error("Failed to parse host port: %s", optarg);
710                                         return -EINVAL;
711                                 }
712
713                                 r = safe_atou16(split + 1, &container_port);
714                         } else {
715                                 r = safe_atou16(e, &container_port);
716                                 host_port = container_port;
717                         }
718
719                         if (r < 0 || container_port <= 0) {
720                                 log_error("Failed to parse host port: %s", optarg);
721                                 return -EINVAL;
722                         }
723
724                         LIST_FOREACH(ports, p, arg_expose_ports) {
725                                 if (p->protocol == protocol && p->host_port == host_port) {
726                                         log_error("Duplicate port specification: %s", optarg);
727                                         return -EINVAL;
728                                 }
729                         }
730
731                         p = new(ExposePort, 1);
732                         if (!p)
733                                 return log_oom();
734
735                         p->protocol = protocol;
736                         p->host_port = host_port;
737                         p->container_port = container_port;
738
739                         LIST_PREPEND(ports, arg_expose_ports, p);
740
741                         break;
742                 }
743
744                 case ARG_PROPERTY:
745                         if (strv_extend(&arg_property, optarg) < 0)
746                                 return log_oom();
747
748                         break;
749
750                 case ARG_PRIVATE_USERS:
751                         if (optarg) {
752                                 _cleanup_free_ char *buffer = NULL;
753                                 const char *range, *shift;
754
755                                 range = strchr(optarg, ':');
756                                 if (range) {
757                                         buffer = strndup(optarg, range - optarg);
758                                         if (!buffer)
759                                                 return log_oom();
760                                         shift = buffer;
761
762                                         range++;
763                                         if (safe_atou32(range, &arg_uid_range) < 0 || arg_uid_range <= 0) {
764                                                 log_error("Failed to parse UID range: %s", range);
765                                                 return -EINVAL;
766                                         }
767                                 } else
768                                         shift = optarg;
769
770                                 if (parse_uid(shift, &arg_uid_shift) < 0) {
771                                         log_error("Failed to parse UID: %s", optarg);
772                                         return -EINVAL;
773                                 }
774                         }
775
776                         arg_userns = true;
777                         break;
778
779                 case '?':
780                         return -EINVAL;
781
782                 default:
783                         assert_not_reached("Unhandled option");
784                 }
785
786         if (arg_share_system)
787                 arg_register = false;
788
789         if (arg_boot && arg_share_system) {
790                 log_error("--boot and --share-system may not be combined.");
791                 return -EINVAL;
792         }
793
794         if (arg_keep_unit && cg_pid_get_owner_uid(0, NULL) >= 0) {
795                 log_error("--keep-unit may not be used when invoked from a user session.");
796                 return -EINVAL;
797         }
798
799         if (arg_directory && arg_image) {
800                 log_error("--directory= and --image= may not be combined.");
801                 return -EINVAL;
802         }
803
804         if (arg_template && arg_image) {
805                 log_error("--template= and --image= may not be combined.");
806                 return -EINVAL;
807         }
808
809         if (arg_template && !(arg_directory || arg_machine)) {
810                 log_error("--template= needs --directory= or --machine=.");
811                 return -EINVAL;
812         }
813
814         if (arg_ephemeral && arg_template) {
815                 log_error("--ephemeral and --template= may not be combined.");
816                 return -EINVAL;
817         }
818
819         if (arg_ephemeral && arg_image) {
820                 log_error("--ephemeral and --image= may not be combined.");
821                 return -EINVAL;
822         }
823
824         if (arg_ephemeral && !IN_SET(arg_link_journal, LINK_NO, LINK_AUTO)) {
825                 log_error("--ephemeral and --link-journal= may not be combined.");
826                 return -EINVAL;
827         }
828
829         if (arg_volatile != VOLATILE_NO && arg_read_only) {
830                 log_error("Cannot combine --read-only with --volatile. Note that --volatile already implies a read-only base hierarchy.");
831                 return -EINVAL;
832         }
833
834         if (arg_expose_ports && !arg_private_network) {
835                 log_error("Cannot use --port= without private networking.");
836                 return -EINVAL;
837         }
838
839         arg_retain = (arg_retain | plus | (arg_private_network ? 1ULL << CAP_NET_ADMIN : 0)) & ~minus;
840
841         return 1;
842 }
843
844 static int mount_all(const char *dest) {
845
846         typedef struct MountPoint {
847                 const char *what;
848                 const char *where;
849                 const char *type;
850                 const char *options;
851                 unsigned long flags;
852                 bool fatal;
853         } MountPoint;
854
855         static const MountPoint mount_table[] = {
856                 { "proc",      "/proc",     "proc",  NULL,        MS_NOSUID|MS_NOEXEC|MS_NODEV,           true  },
857                 { "/proc/sys", "/proc/sys", NULL,    NULL,        MS_BIND,                                true  },   /* Bind mount first */
858                 { NULL,        "/proc/sys", NULL,    NULL,        MS_BIND|MS_RDONLY|MS_REMOUNT,           true  },   /* Then, make it r/o */
859                 { "sysfs",     "/sys",      "sysfs", NULL,        MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true  },
860                 { "tmpfs",     "/dev",      "tmpfs", "mode=755",  MS_NOSUID|MS_STRICTATIME,               true  },
861                 { "devpts",    "/dev/pts",  "devpts","newinstance,ptmxmode=0666,mode=620,gid=" STRINGIFY(TTY_GID), MS_NOSUID|MS_NOEXEC, true },
862                 { "tmpfs",     "/dev/shm",  "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME,      true  },
863                 { "tmpfs",     "/run",      "tmpfs", "mode=755",  MS_NOSUID|MS_NODEV|MS_STRICTATIME,      true  },
864                 { "tmpfs",     "/tmp",      "tmpfs", "mode=1777", MS_STRICTATIME,                         true  },
865 #ifdef HAVE_SELINUX
866                 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND,                              false },  /* Bind mount first */
867                 { NULL,              "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT,         false },  /* Then, make it r/o */
868 #endif
869         };
870
871         unsigned k;
872         int r = 0;
873
874         for (k = 0; k < ELEMENTSOF(mount_table); k++) {
875                 _cleanup_free_ char *where = NULL;
876 #ifdef HAVE_SELINUX
877                 _cleanup_free_ char *options = NULL;
878 #endif
879                 const char *o;
880                 int t;
881
882                 where = strjoin(dest, "/", mount_table[k].where, NULL);
883                 if (!where)
884                         return log_oom();
885
886                 t = path_is_mount_point(where, true);
887                 if (t < 0) {
888                         log_error_errno(t, "Failed to detect whether %s is a mount point: %m", where);
889
890                         if (r == 0)
891                                 r = t;
892
893                         continue;
894                 }
895
896                 /* Skip this entry if it is not a remount. */
897                 if (mount_table[k].what && t > 0)
898                         continue;
899
900                 t = mkdir_p(where, 0755);
901                 if (t < 0) {
902                         if (mount_table[k].fatal) {
903                                log_error_errno(t, "Failed to create directory %s: %m", where);
904
905                                 if (r == 0)
906                                         r = t;
907                         } else
908                                log_warning_errno(t, "Failed to create directory %s: %m", where);
909
910                         continue;
911                 }
912
913 #ifdef HAVE_SELINUX
914                 if (arg_selinux_apifs_context &&
915                     (streq_ptr(mount_table[k].what, "tmpfs") || streq_ptr(mount_table[k].what, "devpts"))) {
916                         options = strjoin(mount_table[k].options, ",context=\"", arg_selinux_apifs_context, "\"", NULL);
917                         if (!options)
918                                 return log_oom();
919
920                         o = options;
921                 } else
922 #endif
923                         o = mount_table[k].options;
924
925                 if (arg_userns && arg_uid_shift != UID_INVALID && streq_ptr(mount_table[k].type, "tmpfs")) {
926                         char *uid_options = NULL;
927
928                         if (o)
929                                 asprintf(&uid_options, "%s,uid=" UID_FMT ",gid=" UID_FMT, o, arg_uid_shift, arg_uid_shift);
930                         else
931                                 asprintf(&uid_options, "uid=" UID_FMT ",gid=" UID_FMT, arg_uid_shift, arg_uid_shift);
932                         if (!uid_options)
933                                 return log_oom();
934
935                         free(options);
936                         o = options = uid_options;
937                 }
938
939                 if (mount(mount_table[k].what,
940                           where,
941                           mount_table[k].type,
942                           mount_table[k].flags,
943                           o) < 0) {
944
945                         if (mount_table[k].fatal) {
946                                 log_error_errno(errno, "mount(%s) failed: %m", where);
947
948                                 if (r == 0)
949                                         r = -errno;
950                         } else
951                                 log_warning_errno(errno, "mount(%s) failed: %m", where);
952                 }
953         }
954
955         return r;
956 }
957
958 static int mount_binds(const char *dest, char **l, bool ro) {
959         char **x, **y;
960
961         STRV_FOREACH_PAIR(x, y, l) {
962                 _cleanup_free_ char *where = NULL;
963                 struct stat source_st, dest_st;
964                 int r;
965
966                 if (stat(*x, &source_st) < 0)
967                         return log_error_errno(errno, "Failed to stat %s: %m", *x);
968
969                 where = strappend(dest, *y);
970                 if (!where)
971                         return log_oom();
972
973                 r = stat(where, &dest_st);
974                 if (r == 0) {
975                         if (S_ISDIR(source_st.st_mode) && !S_ISDIR(dest_st.st_mode)) {
976                                 log_error("Cannot bind mount directory %s on file %s.", *x, where);
977                                 return -EINVAL;
978                         }
979                         if (!S_ISDIR(source_st.st_mode) && S_ISDIR(dest_st.st_mode)) {
980                                 log_error("Cannot bind mount file %s on directory %s.", *x, where);
981                                 return -EINVAL;
982                         }
983                 } else if (errno == ENOENT) {
984                         r = mkdir_parents_label(where, 0755);
985                         if (r < 0)
986                                 return log_error_errno(r, "Failed to bind mount %s: %m", *x);
987                 } else {
988                         log_error_errno(errno, "Failed to bind mount %s: %m", *x);
989                         return -errno;
990                 }
991
992                 /* Create the mount point. Any non-directory file can be
993                  * mounted on any non-directory file (regular, fifo, socket,
994                  * char, block).
995                  */
996                 if (S_ISDIR(source_st.st_mode)) {
997                         r = mkdir_label(where, 0755);
998                         if (r < 0 && errno != EEXIST)
999                                 return log_error_errno(r, "Failed to create mount point %s: %m", where);
1000                 } else {
1001                         r = touch(where);
1002                         if (r < 0)
1003                                 return log_error_errno(r, "Failed to create mount point %s: %m", where);
1004                 }
1005
1006                 if (mount(*x, where, "bind", MS_BIND, NULL) < 0)
1007                         return log_error_errno(errno, "mount(%s) failed: %m", where);
1008
1009                 if (ro) {
1010                         r = bind_remount_recursive(where, true);
1011                         if (r < 0)
1012                                 return log_error_errno(r, "Read-Only bind mount failed: %m");
1013                 }
1014         }
1015
1016         return 0;
1017 }
1018
1019 static int mount_cgroup_hierarchy(const char *dest, const char *controller, const char *hierarchy, bool read_only) {
1020         char *to;
1021         int r;
1022
1023         to = strjoina(dest, "/sys/fs/cgroup/", hierarchy);
1024
1025         r = path_is_mount_point(to, false);
1026         if (r < 0)
1027                 return log_error_errno(r, "Failed to determine if %s is mounted already: %m", to);
1028         if (r > 0)
1029                 return 0;
1030
1031         mkdir_p(to, 0755);
1032
1033         /* The superblock mount options of the mount point need to be
1034          * identical to the hosts', and hence writable... */
1035         if (mount("cgroup", to, "cgroup", MS_NOSUID|MS_NOEXEC|MS_NODEV, controller) < 0)
1036                 return log_error_errno(errno, "Failed to mount to %s: %m", to);
1037
1038         /* ... hence let's only make the bind mount read-only, not the
1039          * superblock. */
1040         if (read_only) {
1041                 if (mount(NULL, to, NULL, MS_BIND|MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_RDONLY, NULL) < 0)
1042                         return log_error_errno(errno, "Failed to remount %s read-only: %m", to);
1043         }
1044         return 1;
1045 }
1046
1047 static int mount_cgroup(const char *dest) {
1048         _cleanup_set_free_free_ Set *controllers = NULL;
1049         _cleanup_free_ char *own_cgroup_path = NULL;
1050         const char *cgroup_root, *systemd_root, *systemd_own;
1051         int r;
1052
1053         controllers = set_new(&string_hash_ops);
1054         if (!controllers)
1055                 return log_oom();
1056
1057         r = cg_kernel_controllers(controllers);
1058         if (r < 0)
1059                 return log_error_errno(r, "Failed to determine cgroup controllers: %m");
1060
1061         r = cg_pid_get_path(NULL, 0, &own_cgroup_path);
1062         if (r < 0)
1063                 return log_error_errno(r, "Failed to determine our own cgroup path: %m");
1064
1065         cgroup_root = strjoina(dest, "/sys/fs/cgroup");
1066         if (mount("tmpfs", cgroup_root, "tmpfs", MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME, "mode=755") < 0)
1067                 return log_error_errno(errno, "Failed to mount tmpfs to /sys/fs/cgroup: %m");
1068
1069         for (;;) {
1070                 _cleanup_free_ char *controller = NULL, *origin = NULL, *combined = NULL;
1071
1072                 controller = set_steal_first(controllers);
1073                 if (!controller)
1074                         break;
1075
1076                 origin = strappend("/sys/fs/cgroup/", controller);
1077                 if (!origin)
1078                         return log_oom();
1079
1080                 r = readlink_malloc(origin, &combined);
1081                 if (r == -EINVAL) {
1082                         /* Not a symbolic link, but directly a single cgroup hierarchy */
1083
1084                         r = mount_cgroup_hierarchy(dest, controller, controller, true);
1085                         if (r < 0)
1086                                 return r;
1087
1088                 } else if (r < 0)
1089                         return log_error_errno(r, "Failed to read link %s: %m", origin);
1090                 else {
1091                         _cleanup_free_ char *target = NULL;
1092
1093                         target = strjoin(dest, "/sys/fs/cgroup/", controller, NULL);
1094                         if (!target)
1095                                 return log_oom();
1096
1097                         /* A symbolic link, a combination of controllers in one hierarchy */
1098
1099                         if (!filename_is_valid(combined)) {
1100                                 log_warning("Ignoring invalid combined hierarchy %s.", combined);
1101                                 continue;
1102                         }
1103
1104                         r = mount_cgroup_hierarchy(dest, combined, combined, true);
1105                         if (r < 0)
1106                                 return r;
1107
1108                         if (symlink(combined, target) < 0)
1109                                 return log_error_errno(errno, "Failed to create symlink for combined hierarchy: %m");
1110                 }
1111         }
1112
1113         r = mount_cgroup_hierarchy(dest, "name=systemd,xattr", "systemd", false);
1114         if (r < 0)
1115                 return r;
1116
1117         /* Make our own cgroup a (writable) bind mount */
1118         systemd_own = strjoina(dest, "/sys/fs/cgroup/systemd", own_cgroup_path);
1119         if (mount(systemd_own, systemd_own,  NULL, MS_BIND, NULL) < 0)
1120                 return log_error_errno(errno, "Failed to turn %s into a bind mount: %m", own_cgroup_path);
1121
1122         /* And then remount the systemd cgroup root read-only */
1123         systemd_root = strjoina(dest, "/sys/fs/cgroup/systemd");
1124         if (mount(NULL, systemd_root, NULL, MS_BIND|MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_RDONLY, NULL) < 0)
1125                 return log_error_errno(errno, "Failed to mount cgroup root read-only: %m");
1126
1127         if (mount(NULL, cgroup_root, NULL, MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME|MS_RDONLY, "mode=755") < 0)
1128                 return log_error_errno(errno, "Failed to remount %s read-only: %m", cgroup_root);
1129
1130         return 0;
1131 }
1132
1133 static int mount_tmpfs(const char *dest) {
1134         char **i, **o;
1135
1136         STRV_FOREACH_PAIR(i, o, arg_tmpfs) {
1137                 _cleanup_free_ char *where = NULL;
1138                 int r;
1139
1140                 where = strappend(dest, *i);
1141                 if (!where)
1142                         return log_oom();
1143
1144                 r = mkdir_label(where, 0755);
1145                 if (r < 0 && r != -EEXIST)
1146                         return log_error_errno(r, "Creating mount point for tmpfs %s failed: %m", where);
1147
1148                 if (mount("tmpfs", where, "tmpfs", MS_NODEV|MS_STRICTATIME, *o) < 0)
1149                         return log_error_errno(errno, "tmpfs mount to %s failed: %m", where);
1150         }
1151
1152         return 0;
1153 }
1154
1155 static int setup_timezone(const char *dest) {
1156         _cleanup_free_ char *where = NULL, *p = NULL, *q = NULL, *check = NULL, *what = NULL;
1157         char *z, *y;
1158         int r;
1159
1160         assert(dest);
1161
1162         /* Fix the timezone, if possible */
1163         r = readlink_malloc("/etc/localtime", &p);
1164         if (r < 0) {
1165                 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
1166                 return 0;
1167         }
1168
1169         z = path_startswith(p, "../usr/share/zoneinfo/");
1170         if (!z)
1171                 z = path_startswith(p, "/usr/share/zoneinfo/");
1172         if (!z) {
1173                 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
1174                 return 0;
1175         }
1176
1177         where = strappend(dest, "/etc/localtime");
1178         if (!where)
1179                 return log_oom();
1180
1181         r = readlink_malloc(where, &q);
1182         if (r >= 0) {
1183                 y = path_startswith(q, "../usr/share/zoneinfo/");
1184                 if (!y)
1185                         y = path_startswith(q, "/usr/share/zoneinfo/");
1186
1187                 /* Already pointing to the right place? Then do nothing .. */
1188                 if (y && streq(y, z))
1189                         return 0;
1190         }
1191
1192         check = strjoin(dest, "/usr/share/zoneinfo/", z, NULL);
1193         if (!check)
1194                 return log_oom();
1195
1196         if (access(check, F_OK) < 0) {
1197                 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
1198                 return 0;
1199         }
1200
1201         what = strappend("../usr/share/zoneinfo/", z);
1202         if (!what)
1203                 return log_oom();
1204
1205         r = mkdir_parents(where, 0755);
1206         if (r < 0) {
1207                 log_error_errno(r, "Failed to create directory for timezone info %s in container: %m", where);
1208
1209                 return 0;
1210         }
1211
1212         r = unlink(where);
1213         if (r < 0 && errno != ENOENT) {
1214                 log_error_errno(errno, "Failed to remove existing timezone info %s in container: %m", where);
1215
1216                 return 0;
1217         }
1218
1219         if (symlink(what, where) < 0) {
1220                 log_error_errno(errno, "Failed to correct timezone of container: %m");
1221                 return 0;
1222         }
1223
1224         return 0;
1225 }
1226
1227 static int setup_resolv_conf(const char *dest) {
1228         _cleanup_free_ char *where = NULL;
1229         int r;
1230
1231         assert(dest);
1232
1233         if (arg_private_network)
1234                 return 0;
1235
1236         /* Fix resolv.conf, if possible */
1237         where = strappend(dest, "/etc/resolv.conf");
1238         if (!where)
1239                 return log_oom();
1240
1241         /* We don't really care for the results of this really. If it
1242          * fails, it fails, but meh... */
1243         r = mkdir_parents(where, 0755);
1244         if (r < 0) {
1245                 log_warning_errno(r, "Failed to create parent directory for resolv.conf %s: %m", where);
1246
1247                 return 0;
1248         }
1249
1250         r = copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW, 0644, 0);
1251         if (r < 0) {
1252                 log_warning_errno(r, "Failed to copy /etc/resolv.conf to %s: %m", where);
1253
1254                 return 0;
1255         }
1256
1257         return 0;
1258 }
1259
1260 static int setup_volatile_state(const char *directory) {
1261         const char *p;
1262         int r;
1263
1264         assert(directory);
1265
1266         if (arg_volatile != VOLATILE_STATE)
1267                 return 0;
1268
1269         /* --volatile=state means we simply overmount /var
1270            with a tmpfs, and the rest read-only. */
1271
1272         r = bind_remount_recursive(directory, true);
1273         if (r < 0)
1274                 return log_error_errno(r, "Failed to remount %s read-only: %m", directory);
1275
1276         p = strjoina(directory, "/var");
1277         r = mkdir(p, 0755);
1278         if (r < 0 && errno != EEXIST)
1279                 return log_error_errno(errno, "Failed to create %s: %m", directory);
1280
1281         if (mount("tmpfs", p, "tmpfs", MS_STRICTATIME, "mode=755") < 0)
1282                 return log_error_errno(errno, "Failed to mount tmpfs to /var: %m");
1283
1284         return 0;
1285 }
1286
1287 static int setup_volatile(const char *directory) {
1288         bool tmpfs_mounted = false, bind_mounted = false;
1289         char template[] = "/tmp/nspawn-volatile-XXXXXX";
1290         const char *f, *t;
1291         int r;
1292
1293         assert(directory);
1294
1295         if (arg_volatile != VOLATILE_YES)
1296                 return 0;
1297
1298         /* --volatile=yes means we mount a tmpfs to the root dir, and
1299            the original /usr to use inside it, and that read-only. */
1300
1301         if (!mkdtemp(template))
1302                 return log_error_errno(errno, "Failed to create temporary directory: %m");
1303
1304         if (mount("tmpfs", template, "tmpfs", MS_STRICTATIME, "mode=755") < 0) {
1305                 log_error_errno(errno, "Failed to mount tmpfs for root directory: %m");
1306                 r = -errno;
1307                 goto fail;
1308         }
1309
1310         tmpfs_mounted = true;
1311
1312         f = strjoina(directory, "/usr");
1313         t = strjoina(template, "/usr");
1314
1315         r = mkdir(t, 0755);
1316         if (r < 0 && errno != EEXIST) {
1317                 log_error_errno(errno, "Failed to create %s: %m", t);
1318                 r = -errno;
1319                 goto fail;
1320         }
1321
1322         if (mount(f, t, "bind", MS_BIND|MS_REC, NULL) < 0) {
1323                 log_error_errno(errno, "Failed to create /usr bind mount: %m");
1324                 r = -errno;
1325                 goto fail;
1326         }
1327
1328         bind_mounted = true;
1329
1330         r = bind_remount_recursive(t, true);
1331         if (r < 0) {
1332                 log_error_errno(r, "Failed to remount %s read-only: %m", t);
1333                 goto fail;
1334         }
1335
1336         if (mount(template, directory, NULL, MS_MOVE, NULL) < 0) {
1337                 log_error_errno(errno, "Failed to move root mount: %m");
1338                 r = -errno;
1339                 goto fail;
1340         }
1341
1342         rmdir(template);
1343
1344         return 0;
1345
1346 fail:
1347         if (bind_mounted)
1348                 umount(t);
1349         if (tmpfs_mounted)
1350                 umount(template);
1351         rmdir(template);
1352         return r;
1353 }
1354
1355 static char* id128_format_as_uuid(sd_id128_t id, char s[37]) {
1356
1357         snprintf(s, 37,
1358                  "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
1359                  SD_ID128_FORMAT_VAL(id));
1360
1361         return s;
1362 }
1363
1364 static int setup_boot_id(const char *dest) {
1365         _cleanup_free_ char *from = NULL, *to = NULL;
1366         sd_id128_t rnd = {};
1367         char as_uuid[37];
1368         int r;
1369
1370         assert(dest);
1371
1372         if (arg_share_system)
1373                 return 0;
1374
1375         /* Generate a new randomized boot ID, so that each boot-up of
1376          * the container gets a new one */
1377
1378         from = strappend(dest, "/dev/proc-sys-kernel-random-boot-id");
1379         to = strappend(dest, "/proc/sys/kernel/random/boot_id");
1380         if (!from || !to)
1381                 return log_oom();
1382
1383         r = sd_id128_randomize(&rnd);
1384         if (r < 0)
1385                 return log_error_errno(r, "Failed to generate random boot id: %m");
1386
1387         id128_format_as_uuid(rnd, as_uuid);
1388
1389         r = write_string_file(from, as_uuid);
1390         if (r < 0)
1391                 return log_error_errno(r, "Failed to write boot id: %m");
1392
1393         if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
1394                 log_error_errno(errno, "Failed to bind mount boot id: %m");
1395                 r = -errno;
1396         } else if (mount(from, to, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL))
1397                 log_warning_errno(errno, "Failed to make boot id read-only: %m");
1398
1399         unlink(from);
1400         return r;
1401 }
1402
1403 static int copy_devnodes(const char *dest) {
1404
1405         static const char devnodes[] =
1406                 "null\0"
1407                 "zero\0"
1408                 "full\0"
1409                 "random\0"
1410                 "urandom\0"
1411                 "tty\0"
1412                 "net/tun\0";
1413
1414         const char *d;
1415         int r = 0;
1416         _cleanup_umask_ mode_t u;
1417
1418         assert(dest);
1419
1420         u = umask(0000);
1421
1422         NULSTR_FOREACH(d, devnodes) {
1423                 _cleanup_free_ char *from = NULL, *to = NULL;
1424                 struct stat st;
1425
1426                 from = strappend("/dev/", d);
1427                 to = strjoin(dest, "/dev/", d, NULL);
1428                 if (!from || !to)
1429                         return log_oom();
1430
1431                 if (stat(from, &st) < 0) {
1432
1433                         if (errno != ENOENT)
1434                                 return log_error_errno(errno, "Failed to stat %s: %m", from);
1435
1436                 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
1437
1438                         log_error("%s is not a char or block device, cannot copy", from);
1439                         return -EIO;
1440
1441                 } else {
1442                         r = mkdir_parents(to, 0775);
1443                         if (r < 0) {
1444                                 log_error_errno(r, "Failed to create parent directory of %s: %m", to);
1445                                 return -r;
1446                         }
1447
1448                         if (mknod(to, st.st_mode, st.st_rdev) < 0)
1449                                 return log_error_errno(errno, "mknod(%s) failed: %m", to);
1450                 }
1451         }
1452
1453         return r;
1454 }
1455
1456 static int setup_ptmx(const char *dest) {
1457         _cleanup_free_ char *p = NULL;
1458
1459         p = strappend(dest, "/dev/ptmx");
1460         if (!p)
1461                 return log_oom();
1462
1463         if (symlink("pts/ptmx", p) < 0)
1464                 return log_error_errno(errno, "Failed to create /dev/ptmx symlink: %m");
1465
1466         return 0;
1467 }
1468
1469 static int setup_dev_console(const char *dest, const char *console) {
1470         _cleanup_umask_ mode_t u;
1471         const char *to;
1472         struct stat st;
1473         int r;
1474
1475         assert(dest);
1476         assert(console);
1477
1478         u = umask(0000);
1479
1480         if (stat("/dev/null", &st) < 0)
1481                 return log_error_errno(errno, "Failed to stat /dev/null: %m");
1482
1483         r = chmod_and_chown(console, 0600, 0, 0);
1484         if (r < 0)
1485                 return log_error_errno(r, "Failed to correct access mode for TTY: %m");
1486
1487         /* We need to bind mount the right tty to /dev/console since
1488          * ptys can only exist on pts file systems. To have something
1489          * to bind mount things on we create a device node first, and
1490          * use /dev/null for that since we the cgroups device policy
1491          * allows us to create that freely, while we cannot create
1492          * /dev/console. (Note that the major minor doesn't actually
1493          * matter here, since we mount it over anyway). */
1494
1495         to = strjoina(dest, "/dev/console");
1496         if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0)
1497                 return log_error_errno(errno, "mknod() for /dev/console failed: %m");
1498
1499         if (mount(console, to, "bind", MS_BIND, NULL) < 0)
1500                 return log_error_errno(errno, "Bind mount for /dev/console failed: %m");
1501
1502         return 0;
1503 }
1504
1505 static int setup_kmsg(const char *dest, int kmsg_socket) {
1506         _cleanup_free_ char *from = NULL, *to = NULL;
1507         _cleanup_umask_ mode_t u;
1508         int r, fd, k;
1509         union {
1510                 struct cmsghdr cmsghdr;
1511                 uint8_t buf[CMSG_SPACE(sizeof(int))];
1512         } control = {};
1513         struct msghdr mh = {
1514                 .msg_control = &control,
1515                 .msg_controllen = sizeof(control),
1516         };
1517         struct cmsghdr *cmsg;
1518
1519         assert(dest);
1520         assert(kmsg_socket >= 0);
1521
1522         u = umask(0000);
1523
1524         /* We create the kmsg FIFO as /dev/kmsg, but immediately
1525          * delete it after bind mounting it to /proc/kmsg. While FIFOs
1526          * on the reading side behave very similar to /proc/kmsg,
1527          * their writing side behaves differently from /dev/kmsg in
1528          * that writing blocks when nothing is reading. In order to
1529          * avoid any problems with containers deadlocking due to this
1530          * we simply make /dev/kmsg unavailable to the container. */
1531         if (asprintf(&from, "%s/dev/kmsg", dest) < 0 ||
1532             asprintf(&to, "%s/proc/kmsg", dest) < 0)
1533                 return log_oom();
1534
1535         if (mkfifo(from, 0600) < 0)
1536                 return log_error_errno(errno, "mkfifo() for /dev/kmsg failed: %m");
1537
1538         r = chmod_and_chown(from, 0600, 0, 0);
1539         if (r < 0)
1540                 return log_error_errno(r, "Failed to correct access mode for /dev/kmsg: %m");
1541
1542         if (mount(from, to, "bind", MS_BIND, NULL) < 0)
1543                 return log_error_errno(errno, "Bind mount for /proc/kmsg failed: %m");
1544
1545         fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
1546         if (fd < 0)
1547                 return log_error_errno(errno, "Failed to open fifo: %m");
1548
1549         cmsg = CMSG_FIRSTHDR(&mh);
1550         cmsg->cmsg_level = SOL_SOCKET;
1551         cmsg->cmsg_type = SCM_RIGHTS;
1552         cmsg->cmsg_len = CMSG_LEN(sizeof(int));
1553         memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
1554
1555         mh.msg_controllen = cmsg->cmsg_len;
1556
1557         /* Store away the fd in the socket, so that it stays open as
1558          * long as we run the child */
1559         k = sendmsg(kmsg_socket, &mh, MSG_NOSIGNAL);
1560         safe_close(fd);
1561
1562         if (k < 0)
1563                 return log_error_errno(errno, "Failed to send FIFO fd: %m");
1564
1565         /* And now make the FIFO unavailable as /dev/kmsg... */
1566         unlink(from);
1567         return 0;
1568 }
1569
1570 static int send_rtnl(int send_fd) {
1571         union {
1572                 struct cmsghdr cmsghdr;
1573                 uint8_t buf[CMSG_SPACE(sizeof(int))];
1574         } control = {};
1575         struct msghdr mh = {
1576                 .msg_control = &control,
1577                 .msg_controllen = sizeof(control),
1578         };
1579         struct cmsghdr *cmsg;
1580         _cleanup_close_ int fd = -1;
1581         ssize_t k;
1582
1583         assert(send_fd >= 0);
1584
1585         if (!arg_expose_ports)
1586                 return 0;
1587
1588         fd = socket(PF_NETLINK, SOCK_RAW|SOCK_CLOEXEC|SOCK_NONBLOCK, NETLINK_ROUTE);
1589         if (fd < 0)
1590                 return log_error_errno(errno, "failed to allocate container netlink: %m");
1591
1592         cmsg = CMSG_FIRSTHDR(&mh);
1593         cmsg->cmsg_level = SOL_SOCKET;
1594         cmsg->cmsg_type = SCM_RIGHTS;
1595         cmsg->cmsg_len = CMSG_LEN(sizeof(int));
1596         memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
1597
1598         mh.msg_controllen = cmsg->cmsg_len;
1599
1600         /* Store away the fd in the socket, so that it stays open as
1601          * long as we run the child */
1602         k = sendmsg(send_fd, &mh, MSG_NOSIGNAL);
1603         if (k < 0)
1604                 return log_error_errno(errno, "Failed to send netlink fd: %m");
1605
1606         return 0;
1607 }
1608
1609 static int flush_ports(union in_addr_union *exposed) {
1610         ExposePort *p;
1611         int r, af = AF_INET;
1612
1613         assert(exposed);
1614
1615         if (!arg_expose_ports)
1616                 return 0;
1617
1618         if (in_addr_is_null(af, exposed))
1619                 return 0;
1620
1621         log_debug("Lost IP address.");
1622
1623         LIST_FOREACH(ports, p, arg_expose_ports) {
1624                 r = fw_add_local_dnat(false,
1625                                       af,
1626                                       p->protocol,
1627                                       NULL,
1628                                       NULL, 0,
1629                                       NULL, 0,
1630                                       p->host_port,
1631                                       exposed,
1632                                       p->container_port,
1633                                       NULL);
1634                 if (r < 0)
1635                         log_warning_errno(r, "Failed to modify firewall: %m");
1636         }
1637
1638         *exposed = IN_ADDR_NULL;
1639         return 0;
1640 }
1641
1642 static int expose_ports(sd_rtnl *rtnl, union in_addr_union *exposed) {
1643         _cleanup_free_ struct local_address *addresses = NULL;
1644         _cleanup_free_ char *pretty = NULL;
1645         union in_addr_union new_exposed;
1646         ExposePort *p;
1647         bool add;
1648         int af = AF_INET, r;
1649
1650         assert(exposed);
1651
1652         /* Invoked each time an address is added or removed inside the
1653          * container */
1654
1655         if (!arg_expose_ports)
1656                 return 0;
1657
1658         r = local_addresses(rtnl, 0, af, &addresses);
1659         if (r < 0)
1660                 return log_error_errno(r, "Failed to enumerate local addresses: %m");
1661
1662         add = r > 0 &&
1663                 addresses[0].family == af &&
1664                 addresses[0].scope < RT_SCOPE_LINK;
1665
1666         if (!add)
1667                 return flush_ports(exposed);
1668
1669         new_exposed = addresses[0].address;
1670         if (in_addr_equal(af, exposed, &new_exposed))
1671                 return 0;
1672
1673         in_addr_to_string(af, &new_exposed, &pretty);
1674         log_debug("New container IP is %s.", strna(pretty));
1675
1676         LIST_FOREACH(ports, p, arg_expose_ports) {
1677
1678                 r = fw_add_local_dnat(true,
1679                                       af,
1680                                       p->protocol,
1681                                       NULL,
1682                                       NULL, 0,
1683                                       NULL, 0,
1684                                       p->host_port,
1685                                       &new_exposed,
1686                                       p->container_port,
1687                                       in_addr_is_null(af, exposed) ? NULL : exposed);
1688                 if (r < 0)
1689                         log_warning_errno(r, "Failed to modify firewall: %m");
1690         }
1691
1692         *exposed = new_exposed;
1693         return 0;
1694 }
1695
1696 static int on_address_change(sd_rtnl *rtnl, sd_rtnl_message *m, void *userdata) {
1697         union in_addr_union *exposed = userdata;
1698
1699         assert(rtnl);
1700         assert(m);
1701         assert(exposed);
1702
1703         expose_ports(rtnl, exposed);
1704         return 0;
1705 }
1706
1707 static int watch_rtnl(sd_event *event, int recv_fd, union in_addr_union *exposed, sd_rtnl **ret) {
1708         union {
1709                 struct cmsghdr cmsghdr;
1710                 uint8_t buf[CMSG_SPACE(sizeof(int))];
1711         } control = {};
1712         struct msghdr mh = {
1713                 .msg_control = &control,
1714                 .msg_controllen = sizeof(control),
1715         };
1716         struct cmsghdr *cmsg;
1717         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1718         int fd, r;
1719         ssize_t k;
1720
1721         assert(event);
1722         assert(recv_fd >= 0);
1723         assert(ret);
1724
1725         if (!arg_expose_ports)
1726                 return 0;
1727
1728         k = recvmsg(recv_fd, &mh, MSG_NOSIGNAL);
1729         if (k < 0)
1730                 return log_error_errno(errno, "Failed to recv netlink fd: %m");
1731
1732         cmsg = CMSG_FIRSTHDR(&mh);
1733         assert(cmsg->cmsg_level == SOL_SOCKET);
1734         assert(cmsg->cmsg_type == SCM_RIGHTS);
1735         assert(cmsg->cmsg_len == CMSG_LEN(sizeof(int)));
1736         memcpy(&fd, CMSG_DATA(cmsg), sizeof(int));
1737
1738         r = sd_rtnl_open_fd(&rtnl, fd, 1, RTNLGRP_IPV4_IFADDR);
1739         if (r < 0) {
1740                 safe_close(fd);
1741                 return log_error_errno(r, "Failed to create rtnl object: %m");
1742         }
1743
1744         r = sd_rtnl_add_match(rtnl, RTM_NEWADDR, on_address_change, exposed);
1745         if (r < 0)
1746                 return log_error_errno(r, "Failed to subscribe to RTM_NEWADDR messages: %m");
1747
1748         r = sd_rtnl_add_match(rtnl, RTM_DELADDR, on_address_change, exposed);
1749         if (r < 0)
1750                 return log_error_errno(r, "Failed to subscribe to RTM_DELADDR messages: %m");
1751
1752         r = sd_rtnl_attach_event(rtnl, event, 0);
1753         if (r < 0)
1754                 return log_error_errno(r, "Failed to add to even loop: %m");
1755
1756         *ret = rtnl;
1757         rtnl = NULL;
1758
1759         return 0;
1760 }
1761
1762 static int setup_hostname(void) {
1763
1764         if (arg_share_system)
1765                 return 0;
1766
1767         if (sethostname_idempotent(arg_machine) < 0)
1768                 return -errno;
1769
1770         return 0;
1771 }
1772
1773 static int setup_journal(const char *directory) {
1774         sd_id128_t machine_id, this_id;
1775         _cleanup_free_ char *p = NULL, *b = NULL, *q = NULL, *d = NULL;
1776         char *id;
1777         int r;
1778
1779         /* Don't link journals in ephemeral mode */
1780         if (arg_ephemeral)
1781                 return 0;
1782
1783         p = strappend(directory, "/etc/machine-id");
1784         if (!p)
1785                 return log_oom();
1786
1787         r = read_one_line_file(p, &b);
1788         if (r == -ENOENT && arg_link_journal == LINK_AUTO)
1789                 return 0;
1790         else if (r < 0)
1791                 return log_error_errno(r, "Failed to read machine ID from %s: %m", p);
1792
1793         id = strstrip(b);
1794         if (isempty(id) && arg_link_journal == LINK_AUTO)
1795                 return 0;
1796
1797         /* Verify validity */
1798         r = sd_id128_from_string(id, &machine_id);
1799         if (r < 0)
1800                 return log_error_errno(r, "Failed to parse machine ID from %s: %m", p);
1801
1802         r = sd_id128_get_machine(&this_id);
1803         if (r < 0)
1804                 return log_error_errno(r, "Failed to retrieve machine ID: %m");
1805
1806         if (sd_id128_equal(machine_id, this_id)) {
1807                 log_full(arg_link_journal == LINK_AUTO ? LOG_WARNING : LOG_ERR,
1808                          "Host and machine ids are equal (%s): refusing to link journals", id);
1809                 if (arg_link_journal == LINK_AUTO)
1810                         return 0;
1811                 return -EEXIST;
1812         }
1813
1814         if (arg_link_journal == LINK_NO)
1815                 return 0;
1816
1817         free(p);
1818         p = strappend("/var/log/journal/", id);
1819         q = strjoin(directory, "/var/log/journal/", id, NULL);
1820         if (!p || !q)
1821                 return log_oom();
1822
1823         if (path_is_mount_point(p, false) > 0) {
1824                 if (arg_link_journal != LINK_AUTO) {
1825                         log_error("%s: already a mount point, refusing to use for journal", p);
1826                         return -EEXIST;
1827                 }
1828
1829                 return 0;
1830         }
1831
1832         if (path_is_mount_point(q, false) > 0) {
1833                 if (arg_link_journal != LINK_AUTO) {
1834                         log_error("%s: already a mount point, refusing to use for journal", q);
1835                         return -EEXIST;
1836                 }
1837
1838                 return 0;
1839         }
1840
1841         r = readlink_and_make_absolute(p, &d);
1842         if (r >= 0) {
1843                 if ((arg_link_journal == LINK_GUEST ||
1844                      arg_link_journal == LINK_AUTO) &&
1845                     path_equal(d, q)) {
1846
1847                         r = mkdir_p(q, 0755);
1848                         if (r < 0)
1849                                 log_warning_errno(errno, "Failed to create directory %s: %m", q);
1850                         return 0;
1851                 }
1852
1853                 if (unlink(p) < 0)
1854                         return log_error_errno(errno, "Failed to remove symlink %s: %m", p);
1855         } else if (r == -EINVAL) {
1856
1857                 if (arg_link_journal == LINK_GUEST &&
1858                     rmdir(p) < 0) {
1859
1860                         if (errno == ENOTDIR) {
1861                                 log_error("%s already exists and is neither a symlink nor a directory", p);
1862                                 return r;
1863                         } else {
1864                                 log_error_errno(errno, "Failed to remove %s: %m", p);
1865                                 return -errno;
1866                         }
1867                 }
1868         } else if (r != -ENOENT) {
1869                 log_error_errno(errno, "readlink(%s) failed: %m", p);
1870                 return r;
1871         }
1872
1873         if (arg_link_journal == LINK_GUEST) {
1874
1875                 if (symlink(q, p) < 0) {
1876                         if (arg_link_journal_try) {
1877                                 log_debug_errno(errno, "Failed to symlink %s to %s, skipping journal setup: %m", q, p);
1878                                 return 0;
1879                         } else {
1880                                 log_error_errno(errno, "Failed to symlink %s to %s: %m", q, p);
1881                                 return -errno;
1882                         }
1883                 }
1884
1885                 r = mkdir_p(q, 0755);
1886                 if (r < 0)
1887                         log_warning_errno(errno, "Failed to create directory %s: %m", q);
1888                 return 0;
1889         }
1890
1891         if (arg_link_journal == LINK_HOST) {
1892                 /* don't create parents here -- if the host doesn't have
1893                  * permanent journal set up, don't force it here */
1894                 r = mkdir(p, 0755);
1895                 if (r < 0) {
1896                         if (arg_link_journal_try) {
1897                                 log_debug_errno(errno, "Failed to create %s, skipping journal setup: %m", p);
1898                                 return 0;
1899                         } else {
1900                                 log_error_errno(errno, "Failed to create %s: %m", p);
1901                                 return r;
1902                         }
1903                 }
1904
1905         } else if (access(p, F_OK) < 0)
1906                 return 0;
1907
1908         if (dir_is_empty(q) == 0)
1909                 log_warning("%s is not empty, proceeding anyway.", q);
1910
1911         r = mkdir_p(q, 0755);
1912         if (r < 0) {
1913                 log_error_errno(errno, "Failed to create %s: %m", q);
1914                 return r;
1915         }
1916
1917         if (mount(p, q, "bind", MS_BIND, NULL) < 0)
1918                 return log_error_errno(errno, "Failed to bind mount journal from host into guest: %m");
1919
1920         return 0;
1921 }
1922
1923 static int drop_capabilities(void) {
1924         return capability_bounding_set_drop(~arg_retain, false);
1925 }
1926
1927 static int register_machine(pid_t pid, int local_ifindex) {
1928         _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1929         _cleanup_bus_close_unref_ sd_bus *bus = NULL;
1930         int r;
1931
1932         if (!arg_register)
1933                 return 0;
1934
1935         r = sd_bus_default_system(&bus);
1936         if (r < 0)
1937                 return log_error_errno(r, "Failed to open system bus: %m");
1938
1939         if (arg_keep_unit) {
1940                 r = sd_bus_call_method(
1941                                 bus,
1942                                 "org.freedesktop.machine1",
1943                                 "/org/freedesktop/machine1",
1944                                 "org.freedesktop.machine1.Manager",
1945                                 "RegisterMachineWithNetwork",
1946                                 &error,
1947                                 NULL,
1948                                 "sayssusai",
1949                                 arg_machine,
1950                                 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1951                                 "nspawn",
1952                                 "container",
1953                                 (uint32_t) pid,
1954                                 strempty(arg_directory),
1955                                 local_ifindex > 0 ? 1 : 0, local_ifindex);
1956         } else {
1957                 _cleanup_bus_message_unref_ sd_bus_message *m = NULL;
1958                 char **i;
1959
1960                 r = sd_bus_message_new_method_call(
1961                                 bus,
1962                                 &m,
1963                                 "org.freedesktop.machine1",
1964                                 "/org/freedesktop/machine1",
1965                                 "org.freedesktop.machine1.Manager",
1966                                 "CreateMachineWithNetwork");
1967                 if (r < 0)
1968                         return bus_log_create_error(r);
1969
1970                 r = sd_bus_message_append(
1971                                 m,
1972                                 "sayssusai",
1973                                 arg_machine,
1974                                 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1975                                 "nspawn",
1976                                 "container",
1977                                 (uint32_t) pid,
1978                                 strempty(arg_directory),
1979                                 local_ifindex > 0 ? 1 : 0, local_ifindex);
1980                 if (r < 0)
1981                         return bus_log_create_error(r);
1982
1983                 r = sd_bus_message_open_container(m, 'a', "(sv)");
1984                 if (r < 0)
1985                         return bus_log_create_error(r);
1986
1987                 if (!isempty(arg_slice)) {
1988                         r = sd_bus_message_append(m, "(sv)", "Slice", "s", arg_slice);
1989                         if (r < 0)
1990                                 return bus_log_create_error(r);
1991                 }
1992
1993                 r = sd_bus_message_append(m, "(sv)", "DevicePolicy", "s", "strict");
1994                 if (r < 0)
1995                         return bus_log_create_error(r);
1996
1997                 r = sd_bus_message_append(m, "(sv)", "DeviceAllow", "a(ss)", 9,
1998                                           /* Allow the container to
1999                                            * access and create the API
2000                                            * device nodes, so that
2001                                            * PrivateDevices= in the
2002                                            * container can work
2003                                            * fine */
2004                                           "/dev/null", "rwm",
2005                                           "/dev/zero", "rwm",
2006                                           "/dev/full", "rwm",
2007                                           "/dev/random", "rwm",
2008                                           "/dev/urandom", "rwm",
2009                                           "/dev/tty", "rwm",
2010                                           "/dev/net/tun", "rwm",
2011                                           /* Allow the container
2012                                            * access to ptys. However,
2013                                            * do not permit the
2014                                            * container to ever create
2015                                            * these device nodes. */
2016                                           "/dev/pts/ptmx", "rw",
2017                                           "char-pts", "rw");
2018                 if (r < 0)
2019                         return log_error_errno(r, "Failed to add device whitelist: %m");
2020
2021                 STRV_FOREACH(i, arg_property) {
2022                         r = sd_bus_message_open_container(m, 'r', "sv");
2023                         if (r < 0)
2024                                 return bus_log_create_error(r);
2025
2026                         r = bus_append_unit_property_assignment(m, *i);
2027                         if (r < 0)
2028                                 return r;
2029
2030                         r = sd_bus_message_close_container(m);
2031                         if (r < 0)
2032                                 return bus_log_create_error(r);
2033                 }
2034
2035                 r = sd_bus_message_close_container(m);
2036                 if (r < 0)
2037                         return bus_log_create_error(r);
2038
2039                 r = sd_bus_call(bus, m, 0, &error, NULL);
2040         }
2041
2042         if (r < 0) {
2043                 log_error("Failed to register machine: %s", bus_error_message(&error, r));
2044                 return r;
2045         }
2046
2047         return 0;
2048 }
2049
2050 static int terminate_machine(pid_t pid) {
2051         _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
2052         _cleanup_bus_message_unref_ sd_bus_message *reply = NULL;
2053         _cleanup_bus_close_unref_ sd_bus *bus = NULL;
2054         const char *path;
2055         int r;
2056
2057         if (!arg_register)
2058                 return 0;
2059
2060         r = sd_bus_default_system(&bus);
2061         if (r < 0)
2062                 return log_error_errno(r, "Failed to open system bus: %m");
2063
2064         r = sd_bus_call_method(
2065                         bus,
2066                         "org.freedesktop.machine1",
2067                         "/org/freedesktop/machine1",
2068                         "org.freedesktop.machine1.Manager",
2069                         "GetMachineByPID",
2070                         &error,
2071                         &reply,
2072                         "u",
2073                         (uint32_t) pid);
2074         if (r < 0) {
2075                 /* Note that the machine might already have been
2076                  * cleaned up automatically, hence don't consider it a
2077                  * failure if we cannot get the machine object. */
2078                 log_debug("Failed to get machine: %s", bus_error_message(&error, r));
2079                 return 0;
2080         }
2081
2082         r = sd_bus_message_read(reply, "o", &path);
2083         if (r < 0)
2084                 return bus_log_parse_error(r);
2085
2086         r = sd_bus_call_method(
2087                         bus,
2088                         "org.freedesktop.machine1",
2089                         path,
2090                         "org.freedesktop.machine1.Machine",
2091                         "Terminate",
2092                         &error,
2093                         NULL,
2094                         NULL);
2095         if (r < 0) {
2096                 log_debug("Failed to terminate machine: %s", bus_error_message(&error, r));
2097                 return 0;
2098         }
2099
2100         return 0;
2101 }
2102
2103 static int reset_audit_loginuid(void) {
2104         _cleanup_free_ char *p = NULL;
2105         int r;
2106
2107         if (arg_share_system)
2108                 return 0;
2109
2110         r = read_one_line_file("/proc/self/loginuid", &p);
2111         if (r == -ENOENT)
2112                 return 0;
2113         if (r < 0)
2114                 return log_error_errno(r, "Failed to read /proc/self/loginuid: %m");
2115
2116         /* Already reset? */
2117         if (streq(p, "4294967295"))
2118                 return 0;
2119
2120         r = write_string_file("/proc/self/loginuid", "4294967295");
2121         if (r < 0) {
2122                 log_error("Failed to reset audit login UID. This probably means that your kernel is too\n"
2123                           "old and you have audit enabled. Note that the auditing subsystem is known to\n"
2124                           "be incompatible with containers on old kernels. Please make sure to upgrade\n"
2125                           "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
2126                           "using systemd-nspawn. Sleeping for 5s... (%s)\n", strerror(-r));
2127
2128                 sleep(5);
2129         }
2130
2131         return 0;
2132 }
2133
2134 #define HOST_HASH_KEY SD_ID128_MAKE(1a,37,6f,c7,46,ec,45,0b,ad,a3,d5,31,06,60,5d,b1)
2135 #define CONTAINER_HASH_KEY SD_ID128_MAKE(c3,c4,f9,19,b5,57,b2,1c,e6,cf,14,27,03,9c,ee,a2)
2136 #define MACVLAN_HASH_KEY SD_ID128_MAKE(00,13,6d,bc,66,83,44,81,bb,0c,f9,51,1f,24,a6,6f)
2137
2138 static int generate_mac(struct ether_addr *mac, sd_id128_t hash_key, uint64_t idx) {
2139         uint8_t result[8];
2140         size_t l, sz;
2141         uint8_t *v, *i;
2142         int r;
2143
2144         l = strlen(arg_machine);
2145         sz = sizeof(sd_id128_t) + l;
2146         if (idx > 0)
2147                 sz += sizeof(idx);
2148
2149         v = alloca(sz);
2150
2151         /* fetch some persistent data unique to the host */
2152         r = sd_id128_get_machine((sd_id128_t*) v);
2153         if (r < 0)
2154                 return r;
2155
2156         /* combine with some data unique (on this host) to this
2157          * container instance */
2158         i = mempcpy(v + sizeof(sd_id128_t), arg_machine, l);
2159         if (idx > 0) {
2160                 idx = htole64(idx);
2161                 memcpy(i, &idx, sizeof(idx));
2162         }
2163
2164         /* Let's hash the host machine ID plus the container name. We
2165          * use a fixed, but originally randomly created hash key here. */
2166         siphash24(result, v, sz, hash_key.bytes);
2167
2168         assert_cc(ETH_ALEN <= sizeof(result));
2169         memcpy(mac->ether_addr_octet, result, ETH_ALEN);
2170
2171         /* see eth_random_addr in the kernel */
2172         mac->ether_addr_octet[0] &= 0xfe;        /* clear multicast bit */
2173         mac->ether_addr_octet[0] |= 0x02;        /* set local assignment bit (IEEE802) */
2174
2175         return 0;
2176 }
2177
2178 static int setup_veth(pid_t pid, char iface_name[IFNAMSIZ], int *ifi) {
2179         _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
2180         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
2181         struct ether_addr mac_host, mac_container;
2182         int r, i;
2183
2184         if (!arg_private_network)
2185                 return 0;
2186
2187         if (!arg_network_veth)
2188                 return 0;
2189
2190         /* Use two different interface name prefixes depending whether
2191          * we are in bridge mode or not. */
2192         snprintf(iface_name, IFNAMSIZ - 1, "%s-%s",
2193                  arg_network_bridge ? "vb" : "ve", arg_machine);
2194
2195         r = generate_mac(&mac_container, CONTAINER_HASH_KEY, 0);
2196         if (r < 0)
2197                 return log_error_errno(r, "Failed to generate predictable MAC address for container side: %m");
2198
2199         r = generate_mac(&mac_host, HOST_HASH_KEY, 0);
2200         if (r < 0)
2201                 return log_error_errno(r, "Failed to generate predictable MAC address for host side: %m");
2202
2203         r = sd_rtnl_open(&rtnl, 0);
2204         if (r < 0)
2205                 return log_error_errno(r, "Failed to connect to netlink: %m");
2206
2207         r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
2208         if (r < 0)
2209                 return log_error_errno(r, "Failed to allocate netlink message: %m");
2210
2211         r = sd_rtnl_message_append_string(m, IFLA_IFNAME, iface_name);
2212         if (r < 0)
2213                 return log_error_errno(r, "Failed to add netlink interface name: %m");
2214
2215         r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac_host);
2216         if (r < 0)
2217                 return log_error_errno(r, "Failed to add netlink MAC address: %m");
2218
2219         r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
2220         if (r < 0)
2221                 return log_error_errno(r, "Failed to open netlink container: %m");
2222
2223         r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "veth");
2224         if (r < 0)
2225                 return log_error_errno(r, "Failed to open netlink container: %m");
2226
2227         r = sd_rtnl_message_open_container(m, VETH_INFO_PEER);
2228         if (r < 0)
2229                 return log_error_errno(r, "Failed to open netlink container: %m");
2230
2231         r = sd_rtnl_message_append_string(m, IFLA_IFNAME, "host0");
2232         if (r < 0)
2233                 return log_error_errno(r, "Failed to add netlink interface name: %m");
2234
2235         r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac_container);
2236         if (r < 0)
2237                 return log_error_errno(r, "Failed to add netlink MAC address: %m");
2238
2239         r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
2240         if (r < 0)
2241                 return log_error_errno(r, "Failed to add netlink namespace field: %m");
2242
2243         r = sd_rtnl_message_close_container(m);
2244         if (r < 0)
2245                 return log_error_errno(r, "Failed to close netlink container: %m");
2246
2247         r = sd_rtnl_message_close_container(m);
2248         if (r < 0)
2249                 return log_error_errno(r, "Failed to close netlink container: %m");
2250
2251         r = sd_rtnl_message_close_container(m);
2252         if (r < 0)
2253                 return log_error_errno(r, "Failed to close netlink container: %m");
2254
2255         r = sd_rtnl_call(rtnl, m, 0, NULL);
2256         if (r < 0)
2257                 return log_error_errno(r, "Failed to add new veth interfaces: %m");
2258
2259         i = (int) if_nametoindex(iface_name);
2260         if (i <= 0)
2261                 return log_error_errno(errno, "Failed to resolve interface %s: %m", iface_name);
2262
2263         *ifi = i;
2264
2265         return 0;
2266 }
2267
2268 static int setup_bridge(const char veth_name[], int *ifi) {
2269         _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
2270         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
2271         int r, bridge;
2272
2273         if (!arg_private_network)
2274                 return 0;
2275
2276         if (!arg_network_veth)
2277                 return 0;
2278
2279         if (!arg_network_bridge)
2280                 return 0;
2281
2282         bridge = (int) if_nametoindex(arg_network_bridge);
2283         if (bridge <= 0)
2284                 return log_error_errno(errno, "Failed to resolve interface %s: %m", arg_network_bridge);
2285
2286         *ifi = bridge;
2287
2288         r = sd_rtnl_open(&rtnl, 0);
2289         if (r < 0)
2290                 return log_error_errno(r, "Failed to connect to netlink: %m");
2291
2292         r = sd_rtnl_message_new_link(rtnl, &m, RTM_SETLINK, 0);
2293         if (r < 0)
2294                 return log_error_errno(r, "Failed to allocate netlink message: %m");
2295
2296         r = sd_rtnl_message_link_set_flags(m, IFF_UP, IFF_UP);
2297         if (r < 0)
2298                 return log_error_errno(r, "Failed to set IFF_UP flag: %m");
2299
2300         r = sd_rtnl_message_append_string(m, IFLA_IFNAME, veth_name);
2301         if (r < 0)
2302                 return log_error_errno(r, "Failed to add netlink interface name field: %m");
2303
2304         r = sd_rtnl_message_append_u32(m, IFLA_MASTER, bridge);
2305         if (r < 0)
2306                 return log_error_errno(r, "Failed to add netlink master field: %m");
2307
2308         r = sd_rtnl_call(rtnl, m, 0, NULL);
2309         if (r < 0)
2310                 return log_error_errno(r, "Failed to add veth interface to bridge: %m");
2311
2312         return 0;
2313 }
2314
2315 static int parse_interface(struct udev *udev, const char *name) {
2316         _cleanup_udev_device_unref_ struct udev_device *d = NULL;
2317         char ifi_str[2 + DECIMAL_STR_MAX(int)];
2318         int ifi;
2319
2320         ifi = (int) if_nametoindex(name);
2321         if (ifi <= 0)
2322                 return log_error_errno(errno, "Failed to resolve interface %s: %m", name);
2323
2324         sprintf(ifi_str, "n%i", ifi);
2325         d = udev_device_new_from_device_id(udev, ifi_str);
2326         if (!d)
2327                 return log_error_errno(errno, "Failed to get udev device for interface %s: %m", name);
2328
2329         if (udev_device_get_is_initialized(d) <= 0) {
2330                 log_error("Network interface %s is not initialized yet.", name);
2331                 return -EBUSY;
2332         }
2333
2334         return ifi;
2335 }
2336
2337 static int move_network_interfaces(pid_t pid) {
2338         _cleanup_udev_unref_ struct udev *udev = NULL;
2339         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
2340         char **i;
2341         int r;
2342
2343         if (!arg_private_network)
2344                 return 0;
2345
2346         if (strv_isempty(arg_network_interfaces))
2347                 return 0;
2348
2349         r = sd_rtnl_open(&rtnl, 0);
2350         if (r < 0)
2351                 return log_error_errno(r, "Failed to connect to netlink: %m");
2352
2353         udev = udev_new();
2354         if (!udev) {
2355                 log_error("Failed to connect to udev.");
2356                 return -ENOMEM;
2357         }
2358
2359         STRV_FOREACH(i, arg_network_interfaces) {
2360                 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
2361                 int ifi;
2362
2363                 ifi = parse_interface(udev, *i);
2364                 if (ifi < 0)
2365                         return ifi;
2366
2367                 r = sd_rtnl_message_new_link(rtnl, &m, RTM_SETLINK, ifi);
2368                 if (r < 0)
2369                         return log_error_errno(r, "Failed to allocate netlink message: %m");
2370
2371                 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
2372                 if (r < 0)
2373                         return log_error_errno(r, "Failed to append namespace PID to netlink message: %m");
2374
2375                 r = sd_rtnl_call(rtnl, m, 0, NULL);
2376                 if (r < 0)
2377                         return log_error_errno(r, "Failed to move interface %s to namespace: %m", *i);
2378         }
2379
2380         return 0;
2381 }
2382
2383 static int setup_macvlan(pid_t pid) {
2384         _cleanup_udev_unref_ struct udev *udev = NULL;
2385         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
2386         unsigned idx = 0;
2387         char **i;
2388         int r;
2389
2390         if (!arg_private_network)
2391                 return 0;
2392
2393         if (strv_isempty(arg_network_macvlan))
2394                 return 0;
2395
2396         r = sd_rtnl_open(&rtnl, 0);
2397         if (r < 0)
2398                 return log_error_errno(r, "Failed to connect to netlink: %m");
2399
2400         udev = udev_new();
2401         if (!udev) {
2402                 log_error("Failed to connect to udev.");
2403                 return -ENOMEM;
2404         }
2405
2406         STRV_FOREACH(i, arg_network_macvlan) {
2407                 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
2408                 _cleanup_free_ char *n = NULL;
2409                 struct ether_addr mac;
2410                 int ifi;
2411
2412                 ifi = parse_interface(udev, *i);
2413                 if (ifi < 0)
2414                         return ifi;
2415
2416                 r = generate_mac(&mac, MACVLAN_HASH_KEY, idx++);
2417                 if (r < 0)
2418                         return log_error_errno(r, "Failed to create MACVLAN MAC address: %m");
2419
2420                 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
2421                 if (r < 0)
2422                         return log_error_errno(r, "Failed to allocate netlink message: %m");
2423
2424                 r = sd_rtnl_message_append_u32(m, IFLA_LINK, ifi);
2425                 if (r < 0)
2426                         return log_error_errno(r, "Failed to add netlink interface index: %m");
2427
2428                 n = strappend("mv-", *i);
2429                 if (!n)
2430                         return log_oom();
2431
2432                 strshorten(n, IFNAMSIZ-1);
2433
2434                 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, n);
2435                 if (r < 0)
2436                         return log_error_errno(r, "Failed to add netlink interface name: %m");
2437
2438                 r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac);
2439                 if (r < 0)
2440                         return log_error_errno(r, "Failed to add netlink MAC address: %m");
2441
2442                 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
2443                 if (r < 0)
2444                         return log_error_errno(r, "Failed to add netlink namespace field: %m");
2445
2446                 r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
2447                 if (r < 0)
2448                         return log_error_errno(r, "Failed to open netlink container: %m");
2449
2450                 r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "macvlan");
2451                 if (r < 0)
2452                         return log_error_errno(r, "Failed to open netlink container: %m");
2453
2454                 r = sd_rtnl_message_append_u32(m, IFLA_MACVLAN_MODE, MACVLAN_MODE_BRIDGE);
2455                 if (r < 0)
2456                         return log_error_errno(r, "Failed to append macvlan mode: %m");
2457
2458                 r = sd_rtnl_message_close_container(m);
2459                 if (r < 0)
2460                         return log_error_errno(r, "Failed to close netlink container: %m");
2461
2462                 r = sd_rtnl_message_close_container(m);
2463                 if (r < 0)
2464                         return log_error_errno(r, "Failed to close netlink container: %m");
2465
2466                 r = sd_rtnl_call(rtnl, m, 0, NULL);
2467                 if (r < 0)
2468                         return log_error_errno(r, "Failed to add new macvlan interfaces: %m");
2469         }
2470
2471         return 0;
2472 }
2473
2474 static int setup_ipvlan(pid_t pid) {
2475         _cleanup_udev_unref_ struct udev *udev = NULL;
2476         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
2477         char **i;
2478         int r;
2479
2480         if (!arg_private_network)
2481                 return 0;
2482
2483         if (strv_isempty(arg_network_ipvlan))
2484                 return 0;
2485
2486         r = sd_rtnl_open(&rtnl, 0);
2487         if (r < 0)
2488                 return log_error_errno(r, "Failed to connect to netlink: %m");
2489
2490         udev = udev_new();
2491         if (!udev) {
2492                 log_error("Failed to connect to udev.");
2493                 return -ENOMEM;
2494         }
2495
2496         STRV_FOREACH(i, arg_network_ipvlan) {
2497                 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
2498                 _cleanup_free_ char *n = NULL;
2499                 int ifi;
2500
2501                 ifi = parse_interface(udev, *i);
2502                 if (ifi < 0)
2503                         return ifi;
2504
2505                 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
2506                 if (r < 0)
2507                         return log_error_errno(r, "Failed to allocate netlink message: %m");
2508
2509                 r = sd_rtnl_message_append_u32(m, IFLA_LINK, ifi);
2510                 if (r < 0)
2511                         return log_error_errno(r, "Failed to add netlink interface index: %m");
2512
2513                 n = strappend("iv-", *i);
2514                 if (!n)
2515                         return log_oom();
2516
2517                 strshorten(n, IFNAMSIZ-1);
2518
2519                 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, n);
2520                 if (r < 0)
2521                         return log_error_errno(r, "Failed to add netlink interface name: %m");
2522
2523                 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
2524                 if (r < 0)
2525                         return log_error_errno(r, "Failed to add netlink namespace field: %m");
2526
2527                 r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
2528                 if (r < 0)
2529                         return log_error_errno(r, "Failed to open netlink container: %m");
2530
2531                 r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "ipvlan");
2532                 if (r < 0)
2533                         return log_error_errno(r, "Failed to open netlink container: %m");
2534
2535                 r = sd_rtnl_message_append_u16(m, IFLA_IPVLAN_MODE, IPVLAN_MODE_L2);
2536                 if (r < 0)
2537                         return log_error_errno(r, "Failed to add ipvlan mode: %m");
2538
2539                 r = sd_rtnl_message_close_container(m);
2540                 if (r < 0)
2541                         return log_error_errno(r, "Failed to close netlink container: %m");
2542
2543                 r = sd_rtnl_message_close_container(m);
2544                 if (r < 0)
2545                         return log_error_errno(r, "Failed to close netlink container: %m");
2546
2547                 r = sd_rtnl_call(rtnl, m, 0, NULL);
2548                 if (r < 0)
2549                         return log_error_errno(r, "Failed to add new ipvlan interfaces: %m");
2550         }
2551
2552         return 0;
2553 }
2554
/* Builds and loads a default-allow seccomp filter that makes a fixed set
 * of system calls fail with EPERM, additionally blocks the kernel module
 * system calls unless CAP_SYS_MODULE is in arg_retain, and makes
 * socket(AF_NETLINK, *, NETLINK_AUDIT) fail with EAFNOSUPPORT.
 *
 * Without HAVE_SECCOMP this is a no-op returning 0. Otherwise returns the
 * result of the last libseccomp call (negative on failure, logged). */
static int setup_seccomp(void) {

#ifdef HAVE_SECCOMP
        static const int always_denied[] = {
                SCMP_SYS(kexec_load),
                SCMP_SYS(open_by_handle_at),
                SCMP_SYS(iopl),
                SCMP_SYS(ioperm),
                SCMP_SYS(swapon),
                SCMP_SYS(swapoff),
        };

        static const int module_syscalls[] = {
                SCMP_SYS(init_module),
                SCMP_SYS(finit_module),
                SCMP_SYS(delete_module),
        };

        scmp_filter_ctx ctx;
        unsigned k;
        int r;

        ctx = seccomp_init(SCMP_ACT_ALLOW);
        if (!ctx)
                return log_oom();

        r = seccomp_add_secondary_archs(ctx);
        if (r < 0) {
                log_error_errno(r, "Failed to add secondary archs to seccomp filter: %m");
                goto out;
        }

        for (k = 0; k < ELEMENTSOF(always_denied); k++) {
                r = seccomp_rule_add(ctx, SCMP_ACT_ERRNO(EPERM), always_denied[k], 0);

                /* -EFAULT is treated as "unknown syscall" and skipped */
                if (r < 0 && r != -EFAULT) {
                        log_error_errno(r, "Failed to block syscall: %m");
                        goto out;
                }
        }

        /* Only when CAP_SYS_MODULE is not being retained do we also
         * deny the module loading/unloading calls. */
        if ((arg_retain & (1ULL << CAP_SYS_MODULE)) == 0) {
                for (k = 0; k < ELEMENTSOF(module_syscalls); k++) {
                        r = seccomp_rule_add(ctx, SCMP_ACT_ERRNO(EPERM), module_syscalls[k], 0);

                        /* -EFAULT is treated as "unknown syscall" and skipped */
                        if (r < 0 && r != -EFAULT) {
                                log_error_errno(r, "Failed to block syscall: %m");
                                goto out;
                        }
                }
        }

        /* Audit does not work properly inside containers and much of
         * the audit userspace hookup fails there. We do not care and
         * simply prevent audit sockets from being created: with this
         * rule socket(AF_NETLINK, *, NETLINK_AUDIT) fails with
         * EAFNOSUPPORT, which audit userspace takes as the sign that
         * audit is disabled in the kernel. */
        r = seccomp_rule_add(
                        ctx,
                        SCMP_ACT_ERRNO(EAFNOSUPPORT),
                        SCMP_SYS(socket),
                        2,
                        SCMP_A0(SCMP_CMP_EQ, AF_NETLINK),
                        SCMP_A2(SCMP_CMP_EQ, NETLINK_AUDIT));
        if (r < 0) {
                log_error_errno(r, "Failed to add audit seccomp rule: %m");
                goto out;
        }

        /* Ask libseccomp not to set NO_NEW_PRIVS when loading */
        r = seccomp_attr_set(ctx, SCMP_FLTATR_CTL_NNP, 0);
        if (r < 0) {
                log_error_errno(r, "Failed to unset NO_NEW_PRIVS: %m");
                goto out;
        }

        r = seccomp_load(ctx);
        if (r < 0)
                log_error_errno(r, "Failed to install seccomp audit filter: %m");

out:
        /* The context is released on both the success and failure paths */
        seccomp_release(ctx);
        return r;
#else
        return 0;
#endif

}
2651
2652 static int setup_propagate(const char *root) {
2653         const char *p, *q;
2654
2655         (void) mkdir_p("/run/systemd/nspawn/", 0755);
2656         (void) mkdir_p("/run/systemd/nspawn/propagate", 0600);
2657         p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
2658         (void) mkdir_p(p, 0600);
2659
2660         q = strjoina(root, "/run/systemd/nspawn/incoming");
2661         mkdir_parents(q, 0755);
2662         mkdir_p(q, 0600);
2663
2664         if (mount(p, q, NULL, MS_BIND, NULL) < 0)
2665                 return log_error_errno(errno, "Failed to install propagation bind mount.");
2666
2667         if (mount(NULL, q, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY, NULL) < 0)
2668                 return log_error_errno(errno, "Failed to make propagation mount read-only");
2669
2670         return 0;
2671 }
2672
2673 static int setup_image(char **device_path, int *loop_nr) {
2674         struct loop_info64 info = {
2675                 .lo_flags = LO_FLAGS_AUTOCLEAR|LO_FLAGS_PARTSCAN
2676         };
2677         _cleanup_close_ int fd = -1, control = -1, loop = -1;
2678         _cleanup_free_ char* loopdev = NULL;
2679         struct stat st;
2680         int r, nr;
2681
2682         assert(device_path);
2683         assert(loop_nr);
2684         assert(arg_image);
2685
2686         fd = open(arg_image, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
2687         if (fd < 0)
2688                 return log_error_errno(errno, "Failed to open %s: %m", arg_image);
2689
2690         if (fstat(fd, &st) < 0)
2691                 return log_error_errno(errno, "Failed to stat %s: %m", arg_image);
2692
2693         if (S_ISBLK(st.st_mode)) {
2694                 char *p;
2695
2696                 p = strdup(arg_image);
2697                 if (!p)
2698                         return log_oom();
2699
2700                 *device_path = p;
2701
2702                 *loop_nr = -1;
2703
2704                 r = fd;
2705                 fd = -1;
2706
2707                 return r;
2708         }
2709
2710         if (!S_ISREG(st.st_mode)) {
2711                 log_error_errno(errno, "%s is not a regular file or block device: %m", arg_image);
2712                 return -EINVAL;
2713         }
2714
2715         control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
2716         if (control < 0)
2717                 return log_error_errno(errno, "Failed to open /dev/loop-control: %m");
2718
2719         nr = ioctl(control, LOOP_CTL_GET_FREE);
2720         if (nr < 0)
2721                 return log_error_errno(errno, "Failed to allocate loop device: %m");
2722
2723         if (asprintf(&loopdev, "/dev/loop%i", nr) < 0)
2724                 return log_oom();
2725
2726         loop = open(loopdev, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
2727         if (loop < 0)
2728                 return log_error_errno(errno, "Failed to open loop device %s: %m", loopdev);
2729
2730         if (ioctl(loop, LOOP_SET_FD, fd) < 0)
2731                 return log_error_errno(errno, "Failed to set loopback file descriptor on %s: %m", loopdev);
2732
2733         if (arg_read_only)
2734                 info.lo_flags |= LO_FLAGS_READ_ONLY;
2735
2736         if (ioctl(loop, LOOP_SET_STATUS64, &info) < 0)
2737                 return log_error_errno(errno, "Failed to set loopback settings on %s: %m", loopdev);
2738
2739         *device_path = loopdev;
2740         loopdev = NULL;
2741
2742         *loop_nr = nr;
2743
2744         r = loop;
2745         loop = -1;
2746
2747         return r;
2748 }
2749
/* Explanatory text appended (via string literal concatenation) to the
 * error messages logged when no usable partition table is found on the
 * disk image. Must remain a sequence of adjacent string literals. */
#define PARTITION_TABLE_BLURB \
        "Note that the disk image needs to either contain only a single MBR partition of\n" \
        "type 0x83 that is marked bootable, or a single GPT partition of type " \
        "0FC63DAF-8483-4772-8E79-3D69D8477DE4 or follow\n" \
        "    http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/\n" \
        "to be bootable with systemd-nspawn."
2756
2757 static int dissect_image(
2758                 int fd,
2759                 char **root_device, bool *root_device_rw,
2760                 char **home_device, bool *home_device_rw,
2761                 char **srv_device, bool *srv_device_rw,
2762                 bool *secondary) {
2763
2764 #ifdef HAVE_BLKID
2765         int home_nr = -1, srv_nr = -1;
2766 #ifdef GPT_ROOT_NATIVE
2767         int root_nr = -1;
2768 #endif
2769 #ifdef GPT_ROOT_SECONDARY
2770         int secondary_root_nr = -1;
2771 #endif
2772         _cleanup_free_ char *home = NULL, *root = NULL, *secondary_root = NULL, *srv = NULL, *generic = NULL;
2773         _cleanup_udev_enumerate_unref_ struct udev_enumerate *e = NULL;
2774         _cleanup_udev_device_unref_ struct udev_device *d = NULL;
2775         _cleanup_blkid_free_probe_ blkid_probe b = NULL;
2776         _cleanup_udev_unref_ struct udev *udev = NULL;
2777         struct udev_list_entry *first, *item;
2778         bool home_rw = true, root_rw = true, secondary_root_rw = true, srv_rw = true, generic_rw = true;
2779         bool is_gpt, is_mbr, multiple_generic = false;
2780         const char *pttype = NULL;
2781         blkid_partlist pl;
2782         struct stat st;
2783         unsigned i;
2784         int r;
2785
2786         assert(fd >= 0);
2787         assert(root_device);
2788         assert(home_device);
2789         assert(srv_device);
2790         assert(secondary);
2791         assert(arg_image);
2792
2793         b = blkid_new_probe();
2794         if (!b)
2795                 return log_oom();
2796
2797         errno = 0;
2798         r = blkid_probe_set_device(b, fd, 0, 0);
2799         if (r != 0) {
2800                 if (errno == 0)
2801                         return log_oom();
2802
2803                 log_error_errno(errno, "Failed to set device on blkid probe: %m");
2804                 return -errno;
2805         }
2806
2807         blkid_probe_enable_partitions(b, 1);
2808         blkid_probe_set_partitions_flags(b, BLKID_PARTS_ENTRY_DETAILS);
2809
2810         errno = 0;
2811         r = blkid_do_safeprobe(b);
2812         if (r == -2 || r == 1) {
2813                 log_error("Failed to identify any partition table on\n"
2814                           "    %s\n"
2815                           PARTITION_TABLE_BLURB, arg_image);
2816                 return -EINVAL;
2817         } else if (r != 0) {
2818                 if (errno == 0)
2819                         errno = EIO;
2820                 log_error_errno(errno, "Failed to probe: %m");
2821                 return -errno;
2822         }
2823
2824         blkid_probe_lookup_value(b, "PTTYPE", &pttype, NULL);
2825
2826         is_gpt = streq_ptr(pttype, "gpt");
2827         is_mbr = streq_ptr(pttype, "dos");
2828
2829         if (!is_gpt && !is_mbr) {
2830                 log_error("No GPT or MBR partition table discovered on\n"
2831                           "    %s\n"
2832                           PARTITION_TABLE_BLURB, arg_image);
2833                 return -EINVAL;
2834         }
2835
2836         errno = 0;
2837         pl = blkid_probe_get_partitions(b);
2838         if (!pl) {
2839                 if (errno == 0)
2840                         return log_oom();
2841
2842                 log_error("Failed to list partitions of %s", arg_image);
2843                 return -errno;
2844         }
2845
2846         udev = udev_new();
2847         if (!udev)
2848                 return log_oom();
2849
2850         if (fstat(fd, &st) < 0)
2851                 return log_error_errno(errno, "Failed to stat block device: %m");
2852
2853         d = udev_device_new_from_devnum(udev, 'b', st.st_rdev);
2854         if (!d)
2855                 return log_oom();
2856
2857         for (i = 0;; i++) {
2858                 int n, m;
2859
2860                 if (i >= 10) {
2861                         log_error("Kernel partitions never appeared.");
2862                         return -ENXIO;
2863                 }
2864
2865                 e = udev_enumerate_new(udev);
2866                 if (!e)
2867                         return log_oom();
2868
2869                 r = udev_enumerate_add_match_parent(e, d);
2870                 if (r < 0)
2871                         return log_oom();
2872
2873                 r = udev_enumerate_scan_devices(e);
2874                 if (r < 0)
2875                         return log_error_errno(r, "Failed to scan for partition devices of %s: %m", arg_image);
2876
2877                 /* Count the partitions enumerated by the kernel */
2878                 n = 0;
2879                 first = udev_enumerate_get_list_entry(e);
2880                 udev_list_entry_foreach(item, first)
2881                         n++;
2882
2883                 /* Count the partitions enumerated by blkid */
2884                 m = blkid_partlist_numof_partitions(pl);
2885                 if (n == m + 1)
2886                         break;
2887                 if (n > m + 1) {
2888                         log_error("blkid and kernel partition list do not match.");
2889                         return -EIO;
2890                 }
2891                 if (n < m + 1) {
2892                         unsigned j;
2893
2894                         /* The kernel has probed fewer partitions than
2895                          * blkid? Maybe the kernel prober is still
2896                          * running or it got EBUSY because udev
2897                          * already opened the device. Let's reprobe
2898                          * the device, which is a synchronous call
2899                          * that waits until probing is complete. */
2900
2901                         for (j = 0; j < 20; j++) {
2902
2903                                 r = ioctl(fd, BLKRRPART, 0);
2904                                 if (r < 0)
2905                                         r = -errno;
2906                                 if (r >= 0 || r != -EBUSY)
2907                                         break;
2908
2909                                 /* If something else has the device
2910                                  * open, such as an udev rule, the
2911                                  * ioctl will return EBUSY. Since
2912                                  * there's no way to wait until it
2913                                  * isn't busy anymore, let's just wait
2914                                  * a bit, and try again.
2915                                  *
2916                                  * This is really something they
2917                                  * should fix in the kernel! */
2918
2919                                 usleep(50 * USEC_PER_MSEC);
2920                         }
2921
2922                         if (r < 0)
2923                                 return log_error_errno(r, "Failed to reread partition table: %m");
2924                 }
2925
2926                 e = udev_enumerate_unref(e);
2927         }
2928
2929         first = udev_enumerate_get_list_entry(e);
2930         udev_list_entry_foreach(item, first) {
2931                 _cleanup_udev_device_unref_ struct udev_device *q;
2932                 const char *node;
2933                 unsigned long long flags;
2934                 blkid_partition pp;
2935                 dev_t qn;
2936                 int nr;
2937
2938                 errno = 0;
2939                 q = udev_device_new_from_syspath(udev, udev_list_entry_get_name(item));
2940                 if (!q) {
2941                         if (!errno)
2942                                 errno = ENOMEM;
2943
2944                         log_error_errno(errno, "Failed to get partition device of %s: %m", arg_image);
2945                         return -errno;
2946                 }
2947
2948                 qn = udev_device_get_devnum(q);
2949                 if (major(qn) == 0)
2950                         continue;
2951
2952                 if (st.st_rdev == qn)
2953                         continue;
2954
2955                 node = udev_device_get_devnode(q);
2956                 if (!node)
2957                         continue;
2958
2959                 pp = blkid_partlist_devno_to_partition(pl, qn);
2960                 if (!pp)
2961                         continue;
2962
2963                 flags = blkid_partition_get_flags(pp);
2964
2965                 nr = blkid_partition_get_partno(pp);
2966                 if (nr < 0)
2967                         continue;
2968
2969                 if (is_gpt) {
2970                         sd_id128_t type_id;
2971                         const char *stype;
2972
2973                         if (flags & GPT_FLAG_NO_AUTO)
2974                                 continue;
2975
2976                         stype = blkid_partition_get_type_string(pp);
2977                         if (!stype)
2978                                 continue;
2979
2980                         if (sd_id128_from_string(stype, &type_id) < 0)
2981                                 continue;
2982
2983                         if (sd_id128_equal(type_id, GPT_HOME)) {
2984
2985                                 if (home && nr >= home_nr)
2986                                         continue;
2987
2988                                 home_nr = nr;
2989                                 home_rw = !(flags & GPT_FLAG_READ_ONLY);
2990
2991                                 r = free_and_strdup(&home, node);
2992                                 if (r < 0)
2993                                         return log_oom();
2994
2995                         } else if (sd_id128_equal(type_id, GPT_SRV)) {
2996
2997                                 if (srv && nr >= srv_nr)
2998                                         continue;
2999
3000                                 srv_nr = nr;
3001                                 srv_rw = !(flags & GPT_FLAG_READ_ONLY);
3002
3003                                 r = free_and_strdup(&srv, node);
3004                                 if (r < 0)
3005                                         return log_oom();
3006                         }
3007 #ifdef GPT_ROOT_NATIVE
3008                         else if (sd_id128_equal(type_id, GPT_ROOT_NATIVE)) {
3009
3010                                 if (root && nr >= root_nr)
3011                                         continue;
3012
3013                                 root_nr = nr;
3014                                 root_rw = !(flags & GPT_FLAG_READ_ONLY);
3015
3016                                 r = free_and_strdup(&root, node);
3017                                 if (r < 0)
3018                                         return log_oom();
3019                         }
3020 #endif
3021 #ifdef GPT_ROOT_SECONDARY
3022                         else if (sd_id128_equal(type_id, GPT_ROOT_SECONDARY)) {
3023
3024                                 if (secondary_root && nr >= secondary_root_nr)
3025                                         continue;
3026
3027                                 secondary_root_nr = nr;
3028                                 secondary_root_rw = !(flags & GPT_FLAG_READ_ONLY);
3029
3030                                 r = free_and_strdup(&secondary_root, node);
3031                                 if (r < 0)
3032                                         return log_oom();
3033                         }
3034 #endif
3035                         else if (sd_id128_equal(type_id, GPT_LINUX_GENERIC)) {
3036
3037                                 if (generic)
3038                                         multiple_generic = true;
3039                                 else {
3040                                         generic_rw = !(flags & GPT_FLAG_READ_ONLY);
3041
3042                                         r = free_and_strdup(&generic, node);
3043                                         if (r < 0)
3044                                                 return log_oom();
3045                                 }
3046                         }
3047
3048                 } else if (is_mbr) {
3049                         int type;
3050
3051                         if (flags != 0x80) /* Bootable flag */
3052                                 continue;
3053
3054                         type = blkid_partition_get_type(pp);
3055                         if (type != 0x83) /* Linux partition */
3056                                 continue;
3057
3058                         if (generic)
3059                                 multiple_generic = true;
3060                         else {
3061                                 generic_rw = true;
3062
3063                                 r = free_and_strdup(&root, node);
3064                                 if (r < 0)
3065                                         return log_oom();
3066                         }
3067                 }