chiark / gitweb /
nspawn: mount /tmp in the container, don't leave this to the container's init
[elogind.git] / src / nspawn / nspawn.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4   This file is part of systemd.
5
6   Copyright 2010 Lennart Poettering
7
8   systemd is free software; you can redistribute it and/or modify it
9   under the terms of the GNU Lesser General Public License as published by
10   the Free Software Foundation; either version 2.1 of the License, or
11   (at your option) any later version.
12
13   systemd is distributed in the hope that it will be useful, but
14   WITHOUT ANY WARRANTY; without even the implied warranty of
15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16   Lesser General Public License for more details.
17
18   You should have received a copy of the GNU Lesser General Public License
19   along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <signal.h>
23 #include <sched.h>
24 #include <unistd.h>
25 #include <sys/types.h>
26 #include <sys/syscall.h>
27 #include <sys/mount.h>
28 #include <sys/wait.h>
29 #include <stdlib.h>
30 #include <string.h>
31 #include <stdio.h>
32 #include <errno.h>
33 #include <sys/prctl.h>
34 #include <getopt.h>
35 #include <termios.h>
36 #include <sys/signalfd.h>
37 #include <grp.h>
38 #include <linux/fs.h>
39 #include <sys/un.h>
40 #include <sys/socket.h>
41 #include <linux/netlink.h>
42 #include <net/if.h>
43 #include <linux/veth.h>
44 #include <sys/personality.h>
45 #include <linux/loop.h>
46 #include <poll.h>
47 #include <sys/file.h>
48
49 #ifdef HAVE_SELINUX
50 #include <selinux/selinux.h>
51 #endif
52
53 #ifdef HAVE_SECCOMP
54 #include <seccomp.h>
55 #endif
56
57 #ifdef HAVE_BLKID
58 #include <blkid/blkid.h>
59 #endif
60
61 #include "sd-daemon.h"
62 #include "sd-bus.h"
63 #include "sd-id128.h"
64 #include "sd-rtnl.h"
65 #include "log.h"
66 #include "util.h"
67 #include "mkdir.h"
68 #include "macro.h"
69 #include "audit.h"
70 #include "missing.h"
71 #include "cgroup-util.h"
72 #include "strv.h"
73 #include "path-util.h"
74 #include "loopback-setup.h"
75 #include "dev-setup.h"
76 #include "fdset.h"
77 #include "build.h"
78 #include "fileio.h"
79 #include "bus-util.h"
80 #include "bus-error.h"
81 #include "ptyfwd.h"
82 #include "bus-kernel.h"
83 #include "env-util.h"
84 #include "def.h"
85 #include "rtnl-util.h"
86 #include "udev-util.h"
87 #include "blkid-util.h"
88 #include "gpt.h"
89 #include "siphash24.h"
90 #include "copy.h"
91 #include "base-filesystem.h"
92 #include "barrier.h"
93 #include "event-util.h"
94 #include "capability.h"
95 #include "cap-list.h"
96 #include "btrfs-util.h"
97 #include "machine-image.h"
98 #include "list.h"
99 #include "in-addr-util.h"
100 #include "fw-util.h"
101 #include "local-addresses.h"
102
103 #ifdef HAVE_SECCOMP
104 #include "seccomp-util.h"
105 #endif
106
107 typedef struct ExposePort {
108         int protocol;
109         uint16_t host_port;
110         uint16_t container_port;
111         LIST_FIELDS(struct ExposePort, ports);
112 } ExposePort;
113
/* Outcome of a container run.
 * NOTE(review): the code consuming these values is not part of this excerpt;
 * presumably REBOOTED asks for the container to be started again -- confirm. */
typedef enum ContainerStatus {
        CONTAINER_TERMINATED = 0,
        CONTAINER_REBOOTED   = 1
} ContainerStatus;
118
/* Journal linking mode selected with --link-journal= (or -j). The "try-"
 * variants of the switch map to LINK_GUEST/LINK_HOST as well and are
 * distinguished through the separate arg_link_journal_try flag. */
typedef enum LinkJournal {
        LINK_NO    = 0,  /* --link-journal=no */
        LINK_AUTO  = 1,  /* --link-journal=auto, the built-in default */
        LINK_HOST  = 2,  /* --link-journal=host and try-host */
        LINK_GUEST = 3   /* --link-journal=guest, try-guest and -j */
} LinkJournal;
125
/* Volatile mode selected with --volatile[=MODE]. */
typedef enum Volatile {
        VOLATILE_NO    = 0,  /* switch absent, or a false boolean was given */
        VOLATILE_YES   = 1,  /* bare --volatile, or a true boolean was given */
        VOLATILE_STATE = 2,  /* --volatile=state */
} Volatile;
131
132 static char *arg_directory = NULL;
133 static char *arg_template = NULL;
134 static char *arg_user = NULL;
135 static sd_id128_t arg_uuid = {};
136 static char *arg_machine = NULL;
137 static const char *arg_selinux_context = NULL;
138 static const char *arg_selinux_apifs_context = NULL;
139 static const char *arg_slice = NULL;
140 static bool arg_private_network = false;
141 static bool arg_read_only = false;
142 static bool arg_boot = false;
143 static bool arg_ephemeral = false;
144 static LinkJournal arg_link_journal = LINK_AUTO;
145 static bool arg_link_journal_try = false;
146 static uint64_t arg_retain =
147         (1ULL << CAP_CHOWN) |
148         (1ULL << CAP_DAC_OVERRIDE) |
149         (1ULL << CAP_DAC_READ_SEARCH) |
150         (1ULL << CAP_FOWNER) |
151         (1ULL << CAP_FSETID) |
152         (1ULL << CAP_IPC_OWNER) |
153         (1ULL << CAP_KILL) |
154         (1ULL << CAP_LEASE) |
155         (1ULL << CAP_LINUX_IMMUTABLE) |
156         (1ULL << CAP_NET_BIND_SERVICE) |
157         (1ULL << CAP_NET_BROADCAST) |
158         (1ULL << CAP_NET_RAW) |
159         (1ULL << CAP_SETGID) |
160         (1ULL << CAP_SETFCAP) |
161         (1ULL << CAP_SETPCAP) |
162         (1ULL << CAP_SETUID) |
163         (1ULL << CAP_SYS_ADMIN) |
164         (1ULL << CAP_SYS_CHROOT) |
165         (1ULL << CAP_SYS_NICE) |
166         (1ULL << CAP_SYS_PTRACE) |
167         (1ULL << CAP_SYS_TTY_CONFIG) |
168         (1ULL << CAP_SYS_RESOURCE) |
169         (1ULL << CAP_SYS_BOOT) |
170         (1ULL << CAP_AUDIT_WRITE) |
171         (1ULL << CAP_AUDIT_CONTROL) |
172         (1ULL << CAP_MKNOD);
173 static char **arg_bind = NULL;
174 static char **arg_bind_ro = NULL;
175 static char **arg_tmpfs = NULL;
176 static char **arg_setenv = NULL;
177 static bool arg_quiet = false;
178 static bool arg_share_system = false;
179 static bool arg_register = true;
180 static bool arg_keep_unit = false;
181 static char **arg_network_interfaces = NULL;
182 static char **arg_network_macvlan = NULL;
183 static char **arg_network_ipvlan = NULL;
184 static bool arg_network_veth = false;
185 static const char *arg_network_bridge = NULL;
186 static unsigned long arg_personality = 0xffffffffLU;
187 static char *arg_image = NULL;
188 static Volatile arg_volatile = VOLATILE_NO;
189 static ExposePort *arg_expose_ports = NULL;
190
/* Prints the usage summary to stdout, prefixed with the name the program
 * was invoked as. Every switch known to the option table of parse_argv()
 * should have an entry here. */
static void help(void) {
        printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
               "Spawn a minimal namespace container for debugging, testing and building.\n\n"
               "  -h --help                 Show this help\n"
               "     --version              Print version string\n"
               "  -q --quiet                Do not show status information\n"
               "  -D --directory=PATH       Root directory for the container\n"
               "     --template=PATH        Initialize root directory from template directory,\n"
               "                            if missing\n"
               "  -x --ephemeral            Run container with snapshot of root directory, and\n"
               "                            remove it after exit\n"
               "  -i --image=PATH           File system device or disk image for the container\n"
               "  -b --boot                 Boot up full system (i.e. invoke init)\n"
               "  -u --user=USER            Run the command under specified user or uid\n"
               "  -M --machine=NAME         Set the machine name for the container\n"
               "     --uuid=UUID            Set a specific machine UUID for the container\n"
               "  -S --slice=SLICE          Place the container in the specified slice\n"
               "     --private-network      Disable network in container\n"
               "     --network-interface=INTERFACE\n"
               "                            Assign an existing network interface to the\n"
               "                            container\n"
               "     --network-macvlan=INTERFACE\n"
               "                            Create a macvlan network interface based on an\n"
               "                            existing network interface to the container\n"
               "     --network-ipvlan=INTERFACE\n"
               "                            Create an ipvlan network interface based on an\n"
               "                            existing network interface to the container\n"
               "  -n --network-veth         Add a virtual ethernet connection between host\n"
               "                            and container\n"
               "     --network-bridge=INTERFACE\n"
               "                            Add a virtual ethernet connection between host\n"
               "                            and container and add it to an existing bridge on\n"
               "                            the host\n"
               "  -p --port=[PROTOCOL:]HOSTPORT[:CONTAINERPORT]\n"
               "                            Expose a container IP port on the host\n"
               "  -Z --selinux-context=SECLABEL\n"
               "                            Set the SELinux security context to be used by\n"
               "                            processes in the container\n"
               "  -L --selinux-apifs-context=SECLABEL\n"
               "                            Set the SELinux security context to be used by\n"
               "                            API/tmpfs file systems in the container\n"
               "     --capability=CAP       In addition to the default, retain specified\n"
               "                            capability\n"
               "     --drop-capability=CAP  Drop the specified capability from the default set\n"
               "     --link-journal=MODE    Link up guest journal, one of no, auto, guest, host,\n"
               "                            try-guest, try-host\n"
               "  -j                        Equivalent to --link-journal=try-guest\n"
               "     --read-only            Mount the root directory read-only\n"
               "     --bind=PATH[:PATH]     Bind mount a file or directory from the host into\n"
               "                            the container\n"
               "     --bind-ro=PATH[:PATH]  Similar, but creates a read-only bind mount\n"
               /* The ":OPTIONS" part is optional as a whole; without it the
                * parser falls back to "mode=0755". */
               "     --tmpfs=PATH[:OPTIONS] Mount an empty tmpfs to the specified directory\n"
               "     --setenv=NAME=VALUE    Pass an environment variable to PID 1\n"
               "     --share-system         Share system namespaces with host\n"
               "     --register=BOOLEAN     Register container as machine\n"
               "     --keep-unit            Do not register a scope for the machine, reuse\n"
               "                            the service unit nspawn is running in\n"
               "     --volatile[=MODE]      Run the system in volatile mode\n"
               /* Accepted by the option table, hence listed here as well */
               "     --personality=ARCH     Pick personality for this container\n"
               , program_invocation_short_name);
}
251
/* Stores a cleaned-up copy of "path" in *b.
 *
 * The path is first resolved with canonicalize_file_name(). If that fails
 * because the path does not exist (ENOENT), it is instead made absolute
 * relative to the current working directory, so that not-yet-existing
 * targets are accepted too. The result is passed through
 * path_kill_slashes() before it is stored.
 *
 * On success the previous value of *b is freed and replaced, and 0 is
 * returned. On failure *b is left untouched and a negative errno-style
 * value is returned: -errno from canonicalize_file_name() for anything
 * other than ENOENT, or -ENOMEM if path_make_absolute_cwd() yields NULL. */
static int set_sanitized_path(char **b, const char *path) {
        char *resolved;

        assert(b);
        assert(path);

        resolved = canonicalize_file_name(path);

        /* Any failure other than "does not exist" is fatal */
        if (!resolved && errno != ENOENT)
                return -errno;

        if (!resolved) {
                resolved = path_make_absolute_cwd(path);
                if (!resolved)
                        return -ENOMEM;
        }

        free(*b);
        *b = path_kill_slashes(resolved);

        return 0;
}
272
273 static int parse_argv(int argc, char *argv[]) {
274
275         enum {
276                 ARG_VERSION = 0x100,
277                 ARG_PRIVATE_NETWORK,
278                 ARG_UUID,
279                 ARG_READ_ONLY,
280                 ARG_CAPABILITY,
281                 ARG_DROP_CAPABILITY,
282                 ARG_LINK_JOURNAL,
283                 ARG_BIND,
284                 ARG_BIND_RO,
285                 ARG_TMPFS,
286                 ARG_SETENV,
287                 ARG_SHARE_SYSTEM,
288                 ARG_REGISTER,
289                 ARG_KEEP_UNIT,
290                 ARG_NETWORK_INTERFACE,
291                 ARG_NETWORK_MACVLAN,
292                 ARG_NETWORK_IPVLAN,
293                 ARG_NETWORK_BRIDGE,
294                 ARG_PERSONALITY,
295                 ARG_VOLATILE,
296                 ARG_TEMPLATE,
297         };
298
299         static const struct option options[] = {
300                 { "help",                  no_argument,       NULL, 'h'                   },
301                 { "version",               no_argument,       NULL, ARG_VERSION           },
302                 { "directory",             required_argument, NULL, 'D'                   },
303                 { "template",              required_argument, NULL, ARG_TEMPLATE          },
304                 { "ephemeral",             no_argument,       NULL, 'x'                   },
305                 { "user",                  required_argument, NULL, 'u'                   },
306                 { "private-network",       no_argument,       NULL, ARG_PRIVATE_NETWORK   },
307                 { "boot",                  no_argument,       NULL, 'b'                   },
308                 { "uuid",                  required_argument, NULL, ARG_UUID              },
309                 { "read-only",             no_argument,       NULL, ARG_READ_ONLY         },
310                 { "capability",            required_argument, NULL, ARG_CAPABILITY        },
311                 { "drop-capability",       required_argument, NULL, ARG_DROP_CAPABILITY   },
312                 { "link-journal",          required_argument, NULL, ARG_LINK_JOURNAL      },
313                 { "bind",                  required_argument, NULL, ARG_BIND              },
314                 { "bind-ro",               required_argument, NULL, ARG_BIND_RO           },
315                 { "tmpfs",                 required_argument, NULL, ARG_TMPFS             },
316                 { "machine",               required_argument, NULL, 'M'                   },
317                 { "slice",                 required_argument, NULL, 'S'                   },
318                 { "setenv",                required_argument, NULL, ARG_SETENV            },
319                 { "selinux-context",       required_argument, NULL, 'Z'                   },
320                 { "selinux-apifs-context", required_argument, NULL, 'L'                   },
321                 { "quiet",                 no_argument,       NULL, 'q'                   },
322                 { "share-system",          no_argument,       NULL, ARG_SHARE_SYSTEM      },
323                 { "register",              required_argument, NULL, ARG_REGISTER          },
324                 { "keep-unit",             no_argument,       NULL, ARG_KEEP_UNIT         },
325                 { "network-interface",     required_argument, NULL, ARG_NETWORK_INTERFACE },
326                 { "network-macvlan",       required_argument, NULL, ARG_NETWORK_MACVLAN   },
327                 { "network-ipvlan",        required_argument, NULL, ARG_NETWORK_IPVLAN    },
328                 { "network-veth",          no_argument,       NULL, 'n'                   },
329                 { "network-bridge",        required_argument, NULL, ARG_NETWORK_BRIDGE    },
330                 { "personality",           required_argument, NULL, ARG_PERSONALITY       },
331                 { "image",                 required_argument, NULL, 'i'                   },
332                 { "volatile",              optional_argument, NULL, ARG_VOLATILE          },
333                 { "port",                  required_argument, NULL, 'p'                   },
334                 {}
335         };
336
337         int c, r;
338         uint64_t plus = 0, minus = 0;
339
340         assert(argc >= 0);
341         assert(argv);
342
343         while ((c = getopt_long(argc, argv, "+hD:u:bL:M:jS:Z:qi:xp:n", options, NULL)) >= 0)
344
345                 switch (c) {
346
347                 case 'h':
348                         help();
349                         return 0;
350
351                 case ARG_VERSION:
352                         puts(PACKAGE_STRING);
353                         puts(SYSTEMD_FEATURES);
354                         return 0;
355
356                 case 'D':
357                         r = set_sanitized_path(&arg_directory, optarg);
358                         if (r < 0)
359                                 return log_error_errno(r, "Invalid root directory: %m");
360
361                         break;
362
363                 case ARG_TEMPLATE:
364                         r = set_sanitized_path(&arg_template, optarg);
365                         if (r < 0)
366                                 return log_error_errno(r, "Invalid template directory: %m");
367
368                         break;
369
370                 case 'i':
371                         r = set_sanitized_path(&arg_image, optarg);
372                         if (r < 0)
373                                 return log_error_errno(r, "Invalid image path: %m");
374
375                         break;
376
377                 case 'x':
378                         arg_ephemeral = true;
379                         break;
380
381                 case 'u':
382                         free(arg_user);
383                         arg_user = strdup(optarg);
384                         if (!arg_user)
385                                 return log_oom();
386
387                         break;
388
389                 case ARG_NETWORK_BRIDGE:
390                         arg_network_bridge = optarg;
391
392                         /* fall through */
393
394                 case 'n':
395                         arg_network_veth = true;
396                         arg_private_network = true;
397                         break;
398
399                 case ARG_NETWORK_INTERFACE:
400                         if (strv_extend(&arg_network_interfaces, optarg) < 0)
401                                 return log_oom();
402
403                         arg_private_network = true;
404                         break;
405
406                 case ARG_NETWORK_MACVLAN:
407                         if (strv_extend(&arg_network_macvlan, optarg) < 0)
408                                 return log_oom();
409
410                         arg_private_network = true;
411                         break;
412
413                 case ARG_NETWORK_IPVLAN:
414                         if (strv_extend(&arg_network_ipvlan, optarg) < 0)
415                                 return log_oom();
416
417                         /* fall through */
418
419                 case ARG_PRIVATE_NETWORK:
420                         arg_private_network = true;
421                         break;
422
423                 case 'b':
424                         arg_boot = true;
425                         break;
426
427                 case ARG_UUID:
428                         r = sd_id128_from_string(optarg, &arg_uuid);
429                         if (r < 0) {
430                                 log_error("Invalid UUID: %s", optarg);
431                                 return r;
432                         }
433                         break;
434
435                 case 'S':
436                         arg_slice = optarg;
437                         break;
438
439                 case 'M':
440                         if (isempty(optarg)) {
441                                 free(arg_machine);
442                                 arg_machine = NULL;
443                         } else {
444                                 if (!machine_name_is_valid(optarg)) {
445                                         log_error("Invalid machine name: %s", optarg);
446                                         return -EINVAL;
447                                 }
448
449                                 r = free_and_strdup(&arg_machine, optarg);
450                                 if (r < 0)
451                                         return log_oom();
452
453                                 break;
454                         }
455
456                 case 'Z':
457                         arg_selinux_context = optarg;
458                         break;
459
460                 case 'L':
461                         arg_selinux_apifs_context = optarg;
462                         break;
463
464                 case ARG_READ_ONLY:
465                         arg_read_only = true;
466                         break;
467
468                 case ARG_CAPABILITY:
469                 case ARG_DROP_CAPABILITY: {
470                         const char *state, *word;
471                         size_t length;
472
473                         FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
474                                 _cleanup_free_ char *t;
475
476                                 t = strndup(word, length);
477                                 if (!t)
478                                         return log_oom();
479
480                                 if (streq(t, "all")) {
481                                         if (c == ARG_CAPABILITY)
482                                                 plus = (uint64_t) -1;
483                                         else
484                                                 minus = (uint64_t) -1;
485                                 } else {
486                                         int cap;
487
488                                         cap = capability_from_name(t);
489                                         if (cap < 0) {
490                                                 log_error("Failed to parse capability %s.", t);
491                                                 return -EINVAL;
492                                         }
493
494                                         if (c == ARG_CAPABILITY)
495                                                 plus |= 1ULL << (uint64_t) cap;
496                                         else
497                                                 minus |= 1ULL << (uint64_t) cap;
498                                 }
499                         }
500
501                         break;
502                 }
503
504                 case 'j':
505                         arg_link_journal = LINK_GUEST;
506                         arg_link_journal_try = true;
507                         break;
508
509                 case ARG_LINK_JOURNAL:
510                         if (streq(optarg, "auto")) {
511                                 arg_link_journal = LINK_AUTO;
512                                 arg_link_journal_try = false;
513                         } else if (streq(optarg, "no")) {
514                                 arg_link_journal = LINK_NO;
515                                 arg_link_journal_try = false;
516                         } else if (streq(optarg, "guest")) {
517                                 arg_link_journal = LINK_GUEST;
518                                 arg_link_journal_try = false;
519                         } else if (streq(optarg, "host")) {
520                                 arg_link_journal = LINK_HOST;
521                                 arg_link_journal_try = false;
522                         } else if (streq(optarg, "try-guest")) {
523                                 arg_link_journal = LINK_GUEST;
524                                 arg_link_journal_try = true;
525                         } else if (streq(optarg, "try-host")) {
526                                 arg_link_journal = LINK_HOST;
527                                 arg_link_journal_try = true;
528                         } else {
529                                 log_error("Failed to parse link journal mode %s", optarg);
530                                 return -EINVAL;
531                         }
532
533                         break;
534
535                 case ARG_BIND:
536                 case ARG_BIND_RO: {
537                         _cleanup_free_ char *a = NULL, *b = NULL;
538                         char *e;
539                         char ***x;
540
541                         x = c == ARG_BIND ? &arg_bind : &arg_bind_ro;
542
543                         e = strchr(optarg, ':');
544                         if (e) {
545                                 a = strndup(optarg, e - optarg);
546                                 b = strdup(e + 1);
547                         } else {
548                                 a = strdup(optarg);
549                                 b = strdup(optarg);
550                         }
551
552                         if (!a || !b)
553                                 return log_oom();
554
555                         if (!path_is_absolute(a) || !path_is_absolute(b)) {
556                                 log_error("Invalid bind mount specification: %s", optarg);
557                                 return -EINVAL;
558                         }
559
560                         r = strv_extend(x, a);
561                         if (r < 0)
562                                 return log_oom();
563
564                         r = strv_extend(x, b);
565                         if (r < 0)
566                                 return log_oom();
567
568                         break;
569                 }
570
571                 case ARG_TMPFS: {
572                         _cleanup_free_ char *a = NULL, *b = NULL;
573                         char *e;
574
575                         e = strchr(optarg, ':');
576                         if (e) {
577                                 a = strndup(optarg, e - optarg);
578                                 b = strdup(e + 1);
579                         } else {
580                                 a = strdup(optarg);
581                                 b = strdup("mode=0755");
582                         }
583
584                         if (!a || !b)
585                                 return log_oom();
586
587                         if (!path_is_absolute(a)) {
588                                 log_error("Invalid tmpfs specification: %s", optarg);
589                                 return -EINVAL;
590                         }
591
592                         r = strv_push(&arg_tmpfs, a);
593                         if (r < 0)
594                                 return log_oom();
595
596                         a = NULL;
597
598                         r = strv_push(&arg_tmpfs, b);
599                         if (r < 0)
600                                 return log_oom();
601
602                         b = NULL;
603
604                         break;
605                 }
606
607                 case ARG_SETENV: {
608                         char **n;
609
610                         if (!env_assignment_is_valid(optarg)) {
611                                 log_error("Environment variable assignment '%s' is not valid.", optarg);
612                                 return -EINVAL;
613                         }
614
615                         n = strv_env_set(arg_setenv, optarg);
616                         if (!n)
617                                 return log_oom();
618
619                         strv_free(arg_setenv);
620                         arg_setenv = n;
621                         break;
622                 }
623
624                 case 'q':
625                         arg_quiet = true;
626                         break;
627
628                 case ARG_SHARE_SYSTEM:
629                         arg_share_system = true;
630                         break;
631
632                 case ARG_REGISTER:
633                         r = parse_boolean(optarg);
634                         if (r < 0) {
635                                 log_error("Failed to parse --register= argument: %s", optarg);
636                                 return r;
637                         }
638
639                         arg_register = r;
640                         break;
641
642                 case ARG_KEEP_UNIT:
643                         arg_keep_unit = true;
644                         break;
645
646                 case ARG_PERSONALITY:
647
648                         arg_personality = personality_from_string(optarg);
649                         if (arg_personality == 0xffffffffLU) {
650                                 log_error("Unknown or unsupported personality '%s'.", optarg);
651                                 return -EINVAL;
652                         }
653
654                         break;
655
656                 case ARG_VOLATILE:
657
658                         if (!optarg)
659                                 arg_volatile = VOLATILE_YES;
660                         else {
661                                 r = parse_boolean(optarg);
662                                 if (r < 0) {
663                                         if (streq(optarg, "state"))
664                                                 arg_volatile = VOLATILE_STATE;
665                                         else {
666                                                 log_error("Failed to parse --volatile= argument: %s", optarg);
667                                                 return r;
668                                         }
669                                 } else
670                                         arg_volatile = r ? VOLATILE_YES : VOLATILE_NO;
671                         }
672
673                         break;
674
675                 case 'p': {
676                         const char *split, *e;
677                         uint16_t container_port, host_port;
678                         int protocol;
679                         ExposePort *p;
680
681                         if ((e = startswith(optarg, "tcp:")))
682                                 protocol = IPPROTO_TCP;
683                         else if ((e = startswith(optarg, "udp:")))
684                                 protocol = IPPROTO_UDP;
685                         else {
686                                 e = optarg;
687                                 protocol = IPPROTO_TCP;
688                         }
689
690                         split = strchr(e, ':');
691                         if (split) {
692                                 char v[split - e + 1];
693
694                                 memcpy(v, e, split - e);
695                                 v[split - e] = 0;
696
697                                 r = safe_atou16(v, &host_port);
698                                 if (r < 0 || host_port <= 0) {
699                                         log_error("Failed to parse host port: %s", optarg);
700                                         return -EINVAL;
701                                 }
702
703                                 r = safe_atou16(split + 1, &container_port);
704                         } else {
705                                 r = safe_atou16(e, &container_port);
706                                 host_port = container_port;
707                         }
708
709                         if (r < 0 || container_port <= 0) {
710                                 log_error("Failed to parse host port: %s", optarg);
711                                 return -EINVAL;
712                         }
713
714                         LIST_FOREACH(ports, p, arg_expose_ports) {
715                                 if (p->protocol == protocol && p->host_port == host_port) {
716                                         log_error("Duplicate port specification: %s", optarg);
717                                         return -EINVAL;
718                                 }
719                         }
720
721                         p = new(ExposePort, 1);
722                         if (!p)
723                                 return log_oom();
724
725                         p->protocol = protocol;
726                         p->host_port = host_port;
727                         p->container_port = container_port;
728
729                         LIST_PREPEND(ports, arg_expose_ports, p);
730
731                         break;
732                 }
733
734                 case '?':
735                         return -EINVAL;
736
737                 default:
738                         assert_not_reached("Unhandled option");
739                 }
740
741         if (arg_share_system)
742                 arg_register = false;
743
744         if (arg_boot && arg_share_system) {
745                 log_error("--boot and --share-system may not be combined.");
746                 return -EINVAL;
747         }
748
749         if (arg_keep_unit && cg_pid_get_owner_uid(0, NULL) >= 0) {
750                 log_error("--keep-unit may not be used when invoked from a user session.");
751                 return -EINVAL;
752         }
753
754         if (arg_directory && arg_image) {
755                 log_error("--directory= and --image= may not be combined.");
756                 return -EINVAL;
757         }
758
759         if (arg_template && arg_image) {
760                 log_error("--template= and --image= may not be combined.");
761                 return -EINVAL;
762         }
763
764         if (arg_template && !(arg_directory || arg_machine)) {
765                 log_error("--template= needs --directory= or --machine=.");
766                 return -EINVAL;
767         }
768
769         if (arg_ephemeral && arg_template) {
770                 log_error("--ephemeral and --template= may not be combined.");
771                 return -EINVAL;
772         }
773
774         if (arg_ephemeral && arg_image) {
775                 log_error("--ephemeral and --image= may not be combined.");
776                 return -EINVAL;
777         }
778
779         if (arg_ephemeral && !IN_SET(arg_link_journal, LINK_NO, LINK_AUTO)) {
780                 log_error("--ephemeral and --link-journal= may not be combined.");
781                 return -EINVAL;
782         }
783
784         if (arg_volatile != VOLATILE_NO && arg_read_only) {
785                 log_error("Cannot combine --read-only with --volatile. Note that --volatile already implies a read-only base hierarchy.");
786                 return -EINVAL;
787         }
788
789         if (arg_expose_ports && !arg_private_network) {
790                 log_error("Cannot use --port= without private networking.");
791                 return -EINVAL;
792         }
793
794         arg_retain = (arg_retain | plus | (arg_private_network ? 1ULL << CAP_NET_ADMIN : 0)) & ~minus;
795
796         return 1;
797 }
798
799 static int mount_all(const char *dest) {
800
801         typedef struct MountPoint {
802                 const char *what;
803                 const char *where;
804                 const char *type;
805                 const char *options;
806                 unsigned long flags;
807                 bool fatal;
808         } MountPoint;
809
810         static const MountPoint mount_table[] = {
811                 { "proc",      "/proc",     "proc",  NULL,        MS_NOSUID|MS_NOEXEC|MS_NODEV,           true  },
812                 { "/proc/sys", "/proc/sys", NULL,    NULL,        MS_BIND,                                true  },   /* Bind mount first */
813                 { NULL,        "/proc/sys", NULL,    NULL,        MS_BIND|MS_RDONLY|MS_REMOUNT,           true  },   /* Then, make it r/o */
814                 { "sysfs",     "/sys",      "sysfs", NULL,        MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true  },
815                 { "tmpfs",     "/dev",      "tmpfs", "mode=755",  MS_NOSUID|MS_STRICTATIME,               true  },
816                 { "devpts",    "/dev/pts",  "devpts","newinstance,ptmxmode=0666,mode=620,gid=" STRINGIFY(TTY_GID), MS_NOSUID|MS_NOEXEC, true },
817                 { "tmpfs",     "/dev/shm",  "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME,      true  },
818                 { "tmpfs",     "/run",      "tmpfs", "mode=755",  MS_NOSUID|MS_NODEV|MS_STRICTATIME,      true  },
819                 { "tmpfs",     "/tmp",      "tmpfs", "mode=1777", MS_STRICTATIME,                         true  },
820 #ifdef HAVE_SELINUX
821                 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND,                              false },  /* Bind mount first */
822                 { NULL,              "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT,         false },  /* Then, make it r/o */
823 #endif
824         };
825
826         unsigned k;
827         int r = 0;
828
829         for (k = 0; k < ELEMENTSOF(mount_table); k++) {
830                 _cleanup_free_ char *where = NULL;
831 #ifdef HAVE_SELINUX
832                 _cleanup_free_ char *options = NULL;
833 #endif
834                 const char *o;
835                 int t;
836
837                 where = strjoin(dest, "/", mount_table[k].where, NULL);
838                 if (!where)
839                         return log_oom();
840
841                 t = path_is_mount_point(where, true);
842                 if (t < 0) {
843                         log_error_errno(t, "Failed to detect whether %s is a mount point: %m", where);
844
845                         if (r == 0)
846                                 r = t;
847
848                         continue;
849                 }
850
851                 /* Skip this entry if it is not a remount. */
852                 if (mount_table[k].what && t > 0)
853                         continue;
854
855                 t = mkdir_p(where, 0755);
856                 if (t < 0) {
857                         if (mount_table[k].fatal) {
858                                log_error_errno(t, "Failed to create directory %s: %m", where);
859
860                                 if (r == 0)
861                                         r = t;
862                         } else
863                                log_warning_errno(t, "Failed to create directory %s: %m", where);
864
865                         continue;
866                 }
867
868 #ifdef HAVE_SELINUX
869                 if (arg_selinux_apifs_context &&
870                     (streq_ptr(mount_table[k].what, "tmpfs") || streq_ptr(mount_table[k].what, "devpts"))) {
871                         options = strjoin(mount_table[k].options, ",context=\"", arg_selinux_apifs_context, "\"", NULL);
872                         if (!options)
873                                 return log_oom();
874
875                         o = options;
876                 } else
877 #endif
878                         o = mount_table[k].options;
879
880
881                 if (mount(mount_table[k].what,
882                           where,
883                           mount_table[k].type,
884                           mount_table[k].flags,
885                           o) < 0) {
886
887                         if (mount_table[k].fatal) {
888                                 log_error_errno(errno, "mount(%s) failed: %m", where);
889
890                                 if (r == 0)
891                                         r = -errno;
892                         } else
893                                 log_warning_errno(errno, "mount(%s) failed: %m", where);
894                 }
895         }
896
897         return r;
898 }
899
900 static int mount_binds(const char *dest, char **l, bool ro) {
901         char **x, **y;
902
903         STRV_FOREACH_PAIR(x, y, l) {
904                 _cleanup_free_ char *where = NULL;
905                 struct stat source_st, dest_st;
906                 int r;
907
908                 if (stat(*x, &source_st) < 0)
909                         return log_error_errno(errno, "Failed to stat %s: %m", *x);
910
911                 where = strappend(dest, *y);
912                 if (!where)
913                         return log_oom();
914
915                 r = stat(where, &dest_st);
916                 if (r == 0) {
917                         if (S_ISDIR(source_st.st_mode) && !S_ISDIR(dest_st.st_mode)) {
918                                 log_error("Cannot bind mount directory %s on file %s.", *x, where);
919                                 return -EINVAL;
920                         }
921                         if (!S_ISDIR(source_st.st_mode) && S_ISDIR(dest_st.st_mode)) {
922                                 log_error("Cannot bind mount file %s on directory %s.", *x, where);
923                                 return -EINVAL;
924                         }
925                 } else if (errno == ENOENT) {
926                         r = mkdir_parents_label(where, 0755);
927                         if (r < 0)
928                                 return log_error_errno(r, "Failed to bind mount %s: %m", *x);
929                 } else {
930                         log_error_errno(errno, "Failed to bind mount %s: %m", *x);
931                         return -errno;
932                 }
933
934                 /* Create the mount point. Any non-directory file can be
935                  * mounted on any non-directory file (regular, fifo, socket,
936                  * char, block).
937                  */
938                 if (S_ISDIR(source_st.st_mode)) {
939                         r = mkdir_label(where, 0755);
940                         if (r < 0 && errno != EEXIST)
941                                 return log_error_errno(r, "Failed to create mount point %s: %m", where);
942                 } else {
943                         r = touch(where);
944                         if (r < 0)
945                                 return log_error_errno(r, "Failed to create mount point %s: %m", where);
946                 }
947
948                 if (mount(*x, where, "bind", MS_BIND, NULL) < 0)
949                         return log_error_errno(errno, "mount(%s) failed: %m", where);
950
951                 if (ro) {
952                         r = bind_remount_recursive(where, true);
953                         if (r < 0)
954                                 return log_error_errno(r, "Read-Only bind mount failed: %m");
955                 }
956         }
957
958         return 0;
959 }
960
/* Mounts the cgroup hierarchy selected by the mount option string
 * "controller" at <dest>/sys/fs/cgroup/<hierarchy>, read-only if
 * "read_only" is set.
 *
 * Returns 1 if a new mount was established, 0 if the target already is
 * a mount point, and a negative errno-style value on failure. */
static int mount_cgroup_hierarchy(const char *dest, const char *controller, const char *hierarchy, bool read_only) {
        unsigned long flags = MS_NOSUID|MS_NOEXEC|MS_NODEV;
        char *target;
        int mounted;

        target = strappenda(dest, "/sys/fs/cgroup/", hierarchy);

        mounted = path_is_mount_point(target, false);
        if (mounted < 0)
                return log_error_errno(mounted, "Failed to determine if %s is mounted already: %m", target);
        if (mounted > 0)
                return 0;

        /* Result deliberately not checked here; the mount() call below
         * is what gets checked and reported. */
        mkdir_p(target, 0755);

        if (read_only)
                flags |= MS_RDONLY;

        if (mount("cgroup", target, "cgroup", flags, controller) < 0)
                return log_error_errno(errno, "Failed to mount to %s: %m", target);

        return 1;
}
980
981 static int mount_cgroup(const char *dest) {
982         _cleanup_set_free_free_ Set *controllers = NULL;
983         _cleanup_free_ char *own_cgroup_path = NULL;
984         const char *cgroup_root, *systemd_root, *systemd_own;
985         int r;
986
987         controllers = set_new(&string_hash_ops);
988         if (!controllers)
989                 return log_oom();
990
991         r = cg_kernel_controllers(controllers);
992         if (r < 0)
993                 return log_error_errno(r, "Failed to determine cgroup controllers: %m");
994
995         r = cg_pid_get_path(NULL, 0, &own_cgroup_path);
996         if (r < 0)
997                 return log_error_errno(r, "Failed to determine our own cgroup path: %m");
998
999         cgroup_root = strappenda(dest, "/sys/fs/cgroup");
1000         if (mount("tmpfs", cgroup_root, "tmpfs", MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME, "mode=755") < 0)
1001                 return log_error_errno(errno, "Failed to mount tmpfs to /sys/fs/cgroup: %m");
1002
1003         for (;;) {
1004                 _cleanup_free_ char *controller = NULL, *origin = NULL, *combined = NULL;
1005
1006                 controller = set_steal_first(controllers);
1007                 if (!controller)
1008                         break;
1009
1010                 origin = strappend("/sys/fs/cgroup/", controller);
1011                 if (!origin)
1012                         return log_oom();
1013
1014                 r = readlink_malloc(origin, &combined);
1015                 if (r == -EINVAL) {
1016                         /* Not a symbolic link, but directly a single cgroup hierarchy */
1017
1018                         r = mount_cgroup_hierarchy(dest, controller, controller, true);
1019                         if (r < 0)
1020                                 return r;
1021
1022                 } else if (r < 0)
1023                         return log_error_errno(r, "Failed to read link %s: %m", origin);
1024                 else {
1025                         _cleanup_free_ char *target = NULL;
1026
1027                         target = strjoin(dest, "/sys/fs/cgroup/", controller, NULL);
1028                         if (!target)
1029                                 return log_oom();
1030
1031                         /* A symbolic link, a combination of controllers in one hierarchy */
1032
1033                         if (!filename_is_valid(combined)) {
1034                                 log_warning("Ignoring invalid combined hierarchy %s.", combined);
1035                                 continue;
1036                         }
1037
1038                         r = mount_cgroup_hierarchy(dest, combined, combined, true);
1039                         if (r < 0)
1040                                 return r;
1041
1042                         if (symlink(combined, target) < 0)
1043                                 return log_error_errno(errno, "Failed to create symlink for combined hierarchy: %m");
1044                 }
1045         }
1046
1047         r = mount_cgroup_hierarchy(dest, "name=systemd", "systemd", false);
1048         if (r < 0)
1049                 return r;
1050
1051         /* Make our own cgroup a (writable) bind mount */
1052         systemd_own = strappenda(dest, "/sys/fs/cgroup/systemd", own_cgroup_path);
1053         if (mount(systemd_own, systemd_own,  NULL, MS_BIND, NULL) < 0)
1054                 return log_error_errno(errno, "Failed to turn %s into a bind mount: %m", own_cgroup_path);
1055
1056         /* And then remount the systemd cgroup root read-only */
1057         systemd_root = strappenda(dest, "/sys/fs/cgroup/systemd");
1058         if (mount(NULL, systemd_root, NULL, MS_BIND|MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_RDONLY, NULL) < 0)
1059                 return log_error_errno(errno, "Failed to mount cgroup root read-only: %m");
1060
1061         if (mount(NULL, cgroup_root, NULL, MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME|MS_RDONLY, "mode=755") < 0)
1062                 return log_error_errno(errno, "Failed to remount %s read-only: %m", cgroup_root);
1063
1064         return 0;
1065 }
1066
1067 static int mount_tmpfs(const char *dest) {
1068         char **i, **o;
1069
1070         STRV_FOREACH_PAIR(i, o, arg_tmpfs) {
1071                 _cleanup_free_ char *where = NULL;
1072                 int r;
1073
1074                 where = strappend(dest, *i);
1075                 if (!where)
1076                         return log_oom();
1077
1078                 r = mkdir_label(where, 0755);
1079                 if (r < 0 && r != -EEXIST)
1080                         return log_error_errno(r, "Creating mount point for tmpfs %s failed: %m", where);
1081
1082                 if (mount("tmpfs", where, "tmpfs", MS_NODEV|MS_STRICTATIME, *o) < 0)
1083                         return log_error_errno(errno, "tmpfs mount to %s failed: %m", where);
1084         }
1085
1086         return 0;
1087 }
1088
1089 static int setup_timezone(const char *dest) {
1090         _cleanup_free_ char *where = NULL, *p = NULL, *q = NULL, *check = NULL, *what = NULL;
1091         char *z, *y;
1092         int r;
1093
1094         assert(dest);
1095
1096         /* Fix the timezone, if possible */
1097         r = readlink_malloc("/etc/localtime", &p);
1098         if (r < 0) {
1099                 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
1100                 return 0;
1101         }
1102
1103         z = path_startswith(p, "../usr/share/zoneinfo/");
1104         if (!z)
1105                 z = path_startswith(p, "/usr/share/zoneinfo/");
1106         if (!z) {
1107                 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
1108                 return 0;
1109         }
1110
1111         where = strappend(dest, "/etc/localtime");
1112         if (!where)
1113                 return log_oom();
1114
1115         r = readlink_malloc(where, &q);
1116         if (r >= 0) {
1117                 y = path_startswith(q, "../usr/share/zoneinfo/");
1118                 if (!y)
1119                         y = path_startswith(q, "/usr/share/zoneinfo/");
1120
1121                 /* Already pointing to the right place? Then do nothing .. */
1122                 if (y && streq(y, z))
1123                         return 0;
1124         }
1125
1126         check = strjoin(dest, "/usr/share/zoneinfo/", z, NULL);
1127         if (!check)
1128                 return log_oom();
1129
1130         if (access(check, F_OK) < 0) {
1131                 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
1132                 return 0;
1133         }
1134
1135         what = strappend("../usr/share/zoneinfo/", z);
1136         if (!what)
1137                 return log_oom();
1138
1139         r = mkdir_parents(where, 0755);
1140         if (r < 0) {
1141                 log_error_errno(r, "Failed to create directory for timezone info %s in container: %m", where);
1142
1143                 return 0;
1144         }
1145
1146         r = unlink(where);
1147         if (r < 0 && errno != ENOENT) {
1148                 log_error_errno(errno, "Failed to remove existing timezone info %s in container: %m", where);
1149
1150                 return 0;
1151         }
1152
1153         if (symlink(what, where) < 0) {
1154                 log_error_errno(errno, "Failed to correct timezone of container: %m");
1155                 return 0;
1156         }
1157
1158         return 0;
1159 }
1160
1161 static int setup_resolv_conf(const char *dest) {
1162         _cleanup_free_ char *where = NULL;
1163         int r;
1164
1165         assert(dest);
1166
1167         if (arg_private_network)
1168                 return 0;
1169
1170         /* Fix resolv.conf, if possible */
1171         where = strappend(dest, "/etc/resolv.conf");
1172         if (!where)
1173                 return log_oom();
1174
1175         /* We don't really care for the results of this really. If it
1176          * fails, it fails, but meh... */
1177         r = mkdir_parents(where, 0755);
1178         if (r < 0) {
1179                 log_warning_errno(r, "Failed to create parent directory for resolv.conf %s: %m", where);
1180
1181                 return 0;
1182         }
1183
1184         r = copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW, 0644, 0);
1185         if (r < 0) {
1186                 log_warning_errno(r, "Failed to copy /etc/resolv.conf to %s: %m", where);
1187
1188                 return 0;
1189         }
1190
1191         return 0;
1192 }
1193
1194 static int setup_volatile_state(const char *directory) {
1195         const char *p;
1196         int r;
1197
1198         assert(directory);
1199
1200         if (arg_volatile != VOLATILE_STATE)
1201                 return 0;
1202
1203         /* --volatile=state means we simply overmount /var
1204            with a tmpfs, and the rest read-only. */
1205
1206         r = bind_remount_recursive(directory, true);
1207         if (r < 0)
1208                 return log_error_errno(r, "Failed to remount %s read-only: %m", directory);
1209
1210         p = strappenda(directory, "/var");
1211         r = mkdir(p, 0755);
1212         if (r < 0 && errno != EEXIST)
1213                 return log_error_errno(errno, "Failed to create %s: %m", directory);
1214
1215         if (mount("tmpfs", p, "tmpfs", MS_STRICTATIME, "mode=755") < 0)
1216                 return log_error_errno(errno, "Failed to mount tmpfs to /var: %m");
1217
1218         return 0;
1219 }
1220
1221 static int setup_volatile(const char *directory) {
1222         bool tmpfs_mounted = false, bind_mounted = false;
1223         char template[] = "/tmp/nspawn-volatile-XXXXXX";
1224         const char *f, *t;
1225         int r;
1226
1227         assert(directory);
1228
1229         if (arg_volatile != VOLATILE_YES)
1230                 return 0;
1231
1232         /* --volatile=yes means we mount a tmpfs to the root dir, and
1233            the original /usr to use inside it, and that read-only. */
1234
1235         if (!mkdtemp(template))
1236                 return log_error_errno(errno, "Failed to create temporary directory: %m");
1237
1238         if (mount("tmpfs", template, "tmpfs", MS_STRICTATIME, "mode=755") < 0) {
1239                 log_error_errno(errno, "Failed to mount tmpfs for root directory: %m");
1240                 r = -errno;
1241                 goto fail;
1242         }
1243
1244         tmpfs_mounted = true;
1245
1246         f = strappenda(directory, "/usr");
1247         t = strappenda(template, "/usr");
1248
1249         r = mkdir(t, 0755);
1250         if (r < 0 && errno != EEXIST) {
1251                 log_error_errno(errno, "Failed to create %s: %m", t);
1252                 r = -errno;
1253                 goto fail;
1254         }
1255
1256         if (mount(f, t, "bind", MS_BIND|MS_REC, NULL) < 0) {
1257                 log_error_errno(errno, "Failed to create /usr bind mount: %m");
1258                 r = -errno;
1259                 goto fail;
1260         }
1261
1262         bind_mounted = true;
1263
1264         r = bind_remount_recursive(t, true);
1265         if (r < 0) {
1266                 log_error_errno(r, "Failed to remount %s read-only: %m", t);
1267                 goto fail;
1268         }
1269
1270         if (mount(template, directory, NULL, MS_MOVE, NULL) < 0) {
1271                 log_error_errno(errno, "Failed to move root mount: %m");
1272                 r = -errno;
1273                 goto fail;
1274         }
1275
1276         rmdir(template);
1277
1278         return 0;
1279
1280 fail:
1281         if (bind_mounted)
1282                 umount(t);
1283         if (tmpfs_mounted)
1284                 umount(template);
1285         rmdir(template);
1286         return r;
1287 }
1288
1289 static char* id128_format_as_uuid(sd_id128_t id, char s[37]) {
1290
1291         snprintf(s, 37,
1292                  "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
1293                  SD_ID128_FORMAT_VAL(id));
1294
1295         return s;
1296 }
1297
1298 static int setup_boot_id(const char *dest) {
1299         _cleanup_free_ char *from = NULL, *to = NULL;
1300         sd_id128_t rnd = {};
1301         char as_uuid[37];
1302         int r;
1303
1304         assert(dest);
1305
1306         if (arg_share_system)
1307                 return 0;
1308
1309         /* Generate a new randomized boot ID, so that each boot-up of
1310          * the container gets a new one */
1311
1312         from = strappend(dest, "/dev/proc-sys-kernel-random-boot-id");
1313         to = strappend(dest, "/proc/sys/kernel/random/boot_id");
1314         if (!from || !to)
1315                 return log_oom();
1316
1317         r = sd_id128_randomize(&rnd);
1318         if (r < 0)
1319                 return log_error_errno(r, "Failed to generate random boot id: %m");
1320
1321         id128_format_as_uuid(rnd, as_uuid);
1322
1323         r = write_string_file(from, as_uuid);
1324         if (r < 0)
1325                 return log_error_errno(r, "Failed to write boot id: %m");
1326
1327         if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
1328                 log_error_errno(errno, "Failed to bind mount boot id: %m");
1329                 r = -errno;
1330         } else if (mount(from, to, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL))
1331                 log_warning_errno(errno, "Failed to make boot id read-only: %m");
1332
1333         unlink(from);
1334         return r;
1335 }
1336
1337 static int copy_devnodes(const char *dest) {
1338
1339         static const char devnodes[] =
1340                 "null\0"
1341                 "zero\0"
1342                 "full\0"
1343                 "random\0"
1344                 "urandom\0"
1345                 "tty\0"
1346                 "net/tun\0";
1347
1348         const char *d;
1349         int r = 0;
1350         _cleanup_umask_ mode_t u;
1351
1352         assert(dest);
1353
1354         u = umask(0000);
1355
1356         NULSTR_FOREACH(d, devnodes) {
1357                 _cleanup_free_ char *from = NULL, *to = NULL;
1358                 struct stat st;
1359
1360                 from = strappend("/dev/", d);
1361                 to = strjoin(dest, "/dev/", d, NULL);
1362                 if (!from || !to)
1363                         return log_oom();
1364
1365                 if (stat(from, &st) < 0) {
1366
1367                         if (errno != ENOENT)
1368                                 return log_error_errno(errno, "Failed to stat %s: %m", from);
1369
1370                 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
1371
1372                         log_error("%s is not a char or block device, cannot copy", from);
1373                         return -EIO;
1374
1375                 } else {
1376                         r = mkdir_parents(to, 0775);
1377                         if (r < 0) {
1378                                 log_error_errno(r, "Failed to create parent directory of %s: %m", to);
1379                                 return -r;
1380                         }
1381
1382                         if (mknod(to, st.st_mode, st.st_rdev) < 0)
1383                                 return log_error_errno(errno, "mknod(%s) failed: %m", to);
1384                 }
1385         }
1386
1387         return r;
1388 }
1389
1390 static int setup_ptmx(const char *dest) {
1391         _cleanup_free_ char *p = NULL;
1392
1393         p = strappend(dest, "/dev/ptmx");
1394         if (!p)
1395                 return log_oom();
1396
1397         if (symlink("pts/ptmx", p) < 0)
1398                 return log_error_errno(errno, "Failed to create /dev/ptmx symlink: %m");
1399
1400         return 0;
1401 }
1402
1403 static int setup_dev_console(const char *dest, const char *console) {
1404         _cleanup_umask_ mode_t u;
1405         const char *to;
1406         struct stat st;
1407         int r;
1408
1409         assert(dest);
1410         assert(console);
1411
1412         u = umask(0000);
1413
1414         if (stat("/dev/null", &st) < 0)
1415                 return log_error_errno(errno, "Failed to stat /dev/null: %m");
1416
1417         r = chmod_and_chown(console, 0600, 0, 0);
1418         if (r < 0)
1419                 return log_error_errno(r, "Failed to correct access mode for TTY: %m");
1420
1421         /* We need to bind mount the right tty to /dev/console since
1422          * ptys can only exist on pts file systems. To have something
1423          * to bind mount things on we create a device node first, and
1424          * use /dev/null for that since we the cgroups device policy
1425          * allows us to create that freely, while we cannot create
1426          * /dev/console. (Note that the major minor doesn't actually
1427          * matter here, since we mount it over anyway). */
1428
1429         to = strappenda(dest, "/dev/console");
1430         if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0)
1431                 return log_error_errno(errno, "mknod() for /dev/console failed: %m");
1432
1433         if (mount(console, to, "bind", MS_BIND, NULL) < 0)
1434                 return log_error_errno(errno, "Bind mount for /dev/console failed: %m");
1435
1436         return 0;
1437 }
1438
1439 static int setup_kmsg(const char *dest, int kmsg_socket) {
1440         _cleanup_free_ char *from = NULL, *to = NULL;
1441         _cleanup_umask_ mode_t u;
1442         int r, fd, k;
1443         union {
1444                 struct cmsghdr cmsghdr;
1445                 uint8_t buf[CMSG_SPACE(sizeof(int))];
1446         } control = {};
1447         struct msghdr mh = {
1448                 .msg_control = &control,
1449                 .msg_controllen = sizeof(control),
1450         };
1451         struct cmsghdr *cmsg;
1452
1453         assert(dest);
1454         assert(kmsg_socket >= 0);
1455
1456         u = umask(0000);
1457
1458         /* We create the kmsg FIFO as /dev/kmsg, but immediately
1459          * delete it after bind mounting it to /proc/kmsg. While FIFOs
1460          * on the reading side behave very similar to /proc/kmsg,
1461          * their writing side behaves differently from /dev/kmsg in
1462          * that writing blocks when nothing is reading. In order to
1463          * avoid any problems with containers deadlocking due to this
1464          * we simply make /dev/kmsg unavailable to the container. */
1465         if (asprintf(&from, "%s/dev/kmsg", dest) < 0 ||
1466             asprintf(&to, "%s/proc/kmsg", dest) < 0)
1467                 return log_oom();
1468
1469         if (mkfifo(from, 0600) < 0)
1470                 return log_error_errno(errno, "mkfifo() for /dev/kmsg failed: %m");
1471
1472         r = chmod_and_chown(from, 0600, 0, 0);
1473         if (r < 0)
1474                 return log_error_errno(r, "Failed to correct access mode for /dev/kmsg: %m");
1475
1476         if (mount(from, to, "bind", MS_BIND, NULL) < 0)
1477                 return log_error_errno(errno, "Bind mount for /proc/kmsg failed: %m");
1478
1479         fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
1480         if (fd < 0)
1481                 return log_error_errno(errno, "Failed to open fifo: %m");
1482
1483         cmsg = CMSG_FIRSTHDR(&mh);
1484         cmsg->cmsg_level = SOL_SOCKET;
1485         cmsg->cmsg_type = SCM_RIGHTS;
1486         cmsg->cmsg_len = CMSG_LEN(sizeof(int));
1487         memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
1488
1489         mh.msg_controllen = cmsg->cmsg_len;
1490
1491         /* Store away the fd in the socket, so that it stays open as
1492          * long as we run the child */
1493         k = sendmsg(kmsg_socket, &mh, MSG_NOSIGNAL);
1494         safe_close(fd);
1495
1496         if (k < 0)
1497                 return log_error_errno(errno, "Failed to send FIFO fd: %m");
1498
1499         /* And now make the FIFO unavailable as /dev/kmsg... */
1500         unlink(from);
1501         return 0;
1502 }
1503
1504 static int send_rtnl(int send_fd) {
1505         union {
1506                 struct cmsghdr cmsghdr;
1507                 uint8_t buf[CMSG_SPACE(sizeof(int))];
1508         } control = {};
1509         struct msghdr mh = {
1510                 .msg_control = &control,
1511                 .msg_controllen = sizeof(control),
1512         };
1513         struct cmsghdr *cmsg;
1514         _cleanup_close_ int fd = -1;
1515         ssize_t k;
1516
1517         assert(send_fd >= 0);
1518
1519         if (!arg_expose_ports)
1520                 return 0;
1521
1522         fd = socket(PF_NETLINK, SOCK_RAW|SOCK_CLOEXEC|SOCK_NONBLOCK, NETLINK_ROUTE);
1523         if (fd < 0)
1524                 return log_error_errno(errno, "failed to allocate container netlink: %m");
1525
1526         cmsg = CMSG_FIRSTHDR(&mh);
1527         cmsg->cmsg_level = SOL_SOCKET;
1528         cmsg->cmsg_type = SCM_RIGHTS;
1529         cmsg->cmsg_len = CMSG_LEN(sizeof(int));
1530         memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
1531
1532         mh.msg_controllen = cmsg->cmsg_len;
1533
1534         /* Store away the fd in the socket, so that it stays open as
1535          * long as we run the child */
1536         k = sendmsg(send_fd, &mh, MSG_NOSIGNAL);
1537         if (k < 0)
1538                 return log_error_errno(errno, "Failed to send netlink fd: %m");
1539
1540         return 0;
1541 }
1542
1543 static int flush_ports(union in_addr_union *exposed) {
1544         ExposePort *p;
1545         int r, af = AF_INET;
1546
1547         assert(exposed);
1548
1549         if (!arg_expose_ports)
1550                 return 0;
1551
1552         if (in_addr_is_null(af, exposed))
1553                 return 0;
1554
1555         log_debug("Lost IP address.");
1556
1557         LIST_FOREACH(ports, p, arg_expose_ports) {
1558                 r = fw_add_local_dnat(false,
1559                                       af,
1560                                       p->protocol,
1561                                       NULL,
1562                                       NULL, 0,
1563                                       NULL, 0,
1564                                       p->host_port,
1565                                       exposed,
1566                                       p->container_port,
1567                                       NULL);
1568                 if (r < 0)
1569                         log_warning_errno(r, "Failed to modify firewall: %m");
1570         }
1571
1572         *exposed = IN_ADDR_NULL;
1573         return 0;
1574 }
1575
1576 static int expose_ports(sd_rtnl *rtnl, union in_addr_union *exposed) {
1577         _cleanup_free_ struct local_address *addresses = NULL;
1578         _cleanup_free_ char *pretty = NULL;
1579         union in_addr_union new_exposed;
1580         ExposePort *p;
1581         bool add;
1582         int af = AF_INET, r;
1583
1584         assert(exposed);
1585
1586         /* Invoked each time an address is added or removed inside the
1587          * container */
1588
1589         if (!arg_expose_ports)
1590                 return 0;
1591
1592         r = local_addresses(rtnl, 0, af, &addresses);
1593         if (r < 0)
1594                 return log_error_errno(r, "Failed to enumerate local addresses: %m");
1595
1596         add = r > 0 &&
1597                 addresses[0].family == af &&
1598                 addresses[0].scope < RT_SCOPE_LINK;
1599
1600         if (!add)
1601                 return flush_ports(exposed);
1602
1603         new_exposed = addresses[0].address;
1604         if (in_addr_equal(af, exposed, &new_exposed))
1605                 return 0;
1606
1607         in_addr_to_string(af, &new_exposed, &pretty);
1608         log_debug("New container IP is %s.", strna(pretty));
1609
1610         LIST_FOREACH(ports, p, arg_expose_ports) {
1611
1612                 r = fw_add_local_dnat(true,
1613                                       af,
1614                                       p->protocol,
1615                                       NULL,
1616                                       NULL, 0,
1617                                       NULL, 0,
1618                                       p->host_port,
1619                                       &new_exposed,
1620                                       p->container_port,
1621                                       in_addr_is_null(af, exposed) ? NULL : exposed);
1622                 if (r < 0)
1623                         log_warning_errno(r, "Failed to modify firewall: %m");
1624         }
1625
1626         *exposed = new_exposed;
1627         return 0;
1628 }
1629
1630 static int on_address_change(sd_rtnl *rtnl, sd_rtnl_message *m, void *userdata) {
1631         union in_addr_union *exposed = userdata;
1632
1633         assert(rtnl);
1634         assert(m);
1635         assert(exposed);
1636
1637         expose_ports(rtnl, exposed);
1638         return 0;
1639 }
1640
1641 static int watch_rtnl(sd_event *event, int recv_fd, union in_addr_union *exposed, sd_rtnl **ret) {
1642         union {
1643                 struct cmsghdr cmsghdr;
1644                 uint8_t buf[CMSG_SPACE(sizeof(int))];
1645         } control = {};
1646         struct msghdr mh = {
1647                 .msg_control = &control,
1648                 .msg_controllen = sizeof(control),
1649         };
1650         struct cmsghdr *cmsg;
1651         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1652         int fd, r;
1653         ssize_t k;
1654
1655         assert(event);
1656         assert(recv_fd >= 0);
1657         assert(ret);
1658
1659         if (!arg_expose_ports)
1660                 return 0;
1661
1662         k = recvmsg(recv_fd, &mh, MSG_NOSIGNAL);
1663         if (k < 0)
1664                 return log_error_errno(errno, "Failed to recv netlink fd: %m");
1665
1666         cmsg = CMSG_FIRSTHDR(&mh);
1667         assert(cmsg->cmsg_level == SOL_SOCKET);
1668         assert(cmsg->cmsg_type == SCM_RIGHTS);
1669         assert(cmsg->cmsg_len == CMSG_LEN(sizeof(int)));
1670         memcpy(&fd, CMSG_DATA(cmsg), sizeof(int));
1671
1672         r = sd_rtnl_open_fd(&rtnl, fd, 1, RTNLGRP_IPV4_IFADDR);
1673         if (r < 0) {
1674                 safe_close(fd);
1675                 return log_error_errno(r, "Failed to create rtnl object: %m");
1676         }
1677
1678         r = sd_rtnl_add_match(rtnl, RTM_NEWADDR, on_address_change, exposed);
1679         if (r < 0)
1680                 return log_error_errno(r, "Failed to subscribe to RTM_NEWADDR messages: %m");
1681
1682         r = sd_rtnl_add_match(rtnl, RTM_DELADDR, on_address_change, exposed);
1683         if (r < 0)
1684                 return log_error_errno(r, "Failed to subscribe to RTM_DELADDR messages: %m");
1685
1686         r = sd_rtnl_attach_event(rtnl, event, 0);
1687         if (r < 0)
1688                 return log_error_errno(r, "Failed to add to even loop: %m");
1689
1690         *ret = rtnl;
1691         rtnl = NULL;
1692
1693         return 0;
1694 }
1695
1696 static int setup_hostname(void) {
1697
1698         if (arg_share_system)
1699                 return 0;
1700
1701         if (sethostname_idempotent(arg_machine) < 0)
1702                 return -errno;
1703
1704         return 0;
1705 }
1706
1707 static int setup_journal(const char *directory) {
1708         sd_id128_t machine_id, this_id;
1709         _cleanup_free_ char *p = NULL, *b = NULL, *q = NULL, *d = NULL;
1710         char *id;
1711         int r;
1712
1713         /* Don't link journals in ephemeral mode */
1714         if (arg_ephemeral)
1715                 return 0;
1716
1717         p = strappend(directory, "/etc/machine-id");
1718         if (!p)
1719                 return log_oom();
1720
1721         r = read_one_line_file(p, &b);
1722         if (r == -ENOENT && arg_link_journal == LINK_AUTO)
1723                 return 0;
1724         else if (r < 0)
1725                 return log_error_errno(r, "Failed to read machine ID from %s: %m", p);
1726
1727         id = strstrip(b);
1728         if (isempty(id) && arg_link_journal == LINK_AUTO)
1729                 return 0;
1730
1731         /* Verify validity */
1732         r = sd_id128_from_string(id, &machine_id);
1733         if (r < 0)
1734                 return log_error_errno(r, "Failed to parse machine ID from %s: %m", p);
1735
1736         r = sd_id128_get_machine(&this_id);
1737         if (r < 0)
1738                 return log_error_errno(r, "Failed to retrieve machine ID: %m");
1739
1740         if (sd_id128_equal(machine_id, this_id)) {
1741                 log_full(arg_link_journal == LINK_AUTO ? LOG_WARNING : LOG_ERR,
1742                          "Host and machine ids are equal (%s): refusing to link journals", id);
1743                 if (arg_link_journal == LINK_AUTO)
1744                         return 0;
1745                 return -EEXIST;
1746         }
1747
1748         if (arg_link_journal == LINK_NO)
1749                 return 0;
1750
1751         free(p);
1752         p = strappend("/var/log/journal/", id);
1753         q = strjoin(directory, "/var/log/journal/", id, NULL);
1754         if (!p || !q)
1755                 return log_oom();
1756
1757         if (path_is_mount_point(p, false) > 0) {
1758                 if (arg_link_journal != LINK_AUTO) {
1759                         log_error("%s: already a mount point, refusing to use for journal", p);
1760                         return -EEXIST;
1761                 }
1762
1763                 return 0;
1764         }
1765
1766         if (path_is_mount_point(q, false) > 0) {
1767                 if (arg_link_journal != LINK_AUTO) {
1768                         log_error("%s: already a mount point, refusing to use for journal", q);
1769                         return -EEXIST;
1770                 }
1771
1772                 return 0;
1773         }
1774
1775         r = readlink_and_make_absolute(p, &d);
1776         if (r >= 0) {
1777                 if ((arg_link_journal == LINK_GUEST ||
1778                      arg_link_journal == LINK_AUTO) &&
1779                     path_equal(d, q)) {
1780
1781                         r = mkdir_p(q, 0755);
1782                         if (r < 0)
1783                                 log_warning_errno(errno, "Failed to create directory %s: %m", q);
1784                         return 0;
1785                 }
1786
1787                 if (unlink(p) < 0)
1788                         return log_error_errno(errno, "Failed to remove symlink %s: %m", p);
1789         } else if (r == -EINVAL) {
1790
1791                 if (arg_link_journal == LINK_GUEST &&
1792                     rmdir(p) < 0) {
1793
1794                         if (errno == ENOTDIR) {
1795                                 log_error("%s already exists and is neither a symlink nor a directory", p);
1796                                 return r;
1797                         } else {
1798                                 log_error_errno(errno, "Failed to remove %s: %m", p);
1799                                 return -errno;
1800                         }
1801                 }
1802         } else if (r != -ENOENT) {
1803                 log_error_errno(errno, "readlink(%s) failed: %m", p);
1804                 return r;
1805         }
1806
1807         if (arg_link_journal == LINK_GUEST) {
1808
1809                 if (symlink(q, p) < 0) {
1810                         if (arg_link_journal_try) {
1811                                 log_debug_errno(errno, "Failed to symlink %s to %s, skipping journal setup: %m", q, p);
1812                                 return 0;
1813                         } else {
1814                                 log_error_errno(errno, "Failed to symlink %s to %s: %m", q, p);
1815                                 return -errno;
1816                         }
1817                 }
1818
1819                 r = mkdir_p(q, 0755);
1820                 if (r < 0)
1821                         log_warning_errno(errno, "Failed to create directory %s: %m", q);
1822                 return 0;
1823         }
1824
1825         if (arg_link_journal == LINK_HOST) {
1826                 /* don't create parents here -- if the host doesn't have
1827                  * permanent journal set up, don't force it here */
1828                 r = mkdir(p, 0755);
1829                 if (r < 0) {
1830                         if (arg_link_journal_try) {
1831                                 log_debug_errno(errno, "Failed to create %s, skipping journal setup: %m", p);
1832                                 return 0;
1833                         } else {
1834                                 log_error_errno(errno, "Failed to create %s: %m", p);
1835                                 return r;
1836                         }
1837                 }
1838
1839         } else if (access(p, F_OK) < 0)
1840                 return 0;
1841
1842         if (dir_is_empty(q) == 0)
1843                 log_warning("%s is not empty, proceeding anyway.", q);
1844
1845         r = mkdir_p(q, 0755);
1846         if (r < 0) {
1847                 log_error_errno(errno, "Failed to create %s: %m", q);
1848                 return r;
1849         }
1850
1851         if (mount(p, q, "bind", MS_BIND, NULL) < 0)
1852                 return log_error_errno(errno, "Failed to bind mount journal from host into guest: %m");
1853
1854         return 0;
1855 }
1856
1857 static int drop_capabilities(void) {
1858         return capability_bounding_set_drop(~arg_retain, false);
1859 }
1860
1861 static int register_machine(pid_t pid, int local_ifindex) {
1862         _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1863         _cleanup_bus_close_unref_ sd_bus *bus = NULL;
1864         int r;
1865
1866         if (!arg_register)
1867                 return 0;
1868
1869         r = sd_bus_default_system(&bus);
1870         if (r < 0)
1871                 return log_error_errno(r, "Failed to open system bus: %m");
1872
1873         if (arg_keep_unit) {
1874                 r = sd_bus_call_method(
1875                                 bus,
1876                                 "org.freedesktop.machine1",
1877                                 "/org/freedesktop/machine1",
1878                                 "org.freedesktop.machine1.Manager",
1879                                 "RegisterMachineWithNetwork",
1880                                 &error,
1881                                 NULL,
1882                                 "sayssusai",
1883                                 arg_machine,
1884                                 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1885                                 "nspawn",
1886                                 "container",
1887                                 (uint32_t) pid,
1888                                 strempty(arg_directory),
1889                                 local_ifindex > 0 ? 1 : 0, local_ifindex);
1890         } else {
1891                 _cleanup_bus_message_unref_ sd_bus_message *m = NULL;
1892
1893                 r = sd_bus_message_new_method_call(
1894                                 bus,
1895                                 &m,
1896                                 "org.freedesktop.machine1",
1897                                 "/org/freedesktop/machine1",
1898                                 "org.freedesktop.machine1.Manager",
1899                                 "CreateMachineWithNetwork");
1900                 if (r < 0)
1901                         return log_error_errno(r, "Failed to create message: %m");
1902
1903                 r = sd_bus_message_append(
1904                                 m,
1905                                 "sayssusai",
1906                                 arg_machine,
1907                                 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1908                                 "nspawn",
1909                                 "container",
1910                                 (uint32_t) pid,
1911                                 strempty(arg_directory),
1912                                 local_ifindex > 0 ? 1 : 0, local_ifindex);
1913                 if (r < 0)
1914                         return log_error_errno(r, "Failed to append message arguments: %m");
1915
1916                 r = sd_bus_message_open_container(m, 'a', "(sv)");
1917                 if (r < 0)
1918                         return log_error_errno(r, "Failed to open container: %m");
1919
1920                 if (!isempty(arg_slice)) {
1921                         r = sd_bus_message_append(m, "(sv)", "Slice", "s", arg_slice);
1922                         if (r < 0)
1923                                 return log_error_errno(r, "Failed to append slice: %m");
1924                 }
1925
1926                 r = sd_bus_message_append(m, "(sv)", "DevicePolicy", "s", "strict");
1927                 if (r < 0)
1928                         return log_error_errno(r, "Failed to add device policy: %m");
1929
1930                 r = sd_bus_message_append(m, "(sv)", "DeviceAllow", "a(ss)", 9,
1931                                           /* Allow the container to
1932                                            * access and create the API
1933                                            * device nodes, so that
1934                                            * PrivateDevices= in the
1935                                            * container can work
1936                                            * fine */
1937                                           "/dev/null", "rwm",
1938                                           "/dev/zero", "rwm",
1939                                           "/dev/full", "rwm",
1940                                           "/dev/random", "rwm",
1941                                           "/dev/urandom", "rwm",
1942                                           "/dev/tty", "rwm",
1943                                           "/dev/net/tun", "rwm",
1944                                           /* Allow the container
1945                                            * access to ptys. However,
1946                                            * do not permit the
1947                                            * container to ever create
1948                                            * these device nodes. */
1949                                           "/dev/pts/ptmx", "rw",
1950                                           "char-pts", "rw");
1951                 if (r < 0)
1952                         return log_error_errno(r, "Failed to add device whitelist: %m");
1953
1954                 r = sd_bus_message_close_container(m);
1955                 if (r < 0)
1956                         return log_error_errno(r, "Failed to close container: %m");
1957
1958                 r = sd_bus_call(bus, m, 0, &error, NULL);
1959         }
1960
1961         if (r < 0) {
1962                 log_error("Failed to register machine: %s", bus_error_message(&error, r));
1963                 return r;
1964         }
1965
1966         return 0;
1967 }
1968
1969 static int terminate_machine(pid_t pid) {
1970         _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1971         _cleanup_bus_message_unref_ sd_bus_message *reply = NULL;
1972         _cleanup_bus_close_unref_ sd_bus *bus = NULL;
1973         const char *path;
1974         int r;
1975
1976         if (!arg_register)
1977                 return 0;
1978
1979         r = sd_bus_default_system(&bus);
1980         if (r < 0)
1981                 return log_error_errno(r, "Failed to open system bus: %m");
1982
1983         r = sd_bus_call_method(
1984                         bus,
1985                         "org.freedesktop.machine1",
1986                         "/org/freedesktop/machine1",
1987                         "org.freedesktop.machine1.Manager",
1988                         "GetMachineByPID",
1989                         &error,
1990                         &reply,
1991                         "u",
1992                         (uint32_t) pid);
1993         if (r < 0) {
1994                 /* Note that the machine might already have been
1995                  * cleaned up automatically, hence don't consider it a
1996                  * failure if we cannot get the machine object. */
1997                 log_debug("Failed to get machine: %s", bus_error_message(&error, r));
1998                 return 0;
1999         }
2000
2001         r = sd_bus_message_read(reply, "o", &path);
2002         if (r < 0)
2003                 return bus_log_parse_error(r);
2004
2005         r = sd_bus_call_method(
2006                         bus,
2007                         "org.freedesktop.machine1",
2008                         path,
2009                         "org.freedesktop.machine1.Machine",
2010                         "Terminate",
2011                         &error,
2012                         NULL,
2013                         NULL);
2014         if (r < 0) {
2015                 log_debug("Failed to terminate machine: %s", bus_error_message(&error, r));
2016                 return 0;
2017         }
2018
2019         return 0;
2020 }
2021
2022 static int reset_audit_loginuid(void) {
2023         _cleanup_free_ char *p = NULL;
2024         int r;
2025
2026         if (arg_share_system)
2027                 return 0;
2028
2029         r = read_one_line_file("/proc/self/loginuid", &p);
2030         if (r == -ENOENT)
2031                 return 0;
2032         if (r < 0)
2033                 return log_error_errno(r, "Failed to read /proc/self/loginuid: %m");
2034
2035         /* Already reset? */
2036         if (streq(p, "4294967295"))
2037                 return 0;
2038
2039         r = write_string_file("/proc/self/loginuid", "4294967295");
2040         if (r < 0) {
2041                 log_error("Failed to reset audit login UID. This probably means that your kernel is too\n"
2042                           "old and you have audit enabled. Note that the auditing subsystem is known to\n"
2043                           "be incompatible with containers on old kernels. Please make sure to upgrade\n"
2044                           "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
2045                           "using systemd-nspawn. Sleeping for 5s... (%s)\n", strerror(-r));
2046
2047                 sleep(5);
2048         }
2049
2050         return 0;
2051 }
2052
2053 #define HOST_HASH_KEY SD_ID128_MAKE(1a,37,6f,c7,46,ec,45,0b,ad,a3,d5,31,06,60,5d,b1)
2054 #define CONTAINER_HASH_KEY SD_ID128_MAKE(c3,c4,f9,19,b5,57,b2,1c,e6,cf,14,27,03,9c,ee,a2)
2055 #define MACVLAN_HASH_KEY SD_ID128_MAKE(00,13,6d,bc,66,83,44,81,bb,0c,f9,51,1f,24,a6,6f)
2056
2057 static int generate_mac(struct ether_addr *mac, sd_id128_t hash_key, uint64_t idx) {
2058         uint8_t result[8];
2059         size_t l, sz;
2060         uint8_t *v, *i;
2061         int r;
2062
2063         l = strlen(arg_machine);
2064         sz = sizeof(sd_id128_t) + l;
2065         if (idx > 0)
2066                 sz += sizeof(idx);
2067
2068         v = alloca(sz);
2069
2070         /* fetch some persistent data unique to the host */
2071         r = sd_id128_get_machine((sd_id128_t*) v);
2072         if (r < 0)
2073                 return r;
2074
2075         /* combine with some data unique (on this host) to this
2076          * container instance */
2077         i = mempcpy(v + sizeof(sd_id128_t), arg_machine, l);
2078         if (idx > 0) {
2079                 idx = htole64(idx);
2080                 memcpy(i, &idx, sizeof(idx));
2081         }
2082
2083         /* Let's hash the host machine ID plus the container name. We
2084          * use a fixed, but originally randomly created hash key here. */
2085         siphash24(result, v, sz, hash_key.bytes);
2086
2087         assert_cc(ETH_ALEN <= sizeof(result));
2088         memcpy(mac->ether_addr_octet, result, ETH_ALEN);
2089
2090         /* see eth_random_addr in the kernel */
2091         mac->ether_addr_octet[0] &= 0xfe;        /* clear multicast bit */
2092         mac->ether_addr_octet[0] |= 0x02;        /* set local assignment bit (IEEE802) */
2093
2094         return 0;
2095 }
2096
2097 static int setup_veth(pid_t pid, char iface_name[IFNAMSIZ], int *ifi) {
2098         _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
2099         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
2100         struct ether_addr mac_host, mac_container;
2101         int r, i;
2102
2103         if (!arg_private_network)
2104                 return 0;
2105
2106         if (!arg_network_veth)
2107                 return 0;
2108
2109         /* Use two different interface name prefixes depending whether
2110          * we are in bridge mode or not. */
2111         snprintf(iface_name, IFNAMSIZ - 1, "%s-%s",
2112                  arg_network_bridge ? "vb" : "ve", arg_machine);
2113
2114         r = generate_mac(&mac_container, CONTAINER_HASH_KEY, 0);
2115         if (r < 0)
2116                 return log_error_errno(r, "Failed to generate predictable MAC address for container side: %m");
2117
2118         r = generate_mac(&mac_host, HOST_HASH_KEY, 0);
2119         if (r < 0)
2120                 return log_error_errno(r, "Failed to generate predictable MAC address for host side: %m");
2121
2122         r = sd_rtnl_open(&rtnl, 0);
2123         if (r < 0)
2124                 return log_error_errno(r, "Failed to connect to netlink: %m");
2125
2126         r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
2127         if (r < 0)
2128                 return log_error_errno(r, "Failed to allocate netlink message: %m");
2129
2130         r = sd_rtnl_message_append_string(m, IFLA_IFNAME, iface_name);
2131         if (r < 0)
2132                 return log_error_errno(r, "Failed to add netlink interface name: %m");
2133
2134         r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac_host);
2135         if (r < 0)
2136                 return log_error_errno(r, "Failed to add netlink MAC address: %m");
2137
2138         r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
2139         if (r < 0)
2140                 return log_error_errno(r, "Failed to open netlink container: %m");
2141
2142         r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "veth");
2143         if (r < 0)
2144                 return log_error_errno(r, "Failed to open netlink container: %m");
2145
2146         r = sd_rtnl_message_open_container(m, VETH_INFO_PEER);
2147         if (r < 0)
2148                 return log_error_errno(r, "Failed to open netlink container: %m");
2149
2150         r = sd_rtnl_message_append_string(m, IFLA_IFNAME, "host0");
2151         if (r < 0)
2152                 return log_error_errno(r, "Failed to add netlink interface name: %m");
2153
2154         r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac_container);
2155         if (r < 0)
2156                 return log_error_errno(r, "Failed to add netlink MAC address: %m");
2157
2158         r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
2159         if (r < 0)
2160                 return log_error_errno(r, "Failed to add netlink namespace field: %m");
2161
2162         r = sd_rtnl_message_close_container(m);
2163         if (r < 0)
2164                 return log_error_errno(r, "Failed to close netlink container: %m");
2165
2166         r = sd_rtnl_message_close_container(m);
2167         if (r < 0)
2168                 return log_error_errno(r, "Failed to close netlink container: %m");
2169
2170         r = sd_rtnl_message_close_container(m);
2171         if (r < 0)
2172                 return log_error_errno(r, "Failed to close netlink container: %m");
2173
2174         r = sd_rtnl_call(rtnl, m, 0, NULL);
2175         if (r < 0)
2176                 return log_error_errno(r, "Failed to add new veth interfaces: %m");
2177
2178         i = (int) if_nametoindex(iface_name);
2179         if (i <= 0)
2180                 return log_error_errno(errno, "Failed to resolve interface %s: %m", iface_name);
2181
2182         *ifi = i;
2183
2184         return 0;
2185 }
2186
2187 static int setup_bridge(const char veth_name[], int *ifi) {
2188         _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
2189         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
2190         int r, bridge;
2191
2192         if (!arg_private_network)
2193                 return 0;
2194
2195         if (!arg_network_veth)
2196                 return 0;
2197
2198         if (!arg_network_bridge)
2199                 return 0;
2200
2201         bridge = (int) if_nametoindex(arg_network_bridge);
2202         if (bridge <= 0)
2203                 return log_error_errno(errno, "Failed to resolve interface %s: %m", arg_network_bridge);
2204
2205         *ifi = bridge;
2206
2207         r = sd_rtnl_open(&rtnl, 0);
2208         if (r < 0)
2209                 return log_error_errno(r, "Failed to connect to netlink: %m");
2210
2211         r = sd_rtnl_message_new_link(rtnl, &m, RTM_SETLINK, 0);
2212         if (r < 0)
2213                 return log_error_errno(r, "Failed to allocate netlink message: %m");
2214
2215         r = sd_rtnl_message_link_set_flags(m, IFF_UP, IFF_UP);
2216         if (r < 0)
2217                 return log_error_errno(r, "Failed to set IFF_UP flag: %m");
2218
2219         r = sd_rtnl_message_append_string(m, IFLA_IFNAME, veth_name);
2220         if (r < 0)
2221                 return log_error_errno(r, "Failed to add netlink interface name field: %m");
2222
2223         r = sd_rtnl_message_append_u32(m, IFLA_MASTER, bridge);
2224         if (r < 0)
2225                 return log_error_errno(r, "Failed to add netlink master field: %m");
2226
2227         r = sd_rtnl_call(rtnl, m, 0, NULL);
2228         if (r < 0)
2229                 return log_error_errno(r, "Failed to add veth interface to bridge: %m");
2230
2231         return 0;
2232 }
2233
2234 static int parse_interface(struct udev *udev, const char *name) {
2235         _cleanup_udev_device_unref_ struct udev_device *d = NULL;
2236         char ifi_str[2 + DECIMAL_STR_MAX(int)];
2237         int ifi;
2238
2239         ifi = (int) if_nametoindex(name);
2240         if (ifi <= 0)
2241                 return log_error_errno(errno, "Failed to resolve interface %s: %m", name);
2242
2243         sprintf(ifi_str, "n%i", ifi);
2244         d = udev_device_new_from_device_id(udev, ifi_str);
2245         if (!d)
2246                 return log_error_errno(errno, "Failed to get udev device for interface %s: %m", name);
2247
2248         if (udev_device_get_is_initialized(d) <= 0) {
2249                 log_error("Network interface %s is not initialized yet.", name);
2250                 return -EBUSY;
2251         }
2252
2253         return ifi;
2254 }
2255
2256 static int move_network_interfaces(pid_t pid) {
2257         _cleanup_udev_unref_ struct udev *udev = NULL;
2258         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
2259         char **i;
2260         int r;
2261
2262         if (!arg_private_network)
2263                 return 0;
2264
2265         if (strv_isempty(arg_network_interfaces))
2266                 return 0;
2267
2268         r = sd_rtnl_open(&rtnl, 0);
2269         if (r < 0)
2270                 return log_error_errno(r, "Failed to connect to netlink: %m");
2271
2272         udev = udev_new();
2273         if (!udev) {
2274                 log_error("Failed to connect to udev.");
2275                 return -ENOMEM;
2276         }
2277
2278         STRV_FOREACH(i, arg_network_interfaces) {
2279                 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
2280                 int ifi;
2281
2282                 ifi = parse_interface(udev, *i);
2283                 if (ifi < 0)
2284                         return ifi;
2285
2286                 r = sd_rtnl_message_new_link(rtnl, &m, RTM_SETLINK, ifi);
2287                 if (r < 0)
2288                         return log_error_errno(r, "Failed to allocate netlink message: %m");
2289
2290                 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
2291                 if (r < 0)
2292                         return log_error_errno(r, "Failed to append namespace PID to netlink message: %m");
2293
2294                 r = sd_rtnl_call(rtnl, m, 0, NULL);
2295                 if (r < 0)
2296                         return log_error_errno(r, "Failed to move interface %s to namespace: %m", *i);
2297         }
2298
2299         return 0;
2300 }
2301
2302 static int setup_macvlan(pid_t pid) {
2303         _cleanup_udev_unref_ struct udev *udev = NULL;
2304         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
2305         unsigned idx = 0;
2306         char **i;
2307         int r;
2308
2309         if (!arg_private_network)
2310                 return 0;
2311
2312         if (strv_isempty(arg_network_macvlan))
2313                 return 0;
2314
2315         r = sd_rtnl_open(&rtnl, 0);
2316         if (r < 0)
2317                 return log_error_errno(r, "Failed to connect to netlink: %m");
2318
2319         udev = udev_new();
2320         if (!udev) {
2321                 log_error("Failed to connect to udev.");
2322                 return -ENOMEM;
2323         }
2324
2325         STRV_FOREACH(i, arg_network_macvlan) {
2326                 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
2327                 _cleanup_free_ char *n = NULL;
2328                 struct ether_addr mac;
2329                 int ifi;
2330
2331                 ifi = parse_interface(udev, *i);
2332                 if (ifi < 0)
2333                         return ifi;
2334
2335                 r = generate_mac(&mac, MACVLAN_HASH_KEY, idx++);
2336                 if (r < 0)
2337                         return log_error_errno(r, "Failed to create MACVLAN MAC address: %m");
2338
2339                 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
2340                 if (r < 0)
2341                         return log_error_errno(r, "Failed to allocate netlink message: %m");
2342
2343                 r = sd_rtnl_message_append_u32(m, IFLA_LINK, ifi);
2344                 if (r < 0)
2345                         return log_error_errno(r, "Failed to add netlink interface index: %m");
2346
2347                 n = strappend("mv-", *i);
2348                 if (!n)
2349                         return log_oom();
2350
2351                 strshorten(n, IFNAMSIZ-1);
2352
2353                 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, n);
2354                 if (r < 0)
2355                         return log_error_errno(r, "Failed to add netlink interface name: %m");
2356
2357                 r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac);
2358                 if (r < 0)
2359                         return log_error_errno(r, "Failed to add netlink MAC address: %m");
2360
2361                 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
2362                 if (r < 0)
2363                         return log_error_errno(r, "Failed to add netlink namespace field: %m");
2364
2365                 r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
2366                 if (r < 0)
2367                         return log_error_errno(r, "Failed to open netlink container: %m");
2368
2369                 r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "macvlan");
2370                 if (r < 0)
2371                         return log_error_errno(r, "Failed to open netlink container: %m");
2372
2373                 r = sd_rtnl_message_append_u32(m, IFLA_MACVLAN_MODE, MACVLAN_MODE_BRIDGE);
2374                 if (r < 0)
2375                         return log_error_errno(r, "Failed to append macvlan mode: %m");
2376
2377                 r = sd_rtnl_message_close_container(m);
2378                 if (r < 0)
2379                         return log_error_errno(r, "Failed to close netlink container: %m");
2380
2381                 r = sd_rtnl_message_close_container(m);
2382                 if (r < 0)
2383                         return log_error_errno(r, "Failed to close netlink container: %m");
2384
2385                 r = sd_rtnl_call(rtnl, m, 0, NULL);
2386                 if (r < 0)
2387                         return log_error_errno(r, "Failed to add new macvlan interfaces: %m");
2388         }
2389
2390         return 0;
2391 }
2392
2393 static int setup_ipvlan(pid_t pid) {
2394         _cleanup_udev_unref_ struct udev *udev = NULL;
2395         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
2396         char **i;
2397         int r;
2398
2399         if (!arg_private_network)
2400                 return 0;
2401
2402         if (strv_isempty(arg_network_ipvlan))
2403                 return 0;
2404
2405         r = sd_rtnl_open(&rtnl, 0);
2406         if (r < 0)
2407                 return log_error_errno(r, "Failed to connect to netlink: %m");
2408
2409         udev = udev_new();
2410         if (!udev) {
2411                 log_error("Failed to connect to udev.");
2412                 return -ENOMEM;
2413         }
2414
2415         STRV_FOREACH(i, arg_network_ipvlan) {
2416                 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
2417                 _cleanup_free_ char *n = NULL;
2418                 int ifi;
2419
2420                 ifi = parse_interface(udev, *i);
2421                 if (ifi < 0)
2422                         return ifi;
2423
2424                 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
2425                 if (r < 0)
2426                         return log_error_errno(r, "Failed to allocate netlink message: %m");
2427
2428                 r = sd_rtnl_message_append_u32(m, IFLA_LINK, ifi);
2429                 if (r < 0)
2430                         return log_error_errno(r, "Failed to add netlink interface index: %m");
2431
2432                 n = strappend("iv-", *i);
2433                 if (!n)
2434                         return log_oom();
2435
2436                 strshorten(n, IFNAMSIZ-1);
2437
2438                 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, n);
2439                 if (r < 0)
2440                         return log_error_errno(r, "Failed to add netlink interface name: %m");
2441
2442                 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
2443                 if (r < 0)
2444                         return log_error_errno(r, "Failed to add netlink namespace field: %m");
2445
2446                 r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
2447                 if (r < 0)
2448                         return log_error_errno(r, "Failed to open netlink container: %m");
2449
2450                 r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "ipvlan");
2451                 if (r < 0)
2452                         return log_error_errno(r, "Failed to open netlink container: %m");
2453
2454                 r = sd_rtnl_message_append_u16(m, IFLA_IPVLAN_MODE, IPVLAN_MODE_L2);
2455                 if (r < 0)
2456                         return log_error_errno(r, "Failed to add ipvlan mode: %m");
2457
2458                 r = sd_rtnl_message_close_container(m);
2459                 if (r < 0)
2460                         return log_error_errno(r, "Failed to close netlink container: %m");
2461
2462                 r = sd_rtnl_message_close_container(m);
2463                 if (r < 0)
2464                         return log_error_errno(r, "Failed to close netlink container: %m");
2465
2466                 r = sd_rtnl_call(rtnl, m, 0, NULL);
2467                 if (r < 0)
2468                         return log_error_errno(r, "Failed to add new ipvlan interfaces: %m");
2469         }
2470
2471         return 0;
2472 }
2473
/* Installs the container's seccomp filter: a default-allow policy that makes a
 * fixed set of syscalls fail with EPERM and makes creation of NETLINK_AUDIT
 * sockets fail with EAFNOSUPPORT.
 *
 * When built without HAVE_SECCOMP this is a no-op returning 0. Otherwise it
 * returns the result of the last libseccomp call (negative on failure, after
 * logging), or the log_oom() result if the filter context cannot be
 * allocated. */
static int setup_seccomp(void) {

#ifdef HAVE_SECCOMP
        static const int blocked_syscalls[] = {
                SCMP_SYS(kexec_load),
                SCMP_SYS(open_by_handle_at),
                SCMP_SYS(init_module),
                SCMP_SYS(finit_module),
                SCMP_SYS(delete_module),
                SCMP_SYS(iopl),
                SCMP_SYS(ioperm),
                SCMP_SYS(swapon),
                SCMP_SYS(swapoff),
        };

        scmp_filter_ctx ctx;
        unsigned k;
        int r;

        ctx = seccomp_init(SCMP_ACT_ALLOW);
        if (!ctx)
                return log_oom();

        r = seccomp_add_secondary_archs(ctx);
        if (r < 0) {
                log_error_errno(r, "Failed to add secondary archs to seccomp filter: %m");
                goto release;
        }

        for (k = 0; k < ELEMENTSOF(blocked_syscalls); k++) {
                r = seccomp_rule_add(ctx, SCMP_ACT_ERRNO(EPERM), blocked_syscalls[k], 0);

                /* -EFAULT means the syscall is unknown; such entries are
                 * skipped, every other failure is fatal. */
                if (r < 0 && r != -EFAULT) {
                        log_error_errno(r, "Failed to block syscall: %m");
                        goto release;
                }
        }

        /* Audit does not work inside containers, and much of the userspace
         * audit hookup fails when run there. Rather than caring, simply
         * refuse creation of audit sockets: socket(AF_NETLINK, *,
         * NETLINK_AUDIT) then fails with EAFNOSUPPORT, which audit userspace
         * takes as the sign that audit is disabled in the kernel. */
        r = seccomp_rule_add(
                        ctx,
                        SCMP_ACT_ERRNO(EAFNOSUPPORT),
                        SCMP_SYS(socket),
                        2,
                        SCMP_A0(SCMP_CMP_EQ, AF_NETLINK),
                        SCMP_A2(SCMP_CMP_EQ, NETLINK_AUDIT));
        if (r < 0) {
                log_error_errno(r, "Failed to add audit seccomp rule: %m");
                goto release;
        }

        /* Clear the filter's SCMP_FLTATR_CTL_NNP attribute before loading */
        r = seccomp_attr_set(ctx, SCMP_FLTATR_CTL_NNP, 0);
        if (r < 0) {
                log_error_errno(r, "Failed to unset NO_NEW_PRIVS: %m");
                goto release;
        }

        r = seccomp_load(ctx);
        if (r < 0)
                log_error_errno(r, "Failed to install seccomp audit filter: %m");

release:
        seccomp_release(ctx);
        return r;
#else
        return 0;
#endif

}
2553
2554 static int setup_propagate(const char *root) {
2555         const char *p, *q;
2556
2557         (void) mkdir_p("/run/systemd/nspawn/", 0755);
2558         (void) mkdir_p("/run/systemd/nspawn/propagate", 0600);
2559         p = strappenda("/run/systemd/nspawn/propagate/", arg_machine);
2560         (void) mkdir_p(p, 0600);
2561
2562         q = strappenda(root, "/run/systemd/nspawn/incoming");
2563         mkdir_parents(q, 0755);
2564         mkdir_p(q, 0600);
2565
2566         if (mount(p, q, NULL, MS_BIND, NULL) < 0)
2567                 return log_error_errno(errno, "Failed to install propagation bind mount.");
2568
2569         if (mount(NULL, q, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY, NULL) < 0)
2570                 return log_error_errno(errno, "Failed to make propagation mount read-only");
2571
2572         return 0;
2573 }
2574
2575 static int setup_image(char **device_path, int *loop_nr) {
2576         struct loop_info64 info = {
2577                 .lo_flags = LO_FLAGS_AUTOCLEAR|LO_FLAGS_PARTSCAN
2578         };
2579         _cleanup_close_ int fd = -1, control = -1, loop = -1;
2580         _cleanup_free_ char* loopdev = NULL;
2581         struct stat st;
2582         int r, nr;
2583
2584         assert(device_path);
2585         assert(loop_nr);
2586         assert(arg_image);
2587
2588         fd = open(arg_image, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
2589         if (fd < 0)
2590                 return log_error_errno(errno, "Failed to open %s: %m", arg_image);
2591
2592         if (fstat(fd, &st) < 0)
2593                 return log_error_errno(errno, "Failed to stat %s: %m", arg_image);
2594
2595         if (S_ISBLK(st.st_mode)) {
2596                 char *p;
2597
2598                 p = strdup(arg_image);
2599                 if (!p)
2600                         return log_oom();
2601
2602                 *device_path = p;
2603
2604                 *loop_nr = -1;
2605
2606                 r = fd;
2607                 fd = -1;
2608
2609                 return r;
2610         }
2611
2612         if (!S_ISREG(st.st_mode)) {
2613                 log_error_errno(errno, "%s is not a regular file or block device: %m", arg_image);
2614                 return -EINVAL;
2615         }
2616
2617         control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
2618         if (control < 0)
2619                 return log_error_errno(errno, "Failed to open /dev/loop-control: %m");
2620
2621         nr = ioctl(control, LOOP_CTL_GET_FREE);
2622         if (nr < 0)
2623                 return log_error_errno(errno, "Failed to allocate loop device: %m");
2624
2625         if (asprintf(&loopdev, "/dev/loop%i", nr) < 0)
2626                 return log_oom();
2627
2628         loop = open(loopdev, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
2629         if (loop < 0)
2630                 return log_error_errno(errno, "Failed to open loop device %s: %m", loopdev);
2631
2632         if (ioctl(loop, LOOP_SET_FD, fd) < 0)
2633                 return log_error_errno(errno, "Failed to set loopback file descriptor on %s: %m", loopdev);
2634
2635         if (arg_read_only)
2636                 info.lo_flags |= LO_FLAGS_READ_ONLY;
2637
2638         if (ioctl(loop, LOOP_SET_STATUS64, &info) < 0)
2639                 return log_error_errno(errno, "Failed to set loopback settings on %s: %m", loopdev);
2640
2641         *device_path = loopdev;
2642         loopdev = NULL;
2643
2644         *loop_nr = nr;
2645
2646         r = loop;
2647         loop = -1;
2648
2649         return r;
2650 }
2651
/* Explanatory text appended to the error messages about disk images whose
 * partition table cannot be used. */
#define PARTITION_TABLE_BLURB \
        "Note that the disk image needs to either contain only a single MBR partition of\n" \
        "type 0x83 that is marked bootable, or a single GPT partition of type " \
        "0FC63DAF-8483-4772-8E79-3D69D8477DE4 or follow\n" \
        "    http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/\n" \
        "to be bootable with systemd-nspawn."
2658
/* Inspects the partition table of the block device open at 'fd' and selects
 * the partitions to use as root, home and srv file systems.
 *
 * Selection rules, as implemented below:
 *   - GPT: partitions carrying GPT_FLAG_NO_AUTO are ignored. Partitions whose
 *     type matches GPT_HOME, GPT_SRV, GPT_ROOT_NATIVE or GPT_ROOT_SECONDARY
 *     (the latter two only if defined at build time) are picked; if several
 *     of the same type exist, the one with the lowest partition number wins.
 *     GPT_LINUX_GENERIC partitions are remembered as fallback root.
 *   - MBR: partitions of type 0x83 whose flags equal 0x80 (bootable) are
 *     remembered as fallback root.
 *   - Root preference: native root, then secondary root, then the generic
 *     fallback. The fallback is only accepted if exactly one candidate exists.
 *
 * On success returns 0 and stores:
 *   *root_device / *root_device_rw   always,
 *   *home_device / *home_device_rw   only if a home partition was found,
 *   *srv_device  / *srv_device_rw    only if a srv partition was found,
 *   *secondary                       true iff the secondary root was chosen.
 * The device strings are newly allocated and become the caller's property.
 *
 * On failure an error is logged and a negative error value is returned.
 * Without HAVE_BLKID the function always fails with -ENOTSUP. */
static int dissect_image(
                int fd,
                char **root_device, bool *root_device_rw,
                char **home_device, bool *home_device_rw,
                char **srv_device, bool *srv_device_rw,
                bool *secondary) {

#ifdef HAVE_BLKID
        int home_nr = -1, srv_nr = -1;
#ifdef GPT_ROOT_NATIVE
        int root_nr = -1;
#endif
#ifdef GPT_ROOT_SECONDARY
        int secondary_root_nr = -1;
#endif
        _cleanup_free_ char *home = NULL, *root = NULL, *secondary_root = NULL, *srv = NULL, *generic = NULL;
        _cleanup_udev_enumerate_unref_ struct udev_enumerate *e = NULL;
        _cleanup_udev_device_unref_ struct udev_device *d = NULL;
        _cleanup_blkid_free_probe_ blkid_probe b = NULL;
        _cleanup_udev_unref_ struct udev *udev = NULL;
        struct udev_list_entry *first, *item;
        bool home_rw = true, root_rw = true, secondary_root_rw = true, srv_rw = true, generic_rw = true;
        bool is_gpt, is_mbr, multiple_generic = false;
        const char *pttype = NULL;
        blkid_partlist pl;
        struct stat st;
        unsigned i;
        int r;

        assert(fd >= 0);
        assert(root_device);
        assert(home_device);
        assert(srv_device);
        assert(secondary);
        assert(arg_image);

        b = blkid_new_probe();
        if (!b)
                return log_oom();

        /* errno is cleared before the blkid calls so that "failed without
         * setting errno" can be told apart afterwards. */
        errno = 0;
        r = blkid_probe_set_device(b, fd, 0, 0);
        if (r != 0) {
                if (errno == 0)
                        return log_oom();

                log_error_errno(errno, "Failed to set device on blkid probe: %m");
                return -errno;
        }

        blkid_probe_enable_partitions(b, 1);
        blkid_probe_set_partitions_flags(b, BLKID_PARTS_ENTRY_DETAILS);

        errno = 0;
        r = blkid_do_safeprobe(b);
        if (r == -2 || r == 1) {
                log_error("Failed to identify any partition table on\n"
                          "    %s\n"
                          PARTITION_TABLE_BLURB, arg_image);
                return -EINVAL;
        } else if (r != 0) {
                if (errno == 0)
                        errno = EIO;
                log_error_errno(errno, "Failed to probe: %m");
                return -errno;
        }

        blkid_probe_lookup_value(b, "PTTYPE", &pttype, NULL);

        is_gpt = streq_ptr(pttype, "gpt");
        is_mbr = streq_ptr(pttype, "dos");

        if (!is_gpt && !is_mbr) {
                log_error("No GPT or MBR partition table discovered on\n"
                          "    %s\n"
                          PARTITION_TABLE_BLURB, arg_image);
                return -EINVAL;
        }

        errno = 0;
        pl = blkid_probe_get_partitions(b);
        if (!pl) {
                if (errno == 0)
                        return log_oom();

                log_error("Failed to list partitions of %s", arg_image);
                return -errno;
        }

        udev = udev_new();
        if (!udev)
                return log_oom();

        if (fstat(fd, &st) < 0)
                return log_error_errno(errno, "Failed to stat block device: %m");

        d = udev_device_new_from_devnum(udev, 'b', st.st_rdev);
        if (!d)
                return log_oom();

        /* Wait until the kernel's view of the partitions matches what blkid
         * found, retrying the enumeration up to 10 times. */
        for (i = 0;; i++) {
                int n, m;

                if (i >= 10) {
                        log_error("Kernel partitions never appeared.");
                        return -ENXIO;
                }

                e = udev_enumerate_new(udev);
                if (!e)
                        return log_oom();

                r = udev_enumerate_add_match_parent(e, d);
                if (r < 0)
                        return log_oom();

                r = udev_enumerate_scan_devices(e);
                if (r < 0)
                        return log_error_errno(r, "Failed to scan for partition devices of %s: %m", arg_image);

                /* Count the partitions enumerated by the kernel */
                n = 0;
                first = udev_enumerate_get_list_entry(e);
                udev_list_entry_foreach(item, first)
                        n++;

                /* Count the partitions enumerated by blkid. The udev list is
                 * expected to hold one entry more than blkid reports --
                 * presumably the whole-disk device itself, which the loop
                 * further down skips explicitly. */
                m = blkid_partlist_numof_partitions(pl);
                if (n == m + 1)
                        break;
                if (n > m + 1) {
                        log_error("blkid and kernel partition list do not match.");
                        return -EIO;
                }
                if (n < m + 1) {
                        unsigned j;

                        /* The kernel has probed fewer partitions than
                         * blkid? Maybe the kernel prober is still
                         * running or it got EBUSY because udev
                         * already opened the device. Let's reprobe
                         * the device, which is a synchronous call
                         * that waits until probing is complete. */

                        for (j = 0; j < 20; j++) {

                                r = ioctl(fd, BLKRRPART, 0);
                                if (r < 0)
                                        r = -errno;
                                if (r >= 0 || r != -EBUSY)
                                        break;

                                /* If something else has the device
                                 * open, such as an udev rule, the
                                 * ioctl will return EBUSY. Since
                                 * there's no way to wait until it
                                 * isn't busy anymore, let's just wait
                                 * a bit, and try again.
                                 *
                                 * This is really something they
                                 * should fix in the kernel! */

                                usleep(50 * USEC_PER_MSEC);
                        }

                        if (r < 0)
                                return log_error_errno(r, "Failed to reread partition table: %m");
                }

                /* Drop this enumeration; the next iteration creates a fresh one */
                e = udev_enumerate_unref(e);
        }

        /* Walk the devices found by the final enumeration and classify each partition */
        first = udev_enumerate_get_list_entry(e);
        udev_list_entry_foreach(item, first) {
                _cleanup_udev_device_unref_ struct udev_device *q;
                const char *node;
                unsigned long long flags;
                blkid_partition pp;
                dev_t qn;
                int nr;

                errno = 0;
                q = udev_device_new_from_syspath(udev, udev_list_entry_get_name(item));
                if (!q) {
                        if (!errno)
                                errno = ENOMEM;

                        log_error_errno(errno, "Failed to get partition device of %s: %m", arg_image);
                        return -errno;
                }

                qn = udev_device_get_devnum(q);
                if (major(qn) == 0)
                        continue;

                /* Skip the whole-disk device itself */
                if (st.st_rdev == qn)
                        continue;

                node = udev_device_get_devnode(q);
                if (!node)
                        continue;

                pp = blkid_partlist_devno_to_partition(pl, qn);
                if (!pp)
                        continue;

                flags = blkid_partition_get_flags(pp);

                nr = blkid_partition_get_partno(pp);
                if (nr < 0)
                        continue;

                if (is_gpt) {
                        sd_id128_t type_id;
                        const char *stype;

                        if (flags & GPT_FLAG_NO_AUTO)
                                continue;

                        stype = blkid_partition_get_type_string(pp);
                        if (!stype)
                                continue;

                        if (sd_id128_from_string(stype, &type_id) < 0)
                                continue;

                        /* For each specific type, keep the partition with the lowest number */
                        if (sd_id128_equal(type_id, GPT_HOME)) {

                                if (home && nr >= home_nr)
                                        continue;

                                home_nr = nr;
                                home_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&home, node);
                                if (r < 0)
                                        return log_oom();

                        } else if (sd_id128_equal(type_id, GPT_SRV)) {

                                if (srv && nr >= srv_nr)
                                        continue;

                                srv_nr = nr;
                                srv_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&srv, node);
                                if (r < 0)
                                        return log_oom();
                        }
#ifdef GPT_ROOT_NATIVE
                        else if (sd_id128_equal(type_id, GPT_ROOT_NATIVE)) {

                                if (root && nr >= root_nr)
                                        continue;

                                root_nr = nr;
                                root_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&root, node);
                                if (r < 0)
                                        return log_oom();
                        }
#endif
#ifdef GPT_ROOT_SECONDARY
                        else if (sd_id128_equal(type_id, GPT_ROOT_SECONDARY)) {

                                if (secondary_root && nr >= secondary_root_nr)
                                        continue;

                                secondary_root_nr = nr;
                                secondary_root_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&secondary_root, node);
                                if (r < 0)
                                        return log_oom();
                        }
#endif
                        else if (sd_id128_equal(type_id, GPT_LINUX_GENERIC)) {

                                /* Only the first generic partition is
                                 * remembered; any further one merely marks
                                 * the fallback as ambiguous. */
                                if (generic)
                                        multiple_generic = true;
                                else {
                                        generic_rw = !(flags & GPT_FLAG_READ_ONLY);

                                        r = free_and_strdup(&generic, node);
                                        if (r < 0)
                                                return log_oom();
                                }
                        }

                } else if (is_mbr) {
                        int type;

                        if (flags != 0x80) /* Bootable flag */
                                continue;

                        type = blkid_partition_get_type(pp);
                        if (type != 0x83) /* Linux partition */
                                continue;

                        /* Record the candidate in 'generic', the variable
                         * tested right here, so that a second bootable Linux
                         * partition is detected as ambiguous instead of
                         * silently replacing the first one. */
                        if (generic)
                                multiple_generic = true;
                        else {
                                generic_rw = true;

                                r = free_and_strdup(&generic, node);
                                if (r < 0)
                                        return log_oom();
                        }
                }
        }

        /* Pick the root device and pass ownership of the string to the
         * caller; the local pointer is reset so the cleanup handler does not
         * free it. */
        if (root) {
                *root_device = root;
                root = NULL;

                *root_device_rw = root_rw;
                *secondary = false;
        } else if (secondary_root) {
                *root_device = secondary_root;
                secondary_root = NULL;

                *root_device_rw = secondary_root_rw;
                *secondary = true;
        } else if (generic) {

                /* There were no partitions with precise meanings
                 * around, but we found generic partitions. In this
                 * case, if there's only one, we can go ahead and boot
                 * it, otherwise we bail out, because we really cannot
                 * make any sense of it. */

                if (multiple_generic) {
                        log_error("Identified multiple bootable Linux partitions on\n"
                                  "    %s\n"
                                  PARTITION_TABLE_BLURB, arg_image);
                        return -EINVAL;
                }

                *root_device = generic;
                generic = NULL;

                *root_device_rw = generic_rw;
                *secondary = false;
        } else {
                log_error("Failed to identify root partition in disk image\n"
                          "    %s\n"
                          PARTITION_TABLE_BLURB, arg_image);
                return -EINVAL;
        }

        if (home) {
                *home_device = home;
                home = NULL;

                *home_device_rw = home_rw;
        }

        if (srv) {
                *srv_device = srv;
                srv = NULL;

                *srv_device_rw = srv_rw;
        }

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -ENOTSUP;
#endif
}
3031
3032 static int mount_device(const char *what, const char *where, const char *directory, bool rw) {
3033 #ifdef HAVE_BLKID
3034         _cleanup_blkid_free_probe_ blkid_probe b = NULL;
3035         const char *fstype, *p;
3036         int r;
3037
3038         assert(what);
3039         assert(where);
3040
3041         if (arg_read_only)
3042                 rw = false;
3043
3044         if (directory)
3045                 p = strappenda(where, directory);
3046         else
3047                 p = where;
3048
3049         errno = 0;
3050         b = blkid_new_probe_from_filename(what);
3051         if (!b) {
3052                 if (errno == 0)
3053                         return log_oom();
3054                 log_error_errno(errno, "Failed to allocate prober for %s: %m", what);
3055                 return -errno;
3056         }
3057
3058         blkid_probe_enable_superblocks(b, 1);
3059         blkid_probe_set_superblocks_flags(b, BLKID_SUBLKS_TYPE);
3060
3061         errno = 0;
3062         r = blkid_do_safeprobe(b);
3063         if (r == -1 || r == 1) {
3064                 log_error("Cannot determine file system type of %s", what);
3065                 return -EINVAL;
3066         } else if (r != 0) {
3067                 if (errno == 0)
3068                         errno = EIO;
3069                 log_error_errno(errno, "Failed to probe %s: %m", what);
3070                 return -errno;
3071         }
3072
3073         errno = 0;
3074         if (blkid_probe_lookup_value(b, "TYPE", &fstype, NULL) < 0) {
3075                 if (errno == 0)
3076                         errno = EINVAL;
3077                 log_error("Failed to determine file system type of %s", what);
3078                 return -errno;
3079         }
3080
3081         if (streq(fstype, "crypto_LUKS")) {
3082                 log_error("nspawn currently does not support LUKS disk images.");
3083                 return -ENOTSUP;
3084         }
3085
3086         if (mount(what, p, fstype, MS_NODEV|(rw ? 0 : MS_RDONLY), NULL) < 0)
3087                 return log_error_errno(errno, "Failed to mount %s: %m", what);
3088
3089