chiark / gitweb /
nspawn: allow bind-mounting char and block files
[elogind.git] / src / nspawn / nspawn.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4   This file is part of systemd.
5
6   Copyright 2010 Lennart Poettering
7
8   systemd is free software; you can redistribute it and/or modify it
9   under the terms of the GNU Lesser General Public License as published by
10   the Free Software Foundation; either version 2.1 of the License, or
11   (at your option) any later version.
12
13   systemd is distributed in the hope that it will be useful, but
14   WITHOUT ANY WARRANTY; without even the implied warranty of
15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16   Lesser General Public License for more details.
17
18   You should have received a copy of the GNU Lesser General Public License
19   along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <signal.h>
23 #include <sched.h>
24 #include <unistd.h>
25 #include <sys/types.h>
26 #include <sys/syscall.h>
27 #include <sys/mount.h>
28 #include <sys/wait.h>
29 #include <stdlib.h>
30 #include <string.h>
31 #include <stdio.h>
32 #include <errno.h>
33 #include <sys/prctl.h>
34 #include <getopt.h>
35 #include <termios.h>
36 #include <sys/signalfd.h>
37 #include <grp.h>
38 #include <linux/fs.h>
39 #include <sys/un.h>
40 #include <sys/socket.h>
41 #include <linux/netlink.h>
42 #include <net/if.h>
43 #include <linux/veth.h>
44 #include <sys/personality.h>
45 #include <linux/loop.h>
46 #include <poll.h>
47 #include <sys/file.h>
48
49 #ifdef HAVE_SELINUX
50 #include <selinux/selinux.h>
51 #endif
52
53 #ifdef HAVE_SECCOMP
54 #include <seccomp.h>
55 #endif
56
57 #ifdef HAVE_BLKID
58 #include <blkid/blkid.h>
59 #endif
60
61 #include "sd-daemon.h"
62 #include "sd-bus.h"
63 #include "sd-id128.h"
64 #include "sd-rtnl.h"
65 #include "log.h"
66 #include "util.h"
67 #include "mkdir.h"
68 #include "macro.h"
69 #include "audit.h"
70 #include "missing.h"
71 #include "cgroup-util.h"
72 #include "strv.h"
73 #include "path-util.h"
74 #include "loopback-setup.h"
75 #include "dev-setup.h"
76 #include "fdset.h"
77 #include "build.h"
78 #include "fileio.h"
79 #include "bus-util.h"
80 #include "bus-error.h"
81 #include "ptyfwd.h"
82 #include "bus-kernel.h"
83 #include "env-util.h"
84 #include "def.h"
85 #include "rtnl-util.h"
86 #include "udev-util.h"
87 #include "blkid-util.h"
88 #include "gpt.h"
89 #include "siphash24.h"
90 #include "copy.h"
91 #include "base-filesystem.h"
92 #include "barrier.h"
93 #include "event-util.h"
94 #include "capability.h"
95 #include "cap-list.h"
96 #include "btrfs-util.h"
97 #include "machine-image.h"
98 #include "list.h"
99 #include "in-addr-util.h"
100 #include "fw-util.h"
101 #include "local-addresses.h"
102
103 #ifdef HAVE_SECCOMP
104 #include "seccomp-util.h"
105 #endif
106
107 typedef struct ExposePort {
108         int protocol;
109         uint16_t host_port;
110         uint16_t container_port;
111         LIST_FIELDS(struct ExposePort, ports);
112 } ExposePort;
113
/* Two-state container status. Going by the constant names this records
 * whether the container terminated or asked for a reboot; the code that
 * consumes it is not part of this excerpt, so confirm against the callers. */
typedef enum ContainerStatus {
        CONTAINER_TERMINATED = 0,
        CONTAINER_REBOOTED   = 1
} ContainerStatus;
118
/* Journal linking mode selected with --link-journal=MODE (or -j).
 * The "try-guest" / "try-host" spellings map to LINK_GUEST / LINK_HOST
 * with arg_link_journal_try additionally set. */
typedef enum LinkJournal {
        LINK_NO    = 0,  /* --link-journal=no */
        LINK_AUTO  = 1,  /* --link-journal=auto */
        LINK_HOST  = 2,  /* --link-journal=host, try-host */
        LINK_GUEST = 3   /* --link-journal=guest, try-guest, -j */
} LinkJournal;
125
/* Volatile mode selected with --volatile[=MODE]. */
typedef enum Volatile {
        VOLATILE_NO    = 0,  /* option absent, or a false boolean argument */
        VOLATILE_YES   = 1,  /* bare --volatile, or a true boolean argument */
        VOLATILE_STATE = 2   /* --volatile=state */
} Volatile;
131
132 static char *arg_directory = NULL;
133 static char *arg_template = NULL;
134 static char *arg_user = NULL;
135 static sd_id128_t arg_uuid = {};
136 static char *arg_machine = NULL;
137 static const char *arg_selinux_context = NULL;
138 static const char *arg_selinux_apifs_context = NULL;
139 static const char *arg_slice = NULL;
140 static bool arg_private_network = false;
141 static bool arg_read_only = false;
142 static bool arg_boot = false;
143 static bool arg_ephemeral = false;
144 static LinkJournal arg_link_journal = LINK_AUTO;
145 static bool arg_link_journal_try = false;
146 static uint64_t arg_retain =
147         (1ULL << CAP_CHOWN) |
148         (1ULL << CAP_DAC_OVERRIDE) |
149         (1ULL << CAP_DAC_READ_SEARCH) |
150         (1ULL << CAP_FOWNER) |
151         (1ULL << CAP_FSETID) |
152         (1ULL << CAP_IPC_OWNER) |
153         (1ULL << CAP_KILL) |
154         (1ULL << CAP_LEASE) |
155         (1ULL << CAP_LINUX_IMMUTABLE) |
156         (1ULL << CAP_NET_BIND_SERVICE) |
157         (1ULL << CAP_NET_BROADCAST) |
158         (1ULL << CAP_NET_RAW) |
159         (1ULL << CAP_SETGID) |
160         (1ULL << CAP_SETFCAP) |
161         (1ULL << CAP_SETPCAP) |
162         (1ULL << CAP_SETUID) |
163         (1ULL << CAP_SYS_ADMIN) |
164         (1ULL << CAP_SYS_CHROOT) |
165         (1ULL << CAP_SYS_NICE) |
166         (1ULL << CAP_SYS_PTRACE) |
167         (1ULL << CAP_SYS_TTY_CONFIG) |
168         (1ULL << CAP_SYS_RESOURCE) |
169         (1ULL << CAP_SYS_BOOT) |
170         (1ULL << CAP_AUDIT_WRITE) |
171         (1ULL << CAP_AUDIT_CONTROL) |
172         (1ULL << CAP_MKNOD);
173 static char **arg_bind = NULL;
174 static char **arg_bind_ro = NULL;
175 static char **arg_tmpfs = NULL;
176 static char **arg_setenv = NULL;
177 static bool arg_quiet = false;
178 static bool arg_share_system = false;
179 static bool arg_register = true;
180 static bool arg_keep_unit = false;
181 static char **arg_network_interfaces = NULL;
182 static char **arg_network_macvlan = NULL;
183 static char **arg_network_ipvlan = NULL;
184 static bool arg_network_veth = false;
185 static const char *arg_network_bridge = NULL;
186 static unsigned long arg_personality = 0xffffffffLU;
187 static char *arg_image = NULL;
188 static Volatile arg_volatile = VOLATILE_NO;
189 static ExposePort *arg_expose_ports = NULL;
190
191 static void help(void) {
192         printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
193                "Spawn a minimal namespace container for debugging, testing and building.\n\n"
194                "  -h --help                 Show this help\n"
195                "     --version              Print version string\n"
196                "  -q --quiet                Do not show status information\n"
197                "  -D --directory=PATH       Root directory for the container\n"
198                "     --template=PATH        Initialize root directory from template directory,\n"
199                "                            if missing\n"
200                "  -x --ephemeral            Run container with snapshot of root directory, and\n"
201                "                            remove it after exit\n"
202                "  -i --image=PATH           File system device or disk image for the container\n"
203                "  -b --boot                 Boot up full system (i.e. invoke init)\n"
204                "  -u --user=USER            Run the command under specified user or uid\n"
205                "  -M --machine=NAME         Set the machine name for the container\n"
206                "     --uuid=UUID            Set a specific machine UUID for the container\n"
207                "  -S --slice=SLICE          Place the container in the specified slice\n"
208                "     --private-network      Disable network in container\n"
209                "     --network-interface=INTERFACE\n"
210                "                            Assign an existing network interface to the\n"
211                "                            container\n"
212                "     --network-macvlan=INTERFACE\n"
213                "                            Create a macvlan network interface based on an\n"
214                "                            existing network interface to the container\n"
215                "     --network-ipvlan=INTERFACE\n"
216                "                            Create a ipvlan network interface based on an\n"
217                "                            existing network interface to the container\n"
218                "  -n --network-veth         Add a virtual ethernet connection between host\n"
219                "                            and container\n"
220                "     --network-bridge=INTERFACE\n"
221                "                            Add a virtual ethernet connection between host\n"
222                "                            and container and add it to an existing bridge on\n"
223                "                            the host\n"
224                "  -p --port=[PROTOCOL:]HOSTPORT[:CONTAINERPORT]\n"
225                "                            Expose a container IP port on the host\n"
226                "  -Z --selinux-context=SECLABEL\n"
227                "                            Set the SELinux security context to be used by\n"
228                "                            processes in the container\n"
229                "  -L --selinux-apifs-context=SECLABEL\n"
230                "                            Set the SELinux security context to be used by\n"
231                "                            API/tmpfs file systems in the container\n"
232                "     --capability=CAP       In addition to the default, retain specified\n"
233                "                            capability\n"
234                "     --drop-capability=CAP  Drop the specified capability from the default set\n"
235                "     --link-journal=MODE    Link up guest journal, one of no, auto, guest, host,\n"
236                "                            try-guest, try-host\n"
237                "  -j                        Equivalent to --link-journal=try-guest\n"
238                "     --read-only            Mount the root directory read-only\n"
239                "     --bind=PATH[:PATH]     Bind mount a file or directory from the host into\n"
240                "                            the container\n"
241                "     --bind-ro=PATH[:PATH]  Similar, but creates a read-only bind mount\n"
242                "     --tmpfs=PATH:[OPTIONS] Mount an empty tmpfs to the specified directory\n"
243                "     --setenv=NAME=VALUE    Pass an environment variable to PID 1\n"
244                "     --share-system         Share system namespaces with host\n"
245                "     --register=BOOLEAN     Register container as machine\n"
246                "     --keep-unit            Do not register a scope for the machine, reuse\n"
247                "                            the service unit nspawn is running in\n"
248                "     --volatile[=MODE]      Run the system in volatile mode\n"
249                , program_invocation_short_name);
250 }
251
/*
 * Store a cleaned-up form of "path" in *b, releasing whatever *b held before.
 *
 * The path is first resolved with canonicalize_file_name(). If that fails
 * only because the path does not exist (ENOENT), the path is instead made
 * absolute with path_make_absolute_cwd(). Either result is passed through
 * path_kill_slashes() and its return value becomes the new *b.
 *
 * Returns 0 on success, -errno if canonicalization fails for any reason
 * other than ENOENT, and -ENOMEM if path_make_absolute_cwd() returns NULL.
 * On failure *b is left untouched.
 */
static int set_sanitized_path(char **b, const char *path) {
        char *resolved;

        assert(b);
        assert(path);

        resolved = canonicalize_file_name(path);

        /* Any failure other than "does not exist" is reported as is */
        if (resolved == NULL && errno != ENOENT)
                return -errno;

        /* A missing path is still acceptable: fall back to an absolute,
         * but unresolved, spelling of it */
        if (resolved == NULL) {
                resolved = path_make_absolute_cwd(path);
                if (resolved == NULL)
                        return -ENOMEM;
        }

        /* Only drop the old value once a replacement is at hand */
        free(*b);
        *b = path_kill_slashes(resolved);

        return 0;
}
272
273 static int parse_argv(int argc, char *argv[]) {
274
275         enum {
276                 ARG_VERSION = 0x100,
277                 ARG_PRIVATE_NETWORK,
278                 ARG_UUID,
279                 ARG_READ_ONLY,
280                 ARG_CAPABILITY,
281                 ARG_DROP_CAPABILITY,
282                 ARG_LINK_JOURNAL,
283                 ARG_BIND,
284                 ARG_BIND_RO,
285                 ARG_TMPFS,
286                 ARG_SETENV,
287                 ARG_SHARE_SYSTEM,
288                 ARG_REGISTER,
289                 ARG_KEEP_UNIT,
290                 ARG_NETWORK_INTERFACE,
291                 ARG_NETWORK_MACVLAN,
292                 ARG_NETWORK_IPVLAN,
293                 ARG_NETWORK_BRIDGE,
294                 ARG_PERSONALITY,
295                 ARG_VOLATILE,
296                 ARG_TEMPLATE,
297         };
298
299         static const struct option options[] = {
300                 { "help",                  no_argument,       NULL, 'h'                   },
301                 { "version",               no_argument,       NULL, ARG_VERSION           },
302                 { "directory",             required_argument, NULL, 'D'                   },
303                 { "template",              required_argument, NULL, ARG_TEMPLATE          },
304                 { "ephemeral",             no_argument,       NULL, 'x'                   },
305                 { "user",                  required_argument, NULL, 'u'                   },
306                 { "private-network",       no_argument,       NULL, ARG_PRIVATE_NETWORK   },
307                 { "boot",                  no_argument,       NULL, 'b'                   },
308                 { "uuid",                  required_argument, NULL, ARG_UUID              },
309                 { "read-only",             no_argument,       NULL, ARG_READ_ONLY         },
310                 { "capability",            required_argument, NULL, ARG_CAPABILITY        },
311                 { "drop-capability",       required_argument, NULL, ARG_DROP_CAPABILITY   },
312                 { "link-journal",          required_argument, NULL, ARG_LINK_JOURNAL      },
313                 { "bind",                  required_argument, NULL, ARG_BIND              },
314                 { "bind-ro",               required_argument, NULL, ARG_BIND_RO           },
315                 { "tmpfs",                 required_argument, NULL, ARG_TMPFS             },
316                 { "machine",               required_argument, NULL, 'M'                   },
317                 { "slice",                 required_argument, NULL, 'S'                   },
318                 { "setenv",                required_argument, NULL, ARG_SETENV            },
319                 { "selinux-context",       required_argument, NULL, 'Z'                   },
320                 { "selinux-apifs-context", required_argument, NULL, 'L'                   },
321                 { "quiet",                 no_argument,       NULL, 'q'                   },
322                 { "share-system",          no_argument,       NULL, ARG_SHARE_SYSTEM      },
323                 { "register",              required_argument, NULL, ARG_REGISTER          },
324                 { "keep-unit",             no_argument,       NULL, ARG_KEEP_UNIT         },
325                 { "network-interface",     required_argument, NULL, ARG_NETWORK_INTERFACE },
326                 { "network-macvlan",       required_argument, NULL, ARG_NETWORK_MACVLAN   },
327                 { "network-ipvlan",        required_argument, NULL, ARG_NETWORK_IPVLAN    },
328                 { "network-veth",          no_argument,       NULL, 'n'                   },
329                 { "network-bridge",        required_argument, NULL, ARG_NETWORK_BRIDGE    },
330                 { "personality",           required_argument, NULL, ARG_PERSONALITY       },
331                 { "image",                 required_argument, NULL, 'i'                   },
332                 { "volatile",              optional_argument, NULL, ARG_VOLATILE          },
333                 { "port",                  required_argument, NULL, 'p'                   },
334                 {}
335         };
336
337         int c, r;
338         uint64_t plus = 0, minus = 0;
339
340         assert(argc >= 0);
341         assert(argv);
342
343         while ((c = getopt_long(argc, argv, "+hD:u:bL:M:jS:Z:qi:xp:n", options, NULL)) >= 0)
344
345                 switch (c) {
346
347                 case 'h':
348                         help();
349                         return 0;
350
351                 case ARG_VERSION:
352                         puts(PACKAGE_STRING);
353                         puts(SYSTEMD_FEATURES);
354                         return 0;
355
356                 case 'D':
357                         r = set_sanitized_path(&arg_directory, optarg);
358                         if (r < 0)
359                                 return log_error_errno(r, "Invalid root directory: %m");
360
361                         break;
362
363                 case ARG_TEMPLATE:
364                         r = set_sanitized_path(&arg_template, optarg);
365                         if (r < 0)
366                                 return log_error_errno(r, "Invalid template directory: %m");
367
368                         break;
369
370                 case 'i':
371                         r = set_sanitized_path(&arg_image, optarg);
372                         if (r < 0)
373                                 return log_error_errno(r, "Invalid image path: %m");
374
375                         break;
376
377                 case 'x':
378                         arg_ephemeral = true;
379                         break;
380
381                 case 'u':
382                         free(arg_user);
383                         arg_user = strdup(optarg);
384                         if (!arg_user)
385                                 return log_oom();
386
387                         break;
388
389                 case ARG_NETWORK_BRIDGE:
390                         arg_network_bridge = optarg;
391
392                         /* fall through */
393
394                 case 'n':
395                         arg_network_veth = true;
396                         arg_private_network = true;
397                         break;
398
399                 case ARG_NETWORK_INTERFACE:
400                         if (strv_extend(&arg_network_interfaces, optarg) < 0)
401                                 return log_oom();
402
403                         arg_private_network = true;
404                         break;
405
406                 case ARG_NETWORK_MACVLAN:
407                         if (strv_extend(&arg_network_macvlan, optarg) < 0)
408                                 return log_oom();
409
410                         arg_private_network = true;
411                         break;
412
413                 case ARG_NETWORK_IPVLAN:
414                         if (strv_extend(&arg_network_ipvlan, optarg) < 0)
415                                 return log_oom();
416
417                         /* fall through */
418
419                 case ARG_PRIVATE_NETWORK:
420                         arg_private_network = true;
421                         break;
422
423                 case 'b':
424                         arg_boot = true;
425                         break;
426
427                 case ARG_UUID:
428                         r = sd_id128_from_string(optarg, &arg_uuid);
429                         if (r < 0) {
430                                 log_error("Invalid UUID: %s", optarg);
431                                 return r;
432                         }
433                         break;
434
435                 case 'S':
436                         arg_slice = optarg;
437                         break;
438
439                 case 'M':
440                         if (isempty(optarg)) {
441                                 free(arg_machine);
442                                 arg_machine = NULL;
443                         } else {
444                                 if (!machine_name_is_valid(optarg)) {
445                                         log_error("Invalid machine name: %s", optarg);
446                                         return -EINVAL;
447                                 }
448
449                                 r = free_and_strdup(&arg_machine, optarg);
450                                 if (r < 0)
451                                         return log_oom();
452
453                                 break;
454                         }
455
456                 case 'Z':
457                         arg_selinux_context = optarg;
458                         break;
459
460                 case 'L':
461                         arg_selinux_apifs_context = optarg;
462                         break;
463
464                 case ARG_READ_ONLY:
465                         arg_read_only = true;
466                         break;
467
468                 case ARG_CAPABILITY:
469                 case ARG_DROP_CAPABILITY: {
470                         const char *state, *word;
471                         size_t length;
472
473                         FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
474                                 _cleanup_free_ char *t;
475
476                                 t = strndup(word, length);
477                                 if (!t)
478                                         return log_oom();
479
480                                 if (streq(t, "all")) {
481                                         if (c == ARG_CAPABILITY)
482                                                 plus = (uint64_t) -1;
483                                         else
484                                                 minus = (uint64_t) -1;
485                                 } else {
486                                         int cap;
487
488                                         cap = capability_from_name(t);
489                                         if (cap < 0) {
490                                                 log_error("Failed to parse capability %s.", t);
491                                                 return -EINVAL;
492                                         }
493
494                                         if (c == ARG_CAPABILITY)
495                                                 plus |= 1ULL << (uint64_t) cap;
496                                         else
497                                                 minus |= 1ULL << (uint64_t) cap;
498                                 }
499                         }
500
501                         break;
502                 }
503
504                 case 'j':
505                         arg_link_journal = LINK_GUEST;
506                         arg_link_journal_try = true;
507                         break;
508
509                 case ARG_LINK_JOURNAL:
510                         if (streq(optarg, "auto")) {
511                                 arg_link_journal = LINK_AUTO;
512                                 arg_link_journal_try = false;
513                         } else if (streq(optarg, "no")) {
514                                 arg_link_journal = LINK_NO;
515                                 arg_link_journal_try = false;
516                         } else if (streq(optarg, "guest")) {
517                                 arg_link_journal = LINK_GUEST;
518                                 arg_link_journal_try = false;
519                         } else if (streq(optarg, "host")) {
520                                 arg_link_journal = LINK_HOST;
521                                 arg_link_journal_try = false;
522                         } else if (streq(optarg, "try-guest")) {
523                                 arg_link_journal = LINK_GUEST;
524                                 arg_link_journal_try = true;
525                         } else if (streq(optarg, "try-host")) {
526                                 arg_link_journal = LINK_HOST;
527                                 arg_link_journal_try = true;
528                         } else {
529                                 log_error("Failed to parse link journal mode %s", optarg);
530                                 return -EINVAL;
531                         }
532
533                         break;
534
535                 case ARG_BIND:
536                 case ARG_BIND_RO: {
537                         _cleanup_free_ char *a = NULL, *b = NULL;
538                         char *e;
539                         char ***x;
540
541                         x = c == ARG_BIND ? &arg_bind : &arg_bind_ro;
542
543                         e = strchr(optarg, ':');
544                         if (e) {
545                                 a = strndup(optarg, e - optarg);
546                                 b = strdup(e + 1);
547                         } else {
548                                 a = strdup(optarg);
549                                 b = strdup(optarg);
550                         }
551
552                         if (!a || !b)
553                                 return log_oom();
554
555                         if (!path_is_absolute(a) || !path_is_absolute(b)) {
556                                 log_error("Invalid bind mount specification: %s", optarg);
557                                 return -EINVAL;
558                         }
559
560                         r = strv_extend(x, a);
561                         if (r < 0)
562                                 return log_oom();
563
564                         r = strv_extend(x, b);
565                         if (r < 0)
566                                 return log_oom();
567
568                         break;
569                 }
570
571                 case ARG_TMPFS: {
572                         _cleanup_free_ char *a = NULL, *b = NULL;
573                         char *e;
574
575                         e = strchr(optarg, ':');
576                         if (e) {
577                                 a = strndup(optarg, e - optarg);
578                                 b = strdup(e + 1);
579                         } else {
580                                 a = strdup(optarg);
581                                 b = strdup("mode=0755");
582                         }
583
584                         if (!a || !b)
585                                 return log_oom();
586
587                         if (!path_is_absolute(a)) {
588                                 log_error("Invalid tmpfs specification: %s", optarg);
589                                 return -EINVAL;
590                         }
591
592                         r = strv_push(&arg_tmpfs, a);
593                         if (r < 0)
594                                 return log_oom();
595
596                         a = NULL;
597
598                         r = strv_push(&arg_tmpfs, b);
599                         if (r < 0)
600                                 return log_oom();
601
602                         b = NULL;
603
604                         break;
605                 }
606
607                 case ARG_SETENV: {
608                         char **n;
609
610                         if (!env_assignment_is_valid(optarg)) {
611                                 log_error("Environment variable assignment '%s' is not valid.", optarg);
612                                 return -EINVAL;
613                         }
614
615                         n = strv_env_set(arg_setenv, optarg);
616                         if (!n)
617                                 return log_oom();
618
619                         strv_free(arg_setenv);
620                         arg_setenv = n;
621                         break;
622                 }
623
624                 case 'q':
625                         arg_quiet = true;
626                         break;
627
628                 case ARG_SHARE_SYSTEM:
629                         arg_share_system = true;
630                         break;
631
632                 case ARG_REGISTER:
633                         r = parse_boolean(optarg);
634                         if (r < 0) {
635                                 log_error("Failed to parse --register= argument: %s", optarg);
636                                 return r;
637                         }
638
639                         arg_register = r;
640                         break;
641
642                 case ARG_KEEP_UNIT:
643                         arg_keep_unit = true;
644                         break;
645
646                 case ARG_PERSONALITY:
647
648                         arg_personality = personality_from_string(optarg);
649                         if (arg_personality == 0xffffffffLU) {
650                                 log_error("Unknown or unsupported personality '%s'.", optarg);
651                                 return -EINVAL;
652                         }
653
654                         break;
655
656                 case ARG_VOLATILE:
657
658                         if (!optarg)
659                                 arg_volatile = VOLATILE_YES;
660                         else {
661                                 r = parse_boolean(optarg);
662                                 if (r < 0) {
663                                         if (streq(optarg, "state"))
664                                                 arg_volatile = VOLATILE_STATE;
665                                         else {
666                                                 log_error("Failed to parse --volatile= argument: %s", optarg);
667                                                 return r;
668                                         }
669                                 } else
670                                         arg_volatile = r ? VOLATILE_YES : VOLATILE_NO;
671                         }
672
673                         break;
674
675                 case 'p': {
676                         const char *split, *e;
677                         uint16_t container_port, host_port;
678                         int protocol;
679                         ExposePort *p;
680
681                         if ((e = startswith(optarg, "tcp:")))
682                                 protocol = IPPROTO_TCP;
683                         else if ((e = startswith(optarg, "udp:")))
684                                 protocol = IPPROTO_UDP;
685                         else {
686                                 e = optarg;
687                                 protocol = IPPROTO_TCP;
688                         }
689
690                         split = strchr(e, ':');
691                         if (split) {
692                                 char v[split - e + 1];
693
694                                 memcpy(v, e, split - e);
695                                 v[split - e] = 0;
696
697                                 r = safe_atou16(v, &host_port);
698                                 if (r < 0 || host_port <= 0) {
699                                         log_error("Failed to parse host port: %s", optarg);
700                                         return -EINVAL;
701                                 }
702
703                                 r = safe_atou16(split + 1, &container_port);
704                         } else {
705                                 r = safe_atou16(e, &container_port);
706                                 host_port = container_port;
707                         }
708
709                         if (r < 0 || container_port <= 0) {
710                                 log_error("Failed to parse host port: %s", optarg);
711                                 return -EINVAL;
712                         }
713
714                         LIST_FOREACH(ports, p, arg_expose_ports) {
715                                 if (p->protocol == protocol && p->host_port == host_port) {
716                                         log_error("Duplicate port specification: %s", optarg);
717                                         return -EINVAL;
718                                 }
719                         }
720
721                         p = new(ExposePort, 1);
722                         if (!p)
723                                 return log_oom();
724
725                         p->protocol = protocol;
726                         p->host_port = host_port;
727                         p->container_port = container_port;
728
729                         LIST_PREPEND(ports, arg_expose_ports, p);
730
731                         break;
732                 }
733
734                 case '?':
735                         return -EINVAL;
736
737                 default:
738                         assert_not_reached("Unhandled option");
739                 }
740
741         if (arg_share_system)
742                 arg_register = false;
743
744         if (arg_boot && arg_share_system) {
745                 log_error("--boot and --share-system may not be combined.");
746                 return -EINVAL;
747         }
748
749         if (arg_keep_unit && cg_pid_get_owner_uid(0, NULL) >= 0) {
750                 log_error("--keep-unit may not be used when invoked from a user session.");
751                 return -EINVAL;
752         }
753
754         if (arg_directory && arg_image) {
755                 log_error("--directory= and --image= may not be combined.");
756                 return -EINVAL;
757         }
758
759         if (arg_template && arg_image) {
760                 log_error("--template= and --image= may not be combined.");
761                 return -EINVAL;
762         }
763
764         if (arg_template && !(arg_directory || arg_machine)) {
765                 log_error("--template= needs --directory= or --machine=.");
766                 return -EINVAL;
767         }
768
769         if (arg_ephemeral && arg_template) {
770                 log_error("--ephemeral and --template= may not be combined.");
771                 return -EINVAL;
772         }
773
774         if (arg_ephemeral && arg_image) {
775                 log_error("--ephemeral and --image= may not be combined.");
776                 return -EINVAL;
777         }
778
779         if (arg_ephemeral && !IN_SET(arg_link_journal, LINK_NO, LINK_AUTO)) {
780                 log_error("--ephemeral and --link-journal= may not be combined.");
781                 return -EINVAL;
782         }
783
784         if (arg_volatile != VOLATILE_NO && arg_read_only) {
785                 log_error("Cannot combine --read-only with --volatile. Note that --volatile already implies a read-only base hierarchy.");
786                 return -EINVAL;
787         }
788
789         if (arg_expose_ports && !arg_private_network) {
790                 log_error("Cannot use --port= without private networking.");
791                 return -EINVAL;
792         }
793
794         arg_retain = (arg_retain | plus | (arg_private_network ? 1ULL << CAP_NET_ADMIN : 0)) & ~minus;
795
796         return 1;
797 }
798
799 static int mount_all(const char *dest) {
800
801         typedef struct MountPoint {
802                 const char *what;
803                 const char *where;
804                 const char *type;
805                 const char *options;
806                 unsigned long flags;
807                 bool fatal;
808         } MountPoint;
809
810         static const MountPoint mount_table[] = {
811                 { "proc",      "/proc",     "proc",  NULL,        MS_NOSUID|MS_NOEXEC|MS_NODEV,           true  },
812                 { "/proc/sys", "/proc/sys", NULL,    NULL,        MS_BIND,                                true  },   /* Bind mount first */
813                 { NULL,        "/proc/sys", NULL,    NULL,        MS_BIND|MS_RDONLY|MS_REMOUNT,           true  },   /* Then, make it r/o */
814                 { "sysfs",     "/sys",      "sysfs", NULL,        MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true  },
815                 { "tmpfs",     "/dev",      "tmpfs", "mode=755",  MS_NOSUID|MS_STRICTATIME,               true  },
816                 { "devpts",    "/dev/pts",  "devpts","newinstance,ptmxmode=0666,mode=620,gid=" STRINGIFY(TTY_GID), MS_NOSUID|MS_NOEXEC, true },
817                 { "tmpfs",     "/dev/shm",  "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME,      true  },
818                 { "tmpfs",     "/run",      "tmpfs", "mode=755",  MS_NOSUID|MS_NODEV|MS_STRICTATIME,      true  },
819 #ifdef HAVE_SELINUX
820                 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND,                              false },  /* Bind mount first */
821                 { NULL,              "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT,         false },  /* Then, make it r/o */
822 #endif
823         };
824
825         unsigned k;
826         int r = 0;
827
828         for (k = 0; k < ELEMENTSOF(mount_table); k++) {
829                 _cleanup_free_ char *where = NULL;
830 #ifdef HAVE_SELINUX
831                 _cleanup_free_ char *options = NULL;
832 #endif
833                 const char *o;
834                 int t;
835
836                 where = strjoin(dest, "/", mount_table[k].where, NULL);
837                 if (!where)
838                         return log_oom();
839
840                 t = path_is_mount_point(where, true);
841                 if (t < 0) {
842                         log_error_errno(t, "Failed to detect whether %s is a mount point: %m", where);
843
844                         if (r == 0)
845                                 r = t;
846
847                         continue;
848                 }
849
850                 /* Skip this entry if it is not a remount. */
851                 if (mount_table[k].what && t > 0)
852                         continue;
853
854                 t = mkdir_p(where, 0755);
855                 if (t < 0) {
856                         if (mount_table[k].fatal) {
857                                log_error_errno(t, "Failed to create directory %s: %m", where);
858
859                                 if (r == 0)
860                                         r = t;
861                         } else
862                                log_warning_errno(t, "Failed to create directory %s: %m", where);
863
864                         continue;
865                 }
866
867 #ifdef HAVE_SELINUX
868                 if (arg_selinux_apifs_context &&
869                     (streq_ptr(mount_table[k].what, "tmpfs") || streq_ptr(mount_table[k].what, "devpts"))) {
870                         options = strjoin(mount_table[k].options, ",context=\"", arg_selinux_apifs_context, "\"", NULL);
871                         if (!options)
872                                 return log_oom();
873
874                         o = options;
875                 } else
876 #endif
877                         o = mount_table[k].options;
878
879
880                 if (mount(mount_table[k].what,
881                           where,
882                           mount_table[k].type,
883                           mount_table[k].flags,
884                           o) < 0) {
885
886                         if (mount_table[k].fatal) {
887                                 log_error_errno(errno, "mount(%s) failed: %m", where);
888
889                                 if (r == 0)
890                                         r = -errno;
891                         } else
892                                 log_warning_errno(errno, "mount(%s) failed: %m", where);
893                 }
894         }
895
896         return r;
897 }
898
899 static int mount_binds(const char *dest, char **l, bool ro) {
900         char **x, **y;
901
902         STRV_FOREACH_PAIR(x, y, l) {
903                 _cleanup_free_ char *where = NULL;
904                 struct stat source_st, dest_st;
905                 int r;
906
907                 if (stat(*x, &source_st) < 0)
908                         return log_error_errno(errno, "Failed to stat %s: %m", *x);
909
910                 where = strappend(dest, *y);
911                 if (!where)
912                         return log_oom();
913
914                 r = stat(where, &dest_st);
915                 if (r == 0) {
916                         if (S_ISDIR(source_st.st_mode) && !S_ISDIR(dest_st.st_mode)) {
917                                 log_error("Cannot bind mount directory %s on file %s.", *x, where);
918                                 return -EINVAL;
919                         }
920                         if (!S_ISDIR(source_st.st_mode) && S_ISDIR(dest_st.st_mode)) {
921                                 log_error("Cannot bind mount file %s on directory %s.", *x, where);
922                                 return -EINVAL;
923                         }
924                 } else if (errno == ENOENT) {
925                         r = mkdir_parents_label(where, 0755);
926                         if (r < 0)
927                                 return log_error_errno(r, "Failed to bind mount %s: %m", *x);
928                 } else {
929                         log_error_errno(errno, "Failed to bind mount %s: %m", *x);
930                         return -errno;
931                 }
932
933                 /* Create the mount point. Any non-directory file can be
934                  * mounted on any non-directory file (regular, fifo, socket,
935                  * char, block).
936                  */
937                 if (S_ISDIR(source_st.st_mode)) {
938                         r = mkdir_label(where, 0755);
939                         if (r < 0 && errno != EEXIST)
940                                 return log_error_errno(r, "Failed to create mount point %s: %m", where);
941                 } else {
942                         r = touch(where);
943                         if (r < 0)
944                                 return log_error_errno(r, "Failed to create mount point %s: %m", where);
945                 }
946
947                 if (mount(*x, where, "bind", MS_BIND, NULL) < 0)
948                         return log_error_errno(errno, "mount(%s) failed: %m", where);
949
950                 if (ro) {
951                         r = bind_remount_recursive(where, true);
952                         if (r < 0)
953                                 return log_error_errno(r, "Read-Only bind mount failed: %m");
954                 }
955         }
956
957         return 0;
958 }
959
/* Mounts the cgroup hierarchy selected by the mount option string
 * "controller" at <dest>/sys/fs/cgroup/<hierarchy>, read-only if
 * requested. Returns 1 if a new mount was established, 0 if the
 * location already is a mount point, negative on error. */
static int mount_cgroup_hierarchy(const char *dest, const char *controller, const char *hierarchy, bool read_only) {
        unsigned long flags = MS_NOSUID|MS_NOEXEC|MS_NODEV;
        char *to;
        int r;

        to = strappenda(dest, "/sys/fs/cgroup/", hierarchy);

        r = path_is_mount_point(to, false);
        if (r < 0)
                return log_error_errno(r, "Failed to determine if %s is mounted already: %m", to);
        if (r > 0)
                return 0;

        /* Result intentionally not checked here; a missing directory
         * surfaces as a mount() failure right below. */
        mkdir_p(to, 0755);

        if (read_only)
                flags |= MS_RDONLY;

        if (mount("cgroup", to, "cgroup", flags, controller) < 0)
                return log_error_errno(errno, "Failed to mount to %s: %m", to);

        return 1;
}
979
980 static int mount_cgroup(const char *dest) {
981         _cleanup_set_free_free_ Set *controllers = NULL;
982         _cleanup_free_ char *own_cgroup_path = NULL;
983         const char *cgroup_root, *systemd_root, *systemd_own;
984         int r;
985
986         controllers = set_new(&string_hash_ops);
987         if (!controllers)
988                 return log_oom();
989
990         r = cg_kernel_controllers(controllers);
991         if (r < 0)
992                 return log_error_errno(r, "Failed to determine cgroup controllers: %m");
993
994         r = cg_pid_get_path(NULL, 0, &own_cgroup_path);
995         if (r < 0)
996                 return log_error_errno(r, "Failed to determine our own cgroup path: %m");
997
998         cgroup_root = strappenda(dest, "/sys/fs/cgroup");
999         if (mount("tmpfs", cgroup_root, "tmpfs", MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME, "mode=755") < 0)
1000                 return log_error_errno(errno, "Failed to mount tmpfs to /sys/fs/cgroup: %m");
1001
1002         for (;;) {
1003                 _cleanup_free_ char *controller = NULL, *origin = NULL, *combined = NULL;
1004
1005                 controller = set_steal_first(controllers);
1006                 if (!controller)
1007                         break;
1008
1009                 origin = strappend("/sys/fs/cgroup/", controller);
1010                 if (!origin)
1011                         return log_oom();
1012
1013                 r = readlink_malloc(origin, &combined);
1014                 if (r == -EINVAL) {
1015                         /* Not a symbolic link, but directly a single cgroup hierarchy */
1016
1017                         r = mount_cgroup_hierarchy(dest, controller, controller, true);
1018                         if (r < 0)
1019                                 return r;
1020
1021                 } else if (r < 0)
1022                         return log_error_errno(r, "Failed to read link %s: %m", origin);
1023                 else {
1024                         _cleanup_free_ char *target = NULL;
1025
1026                         target = strjoin(dest, "/sys/fs/cgroup/", controller, NULL);
1027                         if (!target)
1028                                 return log_oom();
1029
1030                         /* A symbolic link, a combination of controllers in one hierarchy */
1031
1032                         if (!filename_is_valid(combined)) {
1033                                 log_warning("Ignoring invalid combined hierarchy %s.", combined);
1034                                 continue;
1035                         }
1036
1037                         r = mount_cgroup_hierarchy(dest, combined, combined, true);
1038                         if (r < 0)
1039                                 return r;
1040
1041                         if (symlink(combined, target) < 0)
1042                                 return log_error_errno(errno, "Failed to create symlink for combined hierarchy: %m");
1043                 }
1044         }
1045
1046         r = mount_cgroup_hierarchy(dest, "name=systemd", "systemd", false);
1047         if (r < 0)
1048                 return r;
1049
1050         /* Make our own cgroup a (writable) bind mount */
1051         systemd_own = strappenda(dest, "/sys/fs/cgroup/systemd", own_cgroup_path);
1052         if (mount(systemd_own, systemd_own,  NULL, MS_BIND, NULL) < 0)
1053                 return log_error_errno(errno, "Failed to turn %s into a bind mount: %m", own_cgroup_path);
1054
1055         /* And then remount the systemd cgroup root read-only */
1056         systemd_root = strappenda(dest, "/sys/fs/cgroup/systemd");
1057         if (mount(NULL, systemd_root, NULL, MS_BIND|MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_RDONLY, NULL) < 0)
1058                 return log_error_errno(errno, "Failed to mount cgroup root read-only: %m");
1059
1060         if (mount(NULL, cgroup_root, NULL, MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME|MS_RDONLY, "mode=755") < 0)
1061                 return log_error_errno(errno, "Failed to remount %s read-only: %m", cgroup_root);
1062
1063         return 0;
1064 }
1065
1066 static int mount_tmpfs(const char *dest) {
1067         char **i, **o;
1068
1069         STRV_FOREACH_PAIR(i, o, arg_tmpfs) {
1070                 _cleanup_free_ char *where = NULL;
1071                 int r;
1072
1073                 where = strappend(dest, *i);
1074                 if (!where)
1075                         return log_oom();
1076
1077                 r = mkdir_label(where, 0755);
1078                 if (r < 0 && r != -EEXIST)
1079                         return log_error_errno(r, "Creating mount point for tmpfs %s failed: %m", where);
1080
1081                 if (mount("tmpfs", where, "tmpfs", MS_NODEV|MS_STRICTATIME, *o) < 0)
1082                         return log_error_errno(errno, "tmpfs mount to %s failed: %m", where);
1083         }
1084
1085         return 0;
1086 }
1087
1088 static int setup_timezone(const char *dest) {
1089         _cleanup_free_ char *where = NULL, *p = NULL, *q = NULL, *check = NULL, *what = NULL;
1090         char *z, *y;
1091         int r;
1092
1093         assert(dest);
1094
1095         /* Fix the timezone, if possible */
1096         r = readlink_malloc("/etc/localtime", &p);
1097         if (r < 0) {
1098                 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
1099                 return 0;
1100         }
1101
1102         z = path_startswith(p, "../usr/share/zoneinfo/");
1103         if (!z)
1104                 z = path_startswith(p, "/usr/share/zoneinfo/");
1105         if (!z) {
1106                 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
1107                 return 0;
1108         }
1109
1110         where = strappend(dest, "/etc/localtime");
1111         if (!where)
1112                 return log_oom();
1113
1114         r = readlink_malloc(where, &q);
1115         if (r >= 0) {
1116                 y = path_startswith(q, "../usr/share/zoneinfo/");
1117                 if (!y)
1118                         y = path_startswith(q, "/usr/share/zoneinfo/");
1119
1120                 /* Already pointing to the right place? Then do nothing .. */
1121                 if (y && streq(y, z))
1122                         return 0;
1123         }
1124
1125         check = strjoin(dest, "/usr/share/zoneinfo/", z, NULL);
1126         if (!check)
1127                 return log_oom();
1128
1129         if (access(check, F_OK) < 0) {
1130                 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
1131                 return 0;
1132         }
1133
1134         what = strappend("../usr/share/zoneinfo/", z);
1135         if (!what)
1136                 return log_oom();
1137
1138         r = mkdir_parents(where, 0755);
1139         if (r < 0) {
1140                 log_error_errno(r, "Failed to create directory for timezone info %s in container: %m", where);
1141
1142                 return 0;
1143         }
1144
1145         r = unlink(where);
1146         if (r < 0 && errno != ENOENT) {
1147                 log_error_errno(errno, "Failed to remove existing timezone info %s in container: %m", where);
1148
1149                 return 0;
1150         }
1151
1152         if (symlink(what, where) < 0) {
1153                 log_error_errno(errno, "Failed to correct timezone of container: %m");
1154                 return 0;
1155         }
1156
1157         return 0;
1158 }
1159
1160 static int setup_resolv_conf(const char *dest) {
1161         _cleanup_free_ char *where = NULL;
1162         int r;
1163
1164         assert(dest);
1165
1166         if (arg_private_network)
1167                 return 0;
1168
1169         /* Fix resolv.conf, if possible */
1170         where = strappend(dest, "/etc/resolv.conf");
1171         if (!where)
1172                 return log_oom();
1173
1174         /* We don't really care for the results of this really. If it
1175          * fails, it fails, but meh... */
1176         r = mkdir_parents(where, 0755);
1177         if (r < 0) {
1178                 log_warning_errno(r, "Failed to create parent directory for resolv.conf %s: %m", where);
1179
1180                 return 0;
1181         }
1182
1183         r = copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW, 0644, 0);
1184         if (r < 0) {
1185                 log_warning_errno(r, "Failed to copy /etc/resolv.conf to %s: %m", where);
1186
1187                 return 0;
1188         }
1189
1190         return 0;
1191 }
1192
1193 static int setup_volatile_state(const char *directory) {
1194         const char *p;
1195         int r;
1196
1197         assert(directory);
1198
1199         if (arg_volatile != VOLATILE_STATE)
1200                 return 0;
1201
1202         /* --volatile=state means we simply overmount /var
1203            with a tmpfs, and the rest read-only. */
1204
1205         r = bind_remount_recursive(directory, true);
1206         if (r < 0)
1207                 return log_error_errno(r, "Failed to remount %s read-only: %m", directory);
1208
1209         p = strappenda(directory, "/var");
1210         r = mkdir(p, 0755);
1211         if (r < 0 && errno != EEXIST)
1212                 return log_error_errno(errno, "Failed to create %s: %m", directory);
1213
1214         if (mount("tmpfs", p, "tmpfs", MS_STRICTATIME, "mode=755") < 0)
1215                 return log_error_errno(errno, "Failed to mount tmpfs to /var: %m");
1216
1217         return 0;
1218 }
1219
1220 static int setup_volatile(const char *directory) {
1221         bool tmpfs_mounted = false, bind_mounted = false;
1222         char template[] = "/tmp/nspawn-volatile-XXXXXX";
1223         const char *f, *t;
1224         int r;
1225
1226         assert(directory);
1227
1228         if (arg_volatile != VOLATILE_YES)
1229                 return 0;
1230
1231         /* --volatile=yes means we mount a tmpfs to the root dir, and
1232            the original /usr to use inside it, and that read-only. */
1233
1234         if (!mkdtemp(template))
1235                 return log_error_errno(errno, "Failed to create temporary directory: %m");
1236
1237         if (mount("tmpfs", template, "tmpfs", MS_STRICTATIME, "mode=755") < 0) {
1238                 log_error_errno(errno, "Failed to mount tmpfs for root directory: %m");
1239                 r = -errno;
1240                 goto fail;
1241         }
1242
1243         tmpfs_mounted = true;
1244
1245         f = strappenda(directory, "/usr");
1246         t = strappenda(template, "/usr");
1247
1248         r = mkdir(t, 0755);
1249         if (r < 0 && errno != EEXIST) {
1250                 log_error_errno(errno, "Failed to create %s: %m", t);
1251                 r = -errno;
1252                 goto fail;
1253         }
1254
1255         if (mount(f, t, "bind", MS_BIND|MS_REC, NULL) < 0) {
1256                 log_error_errno(errno, "Failed to create /usr bind mount: %m");
1257                 r = -errno;
1258                 goto fail;
1259         }
1260
1261         bind_mounted = true;
1262
1263         r = bind_remount_recursive(t, true);
1264         if (r < 0) {
1265                 log_error_errno(r, "Failed to remount %s read-only: %m", t);
1266                 goto fail;
1267         }
1268
1269         if (mount(template, directory, NULL, MS_MOVE, NULL) < 0) {
1270                 log_error_errno(errno, "Failed to move root mount: %m");
1271                 r = -errno;
1272                 goto fail;
1273         }
1274
1275         rmdir(template);
1276
1277         return 0;
1278
1279 fail:
1280         if (bind_mounted)
1281                 umount(t);
1282         if (tmpfs_mounted)
1283                 umount(template);
1284         rmdir(template);
1285         return r;
1286 }
1287
1288 static char* id128_format_as_uuid(sd_id128_t id, char s[37]) {
1289
1290         snprintf(s, 37,
1291                  "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
1292                  SD_ID128_FORMAT_VAL(id));
1293
1294         return s;
1295 }
1296
1297 static int setup_boot_id(const char *dest) {
1298         _cleanup_free_ char *from = NULL, *to = NULL;
1299         sd_id128_t rnd = {};
1300         char as_uuid[37];
1301         int r;
1302
1303         assert(dest);
1304
1305         if (arg_share_system)
1306                 return 0;
1307
1308         /* Generate a new randomized boot ID, so that each boot-up of
1309          * the container gets a new one */
1310
1311         from = strappend(dest, "/dev/proc-sys-kernel-random-boot-id");
1312         to = strappend(dest, "/proc/sys/kernel/random/boot_id");
1313         if (!from || !to)
1314                 return log_oom();
1315
1316         r = sd_id128_randomize(&rnd);
1317         if (r < 0)
1318                 return log_error_errno(r, "Failed to generate random boot id: %m");
1319
1320         id128_format_as_uuid(rnd, as_uuid);
1321
1322         r = write_string_file(from, as_uuid);
1323         if (r < 0)
1324                 return log_error_errno(r, "Failed to write boot id: %m");
1325
1326         if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
1327                 log_error_errno(errno, "Failed to bind mount boot id: %m");
1328                 r = -errno;
1329         } else if (mount(from, to, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL))
1330                 log_warning_errno(errno, "Failed to make boot id read-only: %m");
1331
1332         unlink(from);
1333         return r;
1334 }
1335
1336 static int copy_devnodes(const char *dest) {
1337
1338         static const char devnodes[] =
1339                 "null\0"
1340                 "zero\0"
1341                 "full\0"
1342                 "random\0"
1343                 "urandom\0"
1344                 "tty\0"
1345                 "net/tun\0";
1346
1347         const char *d;
1348         int r = 0;
1349         _cleanup_umask_ mode_t u;
1350
1351         assert(dest);
1352
1353         u = umask(0000);
1354
1355         NULSTR_FOREACH(d, devnodes) {
1356                 _cleanup_free_ char *from = NULL, *to = NULL;
1357                 struct stat st;
1358
1359                 from = strappend("/dev/", d);
1360                 to = strjoin(dest, "/dev/", d, NULL);
1361                 if (!from || !to)
1362                         return log_oom();
1363
1364                 if (stat(from, &st) < 0) {
1365
1366                         if (errno != ENOENT)
1367                                 return log_error_errno(errno, "Failed to stat %s: %m", from);
1368
1369                 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
1370
1371                         log_error("%s is not a char or block device, cannot copy", from);
1372                         return -EIO;
1373
1374                 } else {
1375                         r = mkdir_parents(to, 0775);
1376                         if (r < 0) {
1377                                 log_error_errno(r, "Failed to create parent directory of %s: %m", to);
1378                                 return -r;
1379                         }
1380
1381                         if (mknod(to, st.st_mode, st.st_rdev) < 0)
1382                                 return log_error_errno(errno, "mknod(%s) failed: %m", to);
1383                 }
1384         }
1385
1386         return r;
1387 }
1388
1389 static int setup_ptmx(const char *dest) {
1390         _cleanup_free_ char *p = NULL;
1391
1392         p = strappend(dest, "/dev/ptmx");
1393         if (!p)
1394                 return log_oom();
1395
1396         if (symlink("pts/ptmx", p) < 0)
1397                 return log_error_errno(errno, "Failed to create /dev/ptmx symlink: %m");
1398
1399         return 0;
1400 }
1401
1402 static int setup_dev_console(const char *dest, const char *console) {
1403         _cleanup_umask_ mode_t u;
1404         const char *to;
1405         struct stat st;
1406         int r;
1407
1408         assert(dest);
1409         assert(console);
1410
1411         u = umask(0000);
1412
1413         if (stat("/dev/null", &st) < 0)
1414                 return log_error_errno(errno, "Failed to stat /dev/null: %m");
1415
1416         r = chmod_and_chown(console, 0600, 0, 0);
1417         if (r < 0)
1418                 return log_error_errno(r, "Failed to correct access mode for TTY: %m");
1419
1420         /* We need to bind mount the right tty to /dev/console since
1421          * ptys can only exist on pts file systems. To have something
1422          * to bind mount things on we create a device node first, and
1423          * use /dev/null for that since we the cgroups device policy
1424          * allows us to create that freely, while we cannot create
1425          * /dev/console. (Note that the major minor doesn't actually
1426          * matter here, since we mount it over anyway). */
1427
1428         to = strappenda(dest, "/dev/console");
1429         if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0)
1430                 return log_error_errno(errno, "mknod() for /dev/console failed: %m");
1431
1432         if (mount(console, to, "bind", MS_BIND, NULL) < 0)
1433                 return log_error_errno(errno, "Bind mount for /dev/console failed: %m");
1434
1435         return 0;
1436 }
1437
1438 static int setup_kmsg(const char *dest, int kmsg_socket) {
1439         _cleanup_free_ char *from = NULL, *to = NULL;
1440         _cleanup_umask_ mode_t u;
1441         int r, fd, k;
1442         union {
1443                 struct cmsghdr cmsghdr;
1444                 uint8_t buf[CMSG_SPACE(sizeof(int))];
1445         } control = {};
1446         struct msghdr mh = {
1447                 .msg_control = &control,
1448                 .msg_controllen = sizeof(control),
1449         };
1450         struct cmsghdr *cmsg;
1451
1452         assert(dest);
1453         assert(kmsg_socket >= 0);
1454
1455         u = umask(0000);
1456
1457         /* We create the kmsg FIFO as /dev/kmsg, but immediately
1458          * delete it after bind mounting it to /proc/kmsg. While FIFOs
1459          * on the reading side behave very similar to /proc/kmsg,
1460          * their writing side behaves differently from /dev/kmsg in
1461          * that writing blocks when nothing is reading. In order to
1462          * avoid any problems with containers deadlocking due to this
1463          * we simply make /dev/kmsg unavailable to the container. */
1464         if (asprintf(&from, "%s/dev/kmsg", dest) < 0 ||
1465             asprintf(&to, "%s/proc/kmsg", dest) < 0)
1466                 return log_oom();
1467
1468         if (mkfifo(from, 0600) < 0)
1469                 return log_error_errno(errno, "mkfifo() for /dev/kmsg failed: %m");
1470
1471         r = chmod_and_chown(from, 0600, 0, 0);
1472         if (r < 0)
1473                 return log_error_errno(r, "Failed to correct access mode for /dev/kmsg: %m");
1474
1475         if (mount(from, to, "bind", MS_BIND, NULL) < 0)
1476                 return log_error_errno(errno, "Bind mount for /proc/kmsg failed: %m");
1477
1478         fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
1479         if (fd < 0)
1480                 return log_error_errno(errno, "Failed to open fifo: %m");
1481
1482         cmsg = CMSG_FIRSTHDR(&mh);
1483         cmsg->cmsg_level = SOL_SOCKET;
1484         cmsg->cmsg_type = SCM_RIGHTS;
1485         cmsg->cmsg_len = CMSG_LEN(sizeof(int));
1486         memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
1487
1488         mh.msg_controllen = cmsg->cmsg_len;
1489
1490         /* Store away the fd in the socket, so that it stays open as
1491          * long as we run the child */
1492         k = sendmsg(kmsg_socket, &mh, MSG_NOSIGNAL);
1493         safe_close(fd);
1494
1495         if (k < 0)
1496                 return log_error_errno(errno, "Failed to send FIFO fd: %m");
1497
1498         /* And now make the FIFO unavailable as /dev/kmsg... */
1499         unlink(from);
1500         return 0;
1501 }
1502
1503 static int send_rtnl(int send_fd) {
1504         union {
1505                 struct cmsghdr cmsghdr;
1506                 uint8_t buf[CMSG_SPACE(sizeof(int))];
1507         } control = {};
1508         struct msghdr mh = {
1509                 .msg_control = &control,
1510                 .msg_controllen = sizeof(control),
1511         };
1512         struct cmsghdr *cmsg;
1513         _cleanup_close_ int fd = -1;
1514         ssize_t k;
1515
1516         assert(send_fd >= 0);
1517
1518         if (!arg_expose_ports)
1519                 return 0;
1520
1521         fd = socket(PF_NETLINK, SOCK_RAW|SOCK_CLOEXEC|SOCK_NONBLOCK, NETLINK_ROUTE);
1522         if (fd < 0)
1523                 return log_error_errno(errno, "failed to allocate container netlink: %m");
1524
1525         cmsg = CMSG_FIRSTHDR(&mh);
1526         cmsg->cmsg_level = SOL_SOCKET;
1527         cmsg->cmsg_type = SCM_RIGHTS;
1528         cmsg->cmsg_len = CMSG_LEN(sizeof(int));
1529         memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
1530
1531         mh.msg_controllen = cmsg->cmsg_len;
1532
1533         /* Store away the fd in the socket, so that it stays open as
1534          * long as we run the child */
1535         k = sendmsg(send_fd, &mh, MSG_NOSIGNAL);
1536         if (k < 0)
1537                 return log_error_errno(errno, "Failed to send netlink fd: %m");
1538
1539         return 0;
1540 }
1541
1542 static int flush_ports(union in_addr_union *exposed) {
1543         ExposePort *p;
1544         int r, af = AF_INET;
1545
1546         assert(exposed);
1547
1548         if (!arg_expose_ports)
1549                 return 0;
1550
1551         if (in_addr_is_null(af, exposed))
1552                 return 0;
1553
1554         log_debug("Lost IP address.");
1555
1556         LIST_FOREACH(ports, p, arg_expose_ports) {
1557                 r = fw_add_local_dnat(false,
1558                                       af,
1559                                       p->protocol,
1560                                       NULL,
1561                                       NULL, 0,
1562                                       NULL, 0,
1563                                       p->host_port,
1564                                       exposed,
1565                                       p->container_port,
1566                                       NULL);
1567                 if (r < 0)
1568                         log_warning_errno(r, "Failed to modify firewall: %m");
1569         }
1570
1571         *exposed = IN_ADDR_NULL;
1572         return 0;
1573 }
1574
1575 static int expose_ports(sd_rtnl *rtnl, union in_addr_union *exposed) {
1576         _cleanup_free_ struct local_address *addresses = NULL;
1577         _cleanup_free_ char *pretty = NULL;
1578         union in_addr_union new_exposed;
1579         ExposePort *p;
1580         bool add;
1581         int af = AF_INET, r;
1582
1583         assert(exposed);
1584
1585         /* Invoked each time an address is added or removed inside the
1586          * container */
1587
1588         if (!arg_expose_ports)
1589                 return 0;
1590
1591         r = local_addresses(rtnl, 0, af, &addresses);
1592         if (r < 0)
1593                 return log_error_errno(r, "Failed to enumerate local addresses: %m");
1594
1595         add = r > 0 &&
1596                 addresses[0].family == af &&
1597                 addresses[0].scope < RT_SCOPE_LINK;
1598
1599         if (!add)
1600                 return flush_ports(exposed);
1601
1602         new_exposed = addresses[0].address;
1603         if (in_addr_equal(af, exposed, &new_exposed))
1604                 return 0;
1605
1606         in_addr_to_string(af, &new_exposed, &pretty);
1607         log_debug("New container IP is %s.", strna(pretty));
1608
1609         LIST_FOREACH(ports, p, arg_expose_ports) {
1610
1611                 r = fw_add_local_dnat(true,
1612                                       af,
1613                                       p->protocol,
1614                                       NULL,
1615                                       NULL, 0,
1616                                       NULL, 0,
1617                                       p->host_port,
1618                                       &new_exposed,
1619                                       p->container_port,
1620                                       in_addr_is_null(af, exposed) ? NULL : exposed);
1621                 if (r < 0)
1622                         log_warning_errno(r, "Failed to modify firewall: %m");
1623         }
1624
1625         *exposed = new_exposed;
1626         return 0;
1627 }
1628
1629 static int on_address_change(sd_rtnl *rtnl, sd_rtnl_message *m, void *userdata) {
1630         union in_addr_union *exposed = userdata;
1631
1632         assert(rtnl);
1633         assert(m);
1634         assert(exposed);
1635
1636         expose_ports(rtnl, exposed);
1637         return 0;
1638 }
1639
1640 static int watch_rtnl(sd_event *event, int recv_fd, union in_addr_union *exposed, sd_rtnl **ret) {
1641         union {
1642                 struct cmsghdr cmsghdr;
1643                 uint8_t buf[CMSG_SPACE(sizeof(int))];
1644         } control = {};
1645         struct msghdr mh = {
1646                 .msg_control = &control,
1647                 .msg_controllen = sizeof(control),
1648         };
1649         struct cmsghdr *cmsg;
1650         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1651         int fd, r;
1652         ssize_t k;
1653
1654         assert(event);
1655         assert(recv_fd >= 0);
1656         assert(ret);
1657
1658         if (!arg_expose_ports)
1659                 return 0;
1660
1661         k = recvmsg(recv_fd, &mh, MSG_NOSIGNAL);
1662         if (k < 0)
1663                 return log_error_errno(errno, "Failed to recv netlink fd: %m");
1664
1665         cmsg = CMSG_FIRSTHDR(&mh);
1666         assert(cmsg->cmsg_level == SOL_SOCKET);
1667         assert(cmsg->cmsg_type == SCM_RIGHTS);
1668         assert(cmsg->cmsg_len == CMSG_LEN(sizeof(int)));
1669         memcpy(&fd, CMSG_DATA(cmsg), sizeof(int));
1670
1671         r = sd_rtnl_open_fd(&rtnl, fd, 1, RTNLGRP_IPV4_IFADDR);
1672         if (r < 0) {
1673                 safe_close(fd);
1674                 return log_error_errno(r, "Failed to create rtnl object: %m");
1675         }
1676
1677         r = sd_rtnl_add_match(rtnl, RTM_NEWADDR, on_address_change, exposed);
1678         if (r < 0)
1679                 return log_error_errno(r, "Failed to subscribe to RTM_NEWADDR messages: %m");
1680
1681         r = sd_rtnl_add_match(rtnl, RTM_DELADDR, on_address_change, exposed);
1682         if (r < 0)
1683                 return log_error_errno(r, "Failed to subscribe to RTM_DELADDR messages: %m");
1684
1685         r = sd_rtnl_attach_event(rtnl, event, 0);
1686         if (r < 0)
1687                 return log_error_errno(r, "Failed to add to even loop: %m");
1688
1689         *ret = rtnl;
1690         rtnl = NULL;
1691
1692         return 0;
1693 }
1694
1695 static int setup_hostname(void) {
1696
1697         if (arg_share_system)
1698                 return 0;
1699
1700         if (sethostname_idempotent(arg_machine) < 0)
1701                 return -errno;
1702
1703         return 0;
1704 }
1705
1706 static int setup_journal(const char *directory) {
1707         sd_id128_t machine_id, this_id;
1708         _cleanup_free_ char *p = NULL, *b = NULL, *q = NULL, *d = NULL;
1709         char *id;
1710         int r;
1711
1712         /* Don't link journals in ephemeral mode */
1713         if (arg_ephemeral)
1714                 return 0;
1715
1716         p = strappend(directory, "/etc/machine-id");
1717         if (!p)
1718                 return log_oom();
1719
1720         r = read_one_line_file(p, &b);
1721         if (r == -ENOENT && arg_link_journal == LINK_AUTO)
1722                 return 0;
1723         else if (r < 0)
1724                 return log_error_errno(r, "Failed to read machine ID from %s: %m", p);
1725
1726         id = strstrip(b);
1727         if (isempty(id) && arg_link_journal == LINK_AUTO)
1728                 return 0;
1729
1730         /* Verify validity */
1731         r = sd_id128_from_string(id, &machine_id);
1732         if (r < 0)
1733                 return log_error_errno(r, "Failed to parse machine ID from %s: %m", p);
1734
1735         r = sd_id128_get_machine(&this_id);
1736         if (r < 0)
1737                 return log_error_errno(r, "Failed to retrieve machine ID: %m");
1738
1739         if (sd_id128_equal(machine_id, this_id)) {
1740                 log_full(arg_link_journal == LINK_AUTO ? LOG_WARNING : LOG_ERR,
1741                          "Host and machine ids are equal (%s): refusing to link journals", id);
1742                 if (arg_link_journal == LINK_AUTO)
1743                         return 0;
1744                 return -EEXIST;
1745         }
1746
1747         if (arg_link_journal == LINK_NO)
1748                 return 0;
1749
1750         free(p);
1751         p = strappend("/var/log/journal/", id);
1752         q = strjoin(directory, "/var/log/journal/", id, NULL);
1753         if (!p || !q)
1754                 return log_oom();
1755
1756         if (path_is_mount_point(p, false) > 0) {
1757                 if (arg_link_journal != LINK_AUTO) {
1758                         log_error("%s: already a mount point, refusing to use for journal", p);
1759                         return -EEXIST;
1760                 }
1761
1762                 return 0;
1763         }
1764
1765         if (path_is_mount_point(q, false) > 0) {
1766                 if (arg_link_journal != LINK_AUTO) {
1767                         log_error("%s: already a mount point, refusing to use for journal", q);
1768                         return -EEXIST;
1769                 }
1770
1771                 return 0;
1772         }
1773
1774         r = readlink_and_make_absolute(p, &d);
1775         if (r >= 0) {
1776                 if ((arg_link_journal == LINK_GUEST ||
1777                      arg_link_journal == LINK_AUTO) &&
1778                     path_equal(d, q)) {
1779
1780                         r = mkdir_p(q, 0755);
1781                         if (r < 0)
1782                                 log_warning_errno(errno, "Failed to create directory %s: %m", q);
1783                         return 0;
1784                 }
1785
1786                 if (unlink(p) < 0)
1787                         return log_error_errno(errno, "Failed to remove symlink %s: %m", p);
1788         } else if (r == -EINVAL) {
1789
1790                 if (arg_link_journal == LINK_GUEST &&
1791                     rmdir(p) < 0) {
1792
1793                         if (errno == ENOTDIR) {
1794                                 log_error("%s already exists and is neither a symlink nor a directory", p);
1795                                 return r;
1796                         } else {
1797                                 log_error_errno(errno, "Failed to remove %s: %m", p);
1798                                 return -errno;
1799                         }
1800                 }
1801         } else if (r != -ENOENT) {
1802                 log_error_errno(errno, "readlink(%s) failed: %m", p);
1803                 return r;
1804         }
1805
1806         if (arg_link_journal == LINK_GUEST) {
1807
1808                 if (symlink(q, p) < 0) {
1809                         if (arg_link_journal_try) {
1810                                 log_debug_errno(errno, "Failed to symlink %s to %s, skipping journal setup: %m", q, p);
1811                                 return 0;
1812                         } else {
1813                                 log_error_errno(errno, "Failed to symlink %s to %s: %m", q, p);
1814                                 return -errno;
1815                         }
1816                 }
1817
1818                 r = mkdir_p(q, 0755);
1819                 if (r < 0)
1820                         log_warning_errno(errno, "Failed to create directory %s: %m", q);
1821                 return 0;
1822         }
1823
1824         if (arg_link_journal == LINK_HOST) {
1825                 /* don't create parents here -- if the host doesn't have
1826                  * permanent journal set up, don't force it here */
1827                 r = mkdir(p, 0755);
1828                 if (r < 0) {
1829                         if (arg_link_journal_try) {
1830                                 log_debug_errno(errno, "Failed to create %s, skipping journal setup: %m", p);
1831                                 return 0;
1832                         } else {
1833                                 log_error_errno(errno, "Failed to create %s: %m", p);
1834                                 return r;
1835                         }
1836                 }
1837
1838         } else if (access(p, F_OK) < 0)
1839                 return 0;
1840
1841         if (dir_is_empty(q) == 0)
1842                 log_warning("%s is not empty, proceeding anyway.", q);
1843
1844         r = mkdir_p(q, 0755);
1845         if (r < 0) {
1846                 log_error_errno(errno, "Failed to create %s: %m", q);
1847                 return r;
1848         }
1849
1850         if (mount(p, q, "bind", MS_BIND, NULL) < 0)
1851                 return log_error_errno(errno, "Failed to bind mount journal from host into guest: %m");
1852
1853         return 0;
1854 }
1855
1856 static int drop_capabilities(void) {
1857         return capability_bounding_set_drop(~arg_retain, false);
1858 }
1859
/* Registers the container with org.freedesktop.machine1 on the system
 * bus, unless registration is disabled (arg_register unset).
 *
 * pid:           PID reported for the machine
 * local_ifindex: interface index passed along with the registration;
 *                values <= 0 result in an empty index array
 *
 * With arg_keep_unit set, RegisterMachineWithNetwork is called;
 * otherwise CreateMachineWithNetwork is called with additional
 * properties (slice, device policy and device allow list).
 *
 * Returns 0 on success, a negative errno-style value on failure. */
static int register_machine(pid_t pid, int local_ifindex) {
        _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
        _cleanup_bus_close_unref_ sd_bus *bus = NULL;
        int r;

        if (!arg_register)
                return 0;

        r = sd_bus_default_system(&bus);
        if (r < 0)
                return log_error_errno(r, "Failed to open system bus: %m");

        if (arg_keep_unit) {
                r = sd_bus_call_method(
                                bus,
                                "org.freedesktop.machine1",
                                "/org/freedesktop/machine1",
                                "org.freedesktop.machine1.Manager",
                                "RegisterMachineWithNetwork",
                                &error,
                                NULL,
                                "sayssusai",
                                arg_machine,
                                SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
                                "nspawn",
                                "container",
                                (uint32_t) pid,
                                strempty(arg_directory),
                                /* array length (0 or 1), followed by the single element */
                                local_ifindex > 0 ? 1 : 0, local_ifindex);
        } else {
                _cleanup_bus_message_unref_ sd_bus_message *m = NULL;

                r = sd_bus_message_new_method_call(
                                bus,
                                &m,
                                "org.freedesktop.machine1",
                                "/org/freedesktop/machine1",
                                "org.freedesktop.machine1.Manager",
                                "CreateMachineWithNetwork");
                if (r < 0)
                        return log_error_errno(r, "Failed to create message: %m");

                /* Same leading arguments as in the RegisterMachineWithNetwork call above */
                r = sd_bus_message_append(
                                m,
                                "sayssusai",
                                arg_machine,
                                SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
                                "nspawn",
                                "container",
                                (uint32_t) pid,
                                strempty(arg_directory),
                                local_ifindex > 0 ? 1 : 0, local_ifindex);
                if (r < 0)
                        return log_error_errno(r, "Failed to append message arguments: %m");

                /* Array of (name, variant) properties; closed again further down */
                r = sd_bus_message_open_container(m, 'a', "(sv)");
                if (r < 0)
                        return log_error_errno(r, "Failed to open container: %m");

                /* The slice is only passed if one was configured */
                if (!isempty(arg_slice)) {
                        r = sd_bus_message_append(m, "(sv)", "Slice", "s", arg_slice);
                        if (r < 0)
                                return log_error_errno(r, "Failed to append slice: %m");
                }

                r = sd_bus_message_append(m, "(sv)", "DevicePolicy", "s", "strict");
                if (r < 0)
                        return log_error_errno(r, "Failed to add device policy: %m");

                /* The 9 is the number of (path, access) pairs that follow */
                r = sd_bus_message_append(m, "(sv)", "DeviceAllow", "a(ss)", 9,
                                          /* Allow the container to
                                           * access and create the API
                                           * device nodes, so that
                                           * PrivateDevices= in the
                                           * container can work
                                           * fine */
                                          "/dev/null", "rwm",
                                          "/dev/zero", "rwm",
                                          "/dev/full", "rwm",
                                          "/dev/random", "rwm",
                                          "/dev/urandom", "rwm",
                                          "/dev/tty", "rwm",
                                          "/dev/net/tun", "rwm",
                                          /* Allow the container
                                           * access to ptys. However,
                                           * do not permit the
                                           * container to ever create
                                           * these device nodes. */
                                          "/dev/pts/ptmx", "rw",
                                          "char-pts", "rw");
                if (r < 0)
                        return log_error_errno(r, "Failed to add device whitelist: %m");

                r = sd_bus_message_close_container(m);
                if (r < 0)
                        return log_error_errno(r, "Failed to close container: %m");

                r = sd_bus_call(bus, m, 0, &error, NULL);
        }

        /* Common error handling for the method call of either branch */
        if (r < 0) {
                log_error("Failed to register machine: %s", bus_error_message(&error, r));
                return r;
        }

        return 0;
}
1967
1968 static int terminate_machine(pid_t pid) {
1969         _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1970         _cleanup_bus_message_unref_ sd_bus_message *reply = NULL;
1971         _cleanup_bus_close_unref_ sd_bus *bus = NULL;
1972         const char *path;
1973         int r;
1974
1975         if (!arg_register)
1976                 return 0;
1977
1978         r = sd_bus_default_system(&bus);
1979         if (r < 0)
1980                 return log_error_errno(r, "Failed to open system bus: %m");
1981
1982         r = sd_bus_call_method(
1983                         bus,
1984                         "org.freedesktop.machine1",
1985                         "/org/freedesktop/machine1",
1986                         "org.freedesktop.machine1.Manager",
1987                         "GetMachineByPID",
1988                         &error,
1989                         &reply,
1990                         "u",
1991                         (uint32_t) pid);
1992         if (r < 0) {
1993                 /* Note that the machine might already have been
1994                  * cleaned up automatically, hence don't consider it a
1995                  * failure if we cannot get the machine object. */
1996                 log_debug("Failed to get machine: %s", bus_error_message(&error, r));
1997                 return 0;
1998         }
1999
2000         r = sd_bus_message_read(reply, "o", &path);
2001         if (r < 0)
2002                 return bus_log_parse_error(r);
2003
2004         r = sd_bus_call_method(
2005                         bus,
2006                         "org.freedesktop.machine1",
2007                         path,
2008                         "org.freedesktop.machine1.Machine",
2009                         "Terminate",
2010                         &error,
2011                         NULL,
2012                         NULL);
2013         if (r < 0) {
2014                 log_debug("Failed to terminate machine: %s", bus_error_message(&error, r));
2015                 return 0;
2016         }
2017
2018         return 0;
2019 }
2020
2021 static int reset_audit_loginuid(void) {
2022         _cleanup_free_ char *p = NULL;
2023         int r;
2024
2025         if (arg_share_system)
2026                 return 0;
2027
2028         r = read_one_line_file("/proc/self/loginuid", &p);
2029         if (r == -ENOENT)
2030                 return 0;
2031         if (r < 0)
2032                 return log_error_errno(r, "Failed to read /proc/self/loginuid: %m");
2033
2034         /* Already reset? */
2035         if (streq(p, "4294967295"))
2036                 return 0;
2037
2038         r = write_string_file("/proc/self/loginuid", "4294967295");
2039         if (r < 0) {
2040                 log_error("Failed to reset audit login UID. This probably means that your kernel is too\n"
2041                           "old and you have audit enabled. Note that the auditing subsystem is known to\n"
2042                           "be incompatible with containers on old kernels. Please make sure to upgrade\n"
2043                           "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
2044                           "using systemd-nspawn. Sleeping for 5s... (%s)\n", strerror(-r));
2045
2046                 sleep(5);
2047         }
2048
2049         return 0;
2050 }
2051
/* Fixed 128bit keys passed as hash_key to generate_mac(): one for the
 * host side of the veth link, one for the container side, and one for
 * MACVLAN interfaces. */
#define HOST_HASH_KEY SD_ID128_MAKE(1a,37,6f,c7,46,ec,45,0b,ad,a3,d5,31,06,60,5d,b1)
#define CONTAINER_HASH_KEY SD_ID128_MAKE(c3,c4,f9,19,b5,57,b2,1c,e6,cf,14,27,03,9c,ee,a2)
#define MACVLAN_HASH_KEY SD_ID128_MAKE(00,13,6d,bc,66,83,44,81,bb,0c,f9,51,1f,24,a6,6f)
2055
2056 static int generate_mac(struct ether_addr *mac, sd_id128_t hash_key, uint64_t idx) {
2057         uint8_t result[8];
2058         size_t l, sz;
2059         uint8_t *v, *i;
2060         int r;
2061
2062         l = strlen(arg_machine);
2063         sz = sizeof(sd_id128_t) + l;
2064         if (idx > 0)
2065                 sz += sizeof(idx);
2066
2067         v = alloca(sz);
2068
2069         /* fetch some persistent data unique to the host */
2070         r = sd_id128_get_machine((sd_id128_t*) v);
2071         if (r < 0)
2072                 return r;
2073
2074         /* combine with some data unique (on this host) to this
2075          * container instance */
2076         i = mempcpy(v + sizeof(sd_id128_t), arg_machine, l);
2077         if (idx > 0) {
2078                 idx = htole64(idx);
2079                 memcpy(i, &idx, sizeof(idx));
2080         }
2081
2082         /* Let's hash the host machine ID plus the container name. We
2083          * use a fixed, but originally randomly created hash key here. */
2084         siphash24(result, v, sz, hash_key.bytes);
2085
2086         assert_cc(ETH_ALEN <= sizeof(result));
2087         memcpy(mac->ether_addr_octet, result, ETH_ALEN);
2088
2089         /* see eth_random_addr in the kernel */
2090         mac->ether_addr_octet[0] &= 0xfe;        /* clear multicast bit */
2091         mac->ether_addr_octet[0] |= 0x02;        /* set local assignment bit (IEEE802) */
2092
2093         return 0;
2094 }
2095
/* Creates a veth link via netlink: one interface named "vb-<machine>"
 * (bridge mode) or "ve-<machine>" is created with the host MAC address,
 * its peer is named "host0", gets the container MAC address and is
 * assigned to the network namespace of pid.
 *
 * pid:        process whose network namespace receives the peer
 * iface_name: buffer that receives the name of the first interface
 * ifi:        receives the interface index of iface_name on success
 *
 * No-op (returns 0, outputs untouched) unless both arg_private_network
 * and arg_network_veth are set. Returns 0 on success, a negative
 * errno-style value on failure. */
static int setup_veth(pid_t pid, char iface_name[IFNAMSIZ], int *ifi) {
        _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
        _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
        struct ether_addr mac_host, mac_container;
        int r, i;

        if (!arg_private_network)
                return 0;

        if (!arg_network_veth)
                return 0;

        /* Use two different interface name prefixes depending whether
         * we are in bridge mode or not. */
        /* snprintf() truncates the name to fit the size passed here */
        snprintf(iface_name, IFNAMSIZ - 1, "%s-%s",
                 arg_network_bridge ? "vb" : "ve", arg_machine);

        r = generate_mac(&mac_container, CONTAINER_HASH_KEY, 0);
        if (r < 0)
                return log_error_errno(r, "Failed to generate predictable MAC address for container side: %m");

        r = generate_mac(&mac_host, HOST_HASH_KEY, 0);
        if (r < 0)
                return log_error_errno(r, "Failed to generate predictable MAC address for host side: %m");

        r = sd_rtnl_open(&rtnl, 0);
        if (r < 0)
                return log_error_errno(r, "Failed to connect to netlink: %m");

        r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
        if (r < 0)
                return log_error_errno(r, "Failed to allocate netlink message: %m");

        /* Attributes of the first interface: name and MAC address */
        r = sd_rtnl_message_append_string(m, IFLA_IFNAME, iface_name);
        if (r < 0)
                return log_error_errno(r, "Failed to add netlink interface name: %m");

        r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac_host);
        if (r < 0)
                return log_error_errno(r, "Failed to add netlink MAC address: %m");

        /* Three nested containers are opened here (IFLA_LINKINFO >
         * IFLA_INFO_DATA "veth" > VETH_INFO_PEER) and closed again
         * further down, after the peer's attributes were added */
        r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
        if (r < 0)
                return log_error_errno(r, "Failed to open netlink container: %m");

        r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "veth");
        if (r < 0)
                return log_error_errno(r, "Failed to open netlink container: %m");

        r = sd_rtnl_message_open_container(m, VETH_INFO_PEER);
        if (r < 0)
                return log_error_errno(r, "Failed to open netlink container: %m");

        /* Attributes of the peer: name, MAC address and the process
         * whose network namespace it is placed in */
        r = sd_rtnl_message_append_string(m, IFLA_IFNAME, "host0");
        if (r < 0)
                return log_error_errno(r, "Failed to add netlink interface name: %m");

        r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac_container);
        if (r < 0)
                return log_error_errno(r, "Failed to add netlink MAC address: %m");

        r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
        if (r < 0)
                return log_error_errno(r, "Failed to add netlink namespace field: %m");

        /* Close the three containers opened above */
        r = sd_rtnl_message_close_container(m);
        if (r < 0)
                return log_error_errno(r, "Failed to close netlink container: %m");

        r = sd_rtnl_message_close_container(m);
        if (r < 0)
                return log_error_errno(r, "Failed to close netlink container: %m");

        r = sd_rtnl_message_close_container(m);
        if (r < 0)
                return log_error_errno(r, "Failed to close netlink container: %m");

        r = sd_rtnl_call(rtnl, m, 0, NULL);
        if (r < 0)
                return log_error_errno(r, "Failed to add new veth interfaces: %m");

        /* Resolve the index of the interface we just created */
        i = (int) if_nametoindex(iface_name);
        if (i <= 0)
                return log_error_errno(errno, "Failed to resolve interface %s: %m", iface_name);

        *ifi = i;

        return 0;
}
2185
2186 static int setup_bridge(const char veth_name[], int *ifi) {
2187         _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
2188         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
2189         int r, bridge;
2190
2191         if (!arg_private_network)
2192                 return 0;
2193
2194         if (!arg_network_veth)
2195                 return 0;
2196
2197         if (!arg_network_bridge)
2198                 return 0;
2199
2200         bridge = (int) if_nametoindex(arg_network_bridge);
2201         if (bridge <= 0)
2202                 return log_error_errno(errno, "Failed to resolve interface %s: %m", arg_network_bridge);
2203
2204         *ifi = bridge;
2205
2206         r = sd_rtnl_open(&rtnl, 0);
2207         if (r < 0)
2208                 return log_error_errno(r, "Failed to connect to netlink: %m");
2209
2210         r = sd_rtnl_message_new_link(rtnl, &m, RTM_SETLINK, 0);
2211         if (r < 0)
2212                 return log_error_errno(r, "Failed to allocate netlink message: %m");
2213
2214         r = sd_rtnl_message_link_set_flags(m, IFF_UP, IFF_UP);
2215         if (r < 0)
2216                 return log_error_errno(r, "Failed to set IFF_UP flag: %m");
2217
2218         r = sd_rtnl_message_append_string(m, IFLA_IFNAME, veth_name);
2219         if (r < 0)
2220                 return log_error_errno(r, "Failed to add netlink interface name field: %m");
2221
2222         r = sd_rtnl_message_append_u32(m, IFLA_MASTER, bridge);
2223         if (r < 0)
2224                 return log_error_errno(r, "Failed to add netlink master field: %m");
2225
2226         r = sd_rtnl_call(rtnl, m, 0, NULL);
2227         if (r < 0)
2228                 return log_error_errno(r, "Failed to add veth interface to bridge: %m");
2229
2230         return 0;
2231 }
2232
2233 static int parse_interface(struct udev *udev, const char *name) {
2234         _cleanup_udev_device_unref_ struct udev_device *d = NULL;
2235         char ifi_str[2 + DECIMAL_STR_MAX(int)];
2236         int ifi;
2237
2238         ifi = (int) if_nametoindex(name);
2239         if (ifi <= 0)
2240                 return log_error_errno(errno, "Failed to resolve interface %s: %m", name);
2241
2242         sprintf(ifi_str, "n%i", ifi);
2243         d = udev_device_new_from_device_id(udev, ifi_str);
2244         if (!d)
2245                 return log_error_errno(errno, "Failed to get udev device for interface %s: %m", name);
2246
2247         if (udev_device_get_is_initialized(d) <= 0) {
2248                 log_error("Network interface %s is not initialized yet.", name);
2249                 return -EBUSY;
2250         }
2251
2252         return ifi;
2253 }
2254
2255 static int move_network_interfaces(pid_t pid) {
2256         _cleanup_udev_unref_ struct udev *udev = NULL;
2257         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
2258         char **i;
2259         int r;
2260
2261         if (!arg_private_network)
2262                 return 0;
2263
2264         if (strv_isempty(arg_network_interfaces))
2265                 return 0;
2266
2267         r = sd_rtnl_open(&rtnl, 0);
2268         if (r < 0)
2269                 return log_error_errno(r, "Failed to connect to netlink: %m");
2270
2271         udev = udev_new();
2272         if (!udev) {
2273                 log_error("Failed to connect to udev.");
2274                 return -ENOMEM;
2275         }
2276
2277         STRV_FOREACH(i, arg_network_interfaces) {
2278                 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
2279                 int ifi;
2280
2281                 ifi = parse_interface(udev, *i);
2282                 if (ifi < 0)
2283                         return ifi;
2284
2285                 r = sd_rtnl_message_new_link(rtnl, &m, RTM_SETLINK, ifi);
2286                 if (r < 0)
2287                         return log_error_errno(r, "Failed to allocate netlink message: %m");
2288
2289                 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
2290                 if (r < 0)
2291                         return log_error_errno(r, "Failed to append namespace PID to netlink message: %m");
2292
2293                 r = sd_rtnl_call(rtnl, m, 0, NULL);
2294                 if (r < 0)
2295                         return log_error_errno(r, "Failed to move interface %s to namespace: %m", *i);
2296         }
2297
2298         return 0;
2299 }
2300
2301 static int setup_macvlan(pid_t pid) {
2302         _cleanup_udev_unref_ struct udev *udev = NULL;
2303         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
2304         unsigned idx = 0;
2305         char **i;
2306         int r;
2307
2308         if (!arg_private_network)
2309                 return 0;
2310
2311         if (strv_isempty(arg_network_macvlan))
2312                 return 0;
2313
2314         r = sd_rtnl_open(&rtnl, 0);
2315         if (r < 0)
2316                 return log_error_errno(r, "Failed to connect to netlink: %m");
2317
2318         udev = udev_new();
2319         if (!udev) {
2320                 log_error("Failed to connect to udev.");
2321                 return -ENOMEM;
2322         }
2323
2324         STRV_FOREACH(i, arg_network_macvlan) {
2325                 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
2326                 _cleanup_free_ char *n = NULL;
2327                 struct ether_addr mac;
2328                 int ifi;
2329
2330                 ifi = parse_interface(udev, *i);
2331                 if (ifi < 0)
2332                         return ifi;
2333
2334                 r = generate_mac(&mac, MACVLAN_HASH_KEY, idx++);
2335                 if (r < 0)
2336                         return log_error_errno(r, "Failed to create MACVLAN MAC address: %m");
2337
2338                 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
2339                 if (r < 0)
2340                         return log_error_errno(r, "Failed to allocate netlink message: %m");
2341
2342                 r = sd_rtnl_message_append_u32(m, IFLA_LINK, ifi);
2343                 if (r < 0)
2344                         return log_error_errno(r, "Failed to add netlink interface index: %m");
2345
2346                 n = strappend("mv-", *i);
2347                 if (!n)
2348                         return log_oom();
2349
2350                 strshorten(n, IFNAMSIZ-1);
2351
2352                 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, n);
2353                 if (r < 0)
2354                         return log_error_errno(r, "Failed to add netlink interface name: %m");
2355
2356                 r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac);
2357                 if (r < 0)
2358                         return log_error_errno(r, "Failed to add netlink MAC address: %m");
2359
2360                 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
2361                 if (r < 0)
2362                         return log_error_errno(r, "Failed to add netlink namespace field: %m");
2363
2364                 r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
2365                 if (r < 0)
2366                         return log_error_errno(r, "Failed to open netlink container: %m");
2367
2368                 r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "macvlan");
2369                 if (r < 0)
2370                         return log_error_errno(r, "Failed to open netlink container: %m");
2371
2372                 r = sd_rtnl_message_append_u32(m, IFLA_MACVLAN_MODE, MACVLAN_MODE_BRIDGE);
2373                 if (r < 0)
2374                         return log_error_errno(r, "Failed to append macvlan mode: %m");
2375
2376                 r = sd_rtnl_message_close_container(m);
2377                 if (r < 0)
2378                         return log_error_errno(r, "Failed to close netlink container: %m");
2379
2380                 r = sd_rtnl_message_close_container(m);
2381                 if (r < 0)
2382                         return log_error_errno(r, "Failed to close netlink container: %m");
2383
2384                 r = sd_rtnl_call(rtnl, m, 0, NULL);
2385                 if (r < 0)
2386                         return log_error_errno(r, "Failed to add new macvlan interfaces: %m");
2387         }
2388
2389         return 0;
2390 }
2391
2392 static int setup_ipvlan(pid_t pid) {
2393         _cleanup_udev_unref_ struct udev *udev = NULL;
2394         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
2395         char **i;
2396         int r;
2397
2398         if (!arg_private_network)
2399                 return 0;
2400
2401         if (strv_isempty(arg_network_ipvlan))
2402                 return 0;
2403
2404         r = sd_rtnl_open(&rtnl, 0);
2405         if (r < 0)
2406                 return log_error_errno(r, "Failed to connect to netlink: %m");
2407
2408         udev = udev_new();
2409         if (!udev) {
2410                 log_error("Failed to connect to udev.");
2411                 return -ENOMEM;
2412         }
2413
2414         STRV_FOREACH(i, arg_network_ipvlan) {
2415                 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
2416                 _cleanup_free_ char *n = NULL;
2417                 int ifi;
2418
2419                 ifi = parse_interface(udev, *i);
2420                 if (ifi < 0)
2421                         return ifi;
2422
2423                 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
2424                 if (r < 0)
2425                         return log_error_errno(r, "Failed to allocate netlink message: %m");
2426
2427                 r = sd_rtnl_message_append_u32(m, IFLA_LINK, ifi);
2428                 if (r < 0)
2429                         return log_error_errno(r, "Failed to add netlink interface index: %m");
2430
2431                 n = strappend("iv-", *i);
2432                 if (!n)
2433                         return log_oom();
2434
2435                 strshorten(n, IFNAMSIZ-1);
2436
2437                 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, n);
2438                 if (r < 0)
2439                         return log_error_errno(r, "Failed to add netlink interface name: %m");
2440
2441                 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
2442                 if (r < 0)
2443                         return log_error_errno(r, "Failed to add netlink namespace field: %m");
2444
2445                 r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
2446                 if (r < 0)
2447                         return log_error_errno(r, "Failed to open netlink container: %m");
2448
2449                 r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "ipvlan");
2450                 if (r < 0)
2451                         return log_error_errno(r, "Failed to open netlink container: %m");
2452
2453                 r = sd_rtnl_message_append_u16(m, IFLA_IPVLAN_MODE, IPVLAN_MODE_L2);
2454                 if (r < 0)
2455                         return log_error_errno(r, "Failed to add ipvlan mode: %m");
2456
2457                 r = sd_rtnl_message_close_container(m);
2458                 if (r < 0)
2459                         return log_error_errno(r, "Failed to close netlink container: %m");
2460
2461                 r = sd_rtnl_message_close_container(m);
2462                 if (r < 0)
2463                         return log_error_errno(r, "Failed to close netlink container: %m");
2464
2465                 r = sd_rtnl_call(rtnl, m, 0, NULL);
2466                 if (r < 0)
2467                         return log_error_errno(r, "Failed to add new ipvlan interfaces: %m");
2468         }
2469
2470         return 0;
2471 }
2472
/* Installs the seccomp filter for the container payload.
 *
 * The filter allows everything by default, makes a fixed list of system calls
 * fail with EPERM, and makes socket(AF_NETLINK, *, NETLINK_AUDIT) fail with
 * EAFNOSUPPORT. The NO_NEW_PRIVS filter attribute is set to 0 before the
 * filter is loaded.
 *
 * Returns 0 on success or when built without HAVE_SECCOMP, a negative
 * errno-style value otherwise. */
static int setup_seccomp(void) {

#ifdef HAVE_SECCOMP
        static const int denied_syscalls[] = {
                SCMP_SYS(kexec_load),
                SCMP_SYS(open_by_handle_at),
                SCMP_SYS(init_module),
                SCMP_SYS(finit_module),
                SCMP_SYS(delete_module),
                SCMP_SYS(iopl),
                SCMP_SYS(ioperm),
                SCMP_SYS(swapon),
                SCMP_SYS(swapoff),
        };

        scmp_filter_ctx ctx;
        unsigned k;
        int r;

        ctx = seccomp_init(SCMP_ACT_ALLOW);
        if (!ctx)
                return log_oom();

        r = seccomp_add_secondary_archs(ctx);
        if (r < 0) {
                log_error_errno(r, "Failed to add secondary archs to seccomp filter: %m");
                goto out;
        }

        for (k = 0; k < ELEMENTSOF(denied_syscalls); k++) {
                r = seccomp_rule_add(ctx, SCMP_ACT_ERRNO(EPERM), denied_syscalls[k], 0);

                /* -EFAULT is treated as "unknown syscall" and skipped. */
                if (r < 0 && r != -EFAULT) {
                        log_error_errno(r, "Failed to block syscall: %m");
                        goto out;
                }
        }

        /*
           Audit is broken in containers, and much of the userspace
           audit hookup fails when run inside one. We do not care and
           simply turn off creation of audit sockets.

           This makes socket(AF_NETLINK, *, NETLINK_AUDIT) fail with
           EAFNOSUPPORT, which audit userspace takes as an indication
           that audit is disabled in the kernel.
         */

        r = seccomp_rule_add(
                        ctx,
                        SCMP_ACT_ERRNO(EAFNOSUPPORT),
                        SCMP_SYS(socket),
                        2,
                        SCMP_A0(SCMP_CMP_EQ, AF_NETLINK),
                        SCMP_A2(SCMP_CMP_EQ, NETLINK_AUDIT));
        if (r < 0) {
                log_error_errno(r, "Failed to add audit seccomp rule: %m");
                goto out;
        }

        r = seccomp_attr_set(ctx, SCMP_FLTATR_CTL_NNP, 0);
        if (r < 0) {
                log_error_errno(r, "Failed to unset NO_NEW_PRIVS: %m");
                goto out;
        }

        r = seccomp_load(ctx);
        if (r < 0)
                log_error_errno(r, "Failed to install seccomp audit filter: %m");

out:
        /* Reached on both success and failure; r carries the result. */
        seccomp_release(ctx);
        return r;
#else
        return 0;
#endif

}
2552
2553 static int setup_propagate(const char *root) {
2554         const char *p, *q;
2555
2556         (void) mkdir_p("/run/systemd/nspawn/", 0755);
2557         (void) mkdir_p("/run/systemd/nspawn/propagate", 0600);
2558         p = strappenda("/run/systemd/nspawn/propagate/", arg_machine);
2559         (void) mkdir_p(p, 0600);
2560
2561         q = strappenda(root, "/run/systemd/nspawn/incoming");
2562         mkdir_parents(q, 0755);
2563         mkdir_p(q, 0600);
2564
2565         if (mount(p, q, NULL, MS_BIND, NULL) < 0)
2566                 return log_error_errno(errno, "Failed to install propagation bind mount.");
2567
2568         if (mount(NULL, q, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY, NULL) < 0)
2569                 return log_error_errno(errno, "Failed to make propagation mount read-only");
2570
2571         return 0;
2572 }
2573
2574 static int setup_image(char **device_path, int *loop_nr) {
2575         struct loop_info64 info = {
2576                 .lo_flags = LO_FLAGS_AUTOCLEAR|LO_FLAGS_PARTSCAN
2577         };
2578         _cleanup_close_ int fd = -1, control = -1, loop = -1;
2579         _cleanup_free_ char* loopdev = NULL;
2580         struct stat st;
2581         int r, nr;
2582
2583         assert(device_path);
2584         assert(loop_nr);
2585         assert(arg_image);
2586
2587         fd = open(arg_image, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
2588         if (fd < 0)
2589                 return log_error_errno(errno, "Failed to open %s: %m", arg_image);
2590
2591         if (fstat(fd, &st) < 0)
2592                 return log_error_errno(errno, "Failed to stat %s: %m", arg_image);
2593
2594         if (S_ISBLK(st.st_mode)) {
2595                 char *p;
2596
2597                 p = strdup(arg_image);
2598                 if (!p)
2599                         return log_oom();
2600
2601                 *device_path = p;
2602
2603                 *loop_nr = -1;
2604
2605                 r = fd;
2606                 fd = -1;
2607
2608                 return r;
2609         }
2610
2611         if (!S_ISREG(st.st_mode)) {
2612                 log_error_errno(errno, "%s is not a regular file or block device: %m", arg_image);
2613                 return -EINVAL;
2614         }
2615
2616         control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
2617         if (control < 0)
2618                 return log_error_errno(errno, "Failed to open /dev/loop-control: %m");
2619
2620         nr = ioctl(control, LOOP_CTL_GET_FREE);
2621         if (nr < 0)
2622                 return log_error_errno(errno, "Failed to allocate loop device: %m");
2623
2624         if (asprintf(&loopdev, "/dev/loop%i", nr) < 0)
2625                 return log_oom();
2626
2627         loop = open(loopdev, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
2628         if (loop < 0)
2629                 return log_error_errno(errno, "Failed to open loop device %s: %m", loopdev);
2630
2631         if (ioctl(loop, LOOP_SET_FD, fd) < 0)
2632                 return log_error_errno(errno, "Failed to set loopback file descriptor on %s: %m", loopdev);
2633
2634         if (arg_read_only)
2635                 info.lo_flags |= LO_FLAGS_READ_ONLY;
2636
2637         if (ioctl(loop, LOOP_SET_STATUS64, &info) < 0)
2638                 return log_error_errno(errno, "Failed to set loopback settings on %s: %m", loopdev);
2639
2640         *device_path = loopdev;
2641         loopdev = NULL;
2642
2643         *loop_nr = nr;
2644
2645         r = loop;
2646         loop = -1;
2647
2648         return r;
2649 }
2650
/* Explanatory text appended to the error messages that are logged when a disk
 * image has no usable partition table or no identifiable root partition. */
#define PARTITION_TABLE_BLURB \
        "Note that the disk image needs to either contain only a single MBR partition of\n" \
        "type 0x83 that is marked bootable, or a single GPT partition of type " \
        "0FC63DAF-8483-4772-8E79-3D69D8477DE4 or follow\n" \
        "    http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/\n" \
        "to be bootable with systemd-nspawn."
2657
/* Determines which partitions of the block device open on fd shall be used as
 * root, /home and /srv of the container.
 *
 * The partition table (GPT or MBR) is read with libblkid. The function then
 * waits (re-reading the partition table if needed) until udev enumerates as
 * many child devices as blkid found partitions, and classifies each of them:
 *
 *   GPT: partitions flagged GPT_FLAG_NO_AUTO are ignored. Partitions of type
 *        GPT_HOME, GPT_SRV, GPT_ROOT_NATIVE and GPT_ROOT_SECONDARY are picked
 *        up, the lowest partition number winning for each type. Partitions of
 *        type GPT_LINUX_GENERIC are remembered as generic candidates.
 *   MBR: partitions of type 0x83 whose flags equal 0x80 (bootable) are
 *        remembered as generic candidates.
 *
 * The root device is, in order of preference, the native root, the secondary
 * root (*secondary is set to true then), or the generic candidate, provided
 * there is exactly one.
 *
 * On success 0 is returned, *root_device and *root_device_rw and *secondary
 * are set, and *home_device / *srv_device (with their *_rw flags) are set if
 * such partitions were found. The device strings are newly allocated and
 * owned by the caller. On failure a negative errno-style value is returned;
 * -ENOTSUP if built without HAVE_BLKID. */
static int dissect_image(
                int fd,
                char **root_device, bool *root_device_rw,
                char **home_device, bool *home_device_rw,
                char **srv_device, bool *srv_device_rw,
                bool *secondary) {

#ifdef HAVE_BLKID
        int home_nr = -1, srv_nr = -1;
#ifdef GPT_ROOT_NATIVE
        int root_nr = -1;
#endif
#ifdef GPT_ROOT_SECONDARY
        int secondary_root_nr = -1;
#endif
        _cleanup_free_ char *home = NULL, *root = NULL, *secondary_root = NULL, *srv = NULL, *generic = NULL;
        _cleanup_udev_enumerate_unref_ struct udev_enumerate *e = NULL;
        _cleanup_udev_device_unref_ struct udev_device *d = NULL;
        _cleanup_blkid_free_probe_ blkid_probe b = NULL;
        _cleanup_udev_unref_ struct udev *udev = NULL;
        struct udev_list_entry *first, *item;
        bool home_rw = true, root_rw = true, secondary_root_rw = true, srv_rw = true, generic_rw = true;
        bool is_gpt, is_mbr, multiple_generic = false;
        const char *pttype = NULL;
        blkid_partlist pl;
        struct stat st;
        unsigned i;
        int r;

        assert(fd >= 0);
        assert(root_device);
        assert(home_device);
        assert(srv_device);
        assert(secondary);
        assert(arg_image);

        b = blkid_new_probe();
        if (!b)
                return log_oom();

        errno = 0;
        r = blkid_probe_set_device(b, fd, 0, 0);
        if (r != 0) {
                /* A failure without errno set is treated as out-of-memory. */
                if (errno == 0)
                        return log_oom();

                log_error_errno(errno, "Failed to set device on blkid probe: %m");
                return -errno;
        }

        blkid_probe_enable_partitions(b, 1);
        blkid_probe_set_partitions_flags(b, BLKID_PARTS_ENTRY_DETAILS);

        errno = 0;
        r = blkid_do_safeprobe(b);
        if (r == -2 || r == 1) {
                log_error("Failed to identify any partition table on\n"
                          "    %s\n"
                          PARTITION_TABLE_BLURB, arg_image);
                return -EINVAL;
        } else if (r != 0) {
                if (errno == 0)
                        errno = EIO;
                log_error_errno(errno, "Failed to probe: %m");
                return -errno;
        }

        blkid_probe_lookup_value(b, "PTTYPE", &pttype, NULL);

        is_gpt = streq_ptr(pttype, "gpt");
        is_mbr = streq_ptr(pttype, "dos");

        if (!is_gpt && !is_mbr) {
                log_error("No GPT or MBR partition table discovered on\n"
                          "    %s\n"
                          PARTITION_TABLE_BLURB, arg_image);
                return -EINVAL;
        }

        errno = 0;
        pl = blkid_probe_get_partitions(b);
        if (!pl) {
                if (errno == 0)
                        return log_oom();

                log_error("Failed to list partitions of %s", arg_image);
                return -errno;
        }

        udev = udev_new();
        if (!udev)
                return log_oom();

        if (fstat(fd, &st) < 0)
                return log_error_errno(errno, "Failed to stat block device: %m");

        d = udev_device_new_from_devnum(udev, 'b', st.st_rdev);
        if (!d)
                return log_oom();

        /* Wait until udev enumerates one device more than blkid found
         * partitions (presumably the extra entry is the whole-disk device
         * itself). Give up after 10 rounds. On loop exit via "break", e holds
         * the matching enumeration. */
        for (i = 0;; i++) {
                int n, m;

                if (i >= 10) {
                        log_error("Kernel partitions never appeared.");
                        return -ENXIO;
                }

                e = udev_enumerate_new(udev);
                if (!e)
                        return log_oom();

                r = udev_enumerate_add_match_parent(e, d);
                if (r < 0)
                        return log_oom();

                r = udev_enumerate_scan_devices(e);
                if (r < 0)
                        return log_error_errno(r, "Failed to scan for partition devices of %s: %m", arg_image);

                /* Count the partitions enumerated by the kernel */
                n = 0;
                first = udev_enumerate_get_list_entry(e);
                udev_list_entry_foreach(item, first)
                        n++;

                /* Count the partitions enumerated by blkid */
                m = blkid_partlist_numof_partitions(pl);
                if (n == m + 1)
                        break;
                if (n > m + 1) {
                        log_error("blkid and kernel partition list do not match.");
                        return -EIO;
                }
                if (n < m + 1) {
                        unsigned j;

                        /* The kernel has probed fewer partitions than
                         * blkid? Maybe the kernel prober is still
                         * running or it got EBUSY because udev
                         * already opened the device. Let's reprobe
                         * the device, which is a synchronous call
                         * that waits until probing is complete. */

                        for (j = 0; j < 20; j++) {

                                r = ioctl(fd, BLKRRPART, 0);
                                if (r < 0)
                                        r = -errno;
                                if (r >= 0 || r != -EBUSY)
                                        break;

                                /* If something else has the device
                                 * open, such as an udev rule, the
                                 * ioctl will return EBUSY. Since
                                 * there's no way to wait until it
                                 * isn't busy anymore, let's just wait
                                 * a bit, and try again.
                                 *
                                 * This is really something they
                                 * should fix in the kernel! */

                                usleep(50 * USEC_PER_MSEC);
                        }

                        if (r < 0)
                                return log_error_errno(r, "Failed to reread partition table: %m");
                }

                e = udev_enumerate_unref(e);
        }

        /* Classify every enumerated partition device. */
        first = udev_enumerate_get_list_entry(e);
        udev_list_entry_foreach(item, first) {
                _cleanup_udev_device_unref_ struct udev_device *q = NULL;
                const char *node;
                unsigned long long flags;
                blkid_partition pp;
                dev_t qn;
                int nr;

                errno = 0;
                q = udev_device_new_from_syspath(udev, udev_list_entry_get_name(item));
                if (!q) {
                        if (!errno)
                                errno = ENOMEM;

                        log_error_errno(errno, "Failed to get partition device of %s: %m", arg_image);
                        return -errno;
                }

                qn = udev_device_get_devnum(q);
                if (major(qn) == 0)
                        continue;

                /* Skip the device we were handed itself. */
                if (st.st_rdev == qn)
                        continue;

                node = udev_device_get_devnode(q);
                if (!node)
                        continue;

                pp = blkid_partlist_devno_to_partition(pl, qn);
                if (!pp)
                        continue;

                flags = blkid_partition_get_flags(pp);

                nr = blkid_partition_get_partno(pp);
                if (nr < 0)
                        continue;

                if (is_gpt) {
                        sd_id128_t type_id;
                        const char *stype;

                        if (flags & GPT_FLAG_NO_AUTO)
                                continue;

                        stype = blkid_partition_get_type_string(pp);
                        if (!stype)
                                continue;

                        if (sd_id128_from_string(stype, &type_id) < 0)
                                continue;

                        if (sd_id128_equal(type_id, GPT_HOME)) {

                                /* Of several candidates the lowest partition number wins. */
                                if (home && nr >= home_nr)
                                        continue;

                                home_nr = nr;
                                home_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&home, node);
                                if (r < 0)
                                        return log_oom();

                        } else if (sd_id128_equal(type_id, GPT_SRV)) {

                                if (srv && nr >= srv_nr)
                                        continue;

                                srv_nr = nr;
                                srv_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&srv, node);
                                if (r < 0)
                                        return log_oom();
                        }
#ifdef GPT_ROOT_NATIVE
                        else if (sd_id128_equal(type_id, GPT_ROOT_NATIVE)) {

                                if (root && nr >= root_nr)
                                        continue;

                                root_nr = nr;
                                root_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&root, node);
                                if (r < 0)
                                        return log_oom();
                        }
#endif
#ifdef GPT_ROOT_SECONDARY
                        else if (sd_id128_equal(type_id, GPT_ROOT_SECONDARY)) {

                                if (secondary_root && nr >= secondary_root_nr)
                                        continue;

                                secondary_root_nr = nr;
                                secondary_root_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&secondary_root, node);
                                if (r < 0)
                                        return log_oom();
                        }
#endif
                        else if (sd_id128_equal(type_id, GPT_LINUX_GENERIC)) {

                                if (generic)
                                        multiple_generic = true;
                                else {
                                        generic_rw = !(flags & GPT_FLAG_READ_ONLY);

                                        r = free_and_strdup(&generic, node);
                                        if (r < 0)
                                                return log_oom();
                                }
                        }

                } else if (is_mbr) {
                        int type;

                        if (flags != 0x80) /* Bootable flag */
                                continue;

                        type = blkid_partition_get_type(pp);
                        if (type != 0x83) /* Linux partition */
                                continue;

                        if (generic)
                                multiple_generic = true;
                        else {
                                generic_rw = true;

                                /* Store into "generic" (not "root"), so that a
                                 * second bootable Linux partition is detected
                                 * by the check above and reported as ambiguous
                                 * instead of silently replacing the first. */
                                r = free_and_strdup(&generic, node);
                                if (r < 0)
                                        return log_oom();
                        }
                }
        }

        /* Pick the root device and hand string ownership over to the caller. */
        if (root) {
                *root_device = root;
                root = NULL;

                *root_device_rw = root_rw;
                *secondary = false;
        } else if (secondary_root) {
                *root_device = secondary_root;
                secondary_root = NULL;

                *root_device_rw = secondary_root_rw;
                *secondary = true;
        } else if (generic) {

                /* There were no partitions with precise meanings
                 * around, but we found generic partitions. In this
                 * case, if there's only one, we can go ahead and boot
                 * it, otherwise we bail out, because we really cannot
                 * make any sense of it. */

                if (multiple_generic) {
                        log_error("Identified multiple bootable Linux partitions on\n"
                                  "    %s\n"
                                  PARTITION_TABLE_BLURB, arg_image);
                        return -EINVAL;
                }

                *root_device = generic;
                generic = NULL;

                *root_device_rw = generic_rw;
                *secondary = false;
        } else {
                log_error("Failed to identify root partition in disk image\n"
                          "    %s\n"
                          PARTITION_TABLE_BLURB, arg_image);
                return -EINVAL;
        }

        if (home) {
                *home_device = home;
                home = NULL;

                *home_device_rw = home_rw;
        }

        if (srv) {
                *srv_device = srv;
                srv = NULL;

                *srv_device_rw = srv_rw;
        }

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -ENOTSUP;
#endif
}
3030
3031 static int mount_device(const char *what, const char *where, const char *directory, bool rw) {
3032 #ifdef HAVE_BLKID
3033         _cleanup_blkid_free_probe_ blkid_probe b = NULL;
3034         const char *fstype, *p;
3035         int r;
3036
3037         assert(what);
3038         assert(where);
3039
3040         if (arg_read_only)
3041                 rw = false;
3042
3043         if (directory)
3044                 p = strappenda(where, directory);
3045         else
3046                 p = where;
3047
3048         errno = 0;
3049         b = blkid_new_probe_from_filename(what);
3050         if (!b) {
3051                 if (errno == 0)
3052                         return log_oom();
3053                 log_error_errno(errno, "Failed to allocate prober for %s: %m", what);
3054                 return -errno;
3055         }
3056
3057         blkid_probe_enable_superblocks(b, 1);
3058         blkid_probe_set_superblocks_flags(b, BLKID_SUBLKS_TYPE);
3059
3060         errno = 0;
3061         r = blkid_do_safeprobe(b);
3062         if (r == -1 || r == 1) {
3063                 log_error("Cannot determine file system type of %s", what);
3064                 return -EINVAL;
3065         } else if (r != 0) {
3066                 if (errno == 0)
3067                         errno = EIO;
3068                 log_error_errno(errno, "Failed to probe %s: %m", what);
3069                 return -errno;
3070         }
3071
3072         errno = 0;
3073         if (blkid_probe_lookup_value(b, "TYPE", &fstype, NULL) < 0) {
3074                 if (errno == 0)
3075                         errno = EINVAL;
3076                 log_error("Failed to determine file system type of %s", what);
3077                 return -errno;
3078         }
3079
3080         if (streq(fstype, "crypto_LUKS")) {
3081                 log_error("nspawn currently does not support LUKS disk images.");
3082                 return -ENOTSUP;
3083         }
3084
3085         if (mount(what, p, fstype, MS_NODEV|(rw ? 0 : MS_RDONLY), NULL) < 0)
3086                 return log_error_errno(errno, "Failed to mount %s: %m", what);
3087
3088         return 0;
3089 #else
3090         log_error("--image= is not supported, compiled without blkid support.");