chiark / gitweb /
nspawn: allow bind-mounting char and block files
[elogind.git] / src / nspawn / nspawn.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4   This file is part of systemd.
5
6   Copyright 2010 Lennart Poettering
7
8   systemd is free software; you can redistribute it and/or modify it
9   under the terms of the GNU Lesser General Public License as published by
10   the Free Software Foundation; either version 2.1 of the License, or
11   (at your option) any later version.
12
13   systemd is distributed in the hope that it will be useful, but
14   WITHOUT ANY WARRANTY; without even the implied warranty of
15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16   Lesser General Public License for more details.
17
18   You should have received a copy of the GNU Lesser General Public License
19   along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <signal.h>
23 #include <sched.h>
24 #include <unistd.h>
25 #include <sys/types.h>
26 #include <sys/syscall.h>
27 #include <sys/mount.h>
28 #include <sys/wait.h>
29 #include <stdlib.h>
30 #include <string.h>
31 #include <stdio.h>
32 #include <errno.h>
33 #include <sys/prctl.h>
34 #include <getopt.h>
35 #include <termios.h>
36 #include <sys/signalfd.h>
37 #include <grp.h>
38 #include <linux/fs.h>
39 #include <sys/un.h>
40 #include <sys/socket.h>
41 #include <linux/netlink.h>
42 #include <net/if.h>
43 #include <linux/veth.h>
44 #include <sys/personality.h>
45 #include <linux/loop.h>
46 #include <poll.h>
47 #include <sys/file.h>
48
49 #ifdef HAVE_SELINUX
50 #include <selinux/selinux.h>
51 #endif
52
53 #ifdef HAVE_SECCOMP
54 #include <seccomp.h>
55 #endif
56
57 #ifdef HAVE_BLKID
58 #include <blkid/blkid.h>
59 #endif
60
61 #include "sd-daemon.h"
62 #include "sd-bus.h"
63 #include "sd-id128.h"
64 #include "sd-rtnl.h"
65 #include "log.h"
66 #include "util.h"
67 #include "mkdir.h"
68 #include "macro.h"
69 #include "audit.h"
70 #include "missing.h"
71 #include "cgroup-util.h"
72 #include "strv.h"
73 #include "path-util.h"
74 #include "loopback-setup.h"
75 #include "dev-setup.h"
76 #include "fdset.h"
77 #include "build.h"
78 #include "fileio.h"
79 #include "bus-util.h"
80 #include "bus-error.h"
81 #include "ptyfwd.h"
82 #include "bus-kernel.h"
83 #include "env-util.h"
84 #include "def.h"
85 #include "rtnl-util.h"
86 #include "udev-util.h"
87 #include "blkid-util.h"
88 #include "gpt.h"
89 #include "siphash24.h"
90 #include "copy.h"
91 #include "base-filesystem.h"
92 #include "barrier.h"
93 #include "event-util.h"
94 #include "capability.h"
95 #include "cap-list.h"
96 #include "btrfs-util.h"
97 #include "machine-image.h"
98 #include "list.h"
99 #include "in-addr-util.h"
100 #include "fw-util.h"
101 #include "local-addresses.h"
102
103 #ifdef HAVE_SECCOMP
104 #include "seccomp-util.h"
105 #endif
106
107 typedef struct ExposePort {
108         int protocol;
109         uint16_t host_port;
110         uint16_t container_port;
111         LIST_FIELDS(struct ExposePort, ports);
112 } ExposePort;
113
/* Outcome reported for a finished container run.
 * NOTE(review): the code consuming these values is not part of this section;
 * the meaning below is taken from the enumerator names only. */
enum ContainerStatus {
        CONTAINER_TERMINATED = 0,
        CONTAINER_REBOOTED   = 1
};

typedef enum ContainerStatus ContainerStatus;
118
/* Journal linking mode selected with --link-journal= (or -j). The "try-"
 * variants of the option map to the same values here and additionally set
 * arg_link_journal_try. */
enum LinkJournal {
        LINK_NO    = 0,   /* --link-journal=no */
        LINK_AUTO  = 1,   /* --link-journal=auto; also the built-in default */
        LINK_HOST  = 2,   /* --link-journal=host / try-host */
        LINK_GUEST = 3    /* --link-journal=guest / try-guest, and -j */
};

typedef enum LinkJournal LinkJournal;
125
/* Volatile mode selected with --volatile[=MODE]. */
enum Volatile {
        VOLATILE_NO    = 0,   /* default, or an explicit boolean false */
        VOLATILE_YES   = 1,   /* bare --volatile, or an explicit boolean true */
        VOLATILE_STATE = 2,   /* --volatile=state */
};

typedef enum Volatile Volatile;
131
132 static char *arg_directory = NULL;
133 static char *arg_template = NULL;
134 static char *arg_user = NULL;
135 static sd_id128_t arg_uuid = {};
136 static char *arg_machine = NULL;
137 static const char *arg_selinux_context = NULL;
138 static const char *arg_selinux_apifs_context = NULL;
139 static const char *arg_slice = NULL;
140 static bool arg_private_network = false;
141 static bool arg_read_only = false;
142 static bool arg_boot = false;
143 static bool arg_ephemeral = false;
144 static LinkJournal arg_link_journal = LINK_AUTO;
145 static bool arg_link_journal_try = false;
/* Default capability set, as a bitmask with one bit per CAP_* number.
 * --capability= and --drop-capability= are described in help() as adding to
 * and removing from this default. Terms are listed alphabetically. */
static uint64_t arg_retain =
        (UINT64_C(1) << CAP_AUDIT_CONTROL) |
        (UINT64_C(1) << CAP_AUDIT_WRITE) |
        (UINT64_C(1) << CAP_CHOWN) |
        (UINT64_C(1) << CAP_DAC_OVERRIDE) |
        (UINT64_C(1) << CAP_DAC_READ_SEARCH) |
        (UINT64_C(1) << CAP_FOWNER) |
        (UINT64_C(1) << CAP_FSETID) |
        (UINT64_C(1) << CAP_IPC_OWNER) |
        (UINT64_C(1) << CAP_KILL) |
        (UINT64_C(1) << CAP_LEASE) |
        (UINT64_C(1) << CAP_LINUX_IMMUTABLE) |
        (UINT64_C(1) << CAP_MKNOD) |
        (UINT64_C(1) << CAP_NET_BIND_SERVICE) |
        (UINT64_C(1) << CAP_NET_BROADCAST) |
        (UINT64_C(1) << CAP_NET_RAW) |
        (UINT64_C(1) << CAP_SETFCAP) |
        (UINT64_C(1) << CAP_SETGID) |
        (UINT64_C(1) << CAP_SETPCAP) |
        (UINT64_C(1) << CAP_SETUID) |
        (UINT64_C(1) << CAP_SYS_ADMIN) |
        (UINT64_C(1) << CAP_SYS_BOOT) |
        (UINT64_C(1) << CAP_SYS_CHROOT) |
        (UINT64_C(1) << CAP_SYS_NICE) |
        (UINT64_C(1) << CAP_SYS_PTRACE) |
        (UINT64_C(1) << CAP_SYS_RESOURCE) |
        (UINT64_C(1) << CAP_SYS_TTY_CONFIG);
173 static char **arg_bind = NULL;
174 static char **arg_bind_ro = NULL;
175 static char **arg_tmpfs = NULL;
176 static char **arg_setenv = NULL;
177 static bool arg_quiet = false;
178 static bool arg_share_system = false;
179 static bool arg_register = true;
180 static bool arg_keep_unit = false;
181 static char **arg_network_interfaces = NULL;
182 static char **arg_network_macvlan = NULL;
183 static char **arg_network_ipvlan = NULL;
184 static bool arg_network_veth = false;
185 static const char *arg_network_bridge = NULL;
186 static unsigned long arg_personality = 0xffffffffLU;
187 static char *arg_image = NULL;
188 static Volatile arg_volatile = VOLATILE_NO;
189 static ExposePort *arg_expose_ports = NULL;
190
191 static void help(void) {
192         printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
193                "Spawn a minimal namespace container for debugging, testing and building.\n\n"
194                "  -h --help                 Show this help\n"
195                "     --version              Print version string\n"
196                "  -q --quiet                Do not show status information\n"
197                "  -D --directory=PATH       Root directory for the container\n"
198                "     --template=PATH        Initialize root directory from template directory,\n"
199                "                            if missing\n"
200                "  -x --ephemeral            Run container with snapshot of root directory, and\n"
201                "                            remove it after exit\n"
202                "  -i --image=PATH           File system device or disk image for the container\n"
203                "  -b --boot                 Boot up full system (i.e. invoke init)\n"
204                "  -u --user=USER            Run the command under specified user or uid\n"
205                "  -M --machine=NAME         Set the machine name for the container\n"
206                "     --uuid=UUID            Set a specific machine UUID for the container\n"
207                "  -S --slice=SLICE          Place the container in the specified slice\n"
208                "     --private-network      Disable network in container\n"
209                "     --network-interface=INTERFACE\n"
210                "                            Assign an existing network interface to the\n"
211                "                            container\n"
212                "     --network-macvlan=INTERFACE\n"
213                "                            Create a macvlan network interface based on an\n"
214                "                            existing network interface to the container\n"
215                "     --network-ipvlan=INTERFACE\n"
216                "                            Create a ipvlan network interface based on an\n"
217                "                            existing network interface to the container\n"
218                "  -n --network-veth         Add a virtual ethernet connection between host\n"
219                "                            and container\n"
220                "     --network-bridge=INTERFACE\n"
221                "                            Add a virtual ethernet connection between host\n"
222                "                            and container and add it to an existing bridge on\n"
223                "                            the host\n"
224                "  -p --port=[PROTOCOL:]HOSTPORT[:CONTAINERPORT]\n"
225                "                            Expose a container IP port on the host\n"
226                "  -Z --selinux-context=SECLABEL\n"
227                "                            Set the SELinux security context to be used by\n"
228                "                            processes in the container\n"
229                "  -L --selinux-apifs-context=SECLABEL\n"
230                "                            Set the SELinux security context to be used by\n"
231                "                            API/tmpfs file systems in the container\n"
232                "     --capability=CAP       In addition to the default, retain specified\n"
233                "                            capability\n"
234                "     --drop-capability=CAP  Drop the specified capability from the default set\n"
235                "     --link-journal=MODE    Link up guest journal, one of no, auto, guest, host,\n"
236                "                            try-guest, try-host\n"
237                "  -j                        Equivalent to --link-journal=try-guest\n"
238                "     --read-only            Mount the root directory read-only\n"
239                "     --bind=PATH[:PATH]     Bind mount a file or directory from the host into\n"
240                "                            the container\n"
241                "     --bind-ro=PATH[:PATH]  Similar, but creates a read-only bind mount\n"
242                "     --tmpfs=PATH:[OPTIONS] Mount an empty tmpfs to the specified directory\n"
243                "     --setenv=NAME=VALUE    Pass an environment variable to PID 1\n"
244                "     --share-system         Share system namespaces with host\n"
245                "     --register=BOOLEAN     Register container as machine\n"
246                "     --keep-unit            Do not register a scope for the machine, reuse\n"
247                "                            the service unit nspawn is running in\n"
248                "     --volatile[=MODE]      Run the system in volatile mode\n"
249                , program_invocation_short_name);
250 }
251
/* Replaces *b with a sanitized, absolute form of path.
 *
 * The path is first resolved with canonicalize_file_name(). If that fails
 * because the path does not exist (ENOENT), the path is instead made absolute
 * relative to the current working directory via path_make_absolute_cwd().
 * Either result is then run through path_kill_slashes() (helper not visible
 * here; presumably it collapses redundant slashes in place) before being
 * stored.
 *
 * The previous value of *b is freed; on success *b owns the new string.
 *
 * Returns 0 on success, -errno if canonicalization fails for any reason
 * other than ENOENT, and -ENOMEM if the fallback yields NULL. On failure *b
 * is left untouched. */
static int set_sanitized_path(char **b, const char *path) {
        char *resolved;

        assert(b);
        assert(path);

        resolved = canonicalize_file_name(path);

        /* Anything other than "does not exist" is a hard error */
        if (!resolved && errno != ENOENT)
                return -errno;

        if (!resolved) {
                /* Not there (yet): settle for an absolute, unresolved path */
                resolved = path_make_absolute_cwd(path);
                if (!resolved)
                        return -ENOMEM;
        }

        free(*b);
        *b = path_kill_slashes(resolved);

        return 0;
}
272
273 static int parse_argv(int argc, char *argv[]) {
274
275         enum {
276                 ARG_VERSION = 0x100,
277                 ARG_PRIVATE_NETWORK,
278                 ARG_UUID,
279                 ARG_READ_ONLY,
280                 ARG_CAPABILITY,
281                 ARG_DROP_CAPABILITY,
282                 ARG_LINK_JOURNAL,
283                 ARG_BIND,
284                 ARG_BIND_RO,
285                 ARG_TMPFS,
286                 ARG_SETENV,
287                 ARG_SHARE_SYSTEM,
288                 ARG_REGISTER,
289                 ARG_KEEP_UNIT,
290                 ARG_NETWORK_INTERFACE,
291                 ARG_NETWORK_MACVLAN,
292                 ARG_NETWORK_IPVLAN,
293                 ARG_NETWORK_BRIDGE,
294                 ARG_PERSONALITY,
295                 ARG_VOLATILE,
296                 ARG_TEMPLATE,
297         };
298
299         static const struct option options[] = {
300                 { "help",                  no_argument,       NULL, 'h'                   },
301                 { "version",               no_argument,       NULL, ARG_VERSION           },
302                 { "directory",             required_argument, NULL, 'D'                   },
303                 { "template",              required_argument, NULL, ARG_TEMPLATE          },
304                 { "ephemeral",             no_argument,       NULL, 'x'                   },
305                 { "user",                  required_argument, NULL, 'u'                   },
306                 { "private-network",       no_argument,       NULL, ARG_PRIVATE_NETWORK   },
307                 { "boot",                  no_argument,       NULL, 'b'                   },
308                 { "uuid",                  required_argument, NULL, ARG_UUID              },
309                 { "read-only",             no_argument,       NULL, ARG_READ_ONLY         },
310                 { "capability",            required_argument, NULL, ARG_CAPABILITY        },
311                 { "drop-capability",       required_argument, NULL, ARG_DROP_CAPABILITY   },
312                 { "link-journal",          required_argument, NULL, ARG_LINK_JOURNAL      },
313                 { "bind",                  required_argument, NULL, ARG_BIND              },
314                 { "bind-ro",               required_argument, NULL, ARG_BIND_RO           },
315                 { "tmpfs",                 required_argument, NULL, ARG_TMPFS             },
316                 { "machine",               required_argument, NULL, 'M'                   },
317                 { "slice",                 required_argument, NULL, 'S'                   },
318                 { "setenv",                required_argument, NULL, ARG_SETENV            },
319                 { "selinux-context",       required_argument, NULL, 'Z'                   },
320                 { "selinux-apifs-context", required_argument, NULL, 'L'                   },
321                 { "quiet",                 no_argument,       NULL, 'q'                   },
322                 { "share-system",          no_argument,       NULL, ARG_SHARE_SYSTEM      },
323                 { "register",              required_argument, NULL, ARG_REGISTER          },
324                 { "keep-unit",             no_argument,       NULL, ARG_KEEP_UNIT         },
325                 { "network-interface",     required_argument, NULL, ARG_NETWORK_INTERFACE },
326                 { "network-macvlan",       required_argument, NULL, ARG_NETWORK_MACVLAN   },
327                 { "network-ipvlan",        required_argument, NULL, ARG_NETWORK_IPVLAN    },
328                 { "network-veth",          no_argument,       NULL, 'n'                   },
329                 { "network-bridge",        required_argument, NULL, ARG_NETWORK_BRIDGE    },
330                 { "personality",           required_argument, NULL, ARG_PERSONALITY       },
331                 { "image",                 required_argument, NULL, 'i'                   },
332                 { "volatile",              optional_argument, NULL, ARG_VOLATILE          },
333                 { "port",                  required_argument, NULL, 'p'                   },
334                 {}
335         };
336
337         int c, r;
338         uint64_t plus = 0, minus = 0;
339
340         assert(argc >= 0);
341         assert(argv);
342
343         while ((c = getopt_long(argc, argv, "+hD:u:bL:M:jS:Z:qi:xp:n", options, NULL)) >= 0)
344
345                 switch (c) {
346
347                 case 'h':
348                         help();
349                         return 0;
350
351                 case ARG_VERSION:
352                         puts(PACKAGE_STRING);
353                         puts(SYSTEMD_FEATURES);
354                         return 0;
355
356                 case 'D':
357                         r = set_sanitized_path(&arg_directory, optarg);
358                         if (r < 0)
359                                 return log_error_errno(r, "Invalid root directory: %m");
360
361                         break;
362
363                 case ARG_TEMPLATE:
364                         r = set_sanitized_path(&arg_template, optarg);
365                         if (r < 0)
366                                 return log_error_errno(r, "Invalid template directory: %m");
367
368                         break;
369
370                 case 'i':
371                         r = set_sanitized_path(&arg_image, optarg);
372                         if (r < 0)
373                                 return log_error_errno(r, "Invalid image path: %m");
374
375                         break;
376
377                 case 'x':
378                         arg_ephemeral = true;
379                         break;
380
381                 case 'u':
382                         free(arg_user);
383                         arg_user = strdup(optarg);
384                         if (!arg_user)
385                                 return log_oom();
386
387                         break;
388
389                 case ARG_NETWORK_BRIDGE:
390                         arg_network_bridge = optarg;
391
392                         /* fall through */
393
394                 case 'n':
395                         arg_network_veth = true;
396                         arg_private_network = true;
397                         break;
398
399                 case ARG_NETWORK_INTERFACE:
400                         if (strv_extend(&arg_network_interfaces, optarg) < 0)
401                                 return log_oom();
402
403                         arg_private_network = true;
404                         break;
405
406                 case ARG_NETWORK_MACVLAN:
407                         if (strv_extend(&arg_network_macvlan, optarg) < 0)
408                                 return log_oom();
409
410                         arg_private_network = true;
411                         break;
412
413                 case ARG_NETWORK_IPVLAN:
414                         if (strv_extend(&arg_network_ipvlan, optarg) < 0)
415                                 return log_oom();
416
417                         /* fall through */
418
419                 case ARG_PRIVATE_NETWORK:
420                         arg_private_network = true;
421                         break;
422
423                 case 'b':
424                         arg_boot = true;
425                         break;
426
427                 case ARG_UUID:
428                         r = sd_id128_from_string(optarg, &arg_uuid);
429                         if (r < 0) {
430                                 log_error("Invalid UUID: %s", optarg);
431                                 return r;
432                         }
433                         break;
434
435                 case 'S':
436                         arg_slice = optarg;
437                         break;
438
439                 case 'M':
440                         if (isempty(optarg)) {
441                                 free(arg_machine);
442                                 arg_machine = NULL;
443                         } else {
444                                 if (!machine_name_is_valid(optarg)) {
445                                         log_error("Invalid machine name: %s", optarg);
446                                         return -EINVAL;
447                                 }
448
449                                 r = free_and_strdup(&arg_machine, optarg);
450                                 if (r < 0)
451                                         return log_oom();
452
453                                 break;
454                         }
455
456                 case 'Z':
457                         arg_selinux_context = optarg;
458                         break;
459
460                 case 'L':
461                         arg_selinux_apifs_context = optarg;
462                         break;
463
464                 case ARG_READ_ONLY:
465                         arg_read_only = true;
466                         break;
467
468                 case ARG_CAPABILITY:
469                 case ARG_DROP_CAPABILITY: {
470                         const char *state, *word;
471                         size_t length;
472
473                         FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
474                                 _cleanup_free_ char *t;
475
476                                 t = strndup(word, length);
477                                 if (!t)
478                                         return log_oom();
479
480                                 if (streq(t, "all")) {
481                                         if (c == ARG_CAPABILITY)
482                                                 plus = (uint64_t) -1;
483                                         else
484                                                 minus = (uint64_t) -1;
485                                 } else {
486                                         int cap;
487
488                                         cap = capability_from_name(t);
489                                         if (cap < 0) {
490                                                 log_error("Failed to parse capability %s.", t);
491                                                 return -EINVAL;
492                                         }
493
494                                         if (c == ARG_CAPABILITY)
495                                                 plus |= 1ULL << (uint64_t) cap;
496                                         else
497                                                 minus |= 1ULL << (uint64_t) cap;
498                                 }
499                         }
500
501                         break;
502                 }
503
504                 case 'j':
505                         arg_link_journal = LINK_GUEST;
506                         arg_link_journal_try = true;
507                         break;
508
509                 case ARG_LINK_JOURNAL:
510                         if (streq(optarg, "auto")) {
511                                 arg_link_journal = LINK_AUTO;
512                                 arg_link_journal_try = false;
513                         } else if (streq(optarg, "no")) {
514                                 arg_link_journal = LINK_NO;
515                                 arg_link_journal_try = false;
516                         } else if (streq(optarg, "guest")) {
517                                 arg_link_journal = LINK_GUEST;
518                                 arg_link_journal_try = false;
519                         } else if (streq(optarg, "host")) {
520                                 arg_link_journal = LINK_HOST;
521                                 arg_link_journal_try = false;
522                         } else if (streq(optarg, "try-guest")) {
523                                 arg_link_journal = LINK_GUEST;
524                                 arg_link_journal_try = true;
525                         } else if (streq(optarg, "try-host")) {
526                                 arg_link_journal = LINK_HOST;
527                                 arg_link_journal_try = true;
528                         } else {
529                                 log_error("Failed to parse link journal mode %s", optarg);
530                                 return -EINVAL;
531                         }
532
533                         break;
534
535                 case ARG_BIND:
536                 case ARG_BIND_RO: {
537                         _cleanup_free_ char *a = NULL, *b = NULL;
538                         char *e;
539                         char ***x;
540
541                         x = c == ARG_BIND ? &arg_bind : &arg_bind_ro;
542
543                         e = strchr(optarg, ':');
544                         if (e) {
545                                 a = strndup(optarg, e - optarg);
546                                 b = strdup(e + 1);
547                         } else {
548                                 a = strdup(optarg);
549                                 b = strdup(optarg);
550                         }
551
552                         if (!a || !b)
553                                 return log_oom();
554
555                         if (!path_is_absolute(a) || !path_is_absolute(b)) {
556                                 log_error("Invalid bind mount specification: %s", optarg);
557                                 return -EINVAL;
558                         }
559
560                         r = strv_extend(x, a);
561                         if (r < 0)
562                                 return log_oom();
563
564                         r = strv_extend(x, b);
565                         if (r < 0)
566                                 return log_oom();
567
568                         break;
569                 }
570
571                 case ARG_TMPFS: {
572                         _cleanup_free_ char *a = NULL, *b = NULL;
573                         char *e;
574
575                         e = strchr(optarg, ':');
576                         if (e) {
577                                 a = strndup(optarg, e - optarg);
578                                 b = strdup(e + 1);
579                         } else {
580                                 a = strdup(optarg);
581                                 b = strdup("mode=0755");
582                         }
583
584                         if (!a || !b)
585                                 return log_oom();
586
587                         if (!path_is_absolute(a)) {
588                                 log_error("Invalid tmpfs specification: %s", optarg);
589                                 return -EINVAL;
590                         }
591
592                         r = strv_push(&arg_tmpfs, a);
593                         if (r < 0)
594                                 return log_oom();
595
596                         a = NULL;
597
598                         r = strv_push(&arg_tmpfs, b);
599                         if (r < 0)
600                                 return log_oom();
601
602                         b = NULL;
603
604                         break;
605                 }
606
607                 case ARG_SETENV: {
608                         char **n;
609
610                         if (!env_assignment_is_valid(optarg)) {
611                                 log_error("Environment variable assignment '%s' is not valid.", optarg);
612                                 return -EINVAL;
613                         }
614
615                         n = strv_env_set(arg_setenv, optarg);
616                         if (!n)
617                                 return log_oom();
618
619                         strv_free(arg_setenv);
620                         arg_setenv = n;
621                         break;
622                 }
623
624                 case 'q':
625                         arg_quiet = true;
626                         break;
627
628                 case ARG_SHARE_SYSTEM:
629                         arg_share_system = true;
630                         break;
631
632                 case ARG_REGISTER:
633                         r = parse_boolean(optarg);
634                         if (r < 0) {
635                                 log_error("Failed to parse --register= argument: %s", optarg);
636                                 return r;
637                         }
638
639                         arg_register = r;
640                         break;
641
642                 case ARG_KEEP_UNIT:
643                         arg_keep_unit = true;
644                         break;
645
646                 case ARG_PERSONALITY:
647
648                         arg_personality = personality_from_string(optarg);
649                         if (arg_personality == 0xffffffffLU) {
650                                 log_error("Unknown or unsupported personality '%s'.", optarg);
651                                 return -EINVAL;
652                         }
653
654                         break;
655
656                 case ARG_VOLATILE:
657
658                         if (!optarg)
659                                 arg_volatile = VOLATILE_YES;
660                         else {
661                                 r = parse_boolean(optarg);
662                                 if (r < 0) {
663                                         if (streq(optarg, "state"))
664                                                 arg_volatile = VOLATILE_STATE;
665                                         else {
666                                                 log_error("Failed to parse --volatile= argument: %s", optarg);
667                                                 return r;
668                                         }
669                                 } else
670                                         arg_volatile = r ? VOLATILE_YES : VOLATILE_NO;
671                         }
672
673                         break;
674
675                 case 'p': {
676                         const char *split, *e;
677                         uint16_t container_port, host_port;
678                         int protocol;
679                         ExposePort *p;
680
681                         if ((e = startswith(optarg, "tcp:")))
682                                 protocol = IPPROTO_TCP;
683                         else if ((e = startswith(optarg, "udp:")))
684                                 protocol = IPPROTO_UDP;
685                         else {
686                                 e = optarg;
687                                 protocol = IPPROTO_TCP;
688                         }
689
690                         split = strchr(e, ':');
691                         if (split) {
692                                 char v[split - e + 1];
693
694                                 memcpy(v, e, split - e);
695                                 v[split - e] = 0;
696
697                                 r = safe_atou16(v, &host_port);
698                                 if (r < 0 || host_port <= 0) {
699                                         log_error("Failed to parse host port: %s", optarg);
700                                         return -EINVAL;
701                                 }
702
703                                 r = safe_atou16(split + 1, &container_port);
704                         } else {
705                                 r = safe_atou16(e, &container_port);
706                                 host_port = container_port;
707                         }
708
709                         if (r < 0 || container_port <= 0) {
710                                 log_error("Failed to parse host port: %s", optarg);
711                                 return -EINVAL;
712                         }
713
714                         LIST_FOREACH(ports, p, arg_expose_ports) {
715                                 if (p->protocol == protocol && p->host_port == host_port) {
716                                         log_error("Duplicate port specification: %s", optarg);
717                                         return -EINVAL;
718                                 }
719                         }
720
721                         p = new(ExposePort, 1);
722                         if (!p)
723                                 return log_oom();
724
725                         p->protocol = protocol;
726                         p->host_port = host_port;
727                         p->container_port = container_port;
728
729                         LIST_PREPEND(ports, arg_expose_ports, p);
730
731                         break;
732                 }
733
734                 case '?':
735                         return -EINVAL;
736
737                 default:
738                         assert_not_reached("Unhandled option");
739                 }
740
741         if (arg_share_system)
742                 arg_register = false;
743
744         if (arg_boot && arg_share_system) {
745                 log_error("--boot and --share-system may not be combined.");
746                 return -EINVAL;
747         }
748
749         if (arg_keep_unit && cg_pid_get_owner_uid(0, NULL) >= 0) {
750                 log_error("--keep-unit may not be used when invoked from a user session.");
751                 return -EINVAL;
752         }
753
754         if (arg_directory && arg_image) {
755                 log_error("--directory= and --image= may not be combined.");
756                 return -EINVAL;
757         }
758
759         if (arg_template && arg_image) {
760                 log_error("--template= and --image= may not be combined.");
761                 return -EINVAL;
762         }
763
764         if (arg_template && !(arg_directory || arg_machine)) {
765                 log_error("--template= needs --directory= or --machine=.");
766                 return -EINVAL;
767         }
768
769         if (arg_ephemeral && arg_template) {
770                 log_error("--ephemeral and --template= may not be combined.");
771                 return -EINVAL;
772         }
773
774         if (arg_ephemeral && arg_image) {
775                 log_error("--ephemeral and --image= may not be combined.");
776                 return -EINVAL;
777         }
778
779         if (arg_ephemeral && !IN_SET(arg_link_journal, LINK_NO, LINK_AUTO)) {
780                 log_error("--ephemeral and --link-journal= may not be combined.");
781                 return -EINVAL;
782         }
783
784         if (arg_volatile != VOLATILE_NO && arg_read_only) {
785                 log_error("Cannot combine --read-only with --volatile. Note that --volatile already implies a read-only base hierarchy.");
786                 return -EINVAL;
787         }
788
789         if (arg_expose_ports && !arg_private_network) {
790                 log_error("Cannot use --port= without private networking.");
791                 return -EINVAL;
792         }
793
794         arg_retain = (arg_retain | plus | (arg_private_network ? 1ULL << CAP_NET_ADMIN : 0)) & ~minus;
795
796         return 1;
797 }
798
799 static int mount_all(const char *dest) {
800
801         typedef struct MountPoint {
802                 const char *what;
803                 const char *where;
804                 const char *type;
805                 const char *options;
806                 unsigned long flags;
807                 bool fatal;
808         } MountPoint;
809
810         static const MountPoint mount_table[] = {
811                 { "proc",      "/proc",     "proc",  NULL,        MS_NOSUID|MS_NOEXEC|MS_NODEV,           true  },
812                 { "/proc/sys", "/proc/sys", NULL,    NULL,        MS_BIND,                                true  },   /* Bind mount first */
813                 { NULL,        "/proc/sys", NULL,    NULL,        MS_BIND|MS_RDONLY|MS_REMOUNT,           true  },   /* Then, make it r/o */
814                 { "sysfs",     "/sys",      "sysfs", NULL,        MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true  },
815                 { "tmpfs",     "/dev",      "tmpfs", "mode=755",  MS_NOSUID|MS_STRICTATIME,               true  },
816                 { "devpts",    "/dev/pts",  "devpts","newinstance,ptmxmode=0666,mode=620,gid=" STRINGIFY(TTY_GID), MS_NOSUID|MS_NOEXEC, true },
817                 { "tmpfs",     "/dev/shm",  "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME,      true  },
818                 { "tmpfs",     "/run",      "tmpfs", "mode=755",  MS_NOSUID|MS_NODEV|MS_STRICTATIME,      true  },
819 #ifdef HAVE_SELINUX
820                 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND,                              false },  /* Bind mount first */
821                 { NULL,              "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT,         false },  /* Then, make it r/o */
822 #endif
823         };
824
825         unsigned k;
826         int r = 0;
827
828         for (k = 0; k < ELEMENTSOF(mount_table); k++) {
829                 _cleanup_free_ char *where = NULL;
830 #ifdef HAVE_SELINUX
831                 _cleanup_free_ char *options = NULL;
832 #endif
833                 const char *o;
834                 int t;
835
836                 where = strjoin(dest, "/", mount_table[k].where, NULL);
837                 if (!where)
838                         return log_oom();
839
840                 t = path_is_mount_point(where, true);
841                 if (t < 0) {
842                         log_error_errno(t, "Failed to detect whether %s is a mount point: %m", where);
843
844                         if (r == 0)
845                                 r = t;
846
847                         continue;
848                 }
849
850                 /* Skip this entry if it is not a remount. */
851                 if (mount_table[k].what && t > 0)
852                         continue;
853
854                 t = mkdir_p(where, 0755);
855                 if (t < 0) {
856                         if (mount_table[k].fatal) {
857                                log_error_errno(t, "Failed to create directory %s: %m", where);
858
859                                 if (r == 0)
860                                         r = t;
861                         } else
862                                log_warning_errno(t, "Failed to create directory %s: %m", where);
863
864                         continue;
865                 }
866
867 #ifdef HAVE_SELINUX
868                 if (arg_selinux_apifs_context &&
869                     (streq_ptr(mount_table[k].what, "tmpfs") || streq_ptr(mount_table[k].what, "devpts"))) {
870                         options = strjoin(mount_table[k].options, ",context=\"", arg_selinux_apifs_context, "\"", NULL);
871                         if (!options)
872                                 return log_oom();
873
874                         o = options;
875                 } else
876 #endif
877                         o = mount_table[k].options;
878
879
880                 if (mount(mount_table[k].what,
881                           where,
882                           mount_table[k].type,
883                           mount_table[k].flags,
884                           o) < 0) {
885
886                         if (mount_table[k].fatal) {
887                                 log_error_errno(errno, "mount(%s) failed: %m", where);
888
889                                 if (r == 0)
890                                         r = -errno;
891                         } else
892                                 log_warning_errno(errno, "mount(%s) failed: %m", where);
893                 }
894         }
895
896         return r;
897 }
898
899 static int mount_binds(const char *dest, char **l, bool ro) {
900         char **x, **y;
901
902         STRV_FOREACH_PAIR(x, y, l) {
903                 _cleanup_free_ char *where = NULL;
904                 struct stat source_st, dest_st;
905                 int r;
906
907                 if (stat(*x, &source_st) < 0)
908                         return log_error_errno(errno, "Failed to stat %s: %m", *x);
909
910                 where = strappend(dest, *y);
911                 if (!where)
912                         return log_oom();
913
914                 r = stat(where, &dest_st);
915                 if (r == 0) {
916                         if (S_ISDIR(source_st.st_mode) && !S_ISDIR(dest_st.st_mode)) {
917                                 log_error("Cannot bind mount directory %s on file %s.", *x, where);
918                                 return -EINVAL;
919                         }
920                         if (!S_ISDIR(source_st.st_mode) && S_ISDIR(dest_st.st_mode)) {
921                                 log_error("Cannot bind mount file %s on directory %s.", *x, where);
922                                 return -EINVAL;
923                         }
924                 } else if (errno == ENOENT) {
925                         r = mkdir_parents_label(where, 0755);
926                         if (r < 0)
927                                 return log_error_errno(r, "Failed to bind mount %s: %m", *x);
928                 } else {
929                         log_error_errno(errno, "Failed to bind mount %s: %m", *x);
930                         return -errno;
931                 }
932
933                 /* Create the mount point. Any non-directory file can be
934                  * mounted on any non-directory file (regular, fifo, socket,
935                  * char, block).
936                  */
937                 if (S_ISDIR(source_st.st_mode)) {
938                         r = mkdir_label(where, 0755);
939                         if (r < 0 && errno != EEXIST)
940                                 return log_error_errno(r, "Failed to create mount point %s: %m", where);
941                 } else {
942                         r = touch(where);
943                         if (r < 0)
944                                 return log_error_errno(r, "Failed to create mount point %s: %m", where);
945                 }
946
947                 if (mount(*x, where, "bind", MS_BIND, NULL) < 0)
948                         return log_error_errno(errno, "mount(%s) failed: %m", where);
949
950                 if (ro) {
951                         r = bind_remount_recursive(where, true);
952                         if (r < 0)
953                                 return log_error_errno(r, "Read-Only bind mount failed: %m");
954                 }
955         }
956
957         return 0;
958 }
959
/* Mounts the cgroup hierarchy with the mount options "controller" to
 * <dest>/sys/fs/cgroup/<hierarchy>, optionally read-only. Returns 0 if the
 * target already is a mount point, 1 if the hierarchy was freshly mounted,
 * and a negative errno-style value on failure. */
static int mount_cgroup_hierarchy(const char *dest, const char *controller, const char *hierarchy, bool read_only) {
        unsigned long flags = MS_NOSUID|MS_NOEXEC|MS_NODEV;
        char *to;
        int r;

        to = strappenda(dest, "/sys/fs/cgroup/", hierarchy);

        r = path_is_mount_point(to, false);
        if (r < 0)
                return log_error_errno(r, "Failed to determine if %s is mounted already: %m", to);
        if (r > 0)
                return 0;

        /* Best effort only: if this fails the mount() below reports the problem */
        mkdir_p(to, 0755);

        if (read_only)
                flags |= MS_RDONLY;

        if (mount("cgroup", to, "cgroup", flags, controller) < 0)
                return log_error_errno(errno, "Failed to mount to %s: %m", to);

        return 1;
}
979
980 static int mount_cgroup(const char *dest) {
981         _cleanup_set_free_free_ Set *controllers = NULL;
982         _cleanup_free_ char *own_cgroup_path = NULL;
983         const char *cgroup_root, *systemd_root, *systemd_own;
984         int r;
985
986         controllers = set_new(&string_hash_ops);
987         if (!controllers)
988                 return log_oom();
989
990         r = cg_kernel_controllers(controllers);
991         if (r < 0)
992                 return log_error_errno(r, "Failed to determine cgroup controllers: %m");
993
994         r = cg_pid_get_path(NULL, 0, &own_cgroup_path);
995         if (r < 0)
996                 return log_error_errno(r, "Failed to determine our own cgroup path: %m");
997
998         cgroup_root = strappenda(dest, "/sys/fs/cgroup");
999         if (mount("tmpfs", cgroup_root, "tmpfs", MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME, "mode=755") < 0)
1000                 return log_error_errno(errno, "Failed to mount tmpfs to /sys/fs/cgroup: %m");
1001
1002         for (;;) {
1003                 _cleanup_free_ char *controller = NULL, *origin = NULL, *combined = NULL;
1004
1005                 controller = set_steal_first(controllers);
1006                 if (!controller)
1007                         break;
1008
1009                 origin = strappend("/sys/fs/cgroup/", controller);
1010                 if (!origin)
1011                         return log_oom();
1012
1013                 r = readlink_malloc(origin, &combined);
1014                 if (r == -EINVAL) {
1015                         /* Not a symbolic link, but directly a single cgroup hierarchy */
1016
1017                         r = mount_cgroup_hierarchy(dest, controller, controller, true);
1018                         if (r < 0)
1019                                 return r;
1020
1021                 } else if (r < 0)
1022                         return log_error_errno(r, "Failed to read link %s: %m", origin);
1023                 else {
1024                         _cleanup_free_ char *target = NULL;
1025
1026                         target = strjoin(dest, "/sys/fs/cgroup/", controller, NULL);
1027                         if (!target)
1028                                 return log_oom();
1029
1030                         /* A symbolic link, a combination of controllers in one hierarchy */
1031
1032                         if (!filename_is_valid(combined)) {
1033                                 log_warning("Ignoring invalid combined hierarchy %s.", combined);
1034                                 continue;
1035                         }
1036
1037                         r = mount_cgroup_hierarchy(dest, combined, combined, true);
1038                         if (r < 0)
1039                                 return r;
1040
1041                         if (symlink(combined, target) < 0)
1042                                 return log_error_errno(errno, "Failed to create symlink for combined hierarchy: %m");
1043                 }
1044         }
1045
1046         r = mount_cgroup_hierarchy(dest, "name=systemd", "systemd", false);
1047         if (r < 0)
1048                 return r;
1049
1050         /* Make our own cgroup a (writable) bind mount */
1051         systemd_own = strappenda(dest, "/sys/fs/cgroup/systemd", own_cgroup_path);
1052         if (mount(systemd_own, systemd_own,  NULL, MS_BIND, NULL) < 0)
1053                 return log_error_errno(errno, "Failed to turn %s into a bind mount: %m", own_cgroup_path);
1054
1055         /* And then remount the systemd cgroup root read-only */
1056         systemd_root = strappenda(dest, "/sys/fs/cgroup/systemd");
1057         if (mount(NULL, systemd_root, NULL, MS_BIND|MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_RDONLY, NULL) < 0)
1058                 return log_error_errno(errno, "Failed to mount cgroup root read-only: %m");
1059
1060         if (mount(NULL, cgroup_root, NULL, MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME|MS_RDONLY, "mode=755") < 0)
1061                 return log_error_errno(errno, "Failed to remount %s read-only: %m", cgroup_root);
1062
1063         return 0;
1064 }
1065
1066 static int mount_tmpfs(const char *dest) {
1067         char **i, **o;
1068
1069         STRV_FOREACH_PAIR(i, o, arg_tmpfs) {
1070                 _cleanup_free_ char *where = NULL;
1071                 int r;
1072
1073                 where = strappend(dest, *i);
1074                 if (!where)
1075                         return log_oom();
1076
1077                 r = mkdir_label(where, 0755);
1078                 if (r < 0 && r != -EEXIST)
1079                         return log_error_errno(r, "Creating mount point for tmpfs %s failed: %m", where);
1080
1081                 if (mount("tmpfs", where, "tmpfs", MS_NODEV|MS_STRICTATIME, *o) < 0)
1082                         return log_error_errno(errno, "tmpfs mount to %s failed: %m", where);
1083         }
1084
1085         return 0;
1086 }
1087
1088 static int setup_timezone(const char *dest) {
1089         _cleanup_free_ char *where = NULL, *p = NULL, *q = NULL, *check = NULL, *what = NULL;
1090         char *z, *y;
1091         int r;
1092
1093         assert(dest);
1094
1095         /* Fix the timezone, if possible */
1096         r = readlink_malloc("/etc/localtime", &p);
1097         if (r < 0) {
1098                 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
1099                 return 0;
1100         }
1101
1102         z = path_startswith(p, "../usr/share/zoneinfo/");
1103         if (!z)
1104                 z = path_startswith(p, "/usr/share/zoneinfo/");
1105         if (!z) {
1106                 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
1107                 return 0;
1108         }
1109
1110         where = strappend(dest, "/etc/localtime");
1111         if (!where)
1112                 return log_oom();
1113
1114         r = readlink_malloc(where, &q);
1115         if (r >= 0) {
1116                 y = path_startswith(q, "../usr/share/zoneinfo/");
1117                 if (!y)
1118                         y = path_startswith(q, "/usr/share/zoneinfo/");
1119
1120                 /* Already pointing to the right place? Then do nothing .. */
1121                 if (y && streq(y, z))
1122                         return 0;
1123         }
1124
1125         check = strjoin(dest, "/usr/share/zoneinfo/", z, NULL);
1126         if (!check)
1127                 return log_oom();
1128
1129         if (access(check, F_OK) < 0) {
1130                 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
1131                 return 0;
1132         }
1133
1134         what = strappend("../usr/share/zoneinfo/", z);
1135         if (!what)
1136                 return log_oom();
1137
1138         r = mkdir_parents(where, 0755);
1139         if (r < 0) {
1140                 log_error_errno(r, "Failed to create directory for timezone info %s in container: %m", where);
1141
1142                 return 0;
1143         }
1144
1145         r = unlink(where);
1146         if (r < 0 && errno != ENOENT) {
1147                 log_error_errno(errno, "Failed to remove existing timezone info %s in container: %m", where);
1148
1149                 return 0;
1150         }
1151
1152         if (symlink(what, where) < 0) {
1153                 log_error_errno(errno, "Failed to correct timezone of container: %m");
1154                 return 0;
1155         }
1156
1157         return 0;
1158 }
1159
1160 static int setup_resolv_conf(const char *dest) {
1161         _cleanup_free_ char *where = NULL;
1162         int r;
1163
1164         assert(dest);
1165
1166         if (arg_private_network)
1167                 return 0;
1168
1169         /* Fix resolv.conf, if possible */
1170         where = strappend(dest, "/etc/resolv.conf");
1171         if (!where)
1172                 return log_oom();
1173
1174         /* We don't really care for the results of this really. If it
1175          * fails, it fails, but meh... */
1176         r = mkdir_parents(where, 0755);
1177         if (r < 0) {
1178                 log_warning_errno(r, "Failed to create parent directory for resolv.conf %s: %m", where);
1179
1180                 return 0;
1181         }
1182
1183         r = copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW, 0644, 0);
1184         if (r < 0) {
1185                 log_warning_errno(r, "Failed to copy /etc/resolv.conf to %s: %m", where);
1186
1187                 return 0;
1188         }
1189
1190         return 0;
1191 }
1192
1193 static int setup_volatile_state(const char *directory) {
1194         const char *p;
1195         int r;
1196
1197         assert(directory);
1198
1199         if (arg_volatile != VOLATILE_STATE)
1200                 return 0;
1201
1202         /* --volatile=state means we simply overmount /var
1203            with a tmpfs, and the rest read-only. */
1204
1205         r = bind_remount_recursive(directory, true);
1206         if (r < 0)
1207                 return log_error_errno(r, "Failed to remount %s read-only: %m", directory);
1208
1209         p = strappenda(directory, "/var");
1210         r = mkdir(p, 0755);
1211         if (r < 0 && errno != EEXIST)
1212                 return log_error_errno(errno, "Failed to create %s: %m", directory);
1213
1214         if (mount("tmpfs", p, "tmpfs", MS_STRICTATIME, "mode=755") < 0)
1215                 return log_error_errno(errno, "Failed to mount tmpfs to /var: %m");
1216
1217         return 0;
1218 }
1219
1220 static int setup_volatile(const char *directory) {
1221         bool tmpfs_mounted = false, bind_mounted = false;
1222         char template[] = "/tmp/nspawn-volatile-XXXXXX";
1223         const char *f, *t;
1224         int r;
1225
1226         assert(directory);
1227
1228         if (arg_volatile != VOLATILE_YES)
1229                 return 0;
1230
1231         /* --volatile=yes means we mount a tmpfs to the root dir, and
1232            the original /usr to use inside it, and that read-only. */
1233
1234         if (!mkdtemp(template))
1235                 return log_error_errno(errno, "Failed to create temporary directory: %m");
1236
1237         if (mount("tmpfs", template, "tmpfs", MS_STRICTATIME, "mode=755") < 0) {
1238                 log_error_errno(errno, "Failed to mount tmpfs for root directory: %m");
1239                 r = -errno;
1240                 goto fail;
1241         }
1242
1243         tmpfs_mounted = true;
1244
1245         f = strappenda(directory, "/usr");
1246         t = strappenda(template, "/usr");
1247
1248         r = mkdir(t, 0755);
1249         if (r < 0 && errno != EEXIST) {
1250                 log_error_errno(errno, "Failed to create %s: %m", t);
1251                 r = -errno;
1252                 goto fail;
1253         }
1254
1255         if (mount(f, t, "bind", MS_BIND|MS_REC, NULL) < 0) {
1256                 log_error_errno(errno, "Failed to create /usr bind mount: %m");
1257                 r = -errno;
1258                 goto fail;
1259         }
1260
1261         bind_mounted = true;
1262
1263         r = bind_remount_recursive(t, true);
1264         if (r < 0) {
1265                 log_error_errno(r, "Failed to remount %s read-only: %m", t);
1266                 goto fail;
1267         }
1268
1269         if (mount(template, directory, NULL, MS_MOVE, NULL) < 0) {
1270                 log_error_errno(errno, "Failed to move root mount: %m");
1271                 r = -errno;
1272                 goto fail;
1273         }
1274
1275         rmdir(template);
1276
1277         return 0;
1278
1279 fail:
1280         if (bind_mounted)
1281                 umount(t);
1282         if (tmpfs_mounted)
1283                 umount(template);
1284         rmdir(template);
1285         return r;
1286 }
1287
1288 static char* id128_format_as_uuid(sd_id128_t id, char s[37]) {
1289
1290         snprintf(s, 37,
1291                  "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
1292                  SD_ID128_FORMAT_VAL(id));
1293
1294         return s;
1295 }
1296
1297 static int setup_boot_id(const char *dest) {
1298         _cleanup_free_ char *from = NULL, *to = NULL;
1299         sd_id128_t rnd = {};
1300         char as_uuid[37];
1301         int r;
1302
1303         assert(dest);
1304
1305         if (arg_share_system)
1306                 return 0;
1307
1308         /* Generate a new randomized boot ID, so that each boot-up of
1309          * the container gets a new one */
1310
1311         from = strappend(dest, "/dev/proc-sys-kernel-random-boot-id");
1312         to = strappend(dest, "/proc/sys/kernel/random/boot_id");
1313         if (!from || !to)
1314                 return log_oom();
1315
1316         r = sd_id128_randomize(&rnd);
1317         if (r < 0)
1318                 return log_error_errno(r, "Failed to generate random boot id: %m");
1319
1320         id128_format_as_uuid(rnd, as_uuid);
1321
1322         r = write_string_file(from, as_uuid);
1323         if (r < 0)
1324                 return log_error_errno(r, "Failed to write boot id: %m");
1325
1326         if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
1327                 log_error_errno(errno, "Failed to bind mount boot id: %m");
1328                 r = -errno;
1329         } else if (mount(from, to, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL))
1330                 log_warning_errno(errno, "Failed to make boot id read-only: %m");
1331
1332         unlink(from);
1333         return r;
1334 }
1335
1336 static int copy_devnodes(const char *dest) {
1337
1338         static const char devnodes[] =
1339                 "null\0"
1340                 "zero\0"
1341                 "full\0"
1342                 "random\0"
1343                 "urandom\0"
1344                 "tty\0"
1345                 "net/tun\0";
1346
1347         const char *d;
1348         int r = 0;
1349         _cleanup_umask_ mode_t u;
1350
1351         assert(dest);
1352
1353         u = umask(0000);
1354
1355         NULSTR_FOREACH(d, devnodes) {
1356                 _cleanup_free_ char *from = NULL, *to = NULL;
1357                 struct stat st;
1358
1359                 from = strappend("/dev/", d);
1360                 to = strjoin(dest, "/dev/", d, NULL);
1361                 if (!from || !to)
1362                         return log_oom();
1363
1364                 if (stat(from, &st) < 0) {
1365
1366                         if (errno != ENOENT)
1367                                 return log_error_errno(errno, "Failed to stat %s: %m", from);
1368
1369                 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
1370
1371                         log_error("%s is not a char or block device, cannot copy", from);
1372                         return -EIO;
1373
1374                 } else {
1375                         r = mkdir_parents(to, 0775);
1376                         if (r < 0) {
1377                                 log_error_errno(r, "Failed to create parent directory of %s: %m", to);
1378                                 return -r;
1379                         }
1380
1381                         if (mknod(to, st.st_mode, st.st_rdev) < 0)
1382                                 return log_error_errno(errno, "mknod(%s) failed: %m", to);
1383                 }
1384         }
1385
1386         return r;
1387 }
1388
1389 static int setup_ptmx(const char *dest) {
1390         _cleanup_free_ char *p = NULL;
1391
1392         p = strappend(dest, "/dev/ptmx");
1393         if (!p)
1394                 return log_oom();
1395
1396         if (symlink("pts/ptmx", p) < 0)
1397                 return log_error_errno(errno, "Failed to create /dev/ptmx symlink: %m");
1398
1399         return 0;
1400 }
1401
1402 static int setup_dev_console(const char *dest, const char *console) {
1403         _cleanup_umask_ mode_t u;
1404         const char *to;
1405         struct stat st;
1406         int r;
1407
1408         assert(dest);
1409         assert(console);
1410
1411         u = umask(0000);
1412
1413         if (stat("/dev/null", &st) < 0)
1414                 return log_error_errno(errno, "Failed to stat /dev/null: %m");
1415
1416         r = chmod_and_chown(console, 0600, 0, 0);
1417         if (r < 0)
1418                 return log_error_errno(r, "Failed to correct access mode for TTY: %m");
1419
1420         /* We need to bind mount the right tty to /dev/console since
1421          * ptys can only exist on pts file systems. To have something
1422          * to bind mount things on we create a device node first, and
1423          * use /dev/null for that since we the cgroups device policy
1424          * allows us to create that freely, while we cannot create
1425          * /dev/console. (Note that the major minor doesn't actually
1426          * matter here, since we mount it over anyway). */
1427
1428         to = strappenda(dest, "/dev/console");
1429         if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0)
1430                 return log_error_errno(errno, "mknod() for /dev/console failed: %m");
1431
1432         if (mount(console, to, "bind", MS_BIND, NULL) < 0)
1433                 return log_error_errno(errno, "Bind mount for /dev/console failed: %m");
1434
1435         return 0;
1436 }
1437
1438 static int setup_kmsg(const char *dest, int kmsg_socket) {
1439         _cleanup_free_ char *from = NULL, *to = NULL;
1440         _cleanup_umask_ mode_t u;
1441         int r, fd, k;
1442         union {
1443                 struct cmsghdr cmsghdr;
1444                 uint8_t buf[CMSG_SPACE(sizeof(int))];
1445         } control = {};
1446         struct msghdr mh = {
1447                 .msg_control = &control,
1448                 .msg_controllen = sizeof(control),
1449         };
1450         struct cmsghdr *cmsg;
1451
1452         assert(dest);
1453         assert(kmsg_socket >= 0);
1454
1455         u = umask(0000);
1456
1457         /* We create the kmsg FIFO as /dev/kmsg, but immediately
1458          * delete it after bind mounting it to /proc/kmsg. While FIFOs
1459          * on the reading side behave very similar to /proc/kmsg,
1460          * their writing side behaves differently from /dev/kmsg in
1461          * that writing blocks when nothing is reading. In order to
1462          * avoid any problems with containers deadlocking due to this
1463          * we simply make /dev/kmsg unavailable to the container. */
1464         if (asprintf(&from, "%s/dev/kmsg", dest) < 0 ||
1465             asprintf(&to, "%s/proc/kmsg", dest) < 0)
1466                 return log_oom();
1467
1468         if (mkfifo(from, 0600) < 0)
1469                 return log_error_errno(errno, "mkfifo() for /dev/kmsg failed: %m");
1470
1471         r = chmod_and_chown(from, 0600, 0, 0);
1472         if (r < 0)
1473                 return log_error_errno(r, "Failed to correct access mode for /dev/kmsg: %m");
1474
1475         if (mount(from, to, "bind", MS_BIND, NULL) < 0)
1476                 return log_error_errno(errno, "Bind mount for /proc/kmsg failed: %m");
1477
1478         fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
1479         if (fd < 0)
1480                 return log_error_errno(errno, "Failed to open fifo: %m");
1481
1482         cmsg = CMSG_FIRSTHDR(&mh);
1483         cmsg->cmsg_level = SOL_SOCKET;
1484         cmsg->cmsg_type = SCM_RIGHTS;
1485         cmsg->cmsg_len = CMSG_LEN(sizeof(int));
1486         memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
1487
1488         mh.msg_controllen = cmsg->cmsg_len;
1489
1490         /* Store away the fd in the socket, so that it stays open as
1491          * long as we run the child */
1492         k = sendmsg(kmsg_socket, &mh, MSG_NOSIGNAL);
1493         safe_close(fd);
1494
1495         if (k < 0)
1496                 return log_error_errno(errno, "Failed to send FIFO fd: %m");
1497
1498         /* And now make the FIFO unavailable as /dev/kmsg... */
1499         unlink(from);
1500         return 0;
1501 }
1502
1503 static int send_rtnl(int send_fd) {
1504         union {
1505                 struct cmsghdr cmsghdr;
1506                 uint8_t buf[CMSG_SPACE(sizeof(int))];
1507         } control = {};
1508         struct msghdr mh = {
1509                 .msg_control = &control,
1510                 .msg_controllen = sizeof(control),
1511         };
1512         struct cmsghdr *cmsg;
1513         _cleanup_close_ int fd = -1;
1514         ssize_t k;
1515
1516         assert(send_fd >= 0);
1517
1518         if (!arg_expose_ports)
1519                 return 0;
1520
1521         fd = socket(PF_NETLINK, SOCK_RAW|SOCK_CLOEXEC|SOCK_NONBLOCK, NETLINK_ROUTE);
1522         if (fd < 0)
1523                 return log_error_errno(errno, "failed to allocate container netlink: %m");
1524
1525         cmsg = CMSG_FIRSTHDR(&mh);
1526         cmsg->cmsg_level = SOL_SOCKET;
1527         cmsg->cmsg_type = SCM_RIGHTS;
1528         cmsg->cmsg_len = CMSG_LEN(sizeof(int));
1529         memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
1530
1531         mh.msg_controllen = cmsg->cmsg_len;
1532
1533         /* Store away the fd in the socket, so that it stays open as
1534          * long as we run the child */
1535         k = sendmsg(send_fd, &mh, MSG_NOSIGNAL);
1536         if (k < 0)
1537                 return log_error_errno(errno, "Failed to send netlink fd: %m");
1538
1539         return 0;
1540 }
1541
1542 static int flush_ports(union in_addr_union *exposed) {
1543         ExposePort *p;
1544         int r, af = AF_INET;
1545
1546         assert(exposed);
1547
1548         if (!arg_expose_ports)
1549                 return 0;
1550
1551         if (in_addr_is_null(af, exposed))
1552                 return 0;
1553
1554         log_debug("Lost IP address.");
1555
1556         LIST_FOREACH(ports, p, arg_expose_ports) {
1557                 r = fw_add_local_dnat(false,
1558                                       af,
1559                                       p->protocol,
1560                                       NULL,
1561                                       NULL, 0,
1562                                       NULL, 0,
1563                                       p->host_port,
1564                                       exposed,
1565                                       p->container_port,
1566                                       NULL);
1567                 if (r < 0)
1568                         log_warning_errno(r, "Failed to modify firewall: %m");
1569         }
1570
1571         *exposed = IN_ADDR_NULL;
1572         return 0;
1573 }
1574
1575 static int expose_ports(sd_rtnl *rtnl, union in_addr_union *exposed) {
1576         _cleanup_free_ struct local_address *addresses = NULL;
1577         _cleanup_free_ char *pretty = NULL;
1578         union in_addr_union new_exposed;
1579         ExposePort *p;
1580         bool add;
1581         int af = AF_INET, r;
1582
1583         assert(exposed);
1584
1585         /* Invoked each time an address is added or removed inside the
1586          * container */
1587
1588         if (!arg_expose_ports)
1589                 return 0;
1590
1591         r = local_addresses(rtnl, 0, af, &addresses);
1592         if (r < 0)
1593                 return log_error_errno(r, "Failed to enumerate local addresses: %m");
1594
1595         add = r > 0 &&
1596                 addresses[0].family == af &&
1597                 addresses[0].scope < RT_SCOPE_LINK;
1598
1599         if (!add)
1600                 return flush_ports(exposed);
1601
1602         new_exposed = addresses[0].address;
1603         if (in_addr_equal(af, exposed, &new_exposed))
1604                 return 0;
1605
1606         in_addr_to_string(af, &new_exposed, &pretty);
1607         log_debug("New container IP is %s.", strna(pretty));
1608
1609         LIST_FOREACH(ports, p, arg_expose_ports) {
1610
1611                 r = fw_add_local_dnat(true,
1612                                       af,
1613                                       p->protocol,
1614                                       NULL,
1615                                       NULL, 0,
1616                                       NULL, 0,
1617                                       p->host_port,
1618                                       &new_exposed,
1619                                       p->container_port,
1620                                       in_addr_is_null(af, exposed) ? NULL : exposed);
1621                 if (r < 0)
1622                         log_warning_errno(r, "Failed to modify firewall: %m");
1623         }
1624
1625         *exposed = new_exposed;
1626         return 0;
1627 }
1628
1629 static int on_address_change(sd_rtnl *rtnl, sd_rtnl_message *m, void *userdata) {
1630         union in_addr_union *exposed = userdata;
1631
1632         assert(rtnl);
1633         assert(m);
1634         assert(exposed);
1635
1636         expose_ports(rtnl, exposed);
1637         return 0;
1638 }
1639
1640 static int watch_rtnl(sd_event *event, int recv_fd, union in_addr_union *exposed, sd_rtnl **ret) {
1641         union {
1642                 struct cmsghdr cmsghdr;
1643                 uint8_t buf[CMSG_SPACE(sizeof(int))];
1644         } control = {};
1645         struct msghdr mh = {
1646                 .msg_control = &control,
1647                 .msg_controllen = sizeof(control),
1648         };
1649         struct cmsghdr *cmsg;
1650         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1651         int fd, r;
1652         ssize_t k;
1653
1654         assert(event);
1655         assert(recv_fd >= 0);
1656         assert(ret);
1657
1658         if (!arg_expose_ports)
1659                 return 0;
1660
1661         k = recvmsg(recv_fd, &mh, MSG_NOSIGNAL);
1662         if (k < 0)
1663                 return log_error_errno(errno, "Failed to recv netlink fd: %m");
1664
1665         cmsg = CMSG_FIRSTHDR(&mh);
1666         assert(cmsg->cmsg_level == SOL_SOCKET);
1667         assert(cmsg->cmsg_type == SCM_RIGHTS);
1668         assert(cmsg->cmsg_len == CMSG_LEN(sizeof(int)));
1669         memcpy(&fd, CMSG_DATA(cmsg), sizeof(int));
1670
1671         r = sd_rtnl_open_fd(&rtnl, fd, 1, RTNLGRP_IPV4_IFADDR);
1672         if (r < 0) {
1673                 safe_close(fd);
1674                 return log_error_errno(r, "Failed to create rtnl object: %m");
1675         }
1676
1677         r = sd_rtnl_add_match(rtnl, RTM_NEWADDR, on_address_change, exposed);
1678         if (r < 0)
1679                 return log_error_errno(r, "Failed to subscribe to RTM_NEWADDR messages: %m");
1680
1681         r = sd_rtnl_add_match(rtnl, RTM_DELADDR, on_address_change, exposed);
1682         if (r < 0)
1683                 return log_error_errno(r, "Failed to subscribe to RTM_DELADDR messages: %m");
1684
1685         r = sd_rtnl_attach_event(rtnl, event, 0);
1686         if (r < 0)
1687                 return log_error_errno(r, "Failed to add to even loop: %m");
1688
1689         *ret = rtnl;
1690         rtnl = NULL;
1691
1692         return 0;
1693 }
1694
1695 static int setup_hostname(void) {
1696
1697         if (arg_share_system)
1698                 return 0;
1699
1700         if (sethostname_idempotent(arg_machine) < 0)
1701                 return -errno;
1702
1703         return 0;
1704 }
1705
1706 static int setup_journal(const char *directory) {
1707         sd_id128_t machine_id, this_id;
1708         _cleanup_free_ char *p = NULL, *b = NULL, *q = NULL, *d = NULL;
1709         char *id;
1710         int r;
1711
1712         /* Don't link journals in ephemeral mode */
1713         if (arg_ephemeral)
1714                 return 0;
1715
1716         p = strappend(directory, "/etc/machine-id");
1717         if (!p)
1718                 return log_oom();
1719
1720         r = read_one_line_file(p, &b);
1721         if (r == -ENOENT && arg_link_journal == LINK_AUTO)
1722                 return 0;
1723         else if (r < 0)
1724                 return log_error_errno(r, "Failed to read machine ID from %s: %m", p);
1725
1726         id = strstrip(b);
1727         if (isempty(id) && arg_link_journal == LINK_AUTO)
1728                 return 0;
1729
1730         /* Verify validity */
1731         r = sd_id128_from_string(id, &machine_id);
1732         if (r < 0)
1733                 return log_error_errno(r, "Failed to parse machine ID from %s: %m", p);
1734
1735         r = sd_id128_get_machine(&this_id);
1736         if (r < 0)
1737                 return log_error_errno(r, "Failed to retrieve machine ID: %m");
1738
1739         if (sd_id128_equal(machine_id, this_id)) {
1740                 log_full(arg_link_journal == LINK_AUTO ? LOG_WARNING : LOG_ERR,
1741                          "Host and machine ids are equal (%s): refusing to link journals", id);
1742                 if (arg_link_journal == LINK_AUTO)
1743                         return 0;
1744                 return -EEXIST;
1745         }
1746
1747         if (arg_link_journal == LINK_NO)
1748                 return 0;
1749
1750         free(p);
1751         p = strappend("/var/log/journal/", id);
1752         q = strjoin(directory, "/var/log/journal/", id, NULL);
1753         if (!p || !q)
1754                 return log_oom();
1755
1756         if (path_is_mount_point(p, false) > 0) {
1757                 if (arg_link_journal != LINK_AUTO) {
1758                         log_error("%s: already a mount point, refusing to use for journal", p);
1759                         return -EEXIST;
1760                 }
1761
1762                 return 0;
1763         }
1764
1765         if (path_is_mount_point(q, false) > 0) {
1766                 if (arg_link_journal != LINK_AUTO) {
1767                         log_error("%s: already a mount point, refusing to use for journal", q);
1768                         return -EEXIST;
1769                 }
1770
1771                 return 0;
1772         }
1773
1774         r = readlink_and_make_absolute(p, &d);
1775         if (r >= 0) {
1776                 if ((arg_link_journal == LINK_GUEST ||
1777                      arg_link_journal == LINK_AUTO) &&
1778                     path_equal(d, q)) {
1779
1780                         r = mkdir_p(q, 0755);
1781                         if (r < 0)
1782                                 log_warning_errno(errno, "Failed to create directory %s: %m", q);
1783                         return 0;
1784                 }
1785
1786                 if (unlink(p) < 0)
1787                         return log_error_errno(errno, "Failed to remove symlink %s: %m", p);
1788         } else if (r == -EINVAL) {
1789
1790                 if (arg_link_journal == LINK_GUEST &&
1791                     rmdir(p) < 0) {
1792
1793                         if (errno == ENOTDIR) {
1794                                 log_error("%s already exists and is neither a symlink nor a directory", p);
1795                                 return r;
1796                         } else {
1797                                 log_error_errno(errno, "Failed to remove %s: %m", p);
1798                                 return -errno;
1799                         }
1800                 }
1801         } else if (r != -ENOENT) {
1802                 log_error_errno(errno, "readlink(%s) failed: %m", p);
1803                 return r;
1804         }
1805
1806         if (arg_link_journal == LINK_GUEST) {
1807
1808                 if (symlink(q, p) < 0) {
1809                         if (arg_link_journal_try) {
1810                                 log_debug_errno(errno, "Failed to symlink %s to %s, skipping journal setup: %m", q, p);
1811                                 return 0;
1812                         } else {
1813                                 log_error_errno(errno, "Failed to symlink %s to %s: %m", q, p);
1814                                 return -errno;
1815                         }
1816                 }
1817
1818                 r = mkdir_p(q, 0755);
1819                 if (r < 0)
1820                         log_warning_errno(errno, "Failed to create directory %s: %m", q);
1821                 return 0;
1822         }
1823
1824         if (arg_link_journal == LINK_HOST) {
1825                 /* don't create parents here -- if the host doesn't have
1826                  * permanent journal set up, don't force it here */
1827                 r = mkdir(p, 0755);
1828                 if (r < 0) {
1829                         if (arg_link_journal_try) {
1830                                 log_debug_errno(errno, "Failed to create %s, skipping journal setup: %m", p);
1831                                 return 0;
1832                         } else {
1833                                 log_error_errno(errno, "Failed to create %s: %m", p);
1834                                 return r;
1835                         }
1836                 }
1837
1838         } else if (access(p, F_OK) < 0)
1839                 return 0;
1840
1841         if (dir_is_empty(q) == 0)
1842                 log_warning("%s is not empty, proceeding anyway.", q);
1843
1844         r = mkdir_p(q, 0755);
1845         if (r < 0) {
1846                 log_error_errno(errno, "Failed to create %s: %m", q);
1847                 return r;
1848         }
1849
1850         if (mount(p, q, "bind", MS_BIND, NULL) < 0)
1851                 return log_error_errno(errno, "Failed to bind mount journal from host into guest: %m");
1852
1853         return 0;
1854 }
1855
1856 static int drop_capabilities(void) {
1857         return capability_bounding_set_drop(~arg_retain, false);
1858 }
1859
1860 static int register_machine(pid_t pid, int local_ifindex) {
1861         _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1862         _cleanup_bus_close_unref_ sd_bus *bus = NULL;
1863         int r;
1864
1865         if (!arg_register)
1866                 return 0;
1867
1868         r = sd_bus_default_system(&bus);
1869         if (r < 0)
1870                 return log_error_errno(r, "Failed to open system bus: %m");
1871
1872         if (arg_keep_unit) {
1873                 r = sd_bus_call_method(
1874                                 bus,
1875                                 "org.freedesktop.machine1",
1876                                 "/org/freedesktop/machine1",
1877                                 "org.freedesktop.machine1.Manager",
1878                                 "RegisterMachineWithNetwork",
1879                                 &error,
1880                                 NULL,
1881                                 "sayssusai",
1882                                 arg_machine,
1883                                 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1884                                 "nspawn",
1885                                 "container",
1886                                 (uint32_t) pid,
1887                                 strempty(arg_directory),
1888                                 local_ifindex > 0 ? 1 : 0, local_ifindex);
1889         } else {
1890                 _cleanup_bus_message_unref_ sd_bus_message *m = NULL;
1891
1892                 r = sd_bus_message_new_method_call(
1893                                 bus,
1894                                 &m,
1895                                 "org.freedesktop.machine1",
1896                                 "/org/freedesktop/machine1",
1897                                 "org.freedesktop.machine1.Manager",
1898                                 "CreateMachineWithNetwork");
1899                 if (r < 0)
1900                         return log_error_errno(r, "Failed to create message: %m");
1901
1902                 r = sd_bus_message_append(
1903                                 m,
1904                                 "sayssusai",
1905                                 arg_machine,
1906                                 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1907                                 "nspawn",
1908                                 "container",
1909                                 (uint32_t) pid,
1910                                 strempty(arg_directory),
1911                                 local_ifindex > 0 ? 1 : 0, local_ifindex);
1912                 if (r < 0)
1913                         return log_error_errno(r, "Failed to append message arguments: %m");
1914
1915                 r = sd_bus_message_open_container(m, 'a', "(sv)");
1916                 if (r < 0)
1917                         return log_error_errno(r, "Failed to open container: %m");
1918
1919                 if (!isempty(arg_slice)) {
1920                         r = sd_bus_message_append(m, "(sv)", "Slice", "s", arg_slice);
1921                         if (r < 0)
1922                                 return log_error_errno(r, "Failed to append slice: %m");
1923                 }
1924
1925                 r = sd_bus_message_append(m, "(sv)", "DevicePolicy", "s", "strict");
1926                 if (r < 0)
1927                         return log_error_errno(r, "Failed to add device policy: %m");
1928
1929                 r = sd_bus_message_append(m, "(sv)", "DeviceAllow", "a(ss)", 9,
1930                                           /* Allow the container to
1931                                            * access and create the API
1932                                            * device nodes, so that
1933                                            * PrivateDevices= in the
1934                                            * container can work
1935                                            * fine */
1936                                           "/dev/null", "rwm",
1937                                           "/dev/zero", "rwm",
1938                                           "/dev/full", "rwm",
1939                                           "/dev/random", "rwm",
1940                                           "/dev/urandom", "rwm",
1941                                           "/dev/tty", "rwm",
1942                                           "/dev/net/tun", "rwm",
1943                                           /* Allow the container
1944                                            * access to ptys. However,
1945                                            * do not permit the
1946                                            * container to ever create
1947                                            * these device nodes. */
1948                                           "/dev/pts/ptmx", "rw",
1949                                           "char-pts", "rw");
1950                 if (r < 0)
1951                         return log_error_errno(r, "Failed to add device whitelist: %m");
1952
1953                 r = sd_bus_message_close_container(m);
1954                 if (r < 0)
1955                         return log_error_errno(r, "Failed to close container: %m");
1956
1957                 r = sd_bus_call(bus, m, 0, &error, NULL);
1958         }
1959
1960         if (r < 0) {
1961                 log_error("Failed to register machine: %s", bus_error_message(&error, r));
1962                 return r;
1963         }
1964
1965         return 0;
1966 }
1967
1968 static int terminate_machine(pid_t pid) {
1969         _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1970         _cleanup_bus_message_unref_ sd_bus_message *reply = NULL;
1971         _cleanup_bus_close_unref_ sd_bus *bus = NULL;
1972         const char *path;
1973         int r;
1974
1975         if (!arg_register)
1976                 return 0;
1977
1978         r = sd_bus_default_system(&bus);
1979         if (r < 0)
1980                 return log_error_errno(r, "Failed to open system bus: %m");
1981
1982         r = sd_bus_call_method(
1983                         bus,
1984                         "org.freedesktop.machine1",
1985                         "/org/freedesktop/machine1",
1986                         "org.freedesktop.machine1.Manager",
1987                         "GetMachineByPID",
1988                         &error,
1989                         &reply,
1990                         "u",
1991                         (uint32_t) pid);
1992         if (r < 0) {
1993                 /* Note that the machine might already have been
1994                  * cleaned up automatically, hence don't consider it a
1995                  * failure if we cannot get the machine object. */
1996                 log_debug("Failed to get machine: %s", bus_error_message(&error, r));
1997                 return 0;
1998         }
1999
2000         r = sd_bus_message_read(reply, "o", &path);
2001         if (r < 0)
2002                 return bus_log_parse_error(r);
2003
2004         r = sd_bus_call_method(
2005                         bus,
2006                         "org.freedesktop.machine1",
2007                         path,
2008                         "org.freedesktop.machine1.Machine",
2009                         "Terminate",
2010                         &error,
2011                         NULL,
2012                         NULL);
2013         if (r < 0) {
2014                 log_debug("Failed to terminate machine: %s", bus_error_message(&error, r));
2015                 return 0;
2016         }
2017
2018         return 0;
2019 }
2020
2021 static int reset_audit_loginuid(void) {
2022         _cleanup_free_ char *p = NULL;
2023         int r;
2024
2025         if (arg_share_system)
2026                 return 0;
2027
2028         r = read_one_line_file("/proc/self/loginuid", &p);
2029         if (r == -ENOENT)
2030                 return 0;
2031         if (r < 0)
2032                 return log_error_errno(r, "Failed to read /proc/self/loginuid: %m");
2033
2034         /* Already reset? */
2035         if (streq(p, "4294967295"))
2036                 return 0;
2037
2038         r = write_string_file("/proc/self/loginuid", "4294967295");
2039         if (r < 0) {
2040                 log_error("Failed to reset audit login UID. This probably means that your kernel is too\n"
2041                           "old and you have audit enabled. Note that the auditing subsystem is known to\n"
2042                           "be incompatible with containers on old kernels. Please make sure to upgrade\n"
2043                           "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
2044                           "using systemd-nspawn. Sleeping for 5s... (%s)\n", strerror(-r));
2045
2046                 sleep(5);
2047         }
2048
2049         return 0;
2050 }
2051
/* Fixed 128bit keys that are passed to generate_mac() as hash_key: one for
 * the host side and one for the container side of the veth link, and one
 * for MACVLAN interfaces. */
#define HOST_HASH_KEY SD_ID128_MAKE(1a,37,6f,c7,46,ec,45,0b,ad,a3,d5,31,06,60,5d,b1)
#define CONTAINER_HASH_KEY SD_ID128_MAKE(c3,c4,f9,19,b5,57,b2,1c,e6,cf,14,27,03,9c,ee,a2)
#define MACVLAN_HASH_KEY SD_ID128_MAKE(00,13,6d,bc,66,83,44,81,bb,0c,f9,51,1f,24,a6,6f)
2055
2056 static int generate_mac(struct ether_addr *mac, sd_id128_t hash_key, uint64_t idx) {
2057         uint8_t result[8];
2058         size_t l, sz;
2059         uint8_t *v, *i;
2060         int r;
2061
2062         l = strlen(arg_machine);
2063         sz = sizeof(sd_id128_t) + l;
2064         if (idx > 0)
2065                 sz += sizeof(idx);
2066
2067         v = alloca(sz);
2068
2069         /* fetch some persistent data unique to the host */
2070         r = sd_id128_get_machine((sd_id128_t*) v);
2071         if (r < 0)
2072                 return r;
2073
2074         /* combine with some data unique (on this host) to this
2075          * container instance */
2076         i = mempcpy(v + sizeof(sd_id128_t), arg_machine, l);
2077         if (idx > 0) {
2078                 idx = htole64(idx);
2079                 memcpy(i, &idx, sizeof(idx));
2080         }
2081
2082         /* Let's hash the host machine ID plus the container name. We
2083          * use a fixed, but originally randomly created hash key here. */
2084         siphash24(result, v, sz, hash_key.bytes);
2085
2086         assert_cc(ETH_ALEN <= sizeof(result));
2087         memcpy(mac->ether_addr_octet, result, ETH_ALEN);
2088
2089         /* see eth_random_addr in the kernel */
2090         mac->ether_addr_octet[0] &= 0xfe;        /* clear multicast bit */
2091         mac->ether_addr_octet[0] |= 0x02;        /* set local assignment bit (IEEE802) */
2092
2093         return 0;
2094 }
2095
2096 static int setup_veth(pid_t pid, char iface_name[IFNAMSIZ], int *ifi) {
2097         _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
2098         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
2099         struct ether_addr mac_host, mac_container;
2100         int r, i;
2101
2102         if (!arg_private_network)
2103                 return 0;
2104
2105         if (!arg_network_veth)
2106                 return 0;
2107
2108         /* Use two different interface name prefixes depending whether
2109          * we are in bridge mode or not. */
2110         snprintf(iface_name, IFNAMSIZ - 1, "%s-%s",
2111                  arg_network_bridge ? "vb" : "ve", arg_machine);
2112
2113         r = generate_mac(&mac_container, CONTAINER_HASH_KEY, 0);
2114         if (r < 0)
2115                 return log_error_errno(r, "Failed to generate predictable MAC address for container side: %m");
2116
2117         r = generate_mac(&mac_host, HOST_HASH_KEY, 0);
2118         if (r < 0)
2119                 return log_error_errno(r, "Failed to generate predictable MAC address for host side: %m");
2120
2121         r = sd_rtnl_open(&rtnl, 0);
2122         if (r < 0)
2123                 return log_error_errno(r, "Failed to connect to netlink: %m");
2124
2125         r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
2126         if (r < 0)
2127                 return log_error_errno(r, "Failed to allocate netlink message: %m");
2128
2129         r = sd_rtnl_message_append_string(m, IFLA_IFNAME, iface_name);
2130         if (r < 0)
2131                 return log_error_errno(r, "Failed to add netlink interface name: %m");
2132
2133         r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac_host);
2134         if (r < 0)
2135                 return log_error_errno(r, "Failed to add netlink MAC address: %m");
2136
2137         r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
2138         if (r < 0)
2139                 return log_error_errno(r, "Failed to open netlink container: %m");
2140
2141         r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "veth");
2142         if (r < 0)
2143                 return log_error_errno(r, "Failed to open netlink container: %m");
2144
2145         r = sd_rtnl_message_open_container(m, VETH_INFO_PEER);
2146         if (r < 0)
2147                 return log_error_errno(r, "Failed to open netlink container: %m");
2148
2149         r = sd_rtnl_message_append_string(m, IFLA_IFNAME, "host0");
2150         if (r < 0)
2151                 return log_error_errno(r, "Failed to add netlink interface name: %m");
2152
2153         r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac_container);
2154         if (r < 0)
2155                 return log_error_errno(r, "Failed to add netlink MAC address: %m");
2156
2157         r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
2158         if (r < 0)
2159                 return log_error_errno(r, "Failed to add netlink namespace field: %m");
2160
2161         r = sd_rtnl_message_close_container(m);
2162         if (r < 0)
2163                 return log_error_errno(r, "Failed to close netlink container: %m");
2164
2165         r = sd_rtnl_message_close_container(m);
2166         if (r < 0)
2167                 return log_error_errno(r, "Failed to close netlink container: %m");
2168
2169         r = sd_rtnl_message_close_container(m);
2170         if (r < 0)
2171                 return log_error_errno(r, "Failed to close netlink container: %m");
2172
2173         r = sd_rtnl_call(rtnl, m, 0, NULL);
2174         if (r < 0)
2175                 return log_error_errno(r, "Failed to add new veth interfaces: %m");
2176
2177         i = (int) if_nametoindex(iface_name);
2178         if (i <= 0)
2179                 return log_error_errno(errno, "Failed to resolve interface %s: %m", iface_name);
2180
2181         *ifi = i;
2182
2183         return 0;
2184 }
2185
2186 static int setup_bridge(const char veth_name[], int *ifi) {
2187         _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
2188         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
2189         int r, bridge;
2190
2191         if (!arg_private_network)
2192                 return 0;
2193
2194         if (!arg_network_veth)
2195                 return 0;
2196
2197         if (!arg_network_bridge)
2198                 return 0;
2199
2200         bridge = (int) if_nametoindex(arg_network_bridge);
2201         if (bridge <= 0)
2202                 return log_error_errno(errno, "Failed to resolve interface %s: %m", arg_network_bridge);
2203
2204         *ifi = bridge;
2205
2206         r = sd_rtnl_open(&rtnl, 0);
2207         if (r < 0)
2208                 return log_error_errno(r, "Failed to connect to netlink: %m");
2209
2210         r = sd_rtnl_message_new_link(rtnl, &m, RTM_SETLINK, 0);
2211         if (r < 0)
2212                 return log_error_errno(r, "Failed to allocate netlink message: %m");
2213
2214         r = sd_rtnl_message_link_set_flags(m, IFF_UP, IFF_UP);
2215         if (r < 0)
2216                 return log_error_errno(r, "Failed to set IFF_UP flag: %m");
2217
2218         r = sd_rtnl_message_append_string(m, IFLA_IFNAME, veth_name);
2219         if (r < 0)
2220                 return log_error_errno(r, "Failed to add netlink interface name field: %m");
2221
2222         r = sd_rtnl_message_append_u32(m, IFLA_MASTER, bridge);
2223         if (r < 0)
2224                 return log_error_errno(r, "Failed to add netlink master field: %m");
2225
2226         r = sd_rtnl_call(rtnl, m, 0, NULL);
2227         if (r < 0)
2228                 return log_error_errno(r, "Failed to add veth interface to bridge: %m");
2229
2230         return 0;
2231 }
2232
2233 static int parse_interface(struct udev *udev, const char *name) {
2234         _cleanup_udev_device_unref_ struct udev_device *d = NULL;
2235         char ifi_str[2 + DECIMAL_STR_MAX(int)];
2236         int ifi;
2237
2238         ifi = (int) if_nametoindex(name);
2239         if (ifi <= 0)
2240                 return log_error_errno(errno, "Failed to resolve interface %s: %m", name);
2241
2242         sprintf(ifi_str, "n%i", ifi);
2243         d = udev_device_new_from_device_id(udev, ifi_str);
2244         if (!d)
2245                 return log_error_errno(errno, "Failed to get udev device for interface %s: %m", name);
2246
2247         if (udev_device_get_is_initialized(d) <= 0) {
2248                 log_error("Network interface %s is not initialized yet.", name);
2249                 return -EBUSY;
2250         }
2251
2252         return ifi;
2253 }
2254
2255 static int move_network_interfaces(pid_t pid) {
2256         _cleanup_udev_unref_ struct udev *udev = NULL;
2257         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
2258         char **i;
2259         int r;
2260
2261         if (!arg_private_network)
2262                 return 0;
2263
2264         if (strv_isempty(arg_network_interfaces))
2265                 return 0;
2266
2267         r = sd_rtnl_open(&rtnl, 0);
2268         if (r < 0)
2269                 return log_error_errno(r, "Failed to connect to netlink: %m");
2270
2271         udev = udev_new();
2272         if (!udev) {
2273                 log_error("Failed to connect to udev.");
2274                 return -ENOMEM;
2275         }
2276
2277         STRV_FOREACH(i, arg_network_interfaces) {
2278                 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
2279                 int ifi;
2280
2281                 ifi = parse_interface(udev, *i);
2282                 if (ifi < 0)
2283                         return ifi;
2284
2285                 r = sd_rtnl_message_new_link(rtnl, &m, RTM_SETLINK, ifi);
2286                 if (r < 0)
2287                         return log_error_errno(r, "Failed to allocate netlink message: %m");
2288
2289                 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
2290                 if (r < 0)
2291                         return log_error_errno(r, "Failed to append namespace PID to netlink message: %m");
2292
2293                 r = sd_rtnl_call(rtnl, m, 0, NULL);
2294                 if (r < 0)
2295                         return log_error_errno(r, "Failed to move interface %s to namespace: %m", *i);
2296         }
2297
2298         return 0;
2299 }
2300
2301 static int setup_macvlan(pid_t pid) {
2302         _cleanup_udev_unref_ struct udev *udev = NULL;
2303         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
2304         unsigned idx = 0;
2305         char **i;
2306         int r;
2307
2308         if (!arg_private_network)
2309                 return 0;
2310
2311         if (strv_isempty(arg_network_macvlan))
2312                 return 0;
2313
2314         r = sd_rtnl_open(&rtnl, 0);
2315         if (r < 0)
2316                 return log_error_errno(r, "Failed to connect to netlink: %m");
2317
2318         udev = udev_new();
2319         if (!udev) {
2320                 log_error("Failed to connect to udev.");
2321                 return -ENOMEM;
2322         }
2323
2324         STRV_FOREACH(i, arg_network_macvlan) {
2325                 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
2326                 _cleanup_free_ char *n = NULL;
2327                 struct ether_addr mac;
2328                 int ifi;
2329
2330                 ifi = parse_interface(udev, *i);
2331                 if (ifi < 0)
2332                         return ifi;
2333
2334                 r = generate_mac(&mac, MACVLAN_HASH_KEY, idx++);
2335                 if (r < 0)
2336                         return log_error_errno(r, "Failed to create MACVLAN MAC address: %m");
2337
2338                 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
2339                 if (r < 0)
2340                         return log_error_errno(r, "Failed to allocate netlink message: %m");
2341
2342                 r = sd_rtnl_message_append_u32(m, IFLA_LINK, ifi);
2343                 if (r < 0)
2344                         return log_error_errno(r, "Failed to add netlink interface index: %m");
2345
2346                 n = strappend("mv-", *i);
2347                 if (!n)
2348                         return log_oom();
2349
2350                 strshorten(n, IFNAMSIZ-1);
2351
2352                 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, n);
2353                 if (r < 0)
2354                         return log_error_errno(r, "Failed to add netlink interface name: %m");
2355
2356                 r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac);
2357                 if (r < 0)
2358                         return log_error_errno(r, "Failed to add netlink MAC address: %m");
2359
2360                 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
2361                 if (r < 0)
2362                         return log_error_errno(r, "Failed to add netlink namespace field: %m");
2363
2364                 r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
2365                 if (r < 0)
2366                         return log_error_errno(r, "Failed to open netlink container: %m");
2367
2368                 r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "macvlan");
2369                 if (r < 0)
2370                         return log_error_errno(r, "Failed to open netlink container: %m");
2371
2372                 r = sd_rtnl_message_append_u32(m, IFLA_MACVLAN_MODE, MACVLAN_MODE_BRIDGE);
2373                 if (r < 0)
2374                         return log_error_errno(r, "Failed to append macvlan mode: %m");
2375
2376                 r = sd_rtnl_message_close_container(m);
2377                 if (r < 0)
2378                         return log_error_errno(r, "Failed to close netlink container: %m");
2379
2380                 r = sd_rtnl_message_close_container(m);
2381                 if (r < 0)
2382                         return log_error_errno(r, "Failed to close netlink container: %m");
2383
2384                 r = sd_rtnl_call(rtnl, m, 0, NULL);
2385                 if (r < 0)
2386                         return log_error_errno(r, "Failed to add new macvlan interfaces: %m");
2387         }
2388
2389         return 0;
2390 }
2391
2392 static int setup_ipvlan(pid_t pid) {
2393         _cleanup_udev_unref_ struct udev *udev = NULL;
2394         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
2395         char **i;
2396         int r;
2397
2398         if (!arg_private_network)
2399                 return 0;
2400
2401         if (strv_isempty(arg_network_ipvlan))
2402                 return 0;
2403
2404         r = sd_rtnl_open(&rtnl, 0);
2405         if (r < 0)
2406                 return log_error_errno(r, "Failed to connect to netlink: %m");
2407
2408         udev = udev_new();
2409         if (!udev) {
2410                 log_error("Failed to connect to udev.");
2411                 return -ENOMEM;
2412         }
2413
2414         STRV_FOREACH(i, arg_network_ipvlan) {
2415                 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
2416                 _cleanup_free_ char *n = NULL;
2417                 int ifi;
2418
2419                 ifi = parse_interface(udev, *i);
2420                 if (ifi < 0)
2421                         return ifi;
2422
2423                 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
2424                 if (r < 0)
2425                         return log_error_errno(r, "Failed to allocate netlink message: %m");
2426
2427                 r = sd_rtnl_message_append_u32(m, IFLA_LINK, ifi);
2428                 if (r < 0)
2429                         return log_error_errno(r, "Failed to add netlink interface index: %m");
2430
2431                 n = strappend("iv-", *i);
2432                 if (!n)
2433                         return log_oom();
2434
2435                 strshorten(n, IFNAMSIZ-1);
2436
2437                 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, n);
2438                 if (r < 0)
2439                         return log_error_errno(r, "Failed to add netlink interface name: %m");
2440
2441                 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
2442                 if (r < 0)
2443                         return log_error_errno(r, "Failed to add netlink namespace field: %m");
2444
2445                 r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
2446                 if (r < 0)
2447                         return log_error_errno(r, "Failed to open netlink container: %m");
2448
2449                 r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "ipvlan");
2450                 if (r < 0)
2451                         return log_error_errno(r, "Failed to open netlink container: %m");
2452
2453                 r = sd_rtnl_message_append_u16(m, IFLA_IPVLAN_MODE, IPVLAN_MODE_L2);
2454                 if (r < 0)
2455                         return log_error_errno(r, "Failed to add ipvlan mode: %m");
2456
2457                 r = sd_rtnl_message_close_container(m);
2458                 if (r < 0)
2459                         return log_error_errno(r, "Failed to close netlink container: %m");
2460
2461                 r = sd_rtnl_message_close_container(m);
2462                 if (r < 0)
2463                         return log_error_errno(r, "Failed to close netlink container: %m");
2464
2465                 r = sd_rtnl_call(rtnl, m, 0, NULL);
2466                 if (r < 0)
2467                         return log_error_errno(r, "Failed to add new ipvlan interfaces: %m");
2468         }
2469
2470         return 0;
2471 }
2472
/* Builds and loads a default-allow seccomp filter for the calling process.
 *
 * The filter makes a fixed list of system calls fail with EPERM and makes
 * creation of NETLINK_AUDIT sockets fail with EAFNOSUPPORT.
 *
 * Returns 0 on success, and always when built without HAVE_SECCOMP, or a
 * negative errno-style value on failure. */
static int setup_seccomp(void) {

#ifdef HAVE_SECCOMP
        static const int blocked_syscalls[] = {
                SCMP_SYS(kexec_load),
                SCMP_SYS(open_by_handle_at),
                SCMP_SYS(init_module),
                SCMP_SYS(finit_module),
                SCMP_SYS(delete_module),
                SCMP_SYS(iopl),
                SCMP_SYS(ioperm),
                SCMP_SYS(swapon),
                SCMP_SYS(swapoff),
        };

        scmp_filter_ctx ctx;
        unsigned k;
        int r;

        /* Anything not explicitly filtered below stays allowed */
        ctx = seccomp_init(SCMP_ACT_ALLOW);
        if (!ctx)
                return log_oom();

        r = seccomp_add_secondary_archs(ctx);
        if (r < 0) {
                log_error_errno(r, "Failed to add secondary archs to seccomp filter: %m");
                goto out;
        }

        for (k = 0; k < ELEMENTSOF(blocked_syscalls); k++) {
                r = seccomp_rule_add(ctx, SCMP_ACT_ERRNO(EPERM), blocked_syscalls[k], 0);

                /* -EFAULT is treated as "unknown syscall" and is skipped silently */
                if (r < 0 && r != -EFAULT) {
                        log_error_errno(r, "Failed to block syscall: %m");
                        goto out;
                }
        }

        /* Audit does not work in containers, and much of the userspace audit
         * hookup fails when run inside one. We do not care about that and
         * simply prevent audit sockets from being created at all.
         *
         * The rule below makes socket(AF_NETLINK, *, NETLINK_AUDIT) fail with
         * EAFNOSUPPORT, which audit userspace takes as the indication that
         * audit is disabled in the kernel. */
        r = seccomp_rule_add(
                        ctx,
                        SCMP_ACT_ERRNO(EAFNOSUPPORT),
                        SCMP_SYS(socket),
                        2,
                        SCMP_A0(SCMP_CMP_EQ, AF_NETLINK),
                        SCMP_A2(SCMP_CMP_EQ, NETLINK_AUDIT));
        if (r < 0) {
                log_error_errno(r, "Failed to add audit seccomp rule: %m");
                goto out;
        }

        /* Do not have NO_NEW_PRIVS set when the filter is loaded */
        r = seccomp_attr_set(ctx, SCMP_FLTATR_CTL_NNP, 0);
        if (r < 0) {
                log_error_errno(r, "Failed to unset NO_NEW_PRIVS: %m");
                goto out;
        }

        r = seccomp_load(ctx);
        if (r < 0)
                log_error_errno(r, "Failed to install seccomp audit filter: %m");

out:
        /* The filter context is released on both the success and error paths */
        seccomp_release(ctx);
        return r;
#else
        return 0;
#endif

}
2552
2553 static int setup_propagate(const char *root) {
2554         const char *p, *q;
2555
2556         (void) mkdir_p("/run/systemd/nspawn/", 0755);
2557         (void) mkdir_p("/run/systemd/nspawn/propagate", 0600);
2558         p = strappenda("/run/systemd/nspawn/propagate/", arg_machine);
2559         (void) mkdir_p(p, 0600);
2560
2561         q = strappenda(root, "/run/systemd/nspawn/incoming");
2562         mkdir_parents(q, 0755);
2563         mkdir_p(q, 0600);
2564
2565         if (mount(p, q, NULL, MS_BIND, NULL) < 0)
2566                 return log_error_errno(errno, "Failed to install propagation bind mount.");
2567
2568         if (mount(NULL, q, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY, NULL) < 0)
2569                 return log_error_errno(errno, "Failed to make propagation mount read-only");
2570
2571         return 0;
2572 }
2573
2574 static int setup_image(char **device_path, int *loop_nr) {
2575         struct loop_info64 info = {
2576                 .lo_flags = LO_FLAGS_AUTOCLEAR|LO_FLAGS_PARTSCAN
2577         };
2578         _cleanup_close_ int fd = -1, control = -1, loop = -1;
2579         _cleanup_free_ char* loopdev = NULL;
2580         struct stat st;
2581         int r, nr;
2582
2583         assert(device_path);
2584         assert(loop_nr);
2585         assert(arg_image);
2586
2587         fd = open(arg_image, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
2588         if (fd < 0)
2589                 return log_error_errno(errno, "Failed to open %s: %m", arg_image);
2590
2591         if (fstat(fd, &st) < 0)
2592                 return log_error_errno(errno, "Failed to stat %s: %m", arg_image);
2593
2594         if (S_ISBLK(st.st_mode)) {
2595                 char *p;
2596
2597                 p = strdup(arg_image);
2598                 if (!p)
2599                         return log_oom();
2600
2601                 *device_path = p;
2602
2603                 *loop_nr = -1;
2604
2605                 r = fd;
2606                 fd = -1;
2607
2608                 return r;
2609         }
2610
2611         if (!S_ISREG(st.st_mode)) {
2612                 log_error_errno(errno, "%s is not a regular file or block device: %m", arg_image);
2613                 return -EINVAL;
2614         }
2615
2616         control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
2617         if (control < 0)
2618                 return log_error_errno(errno, "Failed to open /dev/loop-control: %m");
2619
2620         nr = ioctl(control, LOOP_CTL_GET_FREE);
2621         if (nr < 0)
2622                 return log_error_errno(errno, "Failed to allocate loop device: %m");
2623
2624         if (asprintf(&loopdev, "/dev/loop%i", nr) < 0)
2625                 return log_oom();
2626
2627         loop = open(loopdev, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
2628         if (loop < 0)
2629                 return log_error_errno(errno, "Failed to open loop device %s: %m", loopdev);
2630
2631         if (ioctl(loop, LOOP_SET_FD, fd) < 0)
2632                 return log_error_errno(errno, "Failed to set loopback file descriptor on %s: %m", loopdev);
2633
2634         if (arg_read_only)
2635                 info.lo_flags |= LO_FLAGS_READ_ONLY;
2636
2637         if (ioctl(loop, LOOP_SET_STATUS64, &info) < 0)
2638                 return log_error_errno(errno, "Failed to set loopback settings on %s: %m", loopdev);
2639
2640         *device_path = loopdev;
2641         loopdev = NULL;
2642
2643         *loop_nr = nr;
2644
2645         r = loop;
2646         loop = -1;
2647
2648         return r;
2649 }
2650
/* Explanatory text appended to the error messages about disk images whose
 * partition table cannot be used. */
#define PARTITION_TABLE_BLURB \
        "Note that the disk image needs to either contain only a single MBR partition of\n" \
        "type 0x83 that is marked bootable, or a single GPT partition of type\n" \
        "0FC63DAF-8483-4772-8E79-3D69D8477DE4 or follow\n" \
        "    http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/\n" \
        "to be bootable with systemd-nspawn."
2657
/* Probes the partition table of the block device open as "fd" and picks the
 * partitions to use for the root, /home and /srv file systems.
 *
 * GPT images: partitions flagged GPT_FLAG_NO_AUTO are ignored. For each of the
 * types GPT_HOME, GPT_SRV, GPT_ROOT_NATIVE and GPT_ROOT_SECONDARY the matching
 * partition with the lowest partition number wins. GPT_LINUX_GENERIC
 * partitions are only used as root when no dedicated root partition exists,
 * and then only if there is exactly one of them.
 *
 * MBR images: only partitions of type 0x83 whose flags equal 0x80 (bootable)
 * are considered, and exactly one of them must exist.
 *
 * On success, *root_device receives the device node of the selected root
 * partition, *root_device_rw whether it may be mounted writable, and
 * *secondary whether it is of type GPT_ROOT_SECONDARY. *home_device and
 * *srv_device (with their *_rw flags) are only written when such a partition
 * was found. All returned strings are newly allocated and owned by the caller.
 *
 * Returns 0 on success and a negative errno-style value on failure; -ENOTSUP
 * when built without HAVE_BLKID. */
static int dissect_image(
                int fd,
                char **root_device, bool *root_device_rw,
                char **home_device, bool *home_device_rw,
                char **srv_device, bool *srv_device_rw,
                bool *secondary) {

#ifdef HAVE_BLKID
        int home_nr = -1, srv_nr = -1;
#ifdef GPT_ROOT_NATIVE
        int root_nr = -1;
#endif
#ifdef GPT_ROOT_SECONDARY
        int secondary_root_nr = -1;
#endif
        _cleanup_free_ char *home = NULL, *root = NULL, *secondary_root = NULL, *srv = NULL, *generic = NULL;
        _cleanup_udev_enumerate_unref_ struct udev_enumerate *e = NULL;
        _cleanup_udev_device_unref_ struct udev_device *d = NULL;
        _cleanup_blkid_free_probe_ blkid_probe b = NULL;
        _cleanup_udev_unref_ struct udev *udev = NULL;
        struct udev_list_entry *first, *item;
        bool home_rw = true, root_rw = true, secondary_root_rw = true, srv_rw = true, generic_rw = true;
        bool is_gpt, is_mbr, multiple_generic = false;
        const char *pttype = NULL;
        blkid_partlist pl;
        struct stat st;
        unsigned i;
        int r;

        assert(fd >= 0);
        assert(root_device);
        assert(home_device);
        assert(srv_device);
        assert(secondary);
        assert(arg_image);

        b = blkid_new_probe();
        if (!b)
                return log_oom();

        /* errno is cleared first so that a failure that leaves it untouched
         * can be told apart; that case is reported as out-of-memory. */
        errno = 0;
        r = blkid_probe_set_device(b, fd, 0, 0);
        if (r != 0) {
                if (errno == 0)
                        return log_oom();

                log_error_errno(errno, "Failed to set device on blkid probe: %m");
                return -errno;
        }

        blkid_probe_enable_partitions(b, 1);
        blkid_probe_set_partitions_flags(b, BLKID_PARTS_ENTRY_DETAILS);

        errno = 0;
        r = blkid_do_safeprobe(b);
        if (r == -2 || r == 1) {
                /* Ambivalent result, or nothing detected at all */
                log_error("Failed to identify any partition table on\n"
                          "    %s\n"
                          PARTITION_TABLE_BLURB, arg_image);
                return -EINVAL;
        } else if (r != 0) {
                if (errno == 0)
                        errno = EIO;
                log_error_errno(errno, "Failed to probe: %m");
                return -errno;
        }

        blkid_probe_lookup_value(b, "PTTYPE", &pttype, NULL);

        is_gpt = streq_ptr(pttype, "gpt");
        is_mbr = streq_ptr(pttype, "dos");

        if (!is_gpt && !is_mbr) {
                log_error("No GPT or MBR partition table discovered on\n"
                          "    %s\n"
                          PARTITION_TABLE_BLURB, arg_image);
                return -EINVAL;
        }

        errno = 0;
        pl = blkid_probe_get_partitions(b);
        if (!pl) {
                if (errno == 0)
                        return log_oom();

                log_error("Failed to list partitions of %s", arg_image);
                return -errno;
        }

        udev = udev_new();
        if (!udev)
                return log_oom();

        if (fstat(fd, &st) < 0)
                return log_error_errno(errno, "Failed to stat block device: %m");

        d = udev_device_new_from_devnum(udev, 'b', st.st_rdev);
        if (!d)
                return log_oom();

        /* Wait until the kernel has created a device for every partition
         * blkid found. At most 10 attempts are made. */
        for (i = 0;; i++) {
                int n, m;

                if (i >= 10) {
                        log_error("Kernel partitions never appeared.");
                        return -ENXIO;
                }

                e = udev_enumerate_new(udev);
                if (!e)
                        return log_oom();

                r = udev_enumerate_add_match_parent(e, d);
                if (r < 0)
                        return log_oom();

                r = udev_enumerate_scan_devices(e);
                if (r < 0)
                        return log_error_errno(r, "Failed to scan for partition devices of %s: %m", arg_image);

                /* Count the partitions enumerated by the kernel */
                n = 0;
                first = udev_enumerate_get_list_entry(e);
                udev_list_entry_foreach(item, first)
                        n++;

                /* Count the partitions enumerated by blkid. The comparison
                 * below expects the kernel list to have one entry more. */
                m = blkid_partlist_numof_partitions(pl);
                if (n == m + 1)
                        break;
                if (n > m + 1) {
                        log_error("blkid and kernel partition list do not match.");
                        return -EIO;
                }
                if (n < m + 1) {
                        unsigned j;

                        /* The kernel has probed fewer partitions than
                         * blkid? Maybe the kernel prober is still
                         * running or it got EBUSY because udev
                         * already opened the device. Let's reprobe
                         * the device, which is a synchronous call
                         * that waits until probing is complete. */

                        for (j = 0; j < 20; j++) {

                                r = ioctl(fd, BLKRRPART, 0);
                                if (r < 0)
                                        r = -errno;
                                if (r >= 0 || r != -EBUSY)
                                        break;

                                /* If something else has the device
                                 * open, such as an udev rule, the
                                 * ioctl will return EBUSY. Since
                                 * there's no way to wait until it
                                 * isn't busy anymore, let's just wait
                                 * a bit, and try again.
                                 *
                                 * This is really something they
                                 * should fix in the kernel! */

                                usleep(50 * USEC_PER_MSEC);
                        }

                        if (r < 0)
                                return log_error_errno(r, "Failed to reread partition table: %m");
                }

                /* Drop this enumeration; the next iteration creates a new one */
                e = udev_enumerate_unref(e);
        }

        /* Classify every partition device found by the last enumeration */
        first = udev_enumerate_get_list_entry(e);
        udev_list_entry_foreach(item, first) {
                _cleanup_udev_device_unref_ struct udev_device *q;
                const char *node;
                unsigned long long flags;
                blkid_partition pp;
                dev_t qn;
                int nr;

                errno = 0;
                q = udev_device_new_from_syspath(udev, udev_list_entry_get_name(item));
                if (!q) {
                        if (!errno)
                                errno = ENOMEM;

                        log_error_errno(errno, "Failed to get partition device of %s: %m", arg_image);
                        return -errno;
                }

                qn = udev_device_get_devnum(q);
                if (major(qn) == 0)
                        continue;

                /* Skip the whole-disk device itself */
                if (st.st_rdev == qn)
                        continue;

                node = udev_device_get_devnode(q);
                if (!node)
                        continue;

                pp = blkid_partlist_devno_to_partition(pl, qn);
                if (!pp)
                        continue;

                flags = blkid_partition_get_flags(pp);

                nr = blkid_partition_get_partno(pp);
                if (nr < 0)
                        continue;

                if (is_gpt) {
                        sd_id128_t type_id;
                        const char *stype;

                        if (flags & GPT_FLAG_NO_AUTO)
                                continue;

                        stype = blkid_partition_get_type_string(pp);
                        if (!stype)
                                continue;

                        if (sd_id128_from_string(stype, &type_id) < 0)
                                continue;

                        /* For each dedicated type, keep the partition with
                         * the lowest partition number. */
                        if (sd_id128_equal(type_id, GPT_HOME)) {

                                if (home && nr >= home_nr)
                                        continue;

                                home_nr = nr;
                                home_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&home, node);
                                if (r < 0)
                                        return log_oom();

                        } else if (sd_id128_equal(type_id, GPT_SRV)) {

                                if (srv && nr >= srv_nr)
                                        continue;

                                srv_nr = nr;
                                srv_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&srv, node);
                                if (r < 0)
                                        return log_oom();
                        }
#ifdef GPT_ROOT_NATIVE
                        else if (sd_id128_equal(type_id, GPT_ROOT_NATIVE)) {

                                if (root && nr >= root_nr)
                                        continue;

                                root_nr = nr;
                                root_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&root, node);
                                if (r < 0)
                                        return log_oom();
                        }
#endif
#ifdef GPT_ROOT_SECONDARY
                        else if (sd_id128_equal(type_id, GPT_ROOT_SECONDARY)) {

                                if (secondary_root && nr >= secondary_root_nr)
                                        continue;

                                secondary_root_nr = nr;
                                secondary_root_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&secondary_root, node);
                                if (r < 0)
                                        return log_oom();
                        }
#endif
                        else if (sd_id128_equal(type_id, GPT_LINUX_GENERIC)) {

                                if (generic)
                                        multiple_generic = true;
                                else {
                                        generic_rw = !(flags & GPT_FLAG_READ_ONLY);

                                        r = free_and_strdup(&generic, node);
                                        if (r < 0)
                                                return log_oom();
                                }
                        }

                } else if (is_mbr) {
                        int type;

                        if (flags != 0x80) /* Bootable flag */
                                continue;

                        type = blkid_partition_get_type(pp);
                        if (type != 0x83) /* Linux partition */
                                continue;

                        if (generic)
                                multiple_generic = true;
                        else {
                                generic_rw = true;

                                /* Record the candidate in "generic", the
                                 * variable tested above. Storing it anywhere
                                 * else would leave "generic" unset, so a
                                 * second bootable Linux partition would go
                                 * unnoticed instead of being rejected. */
                                r = free_and_strdup(&generic, node);
                                if (r < 0)
                                        return log_oom();
                        }
                }
        }

        /* Pick the root device: native root, then secondary root, then a
         * single generic Linux partition. */
        if (root) {
                *root_device = root;
                root = NULL;

                *root_device_rw = root_rw;
                *secondary = false;
        } else if (secondary_root) {
                *root_device = secondary_root;
                secondary_root = NULL;

                *root_device_rw = secondary_root_rw;
                *secondary = true;
        } else if (generic) {

                /* There were no partitions with precise meanings
                 * around, but we found generic partitions. In this
                 * case, if there's only one, we can go ahead and boot
                 * it, otherwise we bail out, because we really cannot
                 * make any sense of it. */

                if (multiple_generic) {
                        log_error("Identified multiple bootable Linux partitions on\n"
                                  "    %s\n"
                                  PARTITION_TABLE_BLURB, arg_image);
                        return -EINVAL;
                }

                *root_device = generic;
                generic = NULL;

                *root_device_rw = generic_rw;
                *secondary = false;
        } else {
                log_error("Failed to identify root partition in disk image\n"
                          "    %s\n"
                          PARTITION_TABLE_BLURB, arg_image);
                return -EINVAL;
        }

        /* /home and /srv are optional; their outputs stay untouched if absent */
        if (home) {
                *home_device = home;
                home = NULL;

                *home_device_rw = home_rw;
        }

        if (srv) {
                *srv_device = srv;
                srv = NULL;

                *srv_device_rw = srv_rw;
        }

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -ENOTSUP;
#endif
}
3030
/* Mounts the block device "what" below "where".
 *
 * The mount point is "where" with "directory" appended, or just "where" if
 * "directory" is NULL. The file system type is detected with libblkid. The
 * mount is done with MS_NODEV, and read-only if "rw" is false or
 * arg_read_only is set. LUKS volumes are refused with -ENOTSUP.
 *
 * Returns 0 on success and a negative errno-style value on failure; -ENOTSUP
 * when built without HAVE_BLKID. */
static int mount_device(const char *what, const char *where, const char *directory, bool rw) {
#ifdef HAVE_BLKID
        _cleanup_blkid_free_probe_ blkid_probe b = NULL;
        const char *fstype, *p;
        int r;

        assert(what);
        assert(where);

        /* A read-only container overrides the per-partition setting */
        if (arg_read_only)
                rw = false;

        if (directory)
                p = strappenda(where, directory);
        else
                p = where;

        errno = 0;
        b = blkid_new_probe_from_filename(what);
        if (!b) {
                if (errno == 0)
                        return log_oom();
                log_error_errno(errno, "Failed to allocate prober for %s: %m", what);
                return -errno;
        }

        blkid_probe_enable_superblocks(b, 1);
        blkid_probe_set_superblocks_flags(b, BLKID_SUBLKS_TYPE);

        errno = 0;
        r = blkid_do_safeprobe(b);
        if (r == -2 || r == 1) {
                /* -2: ambivalent result, 1: nothing detected. Genuine probe
                 * errors (-1) are handled by the branch below so that the
                 * underlying errno gets reported. */
                log_error("Cannot determine file system type of %s", what);
                return -EINVAL;
        } else if (r != 0) {
                if (errno == 0)
                        errno = EIO;
                log_error_errno(errno, "Failed to probe %s: %m", what);
                return -errno;
        }

        errno = 0;
        if (blkid_probe_lookup_value(b, "TYPE", &fstype, NULL) < 0) {
                if (errno == 0)
                        errno = EINVAL;
                log_error("Failed to determine file system type of %s", what);
                return -errno;
        }

        if (streq(fstype, "crypto_LUKS")) {
                log_error("nspawn currently does not support LUKS disk images.");
                return -ENOTSUP;
        }

        if (mount(what, p, fstype, MS_NODEV|(rw ? 0 : MS_RDONLY), NULL) < 0)
                return log_error_errno(errno, "Failed to mount %s: %m", what);

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -ENOTSUP;
#endif
}
3094
/* Mounts the dissected partitions of a disk image below "where".
 *
 * The root device is mounted on "where" itself, the home device on
 * "<where>/home" and the srv device on "<where>/srv". Each device that is
 * NULL is skipped. The *_rw flags are passed through to mount_device().
 *
 * Returns 0 on success, or the negative errno-style value of the first
 * mount that failed. */
static int mount_devices(
                const char *where,
                const char *root_device, bool root_device_rw,
                const char *home_device, bool home_device_rw,
                const char *srv_device, bool srv_device_rw) {
        int r;

        assert(where);

        /* Mount relative to the "where" argument rather than reaching for a
         * global, so that the function honours its own parameter. */

        if (root_device) {
                r = mount_device(root_device, where, NULL, root_device_rw);
                if (r < 0)
                        return log_error_errno(r, "Failed to mount root directory: %m");
        }

        if (home_device) {
                r = mount_device(home_device, where, "/home", home_device_rw);
                if (r < 0)
                        return log_error_errno(r, "Failed to mount home directory: %m");
        }

        if (srv_device) {
                r = mount_device(srv_device, where, "/srv", srv_device_rw);
                if (r < 0)
                        return log_error_errno(r, "Failed to mount server data directory: %m");
        }

        return 0;
}
3124
3125 static void loop_remove(int nr, int *image_fd) {
3126         _cleanup_close_ int control = -1;
3127         int r;
3128
3129         if (nr < 0)
3130                 return;
3131
3132         if (image_fd && *image_fd >= 0) {
3133                 r = ioctl(*image_fd, LOOP_CLR_FD);
3134                 if (r < 0)
3135                         log_debug_errno(errno, "Failed to close loop image: %m");
3136                 *image_fd = safe_close(*image_fd);
3137         }
3138
3139         control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
3140         if (control < 0) {
3141                 log_warning_errno(errno, "Failed to open /dev/loop-control: %m");
3142                 return;
3143         }
3144
3145         r = ioctl(control, LOOP_CTL_REMOVE, nr);
3146         if (r < 0)
3147                 log_debug_errno(errno, "Failed to remove loop %d: %m", nr);
3148 }
3149
3150 static int spawn_getent(const char *database, const char *key, pid_t *rpid) {
3151         int pipe_fds[2];
3152         pid_t pid;
3153
3154         assert(database);
3155         assert(key);
3156         assert(rpid);
3157
3158         if (pipe2(pipe_fds, O_CLOEXEC) < 0)
3159                 return log_error_errno(errno, "Failed to allocate pipe: %m");
3160
3161         pid = fork();
3162         if (pid < 0)
3163                 return log_error_errno(errno, "Failed to fork getent child: %m");
3164         else if (pid == 0) {
3165                 int nullfd;
3166                 char *empty_env = NULL;
3167
3168                 if (dup3(pipe_fds[1], STDOUT_FILENO, 0) < 0)
3169                         _exit(EXIT_FAILURE);
3170
3171                 if (pipe_fds[0] > 2)
3172                         safe_close(pipe_fds[0]);
3173                 if (pipe_fds[1] > 2)
3174                         safe_close(pipe_fds[1]);
3175
3176                 nullfd = open("/dev/null", O_RDWR);
3177                 if (nullfd < 0)
3178                         _exit(EXIT_FAILURE);
3179
3180                 if (dup3(nullfd, STDIN_FILENO, 0) < 0)
3181                         _exit(EXIT_FAILURE);
3182
3183                 if (dup3(nullfd, STDERR_FILENO, 0) < 0)
3184                         _exit(EXIT_FAILURE);
3185
3186                 if (nullfd > 2)
3187                         safe_close(nullfd);
3188
3189                 reset_all_signal_handlers();
3190                 close_all_fds(NULL, 0);
3191
3192                 execle("/usr/bin/getent", "getent", database, key, NULL, &empty_env);
3193                 execle("/bin/getent", "getent", database, key, NULL, &empty_env);
3194                 _exit(EXIT_FAILURE);
3195         }
3196
3197         pipe_fds[1] = safe_close(pipe_fds[1]);
3198
3199         *rpid = pid;
3200
3201         return pipe_fds[0];
3202 }
3203
3204 static int change_uid_gid(char **_home) {
3205         char line[LINE_MAX], *x, *u, *g, *h;
3206         const char *word, *state;
3207         _cleanup_free_ uid_t *uids = NULL;
3208         _cleanup_free_ char *home = NULL;
3209         _cleanup_fclose_ FILE *f = NULL;
3210         _cleanup_close_ int fd = -1;
3211         unsigned n_uids = 0;
3212         size_t sz = 0, l;
3213         uid_t uid;
3214         gid_t gid;
3215         pid_t pid;
3216         int r;
3217
3218         assert(_home);
3219
3220         if (!arg_user || streq(arg_user, "root") || streq(arg_user, "0")) {
3221                 /* Reset everything fully to 0, just in case */
3222
3223                 if (setgroups(0, NULL) < 0)
3224                         return log_error_errno(errno, "setgroups() failed: %m");
3225
3226                 if (setresgid(0, 0, 0) < 0)
3227                         return log_error_errno(errno, "setregid() failed: %m");
3228
3229                 if (setresuid(0, 0, 0) < 0)
3230                         return log_error_errno(errno, "setreuid() failed: %m");
3231
3232                 *_home = NULL;
3233                 return 0;
3234         }
3235
3236         /* First, get user credentials */
3237         fd = spawn_getent("passwd", arg_user, &pid);
3238         if (fd < 0)
3239                 return fd;
3240
3241         f = fdopen(fd, "r");
3242         if (!f)
3243                 return log_oom();
3244         fd = -1;
3245
3246         if (!fgets(line, sizeof(line), f)) {
3247
3248                 if (!ferror(f)) {
3249                         log_error("Failed to resolve user %s.", arg_user);
3250                         return -ESRCH;
3251                 }
3252
3253                 log_error_errno(errno, "Failed to read from getent: %m");
3254                 return -errno;
3255         }
3256
3257         truncate_nl(line);
3258
3259         wait_for_terminate_and_warn("getent passwd", pid, true);
3260
3261         x = strchr(line, ':');
3262         if (!x) {
3263                 log_error("/etc/passwd entry has invalid user field.");
3264                 return -EIO;
3265         }
3266
3267         u = strchr(x+1, ':');
3268         if (!u) {
3269                 log_error("/etc/passwd entry has invalid password field.");
3270                 return -EIO;
3271         }
3272
3273         u++;
3274         g = strchr(u, ':');
3275         if (!g) {
3276                 log_error("/etc/passwd entry has invalid UID field.");
3277                 return -EIO;
3278         }
3279
3280         *g = 0;
3281         g++;
3282         x = strchr(g, ':');
3283         if (!x) {
3284                 log_error("/etc/passwd entry has invalid GID field.");
3285                 return -EIO;
3286         }
3287
3288         *x = 0;
3289         h = strchr(x+1, ':');
3290         if (!h) {
3291                 log_error("/etc/passwd entry has invalid GECOS field.");
3292                 return -EIO;
3293         }
3294
3295         h++;
3296         x = strchr(h, ':');
3297         if (!x) {
3298                 log_error("/etc/passwd entry has invalid home directory field.");
3299                 return -EIO;
3300         }
3301
3302         *x = 0;
3303
3304         r = parse_uid(u, &uid);
3305         if (r < 0) {
3306                 log_error("Failed to parse UID of user.");
3307                 return -EIO;
3308         }
3309
3310         r = parse_gid(g, &gid);
3311         if (r < 0) {
3312                 log_error("Failed to parse GID of user.");
3313                 return -EIO;
3314         }
3315
3316         home = strdup(h);
3317         if (!home)
3318                 return log_oom();
3319
3320         /* Second, get group memberships */
3321         fd = spawn_getent("initgroups", arg_user, &pid);
3322         if (fd < 0)
3323                 return fd;
3324
3325         fclose(f);
3326         f = fdopen(fd, "r");
3327         if (!f)
3328                 return log_oom();
3329         fd = -1;
3330
3331         if (!fgets(line, sizeof(line), f)) {
3332                 if (!ferror(f)) {
3333                         log_error("Failed to resolve user %s.", arg_user);
3334                         return -ESRCH;
3335                 }
3336
3337                 log_error_errno(errno, "Failed to read from getent: %m");
3338                 return -errno;
3339         }
3340
3341         truncate_nl(line);
3342
3343         wait_for_terminate_and_warn("getent initgroups", pid, true);
3344
3345         /* Skip over the username and subsequent separator whitespace */
3346         x = line;
3347         x += strcspn(x, WHITESPACE);
3348         x += strspn(x, WHITESPACE);
3349
3350         FOREACH_WORD(word, l, x, state) {
3351                 char c[l+1];
3352
3353                 memcpy(c, word, l);
3354                 c[l] = 0;
3355
3356                 if (!GREEDY_REALLOC(uids, sz, n_uids+1))
3357                         return log_oom();
3358
3359                 r = parse_uid(c, &uids[n_uids++]);
3360                 if (r < 0) {
3361                         log_error("Failed to parse group data from getent.");
3362                         return -EIO;
3363                 }
3364         }
3365
3366         r = mkdir_parents(home, 0775);
3367         if (r < 0)
3368                 return log_error_errno(r, "Failed to make home root directory: %m");
3369
3370         r = mkdir_safe(home, 0755, uid, gid);
3371         if (r < 0 && r != -EEXIST)
3372                 return log_error_errno(r, "Failed to make home directory: %m");
3373
3374         fchown(STDIN_FILENO, uid, gid);
3375         fchown(STDOUT_FILENO, uid, gid);
3376         fchown(STDERR_FILENO, uid, gid);
3377
3378         if (setgroups(n_uids, uids) < 0)
3379                 return log_error_errno(errno, "Failed to set auxiliary groups: %m");
3380
3381         if (setresgid(gid, gid, gid) < 0)
3382                 return log_error_errno(errno, "setregid() failed: %m");
3383
3384         if (setresuid(uid, uid, uid) < 0)
3385                 return log_error_errno(errno, "setreuid() failed: %m");
3386
3387         if (_home) {
3388                 *_home = home;
3389                 home = NULL;
3390         }
3391
3392         return 0;
3393 }
3394
3395 /*
3396  * Return values:
3397  * < 0 : wait_for_terminate() failed to get the state of the
3398  *       container, the container was terminated by a signal, or
3399  *       failed for an unknown reason.  No change is made to the
3400  *       container argument.
3401  * > 0 : The program executed in the container terminated with an
3402  *       error.  The exit code of the program executed in the
3403  *       container is returned.  The container argument has been set
3404  *       to CONTAINER_TERMINATED.
3405  *   0 : The container is being rebooted, has been shut down or exited
3406  *       successfully.  The container argument has been set to either
3407  *       CONTAINER_TERMINATED or CONTAINER_REBOOTED.
3408  *
3409  * That is, success is indicated by a return value of zero, and an
3410  * error is indicated by a non-zero value.
3411  */
3412 static int wait_for_container(pid_t pid, ContainerStatus *container) {
3413         siginfo_t status;
3414         int r;
3415
3416         r = wait_for_terminate(pid, &status);
3417         if (r < 0)
3418                 return log_warning_errno(r, "Failed to wait for container: %m");
3419
3420         switch (status.si_code) {
3421
3422         case CLD_EXITED:
3423                 if (status.si_status == 0) {
3424                         log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s exited successfully.", arg_machine);
3425
3426                 } else
3427                         log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s failed with error code %i.", arg_machine, status.si_status);
3428
3429                 *container = CONTAINER_TERMINATED;
3430                 return status.si_status;
3431
3432         case CLD_KILLED:
3433                 if (status.si_status == SIGINT) {
3434
3435                         log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s has been shut down.", arg_machine);
3436                         *container = CONTAINER_TERMINATED;
3437                         return 0;
3438
3439                 } else if (status.si_status == SIGHUP) {
3440
3441                         log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s is being rebooted.", arg_machine);
3442                         *container = CONTAINER_REBOOTED;
3443                         return 0;
3444                 }
3445
3446                 /* CLD_KILLED fallthrough */
3447
3448         case CLD_DUMPED:
3449                 log_error("Container %s terminated by signal %s.", arg_machine, signal_to_string(status.si_status));
3450                 return -EIO;
3451
3452         default:
3453                 log_error("Container %s failed due to unknown reason.", arg_machine);
3454                 return -EIO;
3455         }
3456
3457         return r;
3458 }
3459
/* Signal handler that deliberately does nothing; installing it (rather
 * than ignoring the signal) lets the signal interrupt blocking calls. */
static void nop_handler(int sig) {
        /* intentionally empty */
}
3461
3462 static int on_orderly_shutdown(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) {
3463         pid_t pid;
3464
3465         pid = PTR_TO_UINT32(userdata);
3466         if (pid > 0) {
3467                 if (kill(pid, SIGRTMIN+3) >= 0) {
3468                         log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
3469                         sd_event_source_set_userdata(s, NULL);
3470                         return 0;
3471                 }
3472         }
3473
3474         sd_event_exit(sd_event_source_get_event(s), 0);
3475         return 0;
3476 }
3477
3478 static int determine_names(void) {
3479         int r;
3480
3481         if (!arg_image && !arg_directory) {
3482                 if (arg_machine) {
3483                         _cleanup_(image_unrefp) Image *i = NULL;
3484
3485                         r = image_find(arg_machine, &i);
3486                         if (r < 0)
3487                                 return log_error_errno(r, "Failed to find image for machine '%s': %m", arg_machine);
3488                         else if (r == 0) {
3489                                 log_error("No image for machine '%s': %m", arg_machine);
3490                                 return -ENOENT;
3491                         }
3492
3493                         if (i->type == IMAGE_RAW)
3494                                 r = set_sanitized_path(&arg_image, i->path);
3495                         else
3496                                 r = set_sanitized_path(&arg_directory, i->path);
3497                         if (r < 0)
3498                                 return log_error_errno(r, "Invalid image directory: %m");
3499
3500                         arg_read_only = arg_read_only || i->read_only;
3501                 } else
3502                         arg_directory = get_current_dir_name();
3503
3504                 if (!arg_directory && !arg_machine) {
3505                         log_error("Failed to determine path, please use -D or -i.");
3506                         return -EINVAL;
3507                 }
3508         }
3509
3510         if (!arg_machine) {
3511                 if (arg_directory && path_equal(arg_directory, "/"))
3512                         arg_machine = gethostname_malloc();
3513                 else
3514                         arg_machine = strdup(basename(arg_image ?: arg_directory));
3515
3516                 if (!arg_machine)
3517                         return log_oom();
3518
3519                 hostname_cleanup(arg_machine, false);
3520                 if (!machine_name_is_valid(arg_machine)) {
3521                         log_error("Failed to determine machine name automatically, please use -M.");
3522                         return -EINVAL;
3523                 }
3524
3525                 if (arg_ephemeral) {
3526                         char *b;
3527
3528                         /* Add a random suffix when this is an
3529                          * ephemeral machine, so that we can run many
3530                          * instances at once without manually having
3531                          * to specify -M each time. */
3532
3533                         if (asprintf(&b, "%s-%016" PRIx64, arg_machine, random_u64()) < 0)
3534                                 return log_oom();
3535
3536                         free(arg_machine);
3537                         arg_machine = b;
3538                 }
3539         }
3540
3541         return 0;
3542 }
3543
3544 int main(int argc, char *argv[]) {
3545
3546         _cleanup_free_ char *device_path = NULL, *root_device = NULL, *home_device = NULL, *srv_device = NULL, *console = NULL;
3547         bool root_device_rw = true, home_device_rw = true, srv_device_rw = true;
3548         _cleanup_close_ int master = -1, image_fd = -1;
3549         _cleanup_fdset_free_ FDSet *fds = NULL;
3550         int r, n_fd_passed, loop_nr = -1;
3551         char veth_name[IFNAMSIZ];
3552         bool secondary = false, remove_subvol = false;
3553         sigset_t mask, mask_chld;
3554         pid_t pid = 0;
3555         int ret = EXIT_SUCCESS;
3556         union in_addr_union exposed = {};
3557         _cleanup_release_lock_file_ LockFile tree_global_lock = LOCK_FILE_INIT, tree_local_lock = LOCK_FILE_INIT;
3558
3559         log_parse_environment();
3560         log_open();
3561
3562         r = parse_argv(argc, argv);
3563         if (r <= 0)
3564                 goto finish;
3565
3566         r = determine_names();
3567         if (r < 0)
3568                 goto finish;
3569
3570         if (geteuid() != 0) {
3571                 log_error("Need to be root.");
3572                 r = -EPERM;
3573                 goto finish;
3574         }
3575
3576         if (sd_booted() <= 0) {
3577                 log_error("Not running on a systemd system.");
3578                 r = -EINVAL;
3579                 goto finish;
3580         }
3581
3582         log_close();
3583         n_fd_passed = sd_listen_fds(false);
3584         if (n_fd_passed > 0) {
3585                 r = fdset_new_listen_fds(&fds, false);
3586                 if (r < 0) {
3587                         log_error_errno(r, "Failed to collect file descriptors: %m");
3588                         goto finish;
3589                 }
3590         }
3591         fdset_close_others(fds);
3592         log_open();
3593
3594         if (arg_directory) {
3595                 assert(!arg_image);
3596
3597                 if (path_equal(arg_directory, "/") && !arg_ephemeral) {
3598                         log_error("Spawning container on root directory is not supported. Consider using --ephemeral.");
3599                         r = -EINVAL;
3600                         goto finish;
3601                 }
3602
3603                 if (arg_ephemeral) {
3604                         _cleanup_release_lock_file_ LockFile original_lock = LOCK_FILE_INIT;
3605                         char *np;
3606
3607                         /* If the specified path is a mount point we
3608                          * generate the new snapshot immediately
3609                          * inside it under a random name. However if
3610                          * the specified is not a mount point we
3611                          * create the new snapshot in the parent
3612                          * directory, just next to it. */
3613                         r = path_is_mount_point(arg_directory, false);
3614                         if (r < 0) {
3615                                 log_error_errno(r, "Failed to determine whether directory %s is mount point: %m", arg_directory);
3616                                 goto finish;
3617                         }
3618                         if (r > 0)
3619                                 r = tempfn_random_child(arg_directory, &np);
3620                         else
3621                                 r = tempfn_random(arg_directory, &np);
3622                         if (r < 0) {
3623                                 log_error_errno(r, "Failed to generate name for snapshot: %m");
3624                                 goto finish;
3625                         }
3626
3627                         r = image_path_lock(np, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3628                         if (r < 0) {
3629                                 log_error_errno(r, "Failed to lock %s: %m", np);
3630                                 goto finish;
3631                         }
3632
3633                         r = btrfs_subvol_snapshot(arg_directory, np, arg_read_only, true);
3634                         if (r < 0) {
3635                                 free(np);
3636                                 log_error_errno(r, "Failed to create snapshot %s from %s: %m", np, arg_directory);
3637                                 goto finish;
3638                         }
3639
3640                         free(arg_directory);
3641                         arg_directory = np;
3642
3643                         remove_subvol = true;
3644
3645                 } else {
3646                         r = image_path_lock(arg_directory, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3647                         if (r == -EBUSY) {
3648                                 log_error_errno(r, "Directory tree %s is currently busy.", arg_directory);
3649                                 goto finish;
3650                         }
3651                         if (r < 0) {
3652                                 log_error_errno(r, "Failed to lock %s: %m", arg_directory);
3653                                 return r;
3654                         }
3655
3656                         if (arg_template) {
3657                                 r = btrfs_subvol_snapshot(arg_template, arg_directory, arg_read_only, true);
3658                                 if (r == -EEXIST) {
3659                                         if (!arg_quiet)
3660                                                 log_info("Directory %s already exists, not populating from template %s.", arg_directory, arg_template);
3661                                 } else if (r < 0) {
3662                                         log_error_errno(r, "Couldn't create snapshot %s from %s: %m", arg_directory, arg_template);
3663                                         goto finish;
3664                                 } else {
3665                                         if (!arg_quiet)
3666                                                 log_info("Populated %s from template %s.", arg_directory, arg_template);
3667                                 }
3668                         }
3669                 }
3670
3671                 if (arg_boot) {
3672                         if (path_is_os_tree(arg_directory) <= 0) {
3673                                 log_error("Directory %s doesn't look like an OS root directory (os-release file is missing). Refusing.", arg_directory);
3674                                 r = -EINVAL;
3675                                 goto finish;
3676                         }
3677                 } else {
3678                         const char *p;
3679
3680                         p = strappenda(arg_directory,
3681                                        argc > optind && path_is_absolute(argv[optind]) ? argv[optind] : "/usr/bin/");
3682                         if (access(p, F_OK) < 0) {
3683                                 log_error("Directory %s lacks the binary to execute or doesn't look like a binary tree. Refusing.", arg_directory);
3684                                 r = -EINVAL;
3685                                 goto finish;
3686                         }
3687                 }
3688
3689         } else {
3690                 char template[] = "/tmp/nspawn-root-XXXXXX";
3691
3692                 assert(arg_image);
3693                 assert(!arg_template);
3694
3695                 r = image_path_lock(arg_image, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3696                 if (r == -EBUSY) {
3697                         r = log_error_errno(r, "Disk image %s is currently busy.", arg_image);
3698                         goto finish;
3699                 }
3700                 if (r < 0) {
3701                         r = log_error_errno(r, "Failed to create image lock: %m");
3702                         goto finish;
3703                 }
3704
3705                 if (!mkdtemp(template)) {
3706                         log_error_errno(errno, "Failed to create temporary directory: %m");
3707                         r = -errno;
3708                         goto finish;
3709                 }
3710
3711                 arg_directory = strdup(template);
3712                 if (!arg_directory) {
3713                         r = log_oom();
3714                         goto finish;
3715                 }
3716
3717                 image_fd = setup_image(&device_path, &loop_nr);
3718                 if (image_fd < 0) {
3719                         r = image_fd;
3720                         goto finish;
3721                 }
3722
3723                 r = dissect_image(image_fd,
3724                                   &root_device, &root_device_rw,
3725                                   &home_device, &home_device_rw,
3726                                   &srv_device, &srv_device_rw,
3727                                   &secondary);
3728                 if (r < 0)
3729                         goto finish;
3730         }
3731
3732         master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
3733         if (master < 0) {
3734                 r = log_error_errno(errno, "Failed to acquire pseudo tty: %m");
3735                 goto finish;
3736         }
3737
3738         r = ptsname_malloc(master, &console);
3739         if (r < 0) {
3740                 r = log_error_errno(r, "Failed to determine tty name: %m");
3741                 goto finish;
3742         }
3743
3744         if (!arg_quiet)
3745                 log_info("Spawning container %s on %s.\nPress ^] three times within 1s to kill container.",
3746                          arg_machine, arg_image ?: arg_directory);
3747
3748         if (unlockpt(master) < 0) {
3749                 r = log_error_errno(errno, "Failed to unlock tty: %m");
3750                 goto finish;
3751         }
3752
3753         assert_se(sigemptyset(&mask) == 0);
3754         sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
3755         assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
3756
3757         assert_se(sigemptyset(&mask_chld) == 0);
3758         assert_se(sigaddset(&mask_chld, SIGCHLD) == 0);
3759
3760         for (;;) {
3761                 _cleanup_close_pair_ int kmsg_socket_pair[2] = { -1, -1 }, rtnl_socket_pair[2] = { -1, -1 };
3762                 ContainerStatus container_status;
3763                 _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
3764                 struct sigaction sa = {
3765                         .sa_handler = nop_handler,
3766                         .sa_flags = SA_NOCLDSTOP,
3767                 };
3768
3769                 r = barrier_create(&barrier);
3770                 if (r < 0) {
3771                         log_error_errno(r, "Cannot initialize IPC barrier: %m");
3772                         goto finish;
3773                 }
3774
3775                 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
3776                         r = log_error_errno(errno, "Failed to create kmsg socket pair: %m");
3777                         goto finish;
3778                 }
3779
3780                 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, rtnl_socket_pair) < 0) {
3781                         r = log_error_errno(errno, "Failed to create rtnl socket pair: %m");
3782                         goto finish;
3783                 }
3784
3785                 /* Child can be killed before execv(), so handle SIGCHLD
3786                  * in order to interrupt parent's blocking calls and
3787                  * give it a chance to call wait() and terminate. */
3788                 r = sigprocmask(SIG_UNBLOCK, &mask_chld, NULL);
3789                 if (r < 0) {
3790                         r = log_error_errno(errno, "Failed to change the signal mask: %m");
3791                         goto finish;
3792                 }
3793
3794                 r = sigaction(SIGCHLD, &sa, NULL);
3795                 if (r < 0) {
3796                         r = log_error_errno(errno, "Failed to install SIGCHLD handler: %m");
3797                         goto finish;
3798                 }
3799
3800                 pid = raw_clone(SIGCHLD|CLONE_NEWNS|
3801                                 (arg_share_system ? 0 : CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS)|
3802                                 (arg_private_network ? CLONE_NEWNET : 0), NULL);
3803                 if (pid < 0) {
3804                         if (errno == EINVAL)
3805                                 r = log_error_errno(errno, "clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
3806                         else
3807                                 r = log_error_errno(errno, "clone() failed: %m");
3808
3809                         goto finish;
3810                 }
3811
3812                 if (pid == 0) {
3813                         /* child */
3814                         _cleanup_free_ char *home = NULL;
3815                         unsigned n_env = 2;
3816                         const char *envp[] = {
3817                                 "PATH=" DEFAULT_PATH_SPLIT_USR,
3818                                 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
3819                                 NULL, /* TERM */
3820                                 NULL, /* HOME */
3821                                 NULL, /* USER */
3822                                 NULL, /* LOGNAME */
3823                                 NULL, /* container_uuid */
3824                                 NULL, /* LISTEN_FDS */
3825                                 NULL, /* LISTEN_PID */
3826                                 NULL
3827                         };
3828                         char **env_use;
3829
3830                         barrier_set_role(&barrier, BARRIER_CHILD);
3831
3832                         envp[n_env] = strv_find_prefix(environ, "TERM=");
3833                         if (envp[n_env])
3834                                 n_env ++;
3835
3836                         master = safe_close(master);
3837
3838                         close_nointr(STDIN_FILENO);
3839                         close_nointr(STDOUT_FILENO);
3840                         close_nointr(STDERR_FILENO);
3841
3842                         kmsg_socket_pair[0] = safe_close(kmsg_socket_pair[0]);
3843                         rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
3844
3845                         reset_all_signal_handlers();
3846                         reset_signal_mask();
3847
3848                         r = open_terminal(console, O_RDWR);
3849                         if (r != STDIN_FILENO) {
3850                                 if (r >= 0) {
3851                                         safe_close(r);
3852                                         r = -EINVAL;
3853                                 }
3854
3855                                 log_error_errno(r, "Failed to open console: %m");
3856                                 _exit(EXIT_FAILURE);
3857                         }
3858
3859                         if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
3860                             dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO) {
3861                                 log_error_errno(errno, "Failed to duplicate console: %m");
3862                                 _exit(EXIT_FAILURE);
3863                         }
3864
3865                         if (setsid() < 0) {
3866                                 log_error_errno(errno, "setsid() failed: %m");
3867                                 _exit(EXIT_FAILURE);
3868                         }
3869
3870                         if (reset_audit_loginuid() < 0)
3871                                 _exit(EXIT_FAILURE);
3872
3873                         if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
3874                                 log_error_errno(errno, "PR_SET_PDEATHSIG failed: %m");
3875                                 _exit(EXIT_FAILURE);
3876                         }
3877
3878                         /* Mark everything as slave, so that we still
3879                          * receive mounts from the real root, but don't
3880                          * propagate mounts to the real root. */
3881                         if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
3882                                 log_error_errno(errno, "MS_SLAVE|MS_REC failed: %m");
3883                                 _exit(EXIT_FAILURE);
3884                         }
3885
3886                         if (mount_devices(arg_directory,
3887                                           root_device, root_device_rw,
3888                                           home_device, home_device_rw,
3889                                           srv_device, srv_device_rw) < 0)
3890                                 _exit(EXIT_FAILURE);
3891
3892                         /* Turn directory into bind mount */
3893                         if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REC, NULL) < 0) {
3894                                 log_error_errno(errno, "Failed to make bind mount: %m");
3895                                 _exit(EXIT_FAILURE);
3896                         }
3897
3898                         r = setup_volatile(arg_directory);
3899                         if (r < 0)
3900                                 _exit(EXIT_FAILURE);
3901
3902                         if (setup_volatile_state(arg_directory) < 0)
3903                                 _exit(EXIT_FAILURE);
3904
3905                         r = base_filesystem_create(arg_directory);
3906                         if (r < 0)
3907                                 _exit(EXIT_FAILURE);
3908
3909                         if (arg_read_only) {
3910                                 r = bind_remount_recursive(arg_directory, true);
3911                                 if (r < 0) {
3912                                         log_error_errno(r, "Failed to make tree read-only: %m");
3913                                         _exit(EXIT_FAILURE);
3914                                 }
3915                         }
3916
3917                         if (mount_all(arg_directory) < 0)
3918                                 _exit(EXIT_FAILURE);
3919
3920                         if (copy_devnodes(arg_directory) < 0)
3921                                 _exit(EXIT_FAILURE);
3922
3923                         if (setup_ptmx(arg_directory) < 0)
3924                                 _exit(EXIT_FAILURE);
3925
3926                         dev_setup(arg_directory);
3927
3928                         if (setup_propagate(arg_directory) < 0)
3929                                 _exit(EXIT_FAILURE);
3930
3931                         if (setup_seccomp() < 0)
3932                                 _exit(EXIT_FAILURE);
3933
3934                         if (setup_dev_console(arg_directory, console) < 0)
3935                                 _exit(EXIT_FAILURE);
3936
3937                         if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
3938                                 _exit(EXIT_FAILURE);
3939                         kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
3940
3941                         if (send_rtnl(rtnl_socket_pair[1]) < 0)
3942                                 _exit(EXIT_FAILURE);
3943                         rtnl_socket_pair[1] = safe_close(rtnl_socket_pair[1]);
3944
3945                         /* Tell the parent that we are ready, and that
3946                          * it can cgroupify us so that we lack access
3947                          * to certain devices and resources. */
3948                         (void) barrier_place(&barrier);
3949
3950                         if (setup_boot_id(arg_directory) < 0)
3951                                 _exit(EXIT_FAILURE);
3952
3953                         if (setup_timezone(arg_directory) < 0)
3954                                 _exit(EXIT_FAILURE);
3955
3956                         if (setup_resolv_conf(arg_directory) < 0)
3957                                 _exit(EXIT_FAILURE);
3958
3959                         if (setup_journal(arg_directory) < 0)
3960                                 _exit(EXIT_FAILURE);
3961
3962                         if (mount_binds(arg_directory, arg_bind, false) < 0)
3963                                 _exit(EXIT_FAILURE);
3964
3965                         if (mount_binds(arg_directory, arg_bind_ro, true) < 0)
3966                                 _exit(EXIT_FAILURE);
3967
3968                         if (mount_tmpfs(arg_directory) < 0)
3969                                 _exit(EXIT_FAILURE);
3970
3971                         /* Wait until we are cgroup-ified, so that we
3972                          * can mount the right cgroup path writable */
3973                         (void) barrier_sync_next(&barrier);
3974
3975                         if (mount_cgroup(arg_directory) < 0)
3976                                 _exit(EXIT_FAILURE);
3977
3978                         if (chdir(arg_directory) < 0) {
3979                                 log_error_errno(errno, "chdir(%s) failed: %m", arg_directory);
3980                                 _exit(EXIT_FAILURE);
3981                         }
3982
3983                         if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) {
3984                                 log_error_errno(errno, "mount(MS_MOVE) failed: %m");
3985                                 _exit(EXIT_FAILURE);
3986                         }
3987
3988                         if (chroot(".") < 0) {
3989                                 log_error_errno(errno, "chroot() failed: %m");
3990                                 _exit(EXIT_FAILURE);
3991                         }
3992
3993                         if (chdir("/") < 0) {
3994                                 log_error_errno(errno, "chdir() failed: %m");
3995                                 _exit(EXIT_FAILURE);
3996                         }
3997
3998                         umask(0022);
3999
4000                         if (arg_private_network)
4001                                 loopback_setup();
4002
4003                         if (drop_capabilities() < 0) {
4004                                 log_error_errno(errno, "drop_capabilities() failed: %m");
4005                                 _exit(EXIT_FAILURE);
4006                         }
4007
4008                         r = change_uid_gid(&home);
4009                         if (r < 0)
4010                                 _exit(EXIT_FAILURE);
4011
4012                         if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
4013                             (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
4014                             (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
4015                                 log_oom();
4016                                 _exit(EXIT_FAILURE);
4017                         }
4018
4019                         if (!sd_id128_equal(arg_uuid, SD_ID128_NULL)) {
4020                                 char as_uuid[37];
4021
4022                                 if (asprintf((char**)(envp + n_env++), "container_uuid=%s", id128_format_as_uuid(arg_uuid, as_uuid)) < 0) {
4023                                         log_oom();
4024                                         _exit(EXIT_FAILURE);
4025                                 }
4026                         }
4027
4028                         if (fdset_size(fds) > 0) {
4029                                 r = fdset_cloexec(fds, false);
4030                                 if (r < 0) {
4031                                         log_error_errno(r, "Failed to unset O_CLOEXEC for file descriptors.");
4032                                         _exit(EXIT_FAILURE);
4033                                 }
4034
4035                                 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", n_fd_passed) < 0) ||
4036                                     (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0)) {
4037                                         log_oom();
4038                                         _exit(EXIT_FAILURE);
4039                                 }
4040                         }
4041
4042                         setup_hostname();
4043
4044                         if (arg_personality != 0xffffffffLU) {
4045                                 if (personality(arg_personality) < 0) {
4046                                         log_error_errno(errno, "personality() failed: %m");
4047                                         _exit(EXIT_FAILURE);
4048                                 }
4049                         } else if (secondary) {
4050                                 if (personality(PER_LINUX32) < 0) {
4051                                         log_error_errno(errno, "personality() failed: %m");
4052                                         _exit(EXIT_FAILURE);
4053                                 }
4054                         }
4055
4056 #ifdef HAVE_SELINUX
4057                         if (arg_selinux_context)
4058                                 if (setexeccon((security_context_t) arg_selinux_context) < 0) {
4059                                         log_error_errno(errno, "setexeccon(\"%s\") failed: %m", arg_selinux_context);
4060                                         _exit(EXIT_FAILURE);
4061                                 }
4062 #endif
4063
4064                         if (!strv_isempty(arg_setenv)) {
4065                                 char **n;
4066
4067                                 n = strv_env_merge(2, envp, arg_setenv);
4068                                 if (!n) {
4069                                         log_oom();
4070                                         _exit(EXIT_FAILURE);
4071                                 }
4072
4073                                 env_use = n;
4074                         } else
4075                                 env_use = (char**) envp;
4076
4077                         /* Wait until the parent is ready with the setup, too... */
4078                         if (!barrier_place_and_sync(&barrier))
4079                                 _exit(EXIT_FAILURE);
4080
4081                         if (arg_boot) {
4082                                 char **a;
4083                                 size_t l;
4084
4085                                 /* Automatically search for the init system */
4086
4087                                 l = 1 + argc - optind;
4088                                 a = newa(char*, l + 1);
4089                                 memcpy(a + 1, argv + optind, l * sizeof(char*));
4090
4091                                 a[0] = (char*) "/usr/lib/systemd/systemd";
4092                                 execve(a[0], a, env_use);
4093
4094                                 a[0] = (char*) "/lib/systemd/systemd";
4095                                 execve(a[0], a, env_use);
4096
4097                                 a[0] = (char*) "/sbin/init";
4098                                 execve(a[0], a, env_use);
4099                         } else if (argc > optind)
4100                                 execvpe(argv[optind], argv + optind, env_use);
4101                         else {
4102                                 chdir(home ? home : "/root");
4103                                 execle("/bin/bash", "-bash", NULL, env_use);
4104                                 execle("/bin/sh", "-sh", NULL, env_use);
4105                         }
4106
4107                         log_error_errno(errno, "execv() failed: %m");
4108                         _exit(EXIT_FAILURE);
4109                 }
4110
4111                 barrier_set_role(&barrier, BARRIER_PARENT);
4112                 fdset_free(fds);
4113                 fds = NULL;
4114
4115                 kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
4116                 rtnl_socket_pair[1] = safe_close(rtnl_socket_pair[1]);
4117
4118                 /* Wait for the most basic Child-setup to be done,
4119                  * before we add hardware to it, and place it in a
4120                  * cgroup. */
4121                 if (barrier_sync_next(&barrier)) {
4122                         int ifi = 0;
4123
4124                         r = move_network_interfaces(pid);
4125                         if (r < 0)
4126                                 goto finish;
4127
4128                         r = setup_veth(pid, veth_name, &ifi);
4129                         if (r < 0)
4130                                 goto finish;
4131
4132                         r = setup_bridge(veth_name, &ifi);
4133                         if (r < 0)
4134                                 goto finish;
4135
4136                         r = setup_macvlan(pid);
4137                         if (r < 0)
4138                                 goto finish;
4139
4140                         r = setup_ipvlan(pid);
4141                         if (r < 0)
4142                                 goto finish;
4143
4144                         r = register_machine(pid, ifi);
4145                         if (r < 0)
4146                                 goto finish;
4147
4148                         /* Block SIGCHLD here, before notifying child.
4149                          * process_pty() will handle it with the other signals. */
4150                         r = sigprocmask(SIG_BLOCK, &mask_chld, NULL);
4151                         if (r < 0)
4152                                 goto finish;
4153
4154                         /* Reset signal to default */
4155                         r = default_signals(SIGCHLD, -1);
4156                         if (r < 0)
4157                                 goto finish;
4158
4159                         /* Notify the child that the parent is ready with all
4160                          * its setup, and that the child can now hand over
4161                          * control to the code to run inside the container. */
4162                         (void) barrier_place(&barrier);
4163
4164                         /* And wait until the child is completely ready now. */
4165                         if (barrier_place_and_sync(&barrier)) {
4166                                 _cleanup_event_unref_ sd_event *event = NULL;
4167                                 _cleanup_(pty_forward_freep) PTYForward *forward = NULL;
4168                                 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
4169                                 char last_char = 0;
4170
4171                                 sd_notifyf(false,
4172                                            "READY=1\n"
4173                                            "STATUS=Container running.\n"
4174                                            "X_NSPAWN_LEADER_PID=" PID_FMT, pid);
4175
4176                                 r = sd_event_new(&event);
4177                                 if (r < 0) {
4178                                         log_error_errno(r, "Failed to get default event source: %m");
4179                                         goto finish;
4180                                 }
4181
4182                                 if (arg_boot) {
4183                                         /* Try to kill the init system on SIGINT or SIGTERM */
4184                                         sd_event_add_signal(event, NULL, SIGINT, on_orderly_shutdown, UINT32_TO_PTR(pid));
4185                                         sd_event_add_signal(event, NULL, SIGTERM, on_orderly_shutdown, UINT32_TO_PTR(pid));
4186                                 } else {
4187                                         /* Immediately exit */
4188                                         sd_event_add_signal(event, NULL, SIGINT, NULL, NULL);
4189                                         sd_event_add_signal(event, NULL, SIGTERM, NULL, NULL);
4190                                 }
4191
4192                                 /* simply exit on sigchld */
4193                                 sd_event_add_signal(event, NULL, SIGCHLD, NULL, NULL);
4194
4195                                 if (arg_expose_ports) {
4196                                         r = watch_rtnl(event, rtnl_socket_pair[0], &exposed, &rtnl);
4197                                         if (r < 0)
4198                                                 goto finish;
4199
4200                                         (void) expose_ports(rtnl, &exposed);
4201                                 }
4202
4203                                 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
4204
4205                                 r = pty_forward_new(event, master, true, &forward);
4206                                 if (r < 0) {
4207                                         log_error_errno(r, "Failed to create PTY forwarder: %m");
4208                                         goto finish;
4209                                 }
4210
4211                                 r = sd_event_loop(event);
4212                                 if (r < 0) {
4213                                         log_error_errno(r, "Failed to run event loop: %m");
4214                                         goto finish;
4215                                 }
4216
4217                                 pty_forward_get_last_char(forward, &last_char);
4218
4219                                 forward = pty_forward_free(forward);
4220
4221                                 if (!arg_quiet && last_char != '\n')
4222                                         putc('\n', stdout);
4223
4224                                 /* Kill if it is not dead yet anyway */
4225                                 terminate_machine(pid);
4226                         }
4227                 }
4228
4229                 /* Normally redundant, but better safe than sorry */
4230                 kill(pid, SIGKILL);
4231
4232                 r = wait_for_container(pid, &container_status);
4233                 pid = 0;
4234
4235                 if (r < 0)
4236                         /* We failed to wait for the container, or the
4237                          * container exited abnormally */
4238                         goto finish;
4239                 else if (r > 0 || container_status == CONTAINER_TERMINATED){
4240                         /* The container exited with a non-zero
4241                          * status, or with zero status and no reboot
4242                          * was requested. */
4243                         ret = r;
4244                         break;
4245                 }
4246
4247                 /* CONTAINER_REBOOTED, loop again */
4248
4249                 if (arg_keep_unit) {
4250                         /* Special handling if we are running as a
4251                          * service: instead of simply restarting the
4252                          * machine we want to restart the entire
4253                          * service, so let's inform systemd about this
4254                          * with the special exit code 133. The service
4255                          * file uses RestartForceExitStatus=133 so
4256                          * that this results in a full nspawn
4257                          * restart. This is necessary since we might
4258                          * have cgroup parameters set we want to have
4259                          * flushed out. */
4260                         ret = 133;
4261                         r = 0;
4262                         break;
4263                 }
4264
4265                 flush_ports(&exposed);
4266         }
4267
4268 finish:
4269         sd_notify(false,
4270                   "STOPPING=1\n"
4271                   "STATUS=Terminating...");
4272
4273         loop_remove(loop_nr, &image_fd);
4274
4275         if (pid > 0)
4276                 kill(pid, SIGKILL);
4277
4278         if (remove_subvol && arg_directory) {
4279                 int k;
4280
4281                 k = btrfs_subvol_remove(arg_directory);
4282                 if (k < 0)
4283                         log_warning_errno(k, "Cannot remove subvolume '%s', ignoring: %m", arg_directory);
4284         }
4285
4286         if (arg_machine) {
4287                 const char *p;
4288
4289                 p = strappenda("/run/systemd/nspawn/propagate/", arg_machine);
4290                 (void) rm_rf(p, false, true, false);
4291         }
4292
4293         free(arg_directory);
4294         free(arg_template);
4295         free(arg_image);
4296         free(arg_machine);
4297         free(arg_user);
4298         strv_free(arg_setenv);
4299         strv_free(arg_network_interfaces);
4300         strv_free(arg_network_macvlan);
4301         strv_free(arg_network_ipvlan);
4302         strv_free(arg_bind);
4303         strv_free(arg_bind_ro);
4304         strv_free(arg_tmpfs);
4305
4306         flush_ports(&exposed);
4307
4308         while (arg_expose_ports) {
4309                 ExposePort *p = arg_expose_ports;
4310                 LIST_REMOVE(ports, arg_expose_ports, p);
4311                 free(p);
4312         }
4313
4314         return r < 0 ? EXIT_FAILURE : ret;
4315 }