chiark / gitweb /
4e465dfe4f6f510a2a4012880e9c493b76c73dfe
[elogind.git] / src / nspawn / nspawn.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4   This file is part of systemd.
5
6   Copyright 2010 Lennart Poettering
7
8   systemd is free software; you can redistribute it and/or modify it
9   under the terms of the GNU Lesser General Public License as published by
10   the Free Software Foundation; either version 2.1 of the License, or
11   (at your option) any later version.
12
13   systemd is distributed in the hope that it will be useful, but
14   WITHOUT ANY WARRANTY; without even the implied warranty of
15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16   Lesser General Public License for more details.
17
18   You should have received a copy of the GNU Lesser General Public License
19   along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <signal.h>
23 #include <sched.h>
24 #include <unistd.h>
25 #include <sys/types.h>
26 #include <sys/syscall.h>
27 #include <sys/mount.h>
28 #include <sys/wait.h>
29 #include <stdlib.h>
30 #include <string.h>
31 #include <stdio.h>
32 #include <errno.h>
33 #include <sys/prctl.h>
34 #include <getopt.h>
35 #include <termios.h>
36 #include <sys/signalfd.h>
37 #include <grp.h>
38 #include <linux/fs.h>
39 #include <sys/un.h>
40 #include <sys/socket.h>
41 #include <linux/netlink.h>
42 #include <net/if.h>
43 #include <linux/veth.h>
44 #include <sys/personality.h>
45 #include <linux/loop.h>
46 #include <poll.h>
47 #include <sys/file.h>
48
49 #ifdef HAVE_SELINUX
50 #include <selinux/selinux.h>
51 #endif
52
53 #ifdef HAVE_SECCOMP
54 #include <seccomp.h>
55 #endif
56
57 #ifdef HAVE_BLKID
58 #include <blkid/blkid.h>
59 #endif
60
61 #include "sd-daemon.h"
62 #include "sd-bus.h"
63 #include "sd-id128.h"
64 #include "sd-rtnl.h"
65 #include "log.h"
66 #include "util.h"
67 #include "mkdir.h"
68 #include "macro.h"
69 #include "audit.h"
70 #include "missing.h"
71 #include "cgroup-util.h"
72 #include "strv.h"
73 #include "path-util.h"
74 #include "loopback-setup.h"
75 #include "dev-setup.h"
76 #include "fdset.h"
77 #include "build.h"
78 #include "fileio.h"
79 #include "bus-util.h"
80 #include "bus-error.h"
81 #include "ptyfwd.h"
82 #include "bus-kernel.h"
83 #include "env-util.h"
84 #include "def.h"
85 #include "rtnl-util.h"
86 #include "udev-util.h"
87 #include "blkid-util.h"
88 #include "gpt.h"
89 #include "siphash24.h"
90 #include "copy.h"
91 #include "base-filesystem.h"
92 #include "barrier.h"
93 #include "event-util.h"
94 #include "capability.h"
95 #include "cap-list.h"
96 #include "btrfs-util.h"
97 #include "machine-image.h"
98 #include "list.h"
99 #include "in-addr-util.h"
100 #include "fw-util.h"
101 #include "local-addresses.h"
102
103 #ifdef HAVE_SECCOMP
104 #include "seccomp-util.h"
105 #endif
106
107 typedef struct ExposePort {
108         int protocol;
109         uint16_t host_port;
110         uint16_t container_port;
111         LIST_FIELDS(struct ExposePort, ports);
112 } ExposePort;
113
/* Outcome of a container run. The code using this enum is outside the visible
 * part of the file; judging by the names, the container either terminated for
 * good or asked to be rebooted. */
typedef enum ContainerStatus {
        CONTAINER_TERMINATED = 0,
        CONTAINER_REBOOTED   = 1
} ContainerStatus;
118
/* Journal linking mode chosen with --link-journal= (or -j). The "try-guest"
 * and "try-host" spellings map to LINK_GUEST/LINK_HOST as well and are told
 * apart by the separate arg_link_journal_try flag. */
typedef enum LinkJournal {
        LINK_NO    = 0,   /* --link-journal=no */
        LINK_AUTO  = 1,   /* --link-journal=auto, also the built-in default */
        LINK_HOST  = 2,   /* --link-journal=host / try-host */
        LINK_GUEST = 3    /* --link-journal=guest / try-guest / -j */
} LinkJournal;
125
/* Mode selected with --volatile[=MODE]. */
typedef enum Volatile {
        VOLATILE_NO    = 0,   /* default, or an explicit boolean "false" argument */
        VOLATILE_YES   = 1,   /* option given without argument, or a boolean "true" */
        VOLATILE_STATE = 2    /* --volatile=state */
} Volatile;
131
132 static char *arg_directory = NULL;
133 static char *arg_template = NULL;
134 static char *arg_user = NULL;
135 static sd_id128_t arg_uuid = {};
136 static char *arg_machine = NULL;
137 static const char *arg_selinux_context = NULL;
138 static const char *arg_selinux_apifs_context = NULL;
139 static const char *arg_slice = NULL;
140 static bool arg_private_network = false;
141 static bool arg_read_only = false;
142 static bool arg_boot = false;
143 static bool arg_ephemeral = false;
144 static LinkJournal arg_link_journal = LINK_AUTO;
145 static bool arg_link_journal_try = false;
146 static uint64_t arg_retain =
147         (1ULL << CAP_CHOWN) |
148         (1ULL << CAP_DAC_OVERRIDE) |
149         (1ULL << CAP_DAC_READ_SEARCH) |
150         (1ULL << CAP_FOWNER) |
151         (1ULL << CAP_FSETID) |
152         (1ULL << CAP_IPC_OWNER) |
153         (1ULL << CAP_KILL) |
154         (1ULL << CAP_LEASE) |
155         (1ULL << CAP_LINUX_IMMUTABLE) |
156         (1ULL << CAP_NET_BIND_SERVICE) |
157         (1ULL << CAP_NET_BROADCAST) |
158         (1ULL << CAP_NET_RAW) |
159         (1ULL << CAP_SETGID) |
160         (1ULL << CAP_SETFCAP) |
161         (1ULL << CAP_SETPCAP) |
162         (1ULL << CAP_SETUID) |
163         (1ULL << CAP_SYS_ADMIN) |
164         (1ULL << CAP_SYS_CHROOT) |
165         (1ULL << CAP_SYS_NICE) |
166         (1ULL << CAP_SYS_PTRACE) |
167         (1ULL << CAP_SYS_TTY_CONFIG) |
168         (1ULL << CAP_SYS_RESOURCE) |
169         (1ULL << CAP_SYS_BOOT) |
170         (1ULL << CAP_AUDIT_WRITE) |
171         (1ULL << CAP_AUDIT_CONTROL) |
172         (1ULL << CAP_MKNOD);
173 static char **arg_bind = NULL;
174 static char **arg_bind_ro = NULL;
175 static char **arg_tmpfs = NULL;
176 static char **arg_setenv = NULL;
177 static bool arg_quiet = false;
178 static bool arg_share_system = false;
179 static bool arg_register = true;
180 static bool arg_keep_unit = false;
181 static char **arg_network_interfaces = NULL;
182 static char **arg_network_macvlan = NULL;
183 static char **arg_network_ipvlan = NULL;
184 static bool arg_network_veth = false;
185 static const char *arg_network_bridge = NULL;
186 static unsigned long arg_personality = 0xffffffffLU;
187 static char *arg_image = NULL;
188 static Volatile arg_volatile = VOLATILE_NO;
189 static ExposePort *arg_expose_ports = NULL;
190
191 static void help(void) {
192         printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
193                "Spawn a minimal namespace container for debugging, testing and building.\n\n"
194                "  -h --help                 Show this help\n"
195                "     --version              Print version string\n"
196                "  -q --quiet                Do not show status information\n"
197                "  -D --directory=PATH       Root directory for the container\n"
198                "     --template=PATH        Initialize root directory from template directory,\n"
199                "                            if missing\n"
200                "  -x --ephemeral            Run container with snapshot of root directory, and\n"
201                "                            remove it after exit\n"
202                "  -i --image=PATH           File system device or disk image for the container\n"
203                "  -b --boot                 Boot up full system (i.e. invoke init)\n"
204                "  -u --user=USER            Run the command under specified user or uid\n"
205                "  -M --machine=NAME         Set the machine name for the container\n"
206                "     --uuid=UUID            Set a specific machine UUID for the container\n"
207                "  -S --slice=SLICE          Place the container in the specified slice\n"
208                "     --private-network      Disable network in container\n"
209                "     --network-interface=INTERFACE\n"
210                "                            Assign an existing network interface to the\n"
211                "                            container\n"
212                "     --network-macvlan=INTERFACE\n"
213                "                            Create a macvlan network interface based on an\n"
214                "                            existing network interface to the container\n"
215                "     --network-ipvlan=INTERFACE\n"
216                "                            Create a ipvlan network interface based on an\n"
217                "                            existing network interface to the container\n"
218                "  -n --network-veth         Add a virtual ethernet connection between host\n"
219                "                            and container\n"
220                "     --network-bridge=INTERFACE\n"
221                "                            Add a virtual ethernet connection between host\n"
222                "                            and container and add it to an existing bridge on\n"
223                "                            the host\n"
224                "  -p --port=[PROTOCOL:]HOSTPORT[:CONTAINERPORT]\n"
225                "                            Expose a container IP port on the host\n"
226                "  -Z --selinux-context=SECLABEL\n"
227                "                            Set the SELinux security context to be used by\n"
228                "                            processes in the container\n"
229                "  -L --selinux-apifs-context=SECLABEL\n"
230                "                            Set the SELinux security context to be used by\n"
231                "                            API/tmpfs file systems in the container\n"
232                "     --capability=CAP       In addition to the default, retain specified\n"
233                "                            capability\n"
234                "     --drop-capability=CAP  Drop the specified capability from the default set\n"
235                "     --link-journal=MODE    Link up guest journal, one of no, auto, guest, host,\n"
236                "                            try-guest, try-host\n"
237                "  -j                        Equivalent to --link-journal=try-guest\n"
238                "     --read-only            Mount the root directory read-only\n"
239                "     --bind=PATH[:PATH]     Bind mount a file or directory from the host into\n"
240                "                            the container\n"
241                "     --bind-ro=PATH[:PATH]  Similar, but creates a read-only bind mount\n"
242                "     --tmpfs=PATH:[OPTIONS] Mount an empty tmpfs to the specified directory\n"
243                "     --setenv=NAME=VALUE    Pass an environment variable to PID 1\n"
244                "     --share-system         Share system namespaces with host\n"
245                "     --register=BOOLEAN     Register container as machine\n"
246                "     --keep-unit            Do not register a scope for the machine, reuse\n"
247                "                            the service unit nspawn is running in\n"
248                "     --volatile[=MODE]      Run the system in volatile mode\n"
249                , program_invocation_short_name);
250 }
251
/* Replace *b with a sanitized, absolute form of 'path'.
 *
 * The path is first resolved with canonicalize_file_name(). If that fails
 * with ENOENT, the path is instead made absolute relative to the current
 * working directory with path_make_absolute_cwd(). The result is passed
 * through path_kill_slashes() before being stored.
 *
 * Returns 0 on success. On failure returns -errno from
 * canonicalize_file_name() (for anything but ENOENT), or -ENOMEM if
 * path_make_absolute_cwd() yields NULL; *b is left untouched in both cases.
 * On success the previous value of *b is freed. */
static int set_sanitized_path(char **b, const char *path) {
        char *resolved;

        assert(b);
        assert(path);

        resolved = canonicalize_file_name(path);

        /* Any failure other than "does not exist" is reported as is */
        if (!resolved && errno != ENOENT)
                return -errno;

        /* A path that does not exist is still accepted, just not canonicalized */
        if (!resolved) {
                resolved = path_make_absolute_cwd(path);
                if (!resolved)
                        return -ENOMEM;
        }

        free(*b);
        *b = path_kill_slashes(resolved);

        return 0;
}
272
273 static int parse_argv(int argc, char *argv[]) {
274
275         enum {
276                 ARG_VERSION = 0x100,
277                 ARG_PRIVATE_NETWORK,
278                 ARG_UUID,
279                 ARG_READ_ONLY,
280                 ARG_CAPABILITY,
281                 ARG_DROP_CAPABILITY,
282                 ARG_LINK_JOURNAL,
283                 ARG_BIND,
284                 ARG_BIND_RO,
285                 ARG_TMPFS,
286                 ARG_SETENV,
287                 ARG_SHARE_SYSTEM,
288                 ARG_REGISTER,
289                 ARG_KEEP_UNIT,
290                 ARG_NETWORK_INTERFACE,
291                 ARG_NETWORK_MACVLAN,
292                 ARG_NETWORK_IPVLAN,
293                 ARG_NETWORK_BRIDGE,
294                 ARG_PERSONALITY,
295                 ARG_VOLATILE,
296                 ARG_TEMPLATE,
297         };
298
299         static const struct option options[] = {
300                 { "help",                  no_argument,       NULL, 'h'                   },
301                 { "version",               no_argument,       NULL, ARG_VERSION           },
302                 { "directory",             required_argument, NULL, 'D'                   },
303                 { "template",              required_argument, NULL, ARG_TEMPLATE          },
304                 { "ephemeral",             no_argument,       NULL, 'x'                   },
305                 { "user",                  required_argument, NULL, 'u'                   },
306                 { "private-network",       no_argument,       NULL, ARG_PRIVATE_NETWORK   },
307                 { "boot",                  no_argument,       NULL, 'b'                   },
308                 { "uuid",                  required_argument, NULL, ARG_UUID              },
309                 { "read-only",             no_argument,       NULL, ARG_READ_ONLY         },
310                 { "capability",            required_argument, NULL, ARG_CAPABILITY        },
311                 { "drop-capability",       required_argument, NULL, ARG_DROP_CAPABILITY   },
312                 { "link-journal",          required_argument, NULL, ARG_LINK_JOURNAL      },
313                 { "bind",                  required_argument, NULL, ARG_BIND              },
314                 { "bind-ro",               required_argument, NULL, ARG_BIND_RO           },
315                 { "tmpfs",                 required_argument, NULL, ARG_TMPFS             },
316                 { "machine",               required_argument, NULL, 'M'                   },
317                 { "slice",                 required_argument, NULL, 'S'                   },
318                 { "setenv",                required_argument, NULL, ARG_SETENV            },
319                 { "selinux-context",       required_argument, NULL, 'Z'                   },
320                 { "selinux-apifs-context", required_argument, NULL, 'L'                   },
321                 { "quiet",                 no_argument,       NULL, 'q'                   },
322                 { "share-system",          no_argument,       NULL, ARG_SHARE_SYSTEM      },
323                 { "register",              required_argument, NULL, ARG_REGISTER          },
324                 { "keep-unit",             no_argument,       NULL, ARG_KEEP_UNIT         },
325                 { "network-interface",     required_argument, NULL, ARG_NETWORK_INTERFACE },
326                 { "network-macvlan",       required_argument, NULL, ARG_NETWORK_MACVLAN   },
327                 { "network-ipvlan",        required_argument, NULL, ARG_NETWORK_IPVLAN    },
328                 { "network-veth",          no_argument,       NULL, 'n'                   },
329                 { "network-bridge",        required_argument, NULL, ARG_NETWORK_BRIDGE    },
330                 { "personality",           required_argument, NULL, ARG_PERSONALITY       },
331                 { "image",                 required_argument, NULL, 'i'                   },
332                 { "volatile",              optional_argument, NULL, ARG_VOLATILE          },
333                 { "port",                  required_argument, NULL, 'p'                   },
334                 {}
335         };
336
337         int c, r;
338         uint64_t plus = 0, minus = 0;
339
340         assert(argc >= 0);
341         assert(argv);
342
343         while ((c = getopt_long(argc, argv, "+hD:u:bL:M:jS:Z:qi:xp:n", options, NULL)) >= 0)
344
345                 switch (c) {
346
347                 case 'h':
348                         help();
349                         return 0;
350
351                 case ARG_VERSION:
352                         puts(PACKAGE_STRING);
353                         puts(SYSTEMD_FEATURES);
354                         return 0;
355
356                 case 'D':
357                         r = set_sanitized_path(&arg_directory, optarg);
358                         if (r < 0)
359                                 return log_error_errno(r, "Invalid root directory: %m");
360
361                         break;
362
363                 case ARG_TEMPLATE:
364                         r = set_sanitized_path(&arg_template, optarg);
365                         if (r < 0)
366                                 return log_error_errno(r, "Invalid template directory: %m");
367
368                         break;
369
370                 case 'i':
371                         r = set_sanitized_path(&arg_image, optarg);
372                         if (r < 0)
373                                 return log_error_errno(r, "Invalid image path: %m");
374
375                         break;
376
377                 case 'x':
378                         arg_ephemeral = true;
379                         break;
380
381                 case 'u':
382                         free(arg_user);
383                         arg_user = strdup(optarg);
384                         if (!arg_user)
385                                 return log_oom();
386
387                         break;
388
389                 case ARG_NETWORK_BRIDGE:
390                         arg_network_bridge = optarg;
391
392                         /* fall through */
393
394                 case 'n':
395                         arg_network_veth = true;
396                         arg_private_network = true;
397                         break;
398
399                 case ARG_NETWORK_INTERFACE:
400                         if (strv_extend(&arg_network_interfaces, optarg) < 0)
401                                 return log_oom();
402
403                         arg_private_network = true;
404                         break;
405
406                 case ARG_NETWORK_MACVLAN:
407                         if (strv_extend(&arg_network_macvlan, optarg) < 0)
408                                 return log_oom();
409
410                         arg_private_network = true;
411                         break;
412
413                 case ARG_NETWORK_IPVLAN:
414                         if (strv_extend(&arg_network_ipvlan, optarg) < 0)
415                                 return log_oom();
416
417                         /* fall through */
418
419                 case ARG_PRIVATE_NETWORK:
420                         arg_private_network = true;
421                         break;
422
423                 case 'b':
424                         arg_boot = true;
425                         break;
426
427                 case ARG_UUID:
428                         r = sd_id128_from_string(optarg, &arg_uuid);
429                         if (r < 0) {
430                                 log_error("Invalid UUID: %s", optarg);
431                                 return r;
432                         }
433                         break;
434
435                 case 'S':
436                         arg_slice = optarg;
437                         break;
438
439                 case 'M':
440                         if (isempty(optarg)) {
441                                 free(arg_machine);
442                                 arg_machine = NULL;
443                         } else {
444                                 if (!machine_name_is_valid(optarg)) {
445                                         log_error("Invalid machine name: %s", optarg);
446                                         return -EINVAL;
447                                 }
448
449                                 r = free_and_strdup(&arg_machine, optarg);
450                                 if (r < 0)
451                                         return log_oom();
452
453                                 break;
454                         }
455
456                 case 'Z':
457                         arg_selinux_context = optarg;
458                         break;
459
460                 case 'L':
461                         arg_selinux_apifs_context = optarg;
462                         break;
463
464                 case ARG_READ_ONLY:
465                         arg_read_only = true;
466                         break;
467
468                 case ARG_CAPABILITY:
469                 case ARG_DROP_CAPABILITY: {
470                         const char *state, *word;
471                         size_t length;
472
473                         FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
474                                 _cleanup_free_ char *t;
475
476                                 t = strndup(word, length);
477                                 if (!t)
478                                         return log_oom();
479
480                                 if (streq(t, "all")) {
481                                         if (c == ARG_CAPABILITY)
482                                                 plus = (uint64_t) -1;
483                                         else
484                                                 minus = (uint64_t) -1;
485                                 } else {
486                                         int cap;
487
488                                         cap = capability_from_name(t);
489                                         if (cap < 0) {
490                                                 log_error("Failed to parse capability %s.", t);
491                                                 return -EINVAL;
492                                         }
493
494                                         if (c == ARG_CAPABILITY)
495                                                 plus |= 1ULL << (uint64_t) cap;
496                                         else
497                                                 minus |= 1ULL << (uint64_t) cap;
498                                 }
499                         }
500
501                         break;
502                 }
503
504                 case 'j':
505                         arg_link_journal = LINK_GUEST;
506                         arg_link_journal_try = true;
507                         break;
508
509                 case ARG_LINK_JOURNAL:
510                         if (streq(optarg, "auto")) {
511                                 arg_link_journal = LINK_AUTO;
512                                 arg_link_journal_try = false;
513                         } else if (streq(optarg, "no")) {
514                                 arg_link_journal = LINK_NO;
515                                 arg_link_journal_try = false;
516                         } else if (streq(optarg, "guest")) {
517                                 arg_link_journal = LINK_GUEST;
518                                 arg_link_journal_try = false;
519                         } else if (streq(optarg, "host")) {
520                                 arg_link_journal = LINK_HOST;
521                                 arg_link_journal_try = false;
522                         } else if (streq(optarg, "try-guest")) {
523                                 arg_link_journal = LINK_GUEST;
524                                 arg_link_journal_try = true;
525                         } else if (streq(optarg, "try-host")) {
526                                 arg_link_journal = LINK_HOST;
527                                 arg_link_journal_try = true;
528                         } else {
529                                 log_error("Failed to parse link journal mode %s", optarg);
530                                 return -EINVAL;
531                         }
532
533                         break;
534
535                 case ARG_BIND:
536                 case ARG_BIND_RO: {
537                         _cleanup_free_ char *a = NULL, *b = NULL;
538                         char *e;
539                         char ***x;
540
541                         x = c == ARG_BIND ? &arg_bind : &arg_bind_ro;
542
543                         e = strchr(optarg, ':');
544                         if (e) {
545                                 a = strndup(optarg, e - optarg);
546                                 b = strdup(e + 1);
547                         } else {
548                                 a = strdup(optarg);
549                                 b = strdup(optarg);
550                         }
551
552                         if (!a || !b)
553                                 return log_oom();
554
555                         if (!path_is_absolute(a) || !path_is_absolute(b)) {
556                                 log_error("Invalid bind mount specification: %s", optarg);
557                                 return -EINVAL;
558                         }
559
560                         r = strv_extend(x, a);
561                         if (r < 0)
562                                 return log_oom();
563
564                         r = strv_extend(x, b);
565                         if (r < 0)
566                                 return log_oom();
567
568                         break;
569                 }
570
571                 case ARG_TMPFS: {
572                         _cleanup_free_ char *a = NULL, *b = NULL;
573                         char *e;
574
575                         e = strchr(optarg, ':');
576                         if (e) {
577                                 a = strndup(optarg, e - optarg);
578                                 b = strdup(e + 1);
579                         } else {
580                                 a = strdup(optarg);
581                                 b = strdup("mode=0755");
582                         }
583
584                         if (!a || !b)
585                                 return log_oom();
586
587                         if (!path_is_absolute(a)) {
588                                 log_error("Invalid tmpfs specification: %s", optarg);
589                                 return -EINVAL;
590                         }
591
592                         r = strv_push(&arg_tmpfs, a);
593                         if (r < 0)
594                                 return log_oom();
595
596                         a = NULL;
597
598                         r = strv_push(&arg_tmpfs, b);
599                         if (r < 0)
600                                 return log_oom();
601
602                         b = NULL;
603
604                         break;
605                 }
606
607                 case ARG_SETENV: {
608                         char **n;
609
610                         if (!env_assignment_is_valid(optarg)) {
611                                 log_error("Environment variable assignment '%s' is not valid.", optarg);
612                                 return -EINVAL;
613                         }
614
615                         n = strv_env_set(arg_setenv, optarg);
616                         if (!n)
617                                 return log_oom();
618
619                         strv_free(arg_setenv);
620                         arg_setenv = n;
621                         break;
622                 }
623
624                 case 'q':
625                         arg_quiet = true;
626                         break;
627
628                 case ARG_SHARE_SYSTEM:
629                         arg_share_system = true;
630                         break;
631
632                 case ARG_REGISTER:
633                         r = parse_boolean(optarg);
634                         if (r < 0) {
635                                 log_error("Failed to parse --register= argument: %s", optarg);
636                                 return r;
637                         }
638
639                         arg_register = r;
640                         break;
641
642                 case ARG_KEEP_UNIT:
643                         arg_keep_unit = true;
644                         break;
645
646                 case ARG_PERSONALITY:
647
648                         arg_personality = personality_from_string(optarg);
649                         if (arg_personality == 0xffffffffLU) {
650                                 log_error("Unknown or unsupported personality '%s'.", optarg);
651                                 return -EINVAL;
652                         }
653
654                         break;
655
656                 case ARG_VOLATILE:
657
658                         if (!optarg)
659                                 arg_volatile = VOLATILE_YES;
660                         else {
661                                 r = parse_boolean(optarg);
662                                 if (r < 0) {
663                                         if (streq(optarg, "state"))
664                                                 arg_volatile = VOLATILE_STATE;
665                                         else {
666                                                 log_error("Failed to parse --volatile= argument: %s", optarg);
667                                                 return r;
668                                         }
669                                 } else
670                                         arg_volatile = r ? VOLATILE_YES : VOLATILE_NO;
671                         }
672
673                         break;
674
675                 case 'p': {
676                         const char *split, *e;
677                         uint16_t container_port, host_port;
678                         int protocol;
679                         ExposePort *p;
680
681                         if ((e = startswith(optarg, "tcp:")))
682                                 protocol = IPPROTO_TCP;
683                         else if ((e = startswith(optarg, "udp:")))
684                                 protocol = IPPROTO_UDP;
685                         else {
686                                 e = optarg;
687                                 protocol = IPPROTO_TCP;
688                         }
689
690                         split = strchr(e, ':');
691                         if (split) {
692                                 char v[split - e + 1];
693
694                                 memcpy(v, e, split - e);
695                                 v[split - e] = 0;
696
697                                 r = safe_atou16(v, &host_port);
698                                 if (r < 0 || host_port <= 0) {
699                                         log_error("Failed to parse host port: %s", optarg);
700                                         return -EINVAL;
701                                 }
702
703                                 r = safe_atou16(split + 1, &container_port);
704                         } else {
705                                 r = safe_atou16(e, &container_port);
706                                 host_port = container_port;
707                         }
708
709                         if (r < 0 || container_port <= 0) {
710                                 log_error("Failed to parse host port: %s", optarg);
711                                 return -EINVAL;
712                         }
713
714                         LIST_FOREACH(ports, p, arg_expose_ports) {
715                                 if (p->protocol == protocol && p->host_port == host_port) {
716                                         log_error("Duplicate port specification: %s", optarg);
717                                         return -EINVAL;
718                                 }
719                         }
720
721                         p = new(ExposePort, 1);
722                         if (!p)
723                                 return log_oom();
724
725                         p->protocol = protocol;
726                         p->host_port = host_port;
727                         p->container_port = container_port;
728
729                         LIST_PREPEND(ports, arg_expose_ports, p);
730
731                         break;
732                 }
733
734                 case '?':
735                         return -EINVAL;
736
737                 default:
738                         assert_not_reached("Unhandled option");
739                 }
740
741         if (arg_share_system)
742                 arg_register = false;
743
744         if (arg_boot && arg_share_system) {
745                 log_error("--boot and --share-system may not be combined.");
746                 return -EINVAL;
747         }
748
749         if (arg_keep_unit && cg_pid_get_owner_uid(0, NULL) >= 0) {
750                 log_error("--keep-unit may not be used when invoked from a user session.");
751                 return -EINVAL;
752         }
753
754         if (arg_directory && arg_image) {
755                 log_error("--directory= and --image= may not be combined.");
756                 return -EINVAL;
757         }
758
759         if (arg_template && arg_image) {
760                 log_error("--template= and --image= may not be combined.");
761                 return -EINVAL;
762         }
763
764         if (arg_template && !(arg_directory || arg_machine)) {
765                 log_error("--template= needs --directory= or --machine=.");
766                 return -EINVAL;
767         }
768
769         if (arg_ephemeral && arg_template) {
770                 log_error("--ephemeral and --template= may not be combined.");
771                 return -EINVAL;
772         }
773
774         if (arg_ephemeral && arg_image) {
775                 log_error("--ephemeral and --image= may not be combined.");
776                 return -EINVAL;
777         }
778
779         if (arg_ephemeral && !IN_SET(arg_link_journal, LINK_NO, LINK_AUTO)) {
780                 log_error("--ephemeral and --link-journal= may not be combined.");
781                 return -EINVAL;
782         }
783
784         if (arg_volatile != VOLATILE_NO && arg_read_only) {
785                 log_error("Cannot combine --read-only with --volatile. Note that --volatile already implies a read-only base hierarchy.");
786                 return -EINVAL;
787         }
788
789         if (arg_expose_ports && !arg_private_network) {
790                 log_error("Cannot use --port= without private networking.");
791                 return -EINVAL;
792         }
793
794         arg_retain = (arg_retain | plus | (arg_private_network ? 1ULL << CAP_NET_ADMIN : 0)) & ~minus;
795
796         return 1;
797 }
798
799 static int mount_all(const char *dest) {
800
801         typedef struct MountPoint {
802                 const char *what;
803                 const char *where;
804                 const char *type;
805                 const char *options;
806                 unsigned long flags;
807                 bool fatal;
808         } MountPoint;
809
810         static const MountPoint mount_table[] = {
811                 { "proc",      "/proc",     "proc",  NULL,        MS_NOSUID|MS_NOEXEC|MS_NODEV,           true  },
812                 { "/proc/sys", "/proc/sys", NULL,    NULL,        MS_BIND,                                true  },   /* Bind mount first */
813                 { NULL,        "/proc/sys", NULL,    NULL,        MS_BIND|MS_RDONLY|MS_REMOUNT,           true  },   /* Then, make it r/o */
814                 { "sysfs",     "/sys",      "sysfs", NULL,        MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true  },
815                 { "tmpfs",     "/dev",      "tmpfs", "mode=755",  MS_NOSUID|MS_STRICTATIME,               true  },
816                 { "devpts",    "/dev/pts",  "devpts","newinstance,ptmxmode=0666,mode=620,gid=" STRINGIFY(TTY_GID), MS_NOSUID|MS_NOEXEC, true },
817                 { "tmpfs",     "/dev/shm",  "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME,      true  },
818                 { "tmpfs",     "/run",      "tmpfs", "mode=755",  MS_NOSUID|MS_NODEV|MS_STRICTATIME,      true  },
819                 { "tmpfs",     "/tmp",      "tmpfs", "mode=1777", MS_STRICTATIME,                         true  },
820 #ifdef HAVE_SELINUX
821                 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND,                              false },  /* Bind mount first */
822                 { NULL,              "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT,         false },  /* Then, make it r/o */
823 #endif
824         };
825
826         unsigned k;
827         int r = 0;
828
829         for (k = 0; k < ELEMENTSOF(mount_table); k++) {
830                 _cleanup_free_ char *where = NULL;
831 #ifdef HAVE_SELINUX
832                 _cleanup_free_ char *options = NULL;
833 #endif
834                 const char *o;
835                 int t;
836
837                 where = strjoin(dest, "/", mount_table[k].where, NULL);
838                 if (!where)
839                         return log_oom();
840
841                 t = path_is_mount_point(where, true);
842                 if (t < 0) {
843                         log_error_errno(t, "Failed to detect whether %s is a mount point: %m", where);
844
845                         if (r == 0)
846                                 r = t;
847
848                         continue;
849                 }
850
851                 /* Skip this entry if it is not a remount. */
852                 if (mount_table[k].what && t > 0)
853                         continue;
854
855                 t = mkdir_p(where, 0755);
856                 if (t < 0) {
857                         if (mount_table[k].fatal) {
858                                log_error_errno(t, "Failed to create directory %s: %m", where);
859
860                                 if (r == 0)
861                                         r = t;
862                         } else
863                                log_warning_errno(t, "Failed to create directory %s: %m", where);
864
865                         continue;
866                 }
867
868 #ifdef HAVE_SELINUX
869                 if (arg_selinux_apifs_context &&
870                     (streq_ptr(mount_table[k].what, "tmpfs") || streq_ptr(mount_table[k].what, "devpts"))) {
871                         options = strjoin(mount_table[k].options, ",context=\"", arg_selinux_apifs_context, "\"", NULL);
872                         if (!options)
873                                 return log_oom();
874
875                         o = options;
876                 } else
877 #endif
878                         o = mount_table[k].options;
879
880
881                 if (mount(mount_table[k].what,
882                           where,
883                           mount_table[k].type,
884                           mount_table[k].flags,
885                           o) < 0) {
886
887                         if (mount_table[k].fatal) {
888                                 log_error_errno(errno, "mount(%s) failed: %m", where);
889
890                                 if (r == 0)
891                                         r = -errno;
892                         } else
893                                 log_warning_errno(errno, "mount(%s) failed: %m", where);
894                 }
895         }
896
897         return r;
898 }
899
900 static int mount_binds(const char *dest, char **l, bool ro) {
901         char **x, **y;
902
903         STRV_FOREACH_PAIR(x, y, l) {
904                 _cleanup_free_ char *where = NULL;
905                 struct stat source_st, dest_st;
906                 int r;
907
908                 if (stat(*x, &source_st) < 0)
909                         return log_error_errno(errno, "Failed to stat %s: %m", *x);
910
911                 where = strappend(dest, *y);
912                 if (!where)
913                         return log_oom();
914
915                 r = stat(where, &dest_st);
916                 if (r == 0) {
917                         if (S_ISDIR(source_st.st_mode) && !S_ISDIR(dest_st.st_mode)) {
918                                 log_error("Cannot bind mount directory %s on file %s.", *x, where);
919                                 return -EINVAL;
920                         }
921                         if (!S_ISDIR(source_st.st_mode) && S_ISDIR(dest_st.st_mode)) {
922                                 log_error("Cannot bind mount file %s on directory %s.", *x, where);
923                                 return -EINVAL;
924                         }
925                 } else if (errno == ENOENT) {
926                         r = mkdir_parents_label(where, 0755);
927                         if (r < 0)
928                                 return log_error_errno(r, "Failed to bind mount %s: %m", *x);
929                 } else {
930                         log_error_errno(errno, "Failed to bind mount %s: %m", *x);
931                         return -errno;
932                 }
933
934                 /* Create the mount point. Any non-directory file can be
935                  * mounted on any non-directory file (regular, fifo, socket,
936                  * char, block).
937                  */
938                 if (S_ISDIR(source_st.st_mode)) {
939                         r = mkdir_label(where, 0755);
940                         if (r < 0 && errno != EEXIST)
941                                 return log_error_errno(r, "Failed to create mount point %s: %m", where);
942                 } else {
943                         r = touch(where);
944                         if (r < 0)
945                                 return log_error_errno(r, "Failed to create mount point %s: %m", where);
946                 }
947
948                 if (mount(*x, where, "bind", MS_BIND, NULL) < 0)
949                         return log_error_errno(errno, "mount(%s) failed: %m", where);
950
951                 if (ro) {
952                         r = bind_remount_recursive(where, true);
953                         if (r < 0)
954                                 return log_error_errno(r, "Read-Only bind mount failed: %m");
955                 }
956         }
957
958         return 0;
959 }
960
/* Mounts a cgroup file system with the mount option string "controller" at
 * <dest>/sys/fs/cgroup/<hierarchy>, optionally read-only.
 *
 * Returns 1 if a new mount was established, 0 if the path already was a
 * mount point, and a negative errno-style value (after logging) on failure. */
static int mount_cgroup_hierarchy(const char *dest, const char *controller, const char *hierarchy, bool read_only) {
        unsigned long flags = MS_NOSUID|MS_NOEXEC|MS_NODEV;
        char *target;
        int q;

        target = strappenda(dest, "/sys/fs/cgroup/", hierarchy);

        q = path_is_mount_point(target, false);
        if (q < 0)
                return log_error_errno(q, "Failed to determine if %s is mounted already: %m", target);
        if (q > 0)
                return 0;

        /* The result is not checked here; the mount() below is attempted
         * regardless. */
        (void) mkdir_p(target, 0755);

        if (read_only)
                flags |= MS_RDONLY;

        if (mount("cgroup", target, "cgroup", flags, controller) < 0)
                return log_error_errno(errno, "Failed to mount to %s: %m", target);

        return 1;
}
980
981 static int mount_cgroup(const char *dest) {
982         _cleanup_set_free_free_ Set *controllers = NULL;
983         _cleanup_free_ char *own_cgroup_path = NULL;
984         const char *cgroup_root, *systemd_root, *systemd_own;
985         int r;
986
987         controllers = set_new(&string_hash_ops);
988         if (!controllers)
989                 return log_oom();
990
991         r = cg_kernel_controllers(controllers);
992         if (r < 0)
993                 return log_error_errno(r, "Failed to determine cgroup controllers: %m");
994
995         r = cg_pid_get_path(NULL, 0, &own_cgroup_path);
996         if (r < 0)
997                 return log_error_errno(r, "Failed to determine our own cgroup path: %m");
998
999         cgroup_root = strappenda(dest, "/sys/fs/cgroup");
1000         if (mount("tmpfs", cgroup_root, "tmpfs", MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME, "mode=755") < 0)
1001                 return log_error_errno(errno, "Failed to mount tmpfs to /sys/fs/cgroup: %m");
1002
1003         for (;;) {
1004                 _cleanup_free_ char *controller = NULL, *origin = NULL, *combined = NULL;
1005
1006                 controller = set_steal_first(controllers);
1007                 if (!controller)
1008                         break;
1009
1010                 origin = strappend("/sys/fs/cgroup/", controller);
1011                 if (!origin)
1012                         return log_oom();
1013
1014                 r = readlink_malloc(origin, &combined);
1015                 if (r == -EINVAL) {
1016                         /* Not a symbolic link, but directly a single cgroup hierarchy */
1017
1018                         r = mount_cgroup_hierarchy(dest, controller, controller, true);
1019                         if (r < 0)
1020                                 return r;
1021
1022                 } else if (r < 0)
1023                         return log_error_errno(r, "Failed to read link %s: %m", origin);
1024                 else {
1025                         _cleanup_free_ char *target = NULL;
1026
1027                         target = strjoin(dest, "/sys/fs/cgroup/", controller, NULL);
1028                         if (!target)
1029                                 return log_oom();
1030
1031                         /* A symbolic link, a combination of controllers in one hierarchy */
1032
1033                         if (!filename_is_valid(combined)) {
1034                                 log_warning("Ignoring invalid combined hierarchy %s.", combined);
1035                                 continue;
1036                         }
1037
1038                         r = mount_cgroup_hierarchy(dest, combined, combined, true);
1039                         if (r < 0)
1040                                 return r;
1041
1042                         if (symlink(combined, target) < 0)
1043                                 return log_error_errno(errno, "Failed to create symlink for combined hierarchy: %m");
1044                 }
1045         }
1046
1047         r = mount_cgroup_hierarchy(dest, "name=systemd", "systemd", false);
1048         if (r < 0)
1049                 return r;
1050
1051         /* Make our own cgroup a (writable) bind mount */
1052         systemd_own = strappenda(dest, "/sys/fs/cgroup/systemd", own_cgroup_path);
1053         if (mount(systemd_own, systemd_own,  NULL, MS_BIND, NULL) < 0)
1054                 return log_error_errno(errno, "Failed to turn %s into a bind mount: %m", own_cgroup_path);
1055
1056         /* And then remount the systemd cgroup root read-only */
1057         systemd_root = strappenda(dest, "/sys/fs/cgroup/systemd");
1058         if (mount(NULL, systemd_root, NULL, MS_BIND|MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_RDONLY, NULL) < 0)
1059                 return log_error_errno(errno, "Failed to mount cgroup root read-only: %m");
1060
1061         if (mount(NULL, cgroup_root, NULL, MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME|MS_RDONLY, "mode=755") < 0)
1062                 return log_error_errno(errno, "Failed to remount %s read-only: %m", cgroup_root);
1063
1064         return 0;
1065 }
1066
1067 static int mount_tmpfs(const char *dest) {
1068         char **i, **o;
1069
1070         STRV_FOREACH_PAIR(i, o, arg_tmpfs) {
1071                 _cleanup_free_ char *where = NULL;
1072                 int r;
1073
1074                 where = strappend(dest, *i);
1075                 if (!where)
1076                         return log_oom();
1077
1078                 r = mkdir_label(where, 0755);
1079                 if (r < 0 && r != -EEXIST)
1080                         return log_error_errno(r, "Creating mount point for tmpfs %s failed: %m", where);
1081
1082                 if (mount("tmpfs", where, "tmpfs", MS_NODEV|MS_STRICTATIME, *o) < 0)
1083                         return log_error_errno(errno, "tmpfs mount to %s failed: %m", where);
1084         }
1085
1086         return 0;
1087 }
1088
1089 static int setup_timezone(const char *dest) {
1090         _cleanup_free_ char *where = NULL, *p = NULL, *q = NULL, *check = NULL, *what = NULL;
1091         char *z, *y;
1092         int r;
1093
1094         assert(dest);
1095
1096         /* Fix the timezone, if possible */
1097         r = readlink_malloc("/etc/localtime", &p);
1098         if (r < 0) {
1099                 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
1100                 return 0;
1101         }
1102
1103         z = path_startswith(p, "../usr/share/zoneinfo/");
1104         if (!z)
1105                 z = path_startswith(p, "/usr/share/zoneinfo/");
1106         if (!z) {
1107                 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
1108                 return 0;
1109         }
1110
1111         where = strappend(dest, "/etc/localtime");
1112         if (!where)
1113                 return log_oom();
1114
1115         r = readlink_malloc(where, &q);
1116         if (r >= 0) {
1117                 y = path_startswith(q, "../usr/share/zoneinfo/");
1118                 if (!y)
1119                         y = path_startswith(q, "/usr/share/zoneinfo/");
1120
1121                 /* Already pointing to the right place? Then do nothing .. */
1122                 if (y && streq(y, z))
1123                         return 0;
1124         }
1125
1126         check = strjoin(dest, "/usr/share/zoneinfo/", z, NULL);
1127         if (!check)
1128                 return log_oom();
1129
1130         if (access(check, F_OK) < 0) {
1131                 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
1132                 return 0;
1133         }
1134
1135         what = strappend("../usr/share/zoneinfo/", z);
1136         if (!what)
1137                 return log_oom();
1138
1139         r = mkdir_parents(where, 0755);
1140         if (r < 0) {
1141                 log_error_errno(r, "Failed to create directory for timezone info %s in container: %m", where);
1142
1143                 return 0;
1144         }
1145
1146         r = unlink(where);
1147         if (r < 0 && errno != ENOENT) {
1148                 log_error_errno(errno, "Failed to remove existing timezone info %s in container: %m", where);
1149
1150                 return 0;
1151         }
1152
1153         if (symlink(what, where) < 0) {
1154                 log_error_errno(errno, "Failed to correct timezone of container: %m");
1155                 return 0;
1156         }
1157
1158         return 0;
1159 }
1160
1161 static int setup_resolv_conf(const char *dest) {
1162         _cleanup_free_ char *where = NULL;
1163         int r;
1164
1165         assert(dest);
1166
1167         if (arg_private_network)
1168                 return 0;
1169
1170         /* Fix resolv.conf, if possible */
1171         where = strappend(dest, "/etc/resolv.conf");
1172         if (!where)
1173                 return log_oom();
1174
1175         /* We don't really care for the results of this really. If it
1176          * fails, it fails, but meh... */
1177         r = mkdir_parents(where, 0755);
1178         if (r < 0) {
1179                 log_warning_errno(r, "Failed to create parent directory for resolv.conf %s: %m", where);
1180
1181                 return 0;
1182         }
1183
1184         r = copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW, 0644, 0);
1185         if (r < 0) {
1186                 log_warning_errno(r, "Failed to copy /etc/resolv.conf to %s: %m", where);
1187
1188                 return 0;
1189         }
1190
1191         return 0;
1192 }
1193
1194 static int setup_volatile_state(const char *directory) {
1195         const char *p;
1196         int r;
1197
1198         assert(directory);
1199
1200         if (arg_volatile != VOLATILE_STATE)
1201                 return 0;
1202
1203         /* --volatile=state means we simply overmount /var
1204            with a tmpfs, and the rest read-only. */
1205
1206         r = bind_remount_recursive(directory, true);
1207         if (r < 0)
1208                 return log_error_errno(r, "Failed to remount %s read-only: %m", directory);
1209
1210         p = strappenda(directory, "/var");
1211         r = mkdir(p, 0755);
1212         if (r < 0 && errno != EEXIST)
1213                 return log_error_errno(errno, "Failed to create %s: %m", directory);
1214
1215         if (mount("tmpfs", p, "tmpfs", MS_STRICTATIME, "mode=755") < 0)
1216                 return log_error_errno(errno, "Failed to mount tmpfs to /var: %m");
1217
1218         return 0;
1219 }
1220
1221 static int setup_volatile(const char *directory) {
1222         bool tmpfs_mounted = false, bind_mounted = false;
1223         char template[] = "/tmp/nspawn-volatile-XXXXXX";
1224         const char *f, *t;
1225         int r;
1226
1227         assert(directory);
1228
1229         if (arg_volatile != VOLATILE_YES)
1230                 return 0;
1231
1232         /* --volatile=yes means we mount a tmpfs to the root dir, and
1233            the original /usr to use inside it, and that read-only. */
1234
1235         if (!mkdtemp(template))
1236                 return log_error_errno(errno, "Failed to create temporary directory: %m");
1237
1238         if (mount("tmpfs", template, "tmpfs", MS_STRICTATIME, "mode=755") < 0) {
1239                 log_error_errno(errno, "Failed to mount tmpfs for root directory: %m");
1240                 r = -errno;
1241                 goto fail;
1242         }
1243
1244         tmpfs_mounted = true;
1245
1246         f = strappenda(directory, "/usr");
1247         t = strappenda(template, "/usr");
1248
1249         r = mkdir(t, 0755);
1250         if (r < 0 && errno != EEXIST) {
1251                 log_error_errno(errno, "Failed to create %s: %m", t);
1252                 r = -errno;
1253                 goto fail;
1254         }
1255
1256         if (mount(f, t, "bind", MS_BIND|MS_REC, NULL) < 0) {
1257                 log_error_errno(errno, "Failed to create /usr bind mount: %m");
1258                 r = -errno;
1259                 goto fail;
1260         }
1261
1262         bind_mounted = true;
1263
1264         r = bind_remount_recursive(t, true);
1265         if (r < 0) {
1266                 log_error_errno(r, "Failed to remount %s read-only: %m", t);
1267                 goto fail;
1268         }
1269
1270         if (mount(template, directory, NULL, MS_MOVE, NULL) < 0) {
1271                 log_error_errno(errno, "Failed to move root mount: %m");
1272                 r = -errno;
1273                 goto fail;
1274         }
1275
1276         rmdir(template);
1277
1278         return 0;
1279
1280 fail:
1281         if (bind_mounted)
1282                 umount(t);
1283         if (tmpfs_mounted)
1284                 umount(template);
1285         rmdir(template);
1286         return r;
1287 }
1288
1289 static char* id128_format_as_uuid(sd_id128_t id, char s[37]) {
1290
1291         snprintf(s, 37,
1292                  "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
1293                  SD_ID128_FORMAT_VAL(id));
1294
1295         return s;
1296 }
1297
1298 static int setup_boot_id(const char *dest) {
1299         _cleanup_free_ char *from = NULL, *to = NULL;
1300         sd_id128_t rnd = {};
1301         char as_uuid[37];
1302         int r;
1303
1304         assert(dest);
1305
1306         if (arg_share_system)
1307                 return 0;
1308
1309         /* Generate a new randomized boot ID, so that each boot-up of
1310          * the container gets a new one */
1311
1312         from = strappend(dest, "/dev/proc-sys-kernel-random-boot-id");
1313         to = strappend(dest, "/proc/sys/kernel/random/boot_id");
1314         if (!from || !to)
1315                 return log_oom();
1316
1317         r = sd_id128_randomize(&rnd);
1318         if (r < 0)
1319                 return log_error_errno(r, "Failed to generate random boot id: %m");
1320
1321         id128_format_as_uuid(rnd, as_uuid);
1322
1323         r = write_string_file(from, as_uuid);
1324         if (r < 0)
1325                 return log_error_errno(r, "Failed to write boot id: %m");
1326
1327         if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
1328                 log_error_errno(errno, "Failed to bind mount boot id: %m");
1329                 r = -errno;
1330         } else if (mount(from, to, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL))
1331                 log_warning_errno(errno, "Failed to make boot id read-only: %m");
1332
1333         unlink(from);
1334         return r;
1335 }
1336
1337 static int copy_devnodes(const char *dest) {
1338
1339         static const char devnodes[] =
1340                 "null\0"
1341                 "zero\0"
1342                 "full\0"
1343                 "random\0"
1344                 "urandom\0"
1345                 "tty\0"
1346                 "net/tun\0";
1347
1348         const char *d;
1349         int r = 0;
1350         _cleanup_umask_ mode_t u;
1351
1352         assert(dest);
1353
1354         u = umask(0000);
1355
1356         NULSTR_FOREACH(d, devnodes) {
1357                 _cleanup_free_ char *from = NULL, *to = NULL;
1358                 struct stat st;
1359
1360                 from = strappend("/dev/", d);
1361                 to = strjoin(dest, "/dev/", d, NULL);
1362                 if (!from || !to)
1363                         return log_oom();
1364
1365                 if (stat(from, &st) < 0) {
1366
1367                         if (errno != ENOENT)
1368                                 return log_error_errno(errno, "Failed to stat %s: %m", from);
1369
1370                 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
1371
1372                         log_error("%s is not a char or block device, cannot copy", from);
1373                         return -EIO;
1374
1375                 } else {
1376                         r = mkdir_parents(to, 0775);
1377                         if (r < 0) {
1378                                 log_error_errno(r, "Failed to create parent directory of %s: %m", to);
1379                                 return -r;
1380                         }
1381
1382                         if (mknod(to, st.st_mode, st.st_rdev) < 0)
1383                                 return log_error_errno(errno, "mknod(%s) failed: %m", to);
1384                 }
1385         }
1386
1387         return r;
1388 }
1389
1390 static int setup_ptmx(const char *dest) {
1391         _cleanup_free_ char *p = NULL;
1392
1393         p = strappend(dest, "/dev/ptmx");
1394         if (!p)
1395                 return log_oom();
1396
1397         if (symlink("pts/ptmx", p) < 0)
1398                 return log_error_errno(errno, "Failed to create /dev/ptmx symlink: %m");
1399
1400         return 0;
1401 }
1402
1403 static int setup_dev_console(const char *dest, const char *console) {
1404         _cleanup_umask_ mode_t u;
1405         const char *to;
1406         struct stat st;
1407         int r;
1408
1409         assert(dest);
1410         assert(console);
1411
1412         u = umask(0000);
1413
1414         if (stat("/dev/null", &st) < 0)
1415                 return log_error_errno(errno, "Failed to stat /dev/null: %m");
1416
1417         r = chmod_and_chown(console, 0600, 0, 0);
1418         if (r < 0)
1419                 return log_error_errno(r, "Failed to correct access mode for TTY: %m");
1420
1421         /* We need to bind mount the right tty to /dev/console since
1422          * ptys can only exist on pts file systems. To have something
1423          * to bind mount things on we create a device node first, and
1424          * use /dev/null for that since we the cgroups device policy
1425          * allows us to create that freely, while we cannot create
1426          * /dev/console. (Note that the major minor doesn't actually
1427          * matter here, since we mount it over anyway). */
1428
1429         to = strappenda(dest, "/dev/console");
1430         if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0)
1431                 return log_error_errno(errno, "mknod() for /dev/console failed: %m");
1432
1433         if (mount(console, to, "bind", MS_BIND, NULL) < 0)
1434                 return log_error_errno(errno, "Bind mount for /dev/console failed: %m");
1435
1436         return 0;
1437 }
1438
1439 static int setup_kmsg(const char *dest, int kmsg_socket) {
1440         _cleanup_free_ char *from = NULL, *to = NULL;
1441         _cleanup_umask_ mode_t u;
1442         int r, fd, k;
1443         union {
1444                 struct cmsghdr cmsghdr;
1445                 uint8_t buf[CMSG_SPACE(sizeof(int))];
1446         } control = {};
1447         struct msghdr mh = {
1448                 .msg_control = &control,
1449                 .msg_controllen = sizeof(control),
1450         };
1451         struct cmsghdr *cmsg;
1452
1453         assert(dest);
1454         assert(kmsg_socket >= 0);
1455
1456         u = umask(0000);
1457
1458         /* We create the kmsg FIFO as /dev/kmsg, but immediately
1459          * delete it after bind mounting it to /proc/kmsg. While FIFOs
1460          * on the reading side behave very similar to /proc/kmsg,
1461          * their writing side behaves differently from /dev/kmsg in
1462          * that writing blocks when nothing is reading. In order to
1463          * avoid any problems with containers deadlocking due to this
1464          * we simply make /dev/kmsg unavailable to the container. */
1465         if (asprintf(&from, "%s/dev/kmsg", dest) < 0 ||
1466             asprintf(&to, "%s/proc/kmsg", dest) < 0)
1467                 return log_oom();
1468
1469         if (mkfifo(from, 0600) < 0)
1470                 return log_error_errno(errno, "mkfifo() for /dev/kmsg failed: %m");
1471
1472         r = chmod_and_chown(from, 0600, 0, 0);
1473         if (r < 0)
1474                 return log_error_errno(r, "Failed to correct access mode for /dev/kmsg: %m");
1475
1476         if (mount(from, to, "bind", MS_BIND, NULL) < 0)
1477                 return log_error_errno(errno, "Bind mount for /proc/kmsg failed: %m");
1478
1479         fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
1480         if (fd < 0)
1481                 return log_error_errno(errno, "Failed to open fifo: %m");
1482
1483         cmsg = CMSG_FIRSTHDR(&mh);
1484         cmsg->cmsg_level = SOL_SOCKET;
1485         cmsg->cmsg_type = SCM_RIGHTS;
1486         cmsg->cmsg_len = CMSG_LEN(sizeof(int));
1487         memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
1488
1489         mh.msg_controllen = cmsg->cmsg_len;
1490
1491         /* Store away the fd in the socket, so that it stays open as
1492          * long as we run the child */
1493         k = sendmsg(kmsg_socket, &mh, MSG_NOSIGNAL);
1494         safe_close(fd);
1495
1496         if (k < 0)
1497                 return log_error_errno(errno, "Failed to send FIFO fd: %m");
1498
1499         /* And now make the FIFO unavailable as /dev/kmsg... */
1500         unlink(from);
1501         return 0;
1502 }
1503
1504 static int send_rtnl(int send_fd) {
1505         union {
1506                 struct cmsghdr cmsghdr;
1507                 uint8_t buf[CMSG_SPACE(sizeof(int))];
1508         } control = {};
1509         struct msghdr mh = {
1510                 .msg_control = &control,
1511                 .msg_controllen = sizeof(control),
1512         };
1513         struct cmsghdr *cmsg;
1514         _cleanup_close_ int fd = -1;
1515         ssize_t k;
1516
1517         assert(send_fd >= 0);
1518
1519         if (!arg_expose_ports)
1520                 return 0;
1521
1522         fd = socket(PF_NETLINK, SOCK_RAW|SOCK_CLOEXEC|SOCK_NONBLOCK, NETLINK_ROUTE);
1523         if (fd < 0)
1524                 return log_error_errno(errno, "failed to allocate container netlink: %m");
1525
1526         cmsg = CMSG_FIRSTHDR(&mh);
1527         cmsg->cmsg_level = SOL_SOCKET;
1528         cmsg->cmsg_type = SCM_RIGHTS;
1529         cmsg->cmsg_len = CMSG_LEN(sizeof(int));
1530         memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
1531
1532         mh.msg_controllen = cmsg->cmsg_len;
1533
1534         /* Store away the fd in the socket, so that it stays open as
1535          * long as we run the child */
1536         k = sendmsg(send_fd, &mh, MSG_NOSIGNAL);
1537         if (k < 0)
1538                 return log_error_errno(errno, "Failed to send netlink fd: %m");
1539
1540         return 0;
1541 }
1542
1543 static int flush_ports(union in_addr_union *exposed) {
1544         ExposePort *p;
1545         int r, af = AF_INET;
1546
1547         assert(exposed);
1548
1549         if (!arg_expose_ports)
1550                 return 0;
1551
1552         if (in_addr_is_null(af, exposed))
1553                 return 0;
1554
1555         log_debug("Lost IP address.");
1556
1557         LIST_FOREACH(ports, p, arg_expose_ports) {
1558                 r = fw_add_local_dnat(false,
1559                                       af,
1560                                       p->protocol,
1561                                       NULL,
1562                                       NULL, 0,
1563                                       NULL, 0,
1564                                       p->host_port,
1565                                       exposed,
1566                                       p->container_port,
1567                                       NULL);
1568                 if (r < 0)
1569                         log_warning_errno(r, "Failed to modify firewall: %m");
1570         }
1571
1572         *exposed = IN_ADDR_NULL;
1573         return 0;
1574 }
1575
1576 static int expose_ports(sd_rtnl *rtnl, union in_addr_union *exposed) {
1577         _cleanup_free_ struct local_address *addresses = NULL;
1578         _cleanup_free_ char *pretty = NULL;
1579         union in_addr_union new_exposed;
1580         ExposePort *p;
1581         bool add;
1582         int af = AF_INET, r;
1583
1584         assert(exposed);
1585
1586         /* Invoked each time an address is added or removed inside the
1587          * container */
1588
1589         if (!arg_expose_ports)
1590                 return 0;
1591
1592         r = local_addresses(rtnl, 0, af, &addresses);
1593         if (r < 0)
1594                 return log_error_errno(r, "Failed to enumerate local addresses: %m");
1595
1596         add = r > 0 &&
1597                 addresses[0].family == af &&
1598                 addresses[0].scope < RT_SCOPE_LINK;
1599
1600         if (!add)
1601                 return flush_ports(exposed);
1602
1603         new_exposed = addresses[0].address;
1604         if (in_addr_equal(af, exposed, &new_exposed))
1605                 return 0;
1606
1607         in_addr_to_string(af, &new_exposed, &pretty);
1608         log_debug("New container IP is %s.", strna(pretty));
1609
1610         LIST_FOREACH(ports, p, arg_expose_ports) {
1611
1612                 r = fw_add_local_dnat(true,
1613                                       af,
1614                                       p->protocol,
1615                                       NULL,
1616                                       NULL, 0,
1617                                       NULL, 0,
1618                                       p->host_port,
1619                                       &new_exposed,
1620                                       p->container_port,
1621                                       in_addr_is_null(af, exposed) ? NULL : exposed);
1622                 if (r < 0)
1623                         log_warning_errno(r, "Failed to modify firewall: %m");
1624         }
1625
1626         *exposed = new_exposed;
1627         return 0;
1628 }
1629
1630 static int on_address_change(sd_rtnl *rtnl, sd_rtnl_message *m, void *userdata) {
1631         union in_addr_union *exposed = userdata;
1632
1633         assert(rtnl);
1634         assert(m);
1635         assert(exposed);
1636
1637         expose_ports(rtnl, exposed);
1638         return 0;
1639 }
1640
1641 static int watch_rtnl(sd_event *event, int recv_fd, union in_addr_union *exposed, sd_rtnl **ret) {
1642         union {
1643                 struct cmsghdr cmsghdr;
1644                 uint8_t buf[CMSG_SPACE(sizeof(int))];
1645         } control = {};
1646         struct msghdr mh = {
1647                 .msg_control = &control,
1648                 .msg_controllen = sizeof(control),
1649         };
1650         struct cmsghdr *cmsg;
1651         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1652         int fd, r;
1653         ssize_t k;
1654
1655         assert(event);
1656         assert(recv_fd >= 0);
1657         assert(ret);
1658
1659         if (!arg_expose_ports)
1660                 return 0;
1661
1662         k = recvmsg(recv_fd, &mh, MSG_NOSIGNAL);
1663         if (k < 0)
1664                 return log_error_errno(errno, "Failed to recv netlink fd: %m");
1665
1666         cmsg = CMSG_FIRSTHDR(&mh);
1667         assert(cmsg->cmsg_level == SOL_SOCKET);
1668         assert(cmsg->cmsg_type == SCM_RIGHTS);
1669         assert(cmsg->cmsg_len == CMSG_LEN(sizeof(int)));
1670         memcpy(&fd, CMSG_DATA(cmsg), sizeof(int));
1671
1672         r = sd_rtnl_open_fd(&rtnl, fd, 1, RTNLGRP_IPV4_IFADDR);
1673         if (r < 0) {
1674                 safe_close(fd);
1675                 return log_error_errno(r, "Failed to create rtnl object: %m");
1676         }
1677
1678         r = sd_rtnl_add_match(rtnl, RTM_NEWADDR, on_address_change, exposed);
1679         if (r < 0)
1680                 return log_error_errno(r, "Failed to subscribe to RTM_NEWADDR messages: %m");
1681
1682         r = sd_rtnl_add_match(rtnl, RTM_DELADDR, on_address_change, exposed);
1683         if (r < 0)
1684                 return log_error_errno(r, "Failed to subscribe to RTM_DELADDR messages: %m");
1685
1686         r = sd_rtnl_attach_event(rtnl, event, 0);
1687         if (r < 0)
1688                 return log_error_errno(r, "Failed to add to even loop: %m");
1689
1690         *ret = rtnl;
1691         rtnl = NULL;
1692
1693         return 0;
1694 }
1695
1696 static int setup_hostname(void) {
1697
1698         if (arg_share_system)
1699                 return 0;
1700
1701         if (sethostname_idempotent(arg_machine) < 0)
1702                 return -errno;
1703
1704         return 0;
1705 }
1706
1707 static int setup_journal(const char *directory) {
1708         sd_id128_t machine_id, this_id;
1709         _cleanup_free_ char *p = NULL, *b = NULL, *q = NULL, *d = NULL;
1710         char *id;
1711         int r;
1712
1713         /* Don't link journals in ephemeral mode */
1714         if (arg_ephemeral)
1715                 return 0;
1716
1717         p = strappend(directory, "/etc/machine-id");
1718         if (!p)
1719                 return log_oom();
1720
1721         r = read_one_line_file(p, &b);
1722         if (r == -ENOENT && arg_link_journal == LINK_AUTO)
1723                 return 0;
1724         else if (r < 0)
1725                 return log_error_errno(r, "Failed to read machine ID from %s: %m", p);
1726
1727         id = strstrip(b);
1728         if (isempty(id) && arg_link_journal == LINK_AUTO)
1729                 return 0;
1730
1731         /* Verify validity */
1732         r = sd_id128_from_string(id, &machine_id);
1733         if (r < 0)
1734                 return log_error_errno(r, "Failed to parse machine ID from %s: %m", p);
1735
1736         r = sd_id128_get_machine(&this_id);
1737         if (r < 0)
1738                 return log_error_errno(r, "Failed to retrieve machine ID: %m");
1739
1740         if (sd_id128_equal(machine_id, this_id)) {
1741                 log_full(arg_link_journal == LINK_AUTO ? LOG_WARNING : LOG_ERR,
1742                          "Host and machine ids are equal (%s): refusing to link journals", id);
1743                 if (arg_link_journal == LINK_AUTO)
1744                         return 0;
1745                 return -EEXIST;
1746         }
1747
1748         if (arg_link_journal == LINK_NO)
1749                 return 0;
1750
1751         free(p);
1752         p = strappend("/var/log/journal/", id);
1753         q = strjoin(directory, "/var/log/journal/", id, NULL);
1754         if (!p || !q)
1755                 return log_oom();
1756
1757         if (path_is_mount_point(p, false) > 0) {
1758                 if (arg_link_journal != LINK_AUTO) {
1759                         log_error("%s: already a mount point, refusing to use for journal", p);
1760                         return -EEXIST;
1761                 }
1762
1763                 return 0;
1764         }
1765
1766         if (path_is_mount_point(q, false) > 0) {
1767                 if (arg_link_journal != LINK_AUTO) {
1768                         log_error("%s: already a mount point, refusing to use for journal", q);
1769                         return -EEXIST;
1770                 }
1771
1772                 return 0;
1773         }
1774
1775         r = readlink_and_make_absolute(p, &d);
1776         if (r >= 0) {
1777                 if ((arg_link_journal == LINK_GUEST ||
1778                      arg_link_journal == LINK_AUTO) &&
1779                     path_equal(d, q)) {
1780
1781                         r = mkdir_p(q, 0755);
1782                         if (r < 0)
1783                                 log_warning_errno(errno, "Failed to create directory %s: %m", q);
1784                         return 0;
1785                 }
1786
1787                 if (unlink(p) < 0)
1788                         return log_error_errno(errno, "Failed to remove symlink %s: %m", p);
1789         } else if (r == -EINVAL) {
1790
1791                 if (arg_link_journal == LINK_GUEST &&
1792                     rmdir(p) < 0) {
1793
1794                         if (errno == ENOTDIR) {
1795                                 log_error("%s already exists and is neither a symlink nor a directory", p);
1796                                 return r;
1797                         } else {
1798                                 log_error_errno(errno, "Failed to remove %s: %m", p);
1799                                 return -errno;
1800                         }
1801                 }
1802         } else if (r != -ENOENT) {
1803                 log_error_errno(errno, "readlink(%s) failed: %m", p);
1804                 return r;
1805         }
1806
1807         if (arg_link_journal == LINK_GUEST) {
1808
1809                 if (symlink(q, p) < 0) {
1810                         if (arg_link_journal_try) {
1811                                 log_debug_errno(errno, "Failed to symlink %s to %s, skipping journal setup: %m", q, p);
1812                                 return 0;
1813                         } else {
1814                                 log_error_errno(errno, "Failed to symlink %s to %s: %m", q, p);
1815                                 return -errno;
1816                         }
1817                 }
1818
1819                 r = mkdir_p(q, 0755);
1820                 if (r < 0)
1821                         log_warning_errno(errno, "Failed to create directory %s: %m", q);
1822                 return 0;
1823         }
1824
1825         if (arg_link_journal == LINK_HOST) {
1826                 /* don't create parents here -- if the host doesn't have
1827                  * permanent journal set up, don't force it here */
1828                 r = mkdir(p, 0755);
1829                 if (r < 0) {
1830                         if (arg_link_journal_try) {
1831                                 log_debug_errno(errno, "Failed to create %s, skipping journal setup: %m", p);
1832                                 return 0;
1833                         } else {
1834                                 log_error_errno(errno, "Failed to create %s: %m", p);
1835                                 return r;
1836                         }
1837                 }
1838
1839         } else if (access(p, F_OK) < 0)
1840                 return 0;
1841
1842         if (dir_is_empty(q) == 0)
1843                 log_warning("%s is not empty, proceeding anyway.", q);
1844
1845         r = mkdir_p(q, 0755);
1846         if (r < 0) {
1847                 log_error_errno(errno, "Failed to create %s: %m", q);
1848                 return r;
1849         }
1850
1851         if (mount(p, q, "bind", MS_BIND, NULL) < 0)
1852                 return log_error_errno(errno, "Failed to bind mount journal from host into guest: %m");
1853
1854         return 0;
1855 }
1856
1857 static int drop_capabilities(void) {
1858         return capability_bounding_set_drop(~arg_retain, false);
1859 }
1860
1861 static int register_machine(pid_t pid, int local_ifindex) {
1862         _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1863         _cleanup_bus_close_unref_ sd_bus *bus = NULL;
1864         int r;
1865
1866         if (!arg_register)
1867                 return 0;
1868
1869         r = sd_bus_default_system(&bus);
1870         if (r < 0)
1871                 return log_error_errno(r, "Failed to open system bus: %m");
1872
1873         if (arg_keep_unit) {
1874                 r = sd_bus_call_method(
1875                                 bus,
1876                                 "org.freedesktop.machine1",
1877                                 "/org/freedesktop/machine1",
1878                                 "org.freedesktop.machine1.Manager",
1879                                 "RegisterMachineWithNetwork",
1880                                 &error,
1881                                 NULL,
1882                                 "sayssusai",
1883                                 arg_machine,
1884                                 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1885                                 "nspawn",
1886                                 "container",
1887                                 (uint32_t) pid,
1888                                 strempty(arg_directory),
1889                                 local_ifindex > 0 ? 1 : 0, local_ifindex);
1890         } else {
1891                 _cleanup_bus_message_unref_ sd_bus_message *m = NULL;
1892
1893                 r = sd_bus_message_new_method_call(
1894                                 bus,
1895                                 &m,
1896                                 "org.freedesktop.machine1",
1897                                 "/org/freedesktop/machine1",
1898                                 "org.freedesktop.machine1.Manager",
1899                                 "CreateMachineWithNetwork");
1900                 if (r < 0)
1901                         return log_error_errno(r, "Failed to create message: %m");
1902
1903                 r = sd_bus_message_append(
1904                                 m,
1905                                 "sayssusai",
1906                                 arg_machine,
1907                                 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1908                                 "nspawn",
1909                                 "container",
1910                                 (uint32_t) pid,
1911                                 strempty(arg_directory),
1912                                 local_ifindex > 0 ? 1 : 0, local_ifindex);
1913                 if (r < 0)
1914                         return log_error_errno(r, "Failed to append message arguments: %m");
1915
1916                 r = sd_bus_message_open_container(m, 'a', "(sv)");
1917                 if (r < 0)
1918                         return log_error_errno(r, "Failed to open container: %m");
1919
1920                 if (!isempty(arg_slice)) {
1921                         r = sd_bus_message_append(m, "(sv)", "Slice", "s", arg_slice);
1922                         if (r < 0)
1923                                 return log_error_errno(r, "Failed to append slice: %m");
1924                 }
1925
1926                 r = sd_bus_message_append(m, "(sv)", "DevicePolicy", "s", "strict");
1927                 if (r < 0)
1928                         return log_error_errno(r, "Failed to add device policy: %m");
1929
1930                 r = sd_bus_message_append(m, "(sv)", "DeviceAllow", "a(ss)", 9,
1931                                           /* Allow the container to
1932                                            * access and create the API
1933                                            * device nodes, so that
1934                                            * PrivateDevices= in the
1935                                            * container can work
1936                                            * fine */
1937                                           "/dev/null", "rwm",
1938                                           "/dev/zero", "rwm",
1939                                           "/dev/full", "rwm",
1940                                           "/dev/random", "rwm",
1941                                           "/dev/urandom", "rwm",
1942                                           "/dev/tty", "rwm",
1943                                           "/dev/net/tun", "rwm",
1944                                           /* Allow the container
1945                                            * access to ptys. However,
1946                                            * do not permit the
1947                                            * container to ever create
1948                                            * these device nodes. */
1949                                           "/dev/pts/ptmx", "rw",
1950                                           "char-pts", "rw");
1951                 if (r < 0)
1952                         return log_error_errno(r, "Failed to add device whitelist: %m");
1953
1954                 r = sd_bus_message_close_container(m);
1955                 if (r < 0)
1956                         return log_error_errno(r, "Failed to close container: %m");
1957
1958                 r = sd_bus_call(bus, m, 0, &error, NULL);
1959         }
1960
1961         if (r < 0) {
1962                 log_error("Failed to register machine: %s", bus_error_message(&error, r));
1963                 return r;
1964         }
1965
1966         return 0;
1967 }
1968
1969 static int terminate_machine(pid_t pid) {
1970         _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1971         _cleanup_bus_message_unref_ sd_bus_message *reply = NULL;
1972         _cleanup_bus_close_unref_ sd_bus *bus = NULL;
1973         const char *path;
1974         int r;
1975
1976         if (!arg_register)
1977                 return 0;
1978
1979         r = sd_bus_default_system(&bus);
1980         if (r < 0)
1981                 return log_error_errno(r, "Failed to open system bus: %m");
1982
1983         r = sd_bus_call_method(
1984                         bus,
1985                         "org.freedesktop.machine1",
1986                         "/org/freedesktop/machine1",
1987                         "org.freedesktop.machine1.Manager",
1988                         "GetMachineByPID",
1989                         &error,
1990                         &reply,
1991                         "u",
1992                         (uint32_t) pid);
1993         if (r < 0) {
1994                 /* Note that the machine might already have been
1995                  * cleaned up automatically, hence don't consider it a
1996                  * failure if we cannot get the machine object. */
1997                 log_debug("Failed to get machine: %s", bus_error_message(&error, r));
1998                 return 0;
1999         }
2000
2001         r = sd_bus_message_read(reply, "o", &path);
2002         if (r < 0)
2003                 return bus_log_parse_error(r);
2004
2005         r = sd_bus_call_method(
2006                         bus,
2007                         "org.freedesktop.machine1",
2008                         path,
2009                         "org.freedesktop.machine1.Machine",
2010                         "Terminate",
2011                         &error,
2012                         NULL,
2013                         NULL);
2014         if (r < 0) {
2015                 log_debug("Failed to terminate machine: %s", bus_error_message(&error, r));
2016                 return 0;
2017         }
2018
2019         return 0;
2020 }
2021
2022 static int reset_audit_loginuid(void) {
2023         _cleanup_free_ char *p = NULL;
2024         int r;
2025
2026         if (arg_share_system)
2027                 return 0;
2028
2029         r = read_one_line_file("/proc/self/loginuid", &p);
2030         if (r == -ENOENT)
2031                 return 0;
2032         if (r < 0)
2033                 return log_error_errno(r, "Failed to read /proc/self/loginuid: %m");
2034
2035         /* Already reset? */
2036         if (streq(p, "4294967295"))
2037                 return 0;
2038
2039         r = write_string_file("/proc/self/loginuid", "4294967295");
2040         if (r < 0) {
2041                 log_error("Failed to reset audit login UID. This probably means that your kernel is too\n"
2042                           "old and you have audit enabled. Note that the auditing subsystem is known to\n"
2043                           "be incompatible with containers on old kernels. Please make sure to upgrade\n"
2044                           "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
2045                           "using systemd-nspawn. Sleeping for 5s... (%s)\n", strerror(-r));
2046
2047                 sleep(5);
2048         }
2049
2050         return 0;
2051 }
2052
2053 #define HOST_HASH_KEY SD_ID128_MAKE(1a,37,6f,c7,46,ec,45,0b,ad,a3,d5,31,06,60,5d,b1)
2054 #define CONTAINER_HASH_KEY SD_ID128_MAKE(c3,c4,f9,19,b5,57,b2,1c,e6,cf,14,27,03,9c,ee,a2)
2055 #define MACVLAN_HASH_KEY SD_ID128_MAKE(00,13,6d,bc,66,83,44,81,bb,0c,f9,51,1f,24,a6,6f)
2056
2057 static int generate_mac(struct ether_addr *mac, sd_id128_t hash_key, uint64_t idx) {
2058         uint8_t result[8];
2059         size_t l, sz;
2060         uint8_t *v, *i;
2061         int r;
2062
2063         l = strlen(arg_machine);
2064         sz = sizeof(sd_id128_t) + l;
2065         if (idx > 0)
2066                 sz += sizeof(idx);
2067
2068         v = alloca(sz);
2069
2070         /* fetch some persistent data unique to the host */
2071         r = sd_id128_get_machine((sd_id128_t*) v);
2072         if (r < 0)
2073                 return r;
2074
2075         /* combine with some data unique (on this host) to this
2076          * container instance */
2077         i = mempcpy(v + sizeof(sd_id128_t), arg_machine, l);
2078         if (idx > 0) {
2079                 idx = htole64(idx);
2080                 memcpy(i, &idx, sizeof(idx));
2081         }
2082
2083         /* Let's hash the host machine ID plus the container name. We
2084          * use a fixed, but originally randomly created hash key here. */
2085         siphash24(result, v, sz, hash_key.bytes);
2086
2087         assert_cc(ETH_ALEN <= sizeof(result));
2088         memcpy(mac->ether_addr_octet, result, ETH_ALEN);
2089
2090         /* see eth_random_addr in the kernel */
2091         mac->ether_addr_octet[0] &= 0xfe;        /* clear multicast bit */
2092         mac->ether_addr_octet[0] |= 0x02;        /* set local assignment bit (IEEE802) */
2093
2094         return 0;
2095 }
2096
2097 static int setup_veth(pid_t pid, char iface_name[IFNAMSIZ], int *ifi) {
2098         _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
2099         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
2100         struct ether_addr mac_host, mac_container;
2101         int r, i;
2102
2103         if (!arg_private_network)
2104                 return 0;
2105
2106         if (!arg_network_veth)
2107                 return 0;
2108
2109         /* Use two different interface name prefixes depending whether
2110          * we are in bridge mode or not. */
2111         snprintf(iface_name, IFNAMSIZ - 1, "%s-%s",
2112                  arg_network_bridge ? "vb" : "ve", arg_machine);
2113
2114         r = generate_mac(&mac_container, CONTAINER_HASH_KEY, 0);
2115         if (r < 0)
2116                 return log_error_errno(r, "Failed to generate predictable MAC address for container side: %m");
2117
2118         r = generate_mac(&mac_host, HOST_HASH_KEY, 0);
2119         if (r < 0)
2120                 return log_error_errno(r, "Failed to generate predictable MAC address for host side: %m");
2121
2122         r = sd_rtnl_open(&rtnl, 0);
2123         if (r < 0)
2124                 return log_error_errno(r, "Failed to connect to netlink: %m");
2125
2126         r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
2127         if (r < 0)
2128                 return log_error_errno(r, "Failed to allocate netlink message: %m");
2129
2130         r = sd_rtnl_message_append_string(m, IFLA_IFNAME, iface_name);
2131         if (r < 0)
2132                 return log_error_errno(r, "Failed to add netlink interface name: %m");
2133
2134         r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac_host);
2135         if (r < 0)
2136                 return log_error_errno(r, "Failed to add netlink MAC address: %m");
2137
2138         r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
2139         if (r < 0)
2140                 return log_error_errno(r, "Failed to open netlink container: %m");
2141
2142         r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "veth");
2143         if (r < 0)
2144                 return log_error_errno(r, "Failed to open netlink container: %m");
2145
2146         r = sd_rtnl_message_open_container(m, VETH_INFO_PEER);
2147         if (r < 0)
2148                 return log_error_errno(r, "Failed to open netlink container: %m");
2149
2150         r = sd_rtnl_message_append_string(m, IFLA_IFNAME, "host0");
2151         if (r < 0)
2152                 return log_error_errno(r, "Failed to add netlink interface name: %m");
2153
2154         r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac_container);
2155         if (r < 0)
2156                 return log_error_errno(r, "Failed to add netlink MAC address: %m");
2157
2158         r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
2159         if (r < 0)
2160                 return log_error_errno(r, "Failed to add netlink namespace field: %m");
2161
2162         r = sd_rtnl_message_close_container(m);
2163         if (r < 0)
2164                 return log_error_errno(r, "Failed to close netlink container: %m");
2165
2166         r = sd_rtnl_message_close_container(m);
2167         if (r < 0)
2168                 return log_error_errno(r, "Failed to close netlink container: %m");
2169
2170         r = sd_rtnl_message_close_container(m);
2171         if (r < 0)
2172                 return log_error_errno(r, "Failed to close netlink container: %m");
2173
2174         r = sd_rtnl_call(rtnl, m, 0, NULL);
2175         if (r < 0)
2176                 return log_error_errno(r, "Failed to add new veth interfaces: %m");
2177
2178         i = (int) if_nametoindex(iface_name);
2179         if (i <= 0)
2180                 return log_error_errno(errno, "Failed to resolve interface %s: %m", iface_name);
2181
2182         *ifi = i;
2183
2184         return 0;
2185 }
2186
2187 static int setup_bridge(const char veth_name[], int *ifi) {
2188         _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
2189         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
2190         int r, bridge;
2191
2192         if (!arg_private_network)
2193                 return 0;
2194
2195         if (!arg_network_veth)
2196                 return 0;
2197
2198         if (!arg_network_bridge)
2199                 return 0;
2200
2201         bridge = (int) if_nametoindex(arg_network_bridge);
2202         if (bridge <= 0)
2203                 return log_error_errno(errno, "Failed to resolve interface %s: %m", arg_network_bridge);
2204
2205         *ifi = bridge;
2206
2207         r = sd_rtnl_open(&rtnl, 0);
2208         if (r < 0)
2209                 return log_error_errno(r, "Failed to connect to netlink: %m");
2210
2211         r = sd_rtnl_message_new_link(rtnl, &m, RTM_SETLINK, 0);
2212         if (r < 0)
2213                 return log_error_errno(r, "Failed to allocate netlink message: %m");
2214
2215         r = sd_rtnl_message_link_set_flags(m, IFF_UP, IFF_UP);
2216         if (r < 0)
2217                 return log_error_errno(r, "Failed to set IFF_UP flag: %m");
2218
2219         r = sd_rtnl_message_append_string(m, IFLA_IFNAME, veth_name);
2220         if (r < 0)
2221                 return log_error_errno(r, "Failed to add netlink interface name field: %m");
2222
2223         r = sd_rtnl_message_append_u32(m, IFLA_MASTER, bridge);
2224         if (r < 0)
2225                 return log_error_errno(r, "Failed to add netlink master field: %m");
2226
2227         r = sd_rtnl_call(rtnl, m, 0, NULL);
2228         if (r < 0)
2229                 return log_error_errno(r, "Failed to add veth interface to bridge: %m");
2230
2231         return 0;
2232 }
2233
2234 static int parse_interface(struct udev *udev, const char *name) {
2235         _cleanup_udev_device_unref_ struct udev_device *d = NULL;
2236         char ifi_str[2 + DECIMAL_STR_MAX(int)];
2237         int ifi;
2238
2239         ifi = (int) if_nametoindex(name);
2240         if (ifi <= 0)
2241                 return log_error_errno(errno, "Failed to resolve interface %s: %m", name);
2242
2243         sprintf(ifi_str, "n%i", ifi);
2244         d = udev_device_new_from_device_id(udev, ifi_str);
2245         if (!d)
2246                 return log_error_errno(errno, "Failed to get udev device for interface %s: %m", name);
2247
2248         if (udev_device_get_is_initialized(d) <= 0) {
2249                 log_error("Network interface %s is not initialized yet.", name);
2250                 return -EBUSY;
2251         }
2252
2253         return ifi;
2254 }
2255
2256 static int move_network_interfaces(pid_t pid) {
2257         _cleanup_udev_unref_ struct udev *udev = NULL;
2258         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
2259         char **i;
2260         int r;
2261
2262         if (!arg_private_network)
2263                 return 0;
2264
2265         if (strv_isempty(arg_network_interfaces))
2266                 return 0;
2267
2268         r = sd_rtnl_open(&rtnl, 0);
2269         if (r < 0)
2270                 return log_error_errno(r, "Failed to connect to netlink: %m");
2271
2272         udev = udev_new();
2273         if (!udev) {
2274                 log_error("Failed to connect to udev.");
2275                 return -ENOMEM;
2276         }
2277
2278         STRV_FOREACH(i, arg_network_interfaces) {
2279                 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
2280                 int ifi;
2281
2282                 ifi = parse_interface(udev, *i);
2283                 if (ifi < 0)
2284                         return ifi;
2285
2286                 r = sd_rtnl_message_new_link(rtnl, &m, RTM_SETLINK, ifi);
2287                 if (r < 0)
2288                         return log_error_errno(r, "Failed to allocate netlink message: %m");
2289
2290                 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
2291                 if (r < 0)
2292                         return log_error_errno(r, "Failed to append namespace PID to netlink message: %m");
2293
2294                 r = sd_rtnl_call(rtnl, m, 0, NULL);
2295                 if (r < 0)
2296                         return log_error_errno(r, "Failed to move interface %s to namespace: %m", *i);
2297         }
2298
2299         return 0;
2300 }
2301
2302 static int setup_macvlan(pid_t pid) {
2303         _cleanup_udev_unref_ struct udev *udev = NULL;
2304         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
2305         unsigned idx = 0;
2306         char **i;
2307         int r;
2308
2309         if (!arg_private_network)
2310                 return 0;
2311
2312         if (strv_isempty(arg_network_macvlan))
2313                 return 0;
2314
2315         r = sd_rtnl_open(&rtnl, 0);
2316         if (r < 0)
2317                 return log_error_errno(r, "Failed to connect to netlink: %m");
2318
2319         udev = udev_new();
2320         if (!udev) {
2321                 log_error("Failed to connect to udev.");
2322                 return -ENOMEM;
2323         }
2324
2325         STRV_FOREACH(i, arg_network_macvlan) {
2326                 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
2327                 _cleanup_free_ char *n = NULL;
2328                 struct ether_addr mac;
2329                 int ifi;
2330
2331                 ifi = parse_interface(udev, *i);
2332                 if (ifi < 0)
2333                         return ifi;
2334
2335                 r = generate_mac(&mac, MACVLAN_HASH_KEY, idx++);
2336                 if (r < 0)
2337                         return log_error_errno(r, "Failed to create MACVLAN MAC address: %m");
2338
2339                 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
2340                 if (r < 0)
2341                         return log_error_errno(r, "Failed to allocate netlink message: %m");
2342
2343                 r = sd_rtnl_message_append_u32(m, IFLA_LINK, ifi);
2344                 if (r < 0)
2345                         return log_error_errno(r, "Failed to add netlink interface index: %m");
2346
2347                 n = strappend("mv-", *i);
2348                 if (!n)
2349                         return log_oom();
2350
2351                 strshorten(n, IFNAMSIZ-1);
2352
2353                 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, n);
2354                 if (r < 0)
2355                         return log_error_errno(r, "Failed to add netlink interface name: %m");
2356
2357                 r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac);
2358                 if (r < 0)
2359                         return log_error_errno(r, "Failed to add netlink MAC address: %m");
2360
2361                 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
2362                 if (r < 0)
2363                         return log_error_errno(r, "Failed to add netlink namespace field: %m");
2364
2365                 r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
2366                 if (r < 0)
2367                         return log_error_errno(r, "Failed to open netlink container: %m");
2368
2369                 r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "macvlan");
2370                 if (r < 0)
2371                         return log_error_errno(r, "Failed to open netlink container: %m");
2372
2373                 r = sd_rtnl_message_append_u32(m, IFLA_MACVLAN_MODE, MACVLAN_MODE_BRIDGE);
2374                 if (r < 0)
2375                         return log_error_errno(r, "Failed to append macvlan mode: %m");
2376
2377                 r = sd_rtnl_message_close_container(m);
2378                 if (r < 0)
2379                         return log_error_errno(r, "Failed to close netlink container: %m");
2380
2381                 r = sd_rtnl_message_close_container(m);
2382                 if (r < 0)
2383                         return log_error_errno(r, "Failed to close netlink container: %m");
2384
2385                 r = sd_rtnl_call(rtnl, m, 0, NULL);
2386                 if (r < 0)
2387                         return log_error_errno(r, "Failed to add new macvlan interfaces: %m");
2388         }
2389
2390         return 0;
2391 }
2392
2393 static int setup_ipvlan(pid_t pid) {
2394         _cleanup_udev_unref_ struct udev *udev = NULL;
2395         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
2396         char **i;
2397         int r;
2398
2399         if (!arg_private_network)
2400                 return 0;
2401
2402         if (strv_isempty(arg_network_ipvlan))
2403                 return 0;
2404
2405         r = sd_rtnl_open(&rtnl, 0);
2406         if (r < 0)
2407                 return log_error_errno(r, "Failed to connect to netlink: %m");
2408
2409         udev = udev_new();
2410         if (!udev) {
2411                 log_error("Failed to connect to udev.");
2412                 return -ENOMEM;
2413         }
2414
2415         STRV_FOREACH(i, arg_network_ipvlan) {
2416                 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
2417                 _cleanup_free_ char *n = NULL;
2418                 int ifi;
2419
2420                 ifi = parse_interface(udev, *i);
2421                 if (ifi < 0)
2422                         return ifi;
2423
2424                 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
2425                 if (r < 0)
2426                         return log_error_errno(r, "Failed to allocate netlink message: %m");
2427
2428                 r = sd_rtnl_message_append_u32(m, IFLA_LINK, ifi);
2429                 if (r < 0)
2430                         return log_error_errno(r, "Failed to add netlink interface index: %m");
2431
2432                 n = strappend("iv-", *i);
2433                 if (!n)
2434                         return log_oom();
2435
2436                 strshorten(n, IFNAMSIZ-1);
2437
2438                 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, n);
2439                 if (r < 0)
2440                         return log_error_errno(r, "Failed to add netlink interface name: %m");
2441
2442                 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
2443                 if (r < 0)
2444                         return log_error_errno(r, "Failed to add netlink namespace field: %m");
2445
2446                 r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
2447                 if (r < 0)
2448                         return log_error_errno(r, "Failed to open netlink container: %m");
2449
2450                 r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "ipvlan");
2451                 if (r < 0)
2452                         return log_error_errno(r, "Failed to open netlink container: %m");
2453
2454                 r = sd_rtnl_message_append_u16(m, IFLA_IPVLAN_MODE, IPVLAN_MODE_L2);
2455                 if (r < 0)
2456                         return log_error_errno(r, "Failed to add ipvlan mode: %m");
2457
2458                 r = sd_rtnl_message_close_container(m);
2459                 if (r < 0)
2460                         return log_error_errno(r, "Failed to close netlink container: %m");
2461
2462                 r = sd_rtnl_message_close_container(m);
2463                 if (r < 0)
2464                         return log_error_errno(r, "Failed to close netlink container: %m");
2465
2466                 r = sd_rtnl_call(rtnl, m, 0, NULL);
2467                 if (r < 0)
2468                         return log_error_errno(r, "Failed to add new ipvlan interfaces: %m");
2469         }
2470
2471         return 0;
2472 }
2473
/* Builds and loads the seccomp filter for the container payload: a default
 * allow policy in which a fixed list of system calls fails with EPERM and
 * the creation of audit netlink sockets fails with EAFNOSUPPORT.
 *
 * Returns 0 when built without seccomp support. Otherwise returns the result
 * of the last libseccomp call made; failures are logged. */
static int setup_seccomp(void) {

#ifdef HAVE_SECCOMP
        /* System calls that are answered with EPERM. */
        static const int blocked[] = {
                SCMP_SYS(kexec_load),
                SCMP_SYS(open_by_handle_at),
                SCMP_SYS(init_module),
                SCMP_SYS(finit_module),
                SCMP_SYS(delete_module),
                SCMP_SYS(iopl),
                SCMP_SYS(ioperm),
                SCMP_SYS(swapon),
                SCMP_SYS(swapoff),
        };

        scmp_filter_ctx ctx;
        unsigned k;
        int r;

        ctx = seccomp_init(SCMP_ACT_ALLOW);
        if (!ctx)
                return log_oom();

        r = seccomp_add_secondary_archs(ctx);
        if (r < 0) {
                log_error_errno(r, "Failed to add secondary archs to seccomp filter: %m");
                goto finish;
        }

        for (k = 0; k < ELEMENTSOF(blocked); k++) {
                r = seccomp_rule_add(ctx, SCMP_ACT_ERRNO(EPERM), blocked[k], 0);

                /* -EFAULT is treated as "unknown syscall" and skipped. */
                if (r >= 0 || r == -EFAULT)
                        continue;

                log_error_errno(r, "Failed to block syscall: %m");
                goto finish;
        }

        /* Audit does not work inside containers and much of the
         * userspace audit hookup fails there. We do not care and
         * simply refuse to create audit sockets.
         *
         * socket(AF_NETLINK, *, NETLINK_AUDIT) is made to fail with
         * EAFNOSUPPORT, which audit userspace takes as the sign that
         * audit is disabled in the kernel. */

        r = seccomp_rule_add(
                        ctx,
                        SCMP_ACT_ERRNO(EAFNOSUPPORT),
                        SCMP_SYS(socket),
                        2,
                        SCMP_A0(SCMP_CMP_EQ, AF_NETLINK),
                        SCMP_A2(SCMP_CMP_EQ, NETLINK_AUDIT));
        if (r < 0) {
                log_error_errno(r, "Failed to add audit seccomp rule: %m");
                goto finish;
        }

        /* Clear the filter's NNP attribute before loading it. */
        r = seccomp_attr_set(ctx, SCMP_FLTATR_CTL_NNP, 0);
        if (r < 0) {
                log_error_errno(r, "Failed to unset NO_NEW_PRIVS: %m");
                goto finish;
        }

        r = seccomp_load(ctx);
        if (r < 0)
                log_error_errno(r, "Failed to install seccomp audit filter: %m");

finish:
        /* Single exit path so the filter context is always released. */
        seccomp_release(ctx);
        return r;
#else
        return 0;
#endif

}
2553
2554 static int setup_propagate(const char *root) {
2555         const char *p, *q;
2556
2557         (void) mkdir_p("/run/systemd/nspawn/", 0755);
2558         (void) mkdir_p("/run/systemd/nspawn/propagate", 0600);
2559         p = strappenda("/run/systemd/nspawn/propagate/", arg_machine);
2560         (void) mkdir_p(p, 0600);
2561
2562         q = strappenda(root, "/run/systemd/nspawn/incoming");
2563         mkdir_parents(q, 0755);
2564         mkdir_p(q, 0600);
2565
2566         if (mount(p, q, NULL, MS_BIND, NULL) < 0)
2567                 return log_error_errno(errno, "Failed to install propagation bind mount.");
2568
2569         if (mount(NULL, q, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY, NULL) < 0)
2570                 return log_error_errno(errno, "Failed to make propagation mount read-only");
2571
2572         return 0;
2573 }
2574
2575 static int setup_image(char **device_path, int *loop_nr) {
2576         struct loop_info64 info = {
2577                 .lo_flags = LO_FLAGS_AUTOCLEAR|LO_FLAGS_PARTSCAN
2578         };
2579         _cleanup_close_ int fd = -1, control = -1, loop = -1;
2580         _cleanup_free_ char* loopdev = NULL;
2581         struct stat st;
2582         int r, nr;
2583
2584         assert(device_path);
2585         assert(loop_nr);
2586         assert(arg_image);
2587
2588         fd = open(arg_image, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
2589         if (fd < 0)
2590                 return log_error_errno(errno, "Failed to open %s: %m", arg_image);
2591
2592         if (fstat(fd, &st) < 0)
2593                 return log_error_errno(errno, "Failed to stat %s: %m", arg_image);
2594
2595         if (S_ISBLK(st.st_mode)) {
2596                 char *p;
2597
2598                 p = strdup(arg_image);
2599                 if (!p)
2600                         return log_oom();
2601
2602                 *device_path = p;
2603
2604                 *loop_nr = -1;
2605
2606                 r = fd;
2607                 fd = -1;
2608
2609                 return r;
2610         }
2611
2612         if (!S_ISREG(st.st_mode)) {
2613                 log_error_errno(errno, "%s is not a regular file or block device: %m", arg_image);
2614                 return -EINVAL;
2615         }
2616
2617         control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
2618         if (control < 0)
2619                 return log_error_errno(errno, "Failed to open /dev/loop-control: %m");
2620
2621         nr = ioctl(control, LOOP_CTL_GET_FREE);
2622         if (nr < 0)
2623                 return log_error_errno(errno, "Failed to allocate loop device: %m");
2624
2625         if (asprintf(&loopdev, "/dev/loop%i", nr) < 0)
2626                 return log_oom();
2627
2628         loop = open(loopdev, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
2629         if (loop < 0)
2630                 return log_error_errno(errno, "Failed to open loop device %s: %m", loopdev);
2631
2632         if (ioctl(loop, LOOP_SET_FD, fd) < 0)
2633                 return log_error_errno(errno, "Failed to set loopback file descriptor on %s: %m", loopdev);
2634
2635         if (arg_read_only)
2636                 info.lo_flags |= LO_FLAGS_READ_ONLY;
2637
2638         if (ioctl(loop, LOOP_SET_STATUS64, &info) < 0)
2639                 return log_error_errno(errno, "Failed to set loopback settings on %s: %m", loopdev);
2640
2641         *device_path = loopdev;
2642         loopdev = NULL;
2643
2644         *loop_nr = nr;
2645
2646         r = loop;
2647         loop = -1;
2648
2649         return r;
2650 }
2651
/* Explanatory text appended to the error messages that report an unusable
 * partition table. It is pasted directly after a format string, hence a
 * plain sequence of adjacent string literals. */
#define PARTITION_TABLE_BLURB \
        "Note that the disk image needs to either contain only a single MBR partition of\n" \
        "type 0x83 that is marked bootable, or a single GPT partition of type " \
        "0FC63DAF-8483-4772-8E79-3D69D8477DE4 or follow\n" \
        "    http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/\n" \
        "to be bootable with systemd-nspawn."
2658
/* Reads the partition table of the block device open as "fd" and selects the
 * partitions to use as root, home and server data file systems.
 *
 * Only GPT and MBR ("dos") partition tables are accepted.
 *
 * GPT: partitions flagged GPT_FLAG_NO_AUTO are ignored. For each of the
 * types GPT_HOME, GPT_SRV, GPT_ROOT_NATIVE and GPT_ROOT_SECONDARY (the latter
 * two only if defined for this build) the partition with the lowest partition
 * number wins. Partitions of type GPT_LINUX_GENERIC are remembered as a
 * fallback root.
 *
 * MBR: partitions of type 0x83 whose flags equal 0x80 (bootable) are
 * remembered as a fallback root.
 *
 * The root is chosen in the order native root, secondary root, fallback. The
 * fallback is only accepted if exactly one candidate exists.
 *
 * On success 0 is returned, *root_device, *root_device_rw and *secondary are
 * always set, and the home/srv outputs are set only if such a partition was
 * found. The device strings are heap-allocated and owned by the caller.
 * On failure an error is logged and returned. Without blkid support the
 * function always fails with -ENOTSUP. */
static int dissect_image(
                int fd,
                char **root_device, bool *root_device_rw,
                char **home_device, bool *home_device_rw,
                char **srv_device, bool *srv_device_rw,
                bool *secondary) {

#ifdef HAVE_BLKID
        int home_nr = -1, srv_nr = -1;
#ifdef GPT_ROOT_NATIVE
        int root_nr = -1;
#endif
#ifdef GPT_ROOT_SECONDARY
        int secondary_root_nr = -1;
#endif
        _cleanup_free_ char *home = NULL, *root = NULL, *secondary_root = NULL, *srv = NULL, *generic = NULL;
        _cleanup_udev_enumerate_unref_ struct udev_enumerate *e = NULL;
        _cleanup_udev_device_unref_ struct udev_device *d = NULL;
        _cleanup_blkid_free_probe_ blkid_probe b = NULL;
        _cleanup_udev_unref_ struct udev *udev = NULL;
        struct udev_list_entry *first, *item;
        bool home_rw = true, root_rw = true, secondary_root_rw = true, srv_rw = true, generic_rw = true;
        bool is_gpt, is_mbr, multiple_generic = false;
        const char *pttype = NULL;
        blkid_partlist pl;
        struct stat st;
        unsigned i;
        int r;

        assert(fd >= 0);
        assert(root_device);
        assert(home_device);
        assert(srv_device);
        assert(secondary);
        assert(arg_image);

        b = blkid_new_probe();
        if (!b)
                return log_oom();

        /* libblkid does not always set errno; an unset errno is treated as
         * an allocation failure here. */
        errno = 0;
        r = blkid_probe_set_device(b, fd, 0, 0);
        if (r != 0) {
                if (errno == 0)
                        return log_oom();

                log_error_errno(errno, "Failed to set device on blkid probe: %m");
                return -errno;
        }

        blkid_probe_enable_partitions(b, 1);
        blkid_probe_set_partitions_flags(b, BLKID_PARTS_ENTRY_DETAILS);

        errno = 0;
        r = blkid_do_safeprobe(b);
        if (r == -2 || r == 1) {
                /* -2: ambivalent result, 1: nothing detected */
                log_error("Failed to identify any partition table on\n"
                          "    %s\n"
                          PARTITION_TABLE_BLURB, arg_image);
                return -EINVAL;
        } else if (r != 0) {
                if (errno == 0)
                        errno = EIO;
                log_error_errno(errno, "Failed to probe: %m");
                return -errno;
        }

        blkid_probe_lookup_value(b, "PTTYPE", &pttype, NULL);

        is_gpt = streq_ptr(pttype, "gpt");
        is_mbr = streq_ptr(pttype, "dos");

        if (!is_gpt && !is_mbr) {
                log_error("No GPT or MBR partition table discovered on\n"
                          "    %s\n"
                          PARTITION_TABLE_BLURB, arg_image);
                return -EINVAL;
        }

        errno = 0;
        pl = blkid_probe_get_partitions(b);
        if (!pl) {
                if (errno == 0)
                        return log_oom();

                /* Report the reason too, like the other probe failures. */
                log_error_errno(errno, "Failed to list partitions of %s: %m", arg_image);
                return -errno;
        }

        udev = udev_new();
        if (!udev)
                return log_oom();

        if (fstat(fd, &st) < 0)
                return log_error_errno(errno, "Failed to stat block device: %m");

        d = udev_device_new_from_devnum(udev, 'b', st.st_rdev);
        if (!d)
                return log_oom();

        /* Wait until the kernel has created a device for every partition
         * blkid found, trying at most 10 times. */
        for (i = 0;; i++) {
                int n, m;

                if (i >= 10) {
                        log_error("Kernel partitions never appeared.");
                        return -ENXIO;
                }

                e = udev_enumerate_new(udev);
                if (!e)
                        return log_oom();

                r = udev_enumerate_add_match_parent(e, d);
                if (r < 0)
                        return log_oom();

                r = udev_enumerate_scan_devices(e);
                if (r < 0)
                        return log_error_errno(r, "Failed to scan for partition devices of %s: %m", arg_image);

                /* Count the partitions enumerated by the kernel */
                n = 0;
                first = udev_enumerate_get_list_entry(e);
                udev_list_entry_foreach(item, first)
                        n++;

                /* Count the partitions enumerated by blkid. The udev list is
                 * expected to hold one entry more than that, presumably the
                 * whole-disk device itself. */
                m = blkid_partlist_numof_partitions(pl);
                if (n == m + 1)
                        break;
                if (n > m + 1) {
                        log_error("blkid and kernel partition list do not match.");
                        return -EIO;
                }
                if (n < m + 1) {
                        unsigned j;

                        /* The kernel has probed fewer partitions than
                         * blkid? Maybe the kernel prober is still
                         * running or it got EBUSY because udev
                         * already opened the device. Let's reprobe
                         * the device, which is a synchronous call
                         * that waits until probing is complete. */

                        for (j = 0; j < 20; j++) {

                                r = ioctl(fd, BLKRRPART, 0);
                                if (r < 0)
                                        r = -errno;
                                if (r >= 0 || r != -EBUSY)
                                        break;

                                /* If something else has the device
                                 * open, such as an udev rule, the
                                 * ioctl will return EBUSY. Since
                                 * there's no way to wait until it
                                 * isn't busy anymore, let's just wait
                                 * a bit, and try again.
                                 *
                                 * This is really something they
                                 * should fix in the kernel! */

                                usleep(50 * USEC_PER_MSEC);
                        }

                        if (r < 0)
                                return log_error_errno(r, "Failed to reread partition table: %m");
                }

                e = udev_enumerate_unref(e);
        }

        first = udev_enumerate_get_list_entry(e);
        udev_list_entry_foreach(item, first) {
                _cleanup_udev_device_unref_ struct udev_device *q = NULL;
                const char *node;
                unsigned long long flags;
                blkid_partition pp;
                dev_t qn;
                int nr;

                errno = 0;
                q = udev_device_new_from_syspath(udev, udev_list_entry_get_name(item));
                if (!q) {
                        if (!errno)
                                errno = ENOMEM;

                        log_error_errno(errno, "Failed to get partition device of %s: %m", arg_image);
                        return -errno;
                }

                qn = udev_device_get_devnum(q);
                if (major(qn) == 0)
                        continue;

                /* Skip the device we were handed itself. */
                if (st.st_rdev == qn)
                        continue;

                node = udev_device_get_devnode(q);
                if (!node)
                        continue;

                pp = blkid_partlist_devno_to_partition(pl, qn);
                if (!pp)
                        continue;

                flags = blkid_partition_get_flags(pp);

                nr = blkid_partition_get_partno(pp);
                if (nr < 0)
                        continue;

                if (is_gpt) {
                        sd_id128_t type_id;
                        const char *stype;

                        if (flags & GPT_FLAG_NO_AUTO)
                                continue;

                        stype = blkid_partition_get_type_string(pp);
                        if (!stype)
                                continue;

                        if (sd_id128_from_string(stype, &type_id) < 0)
                                continue;

                        if (sd_id128_equal(type_id, GPT_HOME)) {

                                /* Prefer the lowest partition number. */
                                if (home && nr >= home_nr)
                                        continue;

                                home_nr = nr;
                                home_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&home, node);
                                if (r < 0)
                                        return log_oom();

                        } else if (sd_id128_equal(type_id, GPT_SRV)) {

                                if (srv && nr >= srv_nr)
                                        continue;

                                srv_nr = nr;
                                srv_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&srv, node);
                                if (r < 0)
                                        return log_oom();
                        }
#ifdef GPT_ROOT_NATIVE
                        else if (sd_id128_equal(type_id, GPT_ROOT_NATIVE)) {

                                if (root && nr >= root_nr)
                                        continue;

                                root_nr = nr;
                                root_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&root, node);
                                if (r < 0)
                                        return log_oom();
                        }
#endif
#ifdef GPT_ROOT_SECONDARY
                        else if (sd_id128_equal(type_id, GPT_ROOT_SECONDARY)) {

                                if (secondary_root && nr >= secondary_root_nr)
                                        continue;

                                secondary_root_nr = nr;
                                secondary_root_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&secondary_root, node);
                                if (r < 0)
                                        return log_oom();
                        }
#endif
                        else if (sd_id128_equal(type_id, GPT_LINUX_GENERIC)) {

                                if (generic)
                                        multiple_generic = true;
                                else {
                                        generic_rw = !(flags & GPT_FLAG_READ_ONLY);

                                        r = free_and_strdup(&generic, node);
                                        if (r < 0)
                                                return log_oom();
                                }
                        }

                } else if (is_mbr) {
                        int type;

                        if (flags != 0x80) /* Bootable flag */
                                continue;

                        type = blkid_partition_get_type(pp);
                        if (type != 0x83) /* Linux partition */
                                continue;

                        if (generic)
                                multiple_generic = true;
                        else {
                                generic_rw = true;

                                /* Must be recorded as the generic candidate,
                                 * not as the root: otherwise a second
                                 * bootable Linux partition is never noticed
                                 * and silently replaces the first one. */
                                r = free_and_strdup(&generic, node);
                                if (r < 0)
                                        return log_oom();
                        }
                }
        }

        if (root) {
                *root_device = root;
                root = NULL;

                *root_device_rw = root_rw;
                *secondary = false;
        } else if (secondary_root) {
                *root_device = secondary_root;
                secondary_root = NULL;

                *root_device_rw = secondary_root_rw;
                *secondary = true;
        } else if (generic) {

                /* There were no partitions with precise meanings
                 * around, but we found generic partitions. In this
                 * case, if there's only one, we can go ahead and boot
                 * it, otherwise we bail out, because we really cannot
                 * make any sense of it. */

                if (multiple_generic) {
                        log_error("Identified multiple bootable Linux partitions on\n"
                                  "    %s\n"
                                  PARTITION_TABLE_BLURB, arg_image);
                        return -EINVAL;
                }

                *root_device = generic;
                generic = NULL;

                *root_device_rw = generic_rw;
                *secondary = false;
        } else {
                log_error("Failed to identify root partition in disk image\n"
                          "    %s\n"
                          PARTITION_TABLE_BLURB, arg_image);
                return -EINVAL;
        }

        if (home) {
                *home_device = home;
                home = NULL;

                *home_device_rw = home_rw;
        }

        if (srv) {
                *srv_device = srv;
                srv = NULL;

                *srv_device_rw = srv_rw;
        }

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -ENOTSUP;
#endif
}
3031
/* Mounts the block device "what" on "where", or on "where" followed by
 * "directory" if "directory" is not NULL. The file system type is detected
 * with libblkid. The mount is made with MS_NODEV, and read-only unless "rw"
 * is true and arg_read_only is unset. LUKS volumes are refused.
 *
 * Returns 0 on success. On failure an error is logged and a negative
 * errno-style value is returned. Without blkid support the function always
 * fails with -ENOTSUP. */
static int mount_device(const char *what, const char *where, const char *directory, bool rw) {
#ifdef HAVE_BLKID
        _cleanup_blkid_free_probe_ blkid_probe b = NULL;
        const char *fstype = NULL, *p;
        int r;

        assert(what);
        assert(where);

        if (arg_read_only)
                rw = false;

        if (directory)
                p = strappenda(where, directory);
        else
                p = where;

        /* libblkid does not always set errno; an unset errno is treated as
         * an allocation failure here. */
        errno = 0;
        b = blkid_new_probe_from_filename(what);
        if (!b) {
                if (errno == 0)
                        return log_oom();
                log_error_errno(errno, "Failed to allocate prober for %s: %m", what);
                return -errno;
        }

        blkid_probe_enable_superblocks(b, 1);
        blkid_probe_set_superblocks_flags(b, BLKID_SUBLKS_TYPE);

        errno = 0;
        r = blkid_do_safeprobe(b);
        if (r == -2 || r == 1) {
                /* -2: ambivalent result, 1: nothing detected. A return of
                 * -1 is a real probing error and is reported below. */
                log_error("Cannot determine file system type of %s", what);
                return -EINVAL;
        } else if (r != 0) {
                if (errno == 0)
                        errno = EIO;
                log_error_errno(errno, "Failed to probe %s: %m", what);
                return -errno;
        }

        errno = 0;
        if (blkid_probe_lookup_value(b, "TYPE", &fstype, NULL) < 0) {
                /* Fix the return value before logging, so that it cannot
                 * be affected by whatever the logger does to errno. */
                r = errno != 0 ? -errno : -EINVAL;
                log_error("Failed to determine file system type of %s", what);
                return r;
        }

        if (streq(fstype, "crypto_LUKS")) {
                log_error("nspawn currently does not support LUKS disk images.");
                return -ENOTSUP;
        }

        if (mount(what, p, fstype, MS_NODEV|(rw ? 0 : MS_RDONLY), NULL) < 0)
                return log_error_errno(errno, "Failed to mount %s: %m", what);

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -ENOTSUP;
#endif
}
3095
3096 static int mount_devices(
3097                 const char *where,
3098                 const char *root_device, bool root_device_rw,
3099                 const char *home_device, bool home_device_rw,
3100                 const char *srv_device, bool srv_device_rw) {
3101         int r;
3102
3103         assert(where);
3104
3105         if (root_device) {
3106                 r = mount_device(root_device, arg_directory, NULL, root_device_rw);
3107                 if (r < 0)
3108                         return log_error_errno(r, "Failed to mount root directory: %m");
3109         }
3110
3111         if (home_device) {
3112                 r = mount_device(home_device, arg_directory, "/home", home_device_rw);
3113                 if (r < 0)
3114                         return log_error_errno(r, "Failed to mount home directory: %m");
3115         }
3116
3117         if (srv_device) {
3118                 r = mount_device(srv_device, arg_directory, "/srv", srv_device_rw);
3119                 if (r < 0)
3120                         return log_error_errno(r, "Failed to mount server data directory: %m");
3121         }
3122
3123         return 0;
3124 }
3125
3126 static void loop_remove(int nr, int *image_fd) {
3127         _cleanup_close_ int control = -1;
3128         int r;
3129
3130         if (nr < 0)
3131                 return;
3132
3133         if (image_fd && *image_fd >= 0) {
3134                 r = ioctl(*image_fd, LOOP_CLR_FD);
3135                 if (r < 0)
3136                         log_debug_errno(errno, "Failed to close loop image: %m");
3137                 *image_fd = safe_close(*image_fd);
3138         }
3139
3140         control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
3141         if (control < 0) {
3142                 log_warning_errno(errno, "Failed to open /dev/loop-control: %m");
3143                 return;
3144         }
3145
3146         r = ioctl(control, LOOP_CTL_REMOVE, nr);
3147         if (r < 0)
3148                 log_debug_errno(errno, "Failed to remove loop %d: %m", nr);
3149 }
3150
/* Forks a child that runs "getent <database> <key>" with an empty
 * environment, stdin and stderr connected to /dev/null and stdout connected
 * to a pipe.
 *
 * On success the read end of that pipe is returned (the caller owns it) and
 * the child's PID is stored in *rpid. On failure the negative error produced
 * by log_error_errno() is returned and no descriptor is left open. */
static int spawn_getent(const char *database, const char *key, pid_t *rpid) {
        int pipe_fds[2];
        pid_t pid;

        assert(database);
        assert(key);
        assert(rpid);

        if (pipe2(pipe_fds, O_CLOEXEC) < 0)
                return log_error_errno(errno, "Failed to allocate pipe: %m");

        pid = fork();
        if (pid < 0) {
                int r;

                /* Log first so errno is still the one from fork(), then
                 * release both pipe ends so they are not leaked. */
                r = log_error_errno(errno, "Failed to fork getent child: %m");
                safe_close(pipe_fds[0]);
                safe_close(pipe_fds[1]);
                return r;
        }

        if (pid == 0) {
                /* child */
                int nullfd;
                char *empty_env = NULL;

                if (dup3(pipe_fds[1], STDOUT_FILENO, 0) < 0)
                        _exit(EXIT_FAILURE);

                /* Only close the originals if they are not one of the
                 * standard descriptors we are in the middle of setting up. */
                if (pipe_fds[0] > 2)
                        safe_close(pipe_fds[0]);
                if (pipe_fds[1] > 2)
                        safe_close(pipe_fds[1]);

                nullfd = open("/dev/null", O_RDWR);
                if (nullfd < 0)
                        _exit(EXIT_FAILURE);

                if (dup3(nullfd, STDIN_FILENO, 0) < 0)
                        _exit(EXIT_FAILURE);

                if (dup3(nullfd, STDERR_FILENO, 0) < 0)
                        _exit(EXIT_FAILURE);

                if (nullfd > 2)
                        safe_close(nullfd);

                reset_all_signal_handlers();
                close_all_fds(NULL, 0);

                /* The argument list of a variadic exec must end in a null
                 * *pointer*, hence the explicit cast. The second call is
                 * only reached if the first exec failed. */
                execle("/usr/bin/getent", "getent", database, key, (char*) NULL, &empty_env);
                execle("/bin/getent", "getent", database, key, (char*) NULL, &empty_env);
                _exit(EXIT_FAILURE);
        }

        /* parent: keep only the read end */
        pipe_fds[1] = safe_close(pipe_fds[1]);

        *rpid = pid;

        return pipe_fds[0];
}
3204
3205 static int change_uid_gid(char **_home) {
3206         char line[LINE_MAX], *x, *u, *g, *h;
3207         const char *word, *state;
3208         _cleanup_free_ uid_t *uids = NULL;
3209         _cleanup_free_ char *home = NULL;
3210         _cleanup_fclose_ FILE *f = NULL;
3211         _cleanup_close_ int fd = -1;
3212         unsigned n_uids = 0;
3213         size_t sz = 0, l;
3214         uid_t uid;
3215         gid_t gid;
3216         pid_t pid;
3217         int r;
3218
3219         assert(_home);
3220
3221         if (!arg_user || streq(arg_user, "root") || streq(arg_user, "0")) {
3222                 /* Reset everything fully to 0, just in case */
3223
3224                 if (setgroups(0, NULL) < 0)
3225                         return log_error_errno(errno, "setgroups() failed: %m");
3226
3227                 if (setresgid(0, 0, 0) < 0)
3228                         return log_error_errno(errno, "setregid() failed: %m");
3229
3230                 if (setresuid(0, 0, 0) < 0)
3231                         return log_error_errno(errno, "setreuid() failed: %m");
3232
3233                 *_home = NULL;
3234                 return 0;
3235         }
3236
3237         /* First, get user credentials */
3238         fd = spawn_getent("passwd", arg_user, &pid);
3239         if (fd < 0)
3240                 return fd;
3241
3242         f = fdopen(fd, "r");
3243         if (!f)
3244                 return log_oom();
3245         fd = -1;
3246
3247         if (!fgets(line, sizeof(line), f)) {
3248
3249                 if (!ferror(f)) {
3250                         log_error("Failed to resolve user %s.", arg_user);
3251                         return -ESRCH;
3252                 }
3253
3254                 log_error_errno(errno, "Failed to read from getent: %m");
3255                 return -errno;
3256         }
3257
3258         truncate_nl(line);
3259
3260         wait_for_terminate_and_warn("getent passwd", pid, true);
3261
3262         x = strchr(line, ':');
3263         if (!x) {
3264                 log_error("/etc/passwd entry has invalid user field.");
3265                 return -EIO;
3266         }
3267
3268         u = strchr(x+1, ':');
3269         if (!u) {
3270                 log_error("/etc/passwd entry has invalid password field.");
3271                 return -EIO;
3272         }
3273
3274         u++;
3275         g = strchr(u, ':');
3276         if (!g) {
3277                 log_error("/etc/passwd entry has invalid UID field.");
3278                 return -EIO;
3279         }
3280
3281         *g = 0;
3282         g++;
3283         x = strchr(g, ':');
3284         if (!x) {
3285                 log_error("/etc/passwd entry has invalid GID field.");
3286                 return -EIO;
3287         }
3288
3289         *x = 0;
3290         h = strchr(x+1, ':');
3291         if (!h) {
3292                 log_error("/etc/passwd entry has invalid GECOS field.");
3293                 return -EIO;
3294         }
3295
3296         h++;
3297         x = strchr(h, ':');
3298         if (!x) {
3299                 log_error("/etc/passwd entry has invalid home directory field.");
3300                 return -EIO;
3301         }
3302
3303         *x = 0;
3304
3305         r = parse_uid(u, &uid);
3306         if (r < 0) {
3307                 log_error("Failed to parse UID of user.");
3308                 return -EIO;
3309         }
3310
3311         r = parse_gid(g, &gid);
3312         if (r < 0) {
3313                 log_error("Failed to parse GID of user.");
3314                 return -EIO;
3315         }
3316
3317         home = strdup(h);
3318         if (!home)
3319                 return log_oom();
3320
3321         /* Second, get group memberships */
3322         fd = spawn_getent("initgroups", arg_user, &pid);
3323         if (fd < 0)
3324                 return fd;
3325
3326         fclose(f);
3327         f = fdopen(fd, "r");
3328         if (!f)
3329                 return log_oom();
3330         fd = -1;
3331
3332         if (!fgets(line, sizeof(line), f)) {
3333                 if (!ferror(f)) {
3334                         log_error("Failed to resolve user %s.", arg_user);
3335                         return -ESRCH;
3336                 }
3337
3338                 log_error_errno(errno, "Failed to read from getent: %m");
3339                 return -errno;
3340         }
3341
3342         truncate_nl(line);
3343
3344         wait_for_terminate_and_warn("getent initgroups", pid, true);
3345
3346         /* Skip over the username and subsequent separator whitespace */
3347         x = line;
3348         x += strcspn(x, WHITESPACE);
3349         x += strspn(x, WHITESPACE);
3350
3351         FOREACH_WORD(word, l, x, state) {
3352                 char c[l+1];
3353
3354                 memcpy(c, word, l);
3355                 c[l] = 0;
3356
3357                 if (!GREEDY_REALLOC(uids, sz, n_uids+1))
3358                         return log_oom();
3359
3360                 r = parse_uid(c, &uids[n_uids++]);
3361                 if (r < 0) {
3362                         log_error("Failed to parse group data from getent.");
3363                         return -EIO;
3364                 }
3365         }
3366
3367         r = mkdir_parents(home, 0775);
3368         if (r < 0)
3369                 return log_error_errno(r, "Failed to make home root directory: %m");
3370
3371         r = mkdir_safe(home, 0755, uid, gid);
3372         if (r < 0 && r != -EEXIST)
3373                 return log_error_errno(r, "Failed to make home directory: %m");
3374
3375         fchown(STDIN_FILENO, uid, gid);
3376         fchown(STDOUT_FILENO, uid, gid);
3377         fchown(STDERR_FILENO, uid, gid);
3378
3379         if (setgroups(n_uids, uids) < 0)
3380                 return log_error_errno(errno, "Failed to set auxiliary groups: %m");
3381
3382         if (setresgid(gid, gid, gid) < 0)
3383                 return log_error_errno(errno, "setregid() failed: %m");
3384
3385         if (setresuid(uid, uid, uid) < 0)
3386                 return log_error_errno(errno, "setreuid() failed: %m");
3387
3388         if (_home) {
3389                 *_home = home;
3390                 home = NULL;
3391         }
3392
3393         return 0;
3394 }
3395
3396 /*
3397  * Return values:
3398  * < 0 : wait_for_terminate() failed to get the state of the
3399  *       container, the container was terminated by a signal, or
3400  *       failed for an unknown reason.  No change is made to the
3401  *       container argument.
3402  * > 0 : The program executed in the container terminated with an
3403  *       error.  The exit code of the program executed in the
3404  *       container is returned.  The container argument has been set
3405  *       to CONTAINER_TERMINATED.
3406  *   0 : The container is being rebooted, has been shut down or exited
3407  *       successfully.  The container argument has been set to either
3408  *       CONTAINER_TERMINATED or CONTAINER_REBOOTED.
3409  *
3410  * That is, success is indicated by a return value of zero, and an
3411  * error is indicated by a non-zero value.
3412  */
3413 static int wait_for_container(pid_t pid, ContainerStatus *container) {
3414         siginfo_t status;
3415         int r;
3416
3417         r = wait_for_terminate(pid, &status);
3418         if (r < 0)
3419                 return log_warning_errno(r, "Failed to wait for container: %m");
3420
3421         switch (status.si_code) {
3422
3423         case CLD_EXITED:
3424                 if (status.si_status == 0) {
3425                         log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s exited successfully.", arg_machine);
3426
3427                 } else
3428                         log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s failed with error code %i.", arg_machine, status.si_status);
3429
3430                 *container = CONTAINER_TERMINATED;
3431                 return status.si_status;
3432
3433         case CLD_KILLED:
3434                 if (status.si_status == SIGINT) {
3435
3436                         log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s has been shut down.", arg_machine);
3437                         *container = CONTAINER_TERMINATED;
3438                         return 0;
3439
3440                 } else if (status.si_status == SIGHUP) {
3441
3442                         log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s is being rebooted.", arg_machine);
3443                         *container = CONTAINER_REBOOTED;
3444                         return 0;
3445                 }
3446
3447                 /* CLD_KILLED fallthrough */
3448
3449         case CLD_DUMPED:
3450                 log_error("Container %s terminated by signal %s.", arg_machine, signal_to_string(status.si_status));
3451                 return -EIO;
3452
3453         default:
3454                 log_error("Container %s failed due to unknown reason.", arg_machine);
3455                 return -EIO;
3456         }
3457
3458         return r;
3459 }
3460
/* Signal handler that deliberately does nothing; main() installs it for
 * SIGCHLD. */
static void nop_handler(int sig) {
}
3462
3463 static int on_orderly_shutdown(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) {
3464         pid_t pid;
3465
3466         pid = PTR_TO_UINT32(userdata);
3467         if (pid > 0) {
3468                 if (kill(pid, SIGRTMIN+3) >= 0) {
3469                         log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
3470                         sd_event_source_set_userdata(s, NULL);
3471                         return 0;
3472                 }
3473         }
3474
3475         sd_event_exit(sd_event_source_get_event(s), 0);
3476         return 0;
3477 }
3478
3479 static int determine_names(void) {
3480         int r;
3481
3482         if (!arg_image && !arg_directory) {
3483                 if (arg_machine) {
3484                         _cleanup_(image_unrefp) Image *i = NULL;
3485
3486                         r = image_find(arg_machine, &i);
3487                         if (r < 0)
3488                                 return log_error_errno(r, "Failed to find image for machine '%s': %m", arg_machine);
3489                         else if (r == 0) {
3490                                 log_error("No image for machine '%s': %m", arg_machine);
3491                                 return -ENOENT;
3492                         }
3493
3494                         if (i->type == IMAGE_RAW)
3495                                 r = set_sanitized_path(&arg_image, i->path);
3496                         else
3497                                 r = set_sanitized_path(&arg_directory, i->path);
3498                         if (r < 0)
3499                                 return log_error_errno(r, "Invalid image directory: %m");
3500
3501                         arg_read_only = arg_read_only || i->read_only;
3502                 } else
3503                         arg_directory = get_current_dir_name();
3504
3505                 if (!arg_directory && !arg_machine) {
3506                         log_error("Failed to determine path, please use -D or -i.");
3507                         return -EINVAL;
3508                 }
3509         }
3510
3511         if (!arg_machine) {
3512                 if (arg_directory && path_equal(arg_directory, "/"))
3513                         arg_machine = gethostname_malloc();
3514                 else
3515                         arg_machine = strdup(basename(arg_image ?: arg_directory));
3516
3517                 if (!arg_machine)
3518                         return log_oom();
3519
3520                 hostname_cleanup(arg_machine, false);
3521                 if (!machine_name_is_valid(arg_machine)) {
3522                         log_error("Failed to determine machine name automatically, please use -M.");
3523                         return -EINVAL;
3524                 }
3525
3526                 if (arg_ephemeral) {
3527                         char *b;
3528
3529                         /* Add a random suffix when this is an
3530                          * ephemeral machine, so that we can run many
3531                          * instances at once without manually having
3532                          * to specify -M each time. */
3533
3534                         if (asprintf(&b, "%s-%016" PRIx64, arg_machine, random_u64()) < 0)
3535                                 return log_oom();
3536
3537                         free(arg_machine);
3538                         arg_machine = b;
3539                 }
3540         }
3541
3542         return 0;
3543 }
3544
3545 int main(int argc, char *argv[]) {
3546
3547         _cleanup_free_ char *device_path = NULL, *root_device = NULL, *home_device = NULL, *srv_device = NULL, *console = NULL;
3548         bool root_device_rw = true, home_device_rw = true, srv_device_rw = true;
3549         _cleanup_close_ int master = -1, image_fd = -1;
3550         _cleanup_fdset_free_ FDSet *fds = NULL;
3551         int r, n_fd_passed, loop_nr = -1;
3552         char veth_name[IFNAMSIZ];
3553         bool secondary = false, remove_subvol = false;
3554         sigset_t mask, mask_chld;
3555         pid_t pid = 0;
3556         int ret = EXIT_SUCCESS;
3557         union in_addr_union exposed = {};
3558         _cleanup_release_lock_file_ LockFile tree_global_lock = LOCK_FILE_INIT, tree_local_lock = LOCK_FILE_INIT;
3559
3560         log_parse_environment();
3561         log_open();
3562
3563         r = parse_argv(argc, argv);
3564         if (r <= 0)
3565                 goto finish;
3566
3567         r = determine_names();
3568         if (r < 0)
3569                 goto finish;
3570
3571         if (geteuid() != 0) {
3572                 log_error("Need to be root.");
3573                 r = -EPERM;
3574                 goto finish;
3575         }
3576
3577         if (sd_booted() <= 0) {
3578                 log_error("Not running on a systemd system.");
3579                 r = -EINVAL;
3580                 goto finish;
3581         }
3582
3583         log_close();
3584         n_fd_passed = sd_listen_fds(false);
3585         if (n_fd_passed > 0) {
3586                 r = fdset_new_listen_fds(&fds, false);
3587                 if (r < 0) {
3588                         log_error_errno(r, "Failed to collect file descriptors: %m");
3589                         goto finish;
3590                 }
3591         }
3592         fdset_close_others(fds);
3593         log_open();
3594
3595         if (arg_directory) {
3596                 assert(!arg_image);
3597
3598                 if (path_equal(arg_directory, "/") && !arg_ephemeral) {
3599                         log_error("Spawning container on root directory is not supported. Consider using --ephemeral.");
3600                         r = -EINVAL;
3601                         goto finish;
3602                 }
3603
3604                 if (arg_ephemeral) {
3605                         _cleanup_release_lock_file_ LockFile original_lock = LOCK_FILE_INIT;
3606                         char *np;
3607
3608                         /* If the specified path is a mount point we
3609                          * generate the new snapshot immediately
3610                          * inside it under a random name. However if
3611                          * the specified is not a mount point we
3612                          * create the new snapshot in the parent
3613                          * directory, just next to it. */
3614                         r = path_is_mount_point(arg_directory, false);
3615                         if (r < 0) {
3616                                 log_error_errno(r, "Failed to determine whether directory %s is mount point: %m", arg_directory);
3617                                 goto finish;
3618                         }
3619                         if (r > 0)
3620                                 r = tempfn_random_child(arg_directory, &np);
3621                         else
3622                                 r = tempfn_random(arg_directory, &np);
3623                         if (r < 0) {
3624                                 log_error_errno(r, "Failed to generate name for snapshot: %m");
3625                                 goto finish;
3626                         }
3627
3628                         r = image_path_lock(np, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3629                         if (r < 0) {
3630                                 log_error_errno(r, "Failed to lock %s: %m", np);
3631                                 goto finish;
3632                         }
3633
3634                         r = btrfs_subvol_snapshot(arg_directory, np, arg_read_only, true);
3635                         if (r < 0) {
3636                                 free(np);
3637                                 log_error_errno(r, "Failed to create snapshot %s from %s: %m", np, arg_directory);
3638                                 goto finish;
3639                         }
3640
3641                         free(arg_directory);
3642                         arg_directory = np;
3643
3644                         remove_subvol = true;
3645
3646                 } else {
3647                         r = image_path_lock(arg_directory, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3648                         if (r == -EBUSY) {
3649                                 log_error_errno(r, "Directory tree %s is currently busy.", arg_directory);
3650                                 goto finish;
3651                         }
3652                         if (r < 0) {
3653                                 log_error_errno(r, "Failed to lock %s: %m", arg_directory);
3654                                 return r;
3655                         }
3656
3657                         if (arg_template) {
3658                                 r = btrfs_subvol_snapshot(arg_template, arg_directory, arg_read_only, true);
3659                                 if (r == -EEXIST) {
3660                                         if (!arg_quiet)
3661                                                 log_info("Directory %s already exists, not populating from template %s.", arg_directory, arg_template);
3662                                 } else if (r < 0) {
3663                                         log_error_errno(r, "Couldn't create snapshot %s from %s: %m", arg_directory, arg_template);
3664                                         goto finish;
3665                                 } else {
3666                                         if (!arg_quiet)
3667                                                 log_info("Populated %s from template %s.", arg_directory, arg_template);
3668                                 }
3669                         }
3670                 }
3671
3672                 if (arg_boot) {
3673                         if (path_is_os_tree(arg_directory) <= 0) {
3674                                 log_error("Directory %s doesn't look like an OS root directory (os-release file is missing). Refusing.", arg_directory);
3675                                 r = -EINVAL;
3676                                 goto finish;
3677                         }
3678                 } else {
3679                         const char *p;
3680
3681                         p = strappenda(arg_directory,
3682                                        argc > optind && path_is_absolute(argv[optind]) ? argv[optind] : "/usr/bin/");
3683                         if (access(p, F_OK) < 0) {
3684                                 log_error("Directory %s lacks the binary to execute or doesn't look like a binary tree. Refusing.", arg_directory);
3685                                 r = -EINVAL;
3686                                 goto finish;
3687                         }
3688                 }
3689
3690         } else {
3691                 char template[] = "/tmp/nspawn-root-XXXXXX";
3692
3693                 assert(arg_image);
3694                 assert(!arg_template);
3695
3696                 r = image_path_lock(arg_image, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3697                 if (r == -EBUSY) {
3698                         r = log_error_errno(r, "Disk image %s is currently busy.", arg_image);
3699                         goto finish;
3700                 }
3701                 if (r < 0) {
3702                         r = log_error_errno(r, "Failed to create image lock: %m");
3703                         goto finish;
3704                 }
3705
3706                 if (!mkdtemp(template)) {
3707                         log_error_errno(errno, "Failed to create temporary directory: %m");
3708                         r = -errno;
3709                         goto finish;
3710                 }
3711
3712                 arg_directory = strdup(template);
3713                 if (!arg_directory) {
3714                         r = log_oom();
3715                         goto finish;
3716                 }
3717
3718                 image_fd = setup_image(&device_path, &loop_nr);
3719                 if (image_fd < 0) {
3720                         r = image_fd;
3721                         goto finish;
3722                 }
3723
3724                 r = dissect_image(image_fd,
3725                                   &root_device, &root_device_rw,
3726                                   &home_device, &home_device_rw,
3727                                   &srv_device, &srv_device_rw,
3728                                   &secondary);
3729                 if (r < 0)
3730                         goto finish;
3731         }
3732
3733         master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
3734         if (master < 0) {
3735                 r = log_error_errno(errno, "Failed to acquire pseudo tty: %m");
3736                 goto finish;
3737         }
3738
3739         r = ptsname_malloc(master, &console);
3740         if (r < 0) {
3741                 r = log_error_errno(r, "Failed to determine tty name: %m");
3742                 goto finish;
3743         }
3744
3745         if (!arg_quiet)
3746                 log_info("Spawning container %s on %s.\nPress ^] three times within 1s to kill container.",
3747                          arg_machine, arg_image ?: arg_directory);
3748
3749         if (unlockpt(master) < 0) {
3750                 r = log_error_errno(errno, "Failed to unlock tty: %m");
3751                 goto finish;
3752         }
3753
3754         assert_se(sigemptyset(&mask) == 0);
3755         sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
3756         assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
3757
3758         assert_se(sigemptyset(&mask_chld) == 0);
3759         assert_se(sigaddset(&mask_chld, SIGCHLD) == 0);
3760
3761         for (;;) {
3762                 _cleanup_close_pair_ int kmsg_socket_pair[2] = { -1, -1 }, rtnl_socket_pair[2] = { -1, -1 };
3763                 ContainerStatus container_status;
3764                 _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
3765                 struct sigaction sa = {
3766                         .sa_handler = nop_handler,
3767                         .sa_flags = SA_NOCLDSTOP,
3768                 };
3769
3770                 r = barrier_create(&barrier);
3771                 if (r < 0) {
3772                         log_error_errno(r, "Cannot initialize IPC barrier: %m");
3773                         goto finish;
3774                 }
3775
3776                 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
3777                         r = log_error_errno(errno, "Failed to create kmsg socket pair: %m");
3778                         goto finish;
3779                 }
3780
3781                 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, rtnl_socket_pair) < 0) {
3782                         r = log_error_errno(errno, "Failed to create rtnl socket pair: %m");
3783                         goto finish;
3784                 }
3785
3786                 /* Child can be killed before execv(), so handle SIGCHLD
3787                  * in order to interrupt parent's blocking calls and
3788                  * give it a chance to call wait() and terminate. */
3789                 r = sigprocmask(SIG_UNBLOCK, &mask_chld, NULL);
3790                 if (r < 0) {
3791                         r = log_error_errno(errno, "Failed to change the signal mask: %m");
3792                         goto finish;
3793                 }
3794
3795                 r = sigaction(SIGCHLD, &sa, NULL);
3796                 if (r < 0) {
3797                         r = log_error_errno(errno, "Failed to install SIGCHLD handler: %m");
3798                         goto finish;
3799                 }
3800
3801                 pid = raw_clone(SIGCHLD|CLONE_NEWNS|
3802                                 (arg_share_system ? 0 : CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS)|
3803                                 (arg_private_network ? CLONE_NEWNET : 0), NULL);
3804                 if (pid < 0) {
3805                         if (errno == EINVAL)
3806                                 r = log_error_errno(errno, "clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
3807                         else
3808                                 r = log_error_errno(errno, "clone() failed: %m");
3809
3810                         goto finish;
3811                 }
3812
3813                 if (pid == 0) {
3814                         /* child */
3815                         _cleanup_free_ char *home = NULL;
3816                         unsigned n_env = 2;
3817                         const char *envp[] = {
3818                                 "PATH=" DEFAULT_PATH_SPLIT_USR,
3819                                 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
3820                                 NULL, /* TERM */
3821                                 NULL, /* HOME */
3822                                 NULL, /* USER */
3823                                 NULL, /* LOGNAME */
3824                                 NULL, /* container_uuid */
3825                                 NULL, /* LISTEN_FDS */
3826                                 NULL, /* LISTEN_PID */
3827                                 NULL
3828                         };
3829                         char **env_use;
3830
3831                         barrier_set_role(&barrier, BARRIER_CHILD);
3832
3833                         envp[n_env] = strv_find_prefix(environ, "TERM=");
3834                         if (envp[n_env])
3835                                 n_env ++;
3836
3837                         master = safe_close(master);
3838
3839                         close_nointr(STDIN_FILENO);
3840                         close_nointr(STDOUT_FILENO);
3841                         close_nointr(STDERR_FILENO);
3842
3843                         kmsg_socket_pair[0] = safe_close(kmsg_socket_pair[0]);
3844                         rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
3845
3846                         reset_all_signal_handlers();
3847                         reset_signal_mask();
3848
3849                         r = open_terminal(console, O_RDWR);
3850                         if (r != STDIN_FILENO) {
3851                                 if (r >= 0) {
3852                                         safe_close(r);
3853                                         r = -EINVAL;
3854                                 }
3855
3856                                 log_error_errno(r, "Failed to open console: %m");
3857                                 _exit(EXIT_FAILURE);
3858                         }
3859
3860                         if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
3861                             dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO) {
3862                                 log_error_errno(errno, "Failed to duplicate console: %m");
3863                                 _exit(EXIT_FAILURE);
3864                         }
3865
3866                         if (setsid() < 0) {
3867                                 log_error_errno(errno, "setsid() failed: %m");
3868                                 _exit(EXIT_FAILURE);
3869                         }
3870
3871                         if (reset_audit_loginuid() < 0)
3872                                 _exit(EXIT_FAILURE);
3873
3874                         if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
3875                                 log_error_errno(errno, "PR_SET_PDEATHSIG failed: %m");
3876                                 _exit(EXIT_FAILURE);
3877                         }
3878
3879                         /* Mark everything as slave, so that we still
3880                          * receive mounts from the real root, but don't
3881                          * propagate mounts to the real root. */
3882                         if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
3883                                 log_error_errno(errno, "MS_SLAVE|MS_REC failed: %m");
3884                                 _exit(EXIT_FAILURE);
3885                         }
3886
3887                         if (mount_devices(arg_directory,
3888                                           root_device, root_device_rw,
3889                                           home_device, home_device_rw,
3890                                           srv_device, srv_device_rw) < 0)
3891                                 _exit(EXIT_FAILURE);
3892
3893                         /* Turn directory into bind mount */
3894                         if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REC, NULL) < 0) {
3895                                 log_error_errno(errno, "Failed to make bind mount: %m");
3896                                 _exit(EXIT_FAILURE);
3897                         }
3898
3899                         r = setup_volatile(arg_directory);
3900                         if (r < 0)
3901                                 _exit(EXIT_FAILURE);
3902
3903                         if (setup_volatile_state(arg_directory) < 0)
3904                                 _exit(EXIT_FAILURE);
3905
3906                         r = base_filesystem_create(arg_directory);
3907                         if (r < 0)
3908                                 _exit(EXIT_FAILURE);
3909
3910                         if (arg_read_only) {
3911                                 r = bind_remount_recursive(arg_directory, true);
3912                                 if (r < 0) {
3913                                         log_error_errno(r, "Failed to make tree read-only: %m");
3914                                         _exit(EXIT_FAILURE);
3915                                 }
3916                         }
3917
3918                         if (mount_all(arg_directory) < 0)
3919                                 _exit(EXIT_FAILURE);
3920
3921                         if (copy_devnodes(arg_directory) < 0)
3922                                 _exit(EXIT_FAILURE);
3923
3924                         if (setup_ptmx(arg_directory) < 0)
3925                                 _exit(EXIT_FAILURE);
3926
3927                         dev_setup(arg_directory);
3928
3929                         if (setup_propagate(arg_directory) < 0)
3930                                 _exit(EXIT_FAILURE);
3931
3932                         if (setup_seccomp() < 0)
3933                                 _exit(EXIT_FAILURE);
3934
3935                         if (setup_dev_console(arg_directory, console) < 0)
3936                                 _exit(EXIT_FAILURE);
3937
3938                         if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
3939                                 _exit(EXIT_FAILURE);
3940                         kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
3941
3942                         if (send_rtnl(rtnl_socket_pair[1]) < 0)
3943                                 _exit(EXIT_FAILURE);
3944                         rtnl_socket_pair[1] = safe_close(rtnl_socket_pair[1]);
3945
3946                         /* Tell the parent that we are ready, and that
3947                          * it can cgroupify us so that we lack access
3948                          * to certain devices and resources. */
3949                         (void) barrier_place(&barrier);
3950
3951                         if (setup_boot_id(arg_directory) < 0)
3952                                 _exit(EXIT_FAILURE);
3953
3954                         if (setup_timezone(arg_directory) < 0)
3955                                 _exit(EXIT_FAILURE);
3956
3957                         if (setup_resolv_conf(arg_directory) < 0)
3958                                 _exit(EXIT_FAILURE);
3959
3960                         if (setup_journal(arg_directory) < 0)
3961                                 _exit(EXIT_FAILURE);
3962
3963                         if (mount_binds(arg_directory, arg_bind, false) < 0)
3964                                 _exit(EXIT_FAILURE);
3965
3966                         if (mount_binds(arg_directory, arg_bind_ro, true) < 0)
3967                                 _exit(EXIT_FAILURE);
3968
3969                         if (mount_tmpfs(arg_directory) < 0)
3970                                 _exit(EXIT_FAILURE);
3971
3972                         /* Wait until we are cgroup-ified, so that we
3973                          * can mount the right cgroup path writable */
3974                         (void) barrier_sync_next(&barrier);
3975
3976                         if (mount_cgroup(arg_directory) < 0)
3977                                 _exit(EXIT_FAILURE);
3978
3979                         if (chdir(arg_directory) < 0) {
3980                                 log_error_errno(errno, "chdir(%s) failed: %m", arg_directory);
3981                                 _exit(EXIT_FAILURE);
3982                         }
3983
3984                         if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) {
3985                                 log_error_errno(errno, "mount(MS_MOVE) failed: %m");
3986                                 _exit(EXIT_FAILURE);
3987                         }
3988
3989                         if (chroot(".") < 0) {
3990                                 log_error_errno(errno, "chroot() failed: %m");
3991                                 _exit(EXIT_FAILURE);
3992                         }
3993
3994                         if (chdir("/") < 0) {
3995                                 log_error_errno(errno, "chdir() failed: %m");
3996                                 _exit(EXIT_FAILURE);
3997                         }
3998
3999                         umask(0022);
4000
4001                         if (arg_private_network)
4002                                 loopback_setup();
4003
4004                         if (drop_capabilities() < 0) {
4005                                 log_error_errno(errno, "drop_capabilities() failed: %m");
4006                                 _exit(EXIT_FAILURE);
4007                         }
4008
4009                         r = change_uid_gid(&home);
4010                         if (r < 0)
4011                                 _exit(EXIT_FAILURE);
4012
4013                         if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
4014                             (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
4015                             (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
4016                                 log_oom();
4017                                 _exit(EXIT_FAILURE);
4018                         }
4019
4020                         if (!sd_id128_equal(arg_uuid, SD_ID128_NULL)) {
4021                                 char as_uuid[37];
4022
4023                                 if (asprintf((char**)(envp + n_env++), "container_uuid=%s", id128_format_as_uuid(arg_uuid, as_uuid)) < 0) {
4024                                         log_oom();
4025                                         _exit(EXIT_FAILURE);
4026                                 }
4027                         }
4028
4029                         if (fdset_size(fds) > 0) {
4030                                 r = fdset_cloexec(fds, false);
4031                                 if (r < 0) {
4032                                         log_error_errno(r, "Failed to unset O_CLOEXEC for file descriptors.");
4033                                         _exit(EXIT_FAILURE);
4034                                 }
4035
4036                                 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", n_fd_passed) < 0) ||
4037                                     (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0)) {
4038                                         log_oom();
4039                                         _exit(EXIT_FAILURE);
4040                                 }
4041                         }
4042
4043                         setup_hostname();
4044
4045                         if (arg_personality != 0xffffffffLU) {
4046                                 if (personality(arg_personality) < 0) {
4047                                         log_error_errno(errno, "personality() failed: %m");
4048                                         _exit(EXIT_FAILURE);
4049                                 }
4050                         } else if (secondary) {
4051                                 if (personality(PER_LINUX32) < 0) {
4052                                         log_error_errno(errno, "personality() failed: %m");
4053                                         _exit(EXIT_FAILURE);
4054                                 }
4055                         }
4056
4057 #ifdef HAVE_SELINUX
4058                         if (arg_selinux_context)
4059                                 if (setexeccon((security_context_t) arg_selinux_context) < 0) {
4060                                         log_error_errno(errno, "setexeccon(\"%s\") failed: %m", arg_selinux_context);
4061                                         _exit(EXIT_FAILURE);
4062                                 }
4063 #endif
4064
4065                         if (!strv_isempty(arg_setenv)) {
4066                                 char **n;
4067
4068                                 n = strv_env_merge(2, envp, arg_setenv);
4069                                 if (!n) {
4070                                         log_oom();
4071                                         _exit(EXIT_FAILURE);
4072                                 }
4073
4074                                 env_use = n;
4075                         } else
4076                                 env_use = (char**) envp;
4077
4078                         /* Wait until the parent is ready with the setup, too... */
4079                         if (!barrier_place_and_sync(&barrier))
4080                                 _exit(EXIT_FAILURE);
4081
4082                         if (arg_boot) {
4083                                 char **a;
4084                                 size_t l;
4085
4086                                 /* Automatically search for the init system */
4087
4088                                 l = 1 + argc - optind;
4089                                 a = newa(char*, l + 1);
4090                                 memcpy(a + 1, argv + optind, l * sizeof(char*));
4091
4092                                 a[0] = (char*) "/usr/lib/systemd/systemd";
4093                                 execve(a[0], a, env_use);
4094
4095                                 a[0] = (char*) "/lib/systemd/systemd";
4096                                 execve(a[0], a, env_use);
4097
4098                                 a[0] = (char*) "/sbin/init";
4099                                 execve(a[0], a, env_use);
4100                         } else if (argc > optind)
4101                                 execvpe(argv[optind], argv + optind, env_use);
4102                         else {
4103                                 chdir(home ? home : "/root");
4104                                 execle("/bin/bash", "-bash", NULL, env_use);
4105                                 execle("/bin/sh", "-sh", NULL, env_use);
4106                         }
4107
4108                         log_error_errno(errno, "execv() failed: %m");
4109                         _exit(EXIT_FAILURE);
4110                 }
4111
4112                 barrier_set_role(&barrier, BARRIER_PARENT);
4113                 fdset_free(fds);
4114                 fds = NULL;
4115
4116                 kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
4117                 rtnl_socket_pair[1] = safe_close(rtnl_socket_pair[1]);
4118
4119                 /* Wait for the most basic Child-setup to be done,
4120                  * before we add hardware to it, and place it in a
4121                  * cgroup. */
4122                 if (barrier_sync_next(&barrier)) {
4123                         int ifi = 0;
4124
4125                         r = move_network_interfaces(pid);
4126                         if (r < 0)
4127                                 goto finish;
4128
4129                         r = setup_veth(pid, veth_name, &ifi);
4130                         if (r < 0)
4131                                 goto finish;
4132
4133                         r = setup_bridge(veth_name, &ifi);
4134                         if (r < 0)
4135                                 goto finish;
4136
4137                         r = setup_macvlan(pid);
4138                         if (r < 0)
4139                                 goto finish;
4140
4141                         r = setup_ipvlan(pid);
4142                         if (r < 0)
4143                                 goto finish;
4144
4145                         r = register_machine(pid, ifi);
4146                         if (r < 0)
4147                                 goto finish;
4148
4149                         /* Block SIGCHLD here, before notifying child.
4150                          * process_pty() will handle it with the other signals. */
4151                         r = sigprocmask(SIG_BLOCK, &mask_chld, NULL);
4152                         if (r < 0)
4153                                 goto finish;
4154
4155                         /* Reset signal to default */
4156                         r = default_signals(SIGCHLD, -1);
4157                         if (r < 0)
4158                                 goto finish;
4159
4160                         /* Notify the child that the parent is ready with all
4161                          * its setup, and that the child can now hand over
4162                          * control to the code to run inside the container. */
4163                         (void) barrier_place(&barrier);
4164
4165                         /* And wait until the child is completely ready now. */
4166                         if (barrier_place_and_sync(&barrier)) {
4167                                 _cleanup_event_unref_ sd_event *event = NULL;
4168                                 _cleanup_(pty_forward_freep) PTYForward *forward = NULL;
4169                                 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
4170                                 char last_char = 0;
4171
4172                                 sd_notifyf(false,
4173                                            "READY=1\n"
4174                                            "STATUS=Container running.\n"
4175                                            "X_NSPAWN_LEADER_PID=" PID_FMT, pid);
4176
4177                                 r = sd_event_new(&event);
4178                                 if (r < 0) {
4179                                         log_error_errno(r, "Failed to get default event source: %m");
4180                                         goto finish;
4181                                 }
4182
4183                                 if (arg_boot) {
4184                                         /* Try to kill the init system on SIGINT or SIGTERM */
4185                                         sd_event_add_signal(event, NULL, SIGINT, on_orderly_shutdown, UINT32_TO_PTR(pid));
4186                                         sd_event_add_signal(event, NULL, SIGTERM, on_orderly_shutdown, UINT32_TO_PTR(pid));
4187                                 } else {
4188                                         /* Immediately exit */
4189                                         sd_event_add_signal(event, NULL, SIGINT, NULL, NULL);
4190                                         sd_event_add_signal(event, NULL, SIGTERM, NULL, NULL);
4191                                 }
4192
4193                                 /* simply exit on sigchld */
4194                                 sd_event_add_signal(event, NULL, SIGCHLD, NULL, NULL);
4195
4196                                 if (arg_expose_ports) {
4197                                         r = watch_rtnl(event, rtnl_socket_pair[0], &exposed, &rtnl);
4198                                         if (r < 0)
4199                                                 goto finish;
4200
4201                                         (void) expose_ports(rtnl, &exposed);
4202                                 }
4203
4204                                 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
4205
4206                                 r = pty_forward_new(event, master, true, &forward);
4207                                 if (r < 0) {
4208                                         log_error_errno(r, "Failed to create PTY forwarder: %m");
4209                                         goto finish;
4210                                 }
4211
4212                                 r = sd_event_loop(event);
4213                                 if (r < 0) {
4214                                         log_error_errno(r, "Failed to run event loop: %m");
4215                                         goto finish;
4216                                 }
4217
4218                                 pty_forward_get_last_char(forward, &last_char);
4219
4220                                 forward = pty_forward_free(forward);
4221
4222                                 if (!arg_quiet && last_char != '\n')
4223                                         putc('\n', stdout);
4224
4225                                 /* Kill if it is not dead yet anyway */
4226                                 terminate_machine(pid);
4227                         }
4228                 }
4229
4230                 /* Normally redundant, but better safe than sorry */
4231                 kill(pid, SIGKILL);
4232
4233                 r = wait_for_container(pid, &container_status);
4234                 pid = 0;
4235
4236                 if (r < 0)
4237                         /* We failed to wait for the container, or the
4238                          * container exited abnormally */
4239                         goto finish;
4240                 else if (r > 0 || container_status == CONTAINER_TERMINATED){
4241                         /* The container exited with a non-zero
4242                          * status, or with zero status and no reboot
4243                          * was requested. */
4244                         ret = r;
4245                         break;
4246                 }
4247
4248                 /* CONTAINER_REBOOTED, loop again */
4249
4250                 if (arg_keep_unit) {
4251                         /* Special handling if we are running as a
4252                          * service: instead of simply restarting the
4253                          * machine we want to restart the entire
4254                          * service, so let's inform systemd about this
4255                          * with the special exit code 133. The service
4256                          * file uses RestartForceExitStatus=133 so
4257                          * that this results in a full nspawn
4258                          * restart. This is necessary since we might
4259                          * have cgroup parameters set we want to have
4260                          * flushed out. */
4261                         ret = 133;
4262                         r = 0;
4263                         break;
4264                 }
4265
4266                 flush_ports(&exposed);
4267         }
4268
4269 finish:
4270         sd_notify(false,
4271                   "STOPPING=1\n"
4272                   "STATUS=Terminating...");
4273
4274         loop_remove(loop_nr, &image_fd);
4275
4276         if (pid > 0)
4277                 kill(pid, SIGKILL);
4278
4279         if (remove_subvol && arg_directory) {
4280                 int k;
4281
4282                 k = btrfs_subvol_remove(arg_directory);
4283                 if (k < 0)
4284                         log_warning_errno(k, "Cannot remove subvolume '%s', ignoring: %m", arg_directory);
4285         }
4286
4287         if (arg_machine) {
4288                 const char *p;
4289
4290                 p = strappenda("/run/systemd/nspawn/propagate/", arg_machine);
4291                 (void) rm_rf(p, false, true, false);
4292         }
4293
4294         free(arg_directory);
4295         free(arg_template);
4296         free(arg_image);
4297         free(arg_machine);
4298         free(arg_user);
4299         strv_free(arg_setenv);
4300         strv_free(arg_network_interfaces);
4301         strv_free(arg_network_macvlan);
4302         strv_free(arg_network_ipvlan);
4303         strv_free(arg_bind);
4304         strv_free(arg_bind_ro);
4305         strv_free(arg_tmpfs);
4306
4307         flush_ports(&exposed);
4308
4309         while (arg_expose_ports) {
4310                 ExposePort *p = arg_expose_ports;
4311                 LIST_REMOVE(ports, arg_expose_ports, p);
4312                 free(p);
4313         }
4314
4315         return r < 0 ? EXIT_FAILURE : ret;
4316 }