chiark / gitweb /
8a151f15ea16fc764ab5e5c28e396c2d1e34bbd6
[elogind.git] / src / nspawn / nspawn.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4   This file is part of systemd.
5
6   Copyright 2010 Lennart Poettering
7
8   systemd is free software; you can redistribute it and/or modify it
9   under the terms of the GNU Lesser General Public License as published by
10   the Free Software Foundation; either version 2.1 of the License, or
11   (at your option) any later version.
12
13   systemd is distributed in the hope that it will be useful, but
14   WITHOUT ANY WARRANTY; without even the implied warranty of
15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16   Lesser General Public License for more details.
17
18   You should have received a copy of the GNU Lesser General Public License
19   along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <signal.h>
23 #include <sched.h>
24 #include <unistd.h>
25 #include <sys/types.h>
26 #include <sys/syscall.h>
27 #include <sys/mount.h>
28 #include <sys/wait.h>
29 #include <stdlib.h>
30 #include <string.h>
31 #include <stdio.h>
32 #include <errno.h>
33 #include <sys/prctl.h>
34 #include <getopt.h>
35 #include <termios.h>
36 #include <sys/signalfd.h>
37 #include <grp.h>
38 #include <linux/fs.h>
39 #include <sys/un.h>
40 #include <sys/socket.h>
41 #include <linux/netlink.h>
42 #include <net/if.h>
43 #include <linux/veth.h>
44 #include <sys/personality.h>
45 #include <linux/loop.h>
46 #include <poll.h>
47 #include <sys/file.h>
48
49 #ifdef HAVE_SELINUX
50 #include <selinux/selinux.h>
51 #endif
52
53 #ifdef HAVE_SECCOMP
54 #include <seccomp.h>
55 #endif
56
57 #ifdef HAVE_BLKID
58 #include <blkid/blkid.h>
59 #endif
60
61 #include "sd-daemon.h"
62 #include "sd-bus.h"
63 #include "sd-id128.h"
64 #include "sd-rtnl.h"
65 #include "log.h"
66 #include "util.h"
67 #include "mkdir.h"
68 #include "macro.h"
69 #include "audit.h"
70 #include "missing.h"
71 #include "cgroup-util.h"
72 #include "strv.h"
73 #include "path-util.h"
74 #include "loopback-setup.h"
75 #include "dev-setup.h"
76 #include "fdset.h"
77 #include "build.h"
78 #include "fileio.h"
79 #include "bus-util.h"
80 #include "bus-error.h"
81 #include "ptyfwd.h"
82 #include "bus-kernel.h"
83 #include "env-util.h"
84 #include "def.h"
85 #include "rtnl-util.h"
86 #include "udev-util.h"
87 #include "blkid-util.h"
88 #include "gpt.h"
89 #include "siphash24.h"
90 #include "copy.h"
91 #include "base-filesystem.h"
92 #include "barrier.h"
93 #include "event-util.h"
94 #include "capability.h"
95 #include "cap-list.h"
96 #include "btrfs-util.h"
97 #include "machine-image.h"
98 #include "list.h"
99 #include "in-addr-util.h"
100 #include "fw-util.h"
101 #include "local-addresses.h"
102
103 #ifdef HAVE_SECCOMP
104 #include "seccomp-util.h"
105 #endif
106
107 typedef struct ExposePort {
108         int protocol;
109         uint16_t host_port;
110         uint16_t container_port;
111         LIST_FIELDS(struct ExposePort, ports);
112 } ExposePort;
113
/* How a container run came to an end.
 * NOTE(review): no user of this type is visible in this part of the file;
 * the per-value meanings below are taken from the names only -- confirm. */
typedef enum ContainerStatus {
        CONTAINER_TERMINATED = 0,       /* presumably: container exited for good */
        CONTAINER_REBOOTED   = 1        /* presumably: container asked to be restarted */
} ContainerStatus;
118
/* Journal linking mode selected with --link-journal= (or -j). The "try-"
 * variants of the switch map to LINK_GUEST/LINK_HOST as well and are told
 * apart by the separate arg_link_journal_try flag. */
typedef enum LinkJournal {
        LINK_NO    = 0,         /* --link-journal=no */
        LINK_AUTO  = 1,         /* --link-journal=auto, also the built-in default */
        LINK_HOST  = 2,         /* --link-journal=host / try-host */
        LINK_GUEST = 3          /* --link-journal=guest / try-guest / -j */
} LinkJournal;
125
/* Volatile mode selected with --volatile[=MODE]. */
typedef enum Volatile {
        VOLATILE_NO    = 0,     /* switch absent, or given a false boolean */
        VOLATILE_YES   = 1,     /* switch given without argument, or a true boolean */
        VOLATILE_STATE = 2      /* --volatile=state */
} Volatile;
131
132 static char *arg_directory = NULL;
133 static char *arg_template = NULL;
134 static char *arg_user = NULL;
135 static sd_id128_t arg_uuid = {};
136 static char *arg_machine = NULL;
137 static const char *arg_selinux_context = NULL;
138 static const char *arg_selinux_apifs_context = NULL;
139 static const char *arg_slice = NULL;
140 static bool arg_private_network = false;
141 static bool arg_read_only = false;
142 static bool arg_boot = false;
143 static bool arg_ephemeral = false;
144 static LinkJournal arg_link_journal = LINK_AUTO;
145 static bool arg_link_journal_try = false;
146 static uint64_t arg_retain =
147         (1ULL << CAP_CHOWN) |
148         (1ULL << CAP_DAC_OVERRIDE) |
149         (1ULL << CAP_DAC_READ_SEARCH) |
150         (1ULL << CAP_FOWNER) |
151         (1ULL << CAP_FSETID) |
152         (1ULL << CAP_IPC_OWNER) |
153         (1ULL << CAP_KILL) |
154         (1ULL << CAP_LEASE) |
155         (1ULL << CAP_LINUX_IMMUTABLE) |
156         (1ULL << CAP_NET_BIND_SERVICE) |
157         (1ULL << CAP_NET_BROADCAST) |
158         (1ULL << CAP_NET_RAW) |
159         (1ULL << CAP_SETGID) |
160         (1ULL << CAP_SETFCAP) |
161         (1ULL << CAP_SETPCAP) |
162         (1ULL << CAP_SETUID) |
163         (1ULL << CAP_SYS_ADMIN) |
164         (1ULL << CAP_SYS_CHROOT) |
165         (1ULL << CAP_SYS_NICE) |
166         (1ULL << CAP_SYS_PTRACE) |
167         (1ULL << CAP_SYS_TTY_CONFIG) |
168         (1ULL << CAP_SYS_RESOURCE) |
169         (1ULL << CAP_SYS_BOOT) |
170         (1ULL << CAP_AUDIT_WRITE) |
171         (1ULL << CAP_AUDIT_CONTROL) |
172         (1ULL << CAP_MKNOD);
173 static char **arg_bind = NULL;
174 static char **arg_bind_ro = NULL;
175 static char **arg_tmpfs = NULL;
176 static char **arg_setenv = NULL;
177 static bool arg_quiet = false;
178 static bool arg_share_system = false;
179 static bool arg_register = true;
180 static bool arg_keep_unit = false;
181 static char **arg_network_interfaces = NULL;
182 static char **arg_network_macvlan = NULL;
183 static char **arg_network_ipvlan = NULL;
184 static bool arg_network_veth = false;
185 static const char *arg_network_bridge = NULL;
186 static unsigned long arg_personality = 0xffffffffLU;
187 static char *arg_image = NULL;
188 static Volatile arg_volatile = VOLATILE_NO;
189 static ExposePort *arg_expose_ports = NULL;
190 static char **arg_property = NULL;
191 static uid_t arg_uid_shift = UID_INVALID, arg_uid_range = 0x10000U;
192 static bool arg_userns = false;
193
/* Writes the usage summary to stdout: a synopsis line built from the name the
 * program was invoked as, followed by the fixed option reference. */
static void help(void) {
        /* Only the synopsis needs formatting */
        printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n",
               program_invocation_short_name);

        /* Everything else is static text and goes out verbatim */
        fputs("Spawn a minimal namespace container for debugging, testing and building.\n\n"
              "  -h --help                 Show this help\n"
              "     --version              Print version string\n"
              "  -q --quiet                Do not show status information\n"
              "  -D --directory=PATH       Root directory for the container\n"
              "     --template=PATH        Initialize root directory from template directory,\n"
              "                            if missing\n"
              "  -x --ephemeral            Run container with snapshot of root directory, and\n"
              "                            remove it after exit\n"
              "  -i --image=PATH           File system device or disk image for the container\n"
              "  -b --boot                 Boot up full system (i.e. invoke init)\n"
              "  -u --user=USER            Run the command under specified user or uid\n"
              "  -M --machine=NAME         Set the machine name for the container\n"
              "     --uuid=UUID            Set a specific machine UUID for the container\n"
              "  -S --slice=SLICE          Place the container in the specified slice\n"
              "     --property=NAME=VALUE  Set scope unit property\n"
              "     --private-network      Disable network in container\n"
              "     --network-interface=INTERFACE\n"
              "                            Assign an existing network interface to the\n"
              "                            container\n"
              "     --network-macvlan=INTERFACE\n"
              "                            Create a macvlan network interface based on an\n"
              "                            existing network interface to the container\n"
              "     --network-ipvlan=INTERFACE\n"
              "                            Create a ipvlan network interface based on an\n"
              "                            existing network interface to the container\n"
              "  -n --network-veth         Add a virtual ethernet connection between host\n"
              "                            and container\n"
              "     --network-bridge=INTERFACE\n"
              "                            Add a virtual ethernet connection between host\n"
              "                            and container and add it to an existing bridge on\n"
              "                            the host\n"
              "     --private-users[=UIDBASE[:NUIDS]]\n"
              "                            Run within user namespace\n"
              "  -p --port=[PROTOCOL:]HOSTPORT[:CONTAINERPORT]\n"
              "                            Expose a container IP port on the host\n"
              "  -Z --selinux-context=SECLABEL\n"
              "                            Set the SELinux security context to be used by\n"
              "                            processes in the container\n"
              "  -L --selinux-apifs-context=SECLABEL\n"
              "                            Set the SELinux security context to be used by\n"
              "                            API/tmpfs file systems in the container\n"
              "     --capability=CAP       In addition to the default, retain specified\n"
              "                            capability\n"
              "     --drop-capability=CAP  Drop the specified capability from the default set\n"
              "     --link-journal=MODE    Link up guest journal, one of no, auto, guest, host,\n"
              "                            try-guest, try-host\n"
              "  -j                        Equivalent to --link-journal=try-guest\n"
              "     --read-only            Mount the root directory read-only\n"
              "     --bind=PATH[:PATH]     Bind mount a file or directory from the host into\n"
              "                            the container\n"
              "     --bind-ro=PATH[:PATH]  Similar, but creates a read-only bind mount\n"
              "     --tmpfs=PATH:[OPTIONS] Mount an empty tmpfs to the specified directory\n"
              "     --setenv=NAME=VALUE    Pass an environment variable to PID 1\n"
              "     --share-system         Share system namespaces with host\n"
              "     --register=BOOLEAN     Register container as machine\n"
              "     --keep-unit            Do not register a scope for the machine, reuse\n"
              "                            the service unit nspawn is running in\n"
              "     --volatile[=MODE]      Run the system in volatile mode\n",
              stdout);
}
257
/* Replaces the string stored in *b with a cleaned-up absolute form of 'path'.
 *
 * The path is first resolved with canonicalize_file_name(). If that fails
 * because the path does not exist (ENOENT), it is instead made absolute
 * relative to the current working directory, so that not-yet-existing
 * locations are still accepted. The result is passed through
 * path_kill_slashes() (presumably normalizing redundant slashes -- helper not
 * visible here) before being stored.
 *
 * Returns 0 on success; a negative errno value if resolution fails for any
 * reason other than ENOENT; -ENOMEM if the fallback cannot produce a path.
 * On failure *b is left untouched. On success the previous *b is freed. */
static int set_sanitized_path(char **b, const char *path) {
        char *resolved;

        assert(b);
        assert(path);

        resolved = canonicalize_file_name(path);

        /* Any failure other than "does not exist" is reported as is */
        if (!resolved && errno != ENOENT)
                return -errno;

        if (!resolved) {
                /* Nothing there to resolve yet, settle for an absolute path */
                resolved = path_make_absolute_cwd(path);
                if (!resolved)
                        return -ENOMEM;
        }

        free(*b);
        *b = path_kill_slashes(resolved);

        return 0;
}
278
279 static int parse_argv(int argc, char *argv[]) {
280
281         enum {
282                 ARG_VERSION = 0x100,
283                 ARG_PRIVATE_NETWORK,
284                 ARG_UUID,
285                 ARG_READ_ONLY,
286                 ARG_CAPABILITY,
287                 ARG_DROP_CAPABILITY,
288                 ARG_LINK_JOURNAL,
289                 ARG_BIND,
290                 ARG_BIND_RO,
291                 ARG_TMPFS,
292                 ARG_SETENV,
293                 ARG_SHARE_SYSTEM,
294                 ARG_REGISTER,
295                 ARG_KEEP_UNIT,
296                 ARG_NETWORK_INTERFACE,
297                 ARG_NETWORK_MACVLAN,
298                 ARG_NETWORK_IPVLAN,
299                 ARG_NETWORK_BRIDGE,
300                 ARG_PERSONALITY,
301                 ARG_VOLATILE,
302                 ARG_TEMPLATE,
303                 ARG_PROPERTY,
304                 ARG_PRIVATE_USERS,
305         };
306
307         static const struct option options[] = {
308                 { "help",                  no_argument,       NULL, 'h'                   },
309                 { "version",               no_argument,       NULL, ARG_VERSION           },
310                 { "directory",             required_argument, NULL, 'D'                   },
311                 { "template",              required_argument, NULL, ARG_TEMPLATE          },
312                 { "ephemeral",             no_argument,       NULL, 'x'                   },
313                 { "user",                  required_argument, NULL, 'u'                   },
314                 { "private-network",       no_argument,       NULL, ARG_PRIVATE_NETWORK   },
315                 { "boot",                  no_argument,       NULL, 'b'                   },
316                 { "uuid",                  required_argument, NULL, ARG_UUID              },
317                 { "read-only",             no_argument,       NULL, ARG_READ_ONLY         },
318                 { "capability",            required_argument, NULL, ARG_CAPABILITY        },
319                 { "drop-capability",       required_argument, NULL, ARG_DROP_CAPABILITY   },
320                 { "link-journal",          required_argument, NULL, ARG_LINK_JOURNAL      },
321                 { "bind",                  required_argument, NULL, ARG_BIND              },
322                 { "bind-ro",               required_argument, NULL, ARG_BIND_RO           },
323                 { "tmpfs",                 required_argument, NULL, ARG_TMPFS             },
324                 { "machine",               required_argument, NULL, 'M'                   },
325                 { "slice",                 required_argument, NULL, 'S'                   },
326                 { "setenv",                required_argument, NULL, ARG_SETENV            },
327                 { "selinux-context",       required_argument, NULL, 'Z'                   },
328                 { "selinux-apifs-context", required_argument, NULL, 'L'                   },
329                 { "quiet",                 no_argument,       NULL, 'q'                   },
330                 { "share-system",          no_argument,       NULL, ARG_SHARE_SYSTEM      },
331                 { "register",              required_argument, NULL, ARG_REGISTER          },
332                 { "keep-unit",             no_argument,       NULL, ARG_KEEP_UNIT         },
333                 { "network-interface",     required_argument, NULL, ARG_NETWORK_INTERFACE },
334                 { "network-macvlan",       required_argument, NULL, ARG_NETWORK_MACVLAN   },
335                 { "network-ipvlan",        required_argument, NULL, ARG_NETWORK_IPVLAN    },
336                 { "network-veth",          no_argument,       NULL, 'n'                   },
337                 { "network-bridge",        required_argument, NULL, ARG_NETWORK_BRIDGE    },
338                 { "personality",           required_argument, NULL, ARG_PERSONALITY       },
339                 { "image",                 required_argument, NULL, 'i'                   },
340                 { "volatile",              optional_argument, NULL, ARG_VOLATILE          },
341                 { "port",                  required_argument, NULL, 'p'                   },
342                 { "property",              required_argument, NULL, ARG_PROPERTY          },
343                 { "private-users",         optional_argument, NULL, ARG_PRIVATE_USERS     },
344                 {}
345         };
346
347         int c, r;
348         uint64_t plus = 0, minus = 0;
349
350         assert(argc >= 0);
351         assert(argv);
352
353         while ((c = getopt_long(argc, argv, "+hD:u:bL:M:jS:Z:qi:xp:n", options, NULL)) >= 0)
354
355                 switch (c) {
356
357                 case 'h':
358                         help();
359                         return 0;
360
361                 case ARG_VERSION:
362                         puts(PACKAGE_STRING);
363                         puts(SYSTEMD_FEATURES);
364                         return 0;
365
366                 case 'D':
367                         r = set_sanitized_path(&arg_directory, optarg);
368                         if (r < 0)
369                                 return log_error_errno(r, "Invalid root directory: %m");
370
371                         break;
372
373                 case ARG_TEMPLATE:
374                         r = set_sanitized_path(&arg_template, optarg);
375                         if (r < 0)
376                                 return log_error_errno(r, "Invalid template directory: %m");
377
378                         break;
379
380                 case 'i':
381                         r = set_sanitized_path(&arg_image, optarg);
382                         if (r < 0)
383                                 return log_error_errno(r, "Invalid image path: %m");
384
385                         break;
386
387                 case 'x':
388                         arg_ephemeral = true;
389                         break;
390
391                 case 'u':
392                         free(arg_user);
393                         arg_user = strdup(optarg);
394                         if (!arg_user)
395                                 return log_oom();
396
397                         break;
398
399                 case ARG_NETWORK_BRIDGE:
400                         arg_network_bridge = optarg;
401
402                         /* fall through */
403
404                 case 'n':
405                         arg_network_veth = true;
406                         arg_private_network = true;
407                         break;
408
409                 case ARG_NETWORK_INTERFACE:
410                         if (strv_extend(&arg_network_interfaces, optarg) < 0)
411                                 return log_oom();
412
413                         arg_private_network = true;
414                         break;
415
416                 case ARG_NETWORK_MACVLAN:
417                         if (strv_extend(&arg_network_macvlan, optarg) < 0)
418                                 return log_oom();
419
420                         arg_private_network = true;
421                         break;
422
423                 case ARG_NETWORK_IPVLAN:
424                         if (strv_extend(&arg_network_ipvlan, optarg) < 0)
425                                 return log_oom();
426
427                         /* fall through */
428
429                 case ARG_PRIVATE_NETWORK:
430                         arg_private_network = true;
431                         break;
432
433                 case 'b':
434                         arg_boot = true;
435                         break;
436
437                 case ARG_UUID:
438                         r = sd_id128_from_string(optarg, &arg_uuid);
439                         if (r < 0) {
440                                 log_error("Invalid UUID: %s", optarg);
441                                 return r;
442                         }
443                         break;
444
445                 case 'S':
446                         arg_slice = optarg;
447                         break;
448
449                 case 'M':
450                         if (isempty(optarg)) {
451                                 free(arg_machine);
452                                 arg_machine = NULL;
453                         } else {
454                                 if (!machine_name_is_valid(optarg)) {
455                                         log_error("Invalid machine name: %s", optarg);
456                                         return -EINVAL;
457                                 }
458
459                                 r = free_and_strdup(&arg_machine, optarg);
460                                 if (r < 0)
461                                         return log_oom();
462
463                                 break;
464                         }
465
466                 case 'Z':
467                         arg_selinux_context = optarg;
468                         break;
469
470                 case 'L':
471                         arg_selinux_apifs_context = optarg;
472                         break;
473
474                 case ARG_READ_ONLY:
475                         arg_read_only = true;
476                         break;
477
478                 case ARG_CAPABILITY:
479                 case ARG_DROP_CAPABILITY: {
480                         const char *state, *word;
481                         size_t length;
482
483                         FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
484                                 _cleanup_free_ char *t;
485
486                                 t = strndup(word, length);
487                                 if (!t)
488                                         return log_oom();
489
490                                 if (streq(t, "all")) {
491                                         if (c == ARG_CAPABILITY)
492                                                 plus = (uint64_t) -1;
493                                         else
494                                                 minus = (uint64_t) -1;
495                                 } else {
496                                         int cap;
497
498                                         cap = capability_from_name(t);
499                                         if (cap < 0) {
500                                                 log_error("Failed to parse capability %s.", t);
501                                                 return -EINVAL;
502                                         }
503
504                                         if (c == ARG_CAPABILITY)
505                                                 plus |= 1ULL << (uint64_t) cap;
506                                         else
507                                                 minus |= 1ULL << (uint64_t) cap;
508                                 }
509                         }
510
511                         break;
512                 }
513
514                 case 'j':
515                         arg_link_journal = LINK_GUEST;
516                         arg_link_journal_try = true;
517                         break;
518
519                 case ARG_LINK_JOURNAL:
520                         if (streq(optarg, "auto")) {
521                                 arg_link_journal = LINK_AUTO;
522                                 arg_link_journal_try = false;
523                         } else if (streq(optarg, "no")) {
524                                 arg_link_journal = LINK_NO;
525                                 arg_link_journal_try = false;
526                         } else if (streq(optarg, "guest")) {
527                                 arg_link_journal = LINK_GUEST;
528                                 arg_link_journal_try = false;
529                         } else if (streq(optarg, "host")) {
530                                 arg_link_journal = LINK_HOST;
531                                 arg_link_journal_try = false;
532                         } else if (streq(optarg, "try-guest")) {
533                                 arg_link_journal = LINK_GUEST;
534                                 arg_link_journal_try = true;
535                         } else if (streq(optarg, "try-host")) {
536                                 arg_link_journal = LINK_HOST;
537                                 arg_link_journal_try = true;
538                         } else {
539                                 log_error("Failed to parse link journal mode %s", optarg);
540                                 return -EINVAL;
541                         }
542
543                         break;
544
545                 case ARG_BIND:
546                 case ARG_BIND_RO: {
547                         _cleanup_free_ char *a = NULL, *b = NULL;
548                         char *e;
549                         char ***x;
550
551                         x = c == ARG_BIND ? &arg_bind : &arg_bind_ro;
552
553                         e = strchr(optarg, ':');
554                         if (e) {
555                                 a = strndup(optarg, e - optarg);
556                                 b = strdup(e + 1);
557                         } else {
558                                 a = strdup(optarg);
559                                 b = strdup(optarg);
560                         }
561
562                         if (!a || !b)
563                                 return log_oom();
564
565                         if (!path_is_absolute(a) || !path_is_absolute(b)) {
566                                 log_error("Invalid bind mount specification: %s", optarg);
567                                 return -EINVAL;
568                         }
569
570                         r = strv_extend(x, a);
571                         if (r < 0)
572                                 return log_oom();
573
574                         r = strv_extend(x, b);
575                         if (r < 0)
576                                 return log_oom();
577
578                         break;
579                 }
580
581                 case ARG_TMPFS: {
582                         _cleanup_free_ char *a = NULL, *b = NULL;
583                         char *e;
584
585                         e = strchr(optarg, ':');
586                         if (e) {
587                                 a = strndup(optarg, e - optarg);
588                                 b = strdup(e + 1);
589                         } else {
590                                 a = strdup(optarg);
591                                 b = strdup("mode=0755");
592                         }
593
594                         if (!a || !b)
595                                 return log_oom();
596
597                         if (!path_is_absolute(a)) {
598                                 log_error("Invalid tmpfs specification: %s", optarg);
599                                 return -EINVAL;
600                         }
601
602                         r = strv_push(&arg_tmpfs, a);
603                         if (r < 0)
604                                 return log_oom();
605
606                         a = NULL;
607
608                         r = strv_push(&arg_tmpfs, b);
609                         if (r < 0)
610                                 return log_oom();
611
612                         b = NULL;
613
614                         break;
615                 }
616
617                 case ARG_SETENV: {
618                         char **n;
619
620                         if (!env_assignment_is_valid(optarg)) {
621                                 log_error("Environment variable assignment '%s' is not valid.", optarg);
622                                 return -EINVAL;
623                         }
624
625                         n = strv_env_set(arg_setenv, optarg);
626                         if (!n)
627                                 return log_oom();
628
629                         strv_free(arg_setenv);
630                         arg_setenv = n;
631                         break;
632                 }
633
634                 case 'q':
635                         arg_quiet = true;
636                         break;
637
638                 case ARG_SHARE_SYSTEM:
639                         arg_share_system = true;
640                         break;
641
642                 case ARG_REGISTER:
643                         r = parse_boolean(optarg);
644                         if (r < 0) {
645                                 log_error("Failed to parse --register= argument: %s", optarg);
646                                 return r;
647                         }
648
649                         arg_register = r;
650                         break;
651
652                 case ARG_KEEP_UNIT:
653                         arg_keep_unit = true;
654                         break;
655
656                 case ARG_PERSONALITY:
657
658                         arg_personality = personality_from_string(optarg);
659                         if (arg_personality == 0xffffffffLU) {
660                                 log_error("Unknown or unsupported personality '%s'.", optarg);
661                                 return -EINVAL;
662                         }
663
664                         break;
665
666                 case ARG_VOLATILE:
667
668                         if (!optarg)
669                                 arg_volatile = VOLATILE_YES;
670                         else {
671                                 r = parse_boolean(optarg);
672                                 if (r < 0) {
673                                         if (streq(optarg, "state"))
674                                                 arg_volatile = VOLATILE_STATE;
675                                         else {
676                                                 log_error("Failed to parse --volatile= argument: %s", optarg);
677                                                 return r;
678                                         }
679                                 } else
680                                         arg_volatile = r ? VOLATILE_YES : VOLATILE_NO;
681                         }
682
683                         break;
684
685                 case 'p': {
686                         const char *split, *e;
687                         uint16_t container_port, host_port;
688                         int protocol;
689                         ExposePort *p;
690
691                         if ((e = startswith(optarg, "tcp:")))
692                                 protocol = IPPROTO_TCP;
693                         else if ((e = startswith(optarg, "udp:")))
694                                 protocol = IPPROTO_UDP;
695                         else {
696                                 e = optarg;
697                                 protocol = IPPROTO_TCP;
698                         }
699
700                         split = strchr(e, ':');
701                         if (split) {
702                                 char v[split - e + 1];
703
704                                 memcpy(v, e, split - e);
705                                 v[split - e] = 0;
706
707                                 r = safe_atou16(v, &host_port);
708                                 if (r < 0 || host_port <= 0) {
709                                         log_error("Failed to parse host port: %s", optarg);
710                                         return -EINVAL;
711                                 }
712
713                                 r = safe_atou16(split + 1, &container_port);
714                         } else {
715                                 r = safe_atou16(e, &container_port);
716                                 host_port = container_port;
717                         }
718
719                         if (r < 0 || container_port <= 0) {
720                                 log_error("Failed to parse host port: %s", optarg);
721                                 return -EINVAL;
722                         }
723
724                         LIST_FOREACH(ports, p, arg_expose_ports) {
725                                 if (p->protocol == protocol && p->host_port == host_port) {
726                                         log_error("Duplicate port specification: %s", optarg);
727                                         return -EINVAL;
728                                 }
729                         }
730
731                         p = new(ExposePort, 1);
732                         if (!p)
733                                 return log_oom();
734
735                         p->protocol = protocol;
736                         p->host_port = host_port;
737                         p->container_port = container_port;
738
739                         LIST_PREPEND(ports, arg_expose_ports, p);
740
741                         break;
742                 }
743
744                 case ARG_PROPERTY:
745                         if (strv_extend(&arg_property, optarg) < 0)
746                                 return log_oom();
747
748                         break;
749
750                 case ARG_PRIVATE_USERS:
751                         if (optarg) {
752                                 _cleanup_free_ char *buffer = NULL;
753                                 const char *range, *shift;
754
755                                 range = strchr(optarg, ':');
756                                 if (range) {
757                                         buffer = strndup(optarg, range - optarg);
758                                         if (!buffer)
759                                                 return log_oom();
760                                         shift = buffer;
761
762                                         range++;
763                                         if (safe_atou32(range, &arg_uid_range) < 0 || arg_uid_range <= 0) {
764                                                 log_error("Failed to parse UID range: %s", range);
765                                                 return -EINVAL;
766                                         }
767                                 } else
768                                         shift = optarg;
769
770                                 if (parse_uid(shift, &arg_uid_shift) < 0) {
771                                         log_error("Failed to parse UID: %s", optarg);
772                                         return -EINVAL;
773                                 }
774                         }
775
776                         arg_userns = true;
777                         break;
778
779                 case '?':
780                         return -EINVAL;
781
782                 default:
783                         assert_not_reached("Unhandled option");
784                 }
785
786         if (arg_share_system)
787                 arg_register = false;
788
789         if (arg_boot && arg_share_system) {
790                 log_error("--boot and --share-system may not be combined.");
791                 return -EINVAL;
792         }
793
794         if (arg_keep_unit && cg_pid_get_owner_uid(0, NULL) >= 0) {
795                 log_error("--keep-unit may not be used when invoked from a user session.");
796                 return -EINVAL;
797         }
798
799         if (arg_directory && arg_image) {
800                 log_error("--directory= and --image= may not be combined.");
801                 return -EINVAL;
802         }
803
804         if (arg_template && arg_image) {
805                 log_error("--template= and --image= may not be combined.");
806                 return -EINVAL;
807         }
808
809         if (arg_template && !(arg_directory || arg_machine)) {
810                 log_error("--template= needs --directory= or --machine=.");
811                 return -EINVAL;
812         }
813
814         if (arg_ephemeral && arg_template) {
815                 log_error("--ephemeral and --template= may not be combined.");
816                 return -EINVAL;
817         }
818
819         if (arg_ephemeral && arg_image) {
820                 log_error("--ephemeral and --image= may not be combined.");
821                 return -EINVAL;
822         }
823
824         if (arg_ephemeral && !IN_SET(arg_link_journal, LINK_NO, LINK_AUTO)) {
825                 log_error("--ephemeral and --link-journal= may not be combined.");
826                 return -EINVAL;
827         }
828
829         if (arg_volatile != VOLATILE_NO && arg_read_only) {
830                 log_error("Cannot combine --read-only with --volatile. Note that --volatile already implies a read-only base hierarchy.");
831                 return -EINVAL;
832         }
833
834         if (arg_expose_ports && !arg_private_network) {
835                 log_error("Cannot use --port= without private networking.");
836                 return -EINVAL;
837         }
838
839         arg_retain = (arg_retain | plus | (arg_private_network ? 1ULL << CAP_NET_ADMIN : 0)) & ~minus;
840
841         return 1;
842 }
843
844 static int mount_all(const char *dest) {
845
846         typedef struct MountPoint {
847                 const char *what;
848                 const char *where;
849                 const char *type;
850                 const char *options;
851                 unsigned long flags;
852                 bool fatal;
853         } MountPoint;
854
855         static const MountPoint mount_table[] = {
856                 { "proc",      "/proc",     "proc",  NULL,        MS_NOSUID|MS_NOEXEC|MS_NODEV,           true  },
857                 { "/proc/sys", "/proc/sys", NULL,    NULL,        MS_BIND,                                true  },   /* Bind mount first */
858                 { NULL,        "/proc/sys", NULL,    NULL,        MS_BIND|MS_RDONLY|MS_REMOUNT,           true  },   /* Then, make it r/o */
859                 { "sysfs",     "/sys",      "sysfs", NULL,        MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true  },
860                 { "tmpfs",     "/dev",      "tmpfs", "mode=755",  MS_NOSUID|MS_STRICTATIME,               true  },
861                 { "devpts",    "/dev/pts",  "devpts","newinstance,ptmxmode=0666,mode=620,gid=" STRINGIFY(TTY_GID), MS_NOSUID|MS_NOEXEC, true },
862                 { "tmpfs",     "/dev/shm",  "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME,      true  },
863                 { "tmpfs",     "/run",      "tmpfs", "mode=755",  MS_NOSUID|MS_NODEV|MS_STRICTATIME,      true  },
864                 { "tmpfs",     "/tmp",      "tmpfs", "mode=1777", MS_STRICTATIME,                         true  },
865 #ifdef HAVE_SELINUX
866                 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND,                              false },  /* Bind mount first */
867                 { NULL,              "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT,         false },  /* Then, make it r/o */
868 #endif
869         };
870
871         unsigned k;
872         int r = 0;
873
874         for (k = 0; k < ELEMENTSOF(mount_table); k++) {
875                 _cleanup_free_ char *where = NULL;
876 #ifdef HAVE_SELINUX
877                 _cleanup_free_ char *options = NULL;
878 #endif
879                 const char *o;
880                 int t;
881
882                 where = strjoin(dest, "/", mount_table[k].where, NULL);
883                 if (!where)
884                         return log_oom();
885
886                 t = path_is_mount_point(where, true);
887                 if (t < 0) {
888                         log_error_errno(t, "Failed to detect whether %s is a mount point: %m", where);
889
890                         if (r == 0)
891                                 r = t;
892
893                         continue;
894                 }
895
896                 /* Skip this entry if it is not a remount. */
897                 if (mount_table[k].what && t > 0)
898                         continue;
899
900                 t = mkdir_p(where, 0755);
901                 if (t < 0) {
902                         if (mount_table[k].fatal) {
903                                log_error_errno(t, "Failed to create directory %s: %m", where);
904
905                                 if (r == 0)
906                                         r = t;
907                         } else
908                                log_warning_errno(t, "Failed to create directory %s: %m", where);
909
910                         continue;
911                 }
912
913 #ifdef HAVE_SELINUX
914                 if (arg_selinux_apifs_context &&
915                     (streq_ptr(mount_table[k].what, "tmpfs") || streq_ptr(mount_table[k].what, "devpts"))) {
916                         options = strjoin(mount_table[k].options, ",context=\"", arg_selinux_apifs_context, "\"", NULL);
917                         if (!options)
918                                 return log_oom();
919
920                         o = options;
921                 } else
922 #endif
923                         o = mount_table[k].options;
924
925                 if (arg_userns && arg_uid_shift != UID_INVALID && streq_ptr(mount_table[k].type, "tmpfs")) {
926                         char *uid_options = NULL;
927
928                         if (o)
929                                 asprintf(&uid_options, "%s,uid=" UID_FMT ",gid=" UID_FMT, o, arg_uid_shift, arg_uid_shift);
930                         else
931                                 asprintf(&uid_options, "uid=" UID_FMT ",gid=" UID_FMT, arg_uid_shift, arg_uid_shift);
932                         if (!uid_options)
933                                 return log_oom();
934
935                         free(options);
936                         o = options = uid_options;
937                 }
938
939                 if (mount(mount_table[k].what,
940                           where,
941                           mount_table[k].type,
942                           mount_table[k].flags,
943                           o) < 0) {
944
945                         if (mount_table[k].fatal) {
946                                 log_error_errno(errno, "mount(%s) failed: %m", where);
947
948                                 if (r == 0)
949                                         r = -errno;
950                         } else
951                                 log_warning_errno(errno, "mount(%s) failed: %m", where);
952                 }
953         }
954
955         return r;
956 }
957
958 static int mount_binds(const char *dest, char **l, bool ro) {
959         char **x, **y;
960
961         STRV_FOREACH_PAIR(x, y, l) {
962                 _cleanup_free_ char *where = NULL;
963                 struct stat source_st, dest_st;
964                 int r;
965
966                 if (stat(*x, &source_st) < 0)
967                         return log_error_errno(errno, "Failed to stat %s: %m", *x);
968
969                 where = strappend(dest, *y);
970                 if (!where)
971                         return log_oom();
972
973                 r = stat(where, &dest_st);
974                 if (r == 0) {
975                         if (S_ISDIR(source_st.st_mode) && !S_ISDIR(dest_st.st_mode)) {
976                                 log_error("Cannot bind mount directory %s on file %s.", *x, where);
977                                 return -EINVAL;
978                         }
979                         if (!S_ISDIR(source_st.st_mode) && S_ISDIR(dest_st.st_mode)) {
980                                 log_error("Cannot bind mount file %s on directory %s.", *x, where);
981                                 return -EINVAL;
982                         }
983                 } else if (errno == ENOENT) {
984                         r = mkdir_parents_label(where, 0755);
985                         if (r < 0)
986                                 return log_error_errno(r, "Failed to bind mount %s: %m", *x);
987                 } else {
988                         log_error_errno(errno, "Failed to bind mount %s: %m", *x);
989                         return -errno;
990                 }
991
992                 /* Create the mount point. Any non-directory file can be
993                  * mounted on any non-directory file (regular, fifo, socket,
994                  * char, block).
995                  */
996                 if (S_ISDIR(source_st.st_mode)) {
997                         r = mkdir_label(where, 0755);
998                         if (r < 0 && errno != EEXIST)
999                                 return log_error_errno(r, "Failed to create mount point %s: %m", where);
1000                 } else {
1001                         r = touch(where);
1002                         if (r < 0)
1003                                 return log_error_errno(r, "Failed to create mount point %s: %m", where);
1004                 }
1005
1006                 if (mount(*x, where, "bind", MS_BIND, NULL) < 0)
1007                         return log_error_errno(errno, "mount(%s) failed: %m", where);
1008
1009                 if (ro) {
1010                         r = bind_remount_recursive(where, true);
1011                         if (r < 0)
1012                                 return log_error_errno(r, "Read-Only bind mount failed: %m");
1013                 }
1014         }
1015
1016         return 0;
1017 }
1018
1019 static int mount_cgroup_hierarchy(const char *dest, const char *controller, const char *hierarchy, bool read_only) {
1020         char *to;
1021         int r;
1022
1023         to = strjoina(dest, "/sys/fs/cgroup/", hierarchy);
1024
1025         r = path_is_mount_point(to, false);
1026         if (r < 0)
1027                 return log_error_errno(r, "Failed to determine if %s is mounted already: %m", to);
1028         if (r > 0)
1029                 return 0;
1030
1031         mkdir_p(to, 0755);
1032
1033         /* The superblock mount options of the mount point need to be
1034          * identical to the hosts', and hence writable... */
1035         if (mount("cgroup", to, "cgroup", MS_NOSUID|MS_NOEXEC|MS_NODEV, controller) < 0)
1036                 return log_error_errno(errno, "Failed to mount to %s: %m", to);
1037
1038         /* ... hence let's only make the bind mount read-only, not the
1039          * superblock. */
1040         if (read_only) {
1041                 if (mount(NULL, to, NULL, MS_BIND|MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_RDONLY, NULL) < 0)
1042                         return log_error_errno(errno, "Failed to remount %s read-only: %m", to);
1043         }
1044         return 1;
1045 }
1046
1047 static int mount_cgroup(const char *dest) {
1048         _cleanup_set_free_free_ Set *controllers = NULL;
1049         _cleanup_free_ char *own_cgroup_path = NULL;
1050         const char *cgroup_root, *systemd_root, *systemd_own;
1051         int r;
1052
1053         controllers = set_new(&string_hash_ops);
1054         if (!controllers)
1055                 return log_oom();
1056
1057         r = cg_kernel_controllers(controllers);
1058         if (r < 0)
1059                 return log_error_errno(r, "Failed to determine cgroup controllers: %m");
1060
1061         r = cg_pid_get_path(NULL, 0, &own_cgroup_path);
1062         if (r < 0)
1063                 return log_error_errno(r, "Failed to determine our own cgroup path: %m");
1064
1065         cgroup_root = strjoina(dest, "/sys/fs/cgroup");
1066         if (mount("tmpfs", cgroup_root, "tmpfs", MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME, "mode=755") < 0)
1067                 return log_error_errno(errno, "Failed to mount tmpfs to /sys/fs/cgroup: %m");
1068
1069         for (;;) {
1070                 _cleanup_free_ char *controller = NULL, *origin = NULL, *combined = NULL;
1071
1072                 controller = set_steal_first(controllers);
1073                 if (!controller)
1074                         break;
1075
1076                 origin = strappend("/sys/fs/cgroup/", controller);
1077                 if (!origin)
1078                         return log_oom();
1079
1080                 r = readlink_malloc(origin, &combined);
1081                 if (r == -EINVAL) {
1082                         /* Not a symbolic link, but directly a single cgroup hierarchy */
1083
1084                         r = mount_cgroup_hierarchy(dest, controller, controller, true);
1085                         if (r < 0)
1086                                 return r;
1087
1088                 } else if (r < 0)
1089                         return log_error_errno(r, "Failed to read link %s: %m", origin);
1090                 else {
1091                         _cleanup_free_ char *target = NULL;
1092
1093                         target = strjoin(dest, "/sys/fs/cgroup/", controller, NULL);
1094                         if (!target)
1095                                 return log_oom();
1096
1097                         /* A symbolic link, a combination of controllers in one hierarchy */
1098
1099                         if (!filename_is_valid(combined)) {
1100                                 log_warning("Ignoring invalid combined hierarchy %s.", combined);
1101                                 continue;
1102                         }
1103
1104                         r = mount_cgroup_hierarchy(dest, combined, combined, true);
1105                         if (r < 0)
1106                                 return r;
1107
1108                         if (symlink(combined, target) < 0)
1109                                 return log_error_errno(errno, "Failed to create symlink for combined hierarchy: %m");
1110                 }
1111         }
1112
1113         r = mount_cgroup_hierarchy(dest, "name=systemd,xattr", "systemd", false);
1114         if (r < 0)
1115                 return r;
1116
1117         /* Make our own cgroup a (writable) bind mount */
1118         systemd_own = strjoina(dest, "/sys/fs/cgroup/systemd", own_cgroup_path);
1119         if (mount(systemd_own, systemd_own,  NULL, MS_BIND, NULL) < 0)
1120                 return log_error_errno(errno, "Failed to turn %s into a bind mount: %m", own_cgroup_path);
1121
1122         /* And then remount the systemd cgroup root read-only */
1123         systemd_root = strjoina(dest, "/sys/fs/cgroup/systemd");
1124         if (mount(NULL, systemd_root, NULL, MS_BIND|MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_RDONLY, NULL) < 0)
1125                 return log_error_errno(errno, "Failed to mount cgroup root read-only: %m");
1126
1127         if (mount(NULL, cgroup_root, NULL, MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME|MS_RDONLY, "mode=755") < 0)
1128                 return log_error_errno(errno, "Failed to remount %s read-only: %m", cgroup_root);
1129
1130         return 0;
1131 }
1132
1133 static int mount_tmpfs(const char *dest) {
1134         char **i, **o;
1135
1136         STRV_FOREACH_PAIR(i, o, arg_tmpfs) {
1137                 _cleanup_free_ char *where = NULL;
1138                 int r;
1139
1140                 where = strappend(dest, *i);
1141                 if (!where)
1142                         return log_oom();
1143
1144                 r = mkdir_label(where, 0755);
1145                 if (r < 0 && r != -EEXIST)
1146                         return log_error_errno(r, "Creating mount point for tmpfs %s failed: %m", where);
1147
1148                 if (mount("tmpfs", where, "tmpfs", MS_NODEV|MS_STRICTATIME, *o) < 0)
1149                         return log_error_errno(errno, "tmpfs mount to %s failed: %m", where);
1150         }
1151
1152         return 0;
1153 }
1154
1155 static int setup_timezone(const char *dest) {
1156         _cleanup_free_ char *where = NULL, *p = NULL, *q = NULL, *check = NULL, *what = NULL;
1157         char *z, *y;
1158         int r;
1159
1160         assert(dest);
1161
1162         /* Fix the timezone, if possible */
1163         r = readlink_malloc("/etc/localtime", &p);
1164         if (r < 0) {
1165                 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
1166                 return 0;
1167         }
1168
1169         z = path_startswith(p, "../usr/share/zoneinfo/");
1170         if (!z)
1171                 z = path_startswith(p, "/usr/share/zoneinfo/");
1172         if (!z) {
1173                 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
1174                 return 0;
1175         }
1176
1177         where = strappend(dest, "/etc/localtime");
1178         if (!where)
1179                 return log_oom();
1180
1181         r = readlink_malloc(where, &q);
1182         if (r >= 0) {
1183                 y = path_startswith(q, "../usr/share/zoneinfo/");
1184                 if (!y)
1185                         y = path_startswith(q, "/usr/share/zoneinfo/");
1186
1187                 /* Already pointing to the right place? Then do nothing .. */
1188                 if (y && streq(y, z))
1189                         return 0;
1190         }
1191
1192         check = strjoin(dest, "/usr/share/zoneinfo/", z, NULL);
1193         if (!check)
1194                 return log_oom();
1195
1196         if (access(check, F_OK) < 0) {
1197                 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
1198                 return 0;
1199         }
1200
1201         what = strappend("../usr/share/zoneinfo/", z);
1202         if (!what)
1203                 return log_oom();
1204
1205         r = mkdir_parents(where, 0755);
1206         if (r < 0) {
1207                 log_error_errno(r, "Failed to create directory for timezone info %s in container: %m", where);
1208
1209                 return 0;
1210         }
1211
1212         r = unlink(where);
1213         if (r < 0 && errno != ENOENT) {
1214                 log_error_errno(errno, "Failed to remove existing timezone info %s in container: %m", where);
1215
1216                 return 0;
1217         }
1218
1219         if (symlink(what, where) < 0) {
1220                 log_error_errno(errno, "Failed to correct timezone of container: %m");
1221                 return 0;
1222         }
1223
1224         return 0;
1225 }
1226
1227 static int setup_resolv_conf(const char *dest) {
1228         _cleanup_free_ char *where = NULL;
1229         int r;
1230
1231         assert(dest);
1232
1233         if (arg_private_network)
1234                 return 0;
1235
1236         /* Fix resolv.conf, if possible */
1237         where = strappend(dest, "/etc/resolv.conf");
1238         if (!where)
1239                 return log_oom();
1240
1241         /* We don't really care for the results of this really. If it
1242          * fails, it fails, but meh... */
1243         r = mkdir_parents(where, 0755);
1244         if (r < 0) {
1245                 log_warning_errno(r, "Failed to create parent directory for resolv.conf %s: %m", where);
1246
1247                 return 0;
1248         }
1249
1250         r = copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW, 0644, 0);
1251         if (r < 0) {
1252                 log_warning_errno(r, "Failed to copy /etc/resolv.conf to %s: %m", where);
1253
1254                 return 0;
1255         }
1256
1257         return 0;
1258 }
1259
1260 static int setup_volatile_state(const char *directory) {
1261         const char *p;
1262         int r;
1263
1264         assert(directory);
1265
1266         if (arg_volatile != VOLATILE_STATE)
1267                 return 0;
1268
1269         /* --volatile=state means we simply overmount /var
1270            with a tmpfs, and the rest read-only. */
1271
1272         r = bind_remount_recursive(directory, true);
1273         if (r < 0)
1274                 return log_error_errno(r, "Failed to remount %s read-only: %m", directory);
1275
1276         p = strjoina(directory, "/var");
1277         r = mkdir(p, 0755);
1278         if (r < 0 && errno != EEXIST)
1279                 return log_error_errno(errno, "Failed to create %s: %m", directory);
1280
1281         if (mount("tmpfs", p, "tmpfs", MS_STRICTATIME, "mode=755") < 0)
1282                 return log_error_errno(errno, "Failed to mount tmpfs to /var: %m");
1283
1284         return 0;
1285 }
1286
1287 static int setup_volatile(const char *directory) {
1288         bool tmpfs_mounted = false, bind_mounted = false;
1289         char template[] = "/tmp/nspawn-volatile-XXXXXX";
1290         const char *f, *t;
1291         int r;
1292
1293         assert(directory);
1294
1295         if (arg_volatile != VOLATILE_YES)
1296                 return 0;
1297
1298         /* --volatile=yes means we mount a tmpfs to the root dir, and
1299            the original /usr to use inside it, and that read-only. */
1300
1301         if (!mkdtemp(template))
1302                 return log_error_errno(errno, "Failed to create temporary directory: %m");
1303
1304         if (mount("tmpfs", template, "tmpfs", MS_STRICTATIME, "mode=755") < 0) {
1305                 log_error_errno(errno, "Failed to mount tmpfs for root directory: %m");
1306                 r = -errno;
1307                 goto fail;
1308         }
1309
1310         tmpfs_mounted = true;
1311
1312         f = strjoina(directory, "/usr");
1313         t = strjoina(template, "/usr");
1314
1315         r = mkdir(t, 0755);
1316         if (r < 0 && errno != EEXIST) {
1317                 log_error_errno(errno, "Failed to create %s: %m", t);
1318                 r = -errno;
1319                 goto fail;
1320         }
1321
1322         if (mount(f, t, "bind", MS_BIND|MS_REC, NULL) < 0) {
1323                 log_error_errno(errno, "Failed to create /usr bind mount: %m");
1324                 r = -errno;
1325                 goto fail;
1326         }
1327
1328         bind_mounted = true;
1329
1330         r = bind_remount_recursive(t, true);
1331         if (r < 0) {
1332                 log_error_errno(r, "Failed to remount %s read-only: %m", t);
1333                 goto fail;
1334         }
1335
1336         if (mount(template, directory, NULL, MS_MOVE, NULL) < 0) {
1337                 log_error_errno(errno, "Failed to move root mount: %m");
1338                 r = -errno;
1339                 goto fail;
1340         }
1341
1342         rmdir(template);
1343
1344         return 0;
1345
1346 fail:
1347         if (bind_mounted)
1348                 umount(t);
1349         if (tmpfs_mounted)
1350                 umount(template);
1351         rmdir(template);
1352         return r;
1353 }
1354
1355 static char* id128_format_as_uuid(sd_id128_t id, char s[37]) {
1356
1357         snprintf(s, 37,
1358                  "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
1359                  SD_ID128_FORMAT_VAL(id));
1360
1361         return s;
1362 }
1363
1364 static int setup_boot_id(const char *dest) {
1365         _cleanup_free_ char *from = NULL, *to = NULL;
1366         sd_id128_t rnd = {};
1367         char as_uuid[37];
1368         int r;
1369
1370         assert(dest);
1371
1372         if (arg_share_system)
1373                 return 0;
1374
1375         /* Generate a new randomized boot ID, so that each boot-up of
1376          * the container gets a new one */
1377
1378         from = strappend(dest, "/dev/proc-sys-kernel-random-boot-id");
1379         to = strappend(dest, "/proc/sys/kernel/random/boot_id");
1380         if (!from || !to)
1381                 return log_oom();
1382
1383         r = sd_id128_randomize(&rnd);
1384         if (r < 0)
1385                 return log_error_errno(r, "Failed to generate random boot id: %m");
1386
1387         id128_format_as_uuid(rnd, as_uuid);
1388
1389         r = write_string_file(from, as_uuid);
1390         if (r < 0)
1391                 return log_error_errno(r, "Failed to write boot id: %m");
1392
1393         if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
1394                 log_error_errno(errno, "Failed to bind mount boot id: %m");
1395                 r = -errno;
1396         } else if (mount(from, to, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL))
1397                 log_warning_errno(errno, "Failed to make boot id read-only: %m");
1398
1399         unlink(from);
1400         return r;
1401 }
1402
1403 static int copy_devnodes(const char *dest) {
1404
1405         static const char devnodes[] =
1406                 "null\0"
1407                 "zero\0"
1408                 "full\0"
1409                 "random\0"
1410                 "urandom\0"
1411                 "tty\0"
1412                 "net/tun\0";
1413
1414         const char *d;
1415         int r = 0;
1416         _cleanup_umask_ mode_t u;
1417
1418         assert(dest);
1419
1420         u = umask(0000);
1421
1422         NULSTR_FOREACH(d, devnodes) {
1423                 _cleanup_free_ char *from = NULL, *to = NULL;
1424                 struct stat st;
1425
1426                 from = strappend("/dev/", d);
1427                 to = strjoin(dest, "/dev/", d, NULL);
1428                 if (!from || !to)
1429                         return log_oom();
1430
1431                 if (stat(from, &st) < 0) {
1432
1433                         if (errno != ENOENT)
1434                                 return log_error_errno(errno, "Failed to stat %s: %m", from);
1435
1436                 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
1437
1438                         log_error("%s is not a char or block device, cannot copy", from);
1439                         return -EIO;
1440
1441                 } else {
1442                         r = mkdir_parents(to, 0775);
1443                         if (r < 0) {
1444                                 log_error_errno(r, "Failed to create parent directory of %s: %m", to);
1445                                 return -r;
1446                         }
1447
1448                         if (mknod(to, st.st_mode, st.st_rdev) < 0)
1449                                 return log_error_errno(errno, "mknod(%s) failed: %m", to);
1450                 }
1451         }
1452
1453         return r;
1454 }
1455
1456 static int setup_ptmx(const char *dest) {
1457         _cleanup_free_ char *p = NULL;
1458
1459         p = strappend(dest, "/dev/ptmx");
1460         if (!p)
1461                 return log_oom();
1462
1463         if (symlink("pts/ptmx", p) < 0)
1464                 return log_error_errno(errno, "Failed to create /dev/ptmx symlink: %m");
1465
1466         return 0;
1467 }
1468
1469 static int setup_dev_console(const char *dest, const char *console) {
1470         _cleanup_umask_ mode_t u;
1471         const char *to;
1472         struct stat st;
1473         int r;
1474
1475         assert(dest);
1476         assert(console);
1477
1478         u = umask(0000);
1479
1480         if (stat("/dev/null", &st) < 0)
1481                 return log_error_errno(errno, "Failed to stat /dev/null: %m");
1482
1483         r = chmod_and_chown(console, 0600, 0, 0);
1484         if (r < 0)
1485                 return log_error_errno(r, "Failed to correct access mode for TTY: %m");
1486
1487         /* We need to bind mount the right tty to /dev/console since
1488          * ptys can only exist on pts file systems. To have something
1489          * to bind mount things on we create a device node first, and
1490          * use /dev/null for that since we the cgroups device policy
1491          * allows us to create that freely, while we cannot create
1492          * /dev/console. (Note that the major minor doesn't actually
1493          * matter here, since we mount it over anyway). */
1494
1495         to = strjoina(dest, "/dev/console");
1496         if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0)
1497                 return log_error_errno(errno, "mknod() for /dev/console failed: %m");
1498
1499         if (mount(console, to, "bind", MS_BIND, NULL) < 0)
1500                 return log_error_errno(errno, "Bind mount for /dev/console failed: %m");
1501
1502         return 0;
1503 }
1504
1505 static int setup_kmsg(const char *dest, int kmsg_socket) {
1506         _cleanup_free_ char *from = NULL, *to = NULL;
1507         _cleanup_umask_ mode_t u;
1508         int r, fd, k;
1509         union {
1510                 struct cmsghdr cmsghdr;
1511                 uint8_t buf[CMSG_SPACE(sizeof(int))];
1512         } control = {};
1513         struct msghdr mh = {
1514                 .msg_control = &control,
1515                 .msg_controllen = sizeof(control),
1516         };
1517         struct cmsghdr *cmsg;
1518
1519         assert(dest);
1520         assert(kmsg_socket >= 0);
1521
1522         u = umask(0000);
1523
1524         /* We create the kmsg FIFO as /dev/kmsg, but immediately
1525          * delete it after bind mounting it to /proc/kmsg. While FIFOs
1526          * on the reading side behave very similar to /proc/kmsg,
1527          * their writing side behaves differently from /dev/kmsg in
1528          * that writing blocks when nothing is reading. In order to
1529          * avoid any problems with containers deadlocking due to this
1530          * we simply make /dev/kmsg unavailable to the container. */
1531         if (asprintf(&from, "%s/dev/kmsg", dest) < 0 ||
1532             asprintf(&to, "%s/proc/kmsg", dest) < 0)
1533                 return log_oom();
1534
1535         if (mkfifo(from, 0600) < 0)
1536                 return log_error_errno(errno, "mkfifo() for /dev/kmsg failed: %m");
1537
1538         r = chmod_and_chown(from, 0600, 0, 0);
1539         if (r < 0)
1540                 return log_error_errno(r, "Failed to correct access mode for /dev/kmsg: %m");
1541
1542         if (mount(from, to, "bind", MS_BIND, NULL) < 0)
1543                 return log_error_errno(errno, "Bind mount for /proc/kmsg failed: %m");
1544
1545         fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
1546         if (fd < 0)
1547                 return log_error_errno(errno, "Failed to open fifo: %m");
1548
1549         cmsg = CMSG_FIRSTHDR(&mh);
1550         cmsg->cmsg_level = SOL_SOCKET;
1551         cmsg->cmsg_type = SCM_RIGHTS;
1552         cmsg->cmsg_len = CMSG_LEN(sizeof(int));
1553         memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
1554
1555         mh.msg_controllen = cmsg->cmsg_len;
1556
1557         /* Store away the fd in the socket, so that it stays open as
1558          * long as we run the child */
1559         k = sendmsg(kmsg_socket, &mh, MSG_NOSIGNAL);
1560         safe_close(fd);
1561
1562         if (k < 0)
1563                 return log_error_errno(errno, "Failed to send FIFO fd: %m");
1564
1565         /* And now make the FIFO unavailable as /dev/kmsg... */
1566         unlink(from);
1567         return 0;
1568 }
1569
1570 static int send_rtnl(int send_fd) {
1571         union {
1572                 struct cmsghdr cmsghdr;
1573                 uint8_t buf[CMSG_SPACE(sizeof(int))];
1574         } control = {};
1575         struct msghdr mh = {
1576                 .msg_control = &control,
1577                 .msg_controllen = sizeof(control),
1578         };
1579         struct cmsghdr *cmsg;
1580         _cleanup_close_ int fd = -1;
1581         ssize_t k;
1582
1583         assert(send_fd >= 0);
1584
1585         if (!arg_expose_ports)
1586                 return 0;
1587
1588         fd = socket(PF_NETLINK, SOCK_RAW|SOCK_CLOEXEC|SOCK_NONBLOCK, NETLINK_ROUTE);
1589         if (fd < 0)
1590                 return log_error_errno(errno, "failed to allocate container netlink: %m");
1591
1592         cmsg = CMSG_FIRSTHDR(&mh);
1593         cmsg->cmsg_level = SOL_SOCKET;
1594         cmsg->cmsg_type = SCM_RIGHTS;
1595         cmsg->cmsg_len = CMSG_LEN(sizeof(int));
1596         memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
1597
1598         mh.msg_controllen = cmsg->cmsg_len;
1599
1600         /* Store away the fd in the socket, so that it stays open as
1601          * long as we run the child */
1602         k = sendmsg(send_fd, &mh, MSG_NOSIGNAL);
1603         if (k < 0)
1604                 return log_error_errno(errno, "Failed to send netlink fd: %m");
1605
1606         return 0;
1607 }
1608
1609 static int flush_ports(union in_addr_union *exposed) {
1610         ExposePort *p;
1611         int r, af = AF_INET;
1612
1613         assert(exposed);
1614
1615         if (!arg_expose_ports)
1616                 return 0;
1617
1618         if (in_addr_is_null(af, exposed))
1619                 return 0;
1620
1621         log_debug("Lost IP address.");
1622
1623         LIST_FOREACH(ports, p, arg_expose_ports) {
1624                 r = fw_add_local_dnat(false,
1625                                       af,
1626                                       p->protocol,
1627                                       NULL,
1628                                       NULL, 0,
1629                                       NULL, 0,
1630                                       p->host_port,
1631                                       exposed,
1632                                       p->container_port,
1633                                       NULL);
1634                 if (r < 0)
1635                         log_warning_errno(r, "Failed to modify firewall: %m");
1636         }
1637
1638         *exposed = IN_ADDR_NULL;
1639         return 0;
1640 }
1641
1642 static int expose_ports(sd_rtnl *rtnl, union in_addr_union *exposed) {
1643         _cleanup_free_ struct local_address *addresses = NULL;
1644         _cleanup_free_ char *pretty = NULL;
1645         union in_addr_union new_exposed;
1646         ExposePort *p;
1647         bool add;
1648         int af = AF_INET, r;
1649
1650         assert(exposed);
1651
1652         /* Invoked each time an address is added or removed inside the
1653          * container */
1654
1655         if (!arg_expose_ports)
1656                 return 0;
1657
1658         r = local_addresses(rtnl, 0, af, &addresses);
1659         if (r < 0)
1660                 return log_error_errno(r, "Failed to enumerate local addresses: %m");
1661
1662         add = r > 0 &&
1663                 addresses[0].family == af &&
1664                 addresses[0].scope < RT_SCOPE_LINK;
1665
1666         if (!add)
1667                 return flush_ports(exposed);
1668
1669         new_exposed = addresses[0].address;
1670         if (in_addr_equal(af, exposed, &new_exposed))
1671                 return 0;
1672
1673         in_addr_to_string(af, &new_exposed, &pretty);
1674         log_debug("New container IP is %s.", strna(pretty));
1675
1676         LIST_FOREACH(ports, p, arg_expose_ports) {
1677
1678                 r = fw_add_local_dnat(true,
1679                                       af,
1680                                       p->protocol,
1681                                       NULL,
1682                                       NULL, 0,
1683                                       NULL, 0,
1684                                       p->host_port,
1685                                       &new_exposed,
1686                                       p->container_port,
1687                                       in_addr_is_null(af, exposed) ? NULL : exposed);
1688                 if (r < 0)
1689                         log_warning_errno(r, "Failed to modify firewall: %m");
1690         }
1691
1692         *exposed = new_exposed;
1693         return 0;
1694 }
1695
1696 static int on_address_change(sd_rtnl *rtnl, sd_rtnl_message *m, void *userdata) {
1697         union in_addr_union *exposed = userdata;
1698
1699         assert(rtnl);
1700         assert(m);
1701         assert(exposed);
1702
1703         expose_ports(rtnl, exposed);
1704         return 0;
1705 }
1706
1707 static int watch_rtnl(sd_event *event, int recv_fd, union in_addr_union *exposed, sd_rtnl **ret) {
1708         union {
1709                 struct cmsghdr cmsghdr;
1710                 uint8_t buf[CMSG_SPACE(sizeof(int))];
1711         } control = {};
1712         struct msghdr mh = {
1713                 .msg_control = &control,
1714                 .msg_controllen = sizeof(control),
1715         };
1716         struct cmsghdr *cmsg;
1717         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1718         int fd, r;
1719         ssize_t k;
1720
1721         assert(event);
1722         assert(recv_fd >= 0);
1723         assert(ret);
1724
1725         if (!arg_expose_ports)
1726                 return 0;
1727
1728         k = recvmsg(recv_fd, &mh, MSG_NOSIGNAL);
1729         if (k < 0)
1730                 return log_error_errno(errno, "Failed to recv netlink fd: %m");
1731
1732         cmsg = CMSG_FIRSTHDR(&mh);
1733         assert(cmsg->cmsg_level == SOL_SOCKET);
1734         assert(cmsg->cmsg_type == SCM_RIGHTS);
1735         assert(cmsg->cmsg_len == CMSG_LEN(sizeof(int)));
1736         memcpy(&fd, CMSG_DATA(cmsg), sizeof(int));
1737
1738         r = sd_rtnl_open_fd(&rtnl, fd, 1, RTNLGRP_IPV4_IFADDR);
1739         if (r < 0) {
1740                 safe_close(fd);
1741                 return log_error_errno(r, "Failed to create rtnl object: %m");
1742         }
1743
1744         r = sd_rtnl_add_match(rtnl, RTM_NEWADDR, on_address_change, exposed);
1745         if (r < 0)
1746                 return log_error_errno(r, "Failed to subscribe to RTM_NEWADDR messages: %m");
1747
1748         r = sd_rtnl_add_match(rtnl, RTM_DELADDR, on_address_change, exposed);
1749         if (r < 0)
1750                 return log_error_errno(r, "Failed to subscribe to RTM_DELADDR messages: %m");
1751
1752         r = sd_rtnl_attach_event(rtnl, event, 0);
1753         if (r < 0)
1754                 return log_error_errno(r, "Failed to add to even loop: %m");
1755
1756         *ret = rtnl;
1757         rtnl = NULL;
1758
1759         return 0;
1760 }
1761
1762 static int setup_hostname(void) {
1763
1764         if (arg_share_system)
1765                 return 0;
1766
1767         if (sethostname_idempotent(arg_machine) < 0)
1768                 return -errno;
1769
1770         return 0;
1771 }
1772
1773 static int setup_journal(const char *directory) {
1774         sd_id128_t machine_id, this_id;
1775         _cleanup_free_ char *p = NULL, *b = NULL, *q = NULL, *d = NULL;
1776         char *id;
1777         int r;
1778
1779         /* Don't link journals in ephemeral mode */
1780         if (arg_ephemeral)
1781                 return 0;
1782
1783         p = strappend(directory, "/etc/machine-id");
1784         if (!p)
1785                 return log_oom();
1786
1787         r = read_one_line_file(p, &b);
1788         if (r == -ENOENT && arg_link_journal == LINK_AUTO)
1789                 return 0;
1790         else if (r < 0)
1791                 return log_error_errno(r, "Failed to read machine ID from %s: %m", p);
1792
1793         id = strstrip(b);
1794         if (isempty(id) && arg_link_journal == LINK_AUTO)
1795                 return 0;
1796
1797         /* Verify validity */
1798         r = sd_id128_from_string(id, &machine_id);
1799         if (r < 0)
1800                 return log_error_errno(r, "Failed to parse machine ID from %s: %m", p);
1801
1802         r = sd_id128_get_machine(&this_id);
1803         if (r < 0)
1804                 return log_error_errno(r, "Failed to retrieve machine ID: %m");
1805
1806         if (sd_id128_equal(machine_id, this_id)) {
1807                 log_full(arg_link_journal == LINK_AUTO ? LOG_WARNING : LOG_ERR,
1808                          "Host and machine ids are equal (%s): refusing to link journals", id);
1809                 if (arg_link_journal == LINK_AUTO)
1810                         return 0;
1811                 return -EEXIST;
1812         }
1813
1814         if (arg_link_journal == LINK_NO)
1815                 return 0;
1816
1817         free(p);
1818         p = strappend("/var/log/journal/", id);
1819         q = strjoin(directory, "/var/log/journal/", id, NULL);
1820         if (!p || !q)
1821                 return log_oom();
1822
1823         if (path_is_mount_point(p, false) > 0) {
1824                 if (arg_link_journal != LINK_AUTO) {
1825                         log_error("%s: already a mount point, refusing to use for journal", p);
1826                         return -EEXIST;
1827                 }
1828
1829                 return 0;
1830         }
1831
1832         if (path_is_mount_point(q, false) > 0) {
1833                 if (arg_link_journal != LINK_AUTO) {
1834                         log_error("%s: already a mount point, refusing to use for journal", q);
1835                         return -EEXIST;
1836                 }
1837
1838                 return 0;
1839         }
1840
1841         r = readlink_and_make_absolute(p, &d);
1842         if (r >= 0) {
1843                 if ((arg_link_journal == LINK_GUEST ||
1844                      arg_link_journal == LINK_AUTO) &&
1845                     path_equal(d, q)) {
1846
1847                         r = mkdir_p(q, 0755);
1848                         if (r < 0)
1849                                 log_warning_errno(errno, "Failed to create directory %s: %m", q);
1850                         return 0;
1851                 }
1852
1853                 if (unlink(p) < 0)
1854                         return log_error_errno(errno, "Failed to remove symlink %s: %m", p);
1855         } else if (r == -EINVAL) {
1856
1857                 if (arg_link_journal == LINK_GUEST &&
1858                     rmdir(p) < 0) {
1859
1860                         if (errno == ENOTDIR) {
1861                                 log_error("%s already exists and is neither a symlink nor a directory", p);
1862                                 return r;
1863                         } else {
1864                                 log_error_errno(errno, "Failed to remove %s: %m", p);
1865                                 return -errno;
1866                         }
1867                 }
1868         } else if (r != -ENOENT) {
1869                 log_error_errno(errno, "readlink(%s) failed: %m", p);
1870                 return r;
1871         }
1872
1873         if (arg_link_journal == LINK_GUEST) {
1874
1875                 if (symlink(q, p) < 0) {
1876                         if (arg_link_journal_try) {
1877                                 log_debug_errno(errno, "Failed to symlink %s to %s, skipping journal setup: %m", q, p);
1878                                 return 0;
1879                         } else {
1880                                 log_error_errno(errno, "Failed to symlink %s to %s: %m", q, p);
1881                                 return -errno;
1882                         }
1883                 }
1884
1885                 r = mkdir_p(q, 0755);
1886                 if (r < 0)
1887                         log_warning_errno(errno, "Failed to create directory %s: %m", q);
1888                 return 0;
1889         }
1890
1891         if (arg_link_journal == LINK_HOST) {
1892                 /* don't create parents here -- if the host doesn't have
1893                  * permanent journal set up, don't force it here */
1894                 r = mkdir(p, 0755);
1895                 if (r < 0) {
1896                         if (arg_link_journal_try) {
1897                                 log_debug_errno(errno, "Failed to create %s, skipping journal setup: %m", p);
1898                                 return 0;
1899                         } else {
1900                                 log_error_errno(errno, "Failed to create %s: %m", p);
1901                                 return r;
1902                         }
1903                 }
1904
1905         } else if (access(p, F_OK) < 0)
1906                 return 0;
1907
1908         if (dir_is_empty(q) == 0)
1909                 log_warning("%s is not empty, proceeding anyway.", q);
1910
1911         r = mkdir_p(q, 0755);
1912         if (r < 0) {
1913                 log_error_errno(errno, "Failed to create %s: %m", q);
1914                 return r;
1915         }
1916
1917         if (mount(p, q, "bind", MS_BIND, NULL) < 0)
1918                 return log_error_errno(errno, "Failed to bind mount journal from host into guest: %m");
1919
1920         return 0;
1921 }
1922
/* Passes the bitwise complement of arg_retain to
 * capability_bounding_set_drop() and returns its result, i.e.
 * presumably everything not listed in arg_retain is removed from the
 * capability bounding set (confirm against the helper). */
static int drop_capabilities(void) {
        return capability_bounding_set_drop(~arg_retain, false);
}
1926
1927 static int register_machine(pid_t pid, int local_ifindex) {
1928         _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1929         _cleanup_bus_close_unref_ sd_bus *bus = NULL;
1930         int r;
1931
1932         if (!arg_register)
1933                 return 0;
1934
1935         r = sd_bus_default_system(&bus);
1936         if (r < 0)
1937                 return log_error_errno(r, "Failed to open system bus: %m");
1938
1939         if (arg_keep_unit) {
1940                 r = sd_bus_call_method(
1941                                 bus,
1942                                 "org.freedesktop.machine1",
1943                                 "/org/freedesktop/machine1",
1944                                 "org.freedesktop.machine1.Manager",
1945                                 "RegisterMachineWithNetwork",
1946                                 &error,
1947                                 NULL,
1948                                 "sayssusai",
1949                                 arg_machine,
1950                                 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1951                                 "nspawn",
1952                                 "container",
1953                                 (uint32_t) pid,
1954                                 strempty(arg_directory),
1955                                 local_ifindex > 0 ? 1 : 0, local_ifindex);
1956         } else {
1957                 _cleanup_bus_message_unref_ sd_bus_message *m = NULL;
1958                 char **i;
1959
1960                 r = sd_bus_message_new_method_call(
1961                                 bus,
1962                                 &m,
1963                                 "org.freedesktop.machine1",
1964                                 "/org/freedesktop/machine1",
1965                                 "org.freedesktop.machine1.Manager",
1966                                 "CreateMachineWithNetwork");
1967                 if (r < 0)
1968                         return bus_log_create_error(r);
1969
1970                 r = sd_bus_message_append(
1971                                 m,
1972                                 "sayssusai",
1973                                 arg_machine,
1974                                 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1975                                 "nspawn",
1976                                 "container",
1977                                 (uint32_t) pid,
1978                                 strempty(arg_directory),
1979                                 local_ifindex > 0 ? 1 : 0, local_ifindex);
1980                 if (r < 0)
1981                         return bus_log_create_error(r);
1982
1983                 r = sd_bus_message_open_container(m, 'a', "(sv)");
1984                 if (r < 0)
1985                         return bus_log_create_error(r);
1986
1987                 if (!isempty(arg_slice)) {
1988                         r = sd_bus_message_append(m, "(sv)", "Slice", "s", arg_slice);
1989                         if (r < 0)
1990                                 return bus_log_create_error(r);
1991                 }
1992
1993                 r = sd_bus_message_append(m, "(sv)", "DevicePolicy", "s", "strict");
1994                 if (r < 0)
1995                         return bus_log_create_error(r);
1996
1997                 r = sd_bus_message_append(m, "(sv)", "DeviceAllow", "a(ss)", 9,
1998                                           /* Allow the container to
1999                                            * access and create the API
2000                                            * device nodes, so that
2001                                            * PrivateDevices= in the
2002                                            * container can work
2003                                            * fine */
2004                                           "/dev/null", "rwm",
2005                                           "/dev/zero", "rwm",
2006                                           "/dev/full", "rwm",
2007                                           "/dev/random", "rwm",
2008                                           "/dev/urandom", "rwm",
2009                                           "/dev/tty", "rwm",
2010                                           "/dev/net/tun", "rwm",
2011                                           /* Allow the container
2012                                            * access to ptys. However,
2013                                            * do not permit the
2014                                            * container to ever create
2015                                            * these device nodes. */
2016                                           "/dev/pts/ptmx", "rw",
2017                                           "char-pts", "rw");
2018                 if (r < 0)
2019                         return log_error_errno(r, "Failed to add device whitelist: %m");
2020
2021                 STRV_FOREACH(i, arg_property) {
2022                         r = sd_bus_message_open_container(m, 'r', "sv");
2023                         if (r < 0)
2024                                 return bus_log_create_error(r);
2025
2026                         r = bus_append_unit_property_assignment(m, *i);
2027                         if (r < 0)
2028                                 return r;
2029
2030                         r = sd_bus_message_close_container(m);
2031                         if (r < 0)
2032                                 return bus_log_create_error(r);
2033                 }
2034
2035                 r = sd_bus_message_close_container(m);
2036                 if (r < 0)
2037                         return bus_log_create_error(r);
2038
2039                 r = sd_bus_call(bus, m, 0, &error, NULL);
2040         }
2041
2042         if (r < 0) {
2043                 log_error("Failed to register machine: %s", bus_error_message(&error, r));
2044                 return r;
2045         }
2046
2047         return 0;
2048 }
2049
2050 static int terminate_machine(pid_t pid) {
2051         _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
2052         _cleanup_bus_message_unref_ sd_bus_message *reply = NULL;
2053         _cleanup_bus_close_unref_ sd_bus *bus = NULL;
2054         const char *path;
2055         int r;
2056
2057         if (!arg_register)
2058                 return 0;
2059
2060         r = sd_bus_default_system(&bus);
2061         if (r < 0)
2062                 return log_error_errno(r, "Failed to open system bus: %m");
2063
2064         r = sd_bus_call_method(
2065                         bus,
2066                         "org.freedesktop.machine1",
2067                         "/org/freedesktop/machine1",
2068                         "org.freedesktop.machine1.Manager",
2069                         "GetMachineByPID",
2070                         &error,
2071                         &reply,
2072                         "u",
2073                         (uint32_t) pid);
2074         if (r < 0) {
2075                 /* Note that the machine might already have been
2076                  * cleaned up automatically, hence don't consider it a
2077                  * failure if we cannot get the machine object. */
2078                 log_debug("Failed to get machine: %s", bus_error_message(&error, r));
2079                 return 0;
2080         }
2081
2082         r = sd_bus_message_read(reply, "o", &path);
2083         if (r < 0)
2084                 return bus_log_parse_error(r);
2085
2086         r = sd_bus_call_method(
2087                         bus,
2088                         "org.freedesktop.machine1",
2089                         path,
2090                         "org.freedesktop.machine1.Machine",
2091                         "Terminate",
2092                         &error,
2093                         NULL,
2094                         NULL);
2095         if (r < 0) {
2096                 log_debug("Failed to terminate machine: %s", bus_error_message(&error, r));
2097                 return 0;
2098         }
2099
2100         return 0;
2101 }
2102
2103 static int reset_audit_loginuid(void) {
2104         _cleanup_free_ char *p = NULL;
2105         int r;
2106
2107         if (arg_share_system)
2108                 return 0;
2109
2110         r = read_one_line_file("/proc/self/loginuid", &p);
2111         if (r == -ENOENT)
2112                 return 0;
2113         if (r < 0)
2114                 return log_error_errno(r, "Failed to read /proc/self/loginuid: %m");
2115
2116         /* Already reset? */
2117         if (streq(p, "4294967295"))
2118                 return 0;
2119
2120         r = write_string_file("/proc/self/loginuid", "4294967295");
2121         if (r < 0) {
2122                 log_error("Failed to reset audit login UID. This probably means that your kernel is too\n"
2123                           "old and you have audit enabled. Note that the auditing subsystem is known to\n"
2124                           "be incompatible with containers on old kernels. Please make sure to upgrade\n"
2125                           "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
2126                           "using systemd-nspawn. Sleeping for 5s... (%s)\n", strerror(-r));
2127
2128                 sleep(5);
2129         }
2130
2131         return 0;
2132 }
2133
/* Fixed 128-bit keys that are passed to generate_mac() as hash_key.
 * setup_veth() uses HOST_HASH_KEY and CONTAINER_HASH_KEY for the MAC
 * addresses of the two ends of the veth link. MACVLAN_HASH_KEY is
 * presumably the counterpart for macvlan interfaces -- confirm at its
 * call site. */
#define HOST_HASH_KEY SD_ID128_MAKE(1a,37,6f,c7,46,ec,45,0b,ad,a3,d5,31,06,60,5d,b1)
#define CONTAINER_HASH_KEY SD_ID128_MAKE(c3,c4,f9,19,b5,57,b2,1c,e6,cf,14,27,03,9c,ee,a2)
#define MACVLAN_HASH_KEY SD_ID128_MAKE(00,13,6d,bc,66,83,44,81,bb,0c,f9,51,1f,24,a6,6f)
2137
2138 static int generate_mac(struct ether_addr *mac, sd_id128_t hash_key, uint64_t idx) {
2139         uint8_t result[8];
2140         size_t l, sz;
2141         uint8_t *v, *i;
2142         int r;
2143
2144         l = strlen(arg_machine);
2145         sz = sizeof(sd_id128_t) + l;
2146         if (idx > 0)
2147                 sz += sizeof(idx);
2148
2149         v = alloca(sz);
2150
2151         /* fetch some persistent data unique to the host */
2152         r = sd_id128_get_machine((sd_id128_t*) v);
2153         if (r < 0)
2154                 return r;
2155
2156         /* combine with some data unique (on this host) to this
2157          * container instance */
2158         i = mempcpy(v + sizeof(sd_id128_t), arg_machine, l);
2159         if (idx > 0) {
2160                 idx = htole64(idx);
2161                 memcpy(i, &idx, sizeof(idx));
2162         }
2163
2164         /* Let's hash the host machine ID plus the container name. We
2165          * use a fixed, but originally randomly created hash key here. */
2166         siphash24(result, v, sz, hash_key.bytes);
2167
2168         assert_cc(ETH_ALEN <= sizeof(result));
2169         memcpy(mac->ether_addr_octet, result, ETH_ALEN);
2170
2171         /* see eth_random_addr in the kernel */
2172         mac->ether_addr_octet[0] &= 0xfe;        /* clear multicast bit */
2173         mac->ether_addr_octet[0] |= 0x02;        /* set local assignment bit (IEEE802) */
2174
2175         return 0;
2176 }
2177
2178 static int setup_veth(pid_t pid, char iface_name[IFNAMSIZ], int *ifi) {
2179         _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
2180         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
2181         struct ether_addr mac_host, mac_container;
2182         int r, i;
2183
2184         if (!arg_private_network)
2185                 return 0;
2186
2187         if (!arg_network_veth)
2188                 return 0;
2189
2190         /* Use two different interface name prefixes depending whether
2191          * we are in bridge mode or not. */
2192         snprintf(iface_name, IFNAMSIZ - 1, "%s-%s",
2193                  arg_network_bridge ? "vb" : "ve", arg_machine);
2194
2195         r = generate_mac(&mac_container, CONTAINER_HASH_KEY, 0);
2196         if (r < 0)
2197                 return log_error_errno(r, "Failed to generate predictable MAC address for container side: %m");
2198
2199         r = generate_mac(&mac_host, HOST_HASH_KEY, 0);
2200         if (r < 0)
2201                 return log_error_errno(r, "Failed to generate predictable MAC address for host side: %m");
2202
2203         r = sd_rtnl_open(&rtnl, 0);
2204         if (r < 0)
2205                 return log_error_errno(r, "Failed to connect to netlink: %m");
2206
2207         r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
2208         if (r < 0)
2209                 return log_error_errno(r, "Failed to allocate netlink message: %m");
2210
2211         r = sd_rtnl_message_append_string(m, IFLA_IFNAME, iface_name);
2212         if (r < 0)
2213                 return log_error_errno(r, "Failed to add netlink interface name: %m");
2214
2215         r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac_host);
2216         if (r < 0)
2217                 return log_error_errno(r, "Failed to add netlink MAC address: %m");
2218
2219         r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
2220         if (r < 0)
2221                 return log_error_errno(r, "Failed to open netlink container: %m");
2222
2223         r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "veth");
2224         if (r < 0)
2225                 return log_error_errno(r, "Failed to open netlink container: %m");
2226
2227         r = sd_rtnl_message_open_container(m, VETH_INFO_PEER);
2228         if (r < 0)
2229                 return log_error_errno(r, "Failed to open netlink container: %m");
2230
2231         r = sd_rtnl_message_append_string(m, IFLA_IFNAME, "host0");
2232         if (r < 0)
2233                 return log_error_errno(r, "Failed to add netlink interface name: %m");
2234
2235         r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac_container);
2236         if (r < 0)
2237                 return log_error_errno(r, "Failed to add netlink MAC address: %m");
2238
2239         r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
2240         if (r < 0)
2241                 return log_error_errno(r, "Failed to add netlink namespace field: %m");
2242
2243         r = sd_rtnl_message_close_container(m);
2244         if (r < 0)
2245                 return log_error_errno(r, "Failed to close netlink container: %m");
2246
2247         r = sd_rtnl_message_close_container(m);
2248         if (r < 0)
2249                 return log_error_errno(r, "Failed to close netlink container: %m");
2250
2251         r = sd_rtnl_message_close_container(m);
2252         if (r < 0)
2253                 return log_error_errno(r, "Failed to close netlink container: %m");
2254
2255         r = sd_rtnl_call(rtnl, m, 0, NULL);
2256         if (r < 0)
2257                 return log_error_errno(r, "Failed to add new veth interfaces: %m");
2258
2259         i = (int) if_nametoindex(iface_name);
2260         if (i <= 0)
2261                 return log_error_errno(errno, "Failed to resolve interface %s: %m", iface_name);
2262
2263         *ifi = i;
2264
2265         return 0;
2266 }
2267
2268 static int setup_bridge(const char veth_name[], int *ifi) {
2269         _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
2270         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
2271         int r, bridge;
2272
2273         if (!arg_private_network)
2274                 return 0;
2275
2276         if (!arg_network_veth)
2277                 return 0;
2278
2279         if (!arg_network_bridge)
2280                 return 0;
2281
2282         bridge = (int) if_nametoindex(arg_network_bridge);
2283         if (bridge <= 0)
2284                 return log_error_errno(errno, "Failed to resolve interface %s: %m", arg_network_bridge);
2285
2286         *ifi = bridge;
2287
2288         r = sd_rtnl_open(&rtnl, 0);
2289         if (r < 0)
2290                 return log_error_errno(r, "Failed to connect to netlink: %m");
2291
2292         r = sd_rtnl_message_new_link(rtnl, &m, RTM_SETLINK, 0);
2293         if (r < 0)
2294                 return log_error_errno(r, "Failed to allocate netlink message: %m");
2295
2296         r = sd_rtnl_message_link_set_flags(m, IFF_UP, IFF_UP);
2297         if (r < 0)
2298                 return log_error_errno(r, "Failed to set IFF_UP flag: %m");
2299
2300         r = sd_rtnl_message_append_string(m, IFLA_IFNAME, veth_name);
2301         if (r < 0)
2302                 return log_error_errno(r, "Failed to add netlink interface name field: %m");
2303
2304         r = sd_rtnl_message_append_u32(m, IFLA_MASTER, bridge);
2305         if (r < 0)
2306                 return log_error_errno(r, "Failed to add netlink master field: %m");
2307
2308         r = sd_rtnl_call(rtnl, m, 0, NULL);
2309         if (r < 0)
2310                 return log_error_errno(r, "Failed to add veth interface to bridge: %m");
2311
2312         return 0;
2313 }
2314
2315 static int parse_interface(struct udev *udev, const char *name) {
2316         _cleanup_udev_device_unref_ struct udev_device *d = NULL;
2317         char ifi_str[2 + DECIMAL_STR_MAX(int)];
2318         int ifi;
2319
2320         ifi = (int) if_nametoindex(name);
2321         if (ifi <= 0)
2322                 return log_error_errno(errno, "Failed to resolve interface %s: %m", name);
2323
2324         sprintf(ifi_str, "n%i", ifi);
2325         d = udev_device_new_from_device_id(udev, ifi_str);
2326         if (!d)
2327                 return log_error_errno(errno, "Failed to get udev device for interface %s: %m", name);
2328
2329         if (udev_device_get_is_initialized(d) <= 0) {
2330                 log_error("Network interface %s is not initialized yet.", name);
2331                 return -EBUSY;
2332         }
2333
2334         return ifi;
2335 }
2336
2337 static int move_network_interfaces(pid_t pid) {
2338         _cleanup_udev_unref_ struct udev *udev = NULL;
2339         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
2340         char **i;
2341         int r;
2342
2343         if (!arg_private_network)
2344                 return 0;
2345
2346         if (strv_isempty(arg_network_interfaces))
2347                 return 0;
2348
2349         r = sd_rtnl_open(&rtnl, 0);
2350         if (r < 0)
2351                 return log_error_errno(r, "Failed to connect to netlink: %m");
2352
2353         udev = udev_new();
2354         if (!udev) {
2355                 log_error("Failed to connect to udev.");
2356                 return -ENOMEM;
2357         }
2358
2359         STRV_FOREACH(i, arg_network_interfaces) {
2360                 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
2361                 int ifi;
2362
2363                 ifi = parse_interface(udev, *i);
2364                 if (ifi < 0)
2365                         return ifi;
2366
2367                 r = sd_rtnl_message_new_link(rtnl, &m, RTM_SETLINK, ifi);
2368                 if (r < 0)
2369                         return log_error_errno(r, "Failed to allocate netlink message: %m");
2370
2371                 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
2372                 if (r < 0)
2373                         return log_error_errno(r, "Failed to append namespace PID to netlink message: %m");
2374
2375                 r = sd_rtnl_call(rtnl, m, 0, NULL);
2376                 if (r < 0)
2377                         return log_error_errno(r, "Failed to move interface %s to namespace: %m", *i);
2378         }
2379
2380         return 0;
2381 }
2382
2383 static int setup_macvlan(pid_t pid) {
2384         _cleanup_udev_unref_ struct udev *udev = NULL;
2385         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
2386         unsigned idx = 0;
2387         char **i;
2388         int r;
2389
2390         if (!arg_private_network)
2391                 return 0;
2392
2393         if (strv_isempty(arg_network_macvlan))
2394                 return 0;
2395
2396         r = sd_rtnl_open(&rtnl, 0);
2397         if (r < 0)
2398                 return log_error_errno(r, "Failed to connect to netlink: %m");
2399
2400         udev = udev_new();
2401         if (!udev) {
2402                 log_error("Failed to connect to udev.");
2403                 return -ENOMEM;
2404         }
2405
2406         STRV_FOREACH(i, arg_network_macvlan) {
2407                 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
2408                 _cleanup_free_ char *n = NULL;
2409                 struct ether_addr mac;
2410                 int ifi;
2411
2412                 ifi = parse_interface(udev, *i);
2413                 if (ifi < 0)
2414                         return ifi;
2415
2416                 r = generate_mac(&mac, MACVLAN_HASH_KEY, idx++);
2417                 if (r < 0)
2418                         return log_error_errno(r, "Failed to create MACVLAN MAC address: %m");
2419
2420                 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
2421                 if (r < 0)
2422                         return log_error_errno(r, "Failed to allocate netlink message: %m");
2423
2424                 r = sd_rtnl_message_append_u32(m, IFLA_LINK, ifi);
2425                 if (r < 0)
2426                         return log_error_errno(r, "Failed to add netlink interface index: %m");
2427
2428                 n = strappend("mv-", *i);
2429                 if (!n)
2430                         return log_oom();
2431
2432                 strshorten(n, IFNAMSIZ-1);
2433
2434                 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, n);
2435                 if (r < 0)
2436                         return log_error_errno(r, "Failed to add netlink interface name: %m");
2437
2438                 r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac);
2439                 if (r < 0)
2440                         return log_error_errno(r, "Failed to add netlink MAC address: %m");
2441
2442                 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
2443                 if (r < 0)
2444                         return log_error_errno(r, "Failed to add netlink namespace field: %m");
2445
2446                 r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
2447                 if (r < 0)
2448                         return log_error_errno(r, "Failed to open netlink container: %m");
2449
2450                 r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "macvlan");
2451                 if (r < 0)
2452                         return log_error_errno(r, "Failed to open netlink container: %m");
2453
2454                 r = sd_rtnl_message_append_u32(m, IFLA_MACVLAN_MODE, MACVLAN_MODE_BRIDGE);
2455                 if (r < 0)
2456                         return log_error_errno(r, "Failed to append macvlan mode: %m");
2457
2458                 r = sd_rtnl_message_close_container(m);
2459                 if (r < 0)
2460                         return log_error_errno(r, "Failed to close netlink container: %m");
2461
2462                 r = sd_rtnl_message_close_container(m);
2463                 if (r < 0)
2464                         return log_error_errno(r, "Failed to close netlink container: %m");
2465
2466                 r = sd_rtnl_call(rtnl, m, 0, NULL);
2467                 if (r < 0)
2468                         return log_error_errno(r, "Failed to add new macvlan interfaces: %m");
2469         }
2470
2471         return 0;
2472 }
2473
2474 static int setup_ipvlan(pid_t pid) {
2475         _cleanup_udev_unref_ struct udev *udev = NULL;
2476         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
2477         char **i;
2478         int r;
2479
2480         if (!arg_private_network)
2481                 return 0;
2482
2483         if (strv_isempty(arg_network_ipvlan))
2484                 return 0;
2485
2486         r = sd_rtnl_open(&rtnl, 0);
2487         if (r < 0)
2488                 return log_error_errno(r, "Failed to connect to netlink: %m");
2489
2490         udev = udev_new();
2491         if (!udev) {
2492                 log_error("Failed to connect to udev.");
2493                 return -ENOMEM;
2494         }
2495
2496         STRV_FOREACH(i, arg_network_ipvlan) {
2497                 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
2498                 _cleanup_free_ char *n = NULL;
2499                 int ifi;
2500
2501                 ifi = parse_interface(udev, *i);
2502                 if (ifi < 0)
2503                         return ifi;
2504
2505                 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
2506                 if (r < 0)
2507                         return log_error_errno(r, "Failed to allocate netlink message: %m");
2508
2509                 r = sd_rtnl_message_append_u32(m, IFLA_LINK, ifi);
2510                 if (r < 0)
2511                         return log_error_errno(r, "Failed to add netlink interface index: %m");
2512
2513                 n = strappend("iv-", *i);
2514                 if (!n)
2515                         return log_oom();
2516
2517                 strshorten(n, IFNAMSIZ-1);
2518
2519                 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, n);
2520                 if (r < 0)
2521                         return log_error_errno(r, "Failed to add netlink interface name: %m");
2522
2523                 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
2524                 if (r < 0)
2525                         return log_error_errno(r, "Failed to add netlink namespace field: %m");
2526
2527                 r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
2528                 if (r < 0)
2529                         return log_error_errno(r, "Failed to open netlink container: %m");
2530
2531                 r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "ipvlan");
2532                 if (r < 0)
2533                         return log_error_errno(r, "Failed to open netlink container: %m");
2534
2535                 r = sd_rtnl_message_append_u16(m, IFLA_IPVLAN_MODE, IPVLAN_MODE_L2);
2536                 if (r < 0)
2537                         return log_error_errno(r, "Failed to add ipvlan mode: %m");
2538
2539                 r = sd_rtnl_message_close_container(m);
2540                 if (r < 0)
2541                         return log_error_errno(r, "Failed to close netlink container: %m");
2542
2543                 r = sd_rtnl_message_close_container(m);
2544                 if (r < 0)
2545                         return log_error_errno(r, "Failed to close netlink container: %m");
2546
2547                 r = sd_rtnl_call(rtnl, m, 0, NULL);
2548                 if (r < 0)
2549                         return log_error_errno(r, "Failed to add new ipvlan interfaces: %m");
2550         }
2551
2552         return 0;
2553 }
2554
/* Installs the seccomp filter for the calling process. The filter allows
 * everything by default and makes the following fail:
 *
 *   - a fixed set of syscalls (kexec_load, open_by_handle_at, iopl, ioperm,
 *     swapon, swapoff) with EPERM;
 *   - the kernel module syscalls with EPERM, unless CAP_SYS_MODULE is part
 *     of arg_retain;
 *   - socket(AF_NETLINK, *, NETLINK_AUDIT) with EAFNOSUPPORT.
 *
 * Returns 0 on success, the (logged) negative libseccomp error otherwise.
 * Without HAVE_SECCOMP this is a no-op that returns 0. */
static int setup_seccomp(void) {

#ifdef HAVE_SECCOMP
        static const int always_denied[] = {
                SCMP_SYS(kexec_load),
                SCMP_SYS(open_by_handle_at),
                SCMP_SYS(iopl),
                SCMP_SYS(ioperm),
                SCMP_SYS(swapon),
                SCMP_SYS(swapoff),
        };

        static const int module_denied[] = {
                SCMP_SYS(init_module),
                SCMP_SYS(finit_module),
                SCMP_SYS(delete_module),
        };

        scmp_filter_ctx ctx;
        unsigned k;
        int r;

        ctx = seccomp_init(SCMP_ACT_ALLOW);
        if (!ctx)
                return log_oom();

        r = seccomp_add_secondary_archs(ctx);
        if (r < 0) {
                log_error_errno(r, "Failed to add secondary archs to seccomp filter: %m");
                goto finish;
        }

        for (k = 0; k < ELEMENTSOF(always_denied); k++) {
                r = seccomp_rule_add(ctx, SCMP_ACT_ERRNO(EPERM), always_denied[k], 0);

                /* -EFAULT is tolerated: it is returned for an unknown syscall */
                if (r < 0 && r != -EFAULT) {
                        log_error_errno(r, "Failed to block syscall: %m");
                        goto finish;
                }
        }

        /* The kmod syscalls are only left usable when CAP_SYS_MODULE
         * was asked to be retained */
        if ((arg_retain & (1ULL << CAP_SYS_MODULE)) == 0)
                for (k = 0; k < ELEMENTSOF(module_denied); k++) {
                        r = seccomp_rule_add(ctx, SCMP_ACT_ERRNO(EPERM), module_denied[k], 0);

                        /* Same as above: skip syscalls that are not known */
                        if (r < 0 && r != -EFAULT) {
                                log_error_errno(r, "Failed to block syscall: %m");
                                goto finish;
                        }
                }

        /* Audit does not work in containers, and much of the audit
         * userspace fails when run in one. Hence refuse the creation
         * of audit sockets altogether: socket(AF_NETLINK, *,
         * NETLINK_AUDIT) fails with EAFNOSUPPORT, which audit
         * userspace takes as the sign that the kernel has audit
         * disabled. */
        r = seccomp_rule_add(
                        ctx,
                        SCMP_ACT_ERRNO(EAFNOSUPPORT),
                        SCMP_SYS(socket),
                        2,
                        SCMP_A0(SCMP_CMP_EQ, AF_NETLINK),
                        SCMP_A2(SCMP_CMP_EQ, NETLINK_AUDIT));
        if (r < 0) {
                log_error_errno(r, "Failed to add audit seccomp rule: %m");
                goto finish;
        }

        /* Loading the filter shall not turn on NO_NEW_PRIVS */
        r = seccomp_attr_set(ctx, SCMP_FLTATR_CTL_NNP, 0);
        if (r < 0) {
                log_error_errno(r, "Failed to unset NO_NEW_PRIVS: %m");
                goto finish;
        }

        r = seccomp_load(ctx);
        if (r < 0)
                log_error_errno(r, "Failed to install seccomp audit filter: %m");

finish:
        /* The context is released on success and on every failure path */
        seccomp_release(ctx);
        return r;
#else
        return 0;
#endif

}
2651
2652 static int setup_propagate(const char *root) {
2653         const char *p, *q;
2654
2655         (void) mkdir_p("/run/systemd/nspawn/", 0755);
2656         (void) mkdir_p("/run/systemd/nspawn/propagate", 0600);
2657         p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
2658         (void) mkdir_p(p, 0600);
2659
2660         q = strjoina(root, "/run/systemd/nspawn/incoming");
2661         mkdir_parents(q, 0755);
2662         mkdir_p(q, 0600);
2663
2664         if (mount(p, q, NULL, MS_BIND, NULL) < 0)
2665                 return log_error_errno(errno, "Failed to install propagation bind mount.");
2666
2667         if (mount(NULL, q, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY, NULL) < 0)
2668                 return log_error_errno(errno, "Failed to make propagation mount read-only");
2669
2670         return 0;
2671 }
2672
2673 static int setup_image(char **device_path, int *loop_nr) {
2674         struct loop_info64 info = {
2675                 .lo_flags = LO_FLAGS_AUTOCLEAR|LO_FLAGS_PARTSCAN
2676         };
2677         _cleanup_close_ int fd = -1, control = -1, loop = -1;
2678         _cleanup_free_ char* loopdev = NULL;
2679         struct stat st;
2680         int r, nr;
2681
2682         assert(device_path);
2683         assert(loop_nr);
2684         assert(arg_image);
2685
2686         fd = open(arg_image, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
2687         if (fd < 0)
2688                 return log_error_errno(errno, "Failed to open %s: %m", arg_image);
2689
2690         if (fstat(fd, &st) < 0)
2691                 return log_error_errno(errno, "Failed to stat %s: %m", arg_image);
2692
2693         if (S_ISBLK(st.st_mode)) {
2694                 char *p;
2695
2696                 p = strdup(arg_image);
2697                 if (!p)
2698                         return log_oom();
2699
2700                 *device_path = p;
2701
2702                 *loop_nr = -1;
2703
2704                 r = fd;
2705                 fd = -1;
2706
2707                 return r;
2708         }
2709
2710         if (!S_ISREG(st.st_mode)) {
2711                 log_error_errno(errno, "%s is not a regular file or block device: %m", arg_image);
2712                 return -EINVAL;
2713         }
2714
2715         control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
2716         if (control < 0)
2717                 return log_error_errno(errno, "Failed to open /dev/loop-control: %m");
2718
2719         nr = ioctl(control, LOOP_CTL_GET_FREE);
2720         if (nr < 0)
2721                 return log_error_errno(errno, "Failed to allocate loop device: %m");
2722
2723         if (asprintf(&loopdev, "/dev/loop%i", nr) < 0)
2724                 return log_oom();
2725
2726         loop = open(loopdev, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
2727         if (loop < 0)
2728                 return log_error_errno(errno, "Failed to open loop device %s: %m", loopdev);
2729
2730         if (ioctl(loop, LOOP_SET_FD, fd) < 0)
2731                 return log_error_errno(errno, "Failed to set loopback file descriptor on %s: %m", loopdev);
2732
2733         if (arg_read_only)
2734                 info.lo_flags |= LO_FLAGS_READ_ONLY;
2735
2736         if (ioctl(loop, LOOP_SET_STATUS64, &info) < 0)
2737                 return log_error_errno(errno, "Failed to set loopback settings on %s: %m", loopdev);
2738
2739         *device_path = loopdev;
2740         loopdev = NULL;
2741
2742         *loop_nr = nr;
2743
2744         r = loop;
2745         loop = -1;
2746
2747         return r;
2748 }
2749
/* Explanatory text appended to the error messages that are printed when no
 * usable partition can be identified in a disk image. */
#define PARTITION_TABLE_BLURB \
        "Note that the disk image needs to either contain only a single MBR partition of\n" \
        "type 0x83 that is marked bootable, or a single GPT partition of type " \
        "0FC63DAF-8483-4772-8E79-3D69D8477DE4 or follow\n" \
        "    http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/\n" \
        "to be bootable with systemd-nspawn."
2756
/* Identifies the partitions to use from the block device open on "fd".
 *
 * The partition table is probed with libblkid and has to be either GPT or
 * MBR ("dos"). The function then waits until udev lists as many child
 * devices of the block device as blkid found partitions (asking the kernel
 * to reread the partition table if fewer are listed), and classifies each
 * partition:
 *
 *   GPT: partitions flagged GPT_FLAG_NO_AUTO are skipped. For the types
 *        GPT_HOME, GPT_SRV, GPT_ROOT_NATIVE and GPT_ROOT_SECONDARY the
 *        partition with the lowest partition number wins. Partitions of
 *        type GPT_LINUX_GENERIC are remembered as "generic".
 *   MBR: partitions of type 0x83 whose flags equal 0x80 (bootable) are
 *        remembered as "generic".
 *
 * The root device is the native root if there is one, else the secondary
 * root (*secondary is set to true), else the generic partition, provided
 * there is exactly one of those.
 *
 * On success 0 is returned, *root_device / *root_device_rw / *secondary are
 * set, and *home_device / *home_device_rw and *srv_device / *srv_device_rw
 * are set only if such a partition exists (they are left untouched
 * otherwise). The device strings are newly allocated and owned by the
 * caller. On failure the error is logged and a negative value is returned.
 * Without HAVE_BLKID the function always fails with -ENOTSUP. */
static int dissect_image(
                int fd,
                char **root_device, bool *root_device_rw,
                char **home_device, bool *home_device_rw,
                char **srv_device, bool *srv_device_rw,
                bool *secondary) {

#ifdef HAVE_BLKID
        int home_nr = -1, srv_nr = -1;
#ifdef GPT_ROOT_NATIVE
        int root_nr = -1;
#endif
#ifdef GPT_ROOT_SECONDARY
        int secondary_root_nr = -1;
#endif
        _cleanup_free_ char *home = NULL, *root = NULL, *secondary_root = NULL, *srv = NULL, *generic = NULL;
        _cleanup_udev_enumerate_unref_ struct udev_enumerate *e = NULL;
        _cleanup_udev_device_unref_ struct udev_device *d = NULL;
        _cleanup_blkid_free_probe_ blkid_probe b = NULL;
        _cleanup_udev_unref_ struct udev *udev = NULL;
        struct udev_list_entry *first, *item;
        bool home_rw = true, root_rw = true, secondary_root_rw = true, srv_rw = true, generic_rw = true;
        bool is_gpt, is_mbr, multiple_generic = false;
        const char *pttype = NULL;
        blkid_partlist pl;
        struct stat st;
        unsigned i;
        int r;

        assert(fd >= 0);
        assert(root_device);
        assert(home_device);
        assert(srv_device);
        assert(secondary);
        assert(arg_image);

        b = blkid_new_probe();
        if (!b)
                return log_oom();

        /* errno is cleared before the blkid calls so that a failure
         * which does not set it can be told apart */
        errno = 0;
        r = blkid_probe_set_device(b, fd, 0, 0);
        if (r != 0) {
                if (errno == 0)
                        return log_oom();

                log_error_errno(errno, "Failed to set device on blkid probe: %m");
                return -errno;
        }

        blkid_probe_enable_partitions(b, 1);
        blkid_probe_set_partitions_flags(b, BLKID_PARTS_ENTRY_DETAILS);

        errno = 0;
        r = blkid_do_safeprobe(b);
        if (r == -2 || r == 1) {
                log_error("Failed to identify any partition table on\n"
                          "    %s\n"
                          PARTITION_TABLE_BLURB, arg_image);
                return -EINVAL;
        } else if (r != 0) {
                if (errno == 0)
                        errno = EIO;
                log_error_errno(errno, "Failed to probe: %m");
                return -errno;
        }

        blkid_probe_lookup_value(b, "PTTYPE", &pttype, NULL);

        is_gpt = streq_ptr(pttype, "gpt");
        is_mbr = streq_ptr(pttype, "dos");

        if (!is_gpt && !is_mbr) {
                log_error("No GPT or MBR partition table discovered on\n"
                          "    %s\n"
                          PARTITION_TABLE_BLURB, arg_image);
                return -EINVAL;
        }

        errno = 0;
        pl = blkid_probe_get_partitions(b);
        if (!pl) {
                if (errno == 0)
                        return log_oom();

                return log_error_errno(errno, "Failed to list partitions of %s: %m", arg_image);
        }

        udev = udev_new();
        if (!udev)
                return log_oom();

        if (fstat(fd, &st) < 0)
                return log_error_errno(errno, "Failed to stat block device: %m");

        d = udev_device_new_from_devnum(udev, 'b', st.st_rdev);
        if (!d)
                return log_oom();

        /* Wait (at most 10 rounds) until udev's view of the partitions
         * matches what blkid found. On leaving the loop via "break", e
         * holds the matching enumeration. */
        for (i = 0;; i++) {
                int n, m;

                if (i >= 10) {
                        log_error("Kernel partitions never appeared.");
                        return -ENXIO;
                }

                e = udev_enumerate_new(udev);
                if (!e)
                        return log_oom();

                r = udev_enumerate_add_match_parent(e, d);
                if (r < 0)
                        return log_oom();

                r = udev_enumerate_scan_devices(e);
                if (r < 0)
                        return log_error_errno(r, "Failed to scan for partition devices of %s: %m", arg_image);

                /* Count the partitions enumerated by the kernel */
                n = 0;
                first = udev_enumerate_get_list_entry(e);
                udev_list_entry_foreach(item, first)
                        n++;

                /* Count the partitions enumerated by blkid. The udev
                 * list is expected to hold one entry more than there
                 * are partitions (entries for the whole device itself
                 * are skipped further down). */
                m = blkid_partlist_numof_partitions(pl);
                if (n == m + 1)
                        break;
                if (n > m + 1) {
                        log_error("blkid and kernel partition list do not match.");
                        return -EIO;
                }
                if (n < m + 1) {
                        unsigned j;

                        /* The kernel has probed fewer partitions than
                         * blkid? Maybe the kernel prober is still
                         * running or it got EBUSY because udev
                         * already opened the device. Let's reprobe
                         * the device, which is a synchronous call
                         * that waits until probing is complete. */

                        for (j = 0; j < 20; j++) {

                                r = ioctl(fd, BLKRRPART, 0);
                                if (r < 0)
                                        r = -errno;
                                if (r >= 0 || r != -EBUSY)
                                        break;

                                /* If something else has the device
                                 * open, such as an udev rule, the
                                 * ioctl will return EBUSY. Since
                                 * there's no way to wait until it
                                 * isn't busy anymore, let's just wait
                                 * a bit, and try again.
                                 *
                                 * This is really something they
                                 * should fix in the kernel! */

                                usleep(50 * USEC_PER_MSEC);
                        }

                        if (r < 0)
                                return log_error_errno(r, "Failed to reread partition table: %m");
                }

                e = udev_enumerate_unref(e);
        }

        first = udev_enumerate_get_list_entry(e);
        udev_list_entry_foreach(item, first) {
                _cleanup_udev_device_unref_ struct udev_device *q = NULL;
                const char *node;
                unsigned long long flags;
                blkid_partition pp;
                dev_t qn;
                int nr;

                errno = 0;
                q = udev_device_new_from_syspath(udev, udev_list_entry_get_name(item));
                if (!q) {
                        if (!errno)
                                errno = ENOMEM;

                        log_error_errno(errno, "Failed to get partition device of %s: %m", arg_image);
                        return -errno;
                }

                qn = udev_device_get_devnum(q);
                if (major(qn) == 0)
                        continue;

                /* Skip the whole-disk device itself */
                if (st.st_rdev == qn)
                        continue;

                node = udev_device_get_devnode(q);
                if (!node)
                        continue;

                pp = blkid_partlist_devno_to_partition(pl, qn);
                if (!pp)
                        continue;

                flags = blkid_partition_get_flags(pp);

                nr = blkid_partition_get_partno(pp);
                if (nr < 0)
                        continue;

                if (is_gpt) {
                        sd_id128_t type_id;
                        const char *stype;

                        if (flags & GPT_FLAG_NO_AUTO)
                                continue;

                        stype = blkid_partition_get_type_string(pp);
                        if (!stype)
                                continue;

                        if (sd_id128_from_string(stype, &type_id) < 0)
                                continue;

                        if (sd_id128_equal(type_id, GPT_HOME)) {

                                /* Lowest partition number wins */
                                if (home && nr >= home_nr)
                                        continue;

                                home_nr = nr;
                                home_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&home, node);
                                if (r < 0)
                                        return log_oom();

                        } else if (sd_id128_equal(type_id, GPT_SRV)) {

                                if (srv && nr >= srv_nr)
                                        continue;

                                srv_nr = nr;
                                srv_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&srv, node);
                                if (r < 0)
                                        return log_oom();
                        }
#ifdef GPT_ROOT_NATIVE
                        else if (sd_id128_equal(type_id, GPT_ROOT_NATIVE)) {

                                if (root && nr >= root_nr)
                                        continue;

                                root_nr = nr;
                                root_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&root, node);
                                if (r < 0)
                                        return log_oom();
                        }
#endif
#ifdef GPT_ROOT_SECONDARY
                        else if (sd_id128_equal(type_id, GPT_ROOT_SECONDARY)) {

                                if (secondary_root && nr >= secondary_root_nr)
                                        continue;

                                secondary_root_nr = nr;
                                secondary_root_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&secondary_root, node);
                                if (r < 0)
                                        return log_oom();
                        }
#endif
                        else if (sd_id128_equal(type_id, GPT_LINUX_GENERIC)) {

                                if (generic)
                                        multiple_generic = true;
                                else {
                                        generic_rw = !(flags & GPT_FLAG_READ_ONLY);

                                        r = free_and_strdup(&generic, node);
                                        if (r < 0)
                                                return log_oom();
                                }
                        }

                } else if (is_mbr) {
                        int type;

                        if (flags != 0x80) /* Bootable flag */
                                continue;

                        type = blkid_partition_get_type(pp);
                        if (type != 0x83) /* Linux partition */
                                continue;

                        if (generic)
                                multiple_generic = true;
                        else {
                                generic_rw = true;

                                /* Record the partition as the generic
                                 * candidate (and not as root), so that
                                 * a second bootable Linux partition is
                                 * noticed through the check above and
                                 * refused below instead of silently
                                 * replacing the first one. */
                                r = free_and_strdup(&generic, node);
                                if (r < 0)
                                        return log_oom();
                        }
                }
        }

        if (root) {
                *root_device = root;
                root = NULL;

                *root_device_rw = root_rw;
                *secondary = false;
        } else if (secondary_root) {
                *root_device = secondary_root;
                secondary_root = NULL;

                *root_device_rw = secondary_root_rw;
                *secondary = true;
        } else if (generic) {

                /* There were no partitions with precise meanings
                 * around, but we found generic partitions. In this
                 * case, if there's only one, we can go ahead and boot
                 * it, otherwise we bail out, because we really cannot
                 * make any sense of it. */

                if (multiple_generic) {
                        log_error("Identified multiple bootable Linux partitions on\n"
                                  "    %s\n"
                                  PARTITION_TABLE_BLURB, arg_image);
                        return -EINVAL;
                }

                *root_device = generic;
                generic = NULL;

                *root_device_rw = generic_rw;
                *secondary = false;
        } else {
                log_error("Failed to identify root partition in disk image\n"
                          "    %s\n"
                          PARTITION_TABLE_BLURB, arg_image);
                return -EINVAL;
        }

        if (home) {
                *home_device = home;
                home = NULL;

                *home_device_rw = home_rw;
        }

        if (srv) {
                *srv_device = srv;
                srv = NULL;

                *srv_device_rw = srv_rw;
        }

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -ENOTSUP;
#endif
}
3129
/* Probes the block device "what" with libblkid to learn its file
 * system type and mounts it on "where", with "directory" appended
 * when one is given. The mount is read-only unless "rw" is set and
 * arg_read_only is off.
 *
 * Returns 0 on success and a negative errno-style value on failure.
 * When compiled without HAVE_BLKID this always fails with -ENOTSUP. */
static int mount_device(const char *what, const char *where, const char *directory, bool rw) {
#ifdef HAVE_BLKID
        _cleanup_blkid_free_probe_ blkid_probe b = NULL;
        const char *fstype, *p;
        int r;

        assert(what);
        assert(where);

        /* A global read-only request overrides the per-partition flag */
        if (arg_read_only)
                rw = false;

        if (directory)
                p = strjoina(where, directory);
        else
                p = where;

        /* libblkid does not reliably set errno, hence reset it first
         * and treat "still 0" as an allocation failure */
        errno = 0;
        b = blkid_new_probe_from_filename(what);
        if (!b) {
                if (errno == 0)
                        return log_oom();

                return log_error_errno(errno, "Failed to allocate prober for %s: %m", what);
        }

        blkid_probe_enable_superblocks(b, 1);
        blkid_probe_set_superblocks_flags(b, BLKID_SUBLKS_TYPE);

        errno = 0;
        r = blkid_do_safeprobe(b);
        if (r == -2 || r == 1) {
                /* 1: nothing detected, -2: ambivalent result. Neither
                 * is an I/O error, the device simply cannot be
                 * identified. */
                log_error("Cannot determine file system type of %s", what);
                return -EINVAL;
        } else if (r != 0) {
                /* -1: a real probing error */
                if (errno == 0)
                        errno = EIO;

                return log_error_errno(errno, "Failed to probe %s: %m", what);
        }

        errno = 0;
        if (blkid_probe_lookup_value(b, "TYPE", &fstype, NULL) < 0) {
                /* Pick the return value before logging, so that it
                 * does not depend on what logging does to errno */
                r = errno != 0 ? -errno : -EINVAL;
                log_error("Failed to determine file system type of %s", what);
                return r;
        }

        if (streq(fstype, "crypto_LUKS")) {
                log_error("nspawn currently does not support LUKS disk images.");
                return -ENOTSUP;
        }

        if (mount(what, p, fstype, MS_NODEV|(rw ? 0 : MS_RDONLY), NULL) < 0)
                return log_error_errno(errno, "Failed to mount %s: %m", what);

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -ENOTSUP;
#endif
}
3193
3194 static int mount_devices(
3195                 const char *where,
3196                 const char *root_device, bool root_device_rw,
3197                 const char *home_device, bool home_device_rw,
3198                 const char *srv_device, bool srv_device_rw) {
3199         int r;
3200
3201         assert(where);
3202
3203         if (root_device) {
3204                 r = mount_device(root_device, arg_directory, NULL, root_device_rw);
3205                 if (r < 0)
3206                         return log_error_errno(r, "Failed to mount root directory: %m");
3207         }
3208
3209         if (home_device) {
3210                 r = mount_device(home_device, arg_directory, "/home", home_device_rw);
3211                 if (r < 0)
3212                         return log_error_errno(r, "Failed to mount home directory: %m");
3213         }
3214
3215         if (srv_device) {
3216                 r = mount_device(srv_device, arg_directory, "/srv", srv_device_rw);
3217                 if (r < 0)
3218                         return log_error_errno(r, "Failed to mount server data directory: %m");
3219         }
3220
3221         return 0;
3222 }
3223
3224 static void loop_remove(int nr, int *image_fd) {
3225         _cleanup_close_ int control = -1;
3226         int r;
3227
3228         if (nr < 0)
3229                 return;
3230
3231         if (image_fd && *image_fd >= 0) {
3232                 r = ioctl(*image_fd, LOOP_CLR_FD);
3233                 if (r < 0)
3234                         log_debug_errno(errno, "Failed to close loop image: %m");
3235                 *image_fd = safe_close(*image_fd);
3236         }
3237
3238         control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
3239         if (control < 0) {
3240                 log_warning_errno(errno, "Failed to open /dev/loop-control: %m");
3241                 return;
3242         }
3243
3244         r = ioctl(control, LOOP_CTL_REMOVE, nr);
3245         if (r < 0)
3246                 log_debug_errno(errno, "Failed to remove loop %d: %m", nr);
3247 }
3248
3249 static int spawn_getent(const char *database, const char *key, pid_t *rpid) {
3250         int pipe_fds[2];
3251         pid_t pid;
3252
3253         assert(database);
3254         assert(key);
3255         assert(rpid);
3256
3257         if (pipe2(pipe_fds, O_CLOEXEC) < 0)
3258                 return log_error_errno(errno, "Failed to allocate pipe: %m");
3259
3260         pid = fork();
3261         if (pid < 0)
3262                 return log_error_errno(errno, "Failed to fork getent child: %m");
3263         else if (pid == 0) {
3264                 int nullfd;
3265                 char *empty_env = NULL;
3266
3267                 if (dup3(pipe_fds[1], STDOUT_FILENO, 0) < 0)
3268                         _exit(EXIT_FAILURE);
3269
3270                 if (pipe_fds[0] > 2)
3271                         safe_close(pipe_fds[0]);
3272                 if (pipe_fds[1] > 2)
3273                         safe_close(pipe_fds[1]);
3274
3275                 nullfd = open("/dev/null", O_RDWR);
3276                 if (nullfd < 0)
3277                         _exit(EXIT_FAILURE);
3278
3279                 if (dup3(nullfd, STDIN_FILENO, 0) < 0)
3280                         _exit(EXIT_FAILURE);
3281
3282                 if (dup3(nullfd, STDERR_FILENO, 0) < 0)
3283                         _exit(EXIT_FAILURE);
3284
3285                 if (nullfd > 2)
3286                         safe_close(nullfd);
3287
3288                 reset_all_signal_handlers();
3289                 close_all_fds(NULL, 0);
3290
3291                 execle("/usr/bin/getent", "getent", database, key, NULL, &empty_env);
3292                 execle("/bin/getent", "getent", database, key, NULL, &empty_env);
3293                 _exit(EXIT_FAILURE);
3294         }
3295
3296         pipe_fds[1] = safe_close(pipe_fds[1]);
3297
3298         *rpid = pid;
3299
3300         return pipe_fds[0];
3301 }
3302
3303 static int change_uid_gid(char **_home) {
3304         char line[LINE_MAX], *x, *u, *g, *h;
3305         const char *word, *state;
3306         _cleanup_free_ uid_t *uids = NULL;
3307         _cleanup_free_ char *home = NULL;
3308         _cleanup_fclose_ FILE *f = NULL;
3309         _cleanup_close_ int fd = -1;
3310         unsigned n_uids = 0;
3311         size_t sz = 0, l;
3312         uid_t uid;
3313         gid_t gid;
3314         pid_t pid;
3315         int r;
3316
3317         assert(_home);
3318
3319         if (!arg_user || streq(arg_user, "root") || streq(arg_user, "0")) {
3320                 /* Reset everything fully to 0, just in case */
3321
3322                 if (setgroups(0, NULL) < 0)
3323                         return log_error_errno(errno, "setgroups() failed: %m");
3324
3325                 if (setresgid(0, 0, 0) < 0)
3326                         return log_error_errno(errno, "setregid() failed: %m");
3327
3328                 if (setresuid(0, 0, 0) < 0)
3329                         return log_error_errno(errno, "setreuid() failed: %m");
3330
3331                 *_home = NULL;
3332                 return 0;
3333         }
3334
3335         /* First, get user credentials */
3336         fd = spawn_getent("passwd", arg_user, &pid);
3337         if (fd < 0)
3338                 return fd;
3339
3340         f = fdopen(fd, "r");
3341         if (!f)
3342                 return log_oom();
3343         fd = -1;
3344
3345         if (!fgets(line, sizeof(line), f)) {
3346
3347                 if (!ferror(f)) {
3348                         log_error("Failed to resolve user %s.", arg_user);
3349                         return -ESRCH;
3350                 }
3351
3352                 log_error_errno(errno, "Failed to read from getent: %m");
3353                 return -errno;
3354         }
3355
3356         truncate_nl(line);
3357
3358         wait_for_terminate_and_warn("getent passwd", pid, true);
3359
3360         x = strchr(line, ':');
3361         if (!x) {
3362                 log_error("/etc/passwd entry has invalid user field.");
3363                 return -EIO;
3364         }
3365
3366         u = strchr(x+1, ':');
3367         if (!u) {
3368                 log_error("/etc/passwd entry has invalid password field.");
3369                 return -EIO;
3370         }
3371
3372         u++;
3373         g = strchr(u, ':');
3374         if (!g) {
3375                 log_error("/etc/passwd entry has invalid UID field.");
3376                 return -EIO;
3377         }
3378
3379         *g = 0;
3380         g++;
3381         x = strchr(g, ':');
3382         if (!x) {
3383                 log_error("/etc/passwd entry has invalid GID field.");
3384                 return -EIO;
3385         }
3386
3387         *x = 0;
3388         h = strchr(x+1, ':');
3389         if (!h) {
3390                 log_error("/etc/passwd entry has invalid GECOS field.");
3391                 return -EIO;
3392         }
3393
3394         h++;
3395         x = strchr(h, ':');
3396         if (!x) {
3397                 log_error("/etc/passwd entry has invalid home directory field.");
3398                 return -EIO;
3399         }
3400
3401         *x = 0;
3402
3403         r = parse_uid(u, &uid);
3404         if (r < 0) {
3405                 log_error("Failed to parse UID of user.");
3406                 return -EIO;
3407         }
3408
3409         r = parse_gid(g, &gid);
3410         if (r < 0) {
3411                 log_error("Failed to parse GID of user.");
3412                 return -EIO;
3413         }
3414
3415         home = strdup(h);
3416         if (!home)
3417                 return log_oom();
3418
3419         /* Second, get group memberships */
3420         fd = spawn_getent("initgroups", arg_user, &pid);
3421         if (fd < 0)
3422                 return fd;
3423
3424         fclose(f);
3425         f = fdopen(fd, "r");
3426         if (!f)
3427                 return log_oom();
3428         fd = -1;
3429
3430         if (!fgets(line, sizeof(line), f)) {
3431                 if (!ferror(f)) {
3432                         log_error("Failed to resolve user %s.", arg_user);
3433                         return -ESRCH;
3434                 }
3435
3436                 log_error_errno(errno, "Failed to read from getent: %m");
3437                 return -errno;
3438         }
3439
3440         truncate_nl(line);
3441
3442         wait_for_terminate_and_warn("getent initgroups", pid, true);
3443
3444         /* Skip over the username and subsequent separator whitespace */
3445         x = line;
3446         x += strcspn(x, WHITESPACE);
3447         x += strspn(x, WHITESPACE);
3448
3449         FOREACH_WORD(word, l, x, state) {
3450                 char c[l+1];
3451
3452                 memcpy(c, word, l);
3453                 c[l] = 0;
3454
3455                 if (!GREEDY_REALLOC(uids, sz, n_uids+1))
3456                         return log_oom();
3457
3458                 r = parse_uid(c, &uids[n_uids++]);
3459                 if (r < 0) {
3460                         log_error("Failed to parse group data from getent.");
3461                         return -EIO;
3462                 }
3463         }
3464
3465         r = mkdir_parents(home, 0775);
3466         if (r < 0)
3467                 return log_error_errno(r, "Failed to make home root directory: %m");
3468
3469         r = mkdir_safe(home, 0755, uid, gid);
3470         if (r < 0 && r != -EEXIST)
3471                 return log_error_errno(r, "Failed to make home directory: %m");
3472
3473         fchown(STDIN_FILENO, uid, gid);
3474         fchown(STDOUT_FILENO, uid, gid);
3475         fchown(STDERR_FILENO, uid, gid);
3476
3477         if (setgroups(n_uids, uids) < 0)
3478                 return log_error_errno(errno, "Failed to set auxiliary groups: %m");
3479
3480         if (setresgid(gid, gid, gid) < 0)
3481                 return log_error_errno(errno, "setregid() failed: %m");
3482
3483         if (setresuid(uid, uid, uid) < 0)
3484                 return log_error_errno(errno, "setreuid() failed: %m");
3485
3486         if (_home) {
3487                 *_home = home;
3488                 home = NULL;
3489         }
3490
3491         return 0;
3492 }
3493
3494 /*
3495  * Return values:
3496  * < 0 : wait_for_terminate() failed to get the state of the
3497  *       container, the container was terminated by a signal, or
3498  *       failed for an unknown reason.  No change is made to the
3499  *       container argument.
3500  * > 0 : The program executed in the container terminated with an
3501  *       error.  The exit code of the program executed in the
3502  *       container is returned.  The container argument has been set
3503  *       to CONTAINER_TERMINATED.
3504  *   0 : The container is being rebooted, has been shut down or exited
3505  *       successfully.  The container argument has been set to either
3506  *       CONTAINER_TERMINATED or CONTAINER_REBOOTED.
3507  *
3508  * That is, success is indicated by a return value of zero, and an
3509  * error is indicated by a non-zero value.
3510  */
3511 static int wait_for_container(pid_t pid, ContainerStatus *container) {
3512         siginfo_t status;
3513         int r;
3514
3515         r = wait_for_terminate(pid, &status);
3516         if (r < 0)
3517                 return log_warning_errno(r, "Failed to wait for container: %m");
3518
3519         switch (status.si_code) {
3520
3521         case CLD_EXITED:
3522                 if (status.si_status == 0) {
3523                         log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s exited successfully.", arg_machine);
3524
3525                 } else
3526                         log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s failed with error code %i.", arg_machine, status.si_status);
3527
3528                 *container = CONTAINER_TERMINATED;
3529                 return status.si_status;
3530
3531         case CLD_KILLED:
3532                 if (status.si_status == SIGINT) {
3533
3534                         log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s has been shut down.", arg_machine);
3535                         *container = CONTAINER_TERMINATED;
3536                         return 0;
3537
3538                 } else if (status.si_status == SIGHUP) {
3539
3540                         log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s is being rebooted.", arg_machine);
3541                         *container = CONTAINER_REBOOTED;
3542                         return 0;
3543                 }
3544
3545                 /* CLD_KILLED fallthrough */
3546
3547         case CLD_DUMPED:
3548                 log_error("Container %s terminated by signal %s.", arg_machine, signal_to_string(status.si_status));
3549                 return -EIO;
3550
3551         default:
3552                 log_error("Container %s failed due to unknown reason.", arg_machine);
3553                 return -EIO;
3554         }
3555
3556         return r;
3557 }
3558
/* Signal handler that deliberately does nothing. main() installs it
 * for SIGCHLD so that the signal interrupts blocking calls of the
 * parent instead of being ignored. */
static void nop_handler(int sig) {}
3560
3561 static int on_orderly_shutdown(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) {
3562         pid_t pid;
3563
3564         pid = PTR_TO_UINT32(userdata);
3565         if (pid > 0) {
3566                 if (kill(pid, SIGRTMIN+3) >= 0) {
3567                         log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
3568                         sd_event_source_set_userdata(s, NULL);
3569                         return 0;
3570                 }
3571         }
3572
3573         sd_event_exit(sd_event_source_get_event(s), 0);
3574         return 0;
3575 }
3576
3577 static int determine_names(void) {
3578         int r;
3579
3580         if (!arg_image && !arg_directory) {
3581                 if (arg_machine) {
3582                         _cleanup_(image_unrefp) Image *i = NULL;
3583
3584                         r = image_find(arg_machine, &i);
3585                         if (r < 0)
3586                                 return log_error_errno(r, "Failed to find image for machine '%s': %m", arg_machine);
3587                         else if (r == 0) {
3588                                 log_error("No image for machine '%s': %m", arg_machine);
3589                                 return -ENOENT;
3590                         }
3591
3592                         if (i->type == IMAGE_RAW)
3593                                 r = set_sanitized_path(&arg_image, i->path);
3594                         else
3595                                 r = set_sanitized_path(&arg_directory, i->path);
3596                         if (r < 0)
3597                                 return log_error_errno(r, "Invalid image directory: %m");
3598
3599                         arg_read_only = arg_read_only || i->read_only;
3600                 } else
3601                         arg_directory = get_current_dir_name();
3602
3603                 if (!arg_directory && !arg_machine) {
3604                         log_error("Failed to determine path, please use -D or -i.");
3605                         return -EINVAL;
3606                 }
3607         }
3608
3609         if (!arg_machine) {
3610                 if (arg_directory && path_equal(arg_directory, "/"))
3611                         arg_machine = gethostname_malloc();
3612                 else
3613                         arg_machine = strdup(basename(arg_image ?: arg_directory));
3614
3615                 if (!arg_machine)
3616                         return log_oom();
3617
3618                 hostname_cleanup(arg_machine, false);
3619                 if (!machine_name_is_valid(arg_machine)) {
3620                         log_error("Failed to determine machine name automatically, please use -M.");
3621                         return -EINVAL;
3622                 }
3623
3624                 if (arg_ephemeral) {
3625                         char *b;
3626
3627                         /* Add a random suffix when this is an
3628                          * ephemeral machine, so that we can run many
3629                          * instances at once without manually having
3630                          * to specify -M each time. */
3631
3632                         if (asprintf(&b, "%s-%016" PRIx64, arg_machine, random_u64()) < 0)
3633                                 return log_oom();
3634
3635                         free(arg_machine);
3636                         arg_machine = b;
3637                 }
3638         }
3639
3640         return 0;
3641 }
3642
3643 static int determine_uid_shift(void) {
3644         int r;
3645
3646         if (!arg_userns)
3647                 return 0;
3648
3649         if (arg_uid_shift == UID_INVALID) {
3650                 struct stat st;
3651
3652                 r = stat(arg_directory, &st);
3653                 if (r < 0)
3654                         return log_error_errno(errno, "Failed to determine UID base of %s: %m", arg_directory);
3655
3656                 arg_uid_shift = st.st_uid & UINT32_C(0xffff0000);
3657
3658                 if (arg_uid_shift != (st.st_gid & UINT32_C(0xffff0000))) {
3659                         log_error("UID and GID base of %s don't match.", arg_directory);
3660                         return -EINVAL;
3661                 }
3662
3663                 arg_uid_range = UINT32_C(0x10000);
3664         }
3665
3666         if (arg_uid_shift > (uid_t) -1 - arg_uid_range) {
3667                 log_error("UID base too high for UID range.");
3668                 return -EINVAL;
3669         }
3670
3671         log_info("Using user namespaces with base " UID_FMT " and range " UID_FMT ".", arg_uid_shift, arg_uid_range);
3672         return 0;
3673 }
3674
3675 int main(int argc, char *argv[]) {
3676
3677         _cleanup_free_ char *device_path = NULL, *root_device = NULL, *home_device = NULL, *srv_device = NULL, *console = NULL;
3678         bool root_device_rw = true, home_device_rw = true, srv_device_rw = true;
3679         _cleanup_close_ int master = -1, image_fd = -1;
3680         _cleanup_fdset_free_ FDSet *fds = NULL;
3681         int r, n_fd_passed, loop_nr = -1;
3682         char veth_name[IFNAMSIZ];
3683         bool secondary = false, remove_subvol = false;
3684         sigset_t mask, mask_chld;
3685         pid_t pid = 0;
3686         int ret = EXIT_SUCCESS;
3687         union in_addr_union exposed = {};
3688         _cleanup_release_lock_file_ LockFile tree_global_lock = LOCK_FILE_INIT, tree_local_lock = LOCK_FILE_INIT;
3689         bool interactive;
3690
3691         log_parse_environment();
3692         log_open();
3693
3694         r = parse_argv(argc, argv);
3695         if (r <= 0)
3696                 goto finish;
3697
3698         r = determine_names();
3699         if (r < 0)
3700                 goto finish;
3701
3702         if (geteuid() != 0) {
3703                 log_error("Need to be root.");
3704                 r = -EPERM;
3705                 goto finish;
3706         }
3707
3708         if (sd_booted() <= 0) {
3709                 log_error("Not running on a systemd system.");
3710                 r = -EINVAL;
3711                 goto finish;
3712         }
3713
3714         log_close();
3715         n_fd_passed = sd_listen_fds(false);
3716         if (n_fd_passed > 0) {
3717                 r = fdset_new_listen_fds(&fds, false);
3718                 if (r < 0) {
3719                         log_error_errno(r, "Failed to collect file descriptors: %m");
3720                         goto finish;
3721                 }
3722         }
3723         fdset_close_others(fds);
3724         log_open();
3725
3726         if (arg_directory) {
3727                 assert(!arg_image);
3728
3729                 if (path_equal(arg_directory, "/") && !arg_ephemeral) {
3730                         log_error("Spawning container on root directory is not supported. Consider using --ephemeral.");
3731                         r = -EINVAL;
3732                         goto finish;
3733                 }
3734
3735                 if (arg_ephemeral) {
3736                         char *np;
3737
3738                         /* If the specified path is a mount point we
3739                          * generate the new snapshot immediately
3740                          * inside it under a random name. However if
3741                          * the specified is not a mount point we
3742                          * create the new snapshot in the parent
3743                          * directory, just next to it. */
3744                         r = path_is_mount_point(arg_directory, false);
3745                         if (r < 0) {
3746                                 log_error_errno(r, "Failed to determine whether directory %s is mount point: %m", arg_directory);
3747                                 goto finish;
3748                         }
3749                         if (r > 0)
3750                                 r = tempfn_random_child(arg_directory, &np);
3751                         else
3752                                 r = tempfn_random(arg_directory, &np);
3753                         if (r < 0) {
3754                                 log_error_errno(r, "Failed to generate name for snapshot: %m");
3755                                 goto finish;
3756                         }
3757
3758                         r = image_path_lock(np, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3759                         if (r < 0) {
3760                                 log_error_errno(r, "Failed to lock %s: %m", np);
3761                                 goto finish;
3762                         }
3763
3764                         r = btrfs_subvol_snapshot(arg_directory, np, arg_read_only, true);
3765                         if (r < 0) {
3766                                 free(np);
3767                                 log_error_errno(r, "Failed to create snapshot %s from %s: %m", np, arg_directory);
3768                                 goto finish;
3769                         }
3770
3771                         free(arg_directory);
3772                         arg_directory = np;
3773
3774                         remove_subvol = true;
3775
3776                 } else {
3777                         r = image_path_lock(arg_directory, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3778                         if (r == -EBUSY) {
3779                                 log_error_errno(r, "Directory tree %s is currently busy.", arg_directory);
3780                                 goto finish;
3781                         }
3782                         if (r < 0) {
3783                                 log_error_errno(r, "Failed to lock %s: %m", arg_directory);
3784                                 return r;
3785                         }
3786
3787                         if (arg_template) {
3788                                 r = btrfs_subvol_snapshot(arg_template, arg_directory, arg_read_only, true);
3789                                 if (r == -EEXIST) {
3790                                         if (!arg_quiet)
3791                                                 log_info("Directory %s already exists, not populating from template %s.", arg_directory, arg_template);
3792                                 } else if (r < 0) {
3793                                         log_error_errno(r, "Couldn't create snapshot %s from %s: %m", arg_directory, arg_template);
3794                                         goto finish;
3795                                 } else {
3796                                         if (!arg_quiet)
3797                                                 log_info("Populated %s from template %s.", arg_directory, arg_template);
3798                                 }
3799                         }
3800                 }
3801
3802                 if (arg_boot) {
3803                         if (path_is_os_tree(arg_directory) <= 0) {
3804                                 log_error("Directory %s doesn't look like an OS root directory (os-release file is missing). Refusing.", arg_directory);
3805                                 r = -EINVAL;
3806                                 goto finish;
3807                         }
3808                 } else {
3809                         const char *p;
3810
3811                         p = strjoina(arg_directory,
3812                                        argc > optind && path_is_absolute(argv[optind]) ? argv[optind] : "/usr/bin/");
3813                         if (access(p, F_OK) < 0) {
3814                                 log_error("Directory %s lacks the binary to execute or doesn't look like a binary tree. Refusing.", arg_directory);
3815                                 r = -EINVAL;
3816                                 goto finish;
3817                         }
3818                 }
3819
3820         } else {
3821                 char template[] = "/tmp/nspawn-root-XXXXXX";
3822
3823                 assert(arg_image);
3824                 assert(!arg_template);
3825
3826                 r = image_path_lock(arg_image, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3827                 if (r == -EBUSY) {
3828                         r = log_error_errno(r, "Disk image %s is currently busy.", arg_image);
3829                         goto finish;
3830                 }
3831                 if (r < 0) {
3832                         r = log_error_errno(r, "Failed to create image lock: %m");
3833                         goto finish;
3834                 }
3835
3836                 if (!mkdtemp(template)) {
3837                         log_error_errno(errno, "Failed to create temporary directory: %m");
3838                         r = -errno;
3839                         goto finish;
3840                 }
3841
3842                 arg_directory = strdup(template);
3843                 if (!arg_directory) {
3844                         r = log_oom();
3845                         goto finish;
3846                 }
3847
3848                 image_fd = setup_image(&device_path, &loop_nr);
3849                 if (image_fd < 0) {
3850                         r = image_fd;
3851                         goto finish;
3852                 }
3853
3854                 r = dissect_image(image_fd,
3855                                   &root_device, &root_device_rw,
3856                                   &home_device, &home_device_rw,
3857                                   &srv_device, &srv_device_rw,
3858                                   &secondary);
3859                 if (r < 0)
3860                         goto finish;
3861         }
3862
3863         r = determine_uid_shift();
3864         if (r < 0)
3865                 goto finish;
3866
3867         interactive = isatty(STDIN_FILENO) > 0 && isatty(STDOUT_FILENO) > 0;
3868
3869         master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
3870         if (master < 0) {
3871                 r = log_error_errno(errno, "Failed to acquire pseudo tty: %m");
3872                 goto finish;
3873         }
3874
3875         r = ptsname_malloc(master, &console);
3876         if (r < 0) {
3877                 r = log_error_errno(r, "Failed to determine tty name: %m");
3878                 goto finish;
3879         }
3880
3881         if (unlockpt(master) < 0) {
3882                 r = log_error_errno(errno, "Failed to unlock tty: %m");
3883                 goto finish;
3884         }
3885
3886         if (!arg_quiet)
3887                 log_info("Spawning container %s on %s.\nPress ^] three times within 1s to kill container.",
3888                          arg_machine, arg_image ?: arg_directory);
3889
3890         assert_se(sigemptyset(&mask) == 0);
3891         sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
3892         assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
3893
3894         assert_se(sigemptyset(&mask_chld) == 0);
3895         assert_se(sigaddset(&mask_chld, SIGCHLD) == 0);
3896
3897         for (;;) {
3898                 _cleanup_close_pair_ int kmsg_socket_pair[2] = { -1, -1 }, rtnl_socket_pair[2] = { -1, -1 };
3899                 ContainerStatus container_status;
3900                 _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
3901                 struct sigaction sa = {
3902                         .sa_handler = nop_handler,
3903                         .sa_flags = SA_NOCLDSTOP,
3904                 };
3905
3906                 r = barrier_create(&barrier);
3907                 if (r < 0) {
3908                         log_error_errno(r, "Cannot initialize IPC barrier: %m");
3909                         goto finish;
3910                 }
3911
3912                 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
3913                         r = log_error_errno(errno, "Failed to create kmsg socket pair: %m");
3914                         goto finish;
3915                 }
3916
3917                 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, rtnl_socket_pair) < 0) {
3918                         r = log_error_errno(errno, "Failed to create rtnl socket pair: %m");
3919                         goto finish;
3920                 }
3921
3922                 /* Child can be killed before execv(), so handle SIGCHLD
3923                  * in order to interrupt parent's blocking calls and
3924                  * give it a chance to call wait() and terminate. */
3925                 r = sigprocmask(SIG_UNBLOCK, &mask_chld, NULL);
3926                 if (r < 0) {
3927                         r = log_error_errno(errno, "Failed to change the signal mask: %m");
3928                         goto finish;
3929                 }
3930
3931                 r = sigaction(SIGCHLD, &sa, NULL);
3932                 if (r < 0) {
3933                         r = log_error_errno(errno, "Failed to install SIGCHLD handler: %m");
3934                         goto finish;
3935                 }
3936
3937                 pid = raw_clone(SIGCHLD|CLONE_NEWNS|
3938                                 (arg_share_system ? 0 : CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS)|
3939                                 (arg_private_network ? CLONE_NEWNET : 0), NULL);
3940                 if (pid < 0) {
3941                         if (errno == EINVAL)
3942                                 r = log_error_errno(errno, "clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
3943                         else
3944                                 r = log_error_errno(errno, "clone() failed: %m");
3945
3946                         goto finish;
3947                 }
3948
3949                 if (pid == 0) {
3950                         /* child */
3951                         _cleanup_free_ char *home = NULL;
3952                         unsigned n_env = 2;
3953                         const char *envp[] = {
3954                                 "PATH=" DEFAULT_PATH_SPLIT_USR,
3955                                 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
3956                                 NULL, /* TERM */
3957                                 NULL, /* HOME */
3958                                 NULL, /* USER */
3959                                 NULL, /* LOGNAME */
3960                                 NULL, /* container_uuid */
3961                                 NULL, /* LISTEN_FDS */
3962                                 NULL, /* LISTEN_PID */
3963                                 NULL
3964                         };
3965                         char **env_use;
3966
3967                         barrier_set_role(&barrier, BARRIER_CHILD);
3968
3969                         envp[n_env] = strv_find_prefix(environ, "TERM=");
3970                         if (envp[n_env])
3971                                 n_env ++;
3972
3973                         master = safe_close(master);
3974
3975                         kmsg_socket_pair[0] = safe_close(kmsg_socket_pair[0]);
3976                         rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
3977
3978                         reset_all_signal_handlers();
3979                         reset_signal_mask();
3980
3981                         if (interactive) {
3982                                 close_nointr(STDIN_FILENO);
3983                                 close_nointr(STDOUT_FILENO);
3984                                 close_nointr(STDERR_FILENO);
3985
3986                                 r = open_terminal(console, O_RDWR);
3987                                 if (r != STDIN_FILENO) {
3988                                         if (r >= 0) {
3989                                                 safe_close(r);
3990                                                 r = -EINVAL;
3991                                         }
3992
3993                                         log_error_errno(r, "Failed to open console: %m");
3994                                         _exit(EXIT_FAILURE);
3995                                 }
3996
3997                                 if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
3998                                     dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO) {
3999                                         log_error_errno(errno, "Failed to duplicate console: %m");
4000                                         _exit(EXIT_FAILURE);
4001                                 }
4002                         }
4003
4004                         if (setsid() < 0) {
4005                                 log_error_errno(errno, "setsid() failed: %m");
4006                                 _exit(EXIT_FAILURE);
4007                         }
4008
4009                         if (reset_audit_loginuid() < 0)
4010                                 _exit(EXIT_FAILURE);
4011
4012                         if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
4013                                 log_error_errno(errno, "PR_SET_PDEATHSIG failed: %m");
4014                                 _exit(EXIT_FAILURE);
4015                         }
4016
4017                         if (arg_private_network)
4018                                 loopback_setup();
4019
4020                         /* Mark everything as slave, so that we still
4021                          * receive mounts from the real root, but don't
4022                          * propagate mounts to the real root. */
4023                         if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
4024                                 log_error_errno(errno, "MS_SLAVE|MS_REC failed: %m");
4025                                 _exit(EXIT_FAILURE);
4026                         }
4027
4028                         if (mount_devices(arg_directory,
4029                                           root_device, root_device_rw,
4030                                           home_device, home_device_rw,
4031                                           srv_device, srv_device_rw) < 0)
4032                                 _exit(EXIT_FAILURE);
4033
4034                         /* Turn directory into bind mount */
4035                         if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REC, NULL) < 0) {
4036                                 log_error_errno(errno, "Failed to make bind mount: %m");
4037                                 _exit(EXIT_FAILURE);
4038                         }
4039
4040                         r = setup_volatile(arg_directory);
4041                         if (r < 0)
4042                                 _exit(EXIT_FAILURE);
4043
4044                         if (setup_volatile_state(arg_directory) < 0)
4045                                 _exit(EXIT_FAILURE);
4046
4047                         r = base_filesystem_create(arg_directory);
4048                         if (r < 0)
4049                                 _exit(EXIT_FAILURE);
4050
4051                         if (arg_read_only) {
4052                                 r = bind_remount_recursive(arg_directory, true);
4053                                 if (r < 0) {
4054                                         log_error_errno(r, "Failed to make tree read-only: %m");
4055                                         _exit(EXIT_FAILURE);
4056                                 }
4057                         }
4058
4059                         if (mount_all(arg_directory) < 0)
4060                                 _exit(EXIT_FAILURE);
4061
4062                         if (copy_devnodes(arg_directory) < 0)
4063                                 _exit(EXIT_FAILURE);
4064
4065                         if (setup_ptmx(arg_directory) < 0)
4066                                 _exit(EXIT_FAILURE);
4067
4068                         dev_setup(arg_directory);
4069
4070                         if (setup_propagate(arg_directory) < 0)
4071                                 _exit(EXIT_FAILURE);
4072
4073                         if (setup_seccomp() < 0)
4074                                 _exit(EXIT_FAILURE);
4075
4076                         if (setup_dev_console(arg_directory, console) < 0)
4077                                 _exit(EXIT_FAILURE);
4078
4079                         if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
4080                                 _exit(EXIT_FAILURE);
4081                         kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
4082
4083                         if (send_rtnl(rtnl_socket_pair[1]) < 0)
4084                                 _exit(EXIT_FAILURE);
4085                         rtnl_socket_pair[1] = safe_close(rtnl_socket_pair[1]);
4086
4087                         /* Tell the parent that we are ready, and that
4088                          * it can cgroupify us so that we lack access
4089                          * to certain devices and resources. */
4090                         (void) barrier_place(&barrier); /* #1 */
4091
4092                         if (setup_boot_id(arg_directory) < 0)
4093                                 _exit(EXIT_FAILURE);
4094
4095                         if (setup_timezone(arg_directory) < 0)
4096                                 _exit(EXIT_FAILURE);
4097
4098                         if (setup_resolv_conf(arg_directory) < 0)
4099                                 _exit(EXIT_FAILURE);
4100
4101                         if (setup_journal(arg_directory) < 0)
4102                                 _exit(EXIT_FAILURE);
4103
4104                         if (mount_binds(arg_directory, arg_bind, false) < 0)
4105                                 _exit(EXIT_FAILURE);
4106
4107                         if (mount_binds(arg_directory, arg_bind_ro, true) < 0)
4108                                 _exit(EXIT_FAILURE);
4109
4110                         if (mount_tmpfs(arg_directory) < 0)
4111                                 _exit(EXIT_FAILURE);
4112
4113                         /* Wait until we are cgroup-ified, so that we
4114                          * can mount the right cgroup path writable */
4115                         (void) barrier_place_and_sync(&barrier); /* #2 */
4116
4117                         if (mount_cgroup(arg_directory) < 0)
4118                                 _exit(EXIT_FAILURE);
4119
4120                         if (chdir(arg_directory) < 0) {
4121                                 log_error_errno(errno, "chdir(%s) failed: %m", arg_directory);
4122                                 _exit(EXIT_FAILURE);
4123                         }
4124
4125                         if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) {
4126                                 log_error_errno(errno, "mount(MS_MOVE) failed: %m");
4127                                 _exit(EXIT_FAILURE);
4128                         }
4129
4130                         if (chroot(".") < 0) {
4131                                 log_error_errno(errno, "chroot() failed: %m");
4132                                 _exit(EXIT_FAILURE);
4133                         }
4134
4135                         if (chdir("/") < 0) {
4136                                 log_error_errno(errno, "chdir() failed: %m");
4137                                 _exit(EXIT_FAILURE);
4138                         }
4139
4140                         if (arg_userns) {
4141                                 if (unshare(CLONE_NEWUSER) < 0) {
4142                                         log_error_errno(errno, "unshare(CLONE_NEWUSER) failed: %m");
4143                                         _exit(EXIT_FAILURE);
4144                                 }
4145
4146                                 /* Tell the parent, that it now can
4147                                  * write the UID map. */
4148                                 (void) barrier_place(&barrier); /* #3 */
4149
4150                                 /* Wait until the parent wrote the UID
4151                                  * map */
4152                                 (void) barrier_place_and_sync(&barrier); /* #4 */
4153                         }
4154
4155                         umask(0022);
4156
4157                         if (drop_capabilities() < 0) {
4158                                 log_error_errno(errno, "drop_capabilities() failed: %m");
4159                                 _exit(EXIT_FAILURE);
4160                         }
4161
4162                         setup_hostname();
4163
4164                         if (arg_personality != 0xffffffffLU) {
4165                                 if (personality(arg_personality) < 0) {
4166                                         log_error_errno(errno, "personality() failed: %m");
4167                                         _exit(EXIT_FAILURE);
4168                                 }
4169                         } else if (secondary) {
4170                                 if (personality(PER_LINUX32) < 0) {
4171                                         log_error_errno(errno, "personality() failed: %m");
4172                                         _exit(EXIT_FAILURE);
4173                                 }
4174                         }
4175
4176 #ifdef HAVE_SELINUX
4177                         if (arg_selinux_context)
4178                                 if (setexeccon((security_context_t) arg_selinux_context) < 0) {
4179                                         log_error_errno(errno, "setexeccon(\"%s\") failed: %m", arg_selinux_context);
4180                                         _exit(EXIT_FAILURE);
4181                                 }
4182 #endif
4183
4184                         r = change_uid_gid(&home);
4185                         if (r < 0)
4186                                 _exit(EXIT_FAILURE);
4187
4188                         if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
4189                             (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
4190                             (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
4191                                 log_oom();
4192                                 _exit(EXIT_FAILURE);
4193                         }
4194
4195                         if (!sd_id128_equal(arg_uuid, SD_ID128_NULL)) {
4196                                 char as_uuid[37];
4197
4198                                 if (asprintf((char**)(envp + n_env++), "container_uuid=%s", id128_format_as_uuid(arg_uuid, as_uuid)) < 0) {
4199                                         log_oom();
4200                                         _exit(EXIT_FAILURE);
4201                                 }
4202                         }
4203
4204                         if (fdset_size(fds) > 0) {
4205                                 r = fdset_cloexec(fds, false);
4206                                 if (r < 0) {
4207                                         log_error_errno(r, "Failed to unset O_CLOEXEC for file descriptors.");
4208                                         _exit(EXIT_FAILURE);
4209                                 }
4210
4211                                 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", n_fd_passed) < 0) ||
4212                                     (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0)) {
4213                                         log_oom();
4214                                         _exit(EXIT_FAILURE);
4215                                 }
4216                         }
4217
4218                         if (!strv_isempty(arg_setenv)) {
4219                                 char **n;
4220
4221                                 n = strv_env_merge(2, envp, arg_setenv);
4222                                 if (!n) {
4223                                         log_oom();
4224                                         _exit(EXIT_FAILURE);
4225                                 }
4226
4227                                 env_use = n;
4228                         } else
4229                                 env_use = (char**) envp;
4230
4231                         /* Let the parent know that we are ready and
4232                          * wait until the parent is ready with the
4233                          * setup, too... */
4234                         (void) barrier_place_and_sync(&barrier); /* #5 */
4235
4236                         if (arg_boot) {
4237                                 char **a;
4238                                 size_t l;
4239
4240                                 /* Automatically search for the init system */
4241
4242                                 l = 1 + argc - optind;
4243                                 a = newa(char*, l + 1);
4244                                 memcpy(a + 1, argv + optind, l * sizeof(char*));
4245
4246                                 a[0] = (char*) "/usr/lib/systemd/systemd";
4247                                 execve(a[0], a, env_use);
4248
4249                                 a[0] = (char*) "/lib/systemd/systemd";
4250                                 execve(a[0], a, env_use);
4251
4252                                 a[0] = (char*) "/sbin/init";
4253                                 execve(a[0], a, env_use);
4254                         } else if (argc > optind)
4255                                 execvpe(argv[optind], argv + optind, env_use);
4256                         else {
4257                                 chdir(home ? home : "/root");
4258                                 execle("/bin/bash", "-bash", NULL, env_use);
4259                                 execle("/bin/sh", "-sh", NULL, env_use);
4260                         }
4261
4262                         log_error_errno(errno, "execv() failed: %m");
4263                         _exit(EXIT_FAILURE);
4264                 }
4265
4266                 barrier_set_role(&barrier, BARRIER_PARENT);
4267                 fdset_free(fds);
4268                 fds = NULL;
4269
4270                 kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
4271                 rtnl_socket_pair[1] = safe_close(rtnl_socket_pair[1]);
4272
4273                 (void) barrier_place(&barrier); /* #1 */
4274
4275                 /* Wait for the most basic Child-setup to be done,
4276                  * before we add hardware to it, and place it in a
4277                  * cgroup. */
4278                 if (barrier_sync(&barrier)) { /* #1 */
4279                         int ifi = 0;
4280
4281                         r = move_network_interfaces(pid);
4282                         if (r < 0)
4283                                 goto finish;
4284
4285                         r = setup_veth(pid, veth_name, &ifi);
4286                         if (r < 0)
4287                                 goto finish;
4288
4289                         r = setup_bridge(veth_name, &ifi);
4290                         if (r < 0)
4291                                 goto finish;
4292
4293                         r = setup_macvlan(pid);
4294                         if (r < 0)
4295                                 goto finish;
4296
4297                         r = setup_ipvlan(pid);
4298                         if (r < 0)
4299                                 goto finish;
4300
4301                         r = register_machine(pid, ifi);
4302                         if (r < 0)
4303                                 goto finish;
4304
4305                         /* Notify the child that the parent is ready with all
4306                          * its setup, and that the child can now hand over
4307                          * control to the code to run inside the container. */
4308                         (void) barrier_place(&barrier); /* #2 */
4309
4310                         if (arg_userns) {
4311                                 char uid_map[strlen("/proc//uid_map") + DECIMAL_STR_MAX(uid_t) + 1], line[DECIMAL_STR_MAX(uid_t)*3+3+1];
4312
4313                                 (void) barrier_place_and_sync(&barrier); /* #3 */
4314
4315                                 xsprintf(uid_map, "/proc/" PID_FMT "/uid_map", pid);
4316                                 xsprintf(line, UID_FMT " " UID_FMT " " UID_FMT "\n", 0, arg_uid_shift, arg_uid_range);
4317                                 r = write_string_file(uid_map, line);
4318                                 if (r < 0) {
4319                                         log_error_errno(r, "Failed to write UID map: %m");
4320                                         goto finish;
4321                                 }
4322
4323                                 /* We always assign the same UID and GID ranges */
4324                                 xsprintf(uid_map, "/proc/" PID_FMT "/gid_map", pid);
4325                                 r = write_string_file(uid_map, line);
4326                                 if (r < 0) {
4327                                         log_error_errno(r, "Failed to write GID map: %m");
4328                                         goto finish;
4329                                 }
4330
4331                                 (void) barrier_place(&barrier); /* #4 */
4332                         }
4333
4334                         /* Block SIGCHLD here, before notifying child.
4335                          * process_pty() will handle it with the other signals. */
4336                         r = sigprocmask(SIG_BLOCK, &mask_chld, NULL);
4337                         if (r < 0)
4338                                 goto finish;
4339
4340                         /* Reset signal to default */
4341                         r = default_signals(SIGCHLD, -1);
4342                         if (r < 0)
4343                                 goto finish;
4344
4345                         /* Let the child know that we are ready, and wait until the child is completely ready, too. */
4346                         if (barrier_place_and_sync(&barrier)) { /* #5 */
4347                                 _cleanup_event_unref_ sd_event *event = NULL;
4348                                 _cleanup_(pty_forward_freep) PTYForward *forward = NULL;
4349                                 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
4350                                 char last_char = 0;
4351
4352                                 sd_notifyf(false,
4353                                            "READY=1\n"
4354                                            "STATUS=Container running.\n"
4355                                            "X_NSPAWN_LEADER_PID=" PID_FMT, pid);
4356
4357                                 r = sd_event_new(&event);
4358                                 if (r < 0) {
4359                                         log_error_errno(r, "Failed to get default event source: %m");
4360                                         goto finish;
4361                                 }
4362
4363                                 if (arg_boot) {
4364                                         /* Try to kill the init system on SIGINT or SIGTERM */
4365                                         sd_event_add_signal(event, NULL, SIGINT, on_orderly_shutdown, UINT32_TO_PTR(pid));
4366                                         sd_event_add_signal(event, NULL, SIGTERM, on_orderly_shutdown, UINT32_TO_PTR(pid));
4367                                 } else {
4368                                         /* Immediately exit */
4369                                         sd_event_add_signal(event, NULL, SIGINT, NULL, NULL);
4370                                         sd_event_add_signal(event, NULL, SIGTERM, NULL, NULL);
4371                                 }
4372
4373                                 /* simply exit on sigchld */
4374                                 sd_event_add_signal(event, NULL, SIGCHLD, NULL, NULL);
4375
4376                                 if (arg_expose_ports) {
4377                                         r = watch_rtnl(event, rtnl_socket_pair[0], &exposed, &rtnl);
4378                                         if (r < 0)
4379                                                 goto finish;
4380
4381                                         (void) expose_ports(rtnl, &exposed);
4382                                 }
4383
4384                                 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
4385
4386                                 r = pty_forward_new(event, master, true, !interactive, &forward);
4387                                 if (r < 0) {
4388                                         log_error_errno(r, "Failed to create PTY forwarder: %m");
4389                                         goto finish;
4390                                 }
4391
4392                                 r = sd_event_loop(event);
4393                                 if (r < 0) {
4394                                         log_error_errno(r, "Failed to run event loop: %m");
4395                                         goto finish;
4396                                 }
4397
4398                                 pty_forward_get_last_char(forward, &last_char);
4399
4400                                 forward = pty_forward_free(forward);
4401
4402                                 if (!arg_quiet && last_char != '\n')
4403                                         putc('\n', stdout);
4404
4405                                 /* Kill if it is not dead yet anyway */
4406                                 terminate_machine(pid);
4407                         }
4408                 }
4409
4410                 /* Normally redundant, but better safe than sorry */
4411                 kill(pid, SIGKILL);
4412
4413                 r = wait_for_container(pid, &container_status);
4414                 pid = 0;
4415
4416                 if (r < 0)
4417                         /* We failed to wait for the container, or the
4418                          * container exited abnormally */
4419                         goto finish;
4420                 else if (r > 0 || container_status == CONTAINER_TERMINATED){
4421                         /* The container exited with a non-zero
4422                          * status, or with zero status and no reboot
4423                          * was requested. */
4424                         ret = r;
4425                         break;
4426                 }
4427
4428                 /* CONTAINER_REBOOTED, loop again */
4429
4430                 if (arg_keep_unit) {
4431                         /* Special handling if we are running as a
4432                          * service: instead of simply restarting the
4433                          * machine we want to restart the entire
4434                          * service, so let's inform systemd about this
4435                          * with the special exit code 133. The service
4436                          * file uses RestartForceExitStatus=133 so
4437                          * that this results in a full nspawn
4438                          * restart. This is necessary since we might
4439                          * have cgroup parameters set we want to have
4440                          * flushed out. */
4441                         ret = 133;
4442                         r = 0;
4443                         break;
4444                 }
4445
4446                 flush_ports(&exposed);
4447         }
4448
4449 finish:
4450         sd_notify(false,
4451                   "STOPPING=1\n"
4452                   "STATUS=Terminating...");
4453
4454         loop_remove(loop_nr, &image_fd);
4455
4456         if (pid > 0)
4457                 kill(pid, SIGKILL);
4458
4459         if (remove_subvol && arg_directory) {
4460                 int k;
4461
4462                 k = btrfs_subvol_remove(arg_directory);
4463                 if (k < 0)
4464                         log_warning_errno(k, "Cannot remove subvolume '%s', ignoring: %m", arg_directory);
4465         }
4466
4467         if (arg_machine) {
4468                 const char *p;
4469
4470                 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
4471                 (void) rm_rf(p, false, true, false);
4472         }
4473
4474         free(arg_directory);
4475         free(arg_template);
4476         free(arg_image);
4477         free(arg_machine);
4478         free(arg_user);
4479         strv_free(arg_setenv);
4480         strv_free(arg_network_interfaces);
4481         strv_free(arg_network_macvlan);
4482         strv_free(arg_network_ipvlan);
4483         strv_free(arg_bind);
4484         strv_free(arg_bind_ro);
4485         strv_free(arg_tmpfs);
4486
4487         flush_ports(&exposed);
4488
4489         while (arg_expose_ports) {
4490                 ExposePort *p = arg_expose_ports;
4491                 LIST_REMOVE(ports, arg_expose_ports, p);
4492                 free(p);
4493         }
4494
4495         return r < 0 ? EXIT_FAILURE : ret;
4496 }