chiark / gitweb /
9967423dbc977d1a941f8a65204fff22cafe0b81
[elogind.git] / src / nspawn / nspawn.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4   This file is part of systemd.
5
6   Copyright 2010 Lennart Poettering
7
8   systemd is free software; you can redistribute it and/or modify it
9   under the terms of the GNU Lesser General Public License as published by
10   the Free Software Foundation; either version 2.1 of the License, or
11   (at your option) any later version.
12
13   systemd is distributed in the hope that it will be useful, but
14   WITHOUT ANY WARRANTY; without even the implied warranty of
15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16   Lesser General Public License for more details.
17
18   You should have received a copy of the GNU Lesser General Public License
19   along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <signal.h>
23 #include <sched.h>
24 #include <unistd.h>
25 #include <sys/types.h>
26 #include <sys/syscall.h>
27 #include <sys/mount.h>
28 #include <sys/wait.h>
29 #include <stdlib.h>
30 #include <string.h>
31 #include <stdio.h>
32 #include <errno.h>
33 #include <sys/prctl.h>
34 #include <getopt.h>
35 #include <termios.h>
36 #include <sys/signalfd.h>
37 #include <grp.h>
38 #include <linux/fs.h>
39 #include <sys/un.h>
40 #include <sys/socket.h>
41 #include <linux/netlink.h>
42 #include <net/if.h>
43 #include <linux/veth.h>
44 #include <sys/personality.h>
45 #include <linux/loop.h>
46 #include <poll.h>
47 #include <sys/file.h>
48
49 #ifdef HAVE_SELINUX
50 #include <selinux/selinux.h>
51 #endif
52
53 #ifdef HAVE_SECCOMP
54 #include <seccomp.h>
55 #endif
56
57 #ifdef HAVE_BLKID
58 #include <blkid/blkid.h>
59 #endif
60
61 #include "sd-daemon.h"
62 #include "sd-bus.h"
63 #include "sd-id128.h"
64 #include "sd-rtnl.h"
65 #include "log.h"
66 #include "util.h"
67 #include "mkdir.h"
68 #include "macro.h"
69 #include "audit.h"
70 #include "missing.h"
71 #include "cgroup-util.h"
72 #include "strv.h"
73 #include "path-util.h"
74 #include "loopback-setup.h"
75 #include "dev-setup.h"
76 #include "fdset.h"
77 #include "build.h"
78 #include "fileio.h"
79 #include "bus-util.h"
80 #include "bus-error.h"
81 #include "ptyfwd.h"
82 #include "bus-kernel.h"
83 #include "env-util.h"
84 #include "def.h"
85 #include "rtnl-util.h"
86 #include "udev-util.h"
87 #include "blkid-util.h"
88 #include "gpt.h"
89 #include "siphash24.h"
90 #include "copy.h"
91 #include "base-filesystem.h"
92 #include "barrier.h"
93 #include "event-util.h"
94 #include "capability.h"
95 #include "cap-list.h"
96 #include "btrfs-util.h"
97 #include "machine-image.h"
98 #include "list.h"
99 #include "in-addr-util.h"
100 #include "fw-util.h"
101 #include "local-addresses.h"
102
103 #ifdef HAVE_SECCOMP
104 #include "seccomp-util.h"
105 #endif
106
/* One port-forwarding request from -p/--port=. parse_argv() allocates an entry
 * per option and prepends it to the arg_expose_ports list. */
107 typedef struct ExposePort {
            /* IPPROTO_TCP or IPPROTO_UDP; parse_argv() picks TCP when no prefix is given */
108         int protocol;
            /* Port on the host side; equals container_port when only one port is given */
109         uint16_t host_port;
            /* Port inside the container */
110         uint16_t container_port;
            /* List linkage under the name "ports" */
111         LIST_FIELDS(struct ExposePort, ports);
112 } ExposePort;
113
/* How a container run ended. NOTE(review): the code that produces and consumes
 * these values is not visible in this part of the file. */
114 typedef enum ContainerStatus {
115         CONTAINER_TERMINATED,
116         CONTAINER_REBOOTED
117 } ContainerStatus;
118
/* Journal linking mode selected with --link-journal= (or -j). The "try-guest"
 * and "try-host" spellings map to LINK_GUEST/LINK_HOST with
 * arg_link_journal_try set to true; see parse_argv(). */
119 typedef enum LinkJournal {
120         LINK_NO,
            /* Default value of arg_link_journal */
121         LINK_AUTO,
122         LINK_HOST,
123         LINK_GUEST
124 } LinkJournal;
125
/* Mode selected with --volatile[=MODE]. In parse_argv() a bare --volatile or a
 * true boolean yields VOLATILE_YES, a false boolean yields VOLATILE_NO, and the
 * literal "state" yields VOLATILE_STATE. */
126 typedef enum Volatile {
127         VOLATILE_NO,
128         VOLATILE_YES,
129         VOLATILE_STATE,
130 } Volatile;
131
/* Command-line configuration. The initializers below are the defaults;
 * parse_argv() overwrites many of these while processing options. */
/* Heap-allocated paths/strings (set via set_sanitized_path(), strdup() or
 * free_and_strdup() in parse_argv()). */
132 static char *arg_directory = NULL;
133 static char *arg_template = NULL;
134 static char *arg_user = NULL;
135 static sd_id128_t arg_uuid = {};
136 static char *arg_machine = NULL;
/* These const pointers are assigned straight from optarg in parse_argv(),
 * i.e. they are not copied and must not be freed. */
137 static const char *arg_selinux_context = NULL;
138 static const char *arg_selinux_apifs_context = NULL;
139 static const char *arg_slice = NULL;
140 static bool arg_private_network = false;
141 static bool arg_read_only = false;
142 static bool arg_boot = false;
143 static bool arg_ephemeral = false;
144 static LinkJournal arg_link_journal = LINK_AUTO;
145 static bool arg_link_journal_try = false;
/* Bitmask with one bit per CAP_* constant: the default capability set referred
 * to by --capability= / --drop-capability= in the help text.
 * NOTE(review): the code that applies this mask is not visible here. */
146 static uint64_t arg_retain =
147         (1ULL << CAP_CHOWN) |
148         (1ULL << CAP_DAC_OVERRIDE) |
149         (1ULL << CAP_DAC_READ_SEARCH) |
150         (1ULL << CAP_FOWNER) |
151         (1ULL << CAP_FSETID) |
152         (1ULL << CAP_IPC_OWNER) |
153         (1ULL << CAP_KILL) |
154         (1ULL << CAP_LEASE) |
155         (1ULL << CAP_LINUX_IMMUTABLE) |
156         (1ULL << CAP_NET_BIND_SERVICE) |
157         (1ULL << CAP_NET_BROADCAST) |
158         (1ULL << CAP_NET_RAW) |
159         (1ULL << CAP_SETGID) |
160         (1ULL << CAP_SETFCAP) |
161         (1ULL << CAP_SETPCAP) |
162         (1ULL << CAP_SETUID) |
163         (1ULL << CAP_SYS_ADMIN) |
164         (1ULL << CAP_SYS_CHROOT) |
165         (1ULL << CAP_SYS_NICE) |
166         (1ULL << CAP_SYS_PTRACE) |
167         (1ULL << CAP_SYS_TTY_CONFIG) |
168         (1ULL << CAP_SYS_RESOURCE) |
169         (1ULL << CAP_SYS_BOOT) |
170         (1ULL << CAP_AUDIT_WRITE) |
171         (1ULL << CAP_AUDIT_CONTROL) |
172         (1ULL << CAP_MKNOD);
/* arg_bind, arg_bind_ro and arg_tmpfs are string vectors filled in pairs by
 * parse_argv(): two consecutive entries per option occurrence. */
173 static char **arg_bind = NULL;
174 static char **arg_bind_ro = NULL;
175 static char **arg_tmpfs = NULL;
176 static char **arg_setenv = NULL;
177 static bool arg_quiet = false;
178 static bool arg_share_system = false;
179 static bool arg_register = true;
180 static bool arg_keep_unit = false;
181 static char **arg_network_interfaces = NULL;
182 static char **arg_network_macvlan = NULL;
183 static char **arg_network_ipvlan = NULL;
184 static bool arg_network_veth = false;
/* Points at optarg (not copied), like the SELinux/slice strings above. */
185 static const char *arg_network_bridge = NULL;
/* 0xffffffffLU is also the value parse_argv() treats as "unknown personality"
 * when returned by personality_from_string(). */
186 static unsigned long arg_personality = 0xffffffffLU;
187 static char *arg_image = NULL;
188 static Volatile arg_volatile = VOLATILE_NO;
/* Head of the ExposePort list built from -p/--port= options. */
189 static ExposePort *arg_expose_ports = NULL;
190 static char **arg_property = NULL;
/* Set from --private-users[=UIDBASE[:NUIDS]]; the range defaults to 0x10000. */
191 static uid_t arg_uid_shift = UID_INVALID, arg_uid_range = 0x10000U;
192 static bool arg_userns = false;
193
/* Prints the usage summary to stdout. The only conversion in the format string
 * is the leading %s, which is filled with program_invocation_short_name.
 * NOTE(review): the option table in parse_argv() accepts --personality=, but
 * it is not listed in this text; confirm whether that omission is intended. */
194 static void help(void) {
195         printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
196                "Spawn a minimal namespace container for debugging, testing and building.\n\n"
197                "  -h --help                 Show this help\n"
198                "     --version              Print version string\n"
199                "  -q --quiet                Do not show status information\n"
200                "  -D --directory=PATH       Root directory for the container\n"
201                "     --template=PATH        Initialize root directory from template directory,\n"
202                "                            if missing\n"
203                "  -x --ephemeral            Run container with snapshot of root directory, and\n"
204                "                            remove it after exit\n"
205                "  -i --image=PATH           File system device or disk image for the container\n"
206                "  -b --boot                 Boot up full system (i.e. invoke init)\n"
207                "  -u --user=USER            Run the command under specified user or uid\n"
208                "  -M --machine=NAME         Set the machine name for the container\n"
209                "     --uuid=UUID            Set a specific machine UUID for the container\n"
210                "  -S --slice=SLICE          Place the container in the specified slice\n"
211                "     --property=NAME=VALUE  Set scope unit property\n"
212                "     --private-network      Disable network in container\n"
213                "     --network-interface=INTERFACE\n"
214                "                            Assign an existing network interface to the\n"
215                "                            container\n"
216                "     --network-macvlan=INTERFACE\n"
217                "                            Create a macvlan network interface based on an\n"
218                "                            existing network interface to the container\n"
219                "     --network-ipvlan=INTERFACE\n"
220                "                            Create a ipvlan network interface based on an\n"
221                "                            existing network interface to the container\n"
222                "  -n --network-veth         Add a virtual ethernet connection between host\n"
223                "                            and container\n"
224                "     --network-bridge=INTERFACE\n"
225                "                            Add a virtual ethernet connection between host\n"
226                "                            and container and add it to an existing bridge on\n"
227                "                            the host\n"
228                "     --private-users[=UIDBASE[:NUIDS]]\n"
229                "                            Run within user namespace\n"
230                "  -p --port=[PROTOCOL:]HOSTPORT[:CONTAINERPORT]\n"
231                "                            Expose a container IP port on the host\n"
232                "  -Z --selinux-context=SECLABEL\n"
233                "                            Set the SELinux security context to be used by\n"
234                "                            processes in the container\n"
235                "  -L --selinux-apifs-context=SECLABEL\n"
236                "                            Set the SELinux security context to be used by\n"
237                "                            API/tmpfs file systems in the container\n"
238                "     --capability=CAP       In addition to the default, retain specified\n"
239                "                            capability\n"
240                "     --drop-capability=CAP  Drop the specified capability from the default set\n"
241                "     --link-journal=MODE    Link up guest journal, one of no, auto, guest, host,\n"
242                "                            try-guest, try-host\n"
243                "  -j                        Equivalent to --link-journal=try-guest\n"
244                "     --read-only            Mount the root directory read-only\n"
245                "     --bind=PATH[:PATH]     Bind mount a file or directory from the host into\n"
246                "                            the container\n"
247                "     --bind-ro=PATH[:PATH]  Similar, but creates a read-only bind mount\n"
248                "     --tmpfs=PATH:[OPTIONS] Mount an empty tmpfs to the specified directory\n"
249                "     --setenv=NAME=VALUE    Pass an environment variable to PID 1\n"
250                "     --share-system         Share system namespaces with host\n"
251                "     --register=BOOLEAN     Register container as machine\n"
252                "     --keep-unit            Do not register a scope for the machine, reuse\n"
253                "                            the service unit nspawn is running in\n"
254                "     --volatile[=MODE]      Run the system in volatile mode\n"
255                , program_invocation_short_name);
256 }
257
/* Replace *b with a cleaned-up, absolute form of path.
 *
 * The path is first resolved with canonicalize_file_name(). If that fails
 * because the path does not exist (ENOENT), the path is instead made absolute
 * relative to the current working directory via path_make_absolute_cwd().
 * The result is passed through path_kill_slashes() before being stored.
 *
 * The previous string in *b is freed; the caller owns the new one.
 *
 * Returns 0 on success, -errno if canonicalization fails for any reason other
 * than ENOENT, or -ENOMEM if the fallback cannot produce a path. */
static int set_sanitized_path(char **b, const char *path) {
        char *resolved;

        assert(b);
        assert(path);

        resolved = canonicalize_file_name(path);

        /* Any failure other than "does not exist" is reported as-is. */
        if (!resolved && errno != ENOENT)
                return -errno;

        /* Missing path: fall back to a purely lexical absolute path. */
        if (!resolved)
                resolved = path_make_absolute_cwd(path);

        if (!resolved)
                return -ENOMEM;

        free(*b);
        *b = path_kill_slashes(resolved);

        return 0;
}
277 }
278
279 static int parse_argv(int argc, char *argv[]) {
280
281         enum {
282                 ARG_VERSION = 0x100,
283                 ARG_PRIVATE_NETWORK,
284                 ARG_UUID,
285                 ARG_READ_ONLY,
286                 ARG_CAPABILITY,
287                 ARG_DROP_CAPABILITY,
288                 ARG_LINK_JOURNAL,
289                 ARG_BIND,
290                 ARG_BIND_RO,
291                 ARG_TMPFS,
292                 ARG_SETENV,
293                 ARG_SHARE_SYSTEM,
294                 ARG_REGISTER,
295                 ARG_KEEP_UNIT,
296                 ARG_NETWORK_INTERFACE,
297                 ARG_NETWORK_MACVLAN,
298                 ARG_NETWORK_IPVLAN,
299                 ARG_NETWORK_BRIDGE,
300                 ARG_PERSONALITY,
301                 ARG_VOLATILE,
302                 ARG_TEMPLATE,
303                 ARG_PROPERTY,
304                 ARG_PRIVATE_USERS,
305         };
306
307         static const struct option options[] = {
308                 { "help",                  no_argument,       NULL, 'h'                   },
309                 { "version",               no_argument,       NULL, ARG_VERSION           },
310                 { "directory",             required_argument, NULL, 'D'                   },
311                 { "template",              required_argument, NULL, ARG_TEMPLATE          },
312                 { "ephemeral",             no_argument,       NULL, 'x'                   },
313                 { "user",                  required_argument, NULL, 'u'                   },
314                 { "private-network",       no_argument,       NULL, ARG_PRIVATE_NETWORK   },
315                 { "boot",                  no_argument,       NULL, 'b'                   },
316                 { "uuid",                  required_argument, NULL, ARG_UUID              },
317                 { "read-only",             no_argument,       NULL, ARG_READ_ONLY         },
318                 { "capability",            required_argument, NULL, ARG_CAPABILITY        },
319                 { "drop-capability",       required_argument, NULL, ARG_DROP_CAPABILITY   },
320                 { "link-journal",          required_argument, NULL, ARG_LINK_JOURNAL      },
321                 { "bind",                  required_argument, NULL, ARG_BIND              },
322                 { "bind-ro",               required_argument, NULL, ARG_BIND_RO           },
323                 { "tmpfs",                 required_argument, NULL, ARG_TMPFS             },
324                 { "machine",               required_argument, NULL, 'M'                   },
325                 { "slice",                 required_argument, NULL, 'S'                   },
326                 { "setenv",                required_argument, NULL, ARG_SETENV            },
327                 { "selinux-context",       required_argument, NULL, 'Z'                   },
328                 { "selinux-apifs-context", required_argument, NULL, 'L'                   },
329                 { "quiet",                 no_argument,       NULL, 'q'                   },
330                 { "share-system",          no_argument,       NULL, ARG_SHARE_SYSTEM      },
331                 { "register",              required_argument, NULL, ARG_REGISTER          },
332                 { "keep-unit",             no_argument,       NULL, ARG_KEEP_UNIT         },
333                 { "network-interface",     required_argument, NULL, ARG_NETWORK_INTERFACE },
334                 { "network-macvlan",       required_argument, NULL, ARG_NETWORK_MACVLAN   },
335                 { "network-ipvlan",        required_argument, NULL, ARG_NETWORK_IPVLAN    },
336                 { "network-veth",          no_argument,       NULL, 'n'                   },
337                 { "network-bridge",        required_argument, NULL, ARG_NETWORK_BRIDGE    },
338                 { "personality",           required_argument, NULL, ARG_PERSONALITY       },
339                 { "image",                 required_argument, NULL, 'i'                   },
340                 { "volatile",              optional_argument, NULL, ARG_VOLATILE          },
341                 { "port",                  required_argument, NULL, 'p'                   },
342                 { "property",              required_argument, NULL, ARG_PROPERTY          },
343                 { "private-users",         optional_argument, NULL, ARG_PRIVATE_USERS     },
344                 {}
345         };
346
347         int c, r;
348         uint64_t plus = 0, minus = 0;
349
350         assert(argc >= 0);
351         assert(argv);
352
353         while ((c = getopt_long(argc, argv, "+hD:u:bL:M:jS:Z:qi:xp:n", options, NULL)) >= 0)
354
355                 switch (c) {
356
357                 case 'h':
358                         help();
359                         return 0;
360
361                 case ARG_VERSION:
362                         puts(PACKAGE_STRING);
363                         puts(SYSTEMD_FEATURES);
364                         return 0;
365
366                 case 'D':
367                         r = set_sanitized_path(&arg_directory, optarg);
368                         if (r < 0)
369                                 return log_error_errno(r, "Invalid root directory: %m");
370
371                         break;
372
373                 case ARG_TEMPLATE:
374                         r = set_sanitized_path(&arg_template, optarg);
375                         if (r < 0)
376                                 return log_error_errno(r, "Invalid template directory: %m");
377
378                         break;
379
380                 case 'i':
381                         r = set_sanitized_path(&arg_image, optarg);
382                         if (r < 0)
383                                 return log_error_errno(r, "Invalid image path: %m");
384
385                         break;
386
387                 case 'x':
388                         arg_ephemeral = true;
389                         break;
390
391                 case 'u':
392                         free(arg_user);
393                         arg_user = strdup(optarg);
394                         if (!arg_user)
395                                 return log_oom();
396
397                         break;
398
399                 case ARG_NETWORK_BRIDGE:
400                         arg_network_bridge = optarg;
401
402                         /* fall through */
403
404                 case 'n':
405                         arg_network_veth = true;
406                         arg_private_network = true;
407                         break;
408
409                 case ARG_NETWORK_INTERFACE:
410                         if (strv_extend(&arg_network_interfaces, optarg) < 0)
411                                 return log_oom();
412
413                         arg_private_network = true;
414                         break;
415
416                 case ARG_NETWORK_MACVLAN:
417                         if (strv_extend(&arg_network_macvlan, optarg) < 0)
418                                 return log_oom();
419
420                         arg_private_network = true;
421                         break;
422
423                 case ARG_NETWORK_IPVLAN:
424                         if (strv_extend(&arg_network_ipvlan, optarg) < 0)
425                                 return log_oom();
426
427                         /* fall through */
428
429                 case ARG_PRIVATE_NETWORK:
430                         arg_private_network = true;
431                         break;
432
433                 case 'b':
434                         arg_boot = true;
435                         break;
436
437                 case ARG_UUID:
438                         r = sd_id128_from_string(optarg, &arg_uuid);
439                         if (r < 0) {
440                                 log_error("Invalid UUID: %s", optarg);
441                                 return r;
442                         }
443                         break;
444
445                 case 'S':
446                         arg_slice = optarg;
447                         break;
448
449                 case 'M':
450                         if (isempty(optarg)) {
451                                 free(arg_machine);
452                                 arg_machine = NULL;
453                         } else {
454                                 if (!machine_name_is_valid(optarg)) {
455                                         log_error("Invalid machine name: %s", optarg);
456                                         return -EINVAL;
457                                 }
458
459                                 r = free_and_strdup(&arg_machine, optarg);
460                                 if (r < 0)
461                                         return log_oom();
462
463                                 break;
464                         }
465
466                 case 'Z':
467                         arg_selinux_context = optarg;
468                         break;
469
470                 case 'L':
471                         arg_selinux_apifs_context = optarg;
472                         break;
473
474                 case ARG_READ_ONLY:
475                         arg_read_only = true;
476                         break;
477
478                 case ARG_CAPABILITY:
479                 case ARG_DROP_CAPABILITY: {
480                         const char *state, *word;
481                         size_t length;
482
483                         FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
484                                 _cleanup_free_ char *t;
485
486                                 t = strndup(word, length);
487                                 if (!t)
488                                         return log_oom();
489
490                                 if (streq(t, "all")) {
491                                         if (c == ARG_CAPABILITY)
492                                                 plus = (uint64_t) -1;
493                                         else
494                                                 minus = (uint64_t) -1;
495                                 } else {
496                                         int cap;
497
498                                         cap = capability_from_name(t);
499                                         if (cap < 0) {
500                                                 log_error("Failed to parse capability %s.", t);
501                                                 return -EINVAL;
502                                         }
503
504                                         if (c == ARG_CAPABILITY)
505                                                 plus |= 1ULL << (uint64_t) cap;
506                                         else
507                                                 minus |= 1ULL << (uint64_t) cap;
508                                 }
509                         }
510
511                         break;
512                 }
513
514                 case 'j':
515                         arg_link_journal = LINK_GUEST;
516                         arg_link_journal_try = true;
517                         break;
518
519                 case ARG_LINK_JOURNAL:
520                         if (streq(optarg, "auto")) {
521                                 arg_link_journal = LINK_AUTO;
522                                 arg_link_journal_try = false;
523                         } else if (streq(optarg, "no")) {
524                                 arg_link_journal = LINK_NO;
525                                 arg_link_journal_try = false;
526                         } else if (streq(optarg, "guest")) {
527                                 arg_link_journal = LINK_GUEST;
528                                 arg_link_journal_try = false;
529                         } else if (streq(optarg, "host")) {
530                                 arg_link_journal = LINK_HOST;
531                                 arg_link_journal_try = false;
532                         } else if (streq(optarg, "try-guest")) {
533                                 arg_link_journal = LINK_GUEST;
534                                 arg_link_journal_try = true;
535                         } else if (streq(optarg, "try-host")) {
536                                 arg_link_journal = LINK_HOST;
537                                 arg_link_journal_try = true;
538                         } else {
539                                 log_error("Failed to parse link journal mode %s", optarg);
540                                 return -EINVAL;
541                         }
542
543                         break;
544
545                 case ARG_BIND:
546                 case ARG_BIND_RO: {
547                         _cleanup_free_ char *a = NULL, *b = NULL;
548                         char *e;
549                         char ***x;
550
551                         x = c == ARG_BIND ? &arg_bind : &arg_bind_ro;
552
553                         e = strchr(optarg, ':');
554                         if (e) {
555                                 a = strndup(optarg, e - optarg);
556                                 b = strdup(e + 1);
557                         } else {
558                                 a = strdup(optarg);
559                                 b = strdup(optarg);
560                         }
561
562                         if (!a || !b)
563                                 return log_oom();
564
565                         if (!path_is_absolute(a) || !path_is_absolute(b)) {
566                                 log_error("Invalid bind mount specification: %s", optarg);
567                                 return -EINVAL;
568                         }
569
570                         r = strv_extend(x, a);
571                         if (r < 0)
572                                 return log_oom();
573
574                         r = strv_extend(x, b);
575                         if (r < 0)
576                                 return log_oom();
577
578                         break;
579                 }
580
581                 case ARG_TMPFS: {
582                         _cleanup_free_ char *a = NULL, *b = NULL;
583                         char *e;
584
585                         e = strchr(optarg, ':');
586                         if (e) {
587                                 a = strndup(optarg, e - optarg);
588                                 b = strdup(e + 1);
589                         } else {
590                                 a = strdup(optarg);
591                                 b = strdup("mode=0755");
592                         }
593
594                         if (!a || !b)
595                                 return log_oom();
596
597                         if (!path_is_absolute(a)) {
598                                 log_error("Invalid tmpfs specification: %s", optarg);
599                                 return -EINVAL;
600                         }
601
602                         r = strv_push(&arg_tmpfs, a);
603                         if (r < 0)
604                                 return log_oom();
605
606                         a = NULL;
607
608                         r = strv_push(&arg_tmpfs, b);
609                         if (r < 0)
610                                 return log_oom();
611
612                         b = NULL;
613
614                         break;
615                 }
616
617                 case ARG_SETENV: {
618                         char **n;
619
620                         if (!env_assignment_is_valid(optarg)) {
621                                 log_error("Environment variable assignment '%s' is not valid.", optarg);
622                                 return -EINVAL;
623                         }
624
625                         n = strv_env_set(arg_setenv, optarg);
626                         if (!n)
627                                 return log_oom();
628
629                         strv_free(arg_setenv);
630                         arg_setenv = n;
631                         break;
632                 }
633
634                 case 'q':
635                         arg_quiet = true;
636                         break;
637
638                 case ARG_SHARE_SYSTEM:
639                         arg_share_system = true;
640                         break;
641
642                 case ARG_REGISTER:
643                         r = parse_boolean(optarg);
644                         if (r < 0) {
645                                 log_error("Failed to parse --register= argument: %s", optarg);
646                                 return r;
647                         }
648
649                         arg_register = r;
650                         break;
651
652                 case ARG_KEEP_UNIT:
653                         arg_keep_unit = true;
654                         break;
655
656                 case ARG_PERSONALITY:
657
658                         arg_personality = personality_from_string(optarg);
659                         if (arg_personality == 0xffffffffLU) {
660                                 log_error("Unknown or unsupported personality '%s'.", optarg);
661                                 return -EINVAL;
662                         }
663
664                         break;
665
666                 case ARG_VOLATILE:
667
668                         if (!optarg)
669                                 arg_volatile = VOLATILE_YES;
670                         else {
671                                 r = parse_boolean(optarg);
672                                 if (r < 0) {
673                                         if (streq(optarg, "state"))
674                                                 arg_volatile = VOLATILE_STATE;
675                                         else {
676                                                 log_error("Failed to parse --volatile= argument: %s", optarg);
677                                                 return r;
678                                         }
679                                 } else
680                                         arg_volatile = r ? VOLATILE_YES : VOLATILE_NO;
681                         }
682
683                         break;
684
685                 case 'p': {
686                         const char *split, *e;
687                         uint16_t container_port, host_port;
688                         int protocol;
689                         ExposePort *p;
690
691                         if ((e = startswith(optarg, "tcp:")))
692                                 protocol = IPPROTO_TCP;
693                         else if ((e = startswith(optarg, "udp:")))
694                                 protocol = IPPROTO_UDP;
695                         else {
696                                 e = optarg;
697                                 protocol = IPPROTO_TCP;
698                         }
699
700                         split = strchr(e, ':');
701                         if (split) {
702                                 char v[split - e + 1];
703
704                                 memcpy(v, e, split - e);
705                                 v[split - e] = 0;
706
707                                 r = safe_atou16(v, &host_port);
708                                 if (r < 0 || host_port <= 0) {
709                                         log_error("Failed to parse host port: %s", optarg);
710                                         return -EINVAL;
711                                 }
712
713                                 r = safe_atou16(split + 1, &container_port);
714                         } else {
715                                 r = safe_atou16(e, &container_port);
716                                 host_port = container_port;
717                         }
718
719                         if (r < 0 || container_port <= 0) {
720                                 log_error("Failed to parse host port: %s", optarg);
721                                 return -EINVAL;
722                         }
723
724                         LIST_FOREACH(ports, p, arg_expose_ports) {
725                                 if (p->protocol == protocol && p->host_port == host_port) {
726                                         log_error("Duplicate port specification: %s", optarg);
727                                         return -EINVAL;
728                                 }
729                         }
730
731                         p = new(ExposePort, 1);
732                         if (!p)
733                                 return log_oom();
734
735                         p->protocol = protocol;
736                         p->host_port = host_port;
737                         p->container_port = container_port;
738
739                         LIST_PREPEND(ports, arg_expose_ports, p);
740
741                         break;
742                 }
743
744                 case ARG_PROPERTY:
745                         if (strv_extend(&arg_property, optarg) < 0)
746                                 return log_oom();
747
748                         break;
749
750                 case ARG_PRIVATE_USERS:
751                         if (optarg) {
752                                 _cleanup_free_ char *buffer = NULL;
753                                 const char *range, *shift;
754
755                                 range = strchr(optarg, ':');
756                                 if (range) {
757                                         buffer = strndup(optarg, range - optarg);
758                                         if (!buffer)
759                                                 return log_oom();
760                                         shift = buffer;
761
762                                         range++;
763                                         if (safe_atou32(range, &arg_uid_range) < 0 || arg_uid_range <= 0) {
764                                                 log_error("Failed to parse UID range: %s", range);
765                                                 return -EINVAL;
766                                         }
767                                 } else
768                                         shift = optarg;
769
770                                 if (parse_uid(shift, &arg_uid_shift) < 0) {
771                                         log_error("Failed to parse UID: %s", optarg);
772                                         return -EINVAL;
773                                 }
774                         }
775
776                         arg_userns = true;
777                         break;
778
779                 case '?':
780                         return -EINVAL;
781
782                 default:
783                         assert_not_reached("Unhandled option");
784                 }
785
786         if (arg_share_system)
787                 arg_register = false;
788
789         if (arg_boot && arg_share_system) {
790                 log_error("--boot and --share-system may not be combined.");
791                 return -EINVAL;
792         }
793
794         if (arg_keep_unit && cg_pid_get_owner_uid(0, NULL) >= 0) {
795                 log_error("--keep-unit may not be used when invoked from a user session.");
796                 return -EINVAL;
797         }
798
799         if (arg_directory && arg_image) {
800                 log_error("--directory= and --image= may not be combined.");
801                 return -EINVAL;
802         }
803
804         if (arg_template && arg_image) {
805                 log_error("--template= and --image= may not be combined.");
806                 return -EINVAL;
807         }
808
809         if (arg_template && !(arg_directory || arg_machine)) {
810                 log_error("--template= needs --directory= or --machine=.");
811                 return -EINVAL;
812         }
813
814         if (arg_ephemeral && arg_template) {
815                 log_error("--ephemeral and --template= may not be combined.");
816                 return -EINVAL;
817         }
818
819         if (arg_ephemeral && arg_image) {
820                 log_error("--ephemeral and --image= may not be combined.");
821                 return -EINVAL;
822         }
823
824         if (arg_ephemeral && !IN_SET(arg_link_journal, LINK_NO, LINK_AUTO)) {
825                 log_error("--ephemeral and --link-journal= may not be combined.");
826                 return -EINVAL;
827         }
828
829         if (arg_volatile != VOLATILE_NO && arg_read_only) {
830                 log_error("Cannot combine --read-only with --volatile. Note that --volatile already implies a read-only base hierarchy.");
831                 return -EINVAL;
832         }
833
834         if (arg_expose_ports && !arg_private_network) {
835                 log_error("Cannot use --port= without private networking.");
836                 return -EINVAL;
837         }
838
839         arg_retain = (arg_retain | plus | (arg_private_network ? 1ULL << CAP_NET_ADMIN : 0)) & ~minus;
840
841         return 1;
842 }
843
844 static int mount_all(const char *dest) {
845
846         typedef struct MountPoint {
847                 const char *what;
848                 const char *where;
849                 const char *type;
850                 const char *options;
851                 unsigned long flags;
852                 bool fatal;
853         } MountPoint;
854
855         static const MountPoint mount_table[] = {
856                 { "proc",      "/proc",     "proc",  NULL,        MS_NOSUID|MS_NOEXEC|MS_NODEV,           true  },
857                 { "/proc/sys", "/proc/sys", NULL,    NULL,        MS_BIND,                                true  },   /* Bind mount first */
858                 { NULL,        "/proc/sys", NULL,    NULL,        MS_BIND|MS_RDONLY|MS_REMOUNT,           true  },   /* Then, make it r/o */
859                 { "sysfs",     "/sys",      "sysfs", NULL,        MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true  },
860                 { "tmpfs",     "/dev",      "tmpfs", "mode=755",  MS_NOSUID|MS_STRICTATIME,               true  },
861                 { "devpts",    "/dev/pts",  "devpts","newinstance,ptmxmode=0666,mode=620,gid=" STRINGIFY(TTY_GID), MS_NOSUID|MS_NOEXEC, true },
862                 { "tmpfs",     "/dev/shm",  "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME,      true  },
863                 { "tmpfs",     "/run",      "tmpfs", "mode=755",  MS_NOSUID|MS_NODEV|MS_STRICTATIME,      true  },
864                 { "tmpfs",     "/tmp",      "tmpfs", "mode=1777", MS_STRICTATIME,                         true  },
865 #ifdef HAVE_SELINUX
866                 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND,                              false },  /* Bind mount first */
867                 { NULL,              "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT,         false },  /* Then, make it r/o */
868 #endif
869         };
870
871         unsigned k;
872         int r = 0;
873
874         for (k = 0; k < ELEMENTSOF(mount_table); k++) {
875                 _cleanup_free_ char *where = NULL, *options = NULL;
876                 const char *o;
877                 int t;
878
879                 where = strjoin(dest, "/", mount_table[k].where, NULL);
880                 if (!where)
881                         return log_oom();
882
883                 t = path_is_mount_point(where, true);
884                 if (t < 0) {
885                         log_error_errno(t, "Failed to detect whether %s is a mount point: %m", where);
886
887                         if (r == 0)
888                                 r = t;
889
890                         continue;
891                 }
892
893                 /* Skip this entry if it is not a remount. */
894                 if (mount_table[k].what && t > 0)
895                         continue;
896
897                 t = mkdir_p(where, 0755);
898                 if (t < 0) {
899                         if (mount_table[k].fatal) {
900                                log_error_errno(t, "Failed to create directory %s: %m", where);
901
902                                 if (r == 0)
903                                         r = t;
904                         } else
905                                log_warning_errno(t, "Failed to create directory %s: %m", where);
906
907                         continue;
908                 }
909
910 #ifdef HAVE_SELINUX
911                 if (arg_selinux_apifs_context &&
912                     (streq_ptr(mount_table[k].what, "tmpfs") || streq_ptr(mount_table[k].what, "devpts"))) {
913                         options = strjoin(mount_table[k].options, ",context=\"", arg_selinux_apifs_context, "\"", NULL);
914                         if (!options)
915                                 return log_oom();
916
917                         o = options;
918                 } else
919 #endif
920                         o = mount_table[k].options;
921
922                 if (arg_userns && arg_uid_shift != UID_INVALID && streq_ptr(mount_table[k].type, "tmpfs")) {
923                         char *uid_options = NULL;
924
925                         if (o)
926                                 asprintf(&uid_options, "%s,uid=" UID_FMT ",gid=" UID_FMT, o, arg_uid_shift, arg_uid_shift);
927                         else
928                                 asprintf(&uid_options, "uid=" UID_FMT ",gid=" UID_FMT, arg_uid_shift, arg_uid_shift);
929                         if (!uid_options)
930                                 return log_oom();
931
932                         free(options);
933                         o = options = uid_options;
934                 }
935
936                 if (mount(mount_table[k].what,
937                           where,
938                           mount_table[k].type,
939                           mount_table[k].flags,
940                           o) < 0) {
941
942                         if (mount_table[k].fatal) {
943                                 log_error_errno(errno, "mount(%s) failed: %m", where);
944
945                                 if (r == 0)
946                                         r = -errno;
947                         } else
948                                 log_warning_errno(errno, "mount(%s) failed: %m", where);
949                 }
950         }
951
952         return r;
953 }
954
955 static int mount_binds(const char *dest, char **l, bool ro) {
956         char **x, **y;
957
958         STRV_FOREACH_PAIR(x, y, l) {
959                 _cleanup_free_ char *where = NULL;
960                 struct stat source_st, dest_st;
961                 int r;
962
963                 if (stat(*x, &source_st) < 0)
964                         return log_error_errno(errno, "Failed to stat %s: %m", *x);
965
966                 where = strappend(dest, *y);
967                 if (!where)
968                         return log_oom();
969
970                 r = stat(where, &dest_st);
971                 if (r == 0) {
972                         if (S_ISDIR(source_st.st_mode) && !S_ISDIR(dest_st.st_mode)) {
973                                 log_error("Cannot bind mount directory %s on file %s.", *x, where);
974                                 return -EINVAL;
975                         }
976                         if (!S_ISDIR(source_st.st_mode) && S_ISDIR(dest_st.st_mode)) {
977                                 log_error("Cannot bind mount file %s on directory %s.", *x, where);
978                                 return -EINVAL;
979                         }
980                 } else if (errno == ENOENT) {
981                         r = mkdir_parents_label(where, 0755);
982                         if (r < 0)
983                                 return log_error_errno(r, "Failed to bind mount %s: %m", *x);
984                 } else {
985                         log_error_errno(errno, "Failed to bind mount %s: %m", *x);
986                         return -errno;
987                 }
988
989                 /* Create the mount point. Any non-directory file can be
990                  * mounted on any non-directory file (regular, fifo, socket,
991                  * char, block).
992                  */
993                 if (S_ISDIR(source_st.st_mode)) {
994                         r = mkdir_label(where, 0755);
995                         if (r < 0 && errno != EEXIST)
996                                 return log_error_errno(r, "Failed to create mount point %s: %m", where);
997                 } else {
998                         r = touch(where);
999                         if (r < 0)
1000                                 return log_error_errno(r, "Failed to create mount point %s: %m", where);
1001                 }
1002
1003                 if (mount(*x, where, "bind", MS_BIND, NULL) < 0)
1004                         return log_error_errno(errno, "mount(%s) failed: %m", where);
1005
1006                 if (ro) {
1007                         r = bind_remount_recursive(where, true);
1008                         if (r < 0)
1009                                 return log_error_errno(r, "Read-Only bind mount failed: %m");
1010                 }
1011         }
1012
1013         return 0;
1014 }
1015
1016 static int mount_cgroup_hierarchy(const char *dest, const char *controller, const char *hierarchy, bool read_only) {
1017         char *to;
1018         int r;
1019
1020         to = strjoina(dest, "/sys/fs/cgroup/", hierarchy);
1021
1022         r = path_is_mount_point(to, false);
1023         if (r < 0)
1024                 return log_error_errno(r, "Failed to determine if %s is mounted already: %m", to);
1025         if (r > 0)
1026                 return 0;
1027
1028         mkdir_p(to, 0755);
1029
1030         /* The superblock mount options of the mount point need to be
1031          * identical to the hosts', and hence writable... */
1032         if (mount("cgroup", to, "cgroup", MS_NOSUID|MS_NOEXEC|MS_NODEV, controller) < 0)
1033                 return log_error_errno(errno, "Failed to mount to %s: %m", to);
1034
1035         /* ... hence let's only make the bind mount read-only, not the
1036          * superblock. */
1037         if (read_only) {
1038                 if (mount(NULL, to, NULL, MS_BIND|MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_RDONLY, NULL) < 0)
1039                         return log_error_errno(errno, "Failed to remount %s read-only: %m", to);
1040         }
1041         return 1;
1042 }
1043
1044 static int mount_cgroup(const char *dest) {
1045         _cleanup_set_free_free_ Set *controllers = NULL;
1046         _cleanup_free_ char *own_cgroup_path = NULL;
1047         const char *cgroup_root, *systemd_root, *systemd_own;
1048         int r;
1049
1050         controllers = set_new(&string_hash_ops);
1051         if (!controllers)
1052                 return log_oom();
1053
1054         r = cg_kernel_controllers(controllers);
1055         if (r < 0)
1056                 return log_error_errno(r, "Failed to determine cgroup controllers: %m");
1057
1058         r = cg_pid_get_path(NULL, 0, &own_cgroup_path);
1059         if (r < 0)
1060                 return log_error_errno(r, "Failed to determine our own cgroup path: %m");
1061
1062         cgroup_root = strjoina(dest, "/sys/fs/cgroup");
1063         if (mount("tmpfs", cgroup_root, "tmpfs", MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME, "mode=755") < 0)
1064                 return log_error_errno(errno, "Failed to mount tmpfs to /sys/fs/cgroup: %m");
1065
1066         for (;;) {
1067                 _cleanup_free_ char *controller = NULL, *origin = NULL, *combined = NULL;
1068
1069                 controller = set_steal_first(controllers);
1070                 if (!controller)
1071                         break;
1072
1073                 origin = strappend("/sys/fs/cgroup/", controller);
1074                 if (!origin)
1075                         return log_oom();
1076
1077                 r = readlink_malloc(origin, &combined);
1078                 if (r == -EINVAL) {
1079                         /* Not a symbolic link, but directly a single cgroup hierarchy */
1080
1081                         r = mount_cgroup_hierarchy(dest, controller, controller, true);
1082                         if (r < 0)
1083                                 return r;
1084
1085                 } else if (r < 0)
1086                         return log_error_errno(r, "Failed to read link %s: %m", origin);
1087                 else {
1088                         _cleanup_free_ char *target = NULL;
1089
1090                         target = strjoin(dest, "/sys/fs/cgroup/", controller, NULL);
1091                         if (!target)
1092                                 return log_oom();
1093
1094                         /* A symbolic link, a combination of controllers in one hierarchy */
1095
1096                         if (!filename_is_valid(combined)) {
1097                                 log_warning("Ignoring invalid combined hierarchy %s.", combined);
1098                                 continue;
1099                         }
1100
1101                         r = mount_cgroup_hierarchy(dest, combined, combined, true);
1102                         if (r < 0)
1103                                 return r;
1104
1105                         if (symlink(combined, target) < 0)
1106                                 return log_error_errno(errno, "Failed to create symlink for combined hierarchy: %m");
1107                 }
1108         }
1109
1110         r = mount_cgroup_hierarchy(dest, "name=systemd,xattr", "systemd", false);
1111         if (r < 0)
1112                 return r;
1113
1114         /* Make our own cgroup a (writable) bind mount */
1115         systemd_own = strjoina(dest, "/sys/fs/cgroup/systemd", own_cgroup_path);
1116         if (mount(systemd_own, systemd_own,  NULL, MS_BIND, NULL) < 0)
1117                 return log_error_errno(errno, "Failed to turn %s into a bind mount: %m", own_cgroup_path);
1118
1119         /* And then remount the systemd cgroup root read-only */
1120         systemd_root = strjoina(dest, "/sys/fs/cgroup/systemd");
1121         if (mount(NULL, systemd_root, NULL, MS_BIND|MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_RDONLY, NULL) < 0)
1122                 return log_error_errno(errno, "Failed to mount cgroup root read-only: %m");
1123
1124         if (mount(NULL, cgroup_root, NULL, MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME|MS_RDONLY, "mode=755") < 0)
1125                 return log_error_errno(errno, "Failed to remount %s read-only: %m", cgroup_root);
1126
1127         return 0;
1128 }
1129
1130 static int mount_tmpfs(const char *dest) {
1131         char **i, **o;
1132
1133         STRV_FOREACH_PAIR(i, o, arg_tmpfs) {
1134                 _cleanup_free_ char *where = NULL;
1135                 int r;
1136
1137                 where = strappend(dest, *i);
1138                 if (!where)
1139                         return log_oom();
1140
1141                 r = mkdir_label(where, 0755);
1142                 if (r < 0 && r != -EEXIST)
1143                         return log_error_errno(r, "Creating mount point for tmpfs %s failed: %m", where);
1144
1145                 if (mount("tmpfs", where, "tmpfs", MS_NODEV|MS_STRICTATIME, *o) < 0)
1146                         return log_error_errno(errno, "tmpfs mount to %s failed: %m", where);
1147         }
1148
1149         return 0;
1150 }
1151
1152 static int setup_timezone(const char *dest) {
1153         _cleanup_free_ char *where = NULL, *p = NULL, *q = NULL, *check = NULL, *what = NULL;
1154         char *z, *y;
1155         int r;
1156
1157         assert(dest);
1158
1159         /* Fix the timezone, if possible */
1160         r = readlink_malloc("/etc/localtime", &p);
1161         if (r < 0) {
1162                 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
1163                 return 0;
1164         }
1165
1166         z = path_startswith(p, "../usr/share/zoneinfo/");
1167         if (!z)
1168                 z = path_startswith(p, "/usr/share/zoneinfo/");
1169         if (!z) {
1170                 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
1171                 return 0;
1172         }
1173
1174         where = strappend(dest, "/etc/localtime");
1175         if (!where)
1176                 return log_oom();
1177
1178         r = readlink_malloc(where, &q);
1179         if (r >= 0) {
1180                 y = path_startswith(q, "../usr/share/zoneinfo/");
1181                 if (!y)
1182                         y = path_startswith(q, "/usr/share/zoneinfo/");
1183
1184                 /* Already pointing to the right place? Then do nothing .. */
1185                 if (y && streq(y, z))
1186                         return 0;
1187         }
1188
1189         check = strjoin(dest, "/usr/share/zoneinfo/", z, NULL);
1190         if (!check)
1191                 return log_oom();
1192
1193         if (access(check, F_OK) < 0) {
1194                 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
1195                 return 0;
1196         }
1197
1198         what = strappend("../usr/share/zoneinfo/", z);
1199         if (!what)
1200                 return log_oom();
1201
1202         r = mkdir_parents(where, 0755);
1203         if (r < 0) {
1204                 log_error_errno(r, "Failed to create directory for timezone info %s in container: %m", where);
1205
1206                 return 0;
1207         }
1208
1209         r = unlink(where);
1210         if (r < 0 && errno != ENOENT) {
1211                 log_error_errno(errno, "Failed to remove existing timezone info %s in container: %m", where);
1212
1213                 return 0;
1214         }
1215
1216         if (symlink(what, where) < 0) {
1217                 log_error_errno(errno, "Failed to correct timezone of container: %m");
1218                 return 0;
1219         }
1220
1221         return 0;
1222 }
1223
1224 static int setup_resolv_conf(const char *dest) {
1225         _cleanup_free_ char *where = NULL;
1226         int r;
1227
1228         assert(dest);
1229
1230         if (arg_private_network)
1231                 return 0;
1232
1233         /* Fix resolv.conf, if possible */
1234         where = strappend(dest, "/etc/resolv.conf");
1235         if (!where)
1236                 return log_oom();
1237
1238         /* We don't really care for the results of this really. If it
1239          * fails, it fails, but meh... */
1240         r = mkdir_parents(where, 0755);
1241         if (r < 0) {
1242                 log_warning_errno(r, "Failed to create parent directory for resolv.conf %s: %m", where);
1243
1244                 return 0;
1245         }
1246
1247         r = copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW, 0644, 0);
1248         if (r < 0) {
1249                 log_warning_errno(r, "Failed to copy /etc/resolv.conf to %s: %m", where);
1250
1251                 return 0;
1252         }
1253
1254         return 0;
1255 }
1256
1257 static int setup_volatile_state(const char *directory) {
1258         const char *p;
1259         int r;
1260
1261         assert(directory);
1262
1263         if (arg_volatile != VOLATILE_STATE)
1264                 return 0;
1265
1266         /* --volatile=state means we simply overmount /var
1267            with a tmpfs, and the rest read-only. */
1268
1269         r = bind_remount_recursive(directory, true);
1270         if (r < 0)
1271                 return log_error_errno(r, "Failed to remount %s read-only: %m", directory);
1272
1273         p = strjoina(directory, "/var");
1274         r = mkdir(p, 0755);
1275         if (r < 0 && errno != EEXIST)
1276                 return log_error_errno(errno, "Failed to create %s: %m", directory);
1277
1278         if (mount("tmpfs", p, "tmpfs", MS_STRICTATIME, "mode=755") < 0)
1279                 return log_error_errno(errno, "Failed to mount tmpfs to /var: %m");
1280
1281         return 0;
1282 }
1283
1284 static int setup_volatile(const char *directory) {
1285         bool tmpfs_mounted = false, bind_mounted = false;
1286         char template[] = "/tmp/nspawn-volatile-XXXXXX";
1287         const char *f, *t;
1288         int r;
1289
1290         assert(directory);
1291
1292         if (arg_volatile != VOLATILE_YES)
1293                 return 0;
1294
1295         /* --volatile=yes means we mount a tmpfs to the root dir, and
1296            the original /usr to use inside it, and that read-only. */
1297
1298         if (!mkdtemp(template))
1299                 return log_error_errno(errno, "Failed to create temporary directory: %m");
1300
1301         if (mount("tmpfs", template, "tmpfs", MS_STRICTATIME, "mode=755") < 0) {
1302                 log_error_errno(errno, "Failed to mount tmpfs for root directory: %m");
1303                 r = -errno;
1304                 goto fail;
1305         }
1306
1307         tmpfs_mounted = true;
1308
1309         f = strjoina(directory, "/usr");
1310         t = strjoina(template, "/usr");
1311
1312         r = mkdir(t, 0755);
1313         if (r < 0 && errno != EEXIST) {
1314                 log_error_errno(errno, "Failed to create %s: %m", t);
1315                 r = -errno;
1316                 goto fail;
1317         }
1318
1319         if (mount(f, t, "bind", MS_BIND|MS_REC, NULL) < 0) {
1320                 log_error_errno(errno, "Failed to create /usr bind mount: %m");
1321                 r = -errno;
1322                 goto fail;
1323         }
1324
1325         bind_mounted = true;
1326
1327         r = bind_remount_recursive(t, true);
1328         if (r < 0) {
1329                 log_error_errno(r, "Failed to remount %s read-only: %m", t);
1330                 goto fail;
1331         }
1332
1333         if (mount(template, directory, NULL, MS_MOVE, NULL) < 0) {
1334                 log_error_errno(errno, "Failed to move root mount: %m");
1335                 r = -errno;
1336                 goto fail;
1337         }
1338
1339         rmdir(template);
1340
1341         return 0;
1342
1343 fail:
1344         if (bind_mounted)
1345                 umount(t);
1346         if (tmpfs_mounted)
1347                 umount(template);
1348         rmdir(template);
1349         return r;
1350 }
1351
1352 static char* id128_format_as_uuid(sd_id128_t id, char s[37]) {
1353
1354         snprintf(s, 37,
1355                  "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
1356                  SD_ID128_FORMAT_VAL(id));
1357
1358         return s;
1359 }
1360
1361 static int setup_boot_id(const char *dest) {
1362         _cleanup_free_ char *from = NULL, *to = NULL;
1363         sd_id128_t rnd = {};
1364         char as_uuid[37];
1365         int r;
1366
1367         assert(dest);
1368
1369         if (arg_share_system)
1370                 return 0;
1371
1372         /* Generate a new randomized boot ID, so that each boot-up of
1373          * the container gets a new one */
1374
1375         from = strappend(dest, "/dev/proc-sys-kernel-random-boot-id");
1376         to = strappend(dest, "/proc/sys/kernel/random/boot_id");
1377         if (!from || !to)
1378                 return log_oom();
1379
1380         r = sd_id128_randomize(&rnd);
1381         if (r < 0)
1382                 return log_error_errno(r, "Failed to generate random boot id: %m");
1383
1384         id128_format_as_uuid(rnd, as_uuid);
1385
1386         r = write_string_file(from, as_uuid);
1387         if (r < 0)
1388                 return log_error_errno(r, "Failed to write boot id: %m");
1389
1390         if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
1391                 log_error_errno(errno, "Failed to bind mount boot id: %m");
1392                 r = -errno;
1393         } else if (mount(from, to, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL))
1394                 log_warning_errno(errno, "Failed to make boot id read-only: %m");
1395
1396         unlink(from);
1397         return r;
1398 }
1399
1400 static int copy_devnodes(const char *dest) {
1401
1402         static const char devnodes[] =
1403                 "null\0"
1404                 "zero\0"
1405                 "full\0"
1406                 "random\0"
1407                 "urandom\0"
1408                 "tty\0"
1409                 "net/tun\0";
1410
1411         const char *d;
1412         int r = 0;
1413         _cleanup_umask_ mode_t u;
1414
1415         assert(dest);
1416
1417         u = umask(0000);
1418
1419         NULSTR_FOREACH(d, devnodes) {
1420                 _cleanup_free_ char *from = NULL, *to = NULL;
1421                 struct stat st;
1422
1423                 from = strappend("/dev/", d);
1424                 to = strjoin(dest, "/dev/", d, NULL);
1425                 if (!from || !to)
1426                         return log_oom();
1427
1428                 if (stat(from, &st) < 0) {
1429
1430                         if (errno != ENOENT)
1431                                 return log_error_errno(errno, "Failed to stat %s: %m", from);
1432
1433                 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
1434
1435                         log_error("%s is not a char or block device, cannot copy", from);
1436                         return -EIO;
1437
1438                 } else {
1439                         r = mkdir_parents(to, 0775);
1440                         if (r < 0) {
1441                                 log_error_errno(r, "Failed to create parent directory of %s: %m", to);
1442                                 return -r;
1443                         }
1444
1445                         if (mknod(to, st.st_mode, st.st_rdev) < 0)
1446                                 return log_error_errno(errno, "mknod(%s) failed: %m", to);
1447                 }
1448         }
1449
1450         return r;
1451 }
1452
1453 static int setup_ptmx(const char *dest) {
1454         _cleanup_free_ char *p = NULL;
1455
1456         p = strappend(dest, "/dev/ptmx");
1457         if (!p)
1458                 return log_oom();
1459
1460         if (symlink("pts/ptmx", p) < 0)
1461                 return log_error_errno(errno, "Failed to create /dev/ptmx symlink: %m");
1462
1463         return 0;
1464 }
1465
1466 static int setup_dev_console(const char *dest, const char *console) {
1467         _cleanup_umask_ mode_t u;
1468         const char *to;
1469         struct stat st;
1470         int r;
1471
1472         assert(dest);
1473         assert(console);
1474
1475         u = umask(0000);
1476
1477         if (stat("/dev/null", &st) < 0)
1478                 return log_error_errno(errno, "Failed to stat /dev/null: %m");
1479
1480         r = chmod_and_chown(console, 0600, 0, 0);
1481         if (r < 0)
1482                 return log_error_errno(r, "Failed to correct access mode for TTY: %m");
1483
1484         /* We need to bind mount the right tty to /dev/console since
1485          * ptys can only exist on pts file systems. To have something
1486          * to bind mount things on we create a device node first, and
1487          * use /dev/null for that since we the cgroups device policy
1488          * allows us to create that freely, while we cannot create
1489          * /dev/console. (Note that the major minor doesn't actually
1490          * matter here, since we mount it over anyway). */
1491
1492         to = strjoina(dest, "/dev/console");
1493         if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0)
1494                 return log_error_errno(errno, "mknod() for /dev/console failed: %m");
1495
1496         if (mount(console, to, "bind", MS_BIND, NULL) < 0)
1497                 return log_error_errno(errno, "Bind mount for /dev/console failed: %m");
1498
1499         return 0;
1500 }
1501
1502 static int setup_kmsg(const char *dest, int kmsg_socket) {
1503         _cleanup_free_ char *from = NULL, *to = NULL;
1504         _cleanup_umask_ mode_t u;
1505         int r, fd, k;
1506         union {
1507                 struct cmsghdr cmsghdr;
1508                 uint8_t buf[CMSG_SPACE(sizeof(int))];
1509         } control = {};
1510         struct msghdr mh = {
1511                 .msg_control = &control,
1512                 .msg_controllen = sizeof(control),
1513         };
1514         struct cmsghdr *cmsg;
1515
1516         assert(dest);
1517         assert(kmsg_socket >= 0);
1518
1519         u = umask(0000);
1520
1521         /* We create the kmsg FIFO as /dev/kmsg, but immediately
1522          * delete it after bind mounting it to /proc/kmsg. While FIFOs
1523          * on the reading side behave very similar to /proc/kmsg,
1524          * their writing side behaves differently from /dev/kmsg in
1525          * that writing blocks when nothing is reading. In order to
1526          * avoid any problems with containers deadlocking due to this
1527          * we simply make /dev/kmsg unavailable to the container. */
1528         if (asprintf(&from, "%s/dev/kmsg", dest) < 0 ||
1529             asprintf(&to, "%s/proc/kmsg", dest) < 0)
1530                 return log_oom();
1531
1532         if (mkfifo(from, 0600) < 0)
1533                 return log_error_errno(errno, "mkfifo() for /dev/kmsg failed: %m");
1534
1535         r = chmod_and_chown(from, 0600, 0, 0);
1536         if (r < 0)
1537                 return log_error_errno(r, "Failed to correct access mode for /dev/kmsg: %m");
1538
1539         if (mount(from, to, "bind", MS_BIND, NULL) < 0)
1540                 return log_error_errno(errno, "Bind mount for /proc/kmsg failed: %m");
1541
1542         fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
1543         if (fd < 0)
1544                 return log_error_errno(errno, "Failed to open fifo: %m");
1545
1546         cmsg = CMSG_FIRSTHDR(&mh);
1547         cmsg->cmsg_level = SOL_SOCKET;
1548         cmsg->cmsg_type = SCM_RIGHTS;
1549         cmsg->cmsg_len = CMSG_LEN(sizeof(int));
1550         memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
1551
1552         mh.msg_controllen = cmsg->cmsg_len;
1553
1554         /* Store away the fd in the socket, so that it stays open as
1555          * long as we run the child */
1556         k = sendmsg(kmsg_socket, &mh, MSG_NOSIGNAL);
1557         safe_close(fd);
1558
1559         if (k < 0)
1560                 return log_error_errno(errno, "Failed to send FIFO fd: %m");
1561
1562         /* And now make the FIFO unavailable as /dev/kmsg... */
1563         unlink(from);
1564         return 0;
1565 }
1566
1567 static int send_rtnl(int send_fd) {
1568         union {
1569                 struct cmsghdr cmsghdr;
1570                 uint8_t buf[CMSG_SPACE(sizeof(int))];
1571         } control = {};
1572         struct msghdr mh = {
1573                 .msg_control = &control,
1574                 .msg_controllen = sizeof(control),
1575         };
1576         struct cmsghdr *cmsg;
1577         _cleanup_close_ int fd = -1;
1578         ssize_t k;
1579
1580         assert(send_fd >= 0);
1581
1582         if (!arg_expose_ports)
1583                 return 0;
1584
1585         fd = socket(PF_NETLINK, SOCK_RAW|SOCK_CLOEXEC|SOCK_NONBLOCK, NETLINK_ROUTE);
1586         if (fd < 0)
1587                 return log_error_errno(errno, "failed to allocate container netlink: %m");
1588
1589         cmsg = CMSG_FIRSTHDR(&mh);
1590         cmsg->cmsg_level = SOL_SOCKET;
1591         cmsg->cmsg_type = SCM_RIGHTS;
1592         cmsg->cmsg_len = CMSG_LEN(sizeof(int));
1593         memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
1594
1595         mh.msg_controllen = cmsg->cmsg_len;
1596
1597         /* Store away the fd in the socket, so that it stays open as
1598          * long as we run the child */
1599         k = sendmsg(send_fd, &mh, MSG_NOSIGNAL);
1600         if (k < 0)
1601                 return log_error_errno(errno, "Failed to send netlink fd: %m");
1602
1603         return 0;
1604 }
1605
1606 static int flush_ports(union in_addr_union *exposed) {
1607         ExposePort *p;
1608         int r, af = AF_INET;
1609
1610         assert(exposed);
1611
1612         if (!arg_expose_ports)
1613                 return 0;
1614
1615         if (in_addr_is_null(af, exposed))
1616                 return 0;
1617
1618         log_debug("Lost IP address.");
1619
1620         LIST_FOREACH(ports, p, arg_expose_ports) {
1621                 r = fw_add_local_dnat(false,
1622                                       af,
1623                                       p->protocol,
1624                                       NULL,
1625                                       NULL, 0,
1626                                       NULL, 0,
1627                                       p->host_port,
1628                                       exposed,
1629                                       p->container_port,
1630                                       NULL);
1631                 if (r < 0)
1632                         log_warning_errno(r, "Failed to modify firewall: %m");
1633         }
1634
1635         *exposed = IN_ADDR_NULL;
1636         return 0;
1637 }
1638
1639 static int expose_ports(sd_rtnl *rtnl, union in_addr_union *exposed) {
1640         _cleanup_free_ struct local_address *addresses = NULL;
1641         _cleanup_free_ char *pretty = NULL;
1642         union in_addr_union new_exposed;
1643         ExposePort *p;
1644         bool add;
1645         int af = AF_INET, r;
1646
1647         assert(exposed);
1648
1649         /* Invoked each time an address is added or removed inside the
1650          * container */
1651
1652         if (!arg_expose_ports)
1653                 return 0;
1654
1655         r = local_addresses(rtnl, 0, af, &addresses);
1656         if (r < 0)
1657                 return log_error_errno(r, "Failed to enumerate local addresses: %m");
1658
1659         add = r > 0 &&
1660                 addresses[0].family == af &&
1661                 addresses[0].scope < RT_SCOPE_LINK;
1662
1663         if (!add)
1664                 return flush_ports(exposed);
1665
1666         new_exposed = addresses[0].address;
1667         if (in_addr_equal(af, exposed, &new_exposed))
1668                 return 0;
1669
1670         in_addr_to_string(af, &new_exposed, &pretty);
1671         log_debug("New container IP is %s.", strna(pretty));
1672
1673         LIST_FOREACH(ports, p, arg_expose_ports) {
1674
1675                 r = fw_add_local_dnat(true,
1676                                       af,
1677                                       p->protocol,
1678                                       NULL,
1679                                       NULL, 0,
1680                                       NULL, 0,
1681                                       p->host_port,
1682                                       &new_exposed,
1683                                       p->container_port,
1684                                       in_addr_is_null(af, exposed) ? NULL : exposed);
1685                 if (r < 0)
1686                         log_warning_errno(r, "Failed to modify firewall: %m");
1687         }
1688
1689         *exposed = new_exposed;
1690         return 0;
1691 }
1692
1693 static int on_address_change(sd_rtnl *rtnl, sd_rtnl_message *m, void *userdata) {
1694         union in_addr_union *exposed = userdata;
1695
1696         assert(rtnl);
1697         assert(m);
1698         assert(exposed);
1699
1700         expose_ports(rtnl, exposed);
1701         return 0;
1702 }
1703
1704 static int watch_rtnl(sd_event *event, int recv_fd, union in_addr_union *exposed, sd_rtnl **ret) {
1705         union {
1706                 struct cmsghdr cmsghdr;
1707                 uint8_t buf[CMSG_SPACE(sizeof(int))];
1708         } control = {};
1709         struct msghdr mh = {
1710                 .msg_control = &control,
1711                 .msg_controllen = sizeof(control),
1712         };
1713         struct cmsghdr *cmsg;
1714         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1715         int fd, r;
1716         ssize_t k;
1717
1718         assert(event);
1719         assert(recv_fd >= 0);
1720         assert(ret);
1721
1722         if (!arg_expose_ports)
1723                 return 0;
1724
1725         k = recvmsg(recv_fd, &mh, MSG_NOSIGNAL);
1726         if (k < 0)
1727                 return log_error_errno(errno, "Failed to recv netlink fd: %m");
1728
1729         cmsg = CMSG_FIRSTHDR(&mh);
1730         assert(cmsg->cmsg_level == SOL_SOCKET);
1731         assert(cmsg->cmsg_type == SCM_RIGHTS);
1732         assert(cmsg->cmsg_len == CMSG_LEN(sizeof(int)));
1733         memcpy(&fd, CMSG_DATA(cmsg), sizeof(int));
1734
1735         r = sd_rtnl_open_fd(&rtnl, fd, 1, RTNLGRP_IPV4_IFADDR);
1736         if (r < 0) {
1737                 safe_close(fd);
1738                 return log_error_errno(r, "Failed to create rtnl object: %m");
1739         }
1740
1741         r = sd_rtnl_add_match(rtnl, RTM_NEWADDR, on_address_change, exposed);
1742         if (r < 0)
1743                 return log_error_errno(r, "Failed to subscribe to RTM_NEWADDR messages: %m");
1744
1745         r = sd_rtnl_add_match(rtnl, RTM_DELADDR, on_address_change, exposed);
1746         if (r < 0)
1747                 return log_error_errno(r, "Failed to subscribe to RTM_DELADDR messages: %m");
1748
1749         r = sd_rtnl_attach_event(rtnl, event, 0);
1750         if (r < 0)
1751                 return log_error_errno(r, "Failed to add to even loop: %m");
1752
1753         *ret = rtnl;
1754         rtnl = NULL;
1755
1756         return 0;
1757 }
1758
1759 static int setup_hostname(void) {
1760
1761         if (arg_share_system)
1762                 return 0;
1763
1764         if (sethostname_idempotent(arg_machine) < 0)
1765                 return -errno;
1766
1767         return 0;
1768 }
1769
1770 static int setup_journal(const char *directory) {
1771         sd_id128_t machine_id, this_id;
1772         _cleanup_free_ char *p = NULL, *b = NULL, *q = NULL, *d = NULL;
1773         char *id;
1774         int r;
1775
1776         /* Don't link journals in ephemeral mode */
1777         if (arg_ephemeral)
1778                 return 0;
1779
1780         p = strappend(directory, "/etc/machine-id");
1781         if (!p)
1782                 return log_oom();
1783
1784         r = read_one_line_file(p, &b);
1785         if (r == -ENOENT && arg_link_journal == LINK_AUTO)
1786                 return 0;
1787         else if (r < 0)
1788                 return log_error_errno(r, "Failed to read machine ID from %s: %m", p);
1789
1790         id = strstrip(b);
1791         if (isempty(id) && arg_link_journal == LINK_AUTO)
1792                 return 0;
1793
1794         /* Verify validity */
1795         r = sd_id128_from_string(id, &machine_id);
1796         if (r < 0)
1797                 return log_error_errno(r, "Failed to parse machine ID from %s: %m", p);
1798
1799         r = sd_id128_get_machine(&this_id);
1800         if (r < 0)
1801                 return log_error_errno(r, "Failed to retrieve machine ID: %m");
1802
1803         if (sd_id128_equal(machine_id, this_id)) {
1804                 log_full(arg_link_journal == LINK_AUTO ? LOG_WARNING : LOG_ERR,
1805                          "Host and machine ids are equal (%s): refusing to link journals", id);
1806                 if (arg_link_journal == LINK_AUTO)
1807                         return 0;
1808                 return -EEXIST;
1809         }
1810
1811         if (arg_link_journal == LINK_NO)
1812                 return 0;
1813
1814         free(p);
1815         p = strappend("/var/log/journal/", id);
1816         q = strjoin(directory, "/var/log/journal/", id, NULL);
1817         if (!p || !q)
1818                 return log_oom();
1819
1820         if (path_is_mount_point(p, false) > 0) {
1821                 if (arg_link_journal != LINK_AUTO) {
1822                         log_error("%s: already a mount point, refusing to use for journal", p);
1823                         return -EEXIST;
1824                 }
1825
1826                 return 0;
1827         }
1828
1829         if (path_is_mount_point(q, false) > 0) {
1830                 if (arg_link_journal != LINK_AUTO) {
1831                         log_error("%s: already a mount point, refusing to use for journal", q);
1832                         return -EEXIST;
1833                 }
1834
1835                 return 0;
1836         }
1837
1838         r = readlink_and_make_absolute(p, &d);
1839         if (r >= 0) {
1840                 if ((arg_link_journal == LINK_GUEST ||
1841                      arg_link_journal == LINK_AUTO) &&
1842                     path_equal(d, q)) {
1843
1844                         r = mkdir_p(q, 0755);
1845                         if (r < 0)
1846                                 log_warning_errno(errno, "Failed to create directory %s: %m", q);
1847                         return 0;
1848                 }
1849
1850                 if (unlink(p) < 0)
1851                         return log_error_errno(errno, "Failed to remove symlink %s: %m", p);
1852         } else if (r == -EINVAL) {
1853
1854                 if (arg_link_journal == LINK_GUEST &&
1855                     rmdir(p) < 0) {
1856
1857                         if (errno == ENOTDIR) {
1858                                 log_error("%s already exists and is neither a symlink nor a directory", p);
1859                                 return r;
1860                         } else {
1861                                 log_error_errno(errno, "Failed to remove %s: %m", p);
1862                                 return -errno;
1863                         }
1864                 }
1865         } else if (r != -ENOENT) {
1866                 log_error_errno(errno, "readlink(%s) failed: %m", p);
1867                 return r;
1868         }
1869
1870         if (arg_link_journal == LINK_GUEST) {
1871
1872                 if (symlink(q, p) < 0) {
1873                         if (arg_link_journal_try) {
1874                                 log_debug_errno(errno, "Failed to symlink %s to %s, skipping journal setup: %m", q, p);
1875                                 return 0;
1876                         } else {
1877                                 log_error_errno(errno, "Failed to symlink %s to %s: %m", q, p);
1878                                 return -errno;
1879                         }
1880                 }
1881
1882                 r = mkdir_p(q, 0755);
1883                 if (r < 0)
1884                         log_warning_errno(errno, "Failed to create directory %s: %m", q);
1885                 return 0;
1886         }
1887
1888         if (arg_link_journal == LINK_HOST) {
1889                 /* don't create parents here -- if the host doesn't have
1890                  * permanent journal set up, don't force it here */
1891                 r = mkdir(p, 0755);
1892                 if (r < 0) {
1893                         if (arg_link_journal_try) {
1894                                 log_debug_errno(errno, "Failed to create %s, skipping journal setup: %m", p);
1895                                 return 0;
1896                         } else {
1897                                 log_error_errno(errno, "Failed to create %s: %m", p);
1898                                 return r;
1899                         }
1900                 }
1901
1902         } else if (access(p, F_OK) < 0)
1903                 return 0;
1904
1905         if (dir_is_empty(q) == 0)
1906                 log_warning("%s is not empty, proceeding anyway.", q);
1907
1908         r = mkdir_p(q, 0755);
1909         if (r < 0) {
1910                 log_error_errno(errno, "Failed to create %s: %m", q);
1911                 return r;
1912         }
1913
1914         if (mount(p, q, "bind", MS_BIND, NULL) < 0)
1915                 return log_error_errno(errno, "Failed to bind mount journal from host into guest: %m");
1916
1917         return 0;
1918 }
1919
1920 static int drop_capabilities(void) {
1921         return capability_bounding_set_drop(~arg_retain, false);
1922 }
1923
1924 static int register_machine(pid_t pid, int local_ifindex) {
1925         _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1926         _cleanup_bus_close_unref_ sd_bus *bus = NULL;
1927         int r;
1928
1929         if (!arg_register)
1930                 return 0;
1931
1932         r = sd_bus_default_system(&bus);
1933         if (r < 0)
1934                 return log_error_errno(r, "Failed to open system bus: %m");
1935
1936         if (arg_keep_unit) {
1937                 r = sd_bus_call_method(
1938                                 bus,
1939                                 "org.freedesktop.machine1",
1940                                 "/org/freedesktop/machine1",
1941                                 "org.freedesktop.machine1.Manager",
1942                                 "RegisterMachineWithNetwork",
1943                                 &error,
1944                                 NULL,
1945                                 "sayssusai",
1946                                 arg_machine,
1947                                 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1948                                 "nspawn",
1949                                 "container",
1950                                 (uint32_t) pid,
1951                                 strempty(arg_directory),
1952                                 local_ifindex > 0 ? 1 : 0, local_ifindex);
1953         } else {
1954                 _cleanup_bus_message_unref_ sd_bus_message *m = NULL;
1955                 char **i;
1956
1957                 r = sd_bus_message_new_method_call(
1958                                 bus,
1959                                 &m,
1960                                 "org.freedesktop.machine1",
1961                                 "/org/freedesktop/machine1",
1962                                 "org.freedesktop.machine1.Manager",
1963                                 "CreateMachineWithNetwork");
1964                 if (r < 0)
1965                         return bus_log_create_error(r);
1966
1967                 r = sd_bus_message_append(
1968                                 m,
1969                                 "sayssusai",
1970                                 arg_machine,
1971                                 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1972                                 "nspawn",
1973                                 "container",
1974                                 (uint32_t) pid,
1975                                 strempty(arg_directory),
1976                                 local_ifindex > 0 ? 1 : 0, local_ifindex);
1977                 if (r < 0)
1978                         return bus_log_create_error(r);
1979
1980                 r = sd_bus_message_open_container(m, 'a', "(sv)");
1981                 if (r < 0)
1982                         return bus_log_create_error(r);
1983
1984                 if (!isempty(arg_slice)) {
1985                         r = sd_bus_message_append(m, "(sv)", "Slice", "s", arg_slice);
1986                         if (r < 0)
1987                                 return bus_log_create_error(r);
1988                 }
1989
1990                 r = sd_bus_message_append(m, "(sv)", "DevicePolicy", "s", "strict");
1991                 if (r < 0)
1992                         return bus_log_create_error(r);
1993
1994                 r = sd_bus_message_append(m, "(sv)", "DeviceAllow", "a(ss)", 9,
1995                                           /* Allow the container to
1996                                            * access and create the API
1997                                            * device nodes, so that
1998                                            * PrivateDevices= in the
1999                                            * container can work
2000                                            * fine */
2001                                           "/dev/null", "rwm",
2002                                           "/dev/zero", "rwm",
2003                                           "/dev/full", "rwm",
2004                                           "/dev/random", "rwm",
2005                                           "/dev/urandom", "rwm",
2006                                           "/dev/tty", "rwm",
2007                                           "/dev/net/tun", "rwm",
2008                                           /* Allow the container
2009                                            * access to ptys. However,
2010                                            * do not permit the
2011                                            * container to ever create
2012                                            * these device nodes. */
2013                                           "/dev/pts/ptmx", "rw",
2014                                           "char-pts", "rw");
2015                 if (r < 0)
2016                         return log_error_errno(r, "Failed to add device whitelist: %m");
2017
2018                 STRV_FOREACH(i, arg_property) {
2019                         r = sd_bus_message_open_container(m, 'r', "sv");
2020                         if (r < 0)
2021                                 return bus_log_create_error(r);
2022
2023                         r = bus_append_unit_property_assignment(m, *i);
2024                         if (r < 0)
2025                                 return r;
2026
2027                         r = sd_bus_message_close_container(m);
2028                         if (r < 0)
2029                                 return bus_log_create_error(r);
2030                 }
2031
2032                 r = sd_bus_message_close_container(m);
2033                 if (r < 0)
2034                         return bus_log_create_error(r);
2035
2036                 r = sd_bus_call(bus, m, 0, &error, NULL);
2037         }
2038
2039         if (r < 0) {
2040                 log_error("Failed to register machine: %s", bus_error_message(&error, r));
2041                 return r;
2042         }
2043
2044         return 0;
2045 }
2046
2047 static int terminate_machine(pid_t pid) {
2048         _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
2049         _cleanup_bus_message_unref_ sd_bus_message *reply = NULL;
2050         _cleanup_bus_close_unref_ sd_bus *bus = NULL;
2051         const char *path;
2052         int r;
2053
2054         if (!arg_register)
2055                 return 0;
2056
2057         r = sd_bus_default_system(&bus);
2058         if (r < 0)
2059                 return log_error_errno(r, "Failed to open system bus: %m");
2060
2061         r = sd_bus_call_method(
2062                         bus,
2063                         "org.freedesktop.machine1",
2064                         "/org/freedesktop/machine1",
2065                         "org.freedesktop.machine1.Manager",
2066                         "GetMachineByPID",
2067                         &error,
2068                         &reply,
2069                         "u",
2070                         (uint32_t) pid);
2071         if (r < 0) {
2072                 /* Note that the machine might already have been
2073                  * cleaned up automatically, hence don't consider it a
2074                  * failure if we cannot get the machine object. */
2075                 log_debug("Failed to get machine: %s", bus_error_message(&error, r));
2076                 return 0;
2077         }
2078
2079         r = sd_bus_message_read(reply, "o", &path);
2080         if (r < 0)
2081                 return bus_log_parse_error(r);
2082
2083         r = sd_bus_call_method(
2084                         bus,
2085                         "org.freedesktop.machine1",
2086                         path,
2087                         "org.freedesktop.machine1.Machine",
2088                         "Terminate",
2089                         &error,
2090                         NULL,
2091                         NULL);
2092         if (r < 0) {
2093                 log_debug("Failed to terminate machine: %s", bus_error_message(&error, r));
2094                 return 0;
2095         }
2096
2097         return 0;
2098 }
2099
2100 static int reset_audit_loginuid(void) {
2101         _cleanup_free_ char *p = NULL;
2102         int r;
2103
2104         if (arg_share_system)
2105                 return 0;
2106
2107         r = read_one_line_file("/proc/self/loginuid", &p);
2108         if (r == -ENOENT)
2109                 return 0;
2110         if (r < 0)
2111                 return log_error_errno(r, "Failed to read /proc/self/loginuid: %m");
2112
2113         /* Already reset? */
2114         if (streq(p, "4294967295"))
2115                 return 0;
2116
2117         r = write_string_file("/proc/self/loginuid", "4294967295");
2118         if (r < 0) {
2119                 log_error("Failed to reset audit login UID. This probably means that your kernel is too\n"
2120                           "old and you have audit enabled. Note that the auditing subsystem is known to\n"
2121                           "be incompatible with containers on old kernels. Please make sure to upgrade\n"
2122                           "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
2123                           "using systemd-nspawn. Sleeping for 5s... (%s)\n", strerror(-r));
2124
2125                 sleep(5);
2126         }
2127
2128         return 0;
2129 }
2130
/* Fixed 128bit keys, passed as hash_key to generate_mac(), which hands
 * them to siphash24(). HOST_HASH_KEY and CONTAINER_HASH_KEY are used for
 * the host and the container side of the veth link respectively.
 * MACVLAN_HASH_KEY is presumably used for macvlan interfaces -- its user
 * is not nearby, verify before relying on that. */
#define HOST_HASH_KEY SD_ID128_MAKE(1a,37,6f,c7,46,ec,45,0b,ad,a3,d5,31,06,60,5d,b1)
#define CONTAINER_HASH_KEY SD_ID128_MAKE(c3,c4,f9,19,b5,57,b2,1c,e6,cf,14,27,03,9c,ee,a2)
#define MACVLAN_HASH_KEY SD_ID128_MAKE(00,13,6d,bc,66,83,44,81,bb,0c,f9,51,1f,24,a6,6f)
2134
2135 static int generate_mac(struct ether_addr *mac, sd_id128_t hash_key, uint64_t idx) {
2136         uint8_t result[8];
2137         size_t l, sz;
2138         uint8_t *v, *i;
2139         int r;
2140
2141         l = strlen(arg_machine);
2142         sz = sizeof(sd_id128_t) + l;
2143         if (idx > 0)
2144                 sz += sizeof(idx);
2145
2146         v = alloca(sz);
2147
2148         /* fetch some persistent data unique to the host */
2149         r = sd_id128_get_machine((sd_id128_t*) v);
2150         if (r < 0)
2151                 return r;
2152
2153         /* combine with some data unique (on this host) to this
2154          * container instance */
2155         i = mempcpy(v + sizeof(sd_id128_t), arg_machine, l);
2156         if (idx > 0) {
2157                 idx = htole64(idx);
2158                 memcpy(i, &idx, sizeof(idx));
2159         }
2160
2161         /* Let's hash the host machine ID plus the container name. We
2162          * use a fixed, but originally randomly created hash key here. */
2163         siphash24(result, v, sz, hash_key.bytes);
2164
2165         assert_cc(ETH_ALEN <= sizeof(result));
2166         memcpy(mac->ether_addr_octet, result, ETH_ALEN);
2167
2168         /* see eth_random_addr in the kernel */
2169         mac->ether_addr_octet[0] &= 0xfe;        /* clear multicast bit */
2170         mac->ether_addr_octet[0] |= 0x02;        /* set local assignment bit (IEEE802) */
2171
2172         return 0;
2173 }
2174
2175 static int setup_veth(pid_t pid, char iface_name[IFNAMSIZ], int *ifi) {
2176         _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
2177         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
2178         struct ether_addr mac_host, mac_container;
2179         int r, i;
2180
2181         if (!arg_private_network)
2182                 return 0;
2183
2184         if (!arg_network_veth)
2185                 return 0;
2186
2187         /* Use two different interface name prefixes depending whether
2188          * we are in bridge mode or not. */
2189         snprintf(iface_name, IFNAMSIZ - 1, "%s-%s",
2190                  arg_network_bridge ? "vb" : "ve", arg_machine);
2191
2192         r = generate_mac(&mac_container, CONTAINER_HASH_KEY, 0);
2193         if (r < 0)
2194                 return log_error_errno(r, "Failed to generate predictable MAC address for container side: %m");
2195
2196         r = generate_mac(&mac_host, HOST_HASH_KEY, 0);
2197         if (r < 0)
2198                 return log_error_errno(r, "Failed to generate predictable MAC address for host side: %m");
2199
2200         r = sd_rtnl_open(&rtnl, 0);
2201         if (r < 0)
2202                 return log_error_errno(r, "Failed to connect to netlink: %m");
2203
2204         r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
2205         if (r < 0)
2206                 return log_error_errno(r, "Failed to allocate netlink message: %m");
2207
2208         r = sd_rtnl_message_append_string(m, IFLA_IFNAME, iface_name);
2209         if (r < 0)
2210                 return log_error_errno(r, "Failed to add netlink interface name: %m");
2211
2212         r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac_host);
2213         if (r < 0)
2214                 return log_error_errno(r, "Failed to add netlink MAC address: %m");
2215
2216         r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
2217         if (r < 0)
2218                 return log_error_errno(r, "Failed to open netlink container: %m");
2219
2220         r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "veth");
2221         if (r < 0)
2222                 return log_error_errno(r, "Failed to open netlink container: %m");
2223
2224         r = sd_rtnl_message_open_container(m, VETH_INFO_PEER);
2225         if (r < 0)
2226                 return log_error_errno(r, "Failed to open netlink container: %m");
2227
2228         r = sd_rtnl_message_append_string(m, IFLA_IFNAME, "host0");
2229         if (r < 0)
2230                 return log_error_errno(r, "Failed to add netlink interface name: %m");
2231
2232         r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac_container);
2233         if (r < 0)
2234                 return log_error_errno(r, "Failed to add netlink MAC address: %m");
2235
2236         r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
2237         if (r < 0)
2238                 return log_error_errno(r, "Failed to add netlink namespace field: %m");
2239
2240         r = sd_rtnl_message_close_container(m);
2241         if (r < 0)
2242                 return log_error_errno(r, "Failed to close netlink container: %m");
2243
2244         r = sd_rtnl_message_close_container(m);
2245         if (r < 0)
2246                 return log_error_errno(r, "Failed to close netlink container: %m");
2247
2248         r = sd_rtnl_message_close_container(m);
2249         if (r < 0)
2250                 return log_error_errno(r, "Failed to close netlink container: %m");
2251
2252         r = sd_rtnl_call(rtnl, m, 0, NULL);
2253         if (r < 0)
2254                 return log_error_errno(r, "Failed to add new veth interfaces: %m");
2255
2256         i = (int) if_nametoindex(iface_name);
2257         if (i <= 0)
2258                 return log_error_errno(errno, "Failed to resolve interface %s: %m", iface_name);
2259
2260         *ifi = i;
2261
2262         return 0;
2263 }
2264
2265 static int setup_bridge(const char veth_name[], int *ifi) {
2266         _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
2267         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
2268         int r, bridge;
2269
2270         if (!arg_private_network)
2271                 return 0;
2272
2273         if (!arg_network_veth)
2274                 return 0;
2275
2276         if (!arg_network_bridge)
2277                 return 0;
2278
2279         bridge = (int) if_nametoindex(arg_network_bridge);
2280         if (bridge <= 0)
2281                 return log_error_errno(errno, "Failed to resolve interface %s: %m", arg_network_bridge);
2282
2283         *ifi = bridge;
2284
2285         r = sd_rtnl_open(&rtnl, 0);
2286         if (r < 0)
2287                 return log_error_errno(r, "Failed to connect to netlink: %m");
2288
2289         r = sd_rtnl_message_new_link(rtnl, &m, RTM_SETLINK, 0);
2290         if (r < 0)
2291                 return log_error_errno(r, "Failed to allocate netlink message: %m");
2292
2293         r = sd_rtnl_message_link_set_flags(m, IFF_UP, IFF_UP);
2294         if (r < 0)
2295                 return log_error_errno(r, "Failed to set IFF_UP flag: %m");
2296
2297         r = sd_rtnl_message_append_string(m, IFLA_IFNAME, veth_name);
2298         if (r < 0)
2299                 return log_error_errno(r, "Failed to add netlink interface name field: %m");
2300
2301         r = sd_rtnl_message_append_u32(m, IFLA_MASTER, bridge);
2302         if (r < 0)
2303                 return log_error_errno(r, "Failed to add netlink master field: %m");
2304
2305         r = sd_rtnl_call(rtnl, m, 0, NULL);
2306         if (r < 0)
2307                 return log_error_errno(r, "Failed to add veth interface to bridge: %m");
2308
2309         return 0;
2310 }
2311
2312 static int parse_interface(struct udev *udev, const char *name) {
2313         _cleanup_udev_device_unref_ struct udev_device *d = NULL;
2314         char ifi_str[2 + DECIMAL_STR_MAX(int)];
2315         int ifi;
2316
2317         ifi = (int) if_nametoindex(name);
2318         if (ifi <= 0)
2319                 return log_error_errno(errno, "Failed to resolve interface %s: %m", name);
2320
2321         sprintf(ifi_str, "n%i", ifi);
2322         d = udev_device_new_from_device_id(udev, ifi_str);
2323         if (!d)
2324                 return log_error_errno(errno, "Failed to get udev device for interface %s: %m", name);
2325
2326         if (udev_device_get_is_initialized(d) <= 0) {
2327                 log_error("Network interface %s is not initialized yet.", name);
2328                 return -EBUSY;
2329         }
2330
2331         return ifi;
2332 }
2333
2334 static int move_network_interfaces(pid_t pid) {
2335         _cleanup_udev_unref_ struct udev *udev = NULL;
2336         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
2337         char **i;
2338         int r;
2339
2340         if (!arg_private_network)
2341                 return 0;
2342
2343         if (strv_isempty(arg_network_interfaces))
2344                 return 0;
2345
2346         r = sd_rtnl_open(&rtnl, 0);
2347         if (r < 0)
2348                 return log_error_errno(r, "Failed to connect to netlink: %m");
2349
2350         udev = udev_new();
2351         if (!udev) {
2352                 log_error("Failed to connect to udev.");
2353                 return -ENOMEM;
2354         }
2355
2356         STRV_FOREACH(i, arg_network_interfaces) {
2357                 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
2358                 int ifi;
2359
2360                 ifi = parse_interface(udev, *i);
2361                 if (ifi < 0)
2362                         return ifi;
2363
2364                 r = sd_rtnl_message_new_link(rtnl, &m, RTM_SETLINK, ifi);
2365                 if (r < 0)
2366                         return log_error_errno(r, "Failed to allocate netlink message: %m");
2367
2368                 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
2369                 if (r < 0)
2370                         return log_error_errno(r, "Failed to append namespace PID to netlink message: %m");
2371
2372                 r = sd_rtnl_call(rtnl, m, 0, NULL);
2373                 if (r < 0)
2374                         return log_error_errno(r, "Failed to move interface %s to namespace: %m", *i);
2375         }
2376
2377         return 0;
2378 }
2379
2380 static int setup_macvlan(pid_t pid) {
2381         _cleanup_udev_unref_ struct udev *udev = NULL;
2382         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
2383         unsigned idx = 0;
2384         char **i;
2385         int r;
2386
2387         if (!arg_private_network)
2388                 return 0;
2389
2390         if (strv_isempty(arg_network_macvlan))
2391                 return 0;
2392
2393         r = sd_rtnl_open(&rtnl, 0);
2394         if (r < 0)
2395                 return log_error_errno(r, "Failed to connect to netlink: %m");
2396
2397         udev = udev_new();
2398         if (!udev) {
2399                 log_error("Failed to connect to udev.");
2400                 return -ENOMEM;
2401         }
2402
2403         STRV_FOREACH(i, arg_network_macvlan) {
2404                 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
2405                 _cleanup_free_ char *n = NULL;
2406                 struct ether_addr mac;
2407                 int ifi;
2408
2409                 ifi = parse_interface(udev, *i);
2410                 if (ifi < 0)
2411                         return ifi;
2412
2413                 r = generate_mac(&mac, MACVLAN_HASH_KEY, idx++);
2414                 if (r < 0)
2415                         return log_error_errno(r, "Failed to create MACVLAN MAC address: %m");
2416
2417                 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
2418                 if (r < 0)
2419                         return log_error_errno(r, "Failed to allocate netlink message: %m");
2420
2421                 r = sd_rtnl_message_append_u32(m, IFLA_LINK, ifi);
2422                 if (r < 0)
2423                         return log_error_errno(r, "Failed to add netlink interface index: %m");
2424
2425                 n = strappend("mv-", *i);
2426                 if (!n)
2427                         return log_oom();
2428
2429                 strshorten(n, IFNAMSIZ-1);
2430
2431                 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, n);
2432                 if (r < 0)
2433                         return log_error_errno(r, "Failed to add netlink interface name: %m");
2434
2435                 r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac);
2436                 if (r < 0)
2437                         return log_error_errno(r, "Failed to add netlink MAC address: %m");
2438
2439                 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
2440                 if (r < 0)
2441                         return log_error_errno(r, "Failed to add netlink namespace field: %m");
2442
2443                 r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
2444                 if (r < 0)
2445                         return log_error_errno(r, "Failed to open netlink container: %m");
2446
2447                 r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "macvlan");
2448                 if (r < 0)
2449                         return log_error_errno(r, "Failed to open netlink container: %m");
2450
2451                 r = sd_rtnl_message_append_u32(m, IFLA_MACVLAN_MODE, MACVLAN_MODE_BRIDGE);
2452                 if (r < 0)
2453                         return log_error_errno(r, "Failed to append macvlan mode: %m");
2454
2455                 r = sd_rtnl_message_close_container(m);
2456                 if (r < 0)
2457                         return log_error_errno(r, "Failed to close netlink container: %m");
2458
2459                 r = sd_rtnl_message_close_container(m);
2460                 if (r < 0)
2461                         return log_error_errno(r, "Failed to close netlink container: %m");
2462
2463                 r = sd_rtnl_call(rtnl, m, 0, NULL);
2464                 if (r < 0)
2465                         return log_error_errno(r, "Failed to add new macvlan interfaces: %m");
2466         }
2467
2468         return 0;
2469 }
2470
2471 static int setup_ipvlan(pid_t pid) {
2472         _cleanup_udev_unref_ struct udev *udev = NULL;
2473         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
2474         char **i;
2475         int r;
2476
2477         if (!arg_private_network)
2478                 return 0;
2479
2480         if (strv_isempty(arg_network_ipvlan))
2481                 return 0;
2482
2483         r = sd_rtnl_open(&rtnl, 0);
2484         if (r < 0)
2485                 return log_error_errno(r, "Failed to connect to netlink: %m");
2486
2487         udev = udev_new();
2488         if (!udev) {
2489                 log_error("Failed to connect to udev.");
2490                 return -ENOMEM;
2491         }
2492
2493         STRV_FOREACH(i, arg_network_ipvlan) {
2494                 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
2495                 _cleanup_free_ char *n = NULL;
2496                 int ifi;
2497
2498                 ifi = parse_interface(udev, *i);
2499                 if (ifi < 0)
2500                         return ifi;
2501
2502                 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
2503                 if (r < 0)
2504                         return log_error_errno(r, "Failed to allocate netlink message: %m");
2505
2506                 r = sd_rtnl_message_append_u32(m, IFLA_LINK, ifi);
2507                 if (r < 0)
2508                         return log_error_errno(r, "Failed to add netlink interface index: %m");
2509
2510                 n = strappend("iv-", *i);
2511                 if (!n)
2512                         return log_oom();
2513
2514                 strshorten(n, IFNAMSIZ-1);
2515
2516                 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, n);
2517                 if (r < 0)
2518                         return log_error_errno(r, "Failed to add netlink interface name: %m");
2519
2520                 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
2521                 if (r < 0)
2522                         return log_error_errno(r, "Failed to add netlink namespace field: %m");
2523
2524                 r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
2525                 if (r < 0)
2526                         return log_error_errno(r, "Failed to open netlink container: %m");
2527
2528                 r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "ipvlan");
2529                 if (r < 0)
2530                         return log_error_errno(r, "Failed to open netlink container: %m");
2531
2532                 r = sd_rtnl_message_append_u16(m, IFLA_IPVLAN_MODE, IPVLAN_MODE_L2);
2533                 if (r < 0)
2534                         return log_error_errno(r, "Failed to add ipvlan mode: %m");
2535
2536                 r = sd_rtnl_message_close_container(m);
2537                 if (r < 0)
2538                         return log_error_errno(r, "Failed to close netlink container: %m");
2539
2540                 r = sd_rtnl_message_close_container(m);
2541                 if (r < 0)
2542                         return log_error_errno(r, "Failed to close netlink container: %m");
2543
2544                 r = sd_rtnl_call(rtnl, m, 0, NULL);
2545                 if (r < 0)
2546                         return log_error_errno(r, "Failed to add new ipvlan interfaces: %m");
2547         }
2548
2549         return 0;
2550 }
2551
/* Builds and loads the seccomp filter applied to the container payload.
 *
 * The filter allows everything by default and then makes a fixed set of
 * system calls fail with EPERM, additionally the kernel module calls unless
 * CAP_SYS_MODULE is in arg_retain, and makes creation of audit netlink
 * sockets fail with EAFNOSUPPORT.
 *
 * Returns the result of the last libseccomp call (negative on failure, after
 * logging). When built without HAVE_SECCOMP this does nothing and returns 0. */
static int setup_seccomp(void) {

#ifdef HAVE_SECCOMP
        /* Always refused */
        static const int blacklist[] = {
                SCMP_SYS(kexec_load),
                SCMP_SYS(open_by_handle_at),
                SCMP_SYS(iopl),
                SCMP_SYS(ioperm),
                SCMP_SYS(swapon),
                SCMP_SYS(swapoff),
        };

        /* Refused only if CAP_SYS_MODULE is not retained */
        static const int kmod_blacklist[] = {
                SCMP_SYS(init_module),
                SCMP_SYS(finit_module),
                SCMP_SYS(delete_module),
        };

        scmp_filter_ctx ctx;
        unsigned k;
        int r;

        ctx = seccomp_init(SCMP_ACT_ALLOW);
        if (!ctx)
                return log_oom();

        r = seccomp_add_secondary_archs(ctx);
        if (r < 0) {
                log_error_errno(r, "Failed to add secondary archs to seccomp filter: %m");
                goto finish;
        }

        for (k = 0; k < ELEMENTSOF(blacklist); k++) {
                r = seccomp_rule_add(ctx, SCMP_ACT_ERRNO(EPERM), blacklist[k], 0);

                /* -EFAULT means the syscall is unknown here; skip it */
                if (r >= 0 || r == -EFAULT)
                        continue;

                log_error_errno(r, "Failed to block syscall: %m");
                goto finish;
        }

        /* Without CAP_SYS_MODULE the payload has no business touching
         * kernel modules, so refuse those calls as well. */
        if ((arg_retain & (1ULL << CAP_SYS_MODULE)) == 0) {
                for (k = 0; k < ELEMENTSOF(kmod_blacklist); k++) {
                        r = seccomp_rule_add(ctx, SCMP_ACT_ERRNO(EPERM), kmod_blacklist[k], 0);

                        /* -EFAULT means the syscall is unknown here; skip it */
                        if (r >= 0 || r == -EFAULT)
                                continue;

                        log_error_errno(r, "Failed to block syscall: %m");
                        goto finish;
                }
        }

        /* Audit does not work inside containers and much of the audit
         * userspace fails when run there. Rather than care, refuse to
         * create audit sockets at all:
         * socket(AF_NETLINK, *, NETLINK_AUDIT) fails with EAFNOSUPPORT,
         * which audit userspace takes as "audit is disabled in the
         * kernel". */
        r = seccomp_rule_add(
                        ctx,
                        SCMP_ACT_ERRNO(EAFNOSUPPORT),
                        SCMP_SYS(socket),
                        2,
                        SCMP_A0(SCMP_CMP_EQ, AF_NETLINK),
                        SCMP_A2(SCMP_CMP_EQ, NETLINK_AUDIT));
        if (r < 0) {
                log_error_errno(r, "Failed to add audit seccomp rule: %m");
                goto finish;
        }

        r = seccomp_attr_set(ctx, SCMP_FLTATR_CTL_NNP, 0);
        if (r < 0) {
                log_error_errno(r, "Failed to unset NO_NEW_PRIVS: %m");
                goto finish;
        }

        r = seccomp_load(ctx);
        if (r < 0)
                log_error_errno(r, "Failed to install seccomp audit filter: %m");

finish:
        /* The context is released on every path once it has been created */
        seccomp_release(ctx);
        return r;
#else
        return 0;
#endif

}
2648
2649 static int setup_propagate(const char *root) {
2650         const char *p, *q;
2651
2652         (void) mkdir_p("/run/systemd/nspawn/", 0755);
2653         (void) mkdir_p("/run/systemd/nspawn/propagate", 0600);
2654         p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
2655         (void) mkdir_p(p, 0600);
2656
2657         q = strjoina(root, "/run/systemd/nspawn/incoming");
2658         mkdir_parents(q, 0755);
2659         mkdir_p(q, 0600);
2660
2661         if (mount(p, q, NULL, MS_BIND, NULL) < 0)
2662                 return log_error_errno(errno, "Failed to install propagation bind mount.");
2663
2664         if (mount(NULL, q, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY, NULL) < 0)
2665                 return log_error_errno(errno, "Failed to make propagation mount read-only");
2666
2667         return 0;
2668 }
2669
2670 static int setup_image(char **device_path, int *loop_nr) {
2671         struct loop_info64 info = {
2672                 .lo_flags = LO_FLAGS_AUTOCLEAR|LO_FLAGS_PARTSCAN
2673         };
2674         _cleanup_close_ int fd = -1, control = -1, loop = -1;
2675         _cleanup_free_ char* loopdev = NULL;
2676         struct stat st;
2677         int r, nr;
2678
2679         assert(device_path);
2680         assert(loop_nr);
2681         assert(arg_image);
2682
2683         fd = open(arg_image, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
2684         if (fd < 0)
2685                 return log_error_errno(errno, "Failed to open %s: %m", arg_image);
2686
2687         if (fstat(fd, &st) < 0)
2688                 return log_error_errno(errno, "Failed to stat %s: %m", arg_image);
2689
2690         if (S_ISBLK(st.st_mode)) {
2691                 char *p;
2692
2693                 p = strdup(arg_image);
2694                 if (!p)
2695                         return log_oom();
2696
2697                 *device_path = p;
2698
2699                 *loop_nr = -1;
2700
2701                 r = fd;
2702                 fd = -1;
2703
2704                 return r;
2705         }
2706
2707         if (!S_ISREG(st.st_mode)) {
2708                 log_error_errno(errno, "%s is not a regular file or block device: %m", arg_image);
2709                 return -EINVAL;
2710         }
2711
2712         control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
2713         if (control < 0)
2714                 return log_error_errno(errno, "Failed to open /dev/loop-control: %m");
2715
2716         nr = ioctl(control, LOOP_CTL_GET_FREE);
2717         if (nr < 0)
2718                 return log_error_errno(errno, "Failed to allocate loop device: %m");
2719
2720         if (asprintf(&loopdev, "/dev/loop%i", nr) < 0)
2721                 return log_oom();
2722
2723         loop = open(loopdev, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
2724         if (loop < 0)
2725                 return log_error_errno(errno, "Failed to open loop device %s: %m", loopdev);
2726
2727         if (ioctl(loop, LOOP_SET_FD, fd) < 0)
2728                 return log_error_errno(errno, "Failed to set loopback file descriptor on %s: %m", loopdev);
2729
2730         if (arg_read_only)
2731                 info.lo_flags |= LO_FLAGS_READ_ONLY;
2732
2733         if (ioctl(loop, LOOP_SET_STATUS64, &info) < 0)
2734                 return log_error_errno(errno, "Failed to set loopback settings on %s: %m", loopdev);
2735
2736         *device_path = loopdev;
2737         loopdev = NULL;
2738
2739         *loop_nr = nr;
2740
2741         r = loop;
2742         loop = -1;
2743
2744         return r;
2745 }
2746
/* Explanatory text appended to the error messages of dissect_image() when no
 * usable root partition can be determined. */
#define PARTITION_TABLE_BLURB \
        "Note that the disk image needs to either contain only a single MBR partition of\n" \
        "type 0x83 that is marked bootable, or a single GPT partition of type " \
        "0FC63DAF-8483-4772-8E79-3D69D8477DE4 or follow\n" \
        "    http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/\n" \
        "to be bootable with systemd-nspawn."
2753
/* Inspects the partition table of the block device open on fd and picks the
 * partitions to use for the container.
 *
 * On GPT disks, partitions are selected by type UUID (home, srv, native root,
 * secondary root, generic Linux); partitions flagged GPT_FLAG_NO_AUTO are
 * ignored, and among several partitions of the same specific type the one
 * with the lowest partition number wins. On MBR disks only partitions whose
 * flags equal 0x80 (bootable) and whose type is 0x83 are considered, and they
 * are treated as generic Linux partitions.
 *
 * The root is chosen in this order: native root, secondary root, then the
 * generic partition, which is accepted only if it is the sole candidate.
 *
 * Outputs on success:
 *   *root_device / *root_device_rw  device node of the root and whether it
 *                                   may be mounted writable
 *   *home_device / *home_device_rw  only written if a home partition exists
 *   *srv_device  / *srv_device_rw   only written if a srv partition exists
 *   *secondary                      true if the secondary root was chosen
 * The device strings are newly allocated and owned by the caller.
 *
 * Returns 0 on success or a negative errno-style value after logging; in
 * particular -EINVAL if no partition table or no unambiguous root partition
 * is found, and -ENOTSUP when built without blkid support. */
static int dissect_image(
                int fd,
                char **root_device, bool *root_device_rw,
                char **home_device, bool *home_device_rw,
                char **srv_device, bool *srv_device_rw,
                bool *secondary) {

#ifdef HAVE_BLKID
        int home_nr = -1, srv_nr = -1;
#ifdef GPT_ROOT_NATIVE
        int root_nr = -1;
#endif
#ifdef GPT_ROOT_SECONDARY
        int secondary_root_nr = -1;
#endif
        _cleanup_free_ char *home = NULL, *root = NULL, *secondary_root = NULL, *srv = NULL, *generic = NULL;
        _cleanup_udev_enumerate_unref_ struct udev_enumerate *e = NULL;
        _cleanup_udev_device_unref_ struct udev_device *d = NULL;
        _cleanup_blkid_free_probe_ blkid_probe b = NULL;
        _cleanup_udev_unref_ struct udev *udev = NULL;
        struct udev_list_entry *first, *item;
        bool home_rw = true, root_rw = true, secondary_root_rw = true, srv_rw = true, generic_rw = true;
        bool is_gpt, is_mbr, multiple_generic = false;
        const char *pttype = NULL;
        blkid_partlist pl;
        struct stat st;
        unsigned i;
        int r;

        assert(fd >= 0);
        assert(root_device);
        assert(home_device);
        assert(srv_device);
        assert(secondary);
        assert(arg_image);

        b = blkid_new_probe();
        if (!b)
                return log_oom();

        /* errno is cleared first so that a failure which leaves it untouched
         * can be told apart and reported as out-of-memory. */
        errno = 0;
        r = blkid_probe_set_device(b, fd, 0, 0);
        if (r != 0) {
                if (errno == 0)
                        return log_oom();

                log_error_errno(errno, "Failed to set device on blkid probe: %m");
                return -errno;
        }

        blkid_probe_enable_partitions(b, 1);
        blkid_probe_set_partitions_flags(b, BLKID_PARTS_ENTRY_DETAILS);

        errno = 0;
        r = blkid_do_safeprobe(b);
        if (r == -2 || r == 1) {
                log_error("Failed to identify any partition table on\n"
                          "    %s\n"
                          PARTITION_TABLE_BLURB, arg_image);
                return -EINVAL;
        } else if (r != 0) {
                if (errno == 0)
                        errno = EIO;
                log_error_errno(errno, "Failed to probe: %m");
                return -errno;
        }

        blkid_probe_lookup_value(b, "PTTYPE", &pttype, NULL);

        is_gpt = streq_ptr(pttype, "gpt");
        is_mbr = streq_ptr(pttype, "dos");

        if (!is_gpt && !is_mbr) {
                log_error("No GPT or MBR partition table discovered on\n"
                          "    %s\n"
                          PARTITION_TABLE_BLURB, arg_image);
                return -EINVAL;
        }

        errno = 0;
        pl = blkid_probe_get_partitions(b);
        if (!pl) {
                if (errno == 0)
                        return log_oom();

                log_error("Failed to list partitions of %s", arg_image);
                return -errno;
        }

        udev = udev_new();
        if (!udev)
                return log_oom();

        if (fstat(fd, &st) < 0)
                return log_error_errno(errno, "Failed to stat block device: %m");

        d = udev_device_new_from_devnum(udev, 'b', st.st_rdev);
        if (!d)
                return log_oom();

        /* Wait until the kernel has created a device for every partition
         * blkid found. At most 10 rounds are attempted. */
        for (i = 0;; i++) {
                int n, m;

                if (i >= 10) {
                        log_error("Kernel partitions never appeared.");
                        return -ENXIO;
                }

                e = udev_enumerate_new(udev);
                if (!e)
                        return log_oom();

                r = udev_enumerate_add_match_parent(e, d);
                if (r < 0)
                        return log_oom();

                r = udev_enumerate_scan_devices(e);
                if (r < 0)
                        return log_error_errno(r, "Failed to scan for partition devices of %s: %m", arg_image);

                /* Count the partitions enumerated by the kernel */
                n = 0;
                first = udev_enumerate_get_list_entry(e);
                udev_list_entry_foreach(item, first)
                        n++;

                /* Count the partitions enumerated by blkid. The udev list is
                 * expected to hold one entry more than blkid reports; the
                 * loop further down skips the entry for the whole device. */
                m = blkid_partlist_numof_partitions(pl);
                if (n == m + 1)
                        break;
                if (n > m + 1) {
                        log_error("blkid and kernel partition list do not match.");
                        return -EIO;
                }
                if (n < m + 1) {
                        unsigned j;

                        /* The kernel has probed fewer partitions than
                         * blkid? Maybe the kernel prober is still
                         * running or it got EBUSY because udev
                         * already opened the device. Let's reprobe
                         * the device, which is a synchronous call
                         * that waits until probing is complete. */

                        for (j = 0; j < 20; j++) {

                                r = ioctl(fd, BLKRRPART, 0);
                                if (r < 0)
                                        r = -errno;
                                if (r >= 0 || r != -EBUSY)
                                        break;

                                /* If something else has the device
                                 * open, such as an udev rule, the
                                 * ioctl will return EBUSY. Since
                                 * there's no way to wait until it
                                 * isn't busy anymore, let's just wait
                                 * a bit, and try again.
                                 *
                                 * This is really something they
                                 * should fix in the kernel! */

                                usleep(50 * USEC_PER_MSEC);
                        }

                        if (r < 0)
                                return log_error_errno(r, "Failed to reread partition table: %m");
                }

                e = udev_enumerate_unref(e);
        }

        first = udev_enumerate_get_list_entry(e);
        udev_list_entry_foreach(item, first) {
                _cleanup_udev_device_unref_ struct udev_device *q = NULL;
                const char *node;
                unsigned long long flags;
                blkid_partition pp;
                dev_t qn;
                int nr;

                errno = 0;
                q = udev_device_new_from_syspath(udev, udev_list_entry_get_name(item));
                if (!q) {
                        if (!errno)
                                errno = ENOMEM;

                        log_error_errno(errno, "Failed to get partition device of %s: %m", arg_image);
                        return -errno;
                }

                qn = udev_device_get_devnum(q);
                if (major(qn) == 0)
                        continue;

                /* Skip the whole-disk device itself */
                if (st.st_rdev == qn)
                        continue;

                node = udev_device_get_devnode(q);
                if (!node)
                        continue;

                pp = blkid_partlist_devno_to_partition(pl, qn);
                if (!pp)
                        continue;

                flags = blkid_partition_get_flags(pp);

                nr = blkid_partition_get_partno(pp);
                if (nr < 0)
                        continue;

                if (is_gpt) {
                        sd_id128_t type_id;
                        const char *stype;

                        if (flags & GPT_FLAG_NO_AUTO)
                                continue;

                        stype = blkid_partition_get_type_string(pp);
                        if (!stype)
                                continue;

                        if (sd_id128_from_string(stype, &type_id) < 0)
                                continue;

                        if (sd_id128_equal(type_id, GPT_HOME)) {

                                /* Keep the candidate with the lowest
                                 * partition number */
                                if (home && nr >= home_nr)
                                        continue;

                                home_nr = nr;
                                home_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&home, node);
                                if (r < 0)
                                        return log_oom();

                        } else if (sd_id128_equal(type_id, GPT_SRV)) {

                                if (srv && nr >= srv_nr)
                                        continue;

                                srv_nr = nr;
                                srv_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&srv, node);
                                if (r < 0)
                                        return log_oom();
                        }
#ifdef GPT_ROOT_NATIVE
                        else if (sd_id128_equal(type_id, GPT_ROOT_NATIVE)) {

                                if (root && nr >= root_nr)
                                        continue;

                                root_nr = nr;
                                root_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&root, node);
                                if (r < 0)
                                        return log_oom();
                        }
#endif
#ifdef GPT_ROOT_SECONDARY
                        else if (sd_id128_equal(type_id, GPT_ROOT_SECONDARY)) {

                                if (secondary_root && nr >= secondary_root_nr)
                                        continue;

                                secondary_root_nr = nr;
                                secondary_root_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&secondary_root, node);
                                if (r < 0)
                                        return log_oom();
                        }
#endif
                        else if (sd_id128_equal(type_id, GPT_LINUX_GENERIC)) {

                                /* Only the first generic partition is
                                 * remembered; further ones just mark the
                                 * image as ambiguous. */
                                if (generic)
                                        multiple_generic = true;
                                else {
                                        generic_rw = !(flags & GPT_FLAG_READ_ONLY);

                                        r = free_and_strdup(&generic, node);
                                        if (r < 0)
                                                return log_oom();
                                }
                        }

                } else if (is_mbr) {
                        int type;

                        if (flags != 0x80) /* Bootable flag */
                                continue;

                        type = blkid_partition_get_type(pp);
                        if (type != 0x83) /* Linux partition */
                                continue;

                        /* MBR has no dedicated root type, so treat the
                         * partition as a generic one. It has to be stored
                         * in "generic" (not "root"), otherwise a second
                         * bootable Linux partition would go undetected
                         * and silently replace the first. */
                        if (generic)
                                multiple_generic = true;
                        else {
                                generic_rw = true;

                                r = free_and_strdup(&generic, node);
                                if (r < 0)
                                        return log_oom();
                        }
                }
        }

        if (root) {
                *root_device = root;
                root = NULL;

                *root_device_rw = root_rw;
                *secondary = false;
        } else if (secondary_root) {
                *root_device = secondary_root;
                secondary_root = NULL;

                *root_device_rw = secondary_root_rw;
                *secondary = true;
        } else if (generic) {

                /* There were no partitions with precise meanings
                 * around, but we found generic partitions. In this
                 * case, if there's only one, we can go ahead and boot
                 * it, otherwise we bail out, because we really cannot
                 * make any sense of it. */

                if (multiple_generic) {
                        log_error("Identified multiple bootable Linux partitions on\n"
                                  "    %s\n"
                                  PARTITION_TABLE_BLURB, arg_image);
                        return -EINVAL;
                }

                *root_device = generic;
                generic = NULL;

                *root_device_rw = generic_rw;
                *secondary = false;
        } else {
                log_error("Failed to identify root partition in disk image\n"
                          "    %s\n"
                          PARTITION_TABLE_BLURB, arg_image);
                return -EINVAL;
        }

        /* Ownership of the strings moves to the caller; clearing the local
         * pointers keeps the cleanup handlers from freeing them. */
        if (home) {
                *home_device = home;
                home = NULL;

                *home_device_rw = home_rw;
        }

        if (srv) {
                *srv_device = srv;
                srv = NULL;

                *srv_device_rw = srv_rw;
        }

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -ENOTSUP;
#endif
}
3126
/* Probes the file system type of the block device 'what' with libblkid and
 * mounts it with MS_NODEV on 'where' (or on 'where' concatenated with
 * 'directory' if that is non-NULL). The mount is read-only unless 'rw' is
 * true and arg_read_only is unset. LUKS volumes are refused.
 *
 * Returns 0 on success and a negative errno-style value on failure. Without
 * blkid support this always fails with -ENOTSUP. */
static int mount_device(const char *what, const char *where, const char *directory, bool rw) {
#ifdef HAVE_BLKID
        _cleanup_blkid_free_probe_ blkid_probe b = NULL;
        const char *fstype, *p;
        int r;

        assert(what);
        assert(where);

        /* A global read-only request overrides the per-partition flag */
        if (arg_read_only)
                rw = false;

        if (directory)
                p = strjoina(where, directory);
        else
                p = where;

        /* libblkid does not reliably set errno, hence clear it first and
         * treat "still 0" as an allocation failure */
        errno = 0;
        b = blkid_new_probe_from_filename(what);
        if (!b) {
                if (errno == 0)
                        return log_oom();
                return log_error_errno(errno, "Failed to allocate prober for %s: %m", what);
        }

        blkid_probe_enable_superblocks(b, 1);
        blkid_probe_set_superblocks_flags(b, BLKID_SUBLKS_TYPE);

        errno = 0;
        r = blkid_do_safeprobe(b);
        if (r == -2 || r == 1) {
                /* 1: nothing was detected, -2: the result is ambivalent. In
                 * both cases there is no usable file system type. */
                log_error("Cannot determine file system type of %s", what);
                return -EINVAL;
        } else if (r != 0) {
                /* -1 (or anything else unexpected): a real probing error */
                if (errno == 0)
                        errno = EIO;
                return log_error_errno(errno, "Failed to probe %s: %m", what);
        }

        errno = 0;
        if (blkid_probe_lookup_value(b, "TYPE", &fstype, NULL) < 0) {
                if (errno == 0)
                        errno = EINVAL;
                log_error("Failed to determine file system type of %s", what);
                return -errno;
        }

        if (streq(fstype, "crypto_LUKS")) {
                log_error("nspawn currently does not support LUKS disk images.");
                return -ENOTSUP;
        }

        if (mount(what, p, fstype, MS_NODEV|(rw ? 0 : MS_RDONLY), NULL) < 0)
                return log_error_errno(errno, "Failed to mount %s: %m", what);

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -ENOTSUP;
#endif
}
3190
3191 static int mount_devices(
3192                 const char *where,
3193                 const char *root_device, bool root_device_rw,
3194                 const char *home_device, bool home_device_rw,
3195                 const char *srv_device, bool srv_device_rw) {
3196         int r;
3197
3198         assert(where);
3199
3200         if (root_device) {
3201                 r = mount_device(root_device, arg_directory, NULL, root_device_rw);
3202                 if (r < 0)
3203                         return log_error_errno(r, "Failed to mount root directory: %m");
3204         }
3205
3206         if (home_device) {
3207                 r = mount_device(home_device, arg_directory, "/home", home_device_rw);
3208                 if (r < 0)
3209                         return log_error_errno(r, "Failed to mount home directory: %m");
3210         }
3211
3212         if (srv_device) {
3213                 r = mount_device(srv_device, arg_directory, "/srv", srv_device_rw);
3214                 if (r < 0)
3215                         return log_error_errno(r, "Failed to mount server data directory: %m");
3216         }
3217
3218         return 0;
3219 }
3220
3221 static void loop_remove(int nr, int *image_fd) {
3222         _cleanup_close_ int control = -1;
3223         int r;
3224
3225         if (nr < 0)
3226                 return;
3227
3228         if (image_fd && *image_fd >= 0) {
3229                 r = ioctl(*image_fd, LOOP_CLR_FD);
3230                 if (r < 0)
3231                         log_debug_errno(errno, "Failed to close loop image: %m");
3232                 *image_fd = safe_close(*image_fd);
3233         }
3234
3235         control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
3236         if (control < 0) {
3237                 log_warning_errno(errno, "Failed to open /dev/loop-control: %m");
3238                 return;
3239         }
3240
3241         r = ioctl(control, LOOP_CTL_REMOVE, nr);
3242         if (r < 0)
3243                 log_debug_errno(errno, "Failed to remove loop %d: %m", nr);
3244 }
3245
3246 static int spawn_getent(const char *database, const char *key, pid_t *rpid) {
3247         int pipe_fds[2];
3248         pid_t pid;
3249
3250         assert(database);
3251         assert(key);
3252         assert(rpid);
3253
3254         if (pipe2(pipe_fds, O_CLOEXEC) < 0)
3255                 return log_error_errno(errno, "Failed to allocate pipe: %m");
3256
3257         pid = fork();
3258         if (pid < 0)
3259                 return log_error_errno(errno, "Failed to fork getent child: %m");
3260         else if (pid == 0) {
3261                 int nullfd;
3262                 char *empty_env = NULL;
3263
3264                 if (dup3(pipe_fds[1], STDOUT_FILENO, 0) < 0)
3265                         _exit(EXIT_FAILURE);
3266
3267                 if (pipe_fds[0] > 2)
3268                         safe_close(pipe_fds[0]);
3269                 if (pipe_fds[1] > 2)
3270                         safe_close(pipe_fds[1]);
3271
3272                 nullfd = open("/dev/null", O_RDWR);
3273                 if (nullfd < 0)
3274                         _exit(EXIT_FAILURE);
3275
3276                 if (dup3(nullfd, STDIN_FILENO, 0) < 0)
3277                         _exit(EXIT_FAILURE);
3278
3279                 if (dup3(nullfd, STDERR_FILENO, 0) < 0)
3280                         _exit(EXIT_FAILURE);
3281
3282                 if (nullfd > 2)
3283                         safe_close(nullfd);
3284
3285                 reset_all_signal_handlers();
3286                 close_all_fds(NULL, 0);
3287
3288                 execle("/usr/bin/getent", "getent", database, key, NULL, &empty_env);
3289                 execle("/bin/getent", "getent", database, key, NULL, &empty_env);
3290                 _exit(EXIT_FAILURE);
3291         }
3292
3293         pipe_fds[1] = safe_close(pipe_fds[1]);
3294
3295         *rpid = pid;
3296
3297         return pipe_fds[0];
3298 }
3299
3300 static int change_uid_gid(char **_home) {
3301         char line[LINE_MAX], *x, *u, *g, *h;
3302         const char *word, *state;
3303         _cleanup_free_ uid_t *uids = NULL;
3304         _cleanup_free_ char *home = NULL;
3305         _cleanup_fclose_ FILE *f = NULL;
3306         _cleanup_close_ int fd = -1;
3307         unsigned n_uids = 0;
3308         size_t sz = 0, l;
3309         uid_t uid;
3310         gid_t gid;
3311         pid_t pid;
3312         int r;
3313
3314         assert(_home);
3315
3316         if (!arg_user || streq(arg_user, "root") || streq(arg_user, "0")) {
3317                 /* Reset everything fully to 0, just in case */
3318
3319                 if (setgroups(0, NULL) < 0)
3320                         return log_error_errno(errno, "setgroups() failed: %m");
3321
3322                 if (setresgid(0, 0, 0) < 0)
3323                         return log_error_errno(errno, "setregid() failed: %m");
3324
3325                 if (setresuid(0, 0, 0) < 0)
3326                         return log_error_errno(errno, "setreuid() failed: %m");
3327
3328                 *_home = NULL;
3329                 return 0;
3330         }
3331
3332         /* First, get user credentials */
3333         fd = spawn_getent("passwd", arg_user, &pid);
3334         if (fd < 0)
3335                 return fd;
3336
3337         f = fdopen(fd, "r");
3338         if (!f)
3339                 return log_oom();
3340         fd = -1;
3341
3342         if (!fgets(line, sizeof(line), f)) {
3343
3344                 if (!ferror(f)) {
3345                         log_error("Failed to resolve user %s.", arg_user);
3346                         return -ESRCH;
3347                 }
3348
3349                 log_error_errno(errno, "Failed to read from getent: %m");
3350                 return -errno;
3351         }
3352
3353         truncate_nl(line);
3354
3355         wait_for_terminate_and_warn("getent passwd", pid, true);
3356
3357         x = strchr(line, ':');
3358         if (!x) {
3359                 log_error("/etc/passwd entry has invalid user field.");
3360                 return -EIO;
3361         }
3362
3363         u = strchr(x+1, ':');
3364         if (!u) {
3365                 log_error("/etc/passwd entry has invalid password field.");
3366                 return -EIO;
3367         }
3368
3369         u++;
3370         g = strchr(u, ':');
3371         if (!g) {
3372                 log_error("/etc/passwd entry has invalid UID field.");
3373                 return -EIO;
3374         }
3375
3376         *g = 0;
3377         g++;
3378         x = strchr(g, ':');
3379         if (!x) {
3380                 log_error("/etc/passwd entry has invalid GID field.");
3381                 return -EIO;
3382         }
3383
3384         *x = 0;
3385         h = strchr(x+1, ':');
3386         if (!h) {
3387                 log_error("/etc/passwd entry has invalid GECOS field.");
3388                 return -EIO;
3389         }
3390
3391         h++;
3392         x = strchr(h, ':');
3393         if (!x) {
3394                 log_error("/etc/passwd entry has invalid home directory field.");
3395                 return -EIO;
3396         }
3397
3398         *x = 0;
3399
3400         r = parse_uid(u, &uid);
3401         if (r < 0) {
3402                 log_error("Failed to parse UID of user.");
3403                 return -EIO;
3404         }
3405
3406         r = parse_gid(g, &gid);
3407         if (r < 0) {
3408                 log_error("Failed to parse GID of user.");
3409                 return -EIO;
3410         }
3411
3412         home = strdup(h);
3413         if (!home)
3414                 return log_oom();
3415
3416         /* Second, get group memberships */
3417         fd = spawn_getent("initgroups", arg_user, &pid);
3418         if (fd < 0)
3419                 return fd;
3420
3421         fclose(f);
3422         f = fdopen(fd, "r");
3423         if (!f)
3424                 return log_oom();
3425         fd = -1;
3426
3427         if (!fgets(line, sizeof(line), f)) {
3428                 if (!ferror(f)) {
3429                         log_error("Failed to resolve user %s.", arg_user);
3430                         return -ESRCH;
3431                 }
3432
3433                 log_error_errno(errno, "Failed to read from getent: %m");
3434                 return -errno;
3435         }
3436
3437         truncate_nl(line);
3438
3439         wait_for_terminate_and_warn("getent initgroups", pid, true);
3440
3441         /* Skip over the username and subsequent separator whitespace */
3442         x = line;
3443         x += strcspn(x, WHITESPACE);
3444         x += strspn(x, WHITESPACE);
3445
3446         FOREACH_WORD(word, l, x, state) {
3447                 char c[l+1];
3448
3449                 memcpy(c, word, l);
3450                 c[l] = 0;
3451
3452                 if (!GREEDY_REALLOC(uids, sz, n_uids+1))
3453                         return log_oom();
3454
3455                 r = parse_uid(c, &uids[n_uids++]);
3456                 if (r < 0) {
3457                         log_error("Failed to parse group data from getent.");
3458                         return -EIO;
3459                 }
3460         }
3461
3462         r = mkdir_parents(home, 0775);
3463         if (r < 0)
3464                 return log_error_errno(r, "Failed to make home root directory: %m");
3465
3466         r = mkdir_safe(home, 0755, uid, gid);
3467         if (r < 0 && r != -EEXIST)
3468                 return log_error_errno(r, "Failed to make home directory: %m");
3469
3470         fchown(STDIN_FILENO, uid, gid);
3471         fchown(STDOUT_FILENO, uid, gid);
3472         fchown(STDERR_FILENO, uid, gid);
3473
3474         if (setgroups(n_uids, uids) < 0)
3475                 return log_error_errno(errno, "Failed to set auxiliary groups: %m");
3476
3477         if (setresgid(gid, gid, gid) < 0)
3478                 return log_error_errno(errno, "setregid() failed: %m");
3479
3480         if (setresuid(uid, uid, uid) < 0)
3481                 return log_error_errno(errno, "setreuid() failed: %m");
3482
3483         if (_home) {
3484                 *_home = home;
3485                 home = NULL;
3486         }
3487
3488         return 0;
3489 }
3490
3491 /*
3492  * Return values:
3493  * < 0 : wait_for_terminate() failed to get the state of the
3494  *       container, the container was terminated by a signal, or
3495  *       failed for an unknown reason.  No change is made to the
3496  *       container argument.
3497  * > 0 : The program executed in the container terminated with an
3498  *       error.  The exit code of the program executed in the
3499  *       container is returned.  The container argument has been set
3500  *       to CONTAINER_TERMINATED.
3501  *   0 : The container is being rebooted, has been shut down or exited
3502  *       successfully.  The container argument has been set to either
3503  *       CONTAINER_TERMINATED or CONTAINER_REBOOTED.
3504  *
3505  * That is, success is indicated by a return value of zero, and an
3506  * error is indicated by a non-zero value.
3507  */
3508 static int wait_for_container(pid_t pid, ContainerStatus *container) {
3509         siginfo_t status;
3510         int r;
3511
3512         r = wait_for_terminate(pid, &status);
3513         if (r < 0)
3514                 return log_warning_errno(r, "Failed to wait for container: %m");
3515
3516         switch (status.si_code) {
3517
3518         case CLD_EXITED:
3519                 if (status.si_status == 0) {
3520                         log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s exited successfully.", arg_machine);
3521
3522                 } else
3523                         log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s failed with error code %i.", arg_machine, status.si_status);
3524
3525                 *container = CONTAINER_TERMINATED;
3526                 return status.si_status;
3527
3528         case CLD_KILLED:
3529                 if (status.si_status == SIGINT) {
3530
3531                         log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s has been shut down.", arg_machine);
3532                         *container = CONTAINER_TERMINATED;
3533                         return 0;
3534
3535                 } else if (status.si_status == SIGHUP) {
3536
3537                         log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s is being rebooted.", arg_machine);
3538                         *container = CONTAINER_REBOOTED;
3539                         return 0;
3540                 }
3541
3542                 /* CLD_KILLED fallthrough */
3543
3544         case CLD_DUMPED:
3545                 log_error("Container %s terminated by signal %s.", arg_machine, signal_to_string(status.si_status));
3546                 return -EIO;
3547
3548         default:
3549                 log_error("Container %s failed due to unknown reason.", arg_machine);
3550                 return -EIO;
3551         }
3552
3553         return r;
3554 }
3555
/* Signal handler that intentionally does nothing. main() installs it for
 * SIGCHLD so that the signal interrupts blocking calls in the parent. */
static void nop_handler(int sig) {
}
3557
3558 static int on_orderly_shutdown(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) {
3559         pid_t pid;
3560
3561         pid = PTR_TO_UINT32(userdata);
3562         if (pid > 0) {
3563                 if (kill(pid, SIGRTMIN+3) >= 0) {
3564                         log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
3565                         sd_event_source_set_userdata(s, NULL);
3566                         return 0;
3567                 }
3568         }
3569
3570         sd_event_exit(sd_event_source_get_event(s), 0);
3571         return 0;
3572 }
3573
3574 static int determine_names(void) {
3575         int r;
3576
3577         if (!arg_image && !arg_directory) {
3578                 if (arg_machine) {
3579                         _cleanup_(image_unrefp) Image *i = NULL;
3580
3581                         r = image_find(arg_machine, &i);
3582                         if (r < 0)
3583                                 return log_error_errno(r, "Failed to find image for machine '%s': %m", arg_machine);
3584                         else if (r == 0) {
3585                                 log_error("No image for machine '%s': %m", arg_machine);
3586                                 return -ENOENT;
3587                         }
3588
3589                         if (i->type == IMAGE_RAW)
3590                                 r = set_sanitized_path(&arg_image, i->path);
3591                         else
3592                                 r = set_sanitized_path(&arg_directory, i->path);
3593                         if (r < 0)
3594                                 return log_error_errno(r, "Invalid image directory: %m");
3595
3596                         arg_read_only = arg_read_only || i->read_only;
3597                 } else
3598                         arg_directory = get_current_dir_name();
3599
3600                 if (!arg_directory && !arg_machine) {
3601                         log_error("Failed to determine path, please use -D or -i.");
3602                         return -EINVAL;
3603                 }
3604         }
3605
3606         if (!arg_machine) {
3607                 if (arg_directory && path_equal(arg_directory, "/"))
3608                         arg_machine = gethostname_malloc();
3609                 else
3610                         arg_machine = strdup(basename(arg_image ?: arg_directory));
3611
3612                 if (!arg_machine)
3613                         return log_oom();
3614
3615                 hostname_cleanup(arg_machine, false);
3616                 if (!machine_name_is_valid(arg_machine)) {
3617                         log_error("Failed to determine machine name automatically, please use -M.");
3618                         return -EINVAL;
3619                 }
3620
3621                 if (arg_ephemeral) {
3622                         char *b;
3623
3624                         /* Add a random suffix when this is an
3625                          * ephemeral machine, so that we can run many
3626                          * instances at once without manually having
3627                          * to specify -M each time. */
3628
3629                         if (asprintf(&b, "%s-%016" PRIx64, arg_machine, random_u64()) < 0)
3630                                 return log_oom();
3631
3632                         free(arg_machine);
3633                         arg_machine = b;
3634                 }
3635         }
3636
3637         return 0;
3638 }
3639
3640 static int determine_uid_shift(void) {
3641         int r;
3642
3643         if (!arg_userns)
3644                 return 0;
3645
3646         if (arg_uid_shift == UID_INVALID) {
3647                 struct stat st;
3648
3649                 r = stat(arg_directory, &st);
3650                 if (r < 0)
3651                         return log_error_errno(errno, "Failed to determine UID base of %s: %m", arg_directory);
3652
3653                 arg_uid_shift = st.st_uid & UINT32_C(0xffff0000);
3654
3655                 if (arg_uid_shift != (st.st_gid & UINT32_C(0xffff0000))) {
3656                         log_error("UID and GID base of %s don't match.", arg_directory);
3657                         return -EINVAL;
3658                 }
3659
3660                 arg_uid_range = UINT32_C(0x10000);
3661         }
3662
3663         if (arg_uid_shift > (uid_t) -1 - arg_uid_range) {
3664                 log_error("UID base too high for UID range.");
3665                 return -EINVAL;
3666         }
3667
3668         log_info("Using user namespaces with base " UID_FMT " and range " UID_FMT ".", arg_uid_shift, arg_uid_range);
3669         return 0;
3670 }
3671
3672 int main(int argc, char *argv[]) {
3673
3674         _cleanup_free_ char *device_path = NULL, *root_device = NULL, *home_device = NULL, *srv_device = NULL, *console = NULL;
3675         bool root_device_rw = true, home_device_rw = true, srv_device_rw = true;
3676         _cleanup_close_ int master = -1, image_fd = -1;
3677         _cleanup_fdset_free_ FDSet *fds = NULL;
3678         int r, n_fd_passed, loop_nr = -1;
3679         char veth_name[IFNAMSIZ];
3680         bool secondary = false, remove_subvol = false;
3681         sigset_t mask, mask_chld;
3682         pid_t pid = 0;
3683         int ret = EXIT_SUCCESS;
3684         union in_addr_union exposed = {};
3685         _cleanup_release_lock_file_ LockFile tree_global_lock = LOCK_FILE_INIT, tree_local_lock = LOCK_FILE_INIT;
3686         bool interactive;
3687
3688         log_parse_environment();
3689         log_open();
3690
3691         r = parse_argv(argc, argv);
3692         if (r <= 0)
3693                 goto finish;
3694
3695         r = determine_names();
3696         if (r < 0)
3697                 goto finish;
3698
3699         if (geteuid() != 0) {
3700                 log_error("Need to be root.");
3701                 r = -EPERM;
3702                 goto finish;
3703         }
3704
3705         if (sd_booted() <= 0) {
3706                 log_error("Not running on a systemd system.");
3707                 r = -EINVAL;
3708                 goto finish;
3709         }
3710
3711         log_close();
3712         n_fd_passed = sd_listen_fds(false);
3713         if (n_fd_passed > 0) {
3714                 r = fdset_new_listen_fds(&fds, false);
3715                 if (r < 0) {
3716                         log_error_errno(r, "Failed to collect file descriptors: %m");
3717                         goto finish;
3718                 }
3719         }
3720         fdset_close_others(fds);
3721         log_open();
3722
3723         if (arg_directory) {
3724                 assert(!arg_image);
3725
3726                 if (path_equal(arg_directory, "/") && !arg_ephemeral) {
3727                         log_error("Spawning container on root directory is not supported. Consider using --ephemeral.");
3728                         r = -EINVAL;
3729                         goto finish;
3730                 }
3731
3732                 if (arg_ephemeral) {
3733                         char *np;
3734
3735                         /* If the specified path is a mount point we
3736                          * generate the new snapshot immediately
3737                          * inside it under a random name. However if
3738                          * the specified is not a mount point we
3739                          * create the new snapshot in the parent
3740                          * directory, just next to it. */
3741                         r = path_is_mount_point(arg_directory, false);
3742                         if (r < 0) {
3743                                 log_error_errno(r, "Failed to determine whether directory %s is mount point: %m", arg_directory);
3744                                 goto finish;
3745                         }
3746                         if (r > 0)
3747                                 r = tempfn_random_child(arg_directory, &np);
3748                         else
3749                                 r = tempfn_random(arg_directory, &np);
3750                         if (r < 0) {
3751                                 log_error_errno(r, "Failed to generate name for snapshot: %m");
3752                                 goto finish;
3753                         }
3754
3755                         r = image_path_lock(np, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3756                         if (r < 0) {
3757                                 log_error_errno(r, "Failed to lock %s: %m", np);
3758                                 goto finish;
3759                         }
3760
3761                         r = btrfs_subvol_snapshot(arg_directory, np, arg_read_only, true);
3762                         if (r < 0) {
3763                                 free(np);
3764                                 log_error_errno(r, "Failed to create snapshot %s from %s: %m", np, arg_directory);
3765                                 goto finish;
3766                         }
3767
3768                         free(arg_directory);
3769                         arg_directory = np;
3770
3771                         remove_subvol = true;
3772
3773                 } else {
3774                         r = image_path_lock(arg_directory, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3775                         if (r == -EBUSY) {
3776                                 log_error_errno(r, "Directory tree %s is currently busy.", arg_directory);
3777                                 goto finish;
3778                         }
3779                         if (r < 0) {
3780                                 log_error_errno(r, "Failed to lock %s: %m", arg_directory);
3781                                 return r;
3782                         }
3783
3784                         if (arg_template) {
3785                                 r = btrfs_subvol_snapshot(arg_template, arg_directory, arg_read_only, true);
3786                                 if (r == -EEXIST) {
3787                                         if (!arg_quiet)
3788                                                 log_info("Directory %s already exists, not populating from template %s.", arg_directory, arg_template);
3789                                 } else if (r < 0) {
3790                                         log_error_errno(r, "Couldn't create snapshot %s from %s: %m", arg_directory, arg_template);
3791                                         goto finish;
3792                                 } else {
3793                                         if (!arg_quiet)
3794                                                 log_info("Populated %s from template %s.", arg_directory, arg_template);
3795                                 }
3796                         }
3797                 }
3798
3799                 if (arg_boot) {
3800                         if (path_is_os_tree(arg_directory) <= 0) {
3801                                 log_error("Directory %s doesn't look like an OS root directory (os-release file is missing). Refusing.", arg_directory);
3802                                 r = -EINVAL;
3803                                 goto finish;
3804                         }
3805                 } else {
3806                         const char *p;
3807
3808                         p = strjoina(arg_directory,
3809                                        argc > optind && path_is_absolute(argv[optind]) ? argv[optind] : "/usr/bin/");
3810                         if (access(p, F_OK) < 0) {
3811                                 log_error("Directory %s lacks the binary to execute or doesn't look like a binary tree. Refusing.", arg_directory);
3812                                 r = -EINVAL;
3813                                 goto finish;
3814                         }
3815                 }
3816
3817         } else {
3818                 char template[] = "/tmp/nspawn-root-XXXXXX";
3819
3820                 assert(arg_image);
3821                 assert(!arg_template);
3822
3823                 r = image_path_lock(arg_image, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3824                 if (r == -EBUSY) {
3825                         r = log_error_errno(r, "Disk image %s is currently busy.", arg_image);
3826                         goto finish;
3827                 }
3828                 if (r < 0) {
3829                         r = log_error_errno(r, "Failed to create image lock: %m");
3830                         goto finish;
3831                 }
3832
3833                 if (!mkdtemp(template)) {
3834                         log_error_errno(errno, "Failed to create temporary directory: %m");
3835                         r = -errno;
3836                         goto finish;
3837                 }
3838
3839                 arg_directory = strdup(template);
3840                 if (!arg_directory) {
3841                         r = log_oom();
3842                         goto finish;
3843                 }
3844
3845                 image_fd = setup_image(&device_path, &loop_nr);
3846                 if (image_fd < 0) {
3847                         r = image_fd;
3848                         goto finish;
3849                 }
3850
3851                 r = dissect_image(image_fd,
3852                                   &root_device, &root_device_rw,
3853                                   &home_device, &home_device_rw,
3854                                   &srv_device, &srv_device_rw,
3855                                   &secondary);
3856                 if (r < 0)
3857                         goto finish;
3858         }
3859
3860         r = determine_uid_shift();
3861         if (r < 0)
3862                 goto finish;
3863
3864         interactive = isatty(STDIN_FILENO) > 0 && isatty(STDOUT_FILENO) > 0;
3865
3866         master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
3867         if (master < 0) {
3868                 r = log_error_errno(errno, "Failed to acquire pseudo tty: %m");
3869                 goto finish;
3870         }
3871
3872         r = ptsname_malloc(master, &console);
3873         if (r < 0) {
3874                 r = log_error_errno(r, "Failed to determine tty name: %m");
3875                 goto finish;
3876         }
3877
3878         if (unlockpt(master) < 0) {
3879                 r = log_error_errno(errno, "Failed to unlock tty: %m");
3880                 goto finish;
3881         }
3882
3883         if (!arg_quiet)
3884                 log_info("Spawning container %s on %s.\nPress ^] three times within 1s to kill container.",
3885                          arg_machine, arg_image ?: arg_directory);
3886
3887         assert_se(sigemptyset(&mask) == 0);
3888         sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
3889         assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
3890
3891         assert_se(sigemptyset(&mask_chld) == 0);
3892         assert_se(sigaddset(&mask_chld, SIGCHLD) == 0);
3893
3894         for (;;) {
3895                 _cleanup_close_pair_ int kmsg_socket_pair[2] = { -1, -1 }, rtnl_socket_pair[2] = { -1, -1 };
3896                 ContainerStatus container_status;
3897                 _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
3898                 struct sigaction sa = {
3899                         .sa_handler = nop_handler,
3900                         .sa_flags = SA_NOCLDSTOP,
3901                 };
3902
3903                 r = barrier_create(&barrier);
3904                 if (r < 0) {
3905                         log_error_errno(r, "Cannot initialize IPC barrier: %m");
3906                         goto finish;
3907                 }
3908
3909                 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
3910                         r = log_error_errno(errno, "Failed to create kmsg socket pair: %m");
3911                         goto finish;
3912                 }
3913
3914                 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, rtnl_socket_pair) < 0) {
3915                         r = log_error_errno(errno, "Failed to create rtnl socket pair: %m");
3916                         goto finish;
3917                 }
3918
3919                 /* Child can be killed before execv(), so handle SIGCHLD
3920                  * in order to interrupt parent's blocking calls and
3921                  * give it a chance to call wait() and terminate. */
3922                 r = sigprocmask(SIG_UNBLOCK, &mask_chld, NULL);
3923                 if (r < 0) {
3924                         r = log_error_errno(errno, "Failed to change the signal mask: %m");
3925                         goto finish;
3926                 }
3927
3928                 r = sigaction(SIGCHLD, &sa, NULL);
3929                 if (r < 0) {
3930                         r = log_error_errno(errno, "Failed to install SIGCHLD handler: %m");
3931                         goto finish;
3932                 }
3933
3934                 pid = raw_clone(SIGCHLD|CLONE_NEWNS|
3935                                 (arg_share_system ? 0 : CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS)|
3936                                 (arg_private_network ? CLONE_NEWNET : 0), NULL);
3937                 if (pid < 0) {
3938                         if (errno == EINVAL)
3939                                 r = log_error_errno(errno, "clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
3940                         else
3941                                 r = log_error_errno(errno, "clone() failed: %m");
3942
3943                         goto finish;
3944                 }
3945
3946                 if (pid == 0) {
3947                         /* child */
3948                         _cleanup_free_ char *home = NULL;
3949                         unsigned n_env = 2;
3950                         const char *envp[] = {
3951                                 "PATH=" DEFAULT_PATH_SPLIT_USR,
3952                                 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
3953                                 NULL, /* TERM */
3954                                 NULL, /* HOME */
3955                                 NULL, /* USER */
3956                                 NULL, /* LOGNAME */
3957                                 NULL, /* container_uuid */
3958                                 NULL, /* LISTEN_FDS */
3959                                 NULL, /* LISTEN_PID */
3960                                 NULL
3961                         };
3962                         char **env_use;
3963
3964                         barrier_set_role(&barrier, BARRIER_CHILD);
3965
3966                         envp[n_env] = strv_find_prefix(environ, "TERM=");
3967                         if (envp[n_env])
3968                                 n_env ++;
3969
3970                         master = safe_close(master);
3971
3972                         kmsg_socket_pair[0] = safe_close(kmsg_socket_pair[0]);
3973                         rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
3974
3975                         reset_all_signal_handlers();
3976                         reset_signal_mask();
3977
3978                         if (interactive) {
3979                                 close_nointr(STDIN_FILENO);
3980                                 close_nointr(STDOUT_FILENO);
3981                                 close_nointr(STDERR_FILENO);
3982
3983                                 r = open_terminal(console, O_RDWR);
3984                                 if (r != STDIN_FILENO) {
3985                                         if (r >= 0) {
3986                                                 safe_close(r);
3987                                                 r = -EINVAL;
3988                                         }
3989
3990                                         log_error_errno(r, "Failed to open console: %m");
3991                                         _exit(EXIT_FAILURE);
3992                                 }
3993
3994                                 if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
3995                                     dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO) {
3996                                         log_error_errno(errno, "Failed to duplicate console: %m");
3997                                         _exit(EXIT_FAILURE);
3998                                 }
3999                         }
4000
4001                         if (setsid() < 0) {
4002                                 log_error_errno(errno, "setsid() failed: %m");
4003                                 _exit(EXIT_FAILURE);
4004                         }
4005
4006                         if (reset_audit_loginuid() < 0)
4007                                 _exit(EXIT_FAILURE);
4008
4009                         if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
4010                                 log_error_errno(errno, "PR_SET_PDEATHSIG failed: %m");
4011                                 _exit(EXIT_FAILURE);
4012                         }
4013
4014                         if (arg_private_network)
4015                                 loopback_setup();
4016
4017                         /* Mark everything as slave, so that we still
4018                          * receive mounts from the real root, but don't
4019                          * propagate mounts to the real root. */
4020                         if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
4021                                 log_error_errno(errno, "MS_SLAVE|MS_REC failed: %m");
4022                                 _exit(EXIT_FAILURE);
4023                         }
4024
4025                         if (mount_devices(arg_directory,
4026                                           root_device, root_device_rw,
4027                                           home_device, home_device_rw,
4028                                           srv_device, srv_device_rw) < 0)
4029                                 _exit(EXIT_FAILURE);
4030
4031                         /* Turn directory into bind mount */
4032                         if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REC, NULL) < 0) {
4033                                 log_error_errno(errno, "Failed to make bind mount: %m");
4034                                 _exit(EXIT_FAILURE);
4035                         }
4036
4037                         r = setup_volatile(arg_directory);
4038                         if (r < 0)
4039                                 _exit(EXIT_FAILURE);
4040
4041                         if (setup_volatile_state(arg_directory) < 0)
4042                                 _exit(EXIT_FAILURE);
4043
4044                         r = base_filesystem_create(arg_directory);
4045                         if (r < 0)
4046                                 _exit(EXIT_FAILURE);
4047
4048                         if (arg_read_only) {
4049                                 r = bind_remount_recursive(arg_directory, true);
4050                                 if (r < 0) {
4051                                         log_error_errno(r, "Failed to make tree read-only: %m");
4052                                         _exit(EXIT_FAILURE);
4053                                 }
4054                         }
4055
4056                         if (mount_all(arg_directory) < 0)
4057                                 _exit(EXIT_FAILURE);
4058
4059                         if (copy_devnodes(arg_directory) < 0)
4060                                 _exit(EXIT_FAILURE);
4061
4062                         if (setup_ptmx(arg_directory) < 0)
4063                                 _exit(EXIT_FAILURE);
4064
4065                         dev_setup(arg_directory);
4066
4067                         if (setup_propagate(arg_directory) < 0)
4068                                 _exit(EXIT_FAILURE);
4069
4070                         if (setup_seccomp() < 0)
4071                                 _exit(EXIT_FAILURE);
4072
4073                         if (setup_dev_console(arg_directory, console) < 0)
4074                                 _exit(EXIT_FAILURE);
4075
4076                         if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
4077                                 _exit(EXIT_FAILURE);
4078                         kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
4079
4080                         if (send_rtnl(rtnl_socket_pair[1]) < 0)
4081                                 _exit(EXIT_FAILURE);
4082                         rtnl_socket_pair[1] = safe_close(rtnl_socket_pair[1]);
4083
4084                         /* Tell the parent that we are ready, and that
4085                          * it can cgroupify us so that we lack access
4086                          * to certain devices and resources. */
4087                         (void) barrier_place(&barrier); /* #1 */
4088
4089                         if (setup_boot_id(arg_directory) < 0)
4090                                 _exit(EXIT_FAILURE);
4091
4092                         if (setup_timezone(arg_directory) < 0)
4093                                 _exit(EXIT_FAILURE);
4094
4095                         if (setup_resolv_conf(arg_directory) < 0)
4096                                 _exit(EXIT_FAILURE);
4097
4098                         if (setup_journal(arg_directory) < 0)
4099                                 _exit(EXIT_FAILURE);
4100
4101                         if (mount_binds(arg_directory, arg_bind, false) < 0)
4102                                 _exit(EXIT_FAILURE);
4103
4104                         if (mount_binds(arg_directory, arg_bind_ro, true) < 0)
4105                                 _exit(EXIT_FAILURE);
4106
4107                         if (mount_tmpfs(arg_directory) < 0)
4108                                 _exit(EXIT_FAILURE);
4109
4110                         /* Wait until we are cgroup-ified, so that we
4111                          * can mount the right cgroup path writable */
4112                         (void) barrier_place_and_sync(&barrier); /* #2 */
4113
4114                         if (mount_cgroup(arg_directory) < 0)
4115                                 _exit(EXIT_FAILURE);
4116
4117                         if (chdir(arg_directory) < 0) {
4118                                 log_error_errno(errno, "chdir(%s) failed: %m", arg_directory);
4119                                 _exit(EXIT_FAILURE);
4120                         }
4121
4122                         if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) {
4123                                 log_error_errno(errno, "mount(MS_MOVE) failed: %m");
4124                                 _exit(EXIT_FAILURE);
4125                         }
4126
4127                         if (chroot(".") < 0) {
4128                                 log_error_errno(errno, "chroot() failed: %m");
4129                                 _exit(EXIT_FAILURE);
4130                         }
4131
4132                         if (chdir("/") < 0) {
4133                                 log_error_errno(errno, "chdir() failed: %m");
4134                                 _exit(EXIT_FAILURE);
4135                         }
4136
4137                         if (arg_userns) {
4138                                 if (unshare(CLONE_NEWUSER) < 0) {
4139                                         log_error_errno(errno, "unshare(CLONE_NEWUSER) failed: %m");
4140                                         _exit(EXIT_FAILURE);
4141                                 }
4142
4143                                 /* Tell the parent, that it now can
4144                                  * write the UID map. */
4145                                 (void) barrier_place(&barrier); /* #3 */
4146
4147                                 /* Wait until the parent wrote the UID
4148                                  * map */
4149                                 (void) barrier_place_and_sync(&barrier); /* #4 */
4150                         }
4151
4152                         umask(0022);
4153
4154                         if (drop_capabilities() < 0) {
4155                                 log_error_errno(errno, "drop_capabilities() failed: %m");
4156                                 _exit(EXIT_FAILURE);
4157                         }
4158
4159                         setup_hostname();
4160
4161                         if (arg_personality != 0xffffffffLU) {
4162                                 if (personality(arg_personality) < 0) {
4163                                         log_error_errno(errno, "personality() failed: %m");
4164                                         _exit(EXIT_FAILURE);
4165                                 }
4166                         } else if (secondary) {
4167                                 if (personality(PER_LINUX32) < 0) {
4168                                         log_error_errno(errno, "personality() failed: %m");
4169                                         _exit(EXIT_FAILURE);
4170                                 }
4171                         }
4172
4173 #ifdef HAVE_SELINUX
4174                         if (arg_selinux_context)
4175                                 if (setexeccon((security_context_t) arg_selinux_context) < 0) {
4176                                         log_error_errno(errno, "setexeccon(\"%s\") failed: %m", arg_selinux_context);
4177                                         _exit(EXIT_FAILURE);
4178                                 }
4179 #endif
4180
4181                         r = change_uid_gid(&home);
4182                         if (r < 0)
4183                                 _exit(EXIT_FAILURE);
4184
4185                         if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
4186                             (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
4187                             (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
4188                                 log_oom();
4189                                 _exit(EXIT_FAILURE);
4190                         }
4191
4192                         if (!sd_id128_equal(arg_uuid, SD_ID128_NULL)) {
4193                                 char as_uuid[37];
4194
4195                                 if (asprintf((char**)(envp + n_env++), "container_uuid=%s", id128_format_as_uuid(arg_uuid, as_uuid)) < 0) {
4196                                         log_oom();
4197                                         _exit(EXIT_FAILURE);
4198                                 }
4199                         }
4200
4201                         if (fdset_size(fds) > 0) {
4202                                 r = fdset_cloexec(fds, false);
4203                                 if (r < 0) {
4204                                         log_error_errno(r, "Failed to unset O_CLOEXEC for file descriptors.");
4205                                         _exit(EXIT_FAILURE);
4206                                 }
4207
4208                                 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", n_fd_passed) < 0) ||
4209                                     (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0)) {
4210                                         log_oom();
4211                                         _exit(EXIT_FAILURE);
4212                                 }
4213                         }
4214
4215                         if (!strv_isempty(arg_setenv)) {
4216                                 char **n;
4217
4218                                 n = strv_env_merge(2, envp, arg_setenv);
4219                                 if (!n) {
4220                                         log_oom();
4221                                         _exit(EXIT_FAILURE);
4222                                 }
4223
4224                                 env_use = n;
4225                         } else
4226                                 env_use = (char**) envp;
4227
4228                         /* Let the parent know that we are ready and
4229                          * wait until the parent is ready with the
4230                          * setup, too... */
4231                         (void) barrier_place_and_sync(&barrier); /* #5 */
4232
4233                         if (arg_boot) {
4234                                 char **a;
4235                                 size_t l;
4236
4237                                 /* Automatically search for the init system */
4238
4239                                 l = 1 + argc - optind;
4240                                 a = newa(char*, l + 1);
4241                                 memcpy(a + 1, argv + optind, l * sizeof(char*));
4242
4243                                 a[0] = (char*) "/usr/lib/systemd/systemd";
4244                                 execve(a[0], a, env_use);
4245
4246                                 a[0] = (char*) "/lib/systemd/systemd";
4247                                 execve(a[0], a, env_use);
4248
4249                                 a[0] = (char*) "/sbin/init";
4250                                 execve(a[0], a, env_use);
4251                         } else if (argc > optind)
4252                                 execvpe(argv[optind], argv + optind, env_use);
4253                         else {
4254                                 chdir(home ? home : "/root");
4255                                 execle("/bin/bash", "-bash", NULL, env_use);
4256                                 execle("/bin/sh", "-sh", NULL, env_use);
4257                         }
4258
4259                         log_error_errno(errno, "execv() failed: %m");
4260                         _exit(EXIT_FAILURE);
4261                 }
4262
4263                 barrier_set_role(&barrier, BARRIER_PARENT);
4264                 fdset_free(fds);
4265                 fds = NULL;
4266
4267                 kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
4268                 rtnl_socket_pair[1] = safe_close(rtnl_socket_pair[1]);
4269
4270                 (void) barrier_place(&barrier); /* #1 */
4271
4272                 /* Wait for the most basic Child-setup to be done,
4273                  * before we add hardware to it, and place it in a
4274                  * cgroup. */
4275                 if (barrier_sync(&barrier)) { /* #1 */
4276                         int ifi = 0;
4277
4278                         r = move_network_interfaces(pid);
4279                         if (r < 0)
4280                                 goto finish;
4281
4282                         r = setup_veth(pid, veth_name, &ifi);
4283                         if (r < 0)
4284                                 goto finish;
4285
4286                         r = setup_bridge(veth_name, &ifi);
4287                         if (r < 0)
4288                                 goto finish;
4289
4290                         r = setup_macvlan(pid);
4291                         if (r < 0)
4292                                 goto finish;
4293
4294                         r = setup_ipvlan(pid);
4295                         if (r < 0)
4296                                 goto finish;
4297
4298                         r = register_machine(pid, ifi);
4299                         if (r < 0)
4300                                 goto finish;
4301
4302                         /* Notify the child that the parent is ready with all
4303                          * its setup, and that the child can now hand over
4304                          * control to the code to run inside the container. */
4305                         (void) barrier_place(&barrier); /* #2 */
4306
4307                         if (arg_userns) {
4308                                 char uid_map[strlen("/proc//uid_map") + DECIMAL_STR_MAX(uid_t) + 1], line[DECIMAL_STR_MAX(uid_t)*3+3+1];
4309
4310                                 (void) barrier_place_and_sync(&barrier); /* #3 */
4311
4312                                 xsprintf(uid_map, "/proc/" PID_FMT "/uid_map", pid);
4313                                 xsprintf(line, UID_FMT " " UID_FMT " " UID_FMT "\n", 0, arg_uid_shift, arg_uid_range);
4314                                 r = write_string_file(uid_map, line);
4315                                 if (r < 0) {
4316                                         log_error_errno(r, "Failed to write UID map: %m");
4317                                         goto finish;
4318                                 }
4319
4320                                 /* We always assign the same UID and GID ranges */
4321                                 xsprintf(uid_map, "/proc/" PID_FMT "/gid_map", pid);
4322                                 r = write_string_file(uid_map, line);
4323                                 if (r < 0) {
4324                                         log_error_errno(r, "Failed to write GID map: %m");
4325                                         goto finish;
4326                                 }
4327
4328                                 (void) barrier_place(&barrier); /* #4 */
4329                         }
4330
4331                         /* Block SIGCHLD here, before notifying child.
4332                          * process_pty() will handle it with the other signals. */
4333                         r = sigprocmask(SIG_BLOCK, &mask_chld, NULL);
4334                         if (r < 0)
4335                                 goto finish;
4336
4337                         /* Reset signal to default */
4338                         r = default_signals(SIGCHLD, -1);
4339                         if (r < 0)
4340                                 goto finish;
4341
4342                         /* Let the child know that we are ready and wait until the child is completely ready. */
4343                         if (barrier_place_and_sync(&barrier)) { /* #5 */
4344                                 _cleanup_event_unref_ sd_event *event = NULL;
4345                                 _cleanup_(pty_forward_freep) PTYForward *forward = NULL;
4346                                 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
4347                                 char last_char = 0;
4348
4349                                 sd_notifyf(false,
4350                                            "READY=1\n"
4351                                            "STATUS=Container running.\n"
4352                                            "X_NSPAWN_LEADER_PID=" PID_FMT, pid);
4353
4354                                 r = sd_event_new(&event);
4355                                 if (r < 0) {
4356                                         log_error_errno(r, "Failed to get default event source: %m");
4357                                         goto finish;
4358                                 }
4359
4360                                 if (arg_boot) {
4361                                         /* Try to kill the init system on SIGINT or SIGTERM */
4362                                         sd_event_add_signal(event, NULL, SIGINT, on_orderly_shutdown, UINT32_TO_PTR(pid));
4363                                         sd_event_add_signal(event, NULL, SIGTERM, on_orderly_shutdown, UINT32_TO_PTR(pid));
4364                                 } else {
4365                                         /* Immediately exit */
4366                                         sd_event_add_signal(event, NULL, SIGINT, NULL, NULL);
4367                                         sd_event_add_signal(event, NULL, SIGTERM, NULL, NULL);
4368                                 }
4369
4370                                 /* simply exit on sigchld */
4371                                 sd_event_add_signal(event, NULL, SIGCHLD, NULL, NULL);
4372
4373                                 if (arg_expose_ports) {
4374                                         r = watch_rtnl(event, rtnl_socket_pair[0], &exposed, &rtnl);
4375                                         if (r < 0)
4376                                                 goto finish;
4377
4378                                         (void) expose_ports(rtnl, &exposed);
4379                                 }
4380
4381                                 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
4382
4383                                 r = pty_forward_new(event, master, true, !interactive, &forward);
4384                                 if (r < 0) {
4385                                         log_error_errno(r, "Failed to create PTY forwarder: %m");
4386                                         goto finish;
4387                                 }
4388
4389                                 r = sd_event_loop(event);
4390                                 if (r < 0) {
4391                                         log_error_errno(r, "Failed to run event loop: %m");
4392                                         goto finish;
4393                                 }
4394
4395                                 pty_forward_get_last_char(forward, &last_char);
4396
4397                                 forward = pty_forward_free(forward);
4398
4399                                 if (!arg_quiet && last_char != '\n')
4400                                         putc('\n', stdout);
4401
4402                                 /* Kill if it is not dead yet anyway */
4403                                 terminate_machine(pid);
4404                         }
4405                 }
4406
4407                 /* Normally redundant, but better safe than sorry */
4408                 kill(pid, SIGKILL);
4409
4410                 r = wait_for_container(pid, &container_status);
4411                 pid = 0;
4412
4413                 if (r < 0)
4414                         /* We failed to wait for the container, or the
4415                          * container exited abnormally */
4416                         goto finish;
4417                 else if (r > 0 || container_status == CONTAINER_TERMINATED){
4418                         /* The container exited with a non-zero
4419                          * status, or with zero status and no reboot
4420                          * was requested. */
4421                         ret = r;
4422                         break;
4423                 }
4424
4425                 /* CONTAINER_REBOOTED, loop again */
4426
4427                 if (arg_keep_unit) {
4428                         /* Special handling if we are running as a
4429                          * service: instead of simply restarting the
4430                          * machine we want to restart the entire
4431                          * service, so let's inform systemd about this
4432                          * with the special exit code 133. The service
4433                          * file uses RestartForceExitStatus=133 so
4434                          * that this results in a full nspawn
4435                          * restart. This is necessary since we might
4436                          * have cgroup parameters set we want to have
4437                          * flushed out. */
4438                         ret = 133;
4439                         r = 0;
4440                         break;
4441                 }
4442
4443                 flush_ports(&exposed);
4444         }
4445
4446 finish:
4447         sd_notify(false,
4448                   "STOPPING=1\n"
4449                   "STATUS=Terminating...");
4450
4451         loop_remove(loop_nr, &image_fd);
4452
4453         if (pid > 0)
4454                 kill(pid, SIGKILL);
4455
4456         if (remove_subvol && arg_directory) {
4457                 int k;
4458
4459                 k = btrfs_subvol_remove(arg_directory);
4460                 if (k < 0)
4461                         log_warning_errno(k, "Cannot remove subvolume '%s', ignoring: %m", arg_directory);
4462         }
4463
4464         if (arg_machine) {
4465                 const char *p;
4466
4467                 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
4468                 (void) rm_rf(p, false, true, false);
4469         }
4470
4471         free(arg_directory);
4472         free(arg_template);
4473         free(arg_image);
4474         free(arg_machine);
4475         free(arg_user);
4476         strv_free(arg_setenv);
4477         strv_free(arg_network_interfaces);
4478         strv_free(arg_network_macvlan);
4479         strv_free(arg_network_ipvlan);
4480         strv_free(arg_bind);
4481         strv_free(arg_bind_ro);
4482         strv_free(arg_tmpfs);
4483
4484         flush_ports(&exposed);
4485
4486         while (arg_expose_ports) {
4487                 ExposePort *p = arg_expose_ports;
4488                 LIST_REMOVE(ports, arg_expose_ports, p);
4489                 free(p);
4490         }
4491
4492         return r < 0 ? EXIT_FAILURE : ret;
4493 }