chiark / gitweb /
136933ec6cdb3d8c07f68da76b1c4c405d059c7a
[elogind.git] / src / nspawn / nspawn.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4   This file is part of systemd.
5
6   Copyright 2010 Lennart Poettering
7
8   systemd is free software; you can redistribute it and/or modify it
9   under the terms of the GNU Lesser General Public License as published by
10   the Free Software Foundation; either version 2.1 of the License, or
11   (at your option) any later version.
12
13   systemd is distributed in the hope that it will be useful, but
14   WITHOUT ANY WARRANTY; without even the implied warranty of
15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16   Lesser General Public License for more details.
17
18   You should have received a copy of the GNU Lesser General Public License
19   along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <signal.h>
23 #include <sched.h>
24 #include <unistd.h>
25 #include <sys/types.h>
26 #include <sys/mount.h>
27 #include <stdlib.h>
28 #include <string.h>
29 #include <stdio.h>
30 #include <errno.h>
31 #include <sys/prctl.h>
32 #include <getopt.h>
33 #include <grp.h>
34 #include <linux/fs.h>
35 #include <sys/socket.h>
36 #include <linux/netlink.h>
37 #include <net/if.h>
38 #include <linux/veth.h>
39 #include <sys/personality.h>
40 #include <linux/loop.h>
41 #include <sys/file.h>
42
43 #ifdef HAVE_SELINUX
44 #include <selinux/selinux.h>
45 #endif
46
47 #ifdef HAVE_SECCOMP
48 #include <seccomp.h>
49 #endif
50
51 #ifdef HAVE_BLKID
52 #include <blkid/blkid.h>
53 #endif
54
55 #include "sd-daemon.h"
56 #include "sd-bus.h"
57 #include "sd-id128.h"
58 #include "sd-rtnl.h"
59 #include "log.h"
60 #include "util.h"
61 #include "mkdir.h"
62 #include "macro.h"
63 #include "missing.h"
64 #include "cgroup-util.h"
65 #include "strv.h"
66 #include "path-util.h"
67 #include "loopback-setup.h"
68 #include "dev-setup.h"
69 #include "fdset.h"
70 #include "build.h"
71 #include "fileio.h"
72 #include "bus-util.h"
73 #include "bus-error.h"
74 #include "ptyfwd.h"
75 #include "env-util.h"
76 #include "rtnl-util.h"
77 #include "udev-util.h"
78 #include "blkid-util.h"
79 #include "gpt.h"
80 #include "siphash24.h"
81 #include "copy.h"
82 #include "base-filesystem.h"
83 #include "barrier.h"
84 #include "event-util.h"
85 #include "capability.h"
86 #include "cap-list.h"
87 #include "btrfs-util.h"
88 #include "machine-image.h"
89 #include "list.h"
90 #include "in-addr-util.h"
91 #include "fw-util.h"
92 #include "local-addresses.h"
93
94 #ifdef HAVE_SECCOMP
95 #include "seccomp-util.h"
96 #endif
97
98 typedef struct ExposePort {
99         int protocol;
100         uint16_t host_port;
101         uint16_t container_port;
102         LIST_FIELDS(struct ExposePort, ports);
103 } ExposePort;
104
/* How a container run ended. NOTE(review): the users of this type are
 * outside the visible part of the file; meanings are taken from the names. */
typedef enum ContainerStatus {
        CONTAINER_TERMINATED = 0,       /* presumably: container exited for good */
        CONTAINER_REBOOTED   = 1        /* presumably: container requested a reboot */
} ContainerStatus;
109
/* Journal linking mode selected with --link-journal= (or -j). The
 * "try-guest"/"try-host" variants use LINK_GUEST/LINK_HOST together with
 * the separate arg_link_journal_try flag. */
typedef enum LinkJournal {
        LINK_NO    = 0,         /* --link-journal=no */
        LINK_AUTO  = 1,         /* --link-journal=auto (the default) */
        LINK_HOST  = 2,         /* --link-journal=host / try-host */
        LINK_GUEST = 3          /* --link-journal=guest / try-guest, and -j */
} LinkJournal;
116
/* Volatile mode selected with --volatile[=MODE]. */
typedef enum Volatile {
        VOLATILE_NO    = 0,     /* default, or --volatile=<false boolean> */
        VOLATILE_YES   = 1,     /* --volatile without argument, or a true boolean */
        VOLATILE_STATE = 2      /* --volatile=state */
} Volatile;
122
123 static char *arg_directory = NULL;
124 static char *arg_template = NULL;
125 static char *arg_user = NULL;
126 static sd_id128_t arg_uuid = {};
127 static char *arg_machine = NULL;
128 static const char *arg_selinux_context = NULL;
129 static const char *arg_selinux_apifs_context = NULL;
130 static const char *arg_slice = NULL;
131 static bool arg_private_network = false;
132 static bool arg_read_only = false;
133 static bool arg_boot = false;
134 static bool arg_ephemeral = false;
135 static LinkJournal arg_link_journal = LINK_AUTO;
136 static bool arg_link_journal_try = false;
137 static uint64_t arg_retain =
138         (1ULL << CAP_CHOWN) |
139         (1ULL << CAP_DAC_OVERRIDE) |
140         (1ULL << CAP_DAC_READ_SEARCH) |
141         (1ULL << CAP_FOWNER) |
142         (1ULL << CAP_FSETID) |
143         (1ULL << CAP_IPC_OWNER) |
144         (1ULL << CAP_KILL) |
145         (1ULL << CAP_LEASE) |
146         (1ULL << CAP_LINUX_IMMUTABLE) |
147         (1ULL << CAP_NET_BIND_SERVICE) |
148         (1ULL << CAP_NET_BROADCAST) |
149         (1ULL << CAP_NET_RAW) |
150         (1ULL << CAP_SETGID) |
151         (1ULL << CAP_SETFCAP) |
152         (1ULL << CAP_SETPCAP) |
153         (1ULL << CAP_SETUID) |
154         (1ULL << CAP_SYS_ADMIN) |
155         (1ULL << CAP_SYS_CHROOT) |
156         (1ULL << CAP_SYS_NICE) |
157         (1ULL << CAP_SYS_PTRACE) |
158         (1ULL << CAP_SYS_TTY_CONFIG) |
159         (1ULL << CAP_SYS_RESOURCE) |
160         (1ULL << CAP_SYS_BOOT) |
161         (1ULL << CAP_AUDIT_WRITE) |
162         (1ULL << CAP_AUDIT_CONTROL) |
163         (1ULL << CAP_MKNOD);
164 static char **arg_bind = NULL;
165 static char **arg_bind_ro = NULL;
166 static char **arg_tmpfs = NULL;
167 static char **arg_setenv = NULL;
168 static bool arg_quiet = false;
169 static bool arg_share_system = false;
170 static bool arg_register = true;
171 static bool arg_keep_unit = false;
172 static char **arg_network_interfaces = NULL;
173 static char **arg_network_macvlan = NULL;
174 static char **arg_network_ipvlan = NULL;
175 static bool arg_network_veth = false;
176 static const char *arg_network_bridge = NULL;
177 static unsigned long arg_personality = 0xffffffffLU;
178 static char *arg_image = NULL;
179 static Volatile arg_volatile = VOLATILE_NO;
180 static ExposePort *arg_expose_ports = NULL;
181 static char **arg_property = NULL;
182 static uid_t arg_uid_shift = UID_INVALID, arg_uid_range = 0x10000U;
183 static bool arg_userns = false;
184 static int arg_kill_signal = 0;
185
186 static void help(void) {
187         printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
188                "Spawn a minimal namespace container for debugging, testing and building.\n\n"
189                "  -h --help                 Show this help\n"
190                "     --version              Print version string\n"
191                "  -q --quiet                Do not show status information\n"
192                "  -D --directory=PATH       Root directory for the container\n"
193                "     --template=PATH        Initialize root directory from template directory,\n"
194                "                            if missing\n"
195                "  -x --ephemeral            Run container with snapshot of root directory, and\n"
196                "                            remove it after exit\n"
197                "  -i --image=PATH           File system device or disk image for the container\n"
198                "  -b --boot                 Boot up full system (i.e. invoke init)\n"
199                "  -u --user=USER            Run the command under specified user or uid\n"
200                "  -M --machine=NAME         Set the machine name for the container\n"
201                "     --uuid=UUID            Set a specific machine UUID for the container\n"
202                "  -S --slice=SLICE          Place the container in the specified slice\n"
203                "     --property=NAME=VALUE  Set scope unit property\n"
204                "     --private-network      Disable network in container\n"
205                "     --network-interface=INTERFACE\n"
206                "                            Assign an existing network interface to the\n"
207                "                            container\n"
208                "     --network-macvlan=INTERFACE\n"
209                "                            Create a macvlan network interface based on an\n"
210                "                            existing network interface to the container\n"
211                "     --network-ipvlan=INTERFACE\n"
212                "                            Create a ipvlan network interface based on an\n"
213                "                            existing network interface to the container\n"
214                "  -n --network-veth         Add a virtual ethernet connection between host\n"
215                "                            and container\n"
216                "     --network-bridge=INTERFACE\n"
217                "                            Add a virtual ethernet connection between host\n"
218                "                            and container and add it to an existing bridge on\n"
219                "                            the host\n"
220                "     --private-users[=UIDBASE[:NUIDS]]\n"
221                "                            Run within user namespace\n"
222                "  -p --port=[PROTOCOL:]HOSTPORT[:CONTAINERPORT]\n"
223                "                            Expose a container IP port on the host\n"
224                "  -Z --selinux-context=SECLABEL\n"
225                "                            Set the SELinux security context to be used by\n"
226                "                            processes in the container\n"
227                "  -L --selinux-apifs-context=SECLABEL\n"
228                "                            Set the SELinux security context to be used by\n"
229                "                            API/tmpfs file systems in the container\n"
230                "     --capability=CAP       In addition to the default, retain specified\n"
231                "                            capability\n"
232                "     --drop-capability=CAP  Drop the specified capability from the default set\n"
233                "     --kill-signal=SIGNAL   Select signal to use for shutting down PID 1\n"
234                "     --link-journal=MODE    Link up guest journal, one of no, auto, guest, host,\n"
235                "                            try-guest, try-host\n"
236                "  -j                        Equivalent to --link-journal=try-guest\n"
237                "     --read-only            Mount the root directory read-only\n"
238                "     --bind=PATH[:PATH]     Bind mount a file or directory from the host into\n"
239                "                            the container\n"
240                "     --bind-ro=PATH[:PATH]  Similar, but creates a read-only bind mount\n"
241                "     --tmpfs=PATH:[OPTIONS] Mount an empty tmpfs to the specified directory\n"
242                "     --setenv=NAME=VALUE    Pass an environment variable to PID 1\n"
243                "     --share-system         Share system namespaces with host\n"
244                "     --register=BOOLEAN     Register container as machine\n"
245                "     --keep-unit            Do not register a scope for the machine, reuse\n"
246                "                            the service unit nspawn is running in\n"
247                "     --volatile[=MODE]      Run the system in volatile mode\n"
248                , program_invocation_short_name);
249 }
250
/* Store a cleaned-up, absolute version of 'path' in *b.
 *
 * The path is first resolved with canonicalize_file_name(). If that fails
 * with ENOENT, the path is instead made absolute relative to the current
 * working directory via path_make_absolute_cwd(). The result is passed
 * through path_kill_slashes() (presumably collapsing redundant slashes in
 * place — confirm in path-util) and replaces the previous value of *b,
 * which is freed.
 *
 * Returns 0 on success, -errno if canonicalization fails for any reason
 * other than ENOENT, and -ENOMEM if path_make_absolute_cwd() returns NULL.
 * On failure *b is left untouched. */
static int set_sanitized_path(char **b, const char *path) {
        char *resolved;

        assert(b);
        assert(path);

        resolved = canonicalize_file_name(path);

        /* Any failure other than "does not exist" is fatal */
        if (resolved == NULL && errno != ENOENT)
                return -errno;

        if (resolved == NULL) {
                /* Path does not exist: fall back to a purely lexical absolute path */
                resolved = path_make_absolute_cwd(path);
                if (resolved == NULL)
                        return -ENOMEM;
        }

        /* Drop the old value only once the replacement is known to be available */
        free(*b);
        *b = path_kill_slashes(resolved);

        return 0;
}
271
272 static int parse_argv(int argc, char *argv[]) {
273
274         enum {
275                 ARG_VERSION = 0x100,
276                 ARG_PRIVATE_NETWORK,
277                 ARG_UUID,
278                 ARG_READ_ONLY,
279                 ARG_CAPABILITY,
280                 ARG_DROP_CAPABILITY,
281                 ARG_LINK_JOURNAL,
282                 ARG_BIND,
283                 ARG_BIND_RO,
284                 ARG_TMPFS,
285                 ARG_SETENV,
286                 ARG_SHARE_SYSTEM,
287                 ARG_REGISTER,
288                 ARG_KEEP_UNIT,
289                 ARG_NETWORK_INTERFACE,
290                 ARG_NETWORK_MACVLAN,
291                 ARG_NETWORK_IPVLAN,
292                 ARG_NETWORK_BRIDGE,
293                 ARG_PERSONALITY,
294                 ARG_VOLATILE,
295                 ARG_TEMPLATE,
296                 ARG_PROPERTY,
297                 ARG_PRIVATE_USERS,
298                 ARG_KILL_SIGNAL,
299         };
300
301         static const struct option options[] = {
302                 { "help",                  no_argument,       NULL, 'h'                   },
303                 { "version",               no_argument,       NULL, ARG_VERSION           },
304                 { "directory",             required_argument, NULL, 'D'                   },
305                 { "template",              required_argument, NULL, ARG_TEMPLATE          },
306                 { "ephemeral",             no_argument,       NULL, 'x'                   },
307                 { "user",                  required_argument, NULL, 'u'                   },
308                 { "private-network",       no_argument,       NULL, ARG_PRIVATE_NETWORK   },
309                 { "boot",                  no_argument,       NULL, 'b'                   },
310                 { "uuid",                  required_argument, NULL, ARG_UUID              },
311                 { "read-only",             no_argument,       NULL, ARG_READ_ONLY         },
312                 { "capability",            required_argument, NULL, ARG_CAPABILITY        },
313                 { "drop-capability",       required_argument, NULL, ARG_DROP_CAPABILITY   },
314                 { "link-journal",          required_argument, NULL, ARG_LINK_JOURNAL      },
315                 { "bind",                  required_argument, NULL, ARG_BIND              },
316                 { "bind-ro",               required_argument, NULL, ARG_BIND_RO           },
317                 { "tmpfs",                 required_argument, NULL, ARG_TMPFS             },
318                 { "machine",               required_argument, NULL, 'M'                   },
319                 { "slice",                 required_argument, NULL, 'S'                   },
320                 { "setenv",                required_argument, NULL, ARG_SETENV            },
321                 { "selinux-context",       required_argument, NULL, 'Z'                   },
322                 { "selinux-apifs-context", required_argument, NULL, 'L'                   },
323                 { "quiet",                 no_argument,       NULL, 'q'                   },
324                 { "share-system",          no_argument,       NULL, ARG_SHARE_SYSTEM      },
325                 { "register",              required_argument, NULL, ARG_REGISTER          },
326                 { "keep-unit",             no_argument,       NULL, ARG_KEEP_UNIT         },
327                 { "network-interface",     required_argument, NULL, ARG_NETWORK_INTERFACE },
328                 { "network-macvlan",       required_argument, NULL, ARG_NETWORK_MACVLAN   },
329                 { "network-ipvlan",        required_argument, NULL, ARG_NETWORK_IPVLAN    },
330                 { "network-veth",          no_argument,       NULL, 'n'                   },
331                 { "network-bridge",        required_argument, NULL, ARG_NETWORK_BRIDGE    },
332                 { "personality",           required_argument, NULL, ARG_PERSONALITY       },
333                 { "image",                 required_argument, NULL, 'i'                   },
334                 { "volatile",              optional_argument, NULL, ARG_VOLATILE          },
335                 { "port",                  required_argument, NULL, 'p'                   },
336                 { "property",              required_argument, NULL, ARG_PROPERTY          },
337                 { "private-users",         optional_argument, NULL, ARG_PRIVATE_USERS     },
338                 { "kill-signal",           required_argument, NULL, ARG_KILL_SIGNAL       },
339                 {}
340         };
341
342         int c, r;
343         uint64_t plus = 0, minus = 0;
344
345         assert(argc >= 0);
346         assert(argv);
347
348         while ((c = getopt_long(argc, argv, "+hD:u:bL:M:jS:Z:qi:xp:n", options, NULL)) >= 0)
349
350                 switch (c) {
351
352                 case 'h':
353                         help();
354                         return 0;
355
356                 case ARG_VERSION:
357                         puts(PACKAGE_STRING);
358                         puts(SYSTEMD_FEATURES);
359                         return 0;
360
361                 case 'D':
362                         r = set_sanitized_path(&arg_directory, optarg);
363                         if (r < 0)
364                                 return log_error_errno(r, "Invalid root directory: %m");
365
366                         break;
367
368                 case ARG_TEMPLATE:
369                         r = set_sanitized_path(&arg_template, optarg);
370                         if (r < 0)
371                                 return log_error_errno(r, "Invalid template directory: %m");
372
373                         break;
374
375                 case 'i':
376                         r = set_sanitized_path(&arg_image, optarg);
377                         if (r < 0)
378                                 return log_error_errno(r, "Invalid image path: %m");
379
380                         break;
381
382                 case 'x':
383                         arg_ephemeral = true;
384                         break;
385
386                 case 'u':
387                         free(arg_user);
388                         arg_user = strdup(optarg);
389                         if (!arg_user)
390                                 return log_oom();
391
392                         break;
393
394                 case ARG_NETWORK_BRIDGE:
395                         arg_network_bridge = optarg;
396
397                         /* fall through */
398
399                 case 'n':
400                         arg_network_veth = true;
401                         arg_private_network = true;
402                         break;
403
404                 case ARG_NETWORK_INTERFACE:
405                         if (strv_extend(&arg_network_interfaces, optarg) < 0)
406                                 return log_oom();
407
408                         arg_private_network = true;
409                         break;
410
411                 case ARG_NETWORK_MACVLAN:
412                         if (strv_extend(&arg_network_macvlan, optarg) < 0)
413                                 return log_oom();
414
415                         arg_private_network = true;
416                         break;
417
418                 case ARG_NETWORK_IPVLAN:
419                         if (strv_extend(&arg_network_ipvlan, optarg) < 0)
420                                 return log_oom();
421
422                         /* fall through */
423
424                 case ARG_PRIVATE_NETWORK:
425                         arg_private_network = true;
426                         break;
427
428                 case 'b':
429                         arg_boot = true;
430                         break;
431
432                 case ARG_UUID:
433                         r = sd_id128_from_string(optarg, &arg_uuid);
434                         if (r < 0) {
435                                 log_error("Invalid UUID: %s", optarg);
436                                 return r;
437                         }
438                         break;
439
440                 case 'S':
441                         arg_slice = optarg;
442                         break;
443
444                 case 'M':
445                         if (isempty(optarg)) {
446                                 free(arg_machine);
447                                 arg_machine = NULL;
448                         } else {
449                                 if (!machine_name_is_valid(optarg)) {
450                                         log_error("Invalid machine name: %s", optarg);
451                                         return -EINVAL;
452                                 }
453
454                                 r = free_and_strdup(&arg_machine, optarg);
455                                 if (r < 0)
456                                         return log_oom();
457
458                                 break;
459                         }
460
461                 case 'Z':
462                         arg_selinux_context = optarg;
463                         break;
464
465                 case 'L':
466                         arg_selinux_apifs_context = optarg;
467                         break;
468
469                 case ARG_READ_ONLY:
470                         arg_read_only = true;
471                         break;
472
473                 case ARG_CAPABILITY:
474                 case ARG_DROP_CAPABILITY: {
475                         const char *state, *word;
476                         size_t length;
477
478                         FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
479                                 _cleanup_free_ char *t;
480
481                                 t = strndup(word, length);
482                                 if (!t)
483                                         return log_oom();
484
485                                 if (streq(t, "all")) {
486                                         if (c == ARG_CAPABILITY)
487                                                 plus = (uint64_t) -1;
488                                         else
489                                                 minus = (uint64_t) -1;
490                                 } else {
491                                         int cap;
492
493                                         cap = capability_from_name(t);
494                                         if (cap < 0) {
495                                                 log_error("Failed to parse capability %s.", t);
496                                                 return -EINVAL;
497                                         }
498
499                                         if (c == ARG_CAPABILITY)
500                                                 plus |= 1ULL << (uint64_t) cap;
501                                         else
502                                                 minus |= 1ULL << (uint64_t) cap;
503                                 }
504                         }
505
506                         break;
507                 }
508
509                 case 'j':
510                         arg_link_journal = LINK_GUEST;
511                         arg_link_journal_try = true;
512                         break;
513
514                 case ARG_LINK_JOURNAL:
515                         if (streq(optarg, "auto")) {
516                                 arg_link_journal = LINK_AUTO;
517                                 arg_link_journal_try = false;
518                         } else if (streq(optarg, "no")) {
519                                 arg_link_journal = LINK_NO;
520                                 arg_link_journal_try = false;
521                         } else if (streq(optarg, "guest")) {
522                                 arg_link_journal = LINK_GUEST;
523                                 arg_link_journal_try = false;
524                         } else if (streq(optarg, "host")) {
525                                 arg_link_journal = LINK_HOST;
526                                 arg_link_journal_try = false;
527                         } else if (streq(optarg, "try-guest")) {
528                                 arg_link_journal = LINK_GUEST;
529                                 arg_link_journal_try = true;
530                         } else if (streq(optarg, "try-host")) {
531                                 arg_link_journal = LINK_HOST;
532                                 arg_link_journal_try = true;
533                         } else {
534                                 log_error("Failed to parse link journal mode %s", optarg);
535                                 return -EINVAL;
536                         }
537
538                         break;
539
540                 case ARG_BIND:
541                 case ARG_BIND_RO: {
542                         _cleanup_free_ char *a = NULL, *b = NULL;
543                         char *e;
544                         char ***x;
545
546                         x = c == ARG_BIND ? &arg_bind : &arg_bind_ro;
547
548                         e = strchr(optarg, ':');
549                         if (e) {
550                                 a = strndup(optarg, e - optarg);
551                                 b = strdup(e + 1);
552                         } else {
553                                 a = strdup(optarg);
554                                 b = strdup(optarg);
555                         }
556
557                         if (!a || !b)
558                                 return log_oom();
559
560                         if (!path_is_absolute(a) || !path_is_absolute(b)) {
561                                 log_error("Invalid bind mount specification: %s", optarg);
562                                 return -EINVAL;
563                         }
564
565                         r = strv_extend(x, a);
566                         if (r < 0)
567                                 return log_oom();
568
569                         r = strv_extend(x, b);
570                         if (r < 0)
571                                 return log_oom();
572
573                         break;
574                 }
575
576                 case ARG_TMPFS: {
577                         _cleanup_free_ char *a = NULL, *b = NULL;
578                         char *e;
579
580                         e = strchr(optarg, ':');
581                         if (e) {
582                                 a = strndup(optarg, e - optarg);
583                                 b = strdup(e + 1);
584                         } else {
585                                 a = strdup(optarg);
586                                 b = strdup("mode=0755");
587                         }
588
589                         if (!a || !b)
590                                 return log_oom();
591
592                         if (!path_is_absolute(a)) {
593                                 log_error("Invalid tmpfs specification: %s", optarg);
594                                 return -EINVAL;
595                         }
596
597                         r = strv_push(&arg_tmpfs, a);
598                         if (r < 0)
599                                 return log_oom();
600
601                         a = NULL;
602
603                         r = strv_push(&arg_tmpfs, b);
604                         if (r < 0)
605                                 return log_oom();
606
607                         b = NULL;
608
609                         break;
610                 }
611
612                 case ARG_SETENV: {
613                         char **n;
614
615                         if (!env_assignment_is_valid(optarg)) {
616                                 log_error("Environment variable assignment '%s' is not valid.", optarg);
617                                 return -EINVAL;
618                         }
619
620                         n = strv_env_set(arg_setenv, optarg);
621                         if (!n)
622                                 return log_oom();
623
624                         strv_free(arg_setenv);
625                         arg_setenv = n;
626                         break;
627                 }
628
629                 case 'q':
630                         arg_quiet = true;
631                         break;
632
633                 case ARG_SHARE_SYSTEM:
634                         arg_share_system = true;
635                         break;
636
637                 case ARG_REGISTER:
638                         r = parse_boolean(optarg);
639                         if (r < 0) {
640                                 log_error("Failed to parse --register= argument: %s", optarg);
641                                 return r;
642                         }
643
644                         arg_register = r;
645                         break;
646
647                 case ARG_KEEP_UNIT:
648                         arg_keep_unit = true;
649                         break;
650
651                 case ARG_PERSONALITY:
652
653                         arg_personality = personality_from_string(optarg);
654                         if (arg_personality == 0xffffffffLU) {
655                                 log_error("Unknown or unsupported personality '%s'.", optarg);
656                                 return -EINVAL;
657                         }
658
659                         break;
660
661                 case ARG_VOLATILE:
662
663                         if (!optarg)
664                                 arg_volatile = VOLATILE_YES;
665                         else {
666                                 r = parse_boolean(optarg);
667                                 if (r < 0) {
668                                         if (streq(optarg, "state"))
669                                                 arg_volatile = VOLATILE_STATE;
670                                         else {
671                                                 log_error("Failed to parse --volatile= argument: %s", optarg);
672                                                 return r;
673                                         }
674                                 } else
675                                         arg_volatile = r ? VOLATILE_YES : VOLATILE_NO;
676                         }
677
678                         break;
679
680                 case 'p': {
681                         const char *split, *e;
682                         uint16_t container_port, host_port;
683                         int protocol;
684                         ExposePort *p;
685
686                         if ((e = startswith(optarg, "tcp:")))
687                                 protocol = IPPROTO_TCP;
688                         else if ((e = startswith(optarg, "udp:")))
689                                 protocol = IPPROTO_UDP;
690                         else {
691                                 e = optarg;
692                                 protocol = IPPROTO_TCP;
693                         }
694
695                         split = strchr(e, ':');
696                         if (split) {
697                                 char v[split - e + 1];
698
699                                 memcpy(v, e, split - e);
700                                 v[split - e] = 0;
701
702                                 r = safe_atou16(v, &host_port);
703                                 if (r < 0 || host_port <= 0) {
704                                         log_error("Failed to parse host port: %s", optarg);
705                                         return -EINVAL;
706                                 }
707
708                                 r = safe_atou16(split + 1, &container_port);
709                         } else {
710                                 r = safe_atou16(e, &container_port);
711                                 host_port = container_port;
712                         }
713
714                         if (r < 0 || container_port <= 0) {
715                                 log_error("Failed to parse host port: %s", optarg);
716                                 return -EINVAL;
717                         }
718
719                         LIST_FOREACH(ports, p, arg_expose_ports) {
720                                 if (p->protocol == protocol && p->host_port == host_port) {
721                                         log_error("Duplicate port specification: %s", optarg);
722                                         return -EINVAL;
723                                 }
724                         }
725
726                         p = new(ExposePort, 1);
727                         if (!p)
728                                 return log_oom();
729
730                         p->protocol = protocol;
731                         p->host_port = host_port;
732                         p->container_port = container_port;
733
734                         LIST_PREPEND(ports, arg_expose_ports, p);
735
736                         break;
737                 }
738
739                 case ARG_PROPERTY:
740                         if (strv_extend(&arg_property, optarg) < 0)
741                                 return log_oom();
742
743                         break;
744
745                 case ARG_PRIVATE_USERS:
746                         if (optarg) {
747                                 _cleanup_free_ char *buffer = NULL;
748                                 const char *range, *shift;
749
750                                 range = strchr(optarg, ':');
751                                 if (range) {
752                                         buffer = strndup(optarg, range - optarg);
753                                         if (!buffer)
754                                                 return log_oom();
755                                         shift = buffer;
756
757                                         range++;
758                                         if (safe_atou32(range, &arg_uid_range) < 0 || arg_uid_range <= 0) {
759                                                 log_error("Failed to parse UID range: %s", range);
760                                                 return -EINVAL;
761                                         }
762                                 } else
763                                         shift = optarg;
764
765                                 if (parse_uid(shift, &arg_uid_shift) < 0) {
766                                         log_error("Failed to parse UID: %s", optarg);
767                                         return -EINVAL;
768                                 }
769                         }
770
771                         arg_userns = true;
772                         break;
773
774                 case ARG_KILL_SIGNAL:
775                         arg_kill_signal = signal_from_string_try_harder(optarg);
776                         if (arg_kill_signal < 0) {
777                                 log_error("Cannot parse signal: %s", optarg);
778                                 return -EINVAL;
779                         }
780
781                         break;
782
783                 case '?':
784                         return -EINVAL;
785
786                 default:
787                         assert_not_reached("Unhandled option");
788                 }
789
790         if (arg_share_system)
791                 arg_register = false;
792
793         if (arg_boot && arg_share_system) {
794                 log_error("--boot and --share-system may not be combined.");
795                 return -EINVAL;
796         }
797
798         if (arg_keep_unit && cg_pid_get_owner_uid(0, NULL) >= 0) {
799                 log_error("--keep-unit may not be used when invoked from a user session.");
800                 return -EINVAL;
801         }
802
803         if (arg_directory && arg_image) {
804                 log_error("--directory= and --image= may not be combined.");
805                 return -EINVAL;
806         }
807
808         if (arg_template && arg_image) {
809                 log_error("--template= and --image= may not be combined.");
810                 return -EINVAL;
811         }
812
813         if (arg_template && !(arg_directory || arg_machine)) {
814                 log_error("--template= needs --directory= or --machine=.");
815                 return -EINVAL;
816         }
817
818         if (arg_ephemeral && arg_template) {
819                 log_error("--ephemeral and --template= may not be combined.");
820                 return -EINVAL;
821         }
822
823         if (arg_ephemeral && arg_image) {
824                 log_error("--ephemeral and --image= may not be combined.");
825                 return -EINVAL;
826         }
827
828         if (arg_ephemeral && !IN_SET(arg_link_journal, LINK_NO, LINK_AUTO)) {
829                 log_error("--ephemeral and --link-journal= may not be combined.");
830                 return -EINVAL;
831         }
832
833         if (arg_volatile != VOLATILE_NO && arg_read_only) {
834                 log_error("Cannot combine --read-only with --volatile. Note that --volatile already implies a read-only base hierarchy.");
835                 return -EINVAL;
836         }
837
838         if (arg_expose_ports && !arg_private_network) {
839                 log_error("Cannot use --port= without private networking.");
840                 return -EINVAL;
841         }
842
843         arg_retain = (arg_retain | plus | (arg_private_network ? 1ULL << CAP_NET_ADMIN : 0)) & ~minus;
844
845         if (arg_boot && arg_kill_signal <= 0)
846                 arg_kill_signal = SIGRTMIN+3;
847
848         return 1;
849 }
850
851 static int mount_all(const char *dest) {
852
853         typedef struct MountPoint {
854                 const char *what;
855                 const char *where;
856                 const char *type;
857                 const char *options;
858                 unsigned long flags;
859                 bool fatal;
860         } MountPoint;
861
862         static const MountPoint mount_table[] = {
863                 { "proc",      "/proc",     "proc",  NULL,        MS_NOSUID|MS_NOEXEC|MS_NODEV,           true  },
864                 { "/proc/sys", "/proc/sys", NULL,    NULL,        MS_BIND,                                true  },   /* Bind mount first */
865                 { NULL,        "/proc/sys", NULL,    NULL,        MS_BIND|MS_RDONLY|MS_REMOUNT,           true  },   /* Then, make it r/o */
866                 { "sysfs",     "/sys",      "sysfs", NULL,        MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true  },
867                 { "tmpfs",     "/dev",      "tmpfs", "mode=755",  MS_NOSUID|MS_STRICTATIME,               true  },
868                 { "devpts",    "/dev/pts",  "devpts","newinstance,ptmxmode=0666,mode=620,gid=" STRINGIFY(TTY_GID), MS_NOSUID|MS_NOEXEC, true },
869                 { "tmpfs",     "/dev/shm",  "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME,      true  },
870                 { "tmpfs",     "/run",      "tmpfs", "mode=755",  MS_NOSUID|MS_NODEV|MS_STRICTATIME,      true  },
871                 { "tmpfs",     "/tmp",      "tmpfs", "mode=1777", MS_STRICTATIME,                         true  },
872 #ifdef HAVE_SELINUX
873                 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND,                              false },  /* Bind mount first */
874                 { NULL,              "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT,         false },  /* Then, make it r/o */
875 #endif
876         };
877
878         unsigned k;
879         int r = 0;
880
881         for (k = 0; k < ELEMENTSOF(mount_table); k++) {
882                 _cleanup_free_ char *where = NULL, *options = NULL;
883                 const char *o;
884                 int t;
885
886                 where = strjoin(dest, "/", mount_table[k].where, NULL);
887                 if (!where)
888                         return log_oom();
889
890                 t = path_is_mount_point(where, true);
891                 if (t < 0) {
892                         log_error_errno(t, "Failed to detect whether %s is a mount point: %m", where);
893
894                         if (r == 0)
895                                 r = t;
896
897                         continue;
898                 }
899
900                 /* Skip this entry if it is not a remount. */
901                 if (mount_table[k].what && t > 0)
902                         continue;
903
904                 t = mkdir_p(where, 0755);
905                 if (t < 0) {
906                         if (mount_table[k].fatal) {
907                                log_error_errno(t, "Failed to create directory %s: %m", where);
908
909                                 if (r == 0)
910                                         r = t;
911                         } else
912                                log_warning_errno(t, "Failed to create directory %s: %m", where);
913
914                         continue;
915                 }
916
917 #ifdef HAVE_SELINUX
918                 if (arg_selinux_apifs_context &&
919                     (streq_ptr(mount_table[k].what, "tmpfs") || streq_ptr(mount_table[k].what, "devpts"))) {
920                         options = strjoin(mount_table[k].options, ",context=\"", arg_selinux_apifs_context, "\"", NULL);
921                         if (!options)
922                                 return log_oom();
923
924                         o = options;
925                 } else
926 #endif
927                         o = mount_table[k].options;
928
929                 if (arg_userns && arg_uid_shift != UID_INVALID && streq_ptr(mount_table[k].type, "tmpfs")) {
930                         char *uid_options = NULL;
931
932                         if (o)
933                                 asprintf(&uid_options, "%s,uid=" UID_FMT ",gid=" UID_FMT, o, arg_uid_shift, arg_uid_shift);
934                         else
935                                 asprintf(&uid_options, "uid=" UID_FMT ",gid=" UID_FMT, arg_uid_shift, arg_uid_shift);
936                         if (!uid_options)
937                                 return log_oom();
938
939                         free(options);
940                         o = options = uid_options;
941                 }
942
943                 if (mount(mount_table[k].what,
944                           where,
945                           mount_table[k].type,
946                           mount_table[k].flags,
947                           o) < 0) {
948
949                         if (mount_table[k].fatal) {
950                                 log_error_errno(errno, "mount(%s) failed: %m", where);
951
952                                 if (r == 0)
953                                         r = -errno;
954                         } else
955                                 log_warning_errno(errno, "mount(%s) failed: %m", where);
956                 }
957         }
958
959         return r;
960 }
961
962 static int mount_binds(const char *dest, char **l, bool ro) {
963         char **x, **y;
964
965         STRV_FOREACH_PAIR(x, y, l) {
966                 _cleanup_free_ char *where = NULL;
967                 struct stat source_st, dest_st;
968                 int r;
969
970                 if (stat(*x, &source_st) < 0)
971                         return log_error_errno(errno, "Failed to stat %s: %m", *x);
972
973                 where = strappend(dest, *y);
974                 if (!where)
975                         return log_oom();
976
977                 r = stat(where, &dest_st);
978                 if (r == 0) {
979                         if (S_ISDIR(source_st.st_mode) && !S_ISDIR(dest_st.st_mode)) {
980                                 log_error("Cannot bind mount directory %s on file %s.", *x, where);
981                                 return -EINVAL;
982                         }
983                         if (!S_ISDIR(source_st.st_mode) && S_ISDIR(dest_st.st_mode)) {
984                                 log_error("Cannot bind mount file %s on directory %s.", *x, where);
985                                 return -EINVAL;
986                         }
987                 } else if (errno == ENOENT) {
988                         r = mkdir_parents_label(where, 0755);
989                         if (r < 0)
990                                 return log_error_errno(r, "Failed to bind mount %s: %m", *x);
991                 } else {
992                         log_error_errno(errno, "Failed to bind mount %s: %m", *x);
993                         return -errno;
994                 }
995
996                 /* Create the mount point. Any non-directory file can be
997                  * mounted on any non-directory file (regular, fifo, socket,
998                  * char, block).
999                  */
1000                 if (S_ISDIR(source_st.st_mode)) {
1001                         r = mkdir_label(where, 0755);
1002                         if (r < 0 && errno != EEXIST)
1003                                 return log_error_errno(r, "Failed to create mount point %s: %m", where);
1004                 } else {
1005                         r = touch(where);
1006                         if (r < 0)
1007                                 return log_error_errno(r, "Failed to create mount point %s: %m", where);
1008                 }
1009
1010                 if (mount(*x, where, "bind", MS_BIND, NULL) < 0)
1011                         return log_error_errno(errno, "mount(%s) failed: %m", where);
1012
1013                 if (ro) {
1014                         r = bind_remount_recursive(where, true);
1015                         if (r < 0)
1016                                 return log_error_errno(r, "Read-Only bind mount failed: %m");
1017                 }
1018         }
1019
1020         return 0;
1021 }
1022
/* Mounts the cgroup hierarchy selected by the mount option string
 * "controller" to <dest>/sys/fs/cgroup/<hierarchy>, optionally turning
 * the mount read-only afterwards.
 *
 * Returns 0 if the target already is a mount point (nothing is done
 * then), 1 if the hierarchy was mounted, and a negative errno-style
 * error on failure. */
static int mount_cgroup_hierarchy(const char *dest, const char *controller, const char *hierarchy, bool read_only) {
        const unsigned long base_flags = MS_NOSUID|MS_NOEXEC|MS_NODEV;
        char *target;
        int mounted;

        target = strjoina(dest, "/sys/fs/cgroup/", hierarchy);

        mounted = path_is_mount_point(target, false);
        if (mounted < 0)
                return log_error_errno(mounted, "Failed to determine if %s is mounted already: %m", target);
        if (mounted > 0)
                return 0;

        /* The result is not checked here; the mount() call right
         * below is what reports errors. */
        mkdir_p(target, 0755);

        /* The superblock mount options of the mount point need to be
         * identical to the host's, and hence writable ... */
        if (mount("cgroup", target, "cgroup", base_flags, controller) < 0)
                return log_error_errno(errno, "Failed to mount to %s: %m", target);

        if (!read_only)
                return 1;

        /* ... which is why only the bind mount is made read-only,
         * not the superblock itself. */
        if (mount(NULL, target, NULL, MS_BIND|MS_REMOUNT|base_flags|MS_RDONLY, NULL) < 0)
                return log_error_errno(errno, "Failed to remount %s read-only: %m", target);

        return 1;
}
1050
1051 static int mount_cgroup(const char *dest) {
1052         _cleanup_set_free_free_ Set *controllers = NULL;
1053         _cleanup_free_ char *own_cgroup_path = NULL;
1054         const char *cgroup_root, *systemd_root, *systemd_own;
1055         int r;
1056
1057         controllers = set_new(&string_hash_ops);
1058         if (!controllers)
1059                 return log_oom();
1060
1061         r = cg_kernel_controllers(controllers);
1062         if (r < 0)
1063                 return log_error_errno(r, "Failed to determine cgroup controllers: %m");
1064
1065         r = cg_pid_get_path(NULL, 0, &own_cgroup_path);
1066         if (r < 0)
1067                 return log_error_errno(r, "Failed to determine our own cgroup path: %m");
1068
1069         cgroup_root = strjoina(dest, "/sys/fs/cgroup");
1070         if (mount("tmpfs", cgroup_root, "tmpfs", MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME, "mode=755") < 0)
1071                 return log_error_errno(errno, "Failed to mount tmpfs to /sys/fs/cgroup: %m");
1072
1073         for (;;) {
1074                 _cleanup_free_ char *controller = NULL, *origin = NULL, *combined = NULL;
1075
1076                 controller = set_steal_first(controllers);
1077                 if (!controller)
1078                         break;
1079
1080                 origin = strappend("/sys/fs/cgroup/", controller);
1081                 if (!origin)
1082                         return log_oom();
1083
1084                 r = readlink_malloc(origin, &combined);
1085                 if (r == -EINVAL) {
1086                         /* Not a symbolic link, but directly a single cgroup hierarchy */
1087
1088                         r = mount_cgroup_hierarchy(dest, controller, controller, true);
1089                         if (r < 0)
1090                                 return r;
1091
1092                 } else if (r < 0)
1093                         return log_error_errno(r, "Failed to read link %s: %m", origin);
1094                 else {
1095                         _cleanup_free_ char *target = NULL;
1096
1097                         target = strjoin(dest, "/sys/fs/cgroup/", controller, NULL);
1098                         if (!target)
1099                                 return log_oom();
1100
1101                         /* A symbolic link, a combination of controllers in one hierarchy */
1102
1103                         if (!filename_is_valid(combined)) {
1104                                 log_warning("Ignoring invalid combined hierarchy %s.", combined);
1105                                 continue;
1106                         }
1107
1108                         r = mount_cgroup_hierarchy(dest, combined, combined, true);
1109                         if (r < 0)
1110                                 return r;
1111
1112                         if (symlink(combined, target) < 0)
1113                                 return log_error_errno(errno, "Failed to create symlink for combined hierarchy: %m");
1114                 }
1115         }
1116
1117         r = mount_cgroup_hierarchy(dest, "name=systemd,xattr", "systemd", false);
1118         if (r < 0)
1119                 return r;
1120
1121         /* Make our own cgroup a (writable) bind mount */
1122         systemd_own = strjoina(dest, "/sys/fs/cgroup/systemd", own_cgroup_path);
1123         if (mount(systemd_own, systemd_own,  NULL, MS_BIND, NULL) < 0)
1124                 return log_error_errno(errno, "Failed to turn %s into a bind mount: %m", own_cgroup_path);
1125
1126         /* And then remount the systemd cgroup root read-only */
1127         systemd_root = strjoina(dest, "/sys/fs/cgroup/systemd");
1128         if (mount(NULL, systemd_root, NULL, MS_BIND|MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_RDONLY, NULL) < 0)
1129                 return log_error_errno(errno, "Failed to mount cgroup root read-only: %m");
1130
1131         if (mount(NULL, cgroup_root, NULL, MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME|MS_RDONLY, "mode=755") < 0)
1132                 return log_error_errno(errno, "Failed to remount %s read-only: %m", cgroup_root);
1133
1134         return 0;
1135 }
1136
1137 static int mount_tmpfs(const char *dest) {
1138         char **i, **o;
1139
1140         STRV_FOREACH_PAIR(i, o, arg_tmpfs) {
1141                 _cleanup_free_ char *where = NULL;
1142                 int r;
1143
1144                 where = strappend(dest, *i);
1145                 if (!where)
1146                         return log_oom();
1147
1148                 r = mkdir_label(where, 0755);
1149                 if (r < 0 && r != -EEXIST)
1150                         return log_error_errno(r, "Creating mount point for tmpfs %s failed: %m", where);
1151
1152                 if (mount("tmpfs", where, "tmpfs", MS_NODEV|MS_STRICTATIME, *o) < 0)
1153                         return log_error_errno(errno, "tmpfs mount to %s failed: %m", where);
1154         }
1155
1156         return 0;
1157 }
1158
1159 static int setup_timezone(const char *dest) {
1160         _cleanup_free_ char *where = NULL, *p = NULL, *q = NULL, *check = NULL, *what = NULL;
1161         char *z, *y;
1162         int r;
1163
1164         assert(dest);
1165
1166         /* Fix the timezone, if possible */
1167         r = readlink_malloc("/etc/localtime", &p);
1168         if (r < 0) {
1169                 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
1170                 return 0;
1171         }
1172
1173         z = path_startswith(p, "../usr/share/zoneinfo/");
1174         if (!z)
1175                 z = path_startswith(p, "/usr/share/zoneinfo/");
1176         if (!z) {
1177                 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
1178                 return 0;
1179         }
1180
1181         where = strappend(dest, "/etc/localtime");
1182         if (!where)
1183                 return log_oom();
1184
1185         r = readlink_malloc(where, &q);
1186         if (r >= 0) {
1187                 y = path_startswith(q, "../usr/share/zoneinfo/");
1188                 if (!y)
1189                         y = path_startswith(q, "/usr/share/zoneinfo/");
1190
1191                 /* Already pointing to the right place? Then do nothing .. */
1192                 if (y && streq(y, z))
1193                         return 0;
1194         }
1195
1196         check = strjoin(dest, "/usr/share/zoneinfo/", z, NULL);
1197         if (!check)
1198                 return log_oom();
1199
1200         if (access(check, F_OK) < 0) {
1201                 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
1202                 return 0;
1203         }
1204
1205         what = strappend("../usr/share/zoneinfo/", z);
1206         if (!what)
1207                 return log_oom();
1208
1209         r = mkdir_parents(where, 0755);
1210         if (r < 0) {
1211                 log_error_errno(r, "Failed to create directory for timezone info %s in container: %m", where);
1212
1213                 return 0;
1214         }
1215
1216         r = unlink(where);
1217         if (r < 0 && errno != ENOENT) {
1218                 log_error_errno(errno, "Failed to remove existing timezone info %s in container: %m", where);
1219
1220                 return 0;
1221         }
1222
1223         if (symlink(what, where) < 0) {
1224                 log_error_errno(errno, "Failed to correct timezone of container: %m");
1225                 return 0;
1226         }
1227
1228         return 0;
1229 }
1230
1231 static int setup_resolv_conf(const char *dest) {
1232         _cleanup_free_ char *where = NULL;
1233         int r;
1234
1235         assert(dest);
1236
1237         if (arg_private_network)
1238                 return 0;
1239
1240         /* Fix resolv.conf, if possible */
1241         where = strappend(dest, "/etc/resolv.conf");
1242         if (!where)
1243                 return log_oom();
1244
1245         /* We don't really care for the results of this really. If it
1246          * fails, it fails, but meh... */
1247         r = mkdir_parents(where, 0755);
1248         if (r < 0) {
1249                 log_warning_errno(r, "Failed to create parent directory for resolv.conf %s: %m", where);
1250
1251                 return 0;
1252         }
1253
1254         r = copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW, 0644, 0);
1255         if (r < 0) {
1256                 log_warning_errno(r, "Failed to copy /etc/resolv.conf to %s: %m", where);
1257
1258                 return 0;
1259         }
1260
1261         return 0;
1262 }
1263
1264 static int setup_volatile_state(const char *directory) {
1265         const char *p;
1266         int r;
1267
1268         assert(directory);
1269
1270         if (arg_volatile != VOLATILE_STATE)
1271                 return 0;
1272
1273         /* --volatile=state means we simply overmount /var
1274            with a tmpfs, and the rest read-only. */
1275
1276         r = bind_remount_recursive(directory, true);
1277         if (r < 0)
1278                 return log_error_errno(r, "Failed to remount %s read-only: %m", directory);
1279
1280         p = strjoina(directory, "/var");
1281         r = mkdir(p, 0755);
1282         if (r < 0 && errno != EEXIST)
1283                 return log_error_errno(errno, "Failed to create %s: %m", directory);
1284
1285         if (mount("tmpfs", p, "tmpfs", MS_STRICTATIME, "mode=755") < 0)
1286                 return log_error_errno(errno, "Failed to mount tmpfs to /var: %m");
1287
1288         return 0;
1289 }
1290
1291 static int setup_volatile(const char *directory) {
1292         bool tmpfs_mounted = false, bind_mounted = false;
1293         char template[] = "/tmp/nspawn-volatile-XXXXXX";
1294         const char *f, *t;
1295         int r;
1296
1297         assert(directory);
1298
1299         if (arg_volatile != VOLATILE_YES)
1300                 return 0;
1301
1302         /* --volatile=yes means we mount a tmpfs to the root dir, and
1303            the original /usr to use inside it, and that read-only. */
1304
1305         if (!mkdtemp(template))
1306                 return log_error_errno(errno, "Failed to create temporary directory: %m");
1307
1308         if (mount("tmpfs", template, "tmpfs", MS_STRICTATIME, "mode=755") < 0) {
1309                 log_error_errno(errno, "Failed to mount tmpfs for root directory: %m");
1310                 r = -errno;
1311                 goto fail;
1312         }
1313
1314         tmpfs_mounted = true;
1315
1316         f = strjoina(directory, "/usr");
1317         t = strjoina(template, "/usr");
1318
1319         r = mkdir(t, 0755);
1320         if (r < 0 && errno != EEXIST) {
1321                 log_error_errno(errno, "Failed to create %s: %m", t);
1322                 r = -errno;
1323                 goto fail;
1324         }
1325
1326         if (mount(f, t, "bind", MS_BIND|MS_REC, NULL) < 0) {
1327                 log_error_errno(errno, "Failed to create /usr bind mount: %m");
1328                 r = -errno;
1329                 goto fail;
1330         }
1331
1332         bind_mounted = true;
1333
1334         r = bind_remount_recursive(t, true);
1335         if (r < 0) {
1336                 log_error_errno(r, "Failed to remount %s read-only: %m", t);
1337                 goto fail;
1338         }
1339
1340         if (mount(template, directory, NULL, MS_MOVE, NULL) < 0) {
1341                 log_error_errno(errno, "Failed to move root mount: %m");
1342                 r = -errno;
1343                 goto fail;
1344         }
1345
1346         rmdir(template);
1347
1348         return 0;
1349
1350 fail:
1351         if (bind_mounted)
1352                 umount(t);
1353         if (tmpfs_mounted)
1354                 umount(template);
1355         rmdir(template);
1356         return r;
1357 }
1358
1359 static char* id128_format_as_uuid(sd_id128_t id, char s[37]) {
1360
1361         snprintf(s, 37,
1362                  "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
1363                  SD_ID128_FORMAT_VAL(id));
1364
1365         return s;
1366 }
1367
1368 static int setup_boot_id(const char *dest) {
1369         _cleanup_free_ char *from = NULL, *to = NULL;
1370         sd_id128_t rnd = {};
1371         char as_uuid[37];
1372         int r;
1373
1374         assert(dest);
1375
1376         if (arg_share_system)
1377                 return 0;
1378
1379         /* Generate a new randomized boot ID, so that each boot-up of
1380          * the container gets a new one */
1381
1382         from = strappend(dest, "/dev/proc-sys-kernel-random-boot-id");
1383         to = strappend(dest, "/proc/sys/kernel/random/boot_id");
1384         if (!from || !to)
1385                 return log_oom();
1386
1387         r = sd_id128_randomize(&rnd);
1388         if (r < 0)
1389                 return log_error_errno(r, "Failed to generate random boot id: %m");
1390
1391         id128_format_as_uuid(rnd, as_uuid);
1392
1393         r = write_string_file(from, as_uuid);
1394         if (r < 0)
1395                 return log_error_errno(r, "Failed to write boot id: %m");
1396
1397         if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
1398                 log_error_errno(errno, "Failed to bind mount boot id: %m");
1399                 r = -errno;
1400         } else if (mount(from, to, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL))
1401                 log_warning_errno(errno, "Failed to make boot id read-only: %m");
1402
1403         unlink(from);
1404         return r;
1405 }
1406
1407 static int copy_devnodes(const char *dest) {
1408
1409         static const char devnodes[] =
1410                 "null\0"
1411                 "zero\0"
1412                 "full\0"
1413                 "random\0"
1414                 "urandom\0"
1415                 "tty\0"
1416                 "net/tun\0";
1417
1418         const char *d;
1419         int r = 0;
1420         _cleanup_umask_ mode_t u;
1421
1422         assert(dest);
1423
1424         u = umask(0000);
1425
1426         NULSTR_FOREACH(d, devnodes) {
1427                 _cleanup_free_ char *from = NULL, *to = NULL;
1428                 struct stat st;
1429
1430                 from = strappend("/dev/", d);
1431                 to = strjoin(dest, "/dev/", d, NULL);
1432                 if (!from || !to)
1433                         return log_oom();
1434
1435                 if (stat(from, &st) < 0) {
1436
1437                         if (errno != ENOENT)
1438                                 return log_error_errno(errno, "Failed to stat %s: %m", from);
1439
1440                 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
1441
1442                         log_error("%s is not a char or block device, cannot copy", from);
1443                         return -EIO;
1444
1445                 } else {
1446                         r = mkdir_parents(to, 0775);
1447                         if (r < 0) {
1448                                 log_error_errno(r, "Failed to create parent directory of %s: %m", to);
1449                                 return -r;
1450                         }
1451
1452                         if (mknod(to, st.st_mode, st.st_rdev) < 0)
1453                                 return log_error_errno(errno, "mknod(%s) failed: %m", to);
1454
1455                         if (arg_userns && arg_uid_shift != UID_INVALID)
1456                                 if (lchown(to, arg_uid_shift, arg_uid_shift) < 0)
1457                                         return log_error_errno(errno, "chown() of device node %s failed: %m", to);
1458                 }
1459         }
1460
1461         return r;
1462 }
1463
1464 static int setup_ptmx(const char *dest) {
1465         _cleanup_free_ char *p = NULL;
1466
1467         p = strappend(dest, "/dev/ptmx");
1468         if (!p)
1469                 return log_oom();
1470
1471         if (symlink("pts/ptmx", p) < 0)
1472                 return log_error_errno(errno, "Failed to create /dev/ptmx symlink: %m");
1473
1474         if (arg_userns && arg_uid_shift != UID_INVALID)
1475                 if (lchown(p, arg_uid_shift, arg_uid_shift) < 0)
1476                         return log_error_errno(errno, "lchown() of symlink %s failed: %m", p);
1477
1478         return 0;
1479 }
1480
1481 static int setup_dev_console(const char *dest, const char *console) {
1482         _cleanup_umask_ mode_t u;
1483         const char *to;
1484         struct stat st;
1485         int r;
1486
1487         assert(dest);
1488         assert(console);
1489
1490         u = umask(0000);
1491
1492         if (stat("/dev/null", &st) < 0)
1493                 return log_error_errno(errno, "Failed to stat /dev/null: %m");
1494
1495         r = chmod_and_chown(console, 0600, 0, 0);
1496         if (r < 0)
1497                 return log_error_errno(r, "Failed to correct access mode for TTY: %m");
1498
1499         /* We need to bind mount the right tty to /dev/console since
1500          * ptys can only exist on pts file systems. To have something
1501          * to bind mount things on we create a device node first, and
1502          * use /dev/null for that since we the cgroups device policy
1503          * allows us to create that freely, while we cannot create
1504          * /dev/console. (Note that the major minor doesn't actually
1505          * matter here, since we mount it over anyway). */
1506
1507         to = strjoina(dest, "/dev/console");
1508         if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0)
1509                 return log_error_errno(errno, "mknod() for /dev/console failed: %m");
1510
1511         if (mount(console, to, "bind", MS_BIND, NULL) < 0)
1512                 return log_error_errno(errno, "Bind mount for /dev/console failed: %m");
1513
1514         return 0;
1515 }
1516
1517 static int setup_kmsg(const char *dest, int kmsg_socket) {
1518         _cleanup_free_ char *from = NULL, *to = NULL;
1519         _cleanup_umask_ mode_t u;
1520         int r, fd, k;
1521         union {
1522                 struct cmsghdr cmsghdr;
1523                 uint8_t buf[CMSG_SPACE(sizeof(int))];
1524         } control = {};
1525         struct msghdr mh = {
1526                 .msg_control = &control,
1527                 .msg_controllen = sizeof(control),
1528         };
1529         struct cmsghdr *cmsg;
1530
1531         assert(dest);
1532         assert(kmsg_socket >= 0);
1533
1534         u = umask(0000);
1535
1536         /* We create the kmsg FIFO as /dev/kmsg, but immediately
1537          * delete it after bind mounting it to /proc/kmsg. While FIFOs
1538          * on the reading side behave very similar to /proc/kmsg,
1539          * their writing side behaves differently from /dev/kmsg in
1540          * that writing blocks when nothing is reading. In order to
1541          * avoid any problems with containers deadlocking due to this
1542          * we simply make /dev/kmsg unavailable to the container. */
1543         if (asprintf(&from, "%s/dev/kmsg", dest) < 0 ||
1544             asprintf(&to, "%s/proc/kmsg", dest) < 0)
1545                 return log_oom();
1546
1547         if (mkfifo(from, 0600) < 0)
1548                 return log_error_errno(errno, "mkfifo() for /dev/kmsg failed: %m");
1549
1550         r = chmod_and_chown(from, 0600, 0, 0);
1551         if (r < 0)
1552                 return log_error_errno(r, "Failed to correct access mode for /dev/kmsg: %m");
1553
1554         if (mount(from, to, "bind", MS_BIND, NULL) < 0)
1555                 return log_error_errno(errno, "Bind mount for /proc/kmsg failed: %m");
1556
1557         fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
1558         if (fd < 0)
1559                 return log_error_errno(errno, "Failed to open fifo: %m");
1560
1561         cmsg = CMSG_FIRSTHDR(&mh);
1562         cmsg->cmsg_level = SOL_SOCKET;
1563         cmsg->cmsg_type = SCM_RIGHTS;
1564         cmsg->cmsg_len = CMSG_LEN(sizeof(int));
1565         memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
1566
1567         mh.msg_controllen = cmsg->cmsg_len;
1568
1569         /* Store away the fd in the socket, so that it stays open as
1570          * long as we run the child */
1571         k = sendmsg(kmsg_socket, &mh, MSG_NOSIGNAL);
1572         safe_close(fd);
1573
1574         if (k < 0)
1575                 return log_error_errno(errno, "Failed to send FIFO fd: %m");
1576
1577         /* And now make the FIFO unavailable as /dev/kmsg... */
1578         unlink(from);
1579         return 0;
1580 }
1581
1582 static int send_rtnl(int send_fd) {
1583         union {
1584                 struct cmsghdr cmsghdr;
1585                 uint8_t buf[CMSG_SPACE(sizeof(int))];
1586         } control = {};
1587         struct msghdr mh = {
1588                 .msg_control = &control,
1589                 .msg_controllen = sizeof(control),
1590         };
1591         struct cmsghdr *cmsg;
1592         _cleanup_close_ int fd = -1;
1593         ssize_t k;
1594
1595         assert(send_fd >= 0);
1596
1597         if (!arg_expose_ports)
1598                 return 0;
1599
1600         fd = socket(PF_NETLINK, SOCK_RAW|SOCK_CLOEXEC|SOCK_NONBLOCK, NETLINK_ROUTE);
1601         if (fd < 0)
1602                 return log_error_errno(errno, "failed to allocate container netlink: %m");
1603
1604         cmsg = CMSG_FIRSTHDR(&mh);
1605         cmsg->cmsg_level = SOL_SOCKET;
1606         cmsg->cmsg_type = SCM_RIGHTS;
1607         cmsg->cmsg_len = CMSG_LEN(sizeof(int));
1608         memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
1609
1610         mh.msg_controllen = cmsg->cmsg_len;
1611
1612         /* Store away the fd in the socket, so that it stays open as
1613          * long as we run the child */
1614         k = sendmsg(send_fd, &mh, MSG_NOSIGNAL);
1615         if (k < 0)
1616                 return log_error_errno(errno, "Failed to send netlink fd: %m");
1617
1618         return 0;
1619 }
1620
1621 static int flush_ports(union in_addr_union *exposed) {
1622         ExposePort *p;
1623         int r, af = AF_INET;
1624
1625         assert(exposed);
1626
1627         if (!arg_expose_ports)
1628                 return 0;
1629
1630         if (in_addr_is_null(af, exposed))
1631                 return 0;
1632
1633         log_debug("Lost IP address.");
1634
1635         LIST_FOREACH(ports, p, arg_expose_ports) {
1636                 r = fw_add_local_dnat(false,
1637                                       af,
1638                                       p->protocol,
1639                                       NULL,
1640                                       NULL, 0,
1641                                       NULL, 0,
1642                                       p->host_port,
1643                                       exposed,
1644                                       p->container_port,
1645                                       NULL);
1646                 if (r < 0)
1647                         log_warning_errno(r, "Failed to modify firewall: %m");
1648         }
1649
1650         *exposed = IN_ADDR_NULL;
1651         return 0;
1652 }
1653
1654 static int expose_ports(sd_rtnl *rtnl, union in_addr_union *exposed) {
1655         _cleanup_free_ struct local_address *addresses = NULL;
1656         _cleanup_free_ char *pretty = NULL;
1657         union in_addr_union new_exposed;
1658         ExposePort *p;
1659         bool add;
1660         int af = AF_INET, r;
1661
1662         assert(exposed);
1663
1664         /* Invoked each time an address is added or removed inside the
1665          * container */
1666
1667         if (!arg_expose_ports)
1668                 return 0;
1669
1670         r = local_addresses(rtnl, 0, af, &addresses);
1671         if (r < 0)
1672                 return log_error_errno(r, "Failed to enumerate local addresses: %m");
1673
1674         add = r > 0 &&
1675                 addresses[0].family == af &&
1676                 addresses[0].scope < RT_SCOPE_LINK;
1677
1678         if (!add)
1679                 return flush_ports(exposed);
1680
1681         new_exposed = addresses[0].address;
1682         if (in_addr_equal(af, exposed, &new_exposed))
1683                 return 0;
1684
1685         in_addr_to_string(af, &new_exposed, &pretty);
1686         log_debug("New container IP is %s.", strna(pretty));
1687
1688         LIST_FOREACH(ports, p, arg_expose_ports) {
1689
1690                 r = fw_add_local_dnat(true,
1691                                       af,
1692                                       p->protocol,
1693                                       NULL,
1694                                       NULL, 0,
1695                                       NULL, 0,
1696                                       p->host_port,
1697                                       &new_exposed,
1698                                       p->container_port,
1699                                       in_addr_is_null(af, exposed) ? NULL : exposed);
1700                 if (r < 0)
1701                         log_warning_errno(r, "Failed to modify firewall: %m");
1702         }
1703
1704         *exposed = new_exposed;
1705         return 0;
1706 }
1707
1708 static int on_address_change(sd_rtnl *rtnl, sd_rtnl_message *m, void *userdata) {
1709         union in_addr_union *exposed = userdata;
1710
1711         assert(rtnl);
1712         assert(m);
1713         assert(exposed);
1714
1715         expose_ports(rtnl, exposed);
1716         return 0;
1717 }
1718
1719 static int watch_rtnl(sd_event *event, int recv_fd, union in_addr_union *exposed, sd_rtnl **ret) {
1720         union {
1721                 struct cmsghdr cmsghdr;
1722                 uint8_t buf[CMSG_SPACE(sizeof(int))];
1723         } control = {};
1724         struct msghdr mh = {
1725                 .msg_control = &control,
1726                 .msg_controllen = sizeof(control),
1727         };
1728         struct cmsghdr *cmsg;
1729         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1730         int fd, r;
1731         ssize_t k;
1732
1733         assert(event);
1734         assert(recv_fd >= 0);
1735         assert(ret);
1736
1737         if (!arg_expose_ports)
1738                 return 0;
1739
1740         k = recvmsg(recv_fd, &mh, MSG_NOSIGNAL);
1741         if (k < 0)
1742                 return log_error_errno(errno, "Failed to recv netlink fd: %m");
1743
1744         cmsg = CMSG_FIRSTHDR(&mh);
1745         assert(cmsg->cmsg_level == SOL_SOCKET);
1746         assert(cmsg->cmsg_type == SCM_RIGHTS);
1747         assert(cmsg->cmsg_len == CMSG_LEN(sizeof(int)));
1748         memcpy(&fd, CMSG_DATA(cmsg), sizeof(int));
1749
1750         r = sd_rtnl_open_fd(&rtnl, fd, 1, RTNLGRP_IPV4_IFADDR);
1751         if (r < 0) {
1752                 safe_close(fd);
1753                 return log_error_errno(r, "Failed to create rtnl object: %m");
1754         }
1755
1756         r = sd_rtnl_add_match(rtnl, RTM_NEWADDR, on_address_change, exposed);
1757         if (r < 0)
1758                 return log_error_errno(r, "Failed to subscribe to RTM_NEWADDR messages: %m");
1759
1760         r = sd_rtnl_add_match(rtnl, RTM_DELADDR, on_address_change, exposed);
1761         if (r < 0)
1762                 return log_error_errno(r, "Failed to subscribe to RTM_DELADDR messages: %m");
1763
1764         r = sd_rtnl_attach_event(rtnl, event, 0);
1765         if (r < 0)
1766                 return log_error_errno(r, "Failed to add to even loop: %m");
1767
1768         *ret = rtnl;
1769         rtnl = NULL;
1770
1771         return 0;
1772 }
1773
1774 static int setup_hostname(void) {
1775
1776         if (arg_share_system)
1777                 return 0;
1778
1779         if (sethostname_idempotent(arg_machine) < 0)
1780                 return -errno;
1781
1782         return 0;
1783 }
1784
1785 static int setup_journal(const char *directory) {
1786         sd_id128_t machine_id, this_id;
1787         _cleanup_free_ char *p = NULL, *b = NULL, *q = NULL, *d = NULL;
1788         char *id;
1789         int r;
1790
1791         /* Don't link journals in ephemeral mode */
1792         if (arg_ephemeral)
1793                 return 0;
1794
1795         p = strappend(directory, "/etc/machine-id");
1796         if (!p)
1797                 return log_oom();
1798
1799         r = read_one_line_file(p, &b);
1800         if (r == -ENOENT && arg_link_journal == LINK_AUTO)
1801                 return 0;
1802         else if (r < 0)
1803                 return log_error_errno(r, "Failed to read machine ID from %s: %m", p);
1804
1805         id = strstrip(b);
1806         if (isempty(id) && arg_link_journal == LINK_AUTO)
1807                 return 0;
1808
1809         /* Verify validity */
1810         r = sd_id128_from_string(id, &machine_id);
1811         if (r < 0)
1812                 return log_error_errno(r, "Failed to parse machine ID from %s: %m", p);
1813
1814         r = sd_id128_get_machine(&this_id);
1815         if (r < 0)
1816                 return log_error_errno(r, "Failed to retrieve machine ID: %m");
1817
1818         if (sd_id128_equal(machine_id, this_id)) {
1819                 log_full(arg_link_journal == LINK_AUTO ? LOG_WARNING : LOG_ERR,
1820                          "Host and machine ids are equal (%s): refusing to link journals", id);
1821                 if (arg_link_journal == LINK_AUTO)
1822                         return 0;
1823                 return -EEXIST;
1824         }
1825
1826         if (arg_link_journal == LINK_NO)
1827                 return 0;
1828
1829         free(p);
1830         p = strappend("/var/log/journal/", id);
1831         q = strjoin(directory, "/var/log/journal/", id, NULL);
1832         if (!p || !q)
1833                 return log_oom();
1834
1835         if (path_is_mount_point(p, false) > 0) {
1836                 if (arg_link_journal != LINK_AUTO) {
1837                         log_error("%s: already a mount point, refusing to use for journal", p);
1838                         return -EEXIST;
1839                 }
1840
1841                 return 0;
1842         }
1843
1844         if (path_is_mount_point(q, false) > 0) {
1845                 if (arg_link_journal != LINK_AUTO) {
1846                         log_error("%s: already a mount point, refusing to use for journal", q);
1847                         return -EEXIST;
1848                 }
1849
1850                 return 0;
1851         }
1852
1853         r = readlink_and_make_absolute(p, &d);
1854         if (r >= 0) {
1855                 if ((arg_link_journal == LINK_GUEST ||
1856                      arg_link_journal == LINK_AUTO) &&
1857                     path_equal(d, q)) {
1858
1859                         r = mkdir_p(q, 0755);
1860                         if (r < 0)
1861                                 log_warning_errno(errno, "Failed to create directory %s: %m", q);
1862                         return 0;
1863                 }
1864
1865                 if (unlink(p) < 0)
1866                         return log_error_errno(errno, "Failed to remove symlink %s: %m", p);
1867         } else if (r == -EINVAL) {
1868
1869                 if (arg_link_journal == LINK_GUEST &&
1870                     rmdir(p) < 0) {
1871
1872                         if (errno == ENOTDIR) {
1873                                 log_error("%s already exists and is neither a symlink nor a directory", p);
1874                                 return r;
1875                         } else {
1876                                 log_error_errno(errno, "Failed to remove %s: %m", p);
1877                                 return -errno;
1878                         }
1879                 }
1880         } else if (r != -ENOENT) {
1881                 log_error_errno(errno, "readlink(%s) failed: %m", p);
1882                 return r;
1883         }
1884
1885         if (arg_link_journal == LINK_GUEST) {
1886
1887                 if (symlink(q, p) < 0) {
1888                         if (arg_link_journal_try) {
1889                                 log_debug_errno(errno, "Failed to symlink %s to %s, skipping journal setup: %m", q, p);
1890                                 return 0;
1891                         } else {
1892                                 log_error_errno(errno, "Failed to symlink %s to %s: %m", q, p);
1893                                 return -errno;
1894                         }
1895                 }
1896
1897                 r = mkdir_p(q, 0755);
1898                 if (r < 0)
1899                         log_warning_errno(errno, "Failed to create directory %s: %m", q);
1900                 return 0;
1901         }
1902
1903         if (arg_link_journal == LINK_HOST) {
1904                 /* don't create parents here -- if the host doesn't have
1905                  * permanent journal set up, don't force it here */
1906                 r = mkdir(p, 0755);
1907                 if (r < 0) {
1908                         if (arg_link_journal_try) {
1909                                 log_debug_errno(errno, "Failed to create %s, skipping journal setup: %m", p);
1910                                 return 0;
1911                         } else {
1912                                 log_error_errno(errno, "Failed to create %s: %m", p);
1913                                 return r;
1914                         }
1915                 }
1916
1917         } else if (access(p, F_OK) < 0)
1918                 return 0;
1919
1920         if (dir_is_empty(q) == 0)
1921                 log_warning("%s is not empty, proceeding anyway.", q);
1922
1923         r = mkdir_p(q, 0755);
1924         if (r < 0) {
1925                 log_error_errno(errno, "Failed to create %s: %m", q);
1926                 return r;
1927         }
1928
1929         if (mount(p, q, "bind", MS_BIND, NULL) < 0)
1930                 return log_error_errno(errno, "Failed to bind mount journal from host into guest: %m");
1931
1932         return 0;
1933 }
1934
1935 static int drop_capabilities(void) {
1936         return capability_bounding_set_drop(~arg_retain, false);
1937 }
1938
1939 static int register_machine(pid_t pid, int local_ifindex) {
1940         _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1941         _cleanup_bus_close_unref_ sd_bus *bus = NULL;
1942         int r;
1943
1944         if (!arg_register)
1945                 return 0;
1946
1947         r = sd_bus_default_system(&bus);
1948         if (r < 0)
1949                 return log_error_errno(r, "Failed to open system bus: %m");
1950
1951         if (arg_keep_unit) {
1952                 r = sd_bus_call_method(
1953                                 bus,
1954                                 "org.freedesktop.machine1",
1955                                 "/org/freedesktop/machine1",
1956                                 "org.freedesktop.machine1.Manager",
1957                                 "RegisterMachineWithNetwork",
1958                                 &error,
1959                                 NULL,
1960                                 "sayssusai",
1961                                 arg_machine,
1962                                 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1963                                 "nspawn",
1964                                 "container",
1965                                 (uint32_t) pid,
1966                                 strempty(arg_directory),
1967                                 local_ifindex > 0 ? 1 : 0, local_ifindex);
1968         } else {
1969                 _cleanup_bus_message_unref_ sd_bus_message *m = NULL;
1970                 char **i;
1971
1972                 r = sd_bus_message_new_method_call(
1973                                 bus,
1974                                 &m,
1975                                 "org.freedesktop.machine1",
1976                                 "/org/freedesktop/machine1",
1977                                 "org.freedesktop.machine1.Manager",
1978                                 "CreateMachineWithNetwork");
1979                 if (r < 0)
1980                         return bus_log_create_error(r);
1981
1982                 r = sd_bus_message_append(
1983                                 m,
1984                                 "sayssusai",
1985                                 arg_machine,
1986                                 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1987                                 "nspawn",
1988                                 "container",
1989                                 (uint32_t) pid,
1990                                 strempty(arg_directory),
1991                                 local_ifindex > 0 ? 1 : 0, local_ifindex);
1992                 if (r < 0)
1993                         return bus_log_create_error(r);
1994
1995                 r = sd_bus_message_open_container(m, 'a', "(sv)");
1996                 if (r < 0)
1997                         return bus_log_create_error(r);
1998
1999                 if (!isempty(arg_slice)) {
2000                         r = sd_bus_message_append(m, "(sv)", "Slice", "s", arg_slice);
2001                         if (r < 0)
2002                                 return bus_log_create_error(r);
2003                 }
2004
2005                 r = sd_bus_message_append(m, "(sv)", "DevicePolicy", "s", "strict");
2006                 if (r < 0)
2007                         return bus_log_create_error(r);
2008
2009                 r = sd_bus_message_append(m, "(sv)", "DeviceAllow", "a(ss)", 9,
2010                                           /* Allow the container to
2011                                            * access and create the API
2012                                            * device nodes, so that
2013                                            * PrivateDevices= in the
2014                                            * container can work
2015                                            * fine */
2016                                           "/dev/null", "rwm",
2017                                           "/dev/zero", "rwm",
2018                                           "/dev/full", "rwm",
2019                                           "/dev/random", "rwm",
2020                                           "/dev/urandom", "rwm",
2021                                           "/dev/tty", "rwm",
2022                                           "/dev/net/tun", "rwm",
2023                                           /* Allow the container
2024                                            * access to ptys. However,
2025                                            * do not permit the
2026                                            * container to ever create
2027                                            * these device nodes. */
2028                                           "/dev/pts/ptmx", "rw",
2029                                           "char-pts", "rw");
2030                 if (r < 0)
2031                         return log_error_errno(r, "Failed to add device whitelist: %m");
2032
2033                 STRV_FOREACH(i, arg_property) {
2034                         r = sd_bus_message_open_container(m, 'r', "sv");
2035                         if (r < 0)
2036                                 return bus_log_create_error(r);
2037
2038                         r = bus_append_unit_property_assignment(m, *i);
2039                         if (r < 0)
2040                                 return r;
2041
2042                         r = sd_bus_message_close_container(m);
2043                         if (r < 0)
2044                                 return bus_log_create_error(r);
2045                 }
2046
2047                 r = sd_bus_message_close_container(m);
2048                 if (r < 0)
2049                         return bus_log_create_error(r);
2050
2051                 r = sd_bus_call(bus, m, 0, &error, NULL);
2052         }
2053
2054         if (r < 0) {
2055                 log_error("Failed to register machine: %s", bus_error_message(&error, r));
2056                 return r;
2057         }
2058
2059         return 0;
2060 }
2061
2062 static int terminate_machine(pid_t pid) {
2063         _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
2064         _cleanup_bus_message_unref_ sd_bus_message *reply = NULL;
2065         _cleanup_bus_close_unref_ sd_bus *bus = NULL;
2066         const char *path;
2067         int r;
2068
2069         if (!arg_register)
2070                 return 0;
2071
2072         r = sd_bus_default_system(&bus);
2073         if (r < 0)
2074                 return log_error_errno(r, "Failed to open system bus: %m");
2075
2076         r = sd_bus_call_method(
2077                         bus,
2078                         "org.freedesktop.machine1",
2079                         "/org/freedesktop/machine1",
2080                         "org.freedesktop.machine1.Manager",
2081                         "GetMachineByPID",
2082                         &error,
2083                         &reply,
2084                         "u",
2085                         (uint32_t) pid);
2086         if (r < 0) {
2087                 /* Note that the machine might already have been
2088                  * cleaned up automatically, hence don't consider it a
2089                  * failure if we cannot get the machine object. */
2090                 log_debug("Failed to get machine: %s", bus_error_message(&error, r));
2091                 return 0;
2092         }
2093
2094         r = sd_bus_message_read(reply, "o", &path);
2095         if (r < 0)
2096                 return bus_log_parse_error(r);
2097
2098         r = sd_bus_call_method(
2099                         bus,
2100                         "org.freedesktop.machine1",
2101                         path,
2102                         "org.freedesktop.machine1.Machine",
2103                         "Terminate",
2104                         &error,
2105                         NULL,
2106                         NULL);
2107         if (r < 0) {
2108                 log_debug("Failed to terminate machine: %s", bus_error_message(&error, r));
2109                 return 0;
2110         }
2111
2112         return 0;
2113 }
2114
2115 static int reset_audit_loginuid(void) {
2116         _cleanup_free_ char *p = NULL;
2117         int r;
2118
2119         if (arg_share_system)
2120                 return 0;
2121
2122         r = read_one_line_file("/proc/self/loginuid", &p);
2123         if (r == -ENOENT)
2124                 return 0;
2125         if (r < 0)
2126                 return log_error_errno(r, "Failed to read /proc/self/loginuid: %m");
2127
2128         /* Already reset? */
2129         if (streq(p, "4294967295"))
2130                 return 0;
2131
2132         r = write_string_file("/proc/self/loginuid", "4294967295");
2133         if (r < 0) {
2134                 log_error("Failed to reset audit login UID. This probably means that your kernel is too\n"
2135                           "old and you have audit enabled. Note that the auditing subsystem is known to\n"
2136                           "be incompatible with containers on old kernels. Please make sure to upgrade\n"
2137                           "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
2138                           "using systemd-nspawn. Sleeping for 5s... (%s)\n", strerror(-r));
2139
2140                 sleep(5);
2141         }
2142
2143         return 0;
2144 }
2145
2146 #define HOST_HASH_KEY SD_ID128_MAKE(1a,37,6f,c7,46,ec,45,0b,ad,a3,d5,31,06,60,5d,b1)
2147 #define CONTAINER_HASH_KEY SD_ID128_MAKE(c3,c4,f9,19,b5,57,b2,1c,e6,cf,14,27,03,9c,ee,a2)
2148 #define MACVLAN_HASH_KEY SD_ID128_MAKE(00,13,6d,bc,66,83,44,81,bb,0c,f9,51,1f,24,a6,6f)
2149
2150 static int generate_mac(struct ether_addr *mac, sd_id128_t hash_key, uint64_t idx) {
2151         uint8_t result[8];
2152         size_t l, sz;
2153         uint8_t *v, *i;
2154         int r;
2155
2156         l = strlen(arg_machine);
2157         sz = sizeof(sd_id128_t) + l;
2158         if (idx > 0)
2159                 sz += sizeof(idx);
2160
2161         v = alloca(sz);
2162
2163         /* fetch some persistent data unique to the host */
2164         r = sd_id128_get_machine((sd_id128_t*) v);
2165         if (r < 0)
2166                 return r;
2167
2168         /* combine with some data unique (on this host) to this
2169          * container instance */
2170         i = mempcpy(v + sizeof(sd_id128_t), arg_machine, l);
2171         if (idx > 0) {
2172                 idx = htole64(idx);
2173                 memcpy(i, &idx, sizeof(idx));
2174         }
2175
2176         /* Let's hash the host machine ID plus the container name. We
2177          * use a fixed, but originally randomly created hash key here. */
2178         siphash24(result, v, sz, hash_key.bytes);
2179
2180         assert_cc(ETH_ALEN <= sizeof(result));
2181         memcpy(mac->ether_addr_octet, result, ETH_ALEN);
2182
2183         /* see eth_random_addr in the kernel */
2184         mac->ether_addr_octet[0] &= 0xfe;        /* clear multicast bit */
2185         mac->ether_addr_octet[0] |= 0x02;        /* set local assignment bit (IEEE802) */
2186
2187         return 0;
2188 }
2189
2190 static int setup_veth(pid_t pid, char iface_name[IFNAMSIZ], int *ifi) {
2191         _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
2192         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
2193         struct ether_addr mac_host, mac_container;
2194         int r, i;
2195
2196         if (!arg_private_network)
2197                 return 0;
2198
2199         if (!arg_network_veth)
2200                 return 0;
2201
2202         /* Use two different interface name prefixes depending whether
2203          * we are in bridge mode or not. */
2204         snprintf(iface_name, IFNAMSIZ - 1, "%s-%s",
2205                  arg_network_bridge ? "vb" : "ve", arg_machine);
2206
2207         r = generate_mac(&mac_container, CONTAINER_HASH_KEY, 0);
2208         if (r < 0)
2209                 return log_error_errno(r, "Failed to generate predictable MAC address for container side: %m");
2210
2211         r = generate_mac(&mac_host, HOST_HASH_KEY, 0);
2212         if (r < 0)
2213                 return log_error_errno(r, "Failed to generate predictable MAC address for host side: %m");
2214
2215         r = sd_rtnl_open(&rtnl, 0);
2216         if (r < 0)
2217                 return log_error_errno(r, "Failed to connect to netlink: %m");
2218
2219         r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
2220         if (r < 0)
2221                 return log_error_errno(r, "Failed to allocate netlink message: %m");
2222
2223         r = sd_rtnl_message_append_string(m, IFLA_IFNAME, iface_name);
2224         if (r < 0)
2225                 return log_error_errno(r, "Failed to add netlink interface name: %m");
2226
2227         r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac_host);
2228         if (r < 0)
2229                 return log_error_errno(r, "Failed to add netlink MAC address: %m");
2230
2231         r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
2232         if (r < 0)
2233                 return log_error_errno(r, "Failed to open netlink container: %m");
2234
2235         r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "veth");
2236         if (r < 0)
2237                 return log_error_errno(r, "Failed to open netlink container: %m");
2238
2239         r = sd_rtnl_message_open_container(m, VETH_INFO_PEER);
2240         if (r < 0)
2241                 return log_error_errno(r, "Failed to open netlink container: %m");
2242
2243         r = sd_rtnl_message_append_string(m, IFLA_IFNAME, "host0");
2244         if (r < 0)
2245                 return log_error_errno(r, "Failed to add netlink interface name: %m");
2246
2247         r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac_container);
2248         if (r < 0)
2249                 return log_error_errno(r, "Failed to add netlink MAC address: %m");
2250
2251         r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
2252         if (r < 0)
2253                 return log_error_errno(r, "Failed to add netlink namespace field: %m");
2254
2255         r = sd_rtnl_message_close_container(m);
2256         if (r < 0)
2257                 return log_error_errno(r, "Failed to close netlink container: %m");
2258
2259         r = sd_rtnl_message_close_container(m);
2260         if (r < 0)
2261                 return log_error_errno(r, "Failed to close netlink container: %m");
2262
2263         r = sd_rtnl_message_close_container(m);
2264         if (r < 0)
2265                 return log_error_errno(r, "Failed to close netlink container: %m");
2266
2267         r = sd_rtnl_call(rtnl, m, 0, NULL);
2268         if (r < 0)
2269                 return log_error_errno(r, "Failed to add new veth interfaces: %m");
2270
2271         i = (int) if_nametoindex(iface_name);
2272         if (i <= 0)
2273                 return log_error_errno(errno, "Failed to resolve interface %s: %m", iface_name);
2274
2275         *ifi = i;
2276
2277         return 0;
2278 }
2279
2280 static int setup_bridge(const char veth_name[], int *ifi) {
2281         _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
2282         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
2283         int r, bridge;
2284
2285         if (!arg_private_network)
2286                 return 0;
2287
2288         if (!arg_network_veth)
2289                 return 0;
2290
2291         if (!arg_network_bridge)
2292                 return 0;
2293
2294         bridge = (int) if_nametoindex(arg_network_bridge);
2295         if (bridge <= 0)
2296                 return log_error_errno(errno, "Failed to resolve interface %s: %m", arg_network_bridge);
2297
2298         *ifi = bridge;
2299
2300         r = sd_rtnl_open(&rtnl, 0);
2301         if (r < 0)
2302                 return log_error_errno(r, "Failed to connect to netlink: %m");
2303
2304         r = sd_rtnl_message_new_link(rtnl, &m, RTM_SETLINK, 0);
2305         if (r < 0)
2306                 return log_error_errno(r, "Failed to allocate netlink message: %m");
2307
2308         r = sd_rtnl_message_link_set_flags(m, IFF_UP, IFF_UP);
2309         if (r < 0)
2310                 return log_error_errno(r, "Failed to set IFF_UP flag: %m");
2311
2312         r = sd_rtnl_message_append_string(m, IFLA_IFNAME, veth_name);
2313         if (r < 0)
2314                 return log_error_errno(r, "Failed to add netlink interface name field: %m");
2315
2316         r = sd_rtnl_message_append_u32(m, IFLA_MASTER, bridge);
2317         if (r < 0)
2318                 return log_error_errno(r, "Failed to add netlink master field: %m");
2319
2320         r = sd_rtnl_call(rtnl, m, 0, NULL);
2321         if (r < 0)
2322                 return log_error_errno(r, "Failed to add veth interface to bridge: %m");
2323
2324         return 0;
2325 }
2326
2327 static int parse_interface(struct udev *udev, const char *name) {
2328         _cleanup_udev_device_unref_ struct udev_device *d = NULL;
2329         char ifi_str[2 + DECIMAL_STR_MAX(int)];
2330         int ifi;
2331
2332         ifi = (int) if_nametoindex(name);
2333         if (ifi <= 0)
2334                 return log_error_errno(errno, "Failed to resolve interface %s: %m", name);
2335
2336         sprintf(ifi_str, "n%i", ifi);
2337         d = udev_device_new_from_device_id(udev, ifi_str);
2338         if (!d)
2339                 return log_error_errno(errno, "Failed to get udev device for interface %s: %m", name);
2340
2341         if (udev_device_get_is_initialized(d) <= 0) {
2342                 log_error("Network interface %s is not initialized yet.", name);
2343                 return -EBUSY;
2344         }
2345
2346         return ifi;
2347 }
2348
2349 static int move_network_interfaces(pid_t pid) {
2350         _cleanup_udev_unref_ struct udev *udev = NULL;
2351         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
2352         char **i;
2353         int r;
2354
2355         if (!arg_private_network)
2356                 return 0;
2357
2358         if (strv_isempty(arg_network_interfaces))
2359                 return 0;
2360
2361         r = sd_rtnl_open(&rtnl, 0);
2362         if (r < 0)
2363                 return log_error_errno(r, "Failed to connect to netlink: %m");
2364
2365         udev = udev_new();
2366         if (!udev) {
2367                 log_error("Failed to connect to udev.");
2368                 return -ENOMEM;
2369         }
2370
2371         STRV_FOREACH(i, arg_network_interfaces) {
2372                 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
2373                 int ifi;
2374
2375                 ifi = parse_interface(udev, *i);
2376                 if (ifi < 0)
2377                         return ifi;
2378
2379                 r = sd_rtnl_message_new_link(rtnl, &m, RTM_SETLINK, ifi);
2380                 if (r < 0)
2381                         return log_error_errno(r, "Failed to allocate netlink message: %m");
2382
2383                 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
2384                 if (r < 0)
2385                         return log_error_errno(r, "Failed to append namespace PID to netlink message: %m");
2386
2387                 r = sd_rtnl_call(rtnl, m, 0, NULL);
2388                 if (r < 0)
2389                         return log_error_errno(r, "Failed to move interface %s to namespace: %m", *i);
2390         }
2391
2392         return 0;
2393 }
2394
2395 static int setup_macvlan(pid_t pid) {
2396         _cleanup_udev_unref_ struct udev *udev = NULL;
2397         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
2398         unsigned idx = 0;
2399         char **i;
2400         int r;
2401
2402         if (!arg_private_network)
2403                 return 0;
2404
2405         if (strv_isempty(arg_network_macvlan))
2406                 return 0;
2407
2408         r = sd_rtnl_open(&rtnl, 0);
2409         if (r < 0)
2410                 return log_error_errno(r, "Failed to connect to netlink: %m");
2411
2412         udev = udev_new();
2413         if (!udev) {
2414                 log_error("Failed to connect to udev.");
2415                 return -ENOMEM;
2416         }
2417
2418         STRV_FOREACH(i, arg_network_macvlan) {
2419                 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
2420                 _cleanup_free_ char *n = NULL;
2421                 struct ether_addr mac;
2422                 int ifi;
2423
2424                 ifi = parse_interface(udev, *i);
2425                 if (ifi < 0)
2426                         return ifi;
2427
2428                 r = generate_mac(&mac, MACVLAN_HASH_KEY, idx++);
2429                 if (r < 0)
2430                         return log_error_errno(r, "Failed to create MACVLAN MAC address: %m");
2431
2432                 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
2433                 if (r < 0)
2434                         return log_error_errno(r, "Failed to allocate netlink message: %m");
2435
2436                 r = sd_rtnl_message_append_u32(m, IFLA_LINK, ifi);
2437                 if (r < 0)
2438                         return log_error_errno(r, "Failed to add netlink interface index: %m");
2439
2440                 n = strappend("mv-", *i);
2441                 if (!n)
2442                         return log_oom();
2443
2444                 strshorten(n, IFNAMSIZ-1);
2445
2446                 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, n);
2447                 if (r < 0)
2448                         return log_error_errno(r, "Failed to add netlink interface name: %m");
2449
2450                 r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac);
2451                 if (r < 0)
2452                         return log_error_errno(r, "Failed to add netlink MAC address: %m");
2453
2454                 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
2455                 if (r < 0)
2456                         return log_error_errno(r, "Failed to add netlink namespace field: %m");
2457
2458                 r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
2459                 if (r < 0)
2460                         return log_error_errno(r, "Failed to open netlink container: %m");
2461
2462                 r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "macvlan");
2463                 if (r < 0)
2464                         return log_error_errno(r, "Failed to open netlink container: %m");
2465
2466                 r = sd_rtnl_message_append_u32(m, IFLA_MACVLAN_MODE, MACVLAN_MODE_BRIDGE);
2467                 if (r < 0)
2468                         return log_error_errno(r, "Failed to append macvlan mode: %m");
2469
2470                 r = sd_rtnl_message_close_container(m);
2471                 if (r < 0)
2472                         return log_error_errno(r, "Failed to close netlink container: %m");
2473
2474                 r = sd_rtnl_message_close_container(m);
2475                 if (r < 0)
2476                         return log_error_errno(r, "Failed to close netlink container: %m");
2477
2478                 r = sd_rtnl_call(rtnl, m, 0, NULL);
2479                 if (r < 0)
2480                         return log_error_errno(r, "Failed to add new macvlan interfaces: %m");
2481         }
2482
2483         return 0;
2484 }
2485
2486 static int setup_ipvlan(pid_t pid) {
2487         _cleanup_udev_unref_ struct udev *udev = NULL;
2488         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
2489         char **i;
2490         int r;
2491
2492         if (!arg_private_network)
2493                 return 0;
2494
2495         if (strv_isempty(arg_network_ipvlan))
2496                 return 0;
2497
2498         r = sd_rtnl_open(&rtnl, 0);
2499         if (r < 0)
2500                 return log_error_errno(r, "Failed to connect to netlink: %m");
2501
2502         udev = udev_new();
2503         if (!udev) {
2504                 log_error("Failed to connect to udev.");
2505                 return -ENOMEM;
2506         }
2507
2508         STRV_FOREACH(i, arg_network_ipvlan) {
2509                 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
2510                 _cleanup_free_ char *n = NULL;
2511                 int ifi;
2512
2513                 ifi = parse_interface(udev, *i);
2514                 if (ifi < 0)
2515                         return ifi;
2516
2517                 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
2518                 if (r < 0)
2519                         return log_error_errno(r, "Failed to allocate netlink message: %m");
2520
2521                 r = sd_rtnl_message_append_u32(m, IFLA_LINK, ifi);
2522                 if (r < 0)
2523                         return log_error_errno(r, "Failed to add netlink interface index: %m");
2524
2525                 n = strappend("iv-", *i);
2526                 if (!n)
2527                         return log_oom();
2528
2529                 strshorten(n, IFNAMSIZ-1);
2530
2531                 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, n);
2532                 if (r < 0)
2533                         return log_error_errno(r, "Failed to add netlink interface name: %m");
2534
2535                 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
2536                 if (r < 0)
2537                         return log_error_errno(r, "Failed to add netlink namespace field: %m");
2538
2539                 r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
2540                 if (r < 0)
2541                         return log_error_errno(r, "Failed to open netlink container: %m");
2542
2543                 r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "ipvlan");
2544                 if (r < 0)
2545                         return log_error_errno(r, "Failed to open netlink container: %m");
2546
2547                 r = sd_rtnl_message_append_u16(m, IFLA_IPVLAN_MODE, IPVLAN_MODE_L2);
2548                 if (r < 0)
2549                         return log_error_errno(r, "Failed to add ipvlan mode: %m");
2550
2551                 r = sd_rtnl_message_close_container(m);
2552                 if (r < 0)
2553                         return log_error_errno(r, "Failed to close netlink container: %m");
2554
2555                 r = sd_rtnl_message_close_container(m);
2556                 if (r < 0)
2557                         return log_error_errno(r, "Failed to close netlink container: %m");
2558
2559                 r = sd_rtnl_call(rtnl, m, 0, NULL);
2560                 if (r < 0)
2561                         return log_error_errno(r, "Failed to add new ipvlan interfaces: %m");
2562         }
2563
2564         return 0;
2565 }
2566
/* Builds and loads a default-allow seccomp filter for the container:
 *
 *  - each syscall in the table below fails with EPERM unless the capability
 *    listed next to it is set in arg_retain;
 *  - socket(AF_NETLINK, *, NETLINK_AUDIT) fails with EAFNOSUPPORT.
 *
 * Returns 0 on success (always 0 when built without HAVE_SECCOMP), otherwise
 * the error of the step that failed. */
static int setup_seccomp(void) {

#ifdef HAVE_SECCOMP
        static const struct {
                uint64_t capability;
                int syscall_num;
        } blacklist[] = {
                { CAP_SYS_RAWIO,  SCMP_SYS(iopl)},
                { CAP_SYS_RAWIO,  SCMP_SYS(ioperm)},
                { CAP_SYS_BOOT,   SCMP_SYS(kexec_load)},
                { CAP_SYS_ADMIN,  SCMP_SYS(swapon)},
                { CAP_SYS_ADMIN,  SCMP_SYS(swapoff)},
                { CAP_SYS_ADMIN,  SCMP_SYS(open_by_handle_at)},
                { CAP_SYS_MODULE, SCMP_SYS(init_module)},
                { CAP_SYS_MODULE, SCMP_SYS(finit_module)},
                { CAP_SYS_MODULE, SCMP_SYS(delete_module)},
        };

        scmp_filter_ctx ctx;
        unsigned k;
        int r;

        ctx = seccomp_init(SCMP_ACT_ALLOW);
        if (!ctx)
                return log_oom();

        r = seccomp_add_secondary_archs(ctx);
        if (r < 0) {
                log_error_errno(r, "Failed to add secondary archs to seccomp filter: %m");
                goto finish;
        }

        for (k = 0; k < ELEMENTSOF(blacklist); k++) {

                /* A retained capability keeps its syscalls available */
                if (arg_retain & (1ULL << blacklist[k].capability))
                        continue;

                r = seccomp_rule_add(ctx, SCMP_ACT_ERRNO(EPERM), blacklist[k].syscall_num, 0);

                /* -EFAULT means the syscall is unknown; that is not an
                 * error for us, we simply move on to the next entry. */
                if (r < 0 && r != -EFAULT) {
                        log_error_errno(r, "Failed to block syscall: %m");
                        goto finish;
                }
        }

        /* Audit does not work inside containers and much of the userspace
         * audit hookup fails there. We do not care and simply prevent audit
         * sockets from being created: socket(AF_NETLINK, *, NETLINK_AUDIT)
         * fails with EAFNOSUPPORT, which audit userspace takes as a sign
         * that audit is disabled in the kernel. */
        r = seccomp_rule_add(
                        ctx,
                        SCMP_ACT_ERRNO(EAFNOSUPPORT),
                        SCMP_SYS(socket),
                        2,
                        SCMP_A0(SCMP_CMP_EQ, AF_NETLINK),
                        SCMP_A2(SCMP_CMP_EQ, NETLINK_AUDIT));
        if (r < 0) {
                log_error_errno(r, "Failed to add audit seccomp rule: %m");
                goto finish;
        }

        /* Loading the filter must not switch on NO_NEW_PRIVS */
        r = seccomp_attr_set(ctx, SCMP_FLTATR_CTL_NNP, 0);
        if (r < 0) {
                log_error_errno(r, "Failed to unset NO_NEW_PRIVS: %m");
                goto finish;
        }

        r = seccomp_load(ctx);
        if (r < 0)
                log_error_errno(r, "Failed to install seccomp audit filter: %m");

finish:
        /* The filter context is released on every path, success included */
        seccomp_release(ctx);
        return r;
#else
        return 0;
#endif

}
2653
2654 static int setup_propagate(const char *root) {
2655         const char *p, *q;
2656
2657         (void) mkdir_p("/run/systemd/nspawn/", 0755);
2658         (void) mkdir_p("/run/systemd/nspawn/propagate", 0600);
2659         p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
2660         (void) mkdir_p(p, 0600);
2661
2662         q = strjoina(root, "/run/systemd/nspawn/incoming");
2663         mkdir_parents(q, 0755);
2664         mkdir_p(q, 0600);
2665
2666         if (mount(p, q, NULL, MS_BIND, NULL) < 0)
2667                 return log_error_errno(errno, "Failed to install propagation bind mount.");
2668
2669         if (mount(NULL, q, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY, NULL) < 0)
2670                 return log_error_errno(errno, "Failed to make propagation mount read-only");
2671
2672         return 0;
2673 }
2674
2675 static int setup_image(char **device_path, int *loop_nr) {
2676         struct loop_info64 info = {
2677                 .lo_flags = LO_FLAGS_AUTOCLEAR|LO_FLAGS_PARTSCAN
2678         };
2679         _cleanup_close_ int fd = -1, control = -1, loop = -1;
2680         _cleanup_free_ char* loopdev = NULL;
2681         struct stat st;
2682         int r, nr;
2683
2684         assert(device_path);
2685         assert(loop_nr);
2686         assert(arg_image);
2687
2688         fd = open(arg_image, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
2689         if (fd < 0)
2690                 return log_error_errno(errno, "Failed to open %s: %m", arg_image);
2691
2692         if (fstat(fd, &st) < 0)
2693                 return log_error_errno(errno, "Failed to stat %s: %m", arg_image);
2694
2695         if (S_ISBLK(st.st_mode)) {
2696                 char *p;
2697
2698                 p = strdup(arg_image);
2699                 if (!p)
2700                         return log_oom();
2701
2702                 *device_path = p;
2703
2704                 *loop_nr = -1;
2705
2706                 r = fd;
2707                 fd = -1;
2708
2709                 return r;
2710         }
2711
2712         if (!S_ISREG(st.st_mode)) {
2713                 log_error_errno(errno, "%s is not a regular file or block device: %m", arg_image);
2714                 return -EINVAL;
2715         }
2716
2717         control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
2718         if (control < 0)
2719                 return log_error_errno(errno, "Failed to open /dev/loop-control: %m");
2720
2721         nr = ioctl(control, LOOP_CTL_GET_FREE);
2722         if (nr < 0)
2723                 return log_error_errno(errno, "Failed to allocate loop device: %m");
2724
2725         if (asprintf(&loopdev, "/dev/loop%i", nr) < 0)
2726                 return log_oom();
2727
2728         loop = open(loopdev, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
2729         if (loop < 0)
2730                 return log_error_errno(errno, "Failed to open loop device %s: %m", loopdev);
2731
2732         if (ioctl(loop, LOOP_SET_FD, fd) < 0)
2733                 return log_error_errno(errno, "Failed to set loopback file descriptor on %s: %m", loopdev);
2734
2735         if (arg_read_only)
2736                 info.lo_flags |= LO_FLAGS_READ_ONLY;
2737
2738         if (ioctl(loop, LOOP_SET_STATUS64, &info) < 0)
2739                 return log_error_errno(errno, "Failed to set loopback settings on %s: %m", loopdev);
2740
2741         *device_path = loopdev;
2742         loopdev = NULL;
2743
2744         *loop_nr = nr;
2745
2746         r = loop;
2747         loop = -1;
2748
2749         return r;
2750 }
2751
/* Explanatory text appended to error messages about disk images whose
 * partition layout cannot be used. It has no trailing newline. */
#define PARTITION_TABLE_BLURB                                                                   \
        "Note that the disk image needs to either contain only a single MBR partition of\n"    \
        "type 0x83 that is marked bootable, or a single GPT partition of type "                \
        "0FC63DAF-8483-4772-8E79-3D69D8477DE4 or follow\n"                                      \
        "    http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/\n"     \
        "to be bootable with systemd-nspawn."
2758
/* Inspects the partition table of the block device open on "fd" and picks
 * the partitions to mount for the container.
 *
 * GPT images: partitions flagged GPT_FLAG_NO_AUTO are ignored. For the home,
 * srv, native root and secondary root partition types, the partition with
 * the lowest partition number of each type wins. A generic Linux partition
 * is only used as root if no dedicated root partition exists and it is the
 * only generic one.
 *
 * MBR images: exactly one partition of type 0x83 whose flags equal 0x80
 * (bootable) is accepted as root; more than one is an error.
 *
 * Outputs:
 *   root_device / root_device_rw   always set on success
 *   secondary                      true if the secondary-architecture root was picked
 *   home_device / home_device_rw   only written if a home partition was found
 *   srv_device / srv_device_rw     only written if a srv partition was found
 * The device node strings are newly allocated and owned by the caller.
 *
 * Returns 0 on success and a negative error otherwise; -EOPNOTSUPP when
 * built without HAVE_BLKID. */
static int dissect_image(
                int fd,
                char **root_device, bool *root_device_rw,
                char **home_device, bool *home_device_rw,
                char **srv_device, bool *srv_device_rw,
                bool *secondary) {

#ifdef HAVE_BLKID
        int home_nr = -1, srv_nr = -1;
#ifdef GPT_ROOT_NATIVE
        int root_nr = -1;
#endif
#ifdef GPT_ROOT_SECONDARY
        int secondary_root_nr = -1;
#endif
        _cleanup_free_ char *home = NULL, *root = NULL, *secondary_root = NULL, *srv = NULL, *generic = NULL;
        _cleanup_udev_enumerate_unref_ struct udev_enumerate *e = NULL;
        _cleanup_udev_device_unref_ struct udev_device *d = NULL;
        _cleanup_blkid_free_probe_ blkid_probe b = NULL;
        _cleanup_udev_unref_ struct udev *udev = NULL;
        struct udev_list_entry *first, *item;
        bool home_rw = true, root_rw = true, secondary_root_rw = true, srv_rw = true, generic_rw = true;
        bool is_gpt, is_mbr, multiple_generic = false;
        const char *pttype = NULL;
        blkid_partlist pl;
        struct stat st;
        unsigned i;
        int r;

        assert(fd >= 0);
        assert(root_device);
        assert(home_device);
        assert(srv_device);
        assert(secondary);
        assert(arg_image);

        b = blkid_new_probe();
        if (!b)
                return log_oom();

        /* errno is cleared first so that a failure which does not set it
         * can be told apart and reported as out-of-memory. */
        errno = 0;
        r = blkid_probe_set_device(b, fd, 0, 0);
        if (r != 0) {
                if (errno == 0)
                        return log_oom();

                log_error_errno(errno, "Failed to set device on blkid probe: %m");
                return -errno;
        }

        blkid_probe_enable_partitions(b, 1);
        blkid_probe_set_partitions_flags(b, BLKID_PARTS_ENTRY_DETAILS);

        errno = 0;
        r = blkid_do_safeprobe(b);
        if (r == -2 || r == 1) {
                log_error("Failed to identify any partition table on\n"
                          "    %s\n"
                          PARTITION_TABLE_BLURB, arg_image);
                return -EINVAL;
        } else if (r != 0) {
                if (errno == 0)
                        errno = EIO;
                log_error_errno(errno, "Failed to probe: %m");
                return -errno;
        }

        blkid_probe_lookup_value(b, "PTTYPE", &pttype, NULL);

        is_gpt = streq_ptr(pttype, "gpt");
        is_mbr = streq_ptr(pttype, "dos");

        if (!is_gpt && !is_mbr) {
                log_error("No GPT or MBR partition table discovered on\n"
                          "    %s\n"
                          PARTITION_TABLE_BLURB, arg_image);
                return -EINVAL;
        }

        errno = 0;
        pl = blkid_probe_get_partitions(b);
        if (!pl) {
                if (errno == 0)
                        return log_oom();

                /* Log the cause and return it in one step */
                return log_error_errno(errno, "Failed to list partitions of %s: %m", arg_image);
        }

        udev = udev_new();
        if (!udev)
                return log_oom();

        if (fstat(fd, &st) < 0)
                return log_error_errno(errno, "Failed to stat block device: %m");

        d = udev_device_new_from_devnum(udev, 'b', st.st_rdev);
        if (!d)
                return log_oom();

        /* Wait until the kernel has created a device for every partition
         * blkid found. At most 10 rounds are attempted. */
        for (i = 0;; i++) {
                int n, m;

                if (i >= 10) {
                        log_error("Kernel partitions never appeared.");
                        return -ENXIO;
                }

                e = udev_enumerate_new(udev);
                if (!e)
                        return log_oom();

                r = udev_enumerate_add_match_parent(e, d);
                if (r < 0)
                        return log_oom();

                r = udev_enumerate_scan_devices(e);
                if (r < 0)
                        return log_error_errno(r, "Failed to scan for partition devices of %s: %m", arg_image);

                /* Count the partitions enumerated by the kernel */
                n = 0;
                first = udev_enumerate_get_list_entry(e);
                udev_list_entry_foreach(item, first)
                        n++;

                /* Count the partitions enumerated by blkid. The udev list is
                 * expected to hold one entry more than blkid reports (the
                 * entry for the whole device is skipped further below). */
                m = blkid_partlist_numof_partitions(pl);
                if (n == m + 1)
                        break;
                if (n > m + 1) {
                        log_error("blkid and kernel partition list do not match.");
                        return -EIO;
                }
                if (n < m + 1) {
                        unsigned j;

                        /* The kernel has probed fewer partitions than
                         * blkid? Maybe the kernel prober is still
                         * running or it got EBUSY because udev
                         * already opened the device. Let's reprobe
                         * the device, which is a synchronous call
                         * that waits until probing is complete. */

                        for (j = 0; j < 20; j++) {

                                r = ioctl(fd, BLKRRPART, 0);
                                if (r < 0)
                                        r = -errno;
                                if (r >= 0 || r != -EBUSY)
                                        break;

                                /* If something else has the device
                                 * open, such as an udev rule, the
                                 * ioctl will return EBUSY. Since
                                 * there's no way to wait until it
                                 * isn't busy anymore, let's just wait
                                 * a bit, and try again.
                                 *
                                 * This is really something they
                                 * should fix in the kernel! */

                                usleep(50 * USEC_PER_MSEC);
                        }

                        if (r < 0)
                                return log_error_errno(r, "Failed to reread partition table: %m");
                }

                /* Drop this enumeration before the next round starts */
                e = udev_enumerate_unref(e);
        }

        /* Classify each partition device that udev enumerated */
        first = udev_enumerate_get_list_entry(e);
        udev_list_entry_foreach(item, first) {
                _cleanup_udev_device_unref_ struct udev_device *q;
                const char *node;
                unsigned long long flags;
                blkid_partition pp;
                dev_t qn;
                int nr;

                errno = 0;
                q = udev_device_new_from_syspath(udev, udev_list_entry_get_name(item));
                if (!q) {
                        if (!errno)
                                errno = ENOMEM;

                        log_error_errno(errno, "Failed to get partition device of %s: %m", arg_image);
                        return -errno;
                }

                qn = udev_device_get_devnum(q);
                if (major(qn) == 0)
                        continue;

                /* Skip the whole-disk device itself */
                if (st.st_rdev == qn)
                        continue;

                node = udev_device_get_devnode(q);
                if (!node)
                        continue;

                pp = blkid_partlist_devno_to_partition(pl, qn);
                if (!pp)
                        continue;

                flags = blkid_partition_get_flags(pp);

                nr = blkid_partition_get_partno(pp);
                if (nr < 0)
                        continue;

                if (is_gpt) {
                        sd_id128_t type_id;
                        const char *stype;

                        if (flags & GPT_FLAG_NO_AUTO)
                                continue;

                        stype = blkid_partition_get_type_string(pp);
                        if (!stype)
                                continue;

                        if (sd_id128_from_string(stype, &type_id) < 0)
                                continue;

                        if (sd_id128_equal(type_id, GPT_HOME)) {

                                /* Keep the candidate with the lowest partition number */
                                if (home && nr >= home_nr)
                                        continue;

                                home_nr = nr;
                                home_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&home, node);
                                if (r < 0)
                                        return log_oom();

                        } else if (sd_id128_equal(type_id, GPT_SRV)) {

                                if (srv && nr >= srv_nr)
                                        continue;

                                srv_nr = nr;
                                srv_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&srv, node);
                                if (r < 0)
                                        return log_oom();
                        }
#ifdef GPT_ROOT_NATIVE
                        else if (sd_id128_equal(type_id, GPT_ROOT_NATIVE)) {

                                if (root && nr >= root_nr)
                                        continue;

                                root_nr = nr;
                                root_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&root, node);
                                if (r < 0)
                                        return log_oom();
                        }
#endif
#ifdef GPT_ROOT_SECONDARY
                        else if (sd_id128_equal(type_id, GPT_ROOT_SECONDARY)) {

                                if (secondary_root && nr >= secondary_root_nr)
                                        continue;

                                secondary_root_nr = nr;
                                secondary_root_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&secondary_root, node);
                                if (r < 0)
                                        return log_oom();
                        }
#endif
                        else if (sd_id128_equal(type_id, GPT_LINUX_GENERIC)) {

                                /* Only the first generic partition is remembered;
                                 * any further one just marks the image as ambiguous. */
                                if (generic)
                                        multiple_generic = true;
                                else {
                                        generic_rw = !(flags & GPT_FLAG_READ_ONLY);

                                        r = free_and_strdup(&generic, node);
                                        if (r < 0)
                                                return log_oom();
                                }
                        }

                } else if (is_mbr) {
                        int type;

                        if (flags != 0x80) /* Bootable flag */
                                continue;

                        type = blkid_partition_get_type(pp);
                        if (type != 0x83) /* Linux partition */
                                continue;

                        /* MBR carries no dedicated root type, so a bootable
                         * Linux partition is a "generic" candidate. It must
                         * be stored in "generic" (not "root"), otherwise
                         * the check for multiple candidates can never
                         * trigger and a later partition would silently
                         * replace an earlier one. */
                        if (generic)
                                multiple_generic = true;
                        else {
                                generic_rw = true;

                                r = free_and_strdup(&generic, node);
                                if (r < 0)
                                        return log_oom();
                        }
                }
        }

        /* Pick the root: native root first, then secondary root, then a
         * single generic partition. Setting the local pointer to NULL
         * hands the string to the caller instead of freeing it on return. */
        if (root) {
                *root_device = root;
                root = NULL;

                *root_device_rw = root_rw;
                *secondary = false;
        } else if (secondary_root) {
                *root_device = secondary_root;
                secondary_root = NULL;

                *root_device_rw = secondary_root_rw;
                *secondary = true;
        } else if (generic) {

                /* There were no partitions with precise meanings
                 * around, but we found generic partitions. In this
                 * case, if there's only one, we can go ahead and boot
                 * it, otherwise we bail out, because we really cannot
                 * make any sense of it. */

                if (multiple_generic) {
                        log_error("Identified multiple bootable Linux partitions on\n"
                                  "    %s\n"
                                  PARTITION_TABLE_BLURB, arg_image);
                        return -EINVAL;
                }

                *root_device = generic;
                generic = NULL;

                *root_device_rw = generic_rw;
                *secondary = false;
        } else {
                log_error("Failed to identify root partition in disk image\n"
                          "    %s\n"
                          PARTITION_TABLE_BLURB, arg_image);
                return -EINVAL;
        }

        if (home) {
                *home_device = home;
                home = NULL;

                *home_device_rw = home_rw;
        }

        if (srv) {
                *srv_device = srv;
                srv = NULL;

                *srv_device_rw = srv_rw;
        }

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -EOPNOTSUPP;
#endif
}
3131
/* Probes the file system type of the block device "what" with libblkid and
 * mounts it with MS_NODEV at "where" (with "directory" appended when it is
 * non-NULL). The mount is read-only when "rw" is false or when the global
 * arg_read_only is set. Devices whose type is "crypto_LUKS" are refused.
 *
 * Returns 0 on success and a negative errno-style value on failure. Without
 * HAVE_BLKID the function always fails with -EOPNOTSUPP. */
static int mount_device(const char *what, const char *where, const char *directory, bool rw) {
#ifdef HAVE_BLKID
        _cleanup_blkid_free_probe_ blkid_probe b = NULL;
        const char *fstype, *p;
        int r;

        assert(what);
        assert(where);

        /* A global read-only request overrides the per-partition flag. */
        if (arg_read_only)
                rw = false;

        if (directory)
                p = strjoina(where, directory);
        else
                p = where;

        /* Clear errno first so that a NULL return with errno still 0 can be
         * reported as an allocation failure. */
        errno = 0;
        b = blkid_new_probe_from_filename(what);
        if (!b) {
                if (errno == 0)
                        return log_oom();
                log_error_errno(errno, "Failed to allocate prober for %s: %m", what);
                return -errno;
        }

        blkid_probe_enable_superblocks(b, 1);
        blkid_probe_set_superblocks_flags(b, BLKID_SUBLKS_TYPE);

        errno = 0;
        r = blkid_do_safeprobe(b);
        if (r == -2 || r == 1) {
                /* libblkid: 1 means nothing was detected, -2 means the
                 * result was ambivalent. Neither is an I/O error, there is
                 * simply no unambiguous file system type to use. */
                log_error("Cannot determine file system type of %s", what);
                return -EINVAL;
        } else if (r != 0) {
                /* Genuine probing error (-1); errno is not guaranteed to be
                 * set, so fall back to EIO. */
                if (errno == 0)
                        errno = EIO;
                log_error_errno(errno, "Failed to probe %s: %m", what);
                return -errno;
        }

        errno = 0;
        if (blkid_probe_lookup_value(b, "TYPE", &fstype, NULL) < 0) {
                if (errno == 0)
                        errno = EINVAL;
                log_error("Failed to determine file system type of %s", what);
                return -errno;
        }

        if (streq(fstype, "crypto_LUKS")) {
                log_error("nspawn currently does not support LUKS disk images.");
                return -EOPNOTSUPP;
        }

        if (mount(what, p, fstype, MS_NODEV|(rw ? 0 : MS_RDONLY), NULL) < 0)
                return log_error_errno(errno, "Failed to mount %s: %m", what);

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -EOPNOTSUPP;
#endif
}
3195
/* Mounts the dissected partitions below "where": the root device directly on
 * "where", the home device on "<where>/home" and the server data device on
 * "<where>/srv". Any device passed as NULL is skipped. The *_rw flags are
 * forwarded to mount_device() as its read-write request.
 *
 * Returns 0 on success, or the negative error of the first failing mount. */
static int mount_devices(
                const char *where,
                const char *root_device, bool root_device_rw,
                const char *home_device, bool home_device_rw,
                const char *srv_device, bool srv_device_rw) {
        int r;

        assert(where);

        /* Use the caller-supplied base directory rather than reaching for
         * the arg_directory global, so the parameter is actually honoured. */

        if (root_device) {
                r = mount_device(root_device, where, NULL, root_device_rw);
                if (r < 0)
                        return log_error_errno(r, "Failed to mount root directory: %m");
        }

        if (home_device) {
                r = mount_device(home_device, where, "/home", home_device_rw);
                if (r < 0)
                        return log_error_errno(r, "Failed to mount home directory: %m");
        }

        if (srv_device) {
                r = mount_device(srv_device, where, "/srv", srv_device_rw);
                if (r < 0)
                        return log_error_errno(r, "Failed to mount server data directory: %m");
        }

        return 0;
}
3225
3226 static void loop_remove(int nr, int *image_fd) {
3227         _cleanup_close_ int control = -1;
3228         int r;
3229
3230         if (nr < 0)
3231                 return;
3232
3233         if (image_fd && *image_fd >= 0) {
3234                 r = ioctl(*image_fd, LOOP_CLR_FD);
3235                 if (r < 0)
3236                         log_debug_errno(errno, "Failed to close loop image: %m");
3237                 *image_fd = safe_close(*image_fd);
3238         }
3239
3240         control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
3241         if (control < 0) {
3242                 log_warning_errno(errno, "Failed to open /dev/loop-control: %m");
3243                 return;
3244         }
3245
3246         r = ioctl(control, LOOP_CTL_REMOVE, nr);
3247         if (r < 0)
3248                 log_debug_errno(errno, "Failed to remove loop %d: %m", nr);
3249 }
3250
3251 static int spawn_getent(const char *database, const char *key, pid_t *rpid) {
3252         int pipe_fds[2];
3253         pid_t pid;
3254
3255         assert(database);
3256         assert(key);
3257         assert(rpid);
3258
3259         if (pipe2(pipe_fds, O_CLOEXEC) < 0)
3260                 return log_error_errno(errno, "Failed to allocate pipe: %m");
3261
3262         pid = fork();
3263         if (pid < 0)
3264                 return log_error_errno(errno, "Failed to fork getent child: %m");
3265         else if (pid == 0) {
3266                 int nullfd;
3267                 char *empty_env = NULL;
3268
3269                 if (dup3(pipe_fds[1], STDOUT_FILENO, 0) < 0)
3270                         _exit(EXIT_FAILURE);
3271
3272                 if (pipe_fds[0] > 2)
3273                         safe_close(pipe_fds[0]);
3274                 if (pipe_fds[1] > 2)
3275                         safe_close(pipe_fds[1]);
3276
3277                 nullfd = open("/dev/null", O_RDWR);
3278                 if (nullfd < 0)
3279                         _exit(EXIT_FAILURE);
3280
3281                 if (dup3(nullfd, STDIN_FILENO, 0) < 0)
3282                         _exit(EXIT_FAILURE);
3283
3284                 if (dup3(nullfd, STDERR_FILENO, 0) < 0)
3285                         _exit(EXIT_FAILURE);
3286
3287                 if (nullfd > 2)
3288                         safe_close(nullfd);
3289
3290                 reset_all_signal_handlers();
3291                 close_all_fds(NULL, 0);
3292
3293                 execle("/usr/bin/getent", "getent", database, key, NULL, &empty_env);
3294                 execle("/bin/getent", "getent", database, key, NULL, &empty_env);
3295                 _exit(EXIT_FAILURE);
3296         }
3297
3298         pipe_fds[1] = safe_close(pipe_fds[1]);
3299
3300         *rpid = pid;
3301
3302         return pipe_fds[0];
3303 }
3304
3305 static int change_uid_gid(char **_home) {
3306         char line[LINE_MAX], *x, *u, *g, *h;
3307         const char *word, *state;
3308         _cleanup_free_ uid_t *uids = NULL;
3309         _cleanup_free_ char *home = NULL;
3310         _cleanup_fclose_ FILE *f = NULL;
3311         _cleanup_close_ int fd = -1;
3312         unsigned n_uids = 0;
3313         size_t sz = 0, l;
3314         uid_t uid;
3315         gid_t gid;
3316         pid_t pid;
3317         int r;
3318
3319         assert(_home);
3320
3321         if (!arg_user || streq(arg_user, "root") || streq(arg_user, "0")) {
3322                 /* Reset everything fully to 0, just in case */
3323
3324                 if (setgroups(0, NULL) < 0)
3325                         return log_error_errno(errno, "setgroups() failed: %m");
3326
3327                 if (setresgid(0, 0, 0) < 0)
3328                         return log_error_errno(errno, "setregid() failed: %m");
3329
3330                 if (setresuid(0, 0, 0) < 0)
3331                         return log_error_errno(errno, "setreuid() failed: %m");
3332
3333                 *_home = NULL;
3334                 return 0;
3335         }
3336
3337         /* First, get user credentials */
3338         fd = spawn_getent("passwd", arg_user, &pid);
3339         if (fd < 0)
3340                 return fd;
3341
3342         f = fdopen(fd, "r");
3343         if (!f)
3344                 return log_oom();
3345         fd = -1;
3346
3347         if (!fgets(line, sizeof(line), f)) {
3348
3349                 if (!ferror(f)) {
3350                         log_error("Failed to resolve user %s.", arg_user);
3351                         return -ESRCH;
3352                 }
3353
3354                 log_error_errno(errno, "Failed to read from getent: %m");
3355                 return -errno;
3356         }
3357
3358         truncate_nl(line);
3359
3360         wait_for_terminate_and_warn("getent passwd", pid, true);
3361
3362         x = strchr(line, ':');
3363         if (!x) {
3364                 log_error("/etc/passwd entry has invalid user field.");
3365                 return -EIO;
3366         }
3367
3368         u = strchr(x+1, ':');
3369         if (!u) {
3370                 log_error("/etc/passwd entry has invalid password field.");
3371                 return -EIO;
3372         }
3373
3374         u++;
3375         g = strchr(u, ':');
3376         if (!g) {
3377                 log_error("/etc/passwd entry has invalid UID field.");
3378                 return -EIO;
3379         }
3380
3381         *g = 0;
3382         g++;
3383         x = strchr(g, ':');
3384         if (!x) {
3385                 log_error("/etc/passwd entry has invalid GID field.");
3386                 return -EIO;
3387         }
3388
3389         *x = 0;
3390         h = strchr(x+1, ':');
3391         if (!h) {
3392                 log_error("/etc/passwd entry has invalid GECOS field.");
3393                 return -EIO;
3394         }
3395
3396         h++;
3397         x = strchr(h, ':');
3398         if (!x) {
3399                 log_error("/etc/passwd entry has invalid home directory field.");
3400                 return -EIO;
3401         }
3402
3403         *x = 0;
3404
3405         r = parse_uid(u, &uid);
3406         if (r < 0) {
3407                 log_error("Failed to parse UID of user.");
3408                 return -EIO;
3409         }
3410
3411         r = parse_gid(g, &gid);
3412         if (r < 0) {
3413                 log_error("Failed to parse GID of user.");
3414                 return -EIO;
3415         }
3416
3417         home = strdup(h);
3418         if (!home)
3419                 return log_oom();
3420
3421         /* Second, get group memberships */
3422         fd = spawn_getent("initgroups", arg_user, &pid);
3423         if (fd < 0)
3424                 return fd;
3425
3426         fclose(f);
3427         f = fdopen(fd, "r");
3428         if (!f)
3429                 return log_oom();
3430         fd = -1;
3431
3432         if (!fgets(line, sizeof(line), f)) {
3433                 if (!ferror(f)) {
3434                         log_error("Failed to resolve user %s.", arg_user);
3435                         return -ESRCH;
3436                 }
3437
3438                 log_error_errno(errno, "Failed to read from getent: %m");
3439                 return -errno;
3440         }
3441
3442         truncate_nl(line);
3443
3444         wait_for_terminate_and_warn("getent initgroups", pid, true);
3445
3446         /* Skip over the username and subsequent separator whitespace */
3447         x = line;
3448         x += strcspn(x, WHITESPACE);
3449         x += strspn(x, WHITESPACE);
3450
3451         FOREACH_WORD(word, l, x, state) {
3452                 char c[l+1];
3453
3454                 memcpy(c, word, l);
3455                 c[l] = 0;
3456
3457                 if (!GREEDY_REALLOC(uids, sz, n_uids+1))
3458                         return log_oom();
3459
3460                 r = parse_uid(c, &uids[n_uids++]);
3461                 if (r < 0) {
3462                         log_error("Failed to parse group data from getent.");
3463                         return -EIO;
3464                 }
3465         }
3466
3467         r = mkdir_parents(home, 0775);
3468         if (r < 0)
3469                 return log_error_errno(r, "Failed to make home root directory: %m");
3470
3471         r = mkdir_safe(home, 0755, uid, gid);
3472         if (r < 0 && r != -EEXIST)
3473                 return log_error_errno(r, "Failed to make home directory: %m");
3474
3475         fchown(STDIN_FILENO, uid, gid);
3476         fchown(STDOUT_FILENO, uid, gid);
3477         fchown(STDERR_FILENO, uid, gid);
3478
3479         if (setgroups(n_uids, uids) < 0)
3480                 return log_error_errno(errno, "Failed to set auxiliary groups: %m");
3481
3482         if (setresgid(gid, gid, gid) < 0)
3483                 return log_error_errno(errno, "setregid() failed: %m");
3484
3485         if (setresuid(uid, uid, uid) < 0)
3486                 return log_error_errno(errno, "setreuid() failed: %m");
3487
3488         if (_home) {
3489                 *_home = home;
3490                 home = NULL;
3491         }
3492
3493         return 0;
3494 }
3495
3496 /*
3497  * Return values:
3498  * < 0 : wait_for_terminate() failed to get the state of the
3499  *       container, the container was terminated by a signal, or
3500  *       failed for an unknown reason.  No change is made to the
3501  *       container argument.
3502  * > 0 : The program executed in the container terminated with an
3503  *       error.  The exit code of the program executed in the
3504  *       container is returned.  The container argument has been set
3505  *       to CONTAINER_TERMINATED.
3506  *   0 : The container is being rebooted, has been shut down or exited
3507  *       successfully.  The container argument has been set to either
3508  *       CONTAINER_TERMINATED or CONTAINER_REBOOTED.
3509  *
3510  * That is, success is indicated by a return value of zero, and an
3511  * error is indicated by a non-zero value.
3512  */
3513 static int wait_for_container(pid_t pid, ContainerStatus *container) {
3514         siginfo_t status;
3515         int r;
3516
3517         r = wait_for_terminate(pid, &status);
3518         if (r < 0)
3519                 return log_warning_errno(r, "Failed to wait for container: %m");
3520
3521         switch (status.si_code) {
3522
3523         case CLD_EXITED:
3524                 if (status.si_status == 0) {
3525                         log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s exited successfully.", arg_machine);
3526
3527                 } else
3528                         log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s failed with error code %i.", arg_machine, status.si_status);
3529
3530                 *container = CONTAINER_TERMINATED;
3531                 return status.si_status;
3532
3533         case CLD_KILLED:
3534                 if (status.si_status == SIGINT) {
3535
3536                         log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s has been shut down.", arg_machine);
3537                         *container = CONTAINER_TERMINATED;
3538                         return 0;
3539
3540                 } else if (status.si_status == SIGHUP) {
3541
3542                         log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s is being rebooted.", arg_machine);
3543                         *container = CONTAINER_REBOOTED;
3544                         return 0;
3545                 }
3546
3547                 /* CLD_KILLED fallthrough */
3548
3549         case CLD_DUMPED:
3550                 log_error("Container %s terminated by signal %s.", arg_machine, signal_to_string(status.si_status));
3551                 return -EIO;
3552
3553         default:
3554                 log_error("Container %s failed due to unknown reason.", arg_machine);
3555                 return -EIO;
3556         }
3557
3558         return r;
3559 }
3560
/* Signal handler that intentionally does nothing; main() installs it for
 * SIGCHLD. */
static void nop_handler(int sig) {
        /* nothing to do */
}
3562
3563 static int on_orderly_shutdown(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) {
3564         pid_t pid;
3565
3566         pid = PTR_TO_UINT32(userdata);
3567         if (pid > 0) {
3568                 if (kill(pid, arg_kill_signal) >= 0) {
3569                         log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
3570                         sd_event_source_set_userdata(s, NULL);
3571                         return 0;
3572                 }
3573         }
3574
3575         sd_event_exit(sd_event_source_get_event(s), 0);
3576         return 0;
3577 }
3578
3579 static int determine_names(void) {
3580         int r;
3581
3582         if (!arg_image && !arg_directory) {
3583                 if (arg_machine) {
3584                         _cleanup_(image_unrefp) Image *i = NULL;
3585
3586                         r = image_find(arg_machine, &i);
3587                         if (r < 0)
3588                                 return log_error_errno(r, "Failed to find image for machine '%s': %m", arg_machine);
3589                         else if (r == 0) {
3590                                 log_error("No image for machine '%s': %m", arg_machine);
3591                                 return -ENOENT;
3592                         }
3593
3594                         if (i->type == IMAGE_RAW)
3595                                 r = set_sanitized_path(&arg_image, i->path);
3596                         else
3597                                 r = set_sanitized_path(&arg_directory, i->path);
3598                         if (r < 0)
3599                                 return log_error_errno(r, "Invalid image directory: %m");
3600
3601                         arg_read_only = arg_read_only || i->read_only;
3602                 } else
3603                         arg_directory = get_current_dir_name();
3604
3605                 if (!arg_directory && !arg_machine) {
3606                         log_error("Failed to determine path, please use -D or -i.");
3607                         return -EINVAL;
3608                 }
3609         }
3610
3611         if (!arg_machine) {
3612                 if (arg_directory && path_equal(arg_directory, "/"))
3613                         arg_machine = gethostname_malloc();
3614                 else
3615                         arg_machine = strdup(basename(arg_image ?: arg_directory));
3616
3617                 if (!arg_machine)
3618                         return log_oom();
3619
3620                 hostname_cleanup(arg_machine, false);
3621                 if (!machine_name_is_valid(arg_machine)) {
3622                         log_error("Failed to determine machine name automatically, please use -M.");
3623                         return -EINVAL;
3624                 }
3625
3626                 if (arg_ephemeral) {
3627                         char *b;
3628
3629                         /* Add a random suffix when this is an
3630                          * ephemeral machine, so that we can run many
3631                          * instances at once without manually having
3632                          * to specify -M each time. */
3633
3634                         if (asprintf(&b, "%s-%016" PRIx64, arg_machine, random_u64()) < 0)
3635                                 return log_oom();
3636
3637                         free(arg_machine);
3638                         arg_machine = b;
3639                 }
3640         }
3641
3642         return 0;
3643 }
3644
3645 static int determine_uid_shift(void) {
3646         int r;
3647
3648         if (!arg_userns)
3649                 return 0;
3650
3651         if (arg_uid_shift == UID_INVALID) {
3652                 struct stat st;
3653
3654                 r = stat(arg_directory, &st);
3655                 if (r < 0)
3656                         return log_error_errno(errno, "Failed to determine UID base of %s: %m", arg_directory);
3657
3658                 arg_uid_shift = st.st_uid & UINT32_C(0xffff0000);
3659
3660                 if (arg_uid_shift != (st.st_gid & UINT32_C(0xffff0000))) {
3661                         log_error("UID and GID base of %s don't match.", arg_directory);
3662                         return -EINVAL;
3663                 }
3664
3665                 arg_uid_range = UINT32_C(0x10000);
3666         }
3667
3668         if (arg_uid_shift > (uid_t) -1 - arg_uid_range) {
3669                 log_error("UID base too high for UID range.");
3670                 return -EINVAL;
3671         }
3672
3673         log_info("Using user namespaces with base " UID_FMT " and range " UID_FMT ".", arg_uid_shift, arg_uid_range);
3674         return 0;
3675 }
3676
3677 int main(int argc, char *argv[]) {
3678
3679         _cleanup_free_ char *device_path = NULL, *root_device = NULL, *home_device = NULL, *srv_device = NULL, *console = NULL;
3680         bool root_device_rw = true, home_device_rw = true, srv_device_rw = true;
3681         _cleanup_close_ int master = -1, image_fd = -1;
3682         _cleanup_fdset_free_ FDSet *fds = NULL;
3683         int r, n_fd_passed, loop_nr = -1;
3684         char veth_name[IFNAMSIZ];
3685         bool secondary = false, remove_subvol = false;
3686         sigset_t mask, mask_chld;
3687         pid_t pid = 0;
3688         int ret = EXIT_SUCCESS;
3689         union in_addr_union exposed = {};
3690         _cleanup_release_lock_file_ LockFile tree_global_lock = LOCK_FILE_INIT, tree_local_lock = LOCK_FILE_INIT;
3691         bool interactive;
3692
3693         log_parse_environment();
3694         log_open();
3695
3696         r = parse_argv(argc, argv);
3697         if (r <= 0)
3698                 goto finish;
3699
3700         r = determine_names();
3701         if (r < 0)
3702                 goto finish;
3703
3704         if (geteuid() != 0) {
3705                 log_error("Need to be root.");
3706                 r = -EPERM;
3707                 goto finish;
3708         }
3709
3710         if (sd_booted() <= 0) {
3711                 log_error("Not running on a systemd system.");
3712                 r = -EINVAL;
3713                 goto finish;
3714         }
3715
3716         log_close();
3717         n_fd_passed = sd_listen_fds(false);
3718         if (n_fd_passed > 0) {
3719                 r = fdset_new_listen_fds(&fds, false);
3720                 if (r < 0) {
3721                         log_error_errno(r, "Failed to collect file descriptors: %m");
3722                         goto finish;
3723                 }
3724         }
3725         fdset_close_others(fds);
3726         log_open();
3727
3728         if (arg_directory) {
3729                 assert(!arg_image);
3730
3731                 if (path_equal(arg_directory, "/") && !arg_ephemeral) {
3732                         log_error("Spawning container on root directory is not supported. Consider using --ephemeral.");
3733                         r = -EINVAL;
3734                         goto finish;
3735                 }
3736
3737                 if (arg_ephemeral) {
3738                         _cleanup_free_ char *np = NULL;
3739
3740                         /* If the specified path is a mount point we
3741                          * generate the new snapshot immediately
3742                          * inside it under a random name. However if
3743                          * the specified is not a mount point we
3744                          * create the new snapshot in the parent
3745                          * directory, just next to it. */
3746                         r = path_is_mount_point(arg_directory, false);
3747                         if (r < 0) {
3748                                 log_error_errno(r, "Failed to determine whether directory %s is mount point: %m", arg_directory);
3749                                 goto finish;
3750                         }
3751                         if (r > 0)
3752                                 r = tempfn_random_child(arg_directory, &np);
3753                         else
3754                                 r = tempfn_random(arg_directory, &np);
3755                         if (r < 0) {
3756                                 log_error_errno(r, "Failed to generate name for snapshot: %m");
3757                                 goto finish;
3758                         }
3759
3760                         r = image_path_lock(np, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3761                         if (r < 0) {
3762                                 log_error_errno(r, "Failed to lock %s: %m", np);
3763                                 goto finish;
3764                         }
3765
3766                         r = btrfs_subvol_snapshot(arg_directory, np, arg_read_only, true);
3767                         if (r < 0) {
3768                                 log_error_errno(r, "Failed to create snapshot %s from %s: %m", np, arg_directory);
3769                                 goto finish;
3770                         }
3771
3772                         free(arg_directory);
3773                         arg_directory = np;
3774                         np = NULL;
3775
3776                         remove_subvol = true;
3777
3778                 } else {
3779                         r = image_path_lock(arg_directory, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3780                         if (r == -EBUSY) {
3781                                 log_error_errno(r, "Directory tree %s is currently busy.", arg_directory);
3782                                 goto finish;
3783                         }
3784                         if (r < 0) {
3785                                 log_error_errno(r, "Failed to lock %s: %m", arg_directory);
3786                                 return r;
3787                         }
3788
3789                         if (arg_template) {
3790                                 r = btrfs_subvol_snapshot(arg_template, arg_directory, arg_read_only, true);
3791                                 if (r == -EEXIST) {
3792                                         if (!arg_quiet)
3793                                                 log_info("Directory %s already exists, not populating from template %s.", arg_directory, arg_template);
3794                                 } else if (r < 0) {
3795                                         log_error_errno(r, "Couldn't create snapshot %s from %s: %m", arg_directory, arg_template);
3796                                         goto finish;
3797                                 } else {
3798                                         if (!arg_quiet)
3799                                                 log_info("Populated %s from template %s.", arg_directory, arg_template);
3800                                 }
3801                         }
3802                 }
3803
3804                 if (arg_boot) {
3805                         if (path_is_os_tree(arg_directory) <= 0) {
3806                                 log_error("Directory %s doesn't look like an OS root directory (os-release file is missing). Refusing.", arg_directory);
3807                                 r = -EINVAL;
3808                                 goto finish;
3809                         }
3810                 } else {
3811                         const char *p;
3812
3813                         p = strjoina(arg_directory,
3814                                        argc > optind && path_is_absolute(argv[optind]) ? argv[optind] : "/usr/bin/");
3815                         if (access(p, F_OK) < 0) {
3816                                 log_error("Directory %s lacks the binary to execute or doesn't look like a binary tree. Refusing.", arg_directory);
3817                                 r = -EINVAL;
3818                                 goto finish;
3819                         }
3820                 }
3821
3822         } else {
3823                 char template[] = "/tmp/nspawn-root-XXXXXX";
3824
3825                 assert(arg_image);
3826                 assert(!arg_template);
3827
3828                 r = image_path_lock(arg_image, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3829                 if (r == -EBUSY) {
3830                         r = log_error_errno(r, "Disk image %s is currently busy.", arg_image);
3831                         goto finish;
3832                 }
3833                 if (r < 0) {
3834                         r = log_error_errno(r, "Failed to create image lock: %m");
3835                         goto finish;
3836                 }
3837
3838                 if (!mkdtemp(template)) {
3839                         log_error_errno(errno, "Failed to create temporary directory: %m");
3840                         r = -errno;
3841                         goto finish;
3842                 }
3843
3844                 arg_directory = strdup(template);
3845                 if (!arg_directory) {
3846                         r = log_oom();
3847                         goto finish;
3848                 }
3849
3850                 image_fd = setup_image(&device_path, &loop_nr);
3851                 if (image_fd < 0) {
3852                         r = image_fd;
3853                         goto finish;
3854                 }
3855
3856                 r = dissect_image(image_fd,
3857                                   &root_device, &root_device_rw,
3858                                   &home_device, &home_device_rw,
3859                                   &srv_device, &srv_device_rw,
3860                                   &secondary);
3861                 if (r < 0)
3862                         goto finish;
3863         }
3864
3865         r = determine_uid_shift();
3866         if (r < 0)
3867                 goto finish;
3868
3869         interactive = isatty(STDIN_FILENO) > 0 && isatty(STDOUT_FILENO) > 0;
3870
3871         master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
3872         if (master < 0) {
3873                 r = log_error_errno(errno, "Failed to acquire pseudo tty: %m");
3874                 goto finish;
3875         }
3876
3877         r = ptsname_malloc(master, &console);
3878         if (r < 0) {
3879                 r = log_error_errno(r, "Failed to determine tty name: %m");
3880                 goto finish;
3881         }
3882
3883         if (unlockpt(master) < 0) {
3884                 r = log_error_errno(errno, "Failed to unlock tty: %m");
3885                 goto finish;
3886         }
3887
3888         if (!arg_quiet)
3889                 log_info("Spawning container %s on %s.\nPress ^] three times within 1s to kill container.",
3890                          arg_machine, arg_image ?: arg_directory);
3891
3892         assert_se(sigemptyset(&mask) == 0);
3893         sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
3894         assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
3895
3896         assert_se(sigemptyset(&mask_chld) == 0);
3897         assert_se(sigaddset(&mask_chld, SIGCHLD) == 0);
3898
3899         for (;;) {
3900                 _cleanup_close_pair_ int kmsg_socket_pair[2] = { -1, -1 }, rtnl_socket_pair[2] = { -1, -1 };
3901                 ContainerStatus container_status;
3902                 _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
3903                 struct sigaction sa = {
3904                         .sa_handler = nop_handler,
3905                         .sa_flags = SA_NOCLDSTOP,
3906                 };
3907
3908                 r = barrier_create(&barrier);
3909                 if (r < 0) {
3910                         log_error_errno(r, "Cannot initialize IPC barrier: %m");
3911                         goto finish;
3912                 }
3913
3914                 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
3915                         r = log_error_errno(errno, "Failed to create kmsg socket pair: %m");
3916                         goto finish;
3917                 }
3918
3919                 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, rtnl_socket_pair) < 0) {
3920                         r = log_error_errno(errno, "Failed to create rtnl socket pair: %m");
3921                         goto finish;
3922                 }
3923
3924                 /* Child can be killed before execv(), so handle SIGCHLD
3925                  * in order to interrupt parent's blocking calls and
3926                  * give it a chance to call wait() and terminate. */
3927                 r = sigprocmask(SIG_UNBLOCK, &mask_chld, NULL);
3928                 if (r < 0) {
3929                         r = log_error_errno(errno, "Failed to change the signal mask: %m");
3930                         goto finish;
3931                 }
3932
3933                 r = sigaction(SIGCHLD, &sa, NULL);
3934                 if (r < 0) {
3935                         r = log_error_errno(errno, "Failed to install SIGCHLD handler: %m");
3936                         goto finish;
3937                 }
3938
3939                 pid = raw_clone(SIGCHLD|CLONE_NEWNS|
3940                                 (arg_share_system ? 0 : CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS)|
3941                                 (arg_private_network ? CLONE_NEWNET : 0), NULL);
3942                 if (pid < 0) {
3943                         if (errno == EINVAL)
3944                                 r = log_error_errno(errno, "clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
3945                         else
3946                                 r = log_error_errno(errno, "clone() failed: %m");
3947
3948                         goto finish;
3949                 }
3950
3951                 if (pid == 0) {
3952                         /* child */
3953                         _cleanup_free_ char *home = NULL;
3954                         unsigned n_env = 2;
3955                         const char *envp[] = {
3956                                 "PATH=" DEFAULT_PATH_SPLIT_USR,
3957                                 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
3958                                 NULL, /* TERM */
3959                                 NULL, /* HOME */
3960                                 NULL, /* USER */
3961                                 NULL, /* LOGNAME */
3962                                 NULL, /* container_uuid */
3963                                 NULL, /* LISTEN_FDS */
3964                                 NULL, /* LISTEN_PID */
3965                                 NULL
3966                         };
3967                         char **env_use;
3968
3969                         barrier_set_role(&barrier, BARRIER_CHILD);
3970
3971                         envp[n_env] = strv_find_prefix(environ, "TERM=");
3972                         if (envp[n_env])
3973                                 n_env ++;
3974
3975                         master = safe_close(master);
3976
3977                         kmsg_socket_pair[0] = safe_close(kmsg_socket_pair[0]);
3978                         rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
3979
3980                         reset_all_signal_handlers();
3981                         reset_signal_mask();
3982
3983                         if (interactive) {
3984                                 close_nointr(STDIN_FILENO);
3985                                 close_nointr(STDOUT_FILENO);
3986                                 close_nointr(STDERR_FILENO);
3987
3988                                 r = open_terminal(console, O_RDWR);
3989                                 if (r != STDIN_FILENO) {
3990                                         if (r >= 0) {
3991                                                 safe_close(r);
3992                                                 r = -EINVAL;
3993                                         }
3994
3995                                         log_error_errno(r, "Failed to open console: %m");
3996                                         _exit(EXIT_FAILURE);
3997                                 }
3998
3999                                 if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
4000                                     dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO) {
4001                                         log_error_errno(errno, "Failed to duplicate console: %m");
4002                                         _exit(EXIT_FAILURE);
4003                                 }
4004                         }
4005
4006                         if (setsid() < 0) {
4007                                 log_error_errno(errno, "setsid() failed: %m");
4008                                 _exit(EXIT_FAILURE);
4009                         }
4010
4011                         if (reset_audit_loginuid() < 0)
4012                                 _exit(EXIT_FAILURE);
4013
4014                         if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
4015                                 log_error_errno(errno, "PR_SET_PDEATHSIG failed: %m");
4016                                 _exit(EXIT_FAILURE);
4017                         }
4018
4019                         if (arg_private_network)
4020                                 loopback_setup();
4021
4022                         /* Mark everything as slave, so that we still
4023                          * receive mounts from the real root, but don't
4024                          * propagate mounts to the real root. */
4025                         if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
4026                                 log_error_errno(errno, "MS_SLAVE|MS_REC failed: %m");
4027                                 _exit(EXIT_FAILURE);
4028                         }
4029
4030                         if (mount_devices(arg_directory,
4031                                           root_device, root_device_rw,
4032                                           home_device, home_device_rw,
4033                                           srv_device, srv_device_rw) < 0)
4034                                 _exit(EXIT_FAILURE);
4035
4036                         /* Turn directory into bind mount */
4037                         if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REC, NULL) < 0) {
4038                                 log_error_errno(errno, "Failed to make bind mount: %m");
4039                                 _exit(EXIT_FAILURE);
4040                         }
4041
4042                         r = setup_volatile(arg_directory);
4043                         if (r < 0)
4044                                 _exit(EXIT_FAILURE);
4045
4046                         if (setup_volatile_state(arg_directory) < 0)
4047                                 _exit(EXIT_FAILURE);
4048
4049                         r = base_filesystem_create(arg_directory);
4050                         if (r < 0)
4051                                 _exit(EXIT_FAILURE);
4052
4053                         if (arg_read_only) {
4054                                 r = bind_remount_recursive(arg_directory, true);
4055                                 if (r < 0) {
4056                                         log_error_errno(r, "Failed to make tree read-only: %m");
4057                                         _exit(EXIT_FAILURE);
4058                                 }
4059                         }
4060
4061                         if (mount_all(arg_directory) < 0)
4062                                 _exit(EXIT_FAILURE);
4063
4064                         if (copy_devnodes(arg_directory) < 0)
4065                                 _exit(EXIT_FAILURE);
4066
4067                         if (setup_ptmx(arg_directory) < 0)
4068                                 _exit(EXIT_FAILURE);
4069
4070                         dev_setup(arg_directory);
4071
4072                         if (setup_propagate(arg_directory) < 0)
4073                                 _exit(EXIT_FAILURE);
4074
4075                         if (setup_seccomp() < 0)
4076                                 _exit(EXIT_FAILURE);
4077
4078                         if (setup_dev_console(arg_directory, console) < 0)
4079                                 _exit(EXIT_FAILURE);
4080
4081                         if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
4082                                 _exit(EXIT_FAILURE);
4083                         kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
4084
4085                         if (send_rtnl(rtnl_socket_pair[1]) < 0)
4086                                 _exit(EXIT_FAILURE);
4087                         rtnl_socket_pair[1] = safe_close(rtnl_socket_pair[1]);
4088
4089                         /* Tell the parent that we are ready, and that
4090                          * it can cgroupify us so that we lack access
4091                          * to certain devices and resources. */
4092                         (void) barrier_place(&barrier); /* #1 */
4093
4094                         if (setup_boot_id(arg_directory) < 0)
4095                                 _exit(EXIT_FAILURE);
4096
4097                         if (setup_timezone(arg_directory) < 0)
4098                                 _exit(EXIT_FAILURE);
4099
4100                         if (setup_resolv_conf(arg_directory) < 0)
4101                                 _exit(EXIT_FAILURE);
4102
4103                         if (setup_journal(arg_directory) < 0)
4104                                 _exit(EXIT_FAILURE);
4105
4106                         if (mount_binds(arg_directory, arg_bind, false) < 0)
4107                                 _exit(EXIT_FAILURE);
4108
4109                         if (mount_binds(arg_directory, arg_bind_ro, true) < 0)
4110                                 _exit(EXIT_FAILURE);
4111
4112                         if (mount_tmpfs(arg_directory) < 0)
4113                                 _exit(EXIT_FAILURE);
4114
4115                         /* Wait until we are cgroup-ified, so that we
4116                          * can mount the right cgroup path writable */
4117                         (void) barrier_place_and_sync(&barrier); /* #2 */
4118
4119                         if (mount_cgroup(arg_directory) < 0)
4120                                 _exit(EXIT_FAILURE);
4121
4122                         if (chdir(arg_directory) < 0) {
4123                                 log_error_errno(errno, "chdir(%s) failed: %m", arg_directory);
4124                                 _exit(EXIT_FAILURE);
4125                         }
4126
4127                         if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) {
4128                                 log_error_errno(errno, "mount(MS_MOVE) failed: %m");
4129                                 _exit(EXIT_FAILURE);
4130                         }
4131
4132                         if (chroot(".") < 0) {
4133                                 log_error_errno(errno, "chroot() failed: %m");
4134                                 _exit(EXIT_FAILURE);
4135                         }
4136
4137                         if (chdir("/") < 0) {
4138                                 log_error_errno(errno, "chdir() failed: %m");
4139                                 _exit(EXIT_FAILURE);
4140                         }
4141
4142                         if (arg_userns) {
4143                                 if (unshare(CLONE_NEWUSER) < 0) {
4144                                         log_error_errno(errno, "unshare(CLONE_NEWUSER) failed: %m");
4145                                         _exit(EXIT_FAILURE);
4146                                 }
4147
4148                                 /* Tell the parent, that it now can
4149                                  * write the UID map. */
4150                                 (void) barrier_place(&barrier); /* #3 */
4151
4152                                 /* Wait until the parent wrote the UID
4153                                  * map */
4154                                 (void) barrier_place_and_sync(&barrier); /* #4 */
4155                         }
4156
4157                         umask(0022);
4158
4159                         if (drop_capabilities() < 0) {
4160                                 log_error_errno(errno, "drop_capabilities() failed: %m");
4161                                 _exit(EXIT_FAILURE);
4162                         }
4163
4164                         setup_hostname();
4165
4166                         if (arg_personality != 0xffffffffLU) {
4167                                 if (personality(arg_personality) < 0) {
4168                                         log_error_errno(errno, "personality() failed: %m");
4169                                         _exit(EXIT_FAILURE);
4170                                 }
4171                         } else if (secondary) {
4172                                 if (personality(PER_LINUX32) < 0) {
4173                                         log_error_errno(errno, "personality() failed: %m");
4174                                         _exit(EXIT_FAILURE);
4175                                 }
4176                         }
4177
4178 #ifdef HAVE_SELINUX
4179                         if (arg_selinux_context)
4180                                 if (setexeccon((security_context_t) arg_selinux_context) < 0) {
4181                                         log_error_errno(errno, "setexeccon(\"%s\") failed: %m", arg_selinux_context);
4182                                         _exit(EXIT_FAILURE);
4183                                 }
4184 #endif
4185
4186                         r = change_uid_gid(&home);
4187                         if (r < 0)
4188                                 _exit(EXIT_FAILURE);
4189
4190                         if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
4191                             (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
4192                             (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
4193                                 log_oom();
4194                                 _exit(EXIT_FAILURE);
4195                         }
4196
4197                         if (!sd_id128_equal(arg_uuid, SD_ID128_NULL)) {
4198                                 char as_uuid[37];
4199
4200                                 if (asprintf((char**)(envp + n_env++), "container_uuid=%s", id128_format_as_uuid(arg_uuid, as_uuid)) < 0) {
4201                                         log_oom();
4202                                         _exit(EXIT_FAILURE);
4203                                 }
4204                         }
4205
4206                         if (fdset_size(fds) > 0) {
4207                                 r = fdset_cloexec(fds, false);
4208                                 if (r < 0) {
4209                                         log_error_errno(r, "Failed to unset O_CLOEXEC for file descriptors.");
4210                                         _exit(EXIT_FAILURE);
4211                                 }
4212
4213                                 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", n_fd_passed) < 0) ||
4214                                     (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0)) {
4215                                         log_oom();
4216                                         _exit(EXIT_FAILURE);
4217                                 }
4218                         }
4219
4220                         if (!strv_isempty(arg_setenv)) {
4221                                 char **n;
4222
4223                                 n = strv_env_merge(2, envp, arg_setenv);
4224                                 if (!n) {
4225                                         log_oom();
4226                                         _exit(EXIT_FAILURE);
4227                                 }
4228
4229                                 env_use = n;
4230                         } else
4231                                 env_use = (char**) envp;
4232
4233                         /* Let the parent know that we are ready and
4234                          * wait until the parent is ready with the
4235                          * setup, too... */
4236                         (void) barrier_place_and_sync(&barrier); /* #5 */
4237
4238                         if (arg_boot) {
4239                                 char **a;
4240                                 size_t l;
4241
4242                                 /* Automatically search for the init system */
4243
4244                                 l = 1 + argc - optind;
4245                                 a = newa(char*, l + 1);
4246                                 memcpy(a + 1, argv + optind, l * sizeof(char*));
4247
4248                                 a[0] = (char*) "/usr/lib/systemd/systemd";
4249                                 execve(a[0], a, env_use);
4250
4251                                 a[0] = (char*) "/lib/systemd/systemd";
4252                                 execve(a[0], a, env_use);
4253
4254                                 a[0] = (char*) "/sbin/init";
4255                                 execve(a[0], a, env_use);
4256                         } else if (argc > optind)
4257                                 execvpe(argv[optind], argv + optind, env_use);
4258                         else {
4259                                 chdir(home ? home : "/root");
4260                                 execle("/bin/bash", "-bash", NULL, env_use);
4261                                 execle("/bin/sh", "-sh", NULL, env_use);
4262                         }
4263
4264                         log_error_errno(errno, "execv() failed: %m");
4265                         _exit(EXIT_FAILURE);
4266                 }
4267
4268                 barrier_set_role(&barrier, BARRIER_PARENT);
4269                 fdset_free(fds);
4270                 fds = NULL;
4271
4272                 kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
4273                 rtnl_socket_pair[1] = safe_close(rtnl_socket_pair[1]);
4274
4275                 (void) barrier_place(&barrier); /* #1 */
4276
4277                 /* Wait for the most basic Child-setup to be done,
4278                  * before we add hardware to it, and place it in a
4279                  * cgroup. */
4280                 if (barrier_sync(&barrier)) { /* #1 */
4281                         int ifi = 0;
4282
4283                         r = move_network_interfaces(pid);
4284                         if (r < 0)
4285                                 goto finish;
4286
4287                         r = setup_veth(pid, veth_name, &ifi);
4288                         if (r < 0)
4289                                 goto finish;
4290
4291                         r = setup_bridge(veth_name, &ifi);
4292                         if (r < 0)
4293                                 goto finish;
4294
4295                         r = setup_macvlan(pid);
4296                         if (r < 0)
4297                                 goto finish;
4298
4299                         r = setup_ipvlan(pid);
4300                         if (r < 0)
4301                                 goto finish;
4302
4303                         r = register_machine(pid, ifi);
4304                         if (r < 0)
4305                                 goto finish;
4306
4307                         /* Notify the child that the parent is ready with all
4308                          * its setup, and that the child can now hand over
4309                          * control to the code to run inside the container. */
4310                         (void) barrier_place(&barrier); /* #2 */
4311
4312                         if (arg_userns) {
4313                                 char uid_map[strlen("/proc//uid_map") + DECIMAL_STR_MAX(uid_t) + 1], line[DECIMAL_STR_MAX(uid_t)*3+3+1];
4314
4315                                 (void) barrier_place_and_sync(&barrier); /* #3 */
4316
4317                                 xsprintf(uid_map, "/proc/" PID_FMT "/uid_map", pid);
4318                                 xsprintf(line, UID_FMT " " UID_FMT " " UID_FMT "\n", 0, arg_uid_shift, arg_uid_range);
4319                                 r = write_string_file(uid_map, line);
4320                                 if (r < 0) {
4321                                         log_error_errno(r, "Failed to write UID map: %m");
4322                                         goto finish;
4323                                 }
4324
4325                                 /* We always assign the same UID and GID ranges */
4326                                 xsprintf(uid_map, "/proc/" PID_FMT "/gid_map", pid);
4327                                 r = write_string_file(uid_map, line);
4328                                 if (r < 0) {
4329                                         log_error_errno(r, "Failed to write GID map: %m");
4330                                         goto finish;
4331                                 }
4332
4333                                 (void) barrier_place(&barrier); /* #4 */
4334                         }
4335
4336                         /* Block SIGCHLD here, before notifying child.
4337                          * process_pty() will handle it with the other signals. */
4338                         r = sigprocmask(SIG_BLOCK, &mask_chld, NULL);
4339                         if (r < 0)
4340                                 goto finish;
4341
4342                         /* Reset signal to default */
4343                         r = default_signals(SIGCHLD, -1);
4344                         if (r < 0)
4345                                 goto finish;
4346
4347                         /* Let the child know that we are ready and wait until the child is completely ready. */
4348                         if (barrier_place_and_sync(&barrier)) { /* #5 */
4349                                 _cleanup_event_unref_ sd_event *event = NULL;
4350                                 _cleanup_(pty_forward_freep) PTYForward *forward = NULL;
4351                                 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
4352                                 char last_char = 0;
4353
4354                                 sd_notifyf(false,
4355                                            "READY=1\n"
4356                                            "STATUS=Container running.\n"
4357                                            "X_NSPAWN_LEADER_PID=" PID_FMT, pid);
4358
4359                                 r = sd_event_new(&event);
4360                                 if (r < 0) {
4361                                         log_error_errno(r, "Failed to get default event source: %m");
4362                                         goto finish;
4363                                 }
4364
4365                                 if (arg_kill_signal > 0) {
4366                                         /* Try to kill the init system on SIGINT or SIGTERM */
4367                                         sd_event_add_signal(event, NULL, SIGINT, on_orderly_shutdown, UINT32_TO_PTR(pid));
4368                                         sd_event_add_signal(event, NULL, SIGTERM, on_orderly_shutdown, UINT32_TO_PTR(pid));
4369                                 } else {
4370                                         /* Immediately exit */
4371                                         sd_event_add_signal(event, NULL, SIGINT, NULL, NULL);
4372                                         sd_event_add_signal(event, NULL, SIGTERM, NULL, NULL);
4373                                 }
4374
4375                                 /* simply exit on sigchld */
4376                                 sd_event_add_signal(event, NULL, SIGCHLD, NULL, NULL);
4377
4378                                 if (arg_expose_ports) {
4379                                         r = watch_rtnl(event, rtnl_socket_pair[0], &exposed, &rtnl);
4380                                         if (r < 0)
4381                                                 goto finish;
4382
4383                                         (void) expose_ports(rtnl, &exposed);
4384                                 }
4385
4386                                 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
4387
4388                                 r = pty_forward_new(event, master, true, !interactive, &forward);
4389                                 if (r < 0) {
4390                                         log_error_errno(r, "Failed to create PTY forwarder: %m");
4391                                         goto finish;
4392                                 }
4393
4394                                 r = sd_event_loop(event);
4395                                 if (r < 0) {
4396                                         log_error_errno(r, "Failed to run event loop: %m");
4397                                         goto finish;
4398                                 }
4399
4400                                 pty_forward_get_last_char(forward, &last_char);
4401
4402                                 forward = pty_forward_free(forward);
4403
4404                                 if (!arg_quiet && last_char != '\n')
4405                                         putc('\n', stdout);
4406
4407                                 /* Kill if it is not dead yet anyway */
4408                                 terminate_machine(pid);
4409                         }
4410                 }
4411
4412                 /* Normally redundant, but better safe than sorry */
4413                 kill(pid, SIGKILL);
4414
4415                 r = wait_for_container(pid, &container_status);
4416                 pid = 0;
4417
4418                 if (r < 0)
4419                         /* We failed to wait for the container, or the
4420                          * container exited abnormally */
4421                         goto finish;
4422                 else if (r > 0 || container_status == CONTAINER_TERMINATED){
4423                         /* The container exited with a non-zero
4424                          * status, or with zero status and no reboot
4425                          * was requested. */
4426                         ret = r;
4427                         break;
4428                 }
4429
4430                 /* CONTAINER_REBOOTED, loop again */
4431
4432                 if (arg_keep_unit) {
4433                         /* Special handling if we are running as a
4434                          * service: instead of simply restarting the
4435                          * machine we want to restart the entire
4436                          * service, so let's inform systemd about this
4437                          * with the special exit code 133. The service
4438                          * file uses RestartForceExitStatus=133 so
4439                          * that this results in a full nspawn
4440                          * restart. This is necessary since we might
4441                          * have cgroup parameters set we want to have
4442                          * flushed out. */
4443                         ret = 133;
4444                         r = 0;
4445                         break;
4446                 }
4447
4448                 flush_ports(&exposed);
4449         }
4450
4451 finish:
4452         sd_notify(false,
4453                   "STOPPING=1\n"
4454                   "STATUS=Terminating...");
4455
4456         loop_remove(loop_nr, &image_fd);
4457
4458         if (pid > 0)
4459                 kill(pid, SIGKILL);
4460
4461         if (remove_subvol && arg_directory) {
4462                 int k;
4463
4464                 k = btrfs_subvol_remove(arg_directory);
4465                 if (k < 0)
4466                         log_warning_errno(k, "Cannot remove subvolume '%s', ignoring: %m", arg_directory);
4467         }
4468
4469         if (arg_machine) {
4470                 const char *p;
4471
4472                 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
4473                 (void) rm_rf(p, false, true, false);
4474         }
4475
4476         free(arg_directory);
4477         free(arg_template);
4478         free(arg_image);
4479         free(arg_machine);
4480         free(arg_user);
4481         strv_free(arg_setenv);
4482         strv_free(arg_network_interfaces);
4483         strv_free(arg_network_macvlan);
4484         strv_free(arg_network_ipvlan);
4485         strv_free(arg_bind);
4486         strv_free(arg_bind_ro);
4487         strv_free(arg_tmpfs);
4488
4489         flush_ports(&exposed);
4490
4491         while (arg_expose_ports) {
4492                 ExposePort *p = arg_expose_ports;
4493                 LIST_REMOVE(ports, arg_expose_ports, p);
4494                 free(p);
4495         }
4496
4497         return r < 0 ? EXIT_FAILURE : ret;
4498 }