chiark / gitweb /
8ce5fbeb629f6b25db1bd7cd0c297bf21c0e5067
[elogind.git] / src / nspawn / nspawn.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4   This file is part of systemd.
5
6   Copyright 2010 Lennart Poettering
7
8   systemd is free software; you can redistribute it and/or modify it
9   under the terms of the GNU Lesser General Public License as published by
10   the Free Software Foundation; either version 2.1 of the License, or
11   (at your option) any later version.
12
13   systemd is distributed in the hope that it will be useful, but
14   WITHOUT ANY WARRANTY; without even the implied warranty of
15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16   Lesser General Public License for more details.
17
18   You should have received a copy of the GNU Lesser General Public License
19   along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <signal.h>
23 #include <sched.h>
24 #include <unistd.h>
25 #include <sys/types.h>
26 #include <sys/mount.h>
27 #include <stdlib.h>
28 #include <string.h>
29 #include <stdio.h>
30 #include <errno.h>
31 #include <sys/prctl.h>
32 #include <getopt.h>
33 #include <grp.h>
34 #include <linux/fs.h>
35 #include <sys/socket.h>
36 #include <linux/netlink.h>
37 #include <net/if.h>
38 #include <linux/veth.h>
39 #include <sys/personality.h>
40 #include <linux/loop.h>
41 #include <sys/file.h>
42
43 #ifdef HAVE_SELINUX
44 #include <selinux/selinux.h>
45 #endif
46
47 #ifdef HAVE_SECCOMP
48 #include <seccomp.h>
49 #endif
50
51 #ifdef HAVE_BLKID
52 #include <blkid/blkid.h>
53 #endif
54
55 #include "sd-daemon.h"
56 #include "sd-bus.h"
57 #include "sd-id128.h"
58 #include "sd-rtnl.h"
59 #include "log.h"
60 #include "util.h"
61 #include "mkdir.h"
62 #include "macro.h"
63 #include "missing.h"
64 #include "cgroup-util.h"
65 #include "strv.h"
66 #include "path-util.h"
67 #include "loopback-setup.h"
68 #include "dev-setup.h"
69 #include "fdset.h"
70 #include "build.h"
71 #include "fileio.h"
72 #include "bus-util.h"
73 #include "bus-error.h"
74 #include "ptyfwd.h"
75 #include "env-util.h"
76 #include "rtnl-util.h"
77 #include "udev-util.h"
78 #include "blkid-util.h"
79 #include "gpt.h"
80 #include "siphash24.h"
81 #include "copy.h"
82 #include "base-filesystem.h"
83 #include "barrier.h"
84 #include "event-util.h"
85 #include "capability.h"
86 #include "cap-list.h"
87 #include "btrfs-util.h"
88 #include "machine-image.h"
89 #include "list.h"
90 #include "in-addr-util.h"
91 #include "fw-util.h"
92 #include "local-addresses.h"
93
94 #ifdef HAVE_SECCOMP
95 #include "seccomp-util.h"
96 #endif
97
98 typedef struct ExposePort {
99         int protocol;
100         uint16_t host_port;
101         uint16_t container_port;
102         LIST_FIELDS(struct ExposePort, ports);
103 } ExposePort;
104
/* How a container run ended: it either terminated or asked for a reboot. */
enum ContainerStatus {
        CONTAINER_TERMINATED = 0,
        CONTAINER_REBOOTED   = 1
};

typedef enum ContainerStatus ContainerStatus;
109
/* Journal linking mode selected via --link-journal= (or -j). The "try-"
 * variants of the option map to the same values, with the separate
 * arg_link_journal_try flag set in addition. */
enum LinkJournal {
        LINK_NO    = 0,
        LINK_AUTO  = 1,
        LINK_HOST  = 2,
        LINK_GUEST = 3
};

typedef enum LinkJournal LinkJournal;
116
/* Volatile mode selected via --volatile[=MODE]: off, fully on (also the
 * meaning of the bare option), or "state". */
enum Volatile {
        VOLATILE_NO    = 0,
        VOLATILE_YES   = 1,
        VOLATILE_STATE = 2,
};

typedef enum Volatile Volatile;
122
123 static char *arg_directory = NULL;
124 static char *arg_template = NULL;
125 static char *arg_user = NULL;
126 static sd_id128_t arg_uuid = {};
127 static char *arg_machine = NULL;
128 static const char *arg_selinux_context = NULL;
129 static const char *arg_selinux_apifs_context = NULL;
130 static const char *arg_slice = NULL;
131 static bool arg_private_network = false;
132 static bool arg_read_only = false;
133 static bool arg_boot = false;
134 static bool arg_ephemeral = false;
135 static LinkJournal arg_link_journal = LINK_AUTO;
136 static bool arg_link_journal_try = false;
137 static uint64_t arg_retain =
138         (1ULL << CAP_CHOWN) |
139         (1ULL << CAP_DAC_OVERRIDE) |
140         (1ULL << CAP_DAC_READ_SEARCH) |
141         (1ULL << CAP_FOWNER) |
142         (1ULL << CAP_FSETID) |
143         (1ULL << CAP_IPC_OWNER) |
144         (1ULL << CAP_KILL) |
145         (1ULL << CAP_LEASE) |
146         (1ULL << CAP_LINUX_IMMUTABLE) |
147         (1ULL << CAP_NET_BIND_SERVICE) |
148         (1ULL << CAP_NET_BROADCAST) |
149         (1ULL << CAP_NET_RAW) |
150         (1ULL << CAP_SETGID) |
151         (1ULL << CAP_SETFCAP) |
152         (1ULL << CAP_SETPCAP) |
153         (1ULL << CAP_SETUID) |
154         (1ULL << CAP_SYS_ADMIN) |
155         (1ULL << CAP_SYS_CHROOT) |
156         (1ULL << CAP_SYS_NICE) |
157         (1ULL << CAP_SYS_PTRACE) |
158         (1ULL << CAP_SYS_TTY_CONFIG) |
159         (1ULL << CAP_SYS_RESOURCE) |
160         (1ULL << CAP_SYS_BOOT) |
161         (1ULL << CAP_AUDIT_WRITE) |
162         (1ULL << CAP_AUDIT_CONTROL) |
163         (1ULL << CAP_MKNOD);
164 static char **arg_bind = NULL;
165 static char **arg_bind_ro = NULL;
166 static char **arg_tmpfs = NULL;
167 static char **arg_setenv = NULL;
168 static bool arg_quiet = false;
169 static bool arg_share_system = false;
170 static bool arg_register = true;
171 static bool arg_keep_unit = false;
172 static char **arg_network_interfaces = NULL;
173 static char **arg_network_macvlan = NULL;
174 static char **arg_network_ipvlan = NULL;
175 static bool arg_network_veth = false;
176 static const char *arg_network_bridge = NULL;
177 static unsigned long arg_personality = 0xffffffffLU;
178 static char *arg_image = NULL;
179 static Volatile arg_volatile = VOLATILE_NO;
180 static ExposePort *arg_expose_ports = NULL;
181 static char **arg_property = NULL;
182 static uid_t arg_uid_shift = UID_INVALID, arg_uid_range = 0x10000U;
183 static bool arg_userns = false;
184 static int arg_kill_signal = 0;
185
/* Prints the usage summary to stdout, with the first line prefixed by
 * program_invocation_short_name.
 *
 * Every long option accepted by the option table in parse_argv() should have
 * an entry here. Compared to the previous text:
 *   - --personality= is now listed (it was accepted but undocumented);
 *   - --tmpfs= is shown as PATH[:OPTIONS], since the parser treats the whole
 *     ":OPTIONS" part as optional and falls back to "mode=0755";
 *   - "a ipvlan" is corrected to "an ipvlan". */
static void help(void) {
        printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
               "Spawn a minimal namespace container for debugging, testing and building.\n\n"
               "  -h --help                 Show this help\n"
               "     --version              Print version string\n"
               "  -q --quiet                Do not show status information\n"
               "  -D --directory=PATH       Root directory for the container\n"
               "     --template=PATH        Initialize root directory from template directory,\n"
               "                            if missing\n"
               "  -x --ephemeral            Run container with snapshot of root directory, and\n"
               "                            remove it after exit\n"
               "  -i --image=PATH           File system device or disk image for the container\n"
               "  -b --boot                 Boot up full system (i.e. invoke init)\n"
               "  -u --user=USER            Run the command under specified user or uid\n"
               "  -M --machine=NAME         Set the machine name for the container\n"
               "     --uuid=UUID            Set a specific machine UUID for the container\n"
               "  -S --slice=SLICE          Place the container in the specified slice\n"
               "     --property=NAME=VALUE  Set scope unit property\n"
               "     --private-network      Disable network in container\n"
               "     --network-interface=INTERFACE\n"
               "                            Assign an existing network interface to the\n"
               "                            container\n"
               "     --network-macvlan=INTERFACE\n"
               "                            Create a macvlan network interface based on an\n"
               "                            existing network interface to the container\n"
               "     --network-ipvlan=INTERFACE\n"
               "                            Create an ipvlan network interface based on an\n"
               "                            existing network interface to the container\n"
               "  -n --network-veth         Add a virtual ethernet connection between host\n"
               "                            and container\n"
               "     --network-bridge=INTERFACE\n"
               "                            Add a virtual ethernet connection between host\n"
               "                            and container and add it to an existing bridge on\n"
               "                            the host\n"
               "     --private-users[=UIDBASE[:NUIDS]]\n"
               "                            Run within user namespace\n"
               "  -p --port=[PROTOCOL:]HOSTPORT[:CONTAINERPORT]\n"
               "                            Expose a container IP port on the host\n"
               "  -Z --selinux-context=SECLABEL\n"
               "                            Set the SELinux security context to be used by\n"
               "                            processes in the container\n"
               "  -L --selinux-apifs-context=SECLABEL\n"
               "                            Set the SELinux security context to be used by\n"
               "                            API/tmpfs file systems in the container\n"
               "     --capability=CAP       In addition to the default, retain specified\n"
               "                            capability\n"
               "     --drop-capability=CAP  Drop the specified capability from the default set\n"
               "     --kill-signal=SIGNAL   Select signal to use for shutting down PID 1\n"
               "     --personality=ARCH     Pick personality for this container\n"
               "     --link-journal=MODE    Link up guest journal, one of no, auto, guest, host,\n"
               "                            try-guest, try-host\n"
               "  -j                        Equivalent to --link-journal=try-guest\n"
               "     --read-only            Mount the root directory read-only\n"
               "     --bind=PATH[:PATH]     Bind mount a file or directory from the host into\n"
               "                            the container\n"
               "     --bind-ro=PATH[:PATH]  Similar, but creates a read-only bind mount\n"
               "     --tmpfs=PATH[:OPTIONS] Mount an empty tmpfs to the specified directory\n"
               "     --setenv=NAME=VALUE    Pass an environment variable to PID 1\n"
               "     --share-system         Share system namespaces with host\n"
               "     --register=BOOLEAN     Register container as machine\n"
               "     --keep-unit            Do not register a scope for the machine, reuse\n"
               "                            the service unit nspawn is running in\n"
               "     --volatile[=MODE]      Run the system in volatile mode\n"
               , program_invocation_short_name);
}
250
/* Replaces the string in *b with a sanitized form of 'path'.
 *
 * The path is first resolved with canonicalize_file_name(). If that fails
 * only because the path does not exist (ENOENT), path_make_absolute_cwd() is
 * used instead (presumably making it absolute relative to the current
 * working directory, without resolving it). The result is passed through
 * path_kill_slashes() before it is stored.
 *
 * On success the previous *b is freed and 0 is returned. On failure *b is
 * left untouched and a negative errno-style value is returned: -errno from
 * canonicalize_file_name(), or -ENOMEM if the fallback returns NULL. */
static int set_sanitized_path(char **b, const char *path) {
        char *resolved;

        assert(b);
        assert(path);

        resolved = canonicalize_file_name(path);

        /* Any failure other than "does not exist" is fatal */
        if (!resolved && errno != ENOENT)
                return -errno;

        if (!resolved) {
                resolved = path_make_absolute_cwd(path);
                if (!resolved)
                        return -ENOMEM;
        }

        free(*b);
        *b = path_kill_slashes(resolved);

        return 0;
}
271
272 static int parse_argv(int argc, char *argv[]) {
273
274         enum {
275                 ARG_VERSION = 0x100,
276                 ARG_PRIVATE_NETWORK,
277                 ARG_UUID,
278                 ARG_READ_ONLY,
279                 ARG_CAPABILITY,
280                 ARG_DROP_CAPABILITY,
281                 ARG_LINK_JOURNAL,
282                 ARG_BIND,
283                 ARG_BIND_RO,
284                 ARG_TMPFS,
285                 ARG_SETENV,
286                 ARG_SHARE_SYSTEM,
287                 ARG_REGISTER,
288                 ARG_KEEP_UNIT,
289                 ARG_NETWORK_INTERFACE,
290                 ARG_NETWORK_MACVLAN,
291                 ARG_NETWORK_IPVLAN,
292                 ARG_NETWORK_BRIDGE,
293                 ARG_PERSONALITY,
294                 ARG_VOLATILE,
295                 ARG_TEMPLATE,
296                 ARG_PROPERTY,
297                 ARG_PRIVATE_USERS,
298                 ARG_KILL_SIGNAL,
299         };
300
301         static const struct option options[] = {
302                 { "help",                  no_argument,       NULL, 'h'                   },
303                 { "version",               no_argument,       NULL, ARG_VERSION           },
304                 { "directory",             required_argument, NULL, 'D'                   },
305                 { "template",              required_argument, NULL, ARG_TEMPLATE          },
306                 { "ephemeral",             no_argument,       NULL, 'x'                   },
307                 { "user",                  required_argument, NULL, 'u'                   },
308                 { "private-network",       no_argument,       NULL, ARG_PRIVATE_NETWORK   },
309                 { "boot",                  no_argument,       NULL, 'b'                   },
310                 { "uuid",                  required_argument, NULL, ARG_UUID              },
311                 { "read-only",             no_argument,       NULL, ARG_READ_ONLY         },
312                 { "capability",            required_argument, NULL, ARG_CAPABILITY        },
313                 { "drop-capability",       required_argument, NULL, ARG_DROP_CAPABILITY   },
314                 { "link-journal",          required_argument, NULL, ARG_LINK_JOURNAL      },
315                 { "bind",                  required_argument, NULL, ARG_BIND              },
316                 { "bind-ro",               required_argument, NULL, ARG_BIND_RO           },
317                 { "tmpfs",                 required_argument, NULL, ARG_TMPFS             },
318                 { "machine",               required_argument, NULL, 'M'                   },
319                 { "slice",                 required_argument, NULL, 'S'                   },
320                 { "setenv",                required_argument, NULL, ARG_SETENV            },
321                 { "selinux-context",       required_argument, NULL, 'Z'                   },
322                 { "selinux-apifs-context", required_argument, NULL, 'L'                   },
323                 { "quiet",                 no_argument,       NULL, 'q'                   },
324                 { "share-system",          no_argument,       NULL, ARG_SHARE_SYSTEM      },
325                 { "register",              required_argument, NULL, ARG_REGISTER          },
326                 { "keep-unit",             no_argument,       NULL, ARG_KEEP_UNIT         },
327                 { "network-interface",     required_argument, NULL, ARG_NETWORK_INTERFACE },
328                 { "network-macvlan",       required_argument, NULL, ARG_NETWORK_MACVLAN   },
329                 { "network-ipvlan",        required_argument, NULL, ARG_NETWORK_IPVLAN    },
330                 { "network-veth",          no_argument,       NULL, 'n'                   },
331                 { "network-bridge",        required_argument, NULL, ARG_NETWORK_BRIDGE    },
332                 { "personality",           required_argument, NULL, ARG_PERSONALITY       },
333                 { "image",                 required_argument, NULL, 'i'                   },
334                 { "volatile",              optional_argument, NULL, ARG_VOLATILE          },
335                 { "port",                  required_argument, NULL, 'p'                   },
336                 { "property",              required_argument, NULL, ARG_PROPERTY          },
337                 { "private-users",         optional_argument, NULL, ARG_PRIVATE_USERS     },
338                 { "kill-signal",           required_argument, NULL, ARG_KILL_SIGNAL       },
339                 {}
340         };
341
342         int c, r;
343         uint64_t plus = 0, minus = 0;
344
345         assert(argc >= 0);
346         assert(argv);
347
348         while ((c = getopt_long(argc, argv, "+hD:u:bL:M:jS:Z:qi:xp:n", options, NULL)) >= 0)
349
350                 switch (c) {
351
352                 case 'h':
353                         help();
354                         return 0;
355
356                 case ARG_VERSION:
357                         puts(PACKAGE_STRING);
358                         puts(SYSTEMD_FEATURES);
359                         return 0;
360
361                 case 'D':
362                         r = set_sanitized_path(&arg_directory, optarg);
363                         if (r < 0)
364                                 return log_error_errno(r, "Invalid root directory: %m");
365
366                         break;
367
368                 case ARG_TEMPLATE:
369                         r = set_sanitized_path(&arg_template, optarg);
370                         if (r < 0)
371                                 return log_error_errno(r, "Invalid template directory: %m");
372
373                         break;
374
375                 case 'i':
376                         r = set_sanitized_path(&arg_image, optarg);
377                         if (r < 0)
378                                 return log_error_errno(r, "Invalid image path: %m");
379
380                         break;
381
382                 case 'x':
383                         arg_ephemeral = true;
384                         break;
385
386                 case 'u':
387                         free(arg_user);
388                         arg_user = strdup(optarg);
389                         if (!arg_user)
390                                 return log_oom();
391
392                         break;
393
394                 case ARG_NETWORK_BRIDGE:
395                         arg_network_bridge = optarg;
396
397                         /* fall through */
398
399                 case 'n':
400                         arg_network_veth = true;
401                         arg_private_network = true;
402                         break;
403
404                 case ARG_NETWORK_INTERFACE:
405                         if (strv_extend(&arg_network_interfaces, optarg) < 0)
406                                 return log_oom();
407
408                         arg_private_network = true;
409                         break;
410
411                 case ARG_NETWORK_MACVLAN:
412                         if (strv_extend(&arg_network_macvlan, optarg) < 0)
413                                 return log_oom();
414
415                         arg_private_network = true;
416                         break;
417
418                 case ARG_NETWORK_IPVLAN:
419                         if (strv_extend(&arg_network_ipvlan, optarg) < 0)
420                                 return log_oom();
421
422                         /* fall through */
423
424                 case ARG_PRIVATE_NETWORK:
425                         arg_private_network = true;
426                         break;
427
428                 case 'b':
429                         arg_boot = true;
430                         break;
431
432                 case ARG_UUID:
433                         r = sd_id128_from_string(optarg, &arg_uuid);
434                         if (r < 0) {
435                                 log_error("Invalid UUID: %s", optarg);
436                                 return r;
437                         }
438                         break;
439
440                 case 'S':
441                         arg_slice = optarg;
442                         break;
443
444                 case 'M':
445                         if (isempty(optarg)) {
446                                 free(arg_machine);
447                                 arg_machine = NULL;
448                         } else {
449                                 if (!machine_name_is_valid(optarg)) {
450                                         log_error("Invalid machine name: %s", optarg);
451                                         return -EINVAL;
452                                 }
453
454                                 r = free_and_strdup(&arg_machine, optarg);
455                                 if (r < 0)
456                                         return log_oom();
457
458                                 break;
459                         }
460
461                 case 'Z':
462                         arg_selinux_context = optarg;
463                         break;
464
465                 case 'L':
466                         arg_selinux_apifs_context = optarg;
467                         break;
468
469                 case ARG_READ_ONLY:
470                         arg_read_only = true;
471                         break;
472
473                 case ARG_CAPABILITY:
474                 case ARG_DROP_CAPABILITY: {
475                         const char *state, *word;
476                         size_t length;
477
478                         FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
479                                 _cleanup_free_ char *t;
480
481                                 t = strndup(word, length);
482                                 if (!t)
483                                         return log_oom();
484
485                                 if (streq(t, "all")) {
486                                         if (c == ARG_CAPABILITY)
487                                                 plus = (uint64_t) -1;
488                                         else
489                                                 minus = (uint64_t) -1;
490                                 } else {
491                                         int cap;
492
493                                         cap = capability_from_name(t);
494                                         if (cap < 0) {
495                                                 log_error("Failed to parse capability %s.", t);
496                                                 return -EINVAL;
497                                         }
498
499                                         if (c == ARG_CAPABILITY)
500                                                 plus |= 1ULL << (uint64_t) cap;
501                                         else
502                                                 minus |= 1ULL << (uint64_t) cap;
503                                 }
504                         }
505
506                         break;
507                 }
508
509                 case 'j':
510                         arg_link_journal = LINK_GUEST;
511                         arg_link_journal_try = true;
512                         break;
513
514                 case ARG_LINK_JOURNAL:
515                         if (streq(optarg, "auto")) {
516                                 arg_link_journal = LINK_AUTO;
517                                 arg_link_journal_try = false;
518                         } else if (streq(optarg, "no")) {
519                                 arg_link_journal = LINK_NO;
520                                 arg_link_journal_try = false;
521                         } else if (streq(optarg, "guest")) {
522                                 arg_link_journal = LINK_GUEST;
523                                 arg_link_journal_try = false;
524                         } else if (streq(optarg, "host")) {
525                                 arg_link_journal = LINK_HOST;
526                                 arg_link_journal_try = false;
527                         } else if (streq(optarg, "try-guest")) {
528                                 arg_link_journal = LINK_GUEST;
529                                 arg_link_journal_try = true;
530                         } else if (streq(optarg, "try-host")) {
531                                 arg_link_journal = LINK_HOST;
532                                 arg_link_journal_try = true;
533                         } else {
534                                 log_error("Failed to parse link journal mode %s", optarg);
535                                 return -EINVAL;
536                         }
537
538                         break;
539
540                 case ARG_BIND:
541                 case ARG_BIND_RO: {
542                         _cleanup_free_ char *a = NULL, *b = NULL;
543                         char *e;
544                         char ***x;
545
546                         x = c == ARG_BIND ? &arg_bind : &arg_bind_ro;
547
548                         e = strchr(optarg, ':');
549                         if (e) {
550                                 a = strndup(optarg, e - optarg);
551                                 b = strdup(e + 1);
552                         } else {
553                                 a = strdup(optarg);
554                                 b = strdup(optarg);
555                         }
556
557                         if (!a || !b)
558                                 return log_oom();
559
560                         if (!path_is_absolute(a) || !path_is_absolute(b)) {
561                                 log_error("Invalid bind mount specification: %s", optarg);
562                                 return -EINVAL;
563                         }
564
565                         r = strv_extend(x, a);
566                         if (r < 0)
567                                 return log_oom();
568
569                         r = strv_extend(x, b);
570                         if (r < 0)
571                                 return log_oom();
572
573                         break;
574                 }
575
576                 case ARG_TMPFS: {
577                         _cleanup_free_ char *a = NULL, *b = NULL;
578                         char *e;
579
580                         e = strchr(optarg, ':');
581                         if (e) {
582                                 a = strndup(optarg, e - optarg);
583                                 b = strdup(e + 1);
584                         } else {
585                                 a = strdup(optarg);
586                                 b = strdup("mode=0755");
587                         }
588
589                         if (!a || !b)
590                                 return log_oom();
591
592                         if (!path_is_absolute(a)) {
593                                 log_error("Invalid tmpfs specification: %s", optarg);
594                                 return -EINVAL;
595                         }
596
597                         r = strv_push(&arg_tmpfs, a);
598                         if (r < 0)
599                                 return log_oom();
600
601                         a = NULL;
602
603                         r = strv_push(&arg_tmpfs, b);
604                         if (r < 0)
605                                 return log_oom();
606
607                         b = NULL;
608
609                         break;
610                 }
611
612                 case ARG_SETENV: {
613                         char **n;
614
615                         if (!env_assignment_is_valid(optarg)) {
616                                 log_error("Environment variable assignment '%s' is not valid.", optarg);
617                                 return -EINVAL;
618                         }
619
620                         n = strv_env_set(arg_setenv, optarg);
621                         if (!n)
622                                 return log_oom();
623
624                         strv_free(arg_setenv);
625                         arg_setenv = n;
626                         break;
627                 }
628
629                 case 'q':
630                         arg_quiet = true;
631                         break;
632
633                 case ARG_SHARE_SYSTEM:
634                         arg_share_system = true;
635                         break;
636
637                 case ARG_REGISTER:
638                         r = parse_boolean(optarg);
639                         if (r < 0) {
640                                 log_error("Failed to parse --register= argument: %s", optarg);
641                                 return r;
642                         }
643
644                         arg_register = r;
645                         break;
646
647                 case ARG_KEEP_UNIT:
648                         arg_keep_unit = true;
649                         break;
650
651                 case ARG_PERSONALITY:
652
653                         arg_personality = personality_from_string(optarg);
654                         if (arg_personality == 0xffffffffLU) {
655                                 log_error("Unknown or unsupported personality '%s'.", optarg);
656                                 return -EINVAL;
657                         }
658
659                         break;
660
661                 case ARG_VOLATILE:
662
663                         if (!optarg)
664                                 arg_volatile = VOLATILE_YES;
665                         else {
666                                 r = parse_boolean(optarg);
667                                 if (r < 0) {
668                                         if (streq(optarg, "state"))
669                                                 arg_volatile = VOLATILE_STATE;
670                                         else {
671                                                 log_error("Failed to parse --volatile= argument: %s", optarg);
672                                                 return r;
673                                         }
674                                 } else
675                                         arg_volatile = r ? VOLATILE_YES : VOLATILE_NO;
676                         }
677
678                         break;
679
680                 case 'p': {
681                         const char *split, *e;
682                         uint16_t container_port, host_port;
683                         int protocol;
684                         ExposePort *p;
685
686                         if ((e = startswith(optarg, "tcp:")))
687                                 protocol = IPPROTO_TCP;
688                         else if ((e = startswith(optarg, "udp:")))
689                                 protocol = IPPROTO_UDP;
690                         else {
691                                 e = optarg;
692                                 protocol = IPPROTO_TCP;
693                         }
694
695                         split = strchr(e, ':');
696                         if (split) {
697                                 char v[split - e + 1];
698
699                                 memcpy(v, e, split - e);
700                                 v[split - e] = 0;
701
702                                 r = safe_atou16(v, &host_port);
703                                 if (r < 0 || host_port <= 0) {
704                                         log_error("Failed to parse host port: %s", optarg);
705                                         return -EINVAL;
706                                 }
707
708                                 r = safe_atou16(split + 1, &container_port);
709                         } else {
710                                 r = safe_atou16(e, &container_port);
711                                 host_port = container_port;
712                         }
713
714                         if (r < 0 || container_port <= 0) {
715                                 log_error("Failed to parse host port: %s", optarg);
716                                 return -EINVAL;
717                         }
718
719                         LIST_FOREACH(ports, p, arg_expose_ports) {
720                                 if (p->protocol == protocol && p->host_port == host_port) {
721                                         log_error("Duplicate port specification: %s", optarg);
722                                         return -EINVAL;
723                                 }
724                         }
725
726                         p = new(ExposePort, 1);
727                         if (!p)
728                                 return log_oom();
729
730                         p->protocol = protocol;
731                         p->host_port = host_port;
732                         p->container_port = container_port;
733
734                         LIST_PREPEND(ports, arg_expose_ports, p);
735
736                         break;
737                 }
738
739                 case ARG_PROPERTY:
740                         if (strv_extend(&arg_property, optarg) < 0)
741                                 return log_oom();
742
743                         break;
744
745                 case ARG_PRIVATE_USERS:
746                         if (optarg) {
747                                 _cleanup_free_ char *buffer = NULL;
748                                 const char *range, *shift;
749
750                                 range = strchr(optarg, ':');
751                                 if (range) {
752                                         buffer = strndup(optarg, range - optarg);
753                                         if (!buffer)
754                                                 return log_oom();
755                                         shift = buffer;
756
757                                         range++;
758                                         if (safe_atou32(range, &arg_uid_range) < 0 || arg_uid_range <= 0) {
759                                                 log_error("Failed to parse UID range: %s", range);
760                                                 return -EINVAL;
761                                         }
762                                 } else
763                                         shift = optarg;
764
765                                 if (parse_uid(shift, &arg_uid_shift) < 0) {
766                                         log_error("Failed to parse UID: %s", optarg);
767                                         return -EINVAL;
768                                 }
769                         }
770
771                         arg_userns = true;
772                         break;
773
774                 case ARG_KILL_SIGNAL:
775                         arg_kill_signal = signal_from_string_try_harder(optarg);
776                         if (arg_kill_signal < 0) {
777                                 log_error("Cannot parse signal: %s", optarg);
778                                 return -EINVAL;
779                         }
780
781                         break;
782
783                 case '?':
784                         return -EINVAL;
785
786                 default:
787                         assert_not_reached("Unhandled option");
788                 }
789
790         if (arg_share_system)
791                 arg_register = false;
792
793         if (arg_boot && arg_share_system) {
794                 log_error("--boot and --share-system may not be combined.");
795                 return -EINVAL;
796         }
797
798         if (arg_keep_unit && cg_pid_get_owner_uid(0, NULL) >= 0) {
799                 log_error("--keep-unit may not be used when invoked from a user session.");
800                 return -EINVAL;
801         }
802
803         if (arg_directory && arg_image) {
804                 log_error("--directory= and --image= may not be combined.");
805                 return -EINVAL;
806         }
807
808         if (arg_template && arg_image) {
809                 log_error("--template= and --image= may not be combined.");
810                 return -EINVAL;
811         }
812
813         if (arg_template && !(arg_directory || arg_machine)) {
814                 log_error("--template= needs --directory= or --machine=.");
815                 return -EINVAL;
816         }
817
818         if (arg_ephemeral && arg_template) {
819                 log_error("--ephemeral and --template= may not be combined.");
820                 return -EINVAL;
821         }
822
823         if (arg_ephemeral && arg_image) {
824                 log_error("--ephemeral and --image= may not be combined.");
825                 return -EINVAL;
826         }
827
828         if (arg_ephemeral && !IN_SET(arg_link_journal, LINK_NO, LINK_AUTO)) {
829                 log_error("--ephemeral and --link-journal= may not be combined.");
830                 return -EINVAL;
831         }
832
833         if (arg_volatile != VOLATILE_NO && arg_read_only) {
834                 log_error("Cannot combine --read-only with --volatile. Note that --volatile already implies a read-only base hierarchy.");
835                 return -EINVAL;
836         }
837
838         if (arg_expose_ports && !arg_private_network) {
839                 log_error("Cannot use --port= without private networking.");
840                 return -EINVAL;
841         }
842
843         arg_retain = (arg_retain | plus | (arg_private_network ? 1ULL << CAP_NET_ADMIN : 0)) & ~minus;
844
845         if (arg_boot && arg_kill_signal <= 0)
846                 arg_kill_signal = SIGRTMIN+3;
847
848         return 1;
849 }
850
851 static int mount_all(const char *dest) {
852
853         typedef struct MountPoint {
854                 const char *what;
855                 const char *where;
856                 const char *type;
857                 const char *options;
858                 unsigned long flags;
859                 bool fatal;
860         } MountPoint;
861
862         static const MountPoint mount_table[] = {
863                 { "proc",      "/proc",     "proc",  NULL,        MS_NOSUID|MS_NOEXEC|MS_NODEV,           true  },
864                 { "/proc/sys", "/proc/sys", NULL,    NULL,        MS_BIND,                                true  },   /* Bind mount first */
865                 { NULL,        "/proc/sys", NULL,    NULL,        MS_BIND|MS_RDONLY|MS_REMOUNT,           true  },   /* Then, make it r/o */
866                 { "sysfs",     "/sys",      "sysfs", NULL,        MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true  },
867                 { "tmpfs",     "/dev",      "tmpfs", "mode=755",  MS_NOSUID|MS_STRICTATIME,               true  },
868                 { "devpts",    "/dev/pts",  "devpts","newinstance,ptmxmode=0666,mode=620,gid=" STRINGIFY(TTY_GID), MS_NOSUID|MS_NOEXEC, true },
869                 { "tmpfs",     "/dev/shm",  "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME,      true  },
870                 { "tmpfs",     "/run",      "tmpfs", "mode=755",  MS_NOSUID|MS_NODEV|MS_STRICTATIME,      true  },
871                 { "tmpfs",     "/tmp",      "tmpfs", "mode=1777", MS_STRICTATIME,                         true  },
872 #ifdef HAVE_SELINUX
873                 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND,                              false },  /* Bind mount first */
874                 { NULL,              "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT,         false },  /* Then, make it r/o */
875 #endif
876         };
877
878         unsigned k;
879         int r = 0;
880
881         for (k = 0; k < ELEMENTSOF(mount_table); k++) {
882                 _cleanup_free_ char *where = NULL, *options = NULL;
883                 const char *o;
884                 int t;
885
886                 where = strjoin(dest, "/", mount_table[k].where, NULL);
887                 if (!where)
888                         return log_oom();
889
890                 t = path_is_mount_point(where, true);
891                 if (t < 0) {
892                         log_error_errno(t, "Failed to detect whether %s is a mount point: %m", where);
893
894                         if (r == 0)
895                                 r = t;
896
897                         continue;
898                 }
899
900                 /* Skip this entry if it is not a remount. */
901                 if (mount_table[k].what && t > 0)
902                         continue;
903
904                 t = mkdir_p(where, 0755);
905                 if (t < 0) {
906                         if (mount_table[k].fatal) {
907                                log_error_errno(t, "Failed to create directory %s: %m", where);
908
909                                 if (r == 0)
910                                         r = t;
911                         } else
912                                log_warning_errno(t, "Failed to create directory %s: %m", where);
913
914                         continue;
915                 }
916
917 #ifdef HAVE_SELINUX
918                 if (arg_selinux_apifs_context &&
919                     (streq_ptr(mount_table[k].what, "tmpfs") || streq_ptr(mount_table[k].what, "devpts"))) {
920                         options = strjoin(mount_table[k].options, ",context=\"", arg_selinux_apifs_context, "\"", NULL);
921                         if (!options)
922                                 return log_oom();
923
924                         o = options;
925                 } else
926 #endif
927                         o = mount_table[k].options;
928
929                 if (arg_userns && arg_uid_shift != UID_INVALID && streq_ptr(mount_table[k].type, "tmpfs")) {
930                         char *uid_options = NULL;
931
932                         if (o)
933                                 asprintf(&uid_options, "%s,uid=" UID_FMT ",gid=" UID_FMT, o, arg_uid_shift, arg_uid_shift);
934                         else
935                                 asprintf(&uid_options, "uid=" UID_FMT ",gid=" UID_FMT, arg_uid_shift, arg_uid_shift);
936                         if (!uid_options)
937                                 return log_oom();
938
939                         free(options);
940                         o = options = uid_options;
941                 }
942
943                 if (mount(mount_table[k].what,
944                           where,
945                           mount_table[k].type,
946                           mount_table[k].flags,
947                           o) < 0) {
948
949                         if (mount_table[k].fatal) {
950                                 log_error_errno(errno, "mount(%s) failed: %m", where);
951
952                                 if (r == 0)
953                                         r = -errno;
954                         } else
955                                 log_warning_errno(errno, "mount(%s) failed: %m", where);
956                 }
957         }
958
959         return r;
960 }
961
962 static int mount_binds(const char *dest, char **l, bool ro) {
963         char **x, **y;
964
965         STRV_FOREACH_PAIR(x, y, l) {
966                 _cleanup_free_ char *where = NULL;
967                 struct stat source_st, dest_st;
968                 int r;
969
970                 if (stat(*x, &source_st) < 0)
971                         return log_error_errno(errno, "Failed to stat %s: %m", *x);
972
973                 where = strappend(dest, *y);
974                 if (!where)
975                         return log_oom();
976
977                 r = stat(where, &dest_st);
978                 if (r == 0) {
979                         if (S_ISDIR(source_st.st_mode) && !S_ISDIR(dest_st.st_mode)) {
980                                 log_error("Cannot bind mount directory %s on file %s.", *x, where);
981                                 return -EINVAL;
982                         }
983                         if (!S_ISDIR(source_st.st_mode) && S_ISDIR(dest_st.st_mode)) {
984                                 log_error("Cannot bind mount file %s on directory %s.", *x, where);
985                                 return -EINVAL;
986                         }
987                 } else if (errno == ENOENT) {
988                         r = mkdir_parents_label(where, 0755);
989                         if (r < 0)
990                                 return log_error_errno(r, "Failed to bind mount %s: %m", *x);
991                 } else {
992                         log_error_errno(errno, "Failed to bind mount %s: %m", *x);
993                         return -errno;
994                 }
995
996                 /* Create the mount point. Any non-directory file can be
997                  * mounted on any non-directory file (regular, fifo, socket,
998                  * char, block).
999                  */
1000                 if (S_ISDIR(source_st.st_mode)) {
1001                         r = mkdir_label(where, 0755);
1002                         if (r < 0 && errno != EEXIST)
1003                                 return log_error_errno(r, "Failed to create mount point %s: %m", where);
1004                 } else {
1005                         r = touch(where);
1006                         if (r < 0)
1007                                 return log_error_errno(r, "Failed to create mount point %s: %m", where);
1008                 }
1009
1010                 if (mount(*x, where, "bind", MS_BIND, NULL) < 0)
1011                         return log_error_errno(errno, "mount(%s) failed: %m", where);
1012
1013                 if (ro) {
1014                         r = bind_remount_recursive(where, true);
1015                         if (r < 0)
1016                                 return log_error_errno(r, "Read-Only bind mount failed: %m");
1017                 }
1018         }
1019
1020         return 0;
1021 }
1022
1023 static int mount_cgroup_hierarchy(const char *dest, const char *controller, const char *hierarchy, bool read_only) {
1024         char *to;
1025         int r;
1026
1027         to = strjoina(dest, "/sys/fs/cgroup/", hierarchy);
1028
1029         r = path_is_mount_point(to, false);
1030         if (r < 0)
1031                 return log_error_errno(r, "Failed to determine if %s is mounted already: %m", to);
1032         if (r > 0)
1033                 return 0;
1034
1035         mkdir_p(to, 0755);
1036
1037         /* The superblock mount options of the mount point need to be
1038          * identical to the hosts', and hence writable... */
1039         if (mount("cgroup", to, "cgroup", MS_NOSUID|MS_NOEXEC|MS_NODEV, controller) < 0)
1040                 return log_error_errno(errno, "Failed to mount to %s: %m", to);
1041
1042         /* ... hence let's only make the bind mount read-only, not the
1043          * superblock. */
1044         if (read_only) {
1045                 if (mount(NULL, to, NULL, MS_BIND|MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_RDONLY, NULL) < 0)
1046                         return log_error_errno(errno, "Failed to remount %s read-only: %m", to);
1047         }
1048         return 1;
1049 }
1050
1051 static int mount_cgroup(const char *dest) {
1052         _cleanup_set_free_free_ Set *controllers = NULL;
1053         _cleanup_free_ char *own_cgroup_path = NULL;
1054         const char *cgroup_root, *systemd_root, *systemd_own;
1055         int r;
1056
1057         controllers = set_new(&string_hash_ops);
1058         if (!controllers)
1059                 return log_oom();
1060
1061         r = cg_kernel_controllers(controllers);
1062         if (r < 0)
1063                 return log_error_errno(r, "Failed to determine cgroup controllers: %m");
1064
1065         r = cg_pid_get_path(NULL, 0, &own_cgroup_path);
1066         if (r < 0)
1067                 return log_error_errno(r, "Failed to determine our own cgroup path: %m");
1068
1069         cgroup_root = strjoina(dest, "/sys/fs/cgroup");
1070         if (mount("tmpfs", cgroup_root, "tmpfs", MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME, "mode=755") < 0)
1071                 return log_error_errno(errno, "Failed to mount tmpfs to /sys/fs/cgroup: %m");
1072
1073         for (;;) {
1074                 _cleanup_free_ char *controller = NULL, *origin = NULL, *combined = NULL;
1075
1076                 controller = set_steal_first(controllers);
1077                 if (!controller)
1078                         break;
1079
1080                 origin = strappend("/sys/fs/cgroup/", controller);
1081                 if (!origin)
1082                         return log_oom();
1083
1084                 r = readlink_malloc(origin, &combined);
1085                 if (r == -EINVAL) {
1086                         /* Not a symbolic link, but directly a single cgroup hierarchy */
1087
1088                         r = mount_cgroup_hierarchy(dest, controller, controller, true);
1089                         if (r < 0)
1090                                 return r;
1091
1092                 } else if (r < 0)
1093                         return log_error_errno(r, "Failed to read link %s: %m", origin);
1094                 else {
1095                         _cleanup_free_ char *target = NULL;
1096
1097                         target = strjoin(dest, "/sys/fs/cgroup/", controller, NULL);
1098                         if (!target)
1099                                 return log_oom();
1100
1101                         /* A symbolic link, a combination of controllers in one hierarchy */
1102
1103                         if (!filename_is_valid(combined)) {
1104                                 log_warning("Ignoring invalid combined hierarchy %s.", combined);
1105                                 continue;
1106                         }
1107
1108                         r = mount_cgroup_hierarchy(dest, combined, combined, true);
1109                         if (r < 0)
1110                                 return r;
1111
1112                         if (symlink(combined, target) < 0)
1113                                 return log_error_errno(errno, "Failed to create symlink for combined hierarchy: %m");
1114                 }
1115         }
1116
1117         r = mount_cgroup_hierarchy(dest, "name=systemd,xattr", "systemd", false);
1118         if (r < 0)
1119                 return r;
1120
1121         /* Make our own cgroup a (writable) bind mount */
1122         systemd_own = strjoina(dest, "/sys/fs/cgroup/systemd", own_cgroup_path);
1123         if (mount(systemd_own, systemd_own,  NULL, MS_BIND, NULL) < 0)
1124                 return log_error_errno(errno, "Failed to turn %s into a bind mount: %m", own_cgroup_path);
1125
1126         /* And then remount the systemd cgroup root read-only */
1127         systemd_root = strjoina(dest, "/sys/fs/cgroup/systemd");
1128         if (mount(NULL, systemd_root, NULL, MS_BIND|MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_RDONLY, NULL) < 0)
1129                 return log_error_errno(errno, "Failed to mount cgroup root read-only: %m");
1130
1131         if (mount(NULL, cgroup_root, NULL, MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME|MS_RDONLY, "mode=755") < 0)
1132                 return log_error_errno(errno, "Failed to remount %s read-only: %m", cgroup_root);
1133
1134         return 0;
1135 }
1136
1137 static int mount_tmpfs(const char *dest) {
1138         char **i, **o;
1139
1140         STRV_FOREACH_PAIR(i, o, arg_tmpfs) {
1141                 _cleanup_free_ char *where = NULL;
1142                 int r;
1143
1144                 where = strappend(dest, *i);
1145                 if (!where)
1146                         return log_oom();
1147
1148                 r = mkdir_label(where, 0755);
1149                 if (r < 0 && r != -EEXIST)
1150                         return log_error_errno(r, "Creating mount point for tmpfs %s failed: %m", where);
1151
1152                 if (mount("tmpfs", where, "tmpfs", MS_NODEV|MS_STRICTATIME, *o) < 0)
1153                         return log_error_errno(errno, "tmpfs mount to %s failed: %m", where);
1154         }
1155
1156         return 0;
1157 }
1158
1159 static int setup_timezone(const char *dest) {
1160         _cleanup_free_ char *where = NULL, *p = NULL, *q = NULL, *check = NULL, *what = NULL;
1161         char *z, *y;
1162         int r;
1163
1164         assert(dest);
1165
1166         /* Fix the timezone, if possible */
1167         r = readlink_malloc("/etc/localtime", &p);
1168         if (r < 0) {
1169                 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
1170                 return 0;
1171         }
1172
1173         z = path_startswith(p, "../usr/share/zoneinfo/");
1174         if (!z)
1175                 z = path_startswith(p, "/usr/share/zoneinfo/");
1176         if (!z) {
1177                 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
1178                 return 0;
1179         }
1180
1181         where = strappend(dest, "/etc/localtime");
1182         if (!where)
1183                 return log_oom();
1184
1185         r = readlink_malloc(where, &q);
1186         if (r >= 0) {
1187                 y = path_startswith(q, "../usr/share/zoneinfo/");
1188                 if (!y)
1189                         y = path_startswith(q, "/usr/share/zoneinfo/");
1190
1191                 /* Already pointing to the right place? Then do nothing .. */
1192                 if (y && streq(y, z))
1193                         return 0;
1194         }
1195
1196         check = strjoin(dest, "/usr/share/zoneinfo/", z, NULL);
1197         if (!check)
1198                 return log_oom();
1199
1200         if (access(check, F_OK) < 0) {
1201                 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
1202                 return 0;
1203         }
1204
1205         what = strappend("../usr/share/zoneinfo/", z);
1206         if (!what)
1207                 return log_oom();
1208
1209         r = mkdir_parents(where, 0755);
1210         if (r < 0) {
1211                 log_error_errno(r, "Failed to create directory for timezone info %s in container: %m", where);
1212
1213                 return 0;
1214         }
1215
1216         r = unlink(where);
1217         if (r < 0 && errno != ENOENT) {
1218                 log_error_errno(errno, "Failed to remove existing timezone info %s in container: %m", where);
1219
1220                 return 0;
1221         }
1222
1223         if (symlink(what, where) < 0) {
1224                 log_error_errno(errno, "Failed to correct timezone of container: %m");
1225                 return 0;
1226         }
1227
1228         return 0;
1229 }
1230
1231 static int setup_resolv_conf(const char *dest) {
1232         _cleanup_free_ char *where = NULL;
1233         int r;
1234
1235         assert(dest);
1236
1237         if (arg_private_network)
1238                 return 0;
1239
1240         /* Fix resolv.conf, if possible */
1241         where = strappend(dest, "/etc/resolv.conf");
1242         if (!where)
1243                 return log_oom();
1244
1245         /* We don't really care for the results of this really. If it
1246          * fails, it fails, but meh... */
1247         r = mkdir_parents(where, 0755);
1248         if (r < 0) {
1249                 log_warning_errno(r, "Failed to create parent directory for resolv.conf %s: %m", where);
1250
1251                 return 0;
1252         }
1253
1254         r = copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW, 0644, 0);
1255         if (r < 0) {
1256                 log_warning_errno(r, "Failed to copy /etc/resolv.conf to %s: %m", where);
1257
1258                 return 0;
1259         }
1260
1261         return 0;
1262 }
1263
1264 static int setup_volatile_state(const char *directory) {
1265         const char *p;
1266         int r;
1267
1268         assert(directory);
1269
1270         if (arg_volatile != VOLATILE_STATE)
1271                 return 0;
1272
1273         /* --volatile=state means we simply overmount /var
1274            with a tmpfs, and the rest read-only. */
1275
1276         r = bind_remount_recursive(directory, true);
1277         if (r < 0)
1278                 return log_error_errno(r, "Failed to remount %s read-only: %m", directory);
1279
1280         p = strjoina(directory, "/var");
1281         r = mkdir(p, 0755);
1282         if (r < 0 && errno != EEXIST)
1283                 return log_error_errno(errno, "Failed to create %s: %m", directory);
1284
1285         if (mount("tmpfs", p, "tmpfs", MS_STRICTATIME, "mode=755") < 0)
1286                 return log_error_errno(errno, "Failed to mount tmpfs to /var: %m");
1287
1288         return 0;
1289 }
1290
1291 static int setup_volatile(const char *directory) {
1292         bool tmpfs_mounted = false, bind_mounted = false;
1293         char template[] = "/tmp/nspawn-volatile-XXXXXX";
1294         const char *f, *t;
1295         int r;
1296
1297         assert(directory);
1298
1299         if (arg_volatile != VOLATILE_YES)
1300                 return 0;
1301
1302         /* --volatile=yes means we mount a tmpfs to the root dir, and
1303            the original /usr to use inside it, and that read-only. */
1304
1305         if (!mkdtemp(template))
1306                 return log_error_errno(errno, "Failed to create temporary directory: %m");
1307
1308         if (mount("tmpfs", template, "tmpfs", MS_STRICTATIME, "mode=755") < 0) {
1309                 log_error_errno(errno, "Failed to mount tmpfs for root directory: %m");
1310                 r = -errno;
1311                 goto fail;
1312         }
1313
1314         tmpfs_mounted = true;
1315
1316         f = strjoina(directory, "/usr");
1317         t = strjoina(template, "/usr");
1318
1319         r = mkdir(t, 0755);
1320         if (r < 0 && errno != EEXIST) {
1321                 log_error_errno(errno, "Failed to create %s: %m", t);
1322                 r = -errno;
1323                 goto fail;
1324         }
1325
1326         if (mount(f, t, "bind", MS_BIND|MS_REC, NULL) < 0) {
1327                 log_error_errno(errno, "Failed to create /usr bind mount: %m");
1328                 r = -errno;
1329                 goto fail;
1330         }
1331
1332         bind_mounted = true;
1333
1334         r = bind_remount_recursive(t, true);
1335         if (r < 0) {
1336                 log_error_errno(r, "Failed to remount %s read-only: %m", t);
1337                 goto fail;
1338         }
1339
1340         if (mount(template, directory, NULL, MS_MOVE, NULL) < 0) {
1341                 log_error_errno(errno, "Failed to move root mount: %m");
1342                 r = -errno;
1343                 goto fail;
1344         }
1345
1346         rmdir(template);
1347
1348         return 0;
1349
1350 fail:
1351         if (bind_mounted)
1352                 umount(t);
1353         if (tmpfs_mounted)
1354                 umount(template);
1355         rmdir(template);
1356         return r;
1357 }
1358
1359 static char* id128_format_as_uuid(sd_id128_t id, char s[37]) {
1360
1361         snprintf(s, 37,
1362                  "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
1363                  SD_ID128_FORMAT_VAL(id));
1364
1365         return s;
1366 }
1367
1368 static int setup_boot_id(const char *dest) {
1369         _cleanup_free_ char *from = NULL, *to = NULL;
1370         sd_id128_t rnd = {};
1371         char as_uuid[37];
1372         int r;
1373
1374         assert(dest);
1375
1376         if (arg_share_system)
1377                 return 0;
1378
1379         /* Generate a new randomized boot ID, so that each boot-up of
1380          * the container gets a new one */
1381
1382         from = strappend(dest, "/dev/proc-sys-kernel-random-boot-id");
1383         to = strappend(dest, "/proc/sys/kernel/random/boot_id");
1384         if (!from || !to)
1385                 return log_oom();
1386
1387         r = sd_id128_randomize(&rnd);
1388         if (r < 0)
1389                 return log_error_errno(r, "Failed to generate random boot id: %m");
1390
1391         id128_format_as_uuid(rnd, as_uuid);
1392
1393         r = write_string_file(from, as_uuid);
1394         if (r < 0)
1395                 return log_error_errno(r, "Failed to write boot id: %m");
1396
1397         if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
1398                 log_error_errno(errno, "Failed to bind mount boot id: %m");
1399                 r = -errno;
1400         } else if (mount(from, to, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL))
1401                 log_warning_errno(errno, "Failed to make boot id read-only: %m");
1402
1403         unlink(from);
1404         return r;
1405 }
1406
1407 static int copy_devnodes(const char *dest) {
1408
1409         static const char devnodes[] =
1410                 "null\0"
1411                 "zero\0"
1412                 "full\0"
1413                 "random\0"
1414                 "urandom\0"
1415                 "tty\0"
1416                 "net/tun\0";
1417
1418         const char *d;
1419         int r = 0;
1420         _cleanup_umask_ mode_t u;
1421
1422         assert(dest);
1423
1424         u = umask(0000);
1425
1426         NULSTR_FOREACH(d, devnodes) {
1427                 _cleanup_free_ char *from = NULL, *to = NULL;
1428                 struct stat st;
1429
1430                 from = strappend("/dev/", d);
1431                 to = strjoin(dest, "/dev/", d, NULL);
1432                 if (!from || !to)
1433                         return log_oom();
1434
1435                 if (stat(from, &st) < 0) {
1436
1437                         if (errno != ENOENT)
1438                                 return log_error_errno(errno, "Failed to stat %s: %m", from);
1439
1440                 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
1441
1442                         log_error("%s is not a char or block device, cannot copy", from);
1443                         return -EIO;
1444
1445                 } else {
1446                         r = mkdir_parents(to, 0775);
1447                         if (r < 0) {
1448                                 log_error_errno(r, "Failed to create parent directory of %s: %m", to);
1449                                 return -r;
1450                         }
1451
1452                         if (mknod(to, st.st_mode, st.st_rdev) < 0)
1453                                 return log_error_errno(errno, "mknod(%s) failed: %m", to);
1454
1455                         if (arg_userns && arg_uid_shift != UID_INVALID)
1456                                 if (lchown(to, arg_uid_shift, arg_uid_shift) < 0)
1457                                         return log_error_errno(errno, "chown() of device node %s failed: %m", to);
1458                 }
1459         }
1460
1461         return r;
1462 }
1463
1464 static int setup_ptmx(const char *dest) {
1465         _cleanup_free_ char *p = NULL;
1466
1467         p = strappend(dest, "/dev/ptmx");
1468         if (!p)
1469                 return log_oom();
1470
1471         if (symlink("pts/ptmx", p) < 0)
1472                 return log_error_errno(errno, "Failed to create /dev/ptmx symlink: %m");
1473
1474         if (arg_userns && arg_uid_shift != UID_INVALID)
1475                 if (lchown(p, arg_uid_shift, arg_uid_shift) < 0)
1476                         return log_error_errno(errno, "lchown() of symlink %s failed: %m", p);
1477
1478         return 0;
1479 }
1480
1481 static int setup_dev_console(const char *dest, const char *console) {
1482         _cleanup_umask_ mode_t u;
1483         const char *to;
1484         struct stat st;
1485         int r;
1486
1487         assert(dest);
1488         assert(console);
1489
1490         u = umask(0000);
1491
1492         if (stat("/dev/null", &st) < 0)
1493                 return log_error_errno(errno, "Failed to stat /dev/null: %m");
1494
1495         r = chmod_and_chown(console, 0600, 0, 0);
1496         if (r < 0)
1497                 return log_error_errno(r, "Failed to correct access mode for TTY: %m");
1498
1499         /* We need to bind mount the right tty to /dev/console since
1500          * ptys can only exist on pts file systems. To have something
1501          * to bind mount things on we create a device node first, and
1502          * use /dev/null for that since we the cgroups device policy
1503          * allows us to create that freely, while we cannot create
1504          * /dev/console. (Note that the major minor doesn't actually
1505          * matter here, since we mount it over anyway). */
1506
1507         to = strjoina(dest, "/dev/console");
1508         if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0)
1509                 return log_error_errno(errno, "mknod() for /dev/console failed: %m");
1510
1511         if (mount(console, to, "bind", MS_BIND, NULL) < 0)
1512                 return log_error_errno(errno, "Bind mount for /dev/console failed: %m");
1513
1514         return 0;
1515 }
1516
1517 static int setup_kmsg(const char *dest, int kmsg_socket) {
1518         _cleanup_free_ char *from = NULL, *to = NULL;
1519         _cleanup_umask_ mode_t u;
1520         int r, fd, k;
1521         union {
1522                 struct cmsghdr cmsghdr;
1523                 uint8_t buf[CMSG_SPACE(sizeof(int))];
1524         } control = {};
1525         struct msghdr mh = {
1526                 .msg_control = &control,
1527                 .msg_controllen = sizeof(control),
1528         };
1529         struct cmsghdr *cmsg;
1530
1531         assert(dest);
1532         assert(kmsg_socket >= 0);
1533
1534         u = umask(0000);
1535
1536         /* We create the kmsg FIFO as /dev/kmsg, but immediately
1537          * delete it after bind mounting it to /proc/kmsg. While FIFOs
1538          * on the reading side behave very similar to /proc/kmsg,
1539          * their writing side behaves differently from /dev/kmsg in
1540          * that writing blocks when nothing is reading. In order to
1541          * avoid any problems with containers deadlocking due to this
1542          * we simply make /dev/kmsg unavailable to the container. */
1543         if (asprintf(&from, "%s/dev/kmsg", dest) < 0 ||
1544             asprintf(&to, "%s/proc/kmsg", dest) < 0)
1545                 return log_oom();
1546
1547         if (mkfifo(from, 0600) < 0)
1548                 return log_error_errno(errno, "mkfifo() for /dev/kmsg failed: %m");
1549
1550         r = chmod_and_chown(from, 0600, 0, 0);
1551         if (r < 0)
1552                 return log_error_errno(r, "Failed to correct access mode for /dev/kmsg: %m");
1553
1554         if (mount(from, to, "bind", MS_BIND, NULL) < 0)
1555                 return log_error_errno(errno, "Bind mount for /proc/kmsg failed: %m");
1556
1557         fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
1558         if (fd < 0)
1559                 return log_error_errno(errno, "Failed to open fifo: %m");
1560
1561         cmsg = CMSG_FIRSTHDR(&mh);
1562         cmsg->cmsg_level = SOL_SOCKET;
1563         cmsg->cmsg_type = SCM_RIGHTS;
1564         cmsg->cmsg_len = CMSG_LEN(sizeof(int));
1565         memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
1566
1567         mh.msg_controllen = cmsg->cmsg_len;
1568
1569         /* Store away the fd in the socket, so that it stays open as
1570          * long as we run the child */
1571         k = sendmsg(kmsg_socket, &mh, MSG_NOSIGNAL);
1572         safe_close(fd);
1573
1574         if (k < 0)
1575                 return log_error_errno(errno, "Failed to send FIFO fd: %m");
1576
1577         /* And now make the FIFO unavailable as /dev/kmsg... */
1578         unlink(from);
1579         return 0;
1580 }
1581
1582 static int send_rtnl(int send_fd) {
1583         union {
1584                 struct cmsghdr cmsghdr;
1585                 uint8_t buf[CMSG_SPACE(sizeof(int))];
1586         } control = {};
1587         struct msghdr mh = {
1588                 .msg_control = &control,
1589                 .msg_controllen = sizeof(control),
1590         };
1591         struct cmsghdr *cmsg;
1592         _cleanup_close_ int fd = -1;
1593         ssize_t k;
1594
1595         assert(send_fd >= 0);
1596
1597         if (!arg_expose_ports)
1598                 return 0;
1599
1600         fd = socket(PF_NETLINK, SOCK_RAW|SOCK_CLOEXEC|SOCK_NONBLOCK, NETLINK_ROUTE);
1601         if (fd < 0)
1602                 return log_error_errno(errno, "failed to allocate container netlink: %m");
1603
1604         cmsg = CMSG_FIRSTHDR(&mh);
1605         cmsg->cmsg_level = SOL_SOCKET;
1606         cmsg->cmsg_type = SCM_RIGHTS;
1607         cmsg->cmsg_len = CMSG_LEN(sizeof(int));
1608         memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
1609
1610         mh.msg_controllen = cmsg->cmsg_len;
1611
1612         /* Store away the fd in the socket, so that it stays open as
1613          * long as we run the child */
1614         k = sendmsg(send_fd, &mh, MSG_NOSIGNAL);
1615         if (k < 0)
1616                 return log_error_errno(errno, "Failed to send netlink fd: %m");
1617
1618         return 0;
1619 }
1620
1621 static int flush_ports(union in_addr_union *exposed) {
1622         ExposePort *p;
1623         int r, af = AF_INET;
1624
1625         assert(exposed);
1626
1627         if (!arg_expose_ports)
1628                 return 0;
1629
1630         if (in_addr_is_null(af, exposed))
1631                 return 0;
1632
1633         log_debug("Lost IP address.");
1634
1635         LIST_FOREACH(ports, p, arg_expose_ports) {
1636                 r = fw_add_local_dnat(false,
1637                                       af,
1638                                       p->protocol,
1639                                       NULL,
1640                                       NULL, 0,
1641                                       NULL, 0,
1642                                       p->host_port,
1643                                       exposed,
1644                                       p->container_port,
1645                                       NULL);
1646                 if (r < 0)
1647                         log_warning_errno(r, "Failed to modify firewall: %m");
1648         }
1649
1650         *exposed = IN_ADDR_NULL;
1651         return 0;
1652 }
1653
1654 static int expose_ports(sd_rtnl *rtnl, union in_addr_union *exposed) {
1655         _cleanup_free_ struct local_address *addresses = NULL;
1656         _cleanup_free_ char *pretty = NULL;
1657         union in_addr_union new_exposed;
1658         ExposePort *p;
1659         bool add;
1660         int af = AF_INET, r;
1661
1662         assert(exposed);
1663
1664         /* Invoked each time an address is added or removed inside the
1665          * container */
1666
1667         if (!arg_expose_ports)
1668                 return 0;
1669
1670         r = local_addresses(rtnl, 0, af, &addresses);
1671         if (r < 0)
1672                 return log_error_errno(r, "Failed to enumerate local addresses: %m");
1673
1674         add = r > 0 &&
1675                 addresses[0].family == af &&
1676                 addresses[0].scope < RT_SCOPE_LINK;
1677
1678         if (!add)
1679                 return flush_ports(exposed);
1680
1681         new_exposed = addresses[0].address;
1682         if (in_addr_equal(af, exposed, &new_exposed))
1683                 return 0;
1684
1685         in_addr_to_string(af, &new_exposed, &pretty);
1686         log_debug("New container IP is %s.", strna(pretty));
1687
1688         LIST_FOREACH(ports, p, arg_expose_ports) {
1689
1690                 r = fw_add_local_dnat(true,
1691                                       af,
1692                                       p->protocol,
1693                                       NULL,
1694                                       NULL, 0,
1695                                       NULL, 0,
1696                                       p->host_port,
1697                                       &new_exposed,
1698                                       p->container_port,
1699                                       in_addr_is_null(af, exposed) ? NULL : exposed);
1700                 if (r < 0)
1701                         log_warning_errno(r, "Failed to modify firewall: %m");
1702         }
1703
1704         *exposed = new_exposed;
1705         return 0;
1706 }
1707
1708 static int on_address_change(sd_rtnl *rtnl, sd_rtnl_message *m, void *userdata) {
1709         union in_addr_union *exposed = userdata;
1710
1711         assert(rtnl);
1712         assert(m);
1713         assert(exposed);
1714
1715         expose_ports(rtnl, exposed);
1716         return 0;
1717 }
1718
1719 static int watch_rtnl(sd_event *event, int recv_fd, union in_addr_union *exposed, sd_rtnl **ret) {
1720         union {
1721                 struct cmsghdr cmsghdr;
1722                 uint8_t buf[CMSG_SPACE(sizeof(int))];
1723         } control = {};
1724         struct msghdr mh = {
1725                 .msg_control = &control,
1726                 .msg_controllen = sizeof(control),
1727         };
1728         struct cmsghdr *cmsg;
1729         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1730         int fd, r;
1731         ssize_t k;
1732
1733         assert(event);
1734         assert(recv_fd >= 0);
1735         assert(ret);
1736
1737         if (!arg_expose_ports)
1738                 return 0;
1739
1740         k = recvmsg(recv_fd, &mh, MSG_NOSIGNAL);
1741         if (k < 0)
1742                 return log_error_errno(errno, "Failed to recv netlink fd: %m");
1743
1744         cmsg = CMSG_FIRSTHDR(&mh);
1745         assert(cmsg->cmsg_level == SOL_SOCKET);
1746         assert(cmsg->cmsg_type == SCM_RIGHTS);
1747         assert(cmsg->cmsg_len == CMSG_LEN(sizeof(int)));
1748         memcpy(&fd, CMSG_DATA(cmsg), sizeof(int));
1749
1750         r = sd_rtnl_open_fd(&rtnl, fd, 1, RTNLGRP_IPV4_IFADDR);
1751         if (r < 0) {
1752                 safe_close(fd);
1753                 return log_error_errno(r, "Failed to create rtnl object: %m");
1754         }
1755
1756         r = sd_rtnl_add_match(rtnl, RTM_NEWADDR, on_address_change, exposed);
1757         if (r < 0)
1758                 return log_error_errno(r, "Failed to subscribe to RTM_NEWADDR messages: %m");
1759
1760         r = sd_rtnl_add_match(rtnl, RTM_DELADDR, on_address_change, exposed);
1761         if (r < 0)
1762                 return log_error_errno(r, "Failed to subscribe to RTM_DELADDR messages: %m");
1763
1764         r = sd_rtnl_attach_event(rtnl, event, 0);
1765         if (r < 0)
1766                 return log_error_errno(r, "Failed to add to even loop: %m");
1767
1768         *ret = rtnl;
1769         rtnl = NULL;
1770
1771         return 0;
1772 }
1773
1774 static int setup_hostname(void) {
1775
1776         if (arg_share_system)
1777                 return 0;
1778
1779         if (sethostname_idempotent(arg_machine) < 0)
1780                 return -errno;
1781
1782         return 0;
1783 }
1784
1785 static int setup_journal(const char *directory) {
1786         sd_id128_t machine_id, this_id;
1787         _cleanup_free_ char *p = NULL, *b = NULL, *q = NULL, *d = NULL;
1788         char *id;
1789         int r;
1790
1791         /* Don't link journals in ephemeral mode */
1792         if (arg_ephemeral)
1793                 return 0;
1794
1795         p = strappend(directory, "/etc/machine-id");
1796         if (!p)
1797                 return log_oom();
1798
1799         r = read_one_line_file(p, &b);
1800         if (r == -ENOENT && arg_link_journal == LINK_AUTO)
1801                 return 0;
1802         else if (r < 0)
1803                 return log_error_errno(r, "Failed to read machine ID from %s: %m", p);
1804
1805         id = strstrip(b);
1806         if (isempty(id) && arg_link_journal == LINK_AUTO)
1807                 return 0;
1808
1809         /* Verify validity */
1810         r = sd_id128_from_string(id, &machine_id);
1811         if (r < 0)
1812                 return log_error_errno(r, "Failed to parse machine ID from %s: %m", p);
1813
1814         r = sd_id128_get_machine(&this_id);
1815         if (r < 0)
1816                 return log_error_errno(r, "Failed to retrieve machine ID: %m");
1817
1818         if (sd_id128_equal(machine_id, this_id)) {
1819                 log_full(arg_link_journal == LINK_AUTO ? LOG_WARNING : LOG_ERR,
1820                          "Host and machine ids are equal (%s): refusing to link journals", id);
1821                 if (arg_link_journal == LINK_AUTO)
1822                         return 0;
1823                 return -EEXIST;
1824         }
1825
1826         if (arg_link_journal == LINK_NO)
1827                 return 0;
1828
1829         free(p);
1830         p = strappend("/var/log/journal/", id);
1831         q = strjoin(directory, "/var/log/journal/", id, NULL);
1832         if (!p || !q)
1833                 return log_oom();
1834
1835         if (path_is_mount_point(p, false) > 0) {
1836                 if (arg_link_journal != LINK_AUTO) {
1837                         log_error("%s: already a mount point, refusing to use for journal", p);
1838                         return -EEXIST;
1839                 }
1840
1841                 return 0;
1842         }
1843
1844         if (path_is_mount_point(q, false) > 0) {
1845                 if (arg_link_journal != LINK_AUTO) {
1846                         log_error("%s: already a mount point, refusing to use for journal", q);
1847                         return -EEXIST;
1848                 }
1849
1850                 return 0;
1851         }
1852
1853         r = readlink_and_make_absolute(p, &d);
1854         if (r >= 0) {
1855                 if ((arg_link_journal == LINK_GUEST ||
1856                      arg_link_journal == LINK_AUTO) &&
1857                     path_equal(d, q)) {
1858
1859                         r = mkdir_p(q, 0755);
1860                         if (r < 0)
1861                                 log_warning_errno(errno, "Failed to create directory %s: %m", q);
1862                         return 0;
1863                 }
1864
1865                 if (unlink(p) < 0)
1866                         return log_error_errno(errno, "Failed to remove symlink %s: %m", p);
1867         } else if (r == -EINVAL) {
1868
1869                 if (arg_link_journal == LINK_GUEST &&
1870                     rmdir(p) < 0) {
1871
1872                         if (errno == ENOTDIR) {
1873                                 log_error("%s already exists and is neither a symlink nor a directory", p);
1874                                 return r;
1875                         } else {
1876                                 log_error_errno(errno, "Failed to remove %s: %m", p);
1877                                 return -errno;
1878                         }
1879                 }
1880         } else if (r != -ENOENT) {
1881                 log_error_errno(errno, "readlink(%s) failed: %m", p);
1882                 return r;
1883         }
1884
1885         if (arg_link_journal == LINK_GUEST) {
1886
1887                 if (symlink(q, p) < 0) {
1888                         if (arg_link_journal_try) {
1889                                 log_debug_errno(errno, "Failed to symlink %s to %s, skipping journal setup: %m", q, p);
1890                                 return 0;
1891                         } else {
1892                                 log_error_errno(errno, "Failed to symlink %s to %s: %m", q, p);
1893                                 return -errno;
1894                         }
1895                 }
1896
1897                 r = mkdir_p(q, 0755);
1898                 if (r < 0)
1899                         log_warning_errno(errno, "Failed to create directory %s: %m", q);
1900                 return 0;
1901         }
1902
1903         if (arg_link_journal == LINK_HOST) {
1904                 /* don't create parents here -- if the host doesn't have
1905                  * permanent journal set up, don't force it here */
1906                 r = mkdir(p, 0755);
1907                 if (r < 0) {
1908                         if (arg_link_journal_try) {
1909                                 log_debug_errno(errno, "Failed to create %s, skipping journal setup: %m", p);
1910                                 return 0;
1911                         } else {
1912                                 log_error_errno(errno, "Failed to create %s: %m", p);
1913                                 return r;
1914                         }
1915                 }
1916
1917         } else if (access(p, F_OK) < 0)
1918                 return 0;
1919
1920         if (dir_is_empty(q) == 0)
1921                 log_warning("%s is not empty, proceeding anyway.", q);
1922
1923         r = mkdir_p(q, 0755);
1924         if (r < 0) {
1925                 log_error_errno(errno, "Failed to create %s: %m", q);
1926                 return r;
1927         }
1928
1929         if (mount(p, q, "bind", MS_BIND, NULL) < 0)
1930                 return log_error_errno(errno, "Failed to bind mount journal from host into guest: %m");
1931
1932         return 0;
1933 }
1934
1935 static int drop_capabilities(void) {
1936         return capability_bounding_set_drop(~arg_retain, false);
1937 }
1938
1939 static int register_machine(pid_t pid, int local_ifindex) {
1940         _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1941         _cleanup_bus_close_unref_ sd_bus *bus = NULL;
1942         int r;
1943
1944         if (!arg_register)
1945                 return 0;
1946
1947         r = sd_bus_default_system(&bus);
1948         if (r < 0)
1949                 return log_error_errno(r, "Failed to open system bus: %m");
1950
1951         if (arg_keep_unit) {
1952                 r = sd_bus_call_method(
1953                                 bus,
1954                                 "org.freedesktop.machine1",
1955                                 "/org/freedesktop/machine1",
1956                                 "org.freedesktop.machine1.Manager",
1957                                 "RegisterMachineWithNetwork",
1958                                 &error,
1959                                 NULL,
1960                                 "sayssusai",
1961                                 arg_machine,
1962                                 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1963                                 "nspawn",
1964                                 "container",
1965                                 (uint32_t) pid,
1966                                 strempty(arg_directory),
1967                                 local_ifindex > 0 ? 1 : 0, local_ifindex);
1968         } else {
1969                 _cleanup_bus_message_unref_ sd_bus_message *m = NULL;
1970                 char **i;
1971
1972                 r = sd_bus_message_new_method_call(
1973                                 bus,
1974                                 &m,
1975                                 "org.freedesktop.machine1",
1976                                 "/org/freedesktop/machine1",
1977                                 "org.freedesktop.machine1.Manager",
1978                                 "CreateMachineWithNetwork");
1979                 if (r < 0)
1980                         return bus_log_create_error(r);
1981
1982                 r = sd_bus_message_append(
1983                                 m,
1984                                 "sayssusai",
1985                                 arg_machine,
1986                                 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1987                                 "nspawn",
1988                                 "container",
1989                                 (uint32_t) pid,
1990                                 strempty(arg_directory),
1991                                 local_ifindex > 0 ? 1 : 0, local_ifindex);
1992                 if (r < 0)
1993                         return bus_log_create_error(r);
1994
1995                 r = sd_bus_message_open_container(m, 'a', "(sv)");
1996                 if (r < 0)
1997                         return bus_log_create_error(r);
1998
1999                 if (!isempty(arg_slice)) {
2000                         r = sd_bus_message_append(m, "(sv)", "Slice", "s", arg_slice);
2001                         if (r < 0)
2002                                 return bus_log_create_error(r);
2003                 }
2004
2005                 r = sd_bus_message_append(m, "(sv)", "DevicePolicy", "s", "strict");
2006                 if (r < 0)
2007                         return bus_log_create_error(r);
2008
2009                 r = sd_bus_message_append(m, "(sv)", "DeviceAllow", "a(ss)", 9,
2010                                           /* Allow the container to
2011                                            * access and create the API
2012                                            * device nodes, so that
2013                                            * PrivateDevices= in the
2014                                            * container can work
2015                                            * fine */
2016                                           "/dev/null", "rwm",
2017                                           "/dev/zero", "rwm",
2018                                           "/dev/full", "rwm",
2019                                           "/dev/random", "rwm",
2020                                           "/dev/urandom", "rwm",
2021                                           "/dev/tty", "rwm",
2022                                           "/dev/net/tun", "rwm",
2023                                           /* Allow the container
2024                                            * access to ptys. However,
2025                                            * do not permit the
2026                                            * container to ever create
2027                                            * these device nodes. */
2028                                           "/dev/pts/ptmx", "rw",
2029                                           "char-pts", "rw");
2030                 if (r < 0)
2031                         return log_error_errno(r, "Failed to add device whitelist: %m");
2032
2033                 STRV_FOREACH(i, arg_property) {
2034                         r = sd_bus_message_open_container(m, 'r', "sv");
2035                         if (r < 0)
2036                                 return bus_log_create_error(r);
2037
2038                         r = bus_append_unit_property_assignment(m, *i);
2039                         if (r < 0)
2040                                 return r;
2041
2042                         r = sd_bus_message_close_container(m);
2043                         if (r < 0)
2044                                 return bus_log_create_error(r);
2045                 }
2046
2047                 r = sd_bus_message_close_container(m);
2048                 if (r < 0)
2049                         return bus_log_create_error(r);
2050
2051                 r = sd_bus_call(bus, m, 0, &error, NULL);
2052         }
2053
2054         if (r < 0) {
2055                 log_error("Failed to register machine: %s", bus_error_message(&error, r));
2056                 return r;
2057         }
2058
2059         return 0;
2060 }
2061
2062 static int terminate_machine(pid_t pid) {
2063         _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
2064         _cleanup_bus_message_unref_ sd_bus_message *reply = NULL;
2065         _cleanup_bus_close_unref_ sd_bus *bus = NULL;
2066         const char *path;
2067         int r;
2068
2069         if (!arg_register)
2070                 return 0;
2071
2072         r = sd_bus_default_system(&bus);
2073         if (r < 0)
2074                 return log_error_errno(r, "Failed to open system bus: %m");
2075
2076         r = sd_bus_call_method(
2077                         bus,
2078                         "org.freedesktop.machine1",
2079                         "/org/freedesktop/machine1",
2080                         "org.freedesktop.machine1.Manager",
2081                         "GetMachineByPID",
2082                         &error,
2083                         &reply,
2084                         "u",
2085                         (uint32_t) pid);
2086         if (r < 0) {
2087                 /* Note that the machine might already have been
2088                  * cleaned up automatically, hence don't consider it a
2089                  * failure if we cannot get the machine object. */
2090                 log_debug("Failed to get machine: %s", bus_error_message(&error, r));
2091                 return 0;
2092         }
2093
2094         r = sd_bus_message_read(reply, "o", &path);
2095         if (r < 0)
2096                 return bus_log_parse_error(r);
2097
2098         r = sd_bus_call_method(
2099                         bus,
2100                         "org.freedesktop.machine1",
2101                         path,
2102                         "org.freedesktop.machine1.Machine",
2103                         "Terminate",
2104                         &error,
2105                         NULL,
2106                         NULL);
2107         if (r < 0) {
2108                 log_debug("Failed to terminate machine: %s", bus_error_message(&error, r));
2109                 return 0;
2110         }
2111
2112         return 0;
2113 }
2114
2115 static int reset_audit_loginuid(void) {
2116         _cleanup_free_ char *p = NULL;
2117         int r;
2118
2119         if (arg_share_system)
2120                 return 0;
2121
2122         r = read_one_line_file("/proc/self/loginuid", &p);
2123         if (r == -ENOENT)
2124                 return 0;
2125         if (r < 0)
2126                 return log_error_errno(r, "Failed to read /proc/self/loginuid: %m");
2127
2128         /* Already reset? */
2129         if (streq(p, "4294967295"))
2130                 return 0;
2131
2132         r = write_string_file("/proc/self/loginuid", "4294967295");
2133         if (r < 0) {
2134                 log_error("Failed to reset audit login UID. This probably means that your kernel is too\n"
2135                           "old and you have audit enabled. Note that the auditing subsystem is known to\n"
2136                           "be incompatible with containers on old kernels. Please make sure to upgrade\n"
2137                           "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
2138                           "using systemd-nspawn. Sleeping for 5s... (%s)\n", strerror(-r));
2139
2140                 sleep(5);
2141         }
2142
2143         return 0;
2144 }
2145
2146 #define HOST_HASH_KEY SD_ID128_MAKE(1a,37,6f,c7,46,ec,45,0b,ad,a3,d5,31,06,60,5d,b1)
2147 #define CONTAINER_HASH_KEY SD_ID128_MAKE(c3,c4,f9,19,b5,57,b2,1c,e6,cf,14,27,03,9c,ee,a2)
2148 #define MACVLAN_HASH_KEY SD_ID128_MAKE(00,13,6d,bc,66,83,44,81,bb,0c,f9,51,1f,24,a6,6f)
2149
2150 static int generate_mac(struct ether_addr *mac, sd_id128_t hash_key, uint64_t idx) {
2151         uint8_t result[8];
2152         size_t l, sz;
2153         uint8_t *v, *i;
2154         int r;
2155
2156         l = strlen(arg_machine);
2157         sz = sizeof(sd_id128_t) + l;
2158         if (idx > 0)
2159                 sz += sizeof(idx);
2160
2161         v = alloca(sz);
2162
2163         /* fetch some persistent data unique to the host */
2164         r = sd_id128_get_machine((sd_id128_t*) v);
2165         if (r < 0)
2166                 return r;
2167
2168         /* combine with some data unique (on this host) to this
2169          * container instance */
2170         i = mempcpy(v + sizeof(sd_id128_t), arg_machine, l);
2171         if (idx > 0) {
2172                 idx = htole64(idx);
2173                 memcpy(i, &idx, sizeof(idx));
2174         }
2175
2176         /* Let's hash the host machine ID plus the container name. We
2177          * use a fixed, but originally randomly created hash key here. */
2178         siphash24(result, v, sz, hash_key.bytes);
2179
2180         assert_cc(ETH_ALEN <= sizeof(result));
2181         memcpy(mac->ether_addr_octet, result, ETH_ALEN);
2182
2183         /* see eth_random_addr in the kernel */
2184         mac->ether_addr_octet[0] &= 0xfe;        /* clear multicast bit */
2185         mac->ether_addr_octet[0] |= 0x02;        /* set local assignment bit (IEEE802) */
2186
2187         return 0;
2188 }
2189
2190 static int setup_veth(pid_t pid, char iface_name[IFNAMSIZ], int *ifi) {
2191         _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
2192         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
2193         struct ether_addr mac_host, mac_container;
2194         int r, i;
2195
2196         if (!arg_private_network)
2197                 return 0;
2198
2199         if (!arg_network_veth)
2200                 return 0;
2201
2202         /* Use two different interface name prefixes depending whether
2203          * we are in bridge mode or not. */
2204         snprintf(iface_name, IFNAMSIZ - 1, "%s-%s",
2205                  arg_network_bridge ? "vb" : "ve", arg_machine);
2206
2207         r = generate_mac(&mac_container, CONTAINER_HASH_KEY, 0);
2208         if (r < 0)
2209                 return log_error_errno(r, "Failed to generate predictable MAC address for container side: %m");
2210
2211         r = generate_mac(&mac_host, HOST_HASH_KEY, 0);
2212         if (r < 0)
2213                 return log_error_errno(r, "Failed to generate predictable MAC address for host side: %m");
2214
2215         r = sd_rtnl_open(&rtnl, 0);
2216         if (r < 0)
2217                 return log_error_errno(r, "Failed to connect to netlink: %m");
2218
2219         r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
2220         if (r < 0)
2221                 return log_error_errno(r, "Failed to allocate netlink message: %m");
2222
2223         r = sd_rtnl_message_append_string(m, IFLA_IFNAME, iface_name);
2224         if (r < 0)
2225                 return log_error_errno(r, "Failed to add netlink interface name: %m");
2226
2227         r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac_host);
2228         if (r < 0)
2229                 return log_error_errno(r, "Failed to add netlink MAC address: %m");
2230
2231         r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
2232         if (r < 0)
2233                 return log_error_errno(r, "Failed to open netlink container: %m");
2234
2235         r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "veth");
2236         if (r < 0)
2237                 return log_error_errno(r, "Failed to open netlink container: %m");
2238
2239         r = sd_rtnl_message_open_container(m, VETH_INFO_PEER);
2240         if (r < 0)
2241                 return log_error_errno(r, "Failed to open netlink container: %m");
2242
2243         r = sd_rtnl_message_append_string(m, IFLA_IFNAME, "host0");
2244         if (r < 0)
2245                 return log_error_errno(r, "Failed to add netlink interface name: %m");
2246
2247         r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac_container);
2248         if (r < 0)
2249                 return log_error_errno(r, "Failed to add netlink MAC address: %m");
2250
2251         r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
2252         if (r < 0)
2253                 return log_error_errno(r, "Failed to add netlink namespace field: %m");
2254
2255         r = sd_rtnl_message_close_container(m);
2256         if (r < 0)
2257                 return log_error_errno(r, "Failed to close netlink container: %m");
2258
2259         r = sd_rtnl_message_close_container(m);
2260         if (r < 0)
2261                 return log_error_errno(r, "Failed to close netlink container: %m");
2262
2263         r = sd_rtnl_message_close_container(m);
2264         if (r < 0)
2265                 return log_error_errno(r, "Failed to close netlink container: %m");
2266
2267         r = sd_rtnl_call(rtnl, m, 0, NULL);
2268         if (r < 0)
2269                 return log_error_errno(r, "Failed to add new veth interfaces: %m");
2270
2271         i = (int) if_nametoindex(iface_name);
2272         if (i <= 0)
2273                 return log_error_errno(errno, "Failed to resolve interface %s: %m", iface_name);
2274
2275         *ifi = i;
2276
2277         return 0;
2278 }
2279
2280 static int setup_bridge(const char veth_name[], int *ifi) {
2281         _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
2282         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
2283         int r, bridge;
2284
2285         if (!arg_private_network)
2286                 return 0;
2287
2288         if (!arg_network_veth)
2289                 return 0;
2290
2291         if (!arg_network_bridge)
2292                 return 0;
2293
2294         bridge = (int) if_nametoindex(arg_network_bridge);
2295         if (bridge <= 0)
2296                 return log_error_errno(errno, "Failed to resolve interface %s: %m", arg_network_bridge);
2297
2298         *ifi = bridge;
2299
2300         r = sd_rtnl_open(&rtnl, 0);
2301         if (r < 0)
2302                 return log_error_errno(r, "Failed to connect to netlink: %m");
2303
2304         r = sd_rtnl_message_new_link(rtnl, &m, RTM_SETLINK, 0);
2305         if (r < 0)
2306                 return log_error_errno(r, "Failed to allocate netlink message: %m");
2307
2308         r = sd_rtnl_message_link_set_flags(m, IFF_UP, IFF_UP);
2309         if (r < 0)
2310                 return log_error_errno(r, "Failed to set IFF_UP flag: %m");
2311
2312         r = sd_rtnl_message_append_string(m, IFLA_IFNAME, veth_name);
2313         if (r < 0)
2314                 return log_error_errno(r, "Failed to add netlink interface name field: %m");
2315
2316         r = sd_rtnl_message_append_u32(m, IFLA_MASTER, bridge);
2317         if (r < 0)
2318                 return log_error_errno(r, "Failed to add netlink master field: %m");
2319
2320         r = sd_rtnl_call(rtnl, m, 0, NULL);
2321         if (r < 0)
2322                 return log_error_errno(r, "Failed to add veth interface to bridge: %m");
2323
2324         return 0;
2325 }
2326
2327 static int parse_interface(struct udev *udev, const char *name) {
2328         _cleanup_udev_device_unref_ struct udev_device *d = NULL;
2329         char ifi_str[2 + DECIMAL_STR_MAX(int)];
2330         int ifi;
2331
2332         ifi = (int) if_nametoindex(name);
2333         if (ifi <= 0)
2334                 return log_error_errno(errno, "Failed to resolve interface %s: %m", name);
2335
2336         sprintf(ifi_str, "n%i", ifi);
2337         d = udev_device_new_from_device_id(udev, ifi_str);
2338         if (!d)
2339                 return log_error_errno(errno, "Failed to get udev device for interface %s: %m", name);
2340
2341         if (udev_device_get_is_initialized(d) <= 0) {
2342                 log_error("Network interface %s is not initialized yet.", name);
2343                 return -EBUSY;
2344         }
2345
2346         return ifi;
2347 }
2348
2349 static int move_network_interfaces(pid_t pid) {
2350         _cleanup_udev_unref_ struct udev *udev = NULL;
2351         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
2352         char **i;
2353         int r;
2354
2355         if (!arg_private_network)
2356                 return 0;
2357
2358         if (strv_isempty(arg_network_interfaces))
2359                 return 0;
2360
2361         r = sd_rtnl_open(&rtnl, 0);
2362         if (r < 0)
2363                 return log_error_errno(r, "Failed to connect to netlink: %m");
2364
2365         udev = udev_new();
2366         if (!udev) {
2367                 log_error("Failed to connect to udev.");
2368                 return -ENOMEM;
2369         }
2370
2371         STRV_FOREACH(i, arg_network_interfaces) {
2372                 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
2373                 int ifi;
2374
2375                 ifi = parse_interface(udev, *i);
2376                 if (ifi < 0)
2377                         return ifi;
2378
2379                 r = sd_rtnl_message_new_link(rtnl, &m, RTM_SETLINK, ifi);
2380                 if (r < 0)
2381                         return log_error_errno(r, "Failed to allocate netlink message: %m");
2382
2383                 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
2384                 if (r < 0)
2385                         return log_error_errno(r, "Failed to append namespace PID to netlink message: %m");
2386
2387                 r = sd_rtnl_call(rtnl, m, 0, NULL);
2388                 if (r < 0)
2389                         return log_error_errno(r, "Failed to move interface %s to namespace: %m", *i);
2390         }
2391
2392         return 0;
2393 }
2394
2395 static int setup_macvlan(pid_t pid) {
2396         _cleanup_udev_unref_ struct udev *udev = NULL;
2397         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
2398         unsigned idx = 0;
2399         char **i;
2400         int r;
2401
2402         if (!arg_private_network)
2403                 return 0;
2404
2405         if (strv_isempty(arg_network_macvlan))
2406                 return 0;
2407
2408         r = sd_rtnl_open(&rtnl, 0);
2409         if (r < 0)
2410                 return log_error_errno(r, "Failed to connect to netlink: %m");
2411
2412         udev = udev_new();
2413         if (!udev) {
2414                 log_error("Failed to connect to udev.");
2415                 return -ENOMEM;
2416         }
2417
2418         STRV_FOREACH(i, arg_network_macvlan) {
2419                 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
2420                 _cleanup_free_ char *n = NULL;
2421                 struct ether_addr mac;
2422                 int ifi;
2423
2424                 ifi = parse_interface(udev, *i);
2425                 if (ifi < 0)
2426                         return ifi;
2427
2428                 r = generate_mac(&mac, MACVLAN_HASH_KEY, idx++);
2429                 if (r < 0)
2430                         return log_error_errno(r, "Failed to create MACVLAN MAC address: %m");
2431
2432                 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
2433                 if (r < 0)
2434                         return log_error_errno(r, "Failed to allocate netlink message: %m");
2435
2436                 r = sd_rtnl_message_append_u32(m, IFLA_LINK, ifi);
2437                 if (r < 0)
2438                         return log_error_errno(r, "Failed to add netlink interface index: %m");
2439
2440                 n = strappend("mv-", *i);
2441                 if (!n)
2442                         return log_oom();
2443
2444                 strshorten(n, IFNAMSIZ-1);
2445
2446                 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, n);
2447                 if (r < 0)
2448                         return log_error_errno(r, "Failed to add netlink interface name: %m");
2449
2450                 r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac);
2451                 if (r < 0)
2452                         return log_error_errno(r, "Failed to add netlink MAC address: %m");
2453
2454                 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
2455                 if (r < 0)
2456                         return log_error_errno(r, "Failed to add netlink namespace field: %m");
2457
2458                 r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
2459                 if (r < 0)
2460                         return log_error_errno(r, "Failed to open netlink container: %m");
2461
2462                 r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "macvlan");
2463                 if (r < 0)
2464                         return log_error_errno(r, "Failed to open netlink container: %m");
2465
2466                 r = sd_rtnl_message_append_u32(m, IFLA_MACVLAN_MODE, MACVLAN_MODE_BRIDGE);
2467                 if (r < 0)
2468                         return log_error_errno(r, "Failed to append macvlan mode: %m");
2469
2470                 r = sd_rtnl_message_close_container(m);
2471                 if (r < 0)
2472                         return log_error_errno(r, "Failed to close netlink container: %m");
2473
2474                 r = sd_rtnl_message_close_container(m);
2475                 if (r < 0)
2476                         return log_error_errno(r, "Failed to close netlink container: %m");
2477
2478                 r = sd_rtnl_call(rtnl, m, 0, NULL);
2479                 if (r < 0)
2480                         return log_error_errno(r, "Failed to add new macvlan interfaces: %m");
2481         }
2482
2483         return 0;
2484 }
2485
2486 static int setup_ipvlan(pid_t pid) {
2487         _cleanup_udev_unref_ struct udev *udev = NULL;
2488         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
2489         char **i;
2490         int r;
2491
2492         if (!arg_private_network)
2493                 return 0;
2494
2495         if (strv_isempty(arg_network_ipvlan))
2496                 return 0;
2497
2498         r = sd_rtnl_open(&rtnl, 0);
2499         if (r < 0)
2500                 return log_error_errno(r, "Failed to connect to netlink: %m");
2501
2502         udev = udev_new();
2503         if (!udev) {
2504                 log_error("Failed to connect to udev.");
2505                 return -ENOMEM;
2506         }
2507
2508         STRV_FOREACH(i, arg_network_ipvlan) {
2509                 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
2510                 _cleanup_free_ char *n = NULL;
2511                 int ifi;
2512
2513                 ifi = parse_interface(udev, *i);
2514                 if (ifi < 0)
2515                         return ifi;
2516
2517                 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
2518                 if (r < 0)
2519                         return log_error_errno(r, "Failed to allocate netlink message: %m");
2520
2521                 r = sd_rtnl_message_append_u32(m, IFLA_LINK, ifi);
2522                 if (r < 0)
2523                         return log_error_errno(r, "Failed to add netlink interface index: %m");
2524
2525                 n = strappend("iv-", *i);
2526                 if (!n)
2527                         return log_oom();
2528
2529                 strshorten(n, IFNAMSIZ-1);
2530
2531                 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, n);
2532                 if (r < 0)
2533                         return log_error_errno(r, "Failed to add netlink interface name: %m");
2534
2535                 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
2536                 if (r < 0)
2537                         return log_error_errno(r, "Failed to add netlink namespace field: %m");
2538
2539                 r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
2540                 if (r < 0)
2541                         return log_error_errno(r, "Failed to open netlink container: %m");
2542
2543                 r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "ipvlan");
2544                 if (r < 0)
2545                         return log_error_errno(r, "Failed to open netlink container: %m");
2546
2547                 r = sd_rtnl_message_append_u16(m, IFLA_IPVLAN_MODE, IPVLAN_MODE_L2);
2548                 if (r < 0)
2549                         return log_error_errno(r, "Failed to add ipvlan mode: %m");
2550
2551                 r = sd_rtnl_message_close_container(m);
2552                 if (r < 0)
2553                         return log_error_errno(r, "Failed to close netlink container: %m");
2554
2555                 r = sd_rtnl_message_close_container(m);
2556                 if (r < 0)
2557                         return log_error_errno(r, "Failed to close netlink container: %m");
2558
2559                 r = sd_rtnl_call(rtnl, m, 0, NULL);
2560                 if (r < 0)
2561                         return log_error_errno(r, "Failed to add new ipvlan interfaces: %m");
2562         }
2563
2564         return 0;
2565 }
2566
/* Builds and loads a default-allow seccomp filter that makes a fixed set
 * of syscalls fail with EPERM, additionally the kernel module syscalls
 * unless CAP_SYS_MODULE is part of arg_retain, and makes creation of
 * NETLINK_AUDIT sockets fail with EAFNOSUPPORT.
 *
 * Compiled without HAVE_SECCOMP this is a no-op returning 0. Otherwise
 * returns the result of the last libseccomp call (negative on failure);
 * the filter context is released in either case. */
static int setup_seccomp(void) {

#ifdef HAVE_SECCOMP
        /* Refused unconditionally */
        static const int always_denied[] = {
                SCMP_SYS(kexec_load),
                SCMP_SYS(open_by_handle_at),
                SCMP_SYS(iopl),
                SCMP_SYS(ioperm),
                SCMP_SYS(swapon),
                SCMP_SYS(swapoff),
        };

        /* Refused only if CAP_SYS_MODULE was not asked to be retained */
        static const int module_denied[] = {
                SCMP_SYS(init_module),
                SCMP_SYS(finit_module),
                SCMP_SYS(delete_module),
        };

        const bool deny_modules = !(arg_retain & (1ULL << CAP_SYS_MODULE));
        scmp_filter_ctx ctx;
        unsigned k;
        int r;

        ctx = seccomp_init(SCMP_ACT_ALLOW);
        if (!ctx)
                return log_oom();

        r = seccomp_add_secondary_archs(ctx);
        if (r < 0) {
                log_error_errno(r, "Failed to add secondary archs to seccomp filter: %m");
                goto finish;
        }

        for (k = 0; k < ELEMENTSOF(always_denied); k++) {
                r = seccomp_rule_add(ctx, SCMP_ACT_ERRNO(EPERM), always_denied[k], 0);

                /* -EFAULT is treated as "unknown syscall" and silently skipped */
                if (r < 0 && r != -EFAULT) {
                        log_error_errno(r, "Failed to block syscall: %m");
                        goto finish;
                }
        }

        if (deny_modules) {
                for (k = 0; k < ELEMENTSOF(module_denied); k++) {
                        r = seccomp_rule_add(ctx, SCMP_ACT_ERRNO(EPERM), module_denied[k], 0);

                        /* -EFAULT is treated as "unknown syscall" and silently skipped */
                        if (r < 0 && r != -EFAULT) {
                                log_error_errno(r, "Failed to block syscall: %m");
                                goto finish;
                        }
                }
        }

        /* Audit does not work properly inside containers, and much of the
         * userspace audit hookup fails there. Hence simply refuse creation
         * of audit sockets: socket(AF_NETLINK, *, NETLINK_AUDIT) will fail
         * with EAFNOSUPPORT, which audit userspace takes as indication
         * that audit is disabled in the kernel. */
        r = seccomp_rule_add(
                        ctx,
                        SCMP_ACT_ERRNO(EAFNOSUPPORT),
                        SCMP_SYS(socket),
                        2,
                        SCMP_A0(SCMP_CMP_EQ, AF_NETLINK),
                        SCMP_A2(SCMP_CMP_EQ, NETLINK_AUDIT));
        if (r < 0) {
                log_error_errno(r, "Failed to add audit seccomp rule: %m");
                goto finish;
        }

        /* Turn the filter's NNP attribute off before loading it */
        r = seccomp_attr_set(ctx, SCMP_FLTATR_CTL_NNP, 0);
        if (r < 0) {
                log_error_errno(r, "Failed to unset NO_NEW_PRIVS: %m");
                goto finish;
        }

        r = seccomp_load(ctx);
        if (r < 0)
                log_error_errno(r, "Failed to install seccomp audit filter: %m");

finish:
        seccomp_release(ctx);
        return r;
#else
        return 0;
#endif

}
2663
2664 static int setup_propagate(const char *root) {
2665         const char *p, *q;
2666
2667         (void) mkdir_p("/run/systemd/nspawn/", 0755);
2668         (void) mkdir_p("/run/systemd/nspawn/propagate", 0600);
2669         p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
2670         (void) mkdir_p(p, 0600);
2671
2672         q = strjoina(root, "/run/systemd/nspawn/incoming");
2673         mkdir_parents(q, 0755);
2674         mkdir_p(q, 0600);
2675
2676         if (mount(p, q, NULL, MS_BIND, NULL) < 0)
2677                 return log_error_errno(errno, "Failed to install propagation bind mount.");
2678
2679         if (mount(NULL, q, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY, NULL) < 0)
2680                 return log_error_errno(errno, "Failed to make propagation mount read-only");
2681
2682         return 0;
2683 }
2684
2685 static int setup_image(char **device_path, int *loop_nr) {
2686         struct loop_info64 info = {
2687                 .lo_flags = LO_FLAGS_AUTOCLEAR|LO_FLAGS_PARTSCAN
2688         };
2689         _cleanup_close_ int fd = -1, control = -1, loop = -1;
2690         _cleanup_free_ char* loopdev = NULL;
2691         struct stat st;
2692         int r, nr;
2693
2694         assert(device_path);
2695         assert(loop_nr);
2696         assert(arg_image);
2697
2698         fd = open(arg_image, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
2699         if (fd < 0)
2700                 return log_error_errno(errno, "Failed to open %s: %m", arg_image);
2701
2702         if (fstat(fd, &st) < 0)
2703                 return log_error_errno(errno, "Failed to stat %s: %m", arg_image);
2704
2705         if (S_ISBLK(st.st_mode)) {
2706                 char *p;
2707
2708                 p = strdup(arg_image);
2709                 if (!p)
2710                         return log_oom();
2711
2712                 *device_path = p;
2713
2714                 *loop_nr = -1;
2715
2716                 r = fd;
2717                 fd = -1;
2718
2719                 return r;
2720         }
2721
2722         if (!S_ISREG(st.st_mode)) {
2723                 log_error_errno(errno, "%s is not a regular file or block device: %m", arg_image);
2724                 return -EINVAL;
2725         }
2726
2727         control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
2728         if (control < 0)
2729                 return log_error_errno(errno, "Failed to open /dev/loop-control: %m");
2730
2731         nr = ioctl(control, LOOP_CTL_GET_FREE);
2732         if (nr < 0)
2733                 return log_error_errno(errno, "Failed to allocate loop device: %m");
2734
2735         if (asprintf(&loopdev, "/dev/loop%i", nr) < 0)
2736                 return log_oom();
2737
2738         loop = open(loopdev, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
2739         if (loop < 0)
2740                 return log_error_errno(errno, "Failed to open loop device %s: %m", loopdev);
2741
2742         if (ioctl(loop, LOOP_SET_FD, fd) < 0)
2743                 return log_error_errno(errno, "Failed to set loopback file descriptor on %s: %m", loopdev);
2744
2745         if (arg_read_only)
2746                 info.lo_flags |= LO_FLAGS_READ_ONLY;
2747
2748         if (ioctl(loop, LOOP_SET_STATUS64, &info) < 0)
2749                 return log_error_errno(errno, "Failed to set loopback settings on %s: %m", loopdev);
2750
2751         *device_path = loopdev;
2752         loopdev = NULL;
2753
2754         *loop_nr = nr;
2755
2756         r = loop;
2757         loop = -1;
2758
2759         return r;
2760 }
2761
/* Explanatory text appended to the error messages emitted when a disk
 * image's partition table cannot be used. One string literal per line of
 * output. */
#define PARTITION_TABLE_BLURB                                                                                                  \
        "Note that the disk image needs to either contain only a single MBR partition of\n"                                    \
        "type 0x83 that is marked bootable, or a single GPT partition of type 0FC63DAF-8483-4772-8E79-3D69D8477DE4 or follow\n" \
        "    http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/\n"                                     \
        "to be bootable with systemd-nspawn."
2768
2769 static int dissect_image(
2770                 int fd,
2771                 char **root_device, bool *root_device_rw,
2772                 char **home_device, bool *home_device_rw,
2773                 char **srv_device, bool *srv_device_rw,
2774                 bool *secondary) {
2775
2776 #ifdef HAVE_BLKID
2777         int home_nr = -1, srv_nr = -1;
2778 #ifdef GPT_ROOT_NATIVE
2779         int root_nr = -1;
2780 #endif
2781 #ifdef GPT_ROOT_SECONDARY
2782         int secondary_root_nr = -1;
2783 #endif
2784         _cleanup_free_ char *home = NULL, *root = NULL, *secondary_root = NULL, *srv = NULL, *generic = NULL;
2785         _cleanup_udev_enumerate_unref_ struct udev_enumerate *e = NULL;
2786         _cleanup_udev_device_unref_ struct udev_device *d = NULL;
2787         _cleanup_blkid_free_probe_ blkid_probe b = NULL;
2788         _cleanup_udev_unref_ struct udev *udev = NULL;
2789         struct udev_list_entry *first, *item;
2790         bool home_rw = true, root_rw = true, secondary_root_rw = true, srv_rw = true, generic_rw = true;
2791         bool is_gpt, is_mbr, multiple_generic = false;
2792         const char *pttype = NULL;
2793         blkid_partlist pl;
2794         struct stat st;
2795         unsigned i;
2796         int r;
2797
2798         assert(fd >= 0);
2799         assert(root_device);
2800         assert(home_device);
2801         assert(srv_device);
2802         assert(secondary);
2803         assert(arg_image);
2804
2805         b = blkid_new_probe();
2806         if (!b)
2807                 return log_oom();
2808
2809         errno = 0;
2810         r = blkid_probe_set_device(b, fd, 0, 0);
2811         if (r != 0) {
2812                 if (errno == 0)
2813                         return log_oom();
2814
2815                 log_error_errno(errno, "Failed to set device on blkid probe: %m");
2816                 return -errno;
2817         }
2818
2819         blkid_probe_enable_partitions(b, 1);
2820         blkid_probe_set_partitions_flags(b, BLKID_PARTS_ENTRY_DETAILS);
2821
2822         errno = 0;
2823         r = blkid_do_safeprobe(b);
2824         if (r == -2 || r == 1) {
2825                 log_error("Failed to identify any partition table on\n"
2826                           "    %s\n"
2827                           PARTITION_TABLE_BLURB, arg_image);
2828                 return -EINVAL;
2829         } else if (r != 0) {
2830                 if (errno == 0)
2831                         errno = EIO;
2832                 log_error_errno(errno, "Failed to probe: %m");
2833                 return -errno;
2834         }
2835
2836         blkid_probe_lookup_value(b, "PTTYPE", &pttype, NULL);
2837
2838         is_gpt = streq_ptr(pttype, "gpt");
2839         is_mbr = streq_ptr(pttype, "dos");
2840
2841         if (!is_gpt && !is_mbr) {
2842                 log_error("No GPT or MBR partition table discovered on\n"
2843                           "    %s\n"
2844                           PARTITION_TABLE_BLURB, arg_image);
2845                 return -EINVAL;
2846         }
2847
2848         errno = 0;
2849         pl = blkid_probe_get_partitions(b);
2850         if (!pl) {
2851                 if (errno == 0)
2852                         return log_oom();
2853
2854                 log_error("Failed to list partitions of %s", arg_image);
2855                 return -errno;
2856         }
2857
2858         udev = udev_new();
2859         if (!udev)
2860                 return log_oom();
2861
2862         if (fstat(fd, &st) < 0)
2863                 return log_error_errno(errno, "Failed to stat block device: %m");
2864
2865         d = udev_device_new_from_devnum(udev, 'b', st.st_rdev);
2866         if (!d)
2867                 return log_oom();
2868
2869         for (i = 0;; i++) {
2870                 int n, m;
2871
2872                 if (i >= 10) {
2873                         log_error("Kernel partitions never appeared.");
2874                         return -ENXIO;
2875                 }
2876
2877                 e = udev_enumerate_new(udev);
2878                 if (!e)
2879                         return log_oom();
2880
2881                 r = udev_enumerate_add_match_parent(e, d);
2882                 if (r < 0)
2883                         return log_oom();
2884
2885                 r = udev_enumerate_scan_devices(e);
2886                 if (r < 0)
2887                         return log_error_errno(r, "Failed to scan for partition devices of %s: %m", arg_image);
2888
2889                 /* Count the partitions enumerated by the kernel */
2890                 n = 0;
2891                 first = udev_enumerate_get_list_entry(e);
2892                 udev_list_entry_foreach(item, first)
2893                         n++;
2894
2895                 /* Count the partitions enumerated by blkid */
2896                 m = blkid_partlist_numof_partitions(pl);
2897                 if (n == m + 1)
2898                         break;
2899                 if (n > m + 1) {
2900                         log_error("blkid and kernel partition list do not match.");
2901                         return -EIO;
2902                 }
2903                 if (n < m + 1) {
2904                         unsigned j;
2905
2906                         /* The kernel has probed fewer partitions than
2907                          * blkid? Maybe the kernel prober is still
2908                          * running or it got EBUSY because udev
2909                          * already opened the device. Let's reprobe
2910                          * the device, which is a synchronous call
2911                          * that waits until probing is complete. */
2912
2913                         for (j = 0; j < 20; j++) {
2914
2915                                 r = ioctl(fd, BLKRRPART, 0);
2916                                 if (r < 0)
2917                                         r = -errno;
2918                                 if (r >= 0 || r != -EBUSY)
2919                                         break;
2920
2921                                 /* If something else has the device
2922                                  * open, such as an udev rule, the
2923                                  * ioctl will return EBUSY. Since
2924                                  * there's no way to wait until it
2925                                  * isn't busy anymore, let's just wait
2926                                  * a bit, and try again.
2927                                  *
2928                                  * This is really something they
2929                                  * should fix in the kernel! */
2930
2931                                 usleep(50 * USEC_PER_MSEC);
2932                         }
2933
2934                         if (r < 0)
2935                                 return log_error_errno(r, "Failed to reread partition table: %m");
2936                 }
2937
2938                 e = udev_enumerate_unref(e);
2939         }
2940
2941         first = udev_enumerate_get_list_entry(e);
2942         udev_list_entry_foreach(item, first) {
2943                 _cleanup_udev_device_unref_ struct udev_device *q;
2944                 const char *node;
2945                 unsigned long long flags;
2946                 blkid_partition pp;
2947                 dev_t qn;
2948                 int nr;
2949
2950                 errno = 0;
2951                 q = udev_device_new_from_syspath(udev, udev_list_entry_get_name(item));
2952                 if (!q) {
2953                         if (!errno)
2954                                 errno = ENOMEM;
2955
2956                         log_error_errno(errno, "Failed to get partition device of %s: %m", arg_image);
2957                         return -errno;
2958                 }
2959
2960                 qn = udev_device_get_devnum(q);
2961                 if (major(qn) == 0)
2962                         continue;
2963
2964                 if (st.st_rdev == qn)
2965                         continue;
2966
2967                 node = udev_device_get_devnode(q);
2968                 if (!node)
2969                         continue;
2970
2971                 pp = blkid_partlist_devno_to_partition(pl, qn);
2972                 if (!pp)
2973                         continue;
2974
2975                 flags = blkid_partition_get_flags(pp);
2976
2977                 nr = blkid_partition_get_partno(pp);
2978                 if (nr < 0)
2979                         continue;
2980
2981                 if (is_gpt) {
2982                         sd_id128_t type_id;
2983                         const char *stype;
2984
2985                         if (flags & GPT_FLAG_NO_AUTO)
2986                                 continue;
2987
2988                         stype = blkid_partition_get_type_string(pp);
2989                         if (!stype)
2990                                 continue;
2991
2992                         if (sd_id128_from_string(stype, &type_id) < 0)
2993                                 continue;
2994
2995                         if (sd_id128_equal(type_id, GPT_HOME)) {
2996
2997                                 if (home && nr >= home_nr)
2998                                         continue;
2999
3000                                 home_nr = nr;
3001                                 home_rw = !(flags & GPT_FLAG_READ_ONLY);
3002
3003                                 r = free_and_strdup(&home, node);
3004                                 if (r < 0)
3005                                         return log_oom();
3006
3007                         } else if (sd_id128_equal(type_id, GPT_SRV)) {
3008
3009                                 if (srv && nr >= srv_nr)
3010                                         continue;
3011
3012                                 srv_nr = nr;
3013                                 srv_rw = !(flags & GPT_FLAG_READ_ONLY);
3014
3015                                 r = free_and_strdup(&srv, node);
3016                                 if (r < 0)
3017                                         return log_oom();
3018                         }
3019 #ifdef GPT_ROOT_NATIVE
3020                         else if (sd_id128_equal(type_id, GPT_ROOT_NATIVE)) {
3021
3022                                 if (root && nr >= root_nr)
3023                                         continue;
3024
3025                                 root_nr = nr;
3026                                 root_rw = !(flags & GPT_FLAG_READ_ONLY);
3027
3028                                 r = free_and_strdup(&root, node);
3029                                 if (r < 0)
3030                                         return log_oom();
3031                         }
3032 #endif
3033 #ifdef GPT_ROOT_SECONDARY
3034                         else if (sd_id128_equal(type_id, GPT_ROOT_SECONDARY)) {
3035
3036                                 if (secondary_root && nr >= secondary_root_nr)
3037                                         continue;
3038
3039                                 secondary_root_nr = nr;
3040                                 secondary_root_rw = !(flags & GPT_FLAG_READ_ONLY);
3041
3042                                 r = free_and_strdup(&secondary_root, node);
3043                                 if (r < 0)
3044                                         return log_oom();
3045                         }
3046 #endif
3047                         else if (sd_id128_equal(type_id, GPT_LINUX_GENERIC)) {
3048
3049                                 if (generic)
3050                                         multiple_generic = true;
3051                                 else {
3052                                         generic_rw = !(flags & GPT_FLAG_READ_ONLY);
3053
3054                                         r = free_and_strdup(&generic, node);
3055                                         if (r < 0)
3056                                                 return log_oom();
3057                                 }
3058                         }
3059
3060                 } else if (is_mbr) {
3061                         int type;
3062
3063                         if (flags != 0x80) /* Bootable flag */
3064                                 continue;
3065
3066                         type = blkid_partition_get_type(pp);
3067                         if (type != 0x83) /* Linux partition */
3068                                 continue;
3069
3070                         if (generic)
3071                                 multiple_generic = true;
3072                         else {
3073                                 generic_rw = true;
3074
3075                                 r = free_and_strdup(&root, node);
3076                                 if (r < 0)
3077                                         return log_oom();
3078                         }
3079                 }
3080         }
3081
3082         if (root) {
3083                 *root_device = root;
3084                 root = NULL;
3085
3086                 *root_device_rw = root_rw;
3087                 *secondary = false;
3088         } else if (secondary_root) {
3089                 *root_device = secondary_root;
3090                 secondary_root = NULL;
3091
3092                 *root_device_rw = secondary_root_rw;
3093                 *secondary = true;
3094         } else if (generic) {
3095
3096                 /* There were no partitions with precise meanings
3097                  * around, but we found generic partitions. In this
3098                  * case, if there's only one, we can go ahead and boot
3099                  * it, otherwise we bail out, because we really cannot
3100                  * make any sense of it. */
3101
3102                 if (multiple_generic) {
3103                         log_error("Identified multiple bootable Linux partitions on\n"
3104                                   "    %s\n"
3105                                   PARTITION_TABLE_BLURB, arg_image);
3106                         return -EINVAL;
3107                 }
3108
3109                 *root_device = generic;
3110                 generic = NULL;
3111
3112                 *root_device_rw = generic_rw;
3113                 *secondary = false;
3114         } else {
3115                 log_error("Failed to identify root partition in disk image\n"
3116                           "    %s\n"
3117                           PARTITION_TABLE_BLURB, arg_image);
3118                 return -EINVAL;
3119         }
3120
3121         if (home) {
3122                 *home_device = home;
3123                 home = NULL;
3124
3125                 *home_device_rw = home_rw;
3126         }
3127
3128         if (srv) {
3129                 *srv_device = srv;
3130                 srv = NULL;
3131
3132                 *srv_device_rw = srv_rw;
3133         }
3134
3135         return 0;
3136 #else
3137         log_error("--image= is not supported, compiled without blkid support.");
3138         return -ENOTSUP;
3139 #endif
3140 }
3141
/* Probes the file system type of the block device 'what' with libblkid and
 * mounts it on 'where' (or on 'where' + 'directory' if 'directory' is set).
 * The mount is read-only if 'rw' is false or if arg_read_only is set.
 * LUKS volumes are refused. Returns 0 on success, a negative errno-style
 * value on failure. Without blkid support this always fails with -ENOTSUP. */
static int mount_device(const char *what, const char *where, const char *directory, bool rw) {
#ifdef HAVE_BLKID
        _cleanup_blkid_free_probe_ blkid_probe probe = NULL;
        const char *fs_type, *target;
        unsigned long mount_flags = MS_NODEV;
        int probe_result;

        assert(what);
        assert(where);

        /* The global read-only switch overrides the per-device setting */
        rw = rw && !arg_read_only;
        if (!rw)
                mount_flags |= MS_RDONLY;

        target = where;
        if (directory)
                target = strjoina(where, directory);

        /* blkid does not reliably set errno, hence reset it first so that
         * an unset errno can be told apart afterwards */
        errno = 0;
        probe = blkid_new_probe_from_filename(what);
        if (!probe) {
                if (errno == 0)
                        return log_oom();

                return log_error_errno(errno, "Failed to allocate prober for %s: %m", what);
        }

        blkid_probe_enable_superblocks(probe, 1);
        blkid_probe_set_superblocks_flags(probe, BLKID_SUBLKS_TYPE);

        errno = 0;
        probe_result = blkid_do_safeprobe(probe);
        switch (probe_result) {

        case 0:
                break;

        case -1:
        case 1:
                log_error("Cannot determine file system type of %s", what);
                return -EINVAL;

        default:
                if (errno == 0)
                        errno = EIO;

                return log_error_errno(errno, "Failed to probe %s: %m", what);
        }

        errno = 0;
        if (blkid_probe_lookup_value(probe, "TYPE", &fs_type, NULL) < 0) {
                if (errno == 0)
                        errno = EINVAL;

                log_error("Failed to determine file system type of %s", what);
                return -errno;
        }

        if (streq(fs_type, "crypto_LUKS")) {
                log_error("nspawn currently does not support LUKS disk images.");
                return -ENOTSUP;
        }

        if (mount(what, target, fs_type, mount_flags, NULL) < 0)
                return log_error_errno(errno, "Failed to mount %s: %m", what);

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -ENOTSUP;
#endif
}
3205
3206 static int mount_devices(
3207                 const char *where,
3208                 const char *root_device, bool root_device_rw,
3209                 const char *home_device, bool home_device_rw,
3210                 const char *srv_device, bool srv_device_rw) {
3211         int r;
3212
3213         assert(where);
3214
3215         if (root_device) {
3216                 r = mount_device(root_device, arg_directory, NULL, root_device_rw);
3217                 if (r < 0)
3218                         return log_error_errno(r, "Failed to mount root directory: %m");
3219         }
3220
3221         if (home_device) {
3222                 r = mount_device(home_device, arg_directory, "/home", home_device_rw);
3223                 if (r < 0)
3224                         return log_error_errno(r, "Failed to mount home directory: %m");
3225         }
3226
3227         if (srv_device) {
3228                 r = mount_device(srv_device, arg_directory, "/srv", srv_device_rw);
3229                 if (r < 0)
3230                         return log_error_errno(r, "Failed to mount server data directory: %m");
3231         }
3232
3233         return 0;
3234 }
3235
3236 static void loop_remove(int nr, int *image_fd) {
3237         _cleanup_close_ int control = -1;
3238         int r;
3239
3240         if (nr < 0)
3241                 return;
3242
3243         if (image_fd && *image_fd >= 0) {
3244                 r = ioctl(*image_fd, LOOP_CLR_FD);
3245                 if (r < 0)
3246                         log_debug_errno(errno, "Failed to close loop image: %m");
3247                 *image_fd = safe_close(*image_fd);
3248         }
3249
3250         control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
3251         if (control < 0) {
3252                 log_warning_errno(errno, "Failed to open /dev/loop-control: %m");
3253                 return;
3254         }
3255
3256         r = ioctl(control, LOOP_CTL_REMOVE, nr);
3257         if (r < 0)
3258                 log_debug_errno(errno, "Failed to remove loop %d: %m", nr);
3259 }
3260
3261 static int spawn_getent(const char *database, const char *key, pid_t *rpid) {
3262         int pipe_fds[2];
3263         pid_t pid;
3264
3265         assert(database);
3266         assert(key);
3267         assert(rpid);
3268
3269         if (pipe2(pipe_fds, O_CLOEXEC) < 0)
3270                 return log_error_errno(errno, "Failed to allocate pipe: %m");
3271
3272         pid = fork();
3273         if (pid < 0)
3274                 return log_error_errno(errno, "Failed to fork getent child: %m");
3275         else if (pid == 0) {
3276                 int nullfd;
3277                 char *empty_env = NULL;
3278
3279                 if (dup3(pipe_fds[1], STDOUT_FILENO, 0) < 0)
3280                         _exit(EXIT_FAILURE);
3281
3282                 if (pipe_fds[0] > 2)
3283                         safe_close(pipe_fds[0]);
3284                 if (pipe_fds[1] > 2)
3285                         safe_close(pipe_fds[1]);
3286
3287                 nullfd = open("/dev/null", O_RDWR);
3288                 if (nullfd < 0)
3289                         _exit(EXIT_FAILURE);
3290
3291                 if (dup3(nullfd, STDIN_FILENO, 0) < 0)
3292                         _exit(EXIT_FAILURE);
3293
3294                 if (dup3(nullfd, STDERR_FILENO, 0) < 0)
3295                         _exit(EXIT_FAILURE);
3296
3297                 if (nullfd > 2)
3298                         safe_close(nullfd);
3299
3300                 reset_all_signal_handlers();
3301                 close_all_fds(NULL, 0);
3302
3303                 execle("/usr/bin/getent", "getent", database, key, NULL, &empty_env);
3304                 execle("/bin/getent", "getent", database, key, NULL, &empty_env);
3305                 _exit(EXIT_FAILURE);
3306         }
3307
3308         pipe_fds[1] = safe_close(pipe_fds[1]);
3309
3310         *rpid = pid;
3311
3312         return pipe_fds[0];
3313 }
3314
3315 static int change_uid_gid(char **_home) {
3316         char line[LINE_MAX], *x, *u, *g, *h;
3317         const char *word, *state;
3318         _cleanup_free_ uid_t *uids = NULL;
3319         _cleanup_free_ char *home = NULL;
3320         _cleanup_fclose_ FILE *f = NULL;
3321         _cleanup_close_ int fd = -1;
3322         unsigned n_uids = 0;
3323         size_t sz = 0, l;
3324         uid_t uid;
3325         gid_t gid;
3326         pid_t pid;
3327         int r;
3328
3329         assert(_home);
3330
3331         if (!arg_user || streq(arg_user, "root") || streq(arg_user, "0")) {
3332                 /* Reset everything fully to 0, just in case */
3333
3334                 if (setgroups(0, NULL) < 0)
3335                         return log_error_errno(errno, "setgroups() failed: %m");
3336
3337                 if (setresgid(0, 0, 0) < 0)
3338                         return log_error_errno(errno, "setregid() failed: %m");
3339
3340                 if (setresuid(0, 0, 0) < 0)
3341                         return log_error_errno(errno, "setreuid() failed: %m");
3342
3343                 *_home = NULL;
3344                 return 0;
3345         }
3346
3347         /* First, get user credentials */
3348         fd = spawn_getent("passwd", arg_user, &pid);
3349         if (fd < 0)
3350                 return fd;
3351
3352         f = fdopen(fd, "r");
3353         if (!f)
3354                 return log_oom();
3355         fd = -1;
3356
3357         if (!fgets(line, sizeof(line), f)) {
3358
3359                 if (!ferror(f)) {
3360                         log_error("Failed to resolve user %s.", arg_user);
3361                         return -ESRCH;
3362                 }
3363
3364                 log_error_errno(errno, "Failed to read from getent: %m");
3365                 return -errno;
3366         }
3367
3368         truncate_nl(line);
3369
3370         wait_for_terminate_and_warn("getent passwd", pid, true);
3371
3372         x = strchr(line, ':');
3373         if (!x) {
3374                 log_error("/etc/passwd entry has invalid user field.");
3375                 return -EIO;
3376         }
3377
3378         u = strchr(x+1, ':');
3379         if (!u) {
3380                 log_error("/etc/passwd entry has invalid password field.");
3381                 return -EIO;
3382         }
3383
3384         u++;
3385         g = strchr(u, ':');
3386         if (!g) {
3387                 log_error("/etc/passwd entry has invalid UID field.");
3388                 return -EIO;
3389         }
3390
3391         *g = 0;
3392         g++;
3393         x = strchr(g, ':');
3394         if (!x) {
3395                 log_error("/etc/passwd entry has invalid GID field.");
3396                 return -EIO;
3397         }
3398
3399         *x = 0;
3400         h = strchr(x+1, ':');
3401         if (!h) {
3402                 log_error("/etc/passwd entry has invalid GECOS field.");
3403                 return -EIO;
3404         }
3405
3406         h++;
3407         x = strchr(h, ':');
3408         if (!x) {
3409                 log_error("/etc/passwd entry has invalid home directory field.");
3410                 return -EIO;
3411         }
3412
3413         *x = 0;
3414
3415         r = parse_uid(u, &uid);
3416         if (r < 0) {
3417                 log_error("Failed to parse UID of user.");
3418                 return -EIO;
3419         }
3420
3421         r = parse_gid(g, &gid);
3422         if (r < 0) {
3423                 log_error("Failed to parse GID of user.");
3424                 return -EIO;
3425         }
3426
3427         home = strdup(h);
3428         if (!home)
3429                 return log_oom();
3430
3431         /* Second, get group memberships */
3432         fd = spawn_getent("initgroups", arg_user, &pid);
3433         if (fd < 0)
3434                 return fd;
3435
3436         fclose(f);
3437         f = fdopen(fd, "r");
3438         if (!f)
3439                 return log_oom();
3440         fd = -1;
3441
3442         if (!fgets(line, sizeof(line), f)) {
3443                 if (!ferror(f)) {
3444                         log_error("Failed to resolve user %s.", arg_user);
3445                         return -ESRCH;
3446                 }
3447
3448                 log_error_errno(errno, "Failed to read from getent: %m");
3449                 return -errno;
3450         }
3451
3452         truncate_nl(line);
3453
3454         wait_for_terminate_and_warn("getent initgroups", pid, true);
3455
3456         /* Skip over the username and subsequent separator whitespace */
3457         x = line;
3458         x += strcspn(x, WHITESPACE);
3459         x += strspn(x, WHITESPACE);
3460
3461         FOREACH_WORD(word, l, x, state) {
3462                 char c[l+1];
3463
3464                 memcpy(c, word, l);
3465                 c[l] = 0;
3466
3467                 if (!GREEDY_REALLOC(uids, sz, n_uids+1))
3468                         return log_oom();
3469
3470                 r = parse_uid(c, &uids[n_uids++]);
3471                 if (r < 0) {
3472                         log_error("Failed to parse group data from getent.");
3473                         return -EIO;
3474                 }
3475         }
3476
3477         r = mkdir_parents(home, 0775);
3478         if (r < 0)
3479                 return log_error_errno(r, "Failed to make home root directory: %m");
3480
3481         r = mkdir_safe(home, 0755, uid, gid);
3482         if (r < 0 && r != -EEXIST)
3483                 return log_error_errno(r, "Failed to make home directory: %m");
3484
3485         fchown(STDIN_FILENO, uid, gid);
3486         fchown(STDOUT_FILENO, uid, gid);
3487         fchown(STDERR_FILENO, uid, gid);
3488
3489         if (setgroups(n_uids, uids) < 0)
3490                 return log_error_errno(errno, "Failed to set auxiliary groups: %m");
3491
3492         if (setresgid(gid, gid, gid) < 0)
3493                 return log_error_errno(errno, "setregid() failed: %m");
3494
3495         if (setresuid(uid, uid, uid) < 0)
3496                 return log_error_errno(errno, "setreuid() failed: %m");
3497
3498         if (_home) {
3499                 *_home = home;
3500                 home = NULL;
3501         }
3502
3503         return 0;
3504 }
3505
3506 /*
3507  * Return values:
3508  * < 0 : wait_for_terminate() failed to get the state of the
3509  *       container, the container was terminated by a signal, or
3510  *       failed for an unknown reason.  No change is made to the
3511  *       container argument.
3512  * > 0 : The program executed in the container terminated with an
3513  *       error.  The exit code of the program executed in the
3514  *       container is returned.  The container argument has been set
3515  *       to CONTAINER_TERMINATED.
3516  *   0 : The container is being rebooted, has been shut down or exited
3517  *       successfully.  The container argument has been set to either
3518  *       CONTAINER_TERMINATED or CONTAINER_REBOOTED.
3519  *
3520  * That is, success is indicated by a return value of zero, and an
3521  * error is indicated by a non-zero value.
3522  */
3523 static int wait_for_container(pid_t pid, ContainerStatus *container) {
3524         siginfo_t status;
3525         int r;
3526
3527         r = wait_for_terminate(pid, &status);
3528         if (r < 0)
3529                 return log_warning_errno(r, "Failed to wait for container: %m");
3530
3531         switch (status.si_code) {
3532
3533         case CLD_EXITED:
3534                 if (status.si_status == 0) {
3535                         log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s exited successfully.", arg_machine);
3536
3537                 } else
3538                         log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s failed with error code %i.", arg_machine, status.si_status);
3539
3540                 *container = CONTAINER_TERMINATED;
3541                 return status.si_status;
3542
3543         case CLD_KILLED:
3544                 if (status.si_status == SIGINT) {
3545
3546                         log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s has been shut down.", arg_machine);
3547                         *container = CONTAINER_TERMINATED;
3548                         return 0;
3549
3550                 } else if (status.si_status == SIGHUP) {
3551
3552                         log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s is being rebooted.", arg_machine);
3553                         *container = CONTAINER_REBOOTED;
3554                         return 0;
3555                 }
3556
3557                 /* CLD_KILLED fallthrough */
3558
3559         case CLD_DUMPED:
3560                 log_error("Container %s terminated by signal %s.", arg_machine, signal_to_string(status.si_status));
3561                 return -EIO;
3562
3563         default:
3564                 log_error("Container %s failed due to unknown reason.", arg_machine);
3565                 return -EIO;
3566         }
3567
3568         return r;
3569 }
3570
/* Signal handler that intentionally does nothing. */
static void nop_handler(int sig) {
        (void) sig;
}
3572
3573 static int on_orderly_shutdown(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) {
3574         pid_t pid;
3575
3576         pid = PTR_TO_UINT32(userdata);
3577         if (pid > 0) {
3578                 if (kill(pid, arg_kill_signal) >= 0) {
3579                         log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
3580                         sd_event_source_set_userdata(s, NULL);
3581                         return 0;
3582                 }
3583         }
3584
3585         sd_event_exit(sd_event_source_get_event(s), 0);
3586         return 0;
3587 }
3588
3589 static int determine_names(void) {
3590         int r;
3591
3592         if (!arg_image && !arg_directory) {
3593                 if (arg_machine) {
3594                         _cleanup_(image_unrefp) Image *i = NULL;
3595
3596                         r = image_find(arg_machine, &i);
3597                         if (r < 0)
3598                                 return log_error_errno(r, "Failed to find image for machine '%s': %m", arg_machine);
3599                         else if (r == 0) {
3600                                 log_error("No image for machine '%s': %m", arg_machine);
3601                                 return -ENOENT;
3602                         }
3603
3604                         if (i->type == IMAGE_RAW)
3605                                 r = set_sanitized_path(&arg_image, i->path);
3606                         else
3607                                 r = set_sanitized_path(&arg_directory, i->path);
3608                         if (r < 0)
3609                                 return log_error_errno(r, "Invalid image directory: %m");
3610
3611                         arg_read_only = arg_read_only || i->read_only;
3612                 } else
3613                         arg_directory = get_current_dir_name();
3614
3615                 if (!arg_directory && !arg_machine) {
3616                         log_error("Failed to determine path, please use -D or -i.");
3617                         return -EINVAL;
3618                 }
3619         }
3620
3621         if (!arg_machine) {
3622                 if (arg_directory && path_equal(arg_directory, "/"))
3623                         arg_machine = gethostname_malloc();
3624                 else
3625                         arg_machine = strdup(basename(arg_image ?: arg_directory));
3626
3627                 if (!arg_machine)
3628                         return log_oom();
3629
3630                 hostname_cleanup(arg_machine, false);
3631                 if (!machine_name_is_valid(arg_machine)) {
3632                         log_error("Failed to determine machine name automatically, please use -M.");
3633                         return -EINVAL;
3634                 }
3635
3636                 if (arg_ephemeral) {
3637                         char *b;
3638
3639                         /* Add a random suffix when this is an
3640                          * ephemeral machine, so that we can run many
3641                          * instances at once without manually having
3642                          * to specify -M each time. */
3643
3644                         if (asprintf(&b, "%s-%016" PRIx64, arg_machine, random_u64()) < 0)
3645                                 return log_oom();
3646
3647                         free(arg_machine);
3648                         arg_machine = b;
3649                 }
3650         }
3651
3652         return 0;
3653 }
3654
3655 static int determine_uid_shift(void) {
3656         int r;
3657
3658         if (!arg_userns)
3659                 return 0;
3660
3661         if (arg_uid_shift == UID_INVALID) {
3662                 struct stat st;
3663
3664                 r = stat(arg_directory, &st);
3665                 if (r < 0)
3666                         return log_error_errno(errno, "Failed to determine UID base of %s: %m", arg_directory);
3667
3668                 arg_uid_shift = st.st_uid & UINT32_C(0xffff0000);
3669
3670                 if (arg_uid_shift != (st.st_gid & UINT32_C(0xffff0000))) {
3671                         log_error("UID and GID base of %s don't match.", arg_directory);
3672                         return -EINVAL;
3673                 }
3674
3675                 arg_uid_range = UINT32_C(0x10000);
3676         }
3677
3678         if (arg_uid_shift > (uid_t) -1 - arg_uid_range) {
3679                 log_error("UID base too high for UID range.");
3680                 return -EINVAL;
3681         }
3682
3683         log_info("Using user namespaces with base " UID_FMT " and range " UID_FMT ".", arg_uid_shift, arg_uid_range);
3684         return 0;
3685 }
3686
3687 int main(int argc, char *argv[]) {
3688
3689         _cleanup_free_ char *device_path = NULL, *root_device = NULL, *home_device = NULL, *srv_device = NULL, *console = NULL;
3690         bool root_device_rw = true, home_device_rw = true, srv_device_rw = true;
3691         _cleanup_close_ int master = -1, image_fd = -1;
3692         _cleanup_fdset_free_ FDSet *fds = NULL;
3693         int r, n_fd_passed, loop_nr = -1;
3694         char veth_name[IFNAMSIZ];
3695         bool secondary = false, remove_subvol = false;
3696         sigset_t mask, mask_chld;
3697         pid_t pid = 0;
3698         int ret = EXIT_SUCCESS;
3699         union in_addr_union exposed = {};
3700         _cleanup_release_lock_file_ LockFile tree_global_lock = LOCK_FILE_INIT, tree_local_lock = LOCK_FILE_INIT;
3701         bool interactive;
3702
3703         log_parse_environment();
3704         log_open();
3705
3706         r = parse_argv(argc, argv);
3707         if (r <= 0)
3708                 goto finish;
3709
3710         r = determine_names();
3711         if (r < 0)
3712                 goto finish;
3713
3714         if (geteuid() != 0) {
3715                 log_error("Need to be root.");
3716                 r = -EPERM;
3717                 goto finish;
3718         }
3719
3720         if (sd_booted() <= 0) {
3721                 log_error("Not running on a systemd system.");
3722                 r = -EINVAL;
3723                 goto finish;
3724         }
3725
3726         log_close();
3727         n_fd_passed = sd_listen_fds(false);
3728         if (n_fd_passed > 0) {
3729                 r = fdset_new_listen_fds(&fds, false);
3730                 if (r < 0) {
3731                         log_error_errno(r, "Failed to collect file descriptors: %m");
3732                         goto finish;
3733                 }
3734         }
3735         fdset_close_others(fds);
3736         log_open();
3737
3738         if (arg_directory) {
3739                 assert(!arg_image);
3740
3741                 if (path_equal(arg_directory, "/") && !arg_ephemeral) {
3742                         log_error("Spawning container on root directory is not supported. Consider using --ephemeral.");
3743                         r = -EINVAL;
3744                         goto finish;
3745                 }
3746
3747                 if (arg_ephemeral) {
3748                         char *np;
3749
3750                         /* If the specified path is a mount point we
3751                          * generate the new snapshot immediately
3752                          * inside it under a random name. However if
3753                          * the specified is not a mount point we
3754                          * create the new snapshot in the parent
3755                          * directory, just next to it. */
3756                         r = path_is_mount_point(arg_directory, false);
3757                         if (r < 0) {
3758                                 log_error_errno(r, "Failed to determine whether directory %s is mount point: %m", arg_directory);
3759                                 goto finish;
3760                         }
3761                         if (r > 0)
3762                                 r = tempfn_random_child(arg_directory, &np);
3763                         else
3764                                 r = tempfn_random(arg_directory, &np);
3765                         if (r < 0) {
3766                                 log_error_errno(r, "Failed to generate name for snapshot: %m");
3767                                 goto finish;
3768                         }
3769
3770                         r = image_path_lock(np, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3771                         if (r < 0) {
3772                                 log_error_errno(r, "Failed to lock %s: %m", np);
3773                                 goto finish;
3774                         }
3775
3776                         r = btrfs_subvol_snapshot(arg_directory, np, arg_read_only, true);
3777                         if (r < 0) {
3778                                 free(np);
3779                                 log_error_errno(r, "Failed to create snapshot %s from %s: %m", np, arg_directory);
3780                                 goto finish;
3781                         }
3782
3783                         free(arg_directory);
3784                         arg_directory = np;
3785
3786                         remove_subvol = true;
3787
3788                 } else {
3789                         r = image_path_lock(arg_directory, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3790                         if (r == -EBUSY) {
3791                                 log_error_errno(r, "Directory tree %s is currently busy.", arg_directory);
3792                                 goto finish;
3793                         }
3794                         if (r < 0) {
3795                                 log_error_errno(r, "Failed to lock %s: %m", arg_directory);
3796                                 return r;
3797                         }
3798
3799                         if (arg_template) {
3800                                 r = btrfs_subvol_snapshot(arg_template, arg_directory, arg_read_only, true);
3801                                 if (r == -EEXIST) {
3802                                         if (!arg_quiet)
3803                                                 log_info("Directory %s already exists, not populating from template %s.", arg_directory, arg_template);
3804                                 } else if (r < 0) {
3805                                         log_error_errno(r, "Couldn't create snapshot %s from %s: %m", arg_directory, arg_template);
3806                                         goto finish;
3807                                 } else {
3808                                         if (!arg_quiet)
3809                                                 log_info("Populated %s from template %s.", arg_directory, arg_template);
3810                                 }
3811                         }
3812                 }
3813
3814                 if (arg_boot) {
3815                         if (path_is_os_tree(arg_directory) <= 0) {
3816                                 log_error("Directory %s doesn't look like an OS root directory (os-release file is missing). Refusing.", arg_directory);
3817                                 r = -EINVAL;
3818                                 goto finish;
3819                         }
3820                 } else {
3821                         const char *p;
3822
3823                         p = strjoina(arg_directory,
3824                                        argc > optind && path_is_absolute(argv[optind]) ? argv[optind] : "/usr/bin/");
3825                         if (access(p, F_OK) < 0) {
3826                                 log_error("Directory %s lacks the binary to execute or doesn't look like a binary tree. Refusing.", arg_directory);
3827                                 r = -EINVAL;
3828                                 goto finish;
3829                         }
3830                 }
3831
3832         } else {
3833                 char template[] = "/tmp/nspawn-root-XXXXXX";
3834
3835                 assert(arg_image);
3836                 assert(!arg_template);
3837
3838                 r = image_path_lock(arg_image, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3839                 if (r == -EBUSY) {
3840                         r = log_error_errno(r, "Disk image %s is currently busy.", arg_image);
3841                         goto finish;
3842                 }
3843                 if (r < 0) {
3844                         r = log_error_errno(r, "Failed to create image lock: %m");
3845                         goto finish;
3846                 }
3847
3848                 if (!mkdtemp(template)) {
3849                         log_error_errno(errno, "Failed to create temporary directory: %m");
3850                         r = -errno;
3851                         goto finish;
3852                 }
3853
3854                 arg_directory = strdup(template);
3855                 if (!arg_directory) {
3856                         r = log_oom();
3857                         goto finish;
3858                 }
3859
3860                 image_fd = setup_image(&device_path, &loop_nr);
3861                 if (image_fd < 0) {
3862                         r = image_fd;
3863                         goto finish;
3864                 }
3865
3866                 r = dissect_image(image_fd,
3867                                   &root_device, &root_device_rw,
3868                                   &home_device, &home_device_rw,
3869                                   &srv_device, &srv_device_rw,
3870                                   &secondary);
3871                 if (r < 0)
3872                         goto finish;
3873         }
3874
3875         r = determine_uid_shift();
3876         if (r < 0)
3877                 goto finish;
3878
3879         interactive = isatty(STDIN_FILENO) > 0 && isatty(STDOUT_FILENO) > 0;
3880
3881         master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
3882         if (master < 0) {
3883                 r = log_error_errno(errno, "Failed to acquire pseudo tty: %m");
3884                 goto finish;
3885         }
3886
3887         r = ptsname_malloc(master, &console);
3888         if (r < 0) {
3889                 r = log_error_errno(r, "Failed to determine tty name: %m");
3890                 goto finish;
3891         }
3892
3893         if (unlockpt(master) < 0) {
3894                 r = log_error_errno(errno, "Failed to unlock tty: %m");
3895                 goto finish;
3896         }
3897
3898         if (!arg_quiet)
3899                 log_info("Spawning container %s on %s.\nPress ^] three times within 1s to kill container.",
3900                          arg_machine, arg_image ?: arg_directory);
3901
3902         assert_se(sigemptyset(&mask) == 0);
3903         sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
3904         assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
3905
3906         assert_se(sigemptyset(&mask_chld) == 0);
3907         assert_se(sigaddset(&mask_chld, SIGCHLD) == 0);
3908
3909         for (;;) {
3910                 _cleanup_close_pair_ int kmsg_socket_pair[2] = { -1, -1 }, rtnl_socket_pair[2] = { -1, -1 };
3911                 ContainerStatus container_status;
3912                 _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
3913                 struct sigaction sa = {
3914                         .sa_handler = nop_handler,
3915                         .sa_flags = SA_NOCLDSTOP,
3916                 };
3917
3918                 r = barrier_create(&barrier);
3919                 if (r < 0) {
3920                         log_error_errno(r, "Cannot initialize IPC barrier: %m");
3921                         goto finish;
3922                 }
3923
3924                 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
3925                         r = log_error_errno(errno, "Failed to create kmsg socket pair: %m");
3926                         goto finish;
3927                 }
3928
3929                 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, rtnl_socket_pair) < 0) {
3930                         r = log_error_errno(errno, "Failed to create rtnl socket pair: %m");
3931                         goto finish;
3932                 }
3933
3934                 /* Child can be killed before execv(), so handle SIGCHLD
3935                  * in order to interrupt parent's blocking calls and
3936                  * give it a chance to call wait() and terminate. */
3937                 r = sigprocmask(SIG_UNBLOCK, &mask_chld, NULL);
3938                 if (r < 0) {
3939                         r = log_error_errno(errno, "Failed to change the signal mask: %m");
3940                         goto finish;
3941                 }
3942
3943                 r = sigaction(SIGCHLD, &sa, NULL);
3944                 if (r < 0) {
3945                         r = log_error_errno(errno, "Failed to install SIGCHLD handler: %m");
3946                         goto finish;
3947                 }
3948
3949                 pid = raw_clone(SIGCHLD|CLONE_NEWNS|
3950                                 (arg_share_system ? 0 : CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS)|
3951                                 (arg_private_network ? CLONE_NEWNET : 0), NULL);
3952                 if (pid < 0) {
3953                         if (errno == EINVAL)
3954                                 r = log_error_errno(errno, "clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
3955                         else
3956                                 r = log_error_errno(errno, "clone() failed: %m");
3957
3958                         goto finish;
3959                 }
3960
3961                 if (pid == 0) {
3962                         /* child */
3963                         _cleanup_free_ char *home = NULL;
3964                         unsigned n_env = 2;
3965                         const char *envp[] = {
3966                                 "PATH=" DEFAULT_PATH_SPLIT_USR,
3967                                 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
3968                                 NULL, /* TERM */
3969                                 NULL, /* HOME */
3970                                 NULL, /* USER */
3971                                 NULL, /* LOGNAME */
3972                                 NULL, /* container_uuid */
3973                                 NULL, /* LISTEN_FDS */
3974                                 NULL, /* LISTEN_PID */
3975                                 NULL
3976                         };
3977                         char **env_use;
3978
3979                         barrier_set_role(&barrier, BARRIER_CHILD);
3980
3981                         envp[n_env] = strv_find_prefix(environ, "TERM=");
3982                         if (envp[n_env])
3983                                 n_env ++;
3984
3985                         master = safe_close(master);
3986
3987                         kmsg_socket_pair[0] = safe_close(kmsg_socket_pair[0]);
3988                         rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
3989
3990                         reset_all_signal_handlers();
3991                         reset_signal_mask();
3992
3993                         if (interactive) {
3994                                 close_nointr(STDIN_FILENO);
3995                                 close_nointr(STDOUT_FILENO);
3996                                 close_nointr(STDERR_FILENO);
3997
3998                                 r = open_terminal(console, O_RDWR);
3999                                 if (r != STDIN_FILENO) {
4000                                         if (r >= 0) {
4001                                                 safe_close(r);
4002                                                 r = -EINVAL;
4003                                         }
4004
4005                                         log_error_errno(r, "Failed to open console: %m");
4006                                         _exit(EXIT_FAILURE);
4007                                 }
4008
4009                                 if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
4010                                     dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO) {
4011                                         log_error_errno(errno, "Failed to duplicate console: %m");
4012                                         _exit(EXIT_FAILURE);
4013                                 }
4014                         }
4015
4016                         if (setsid() < 0) {
4017                                 log_error_errno(errno, "setsid() failed: %m");
4018                                 _exit(EXIT_FAILURE);
4019                         }
4020
4021                         if (reset_audit_loginuid() < 0)
4022                                 _exit(EXIT_FAILURE);
4023
4024                         if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
4025                                 log_error_errno(errno, "PR_SET_PDEATHSIG failed: %m");
4026                                 _exit(EXIT_FAILURE);
4027                         }
4028
4029                         if (arg_private_network)
4030                                 loopback_setup();
4031
4032                         /* Mark everything as slave, so that we still
4033                          * receive mounts from the real root, but don't
4034                          * propagate mounts to the real root. */
4035                         if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
4036                                 log_error_errno(errno, "MS_SLAVE|MS_REC failed: %m");
4037                                 _exit(EXIT_FAILURE);
4038                         }
4039
4040                         if (mount_devices(arg_directory,
4041                                           root_device, root_device_rw,
4042                                           home_device, home_device_rw,
4043                                           srv_device, srv_device_rw) < 0)
4044                                 _exit(EXIT_FAILURE);
4045
4046                         /* Turn directory into bind mount */
4047                         if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REC, NULL) < 0) {
4048                                 log_error_errno(errno, "Failed to make bind mount: %m");
4049                                 _exit(EXIT_FAILURE);
4050                         }
4051
4052                         r = setup_volatile(arg_directory);
4053                         if (r < 0)
4054                                 _exit(EXIT_FAILURE);
4055
4056                         if (setup_volatile_state(arg_directory) < 0)
4057                                 _exit(EXIT_FAILURE);
4058
4059                         r = base_filesystem_create(arg_directory);
4060                         if (r < 0)
4061                                 _exit(EXIT_FAILURE);
4062
4063                         if (arg_read_only) {
4064                                 r = bind_remount_recursive(arg_directory, true);
4065                                 if (r < 0) {
4066                                         log_error_errno(r, "Failed to make tree read-only: %m");
4067                                         _exit(EXIT_FAILURE);
4068                                 }
4069                         }
4070
4071                         if (mount_all(arg_directory) < 0)
4072                                 _exit(EXIT_FAILURE);
4073
4074                         if (copy_devnodes(arg_directory) < 0)
4075                                 _exit(EXIT_FAILURE);
4076
4077                         if (setup_ptmx(arg_directory) < 0)
4078                                 _exit(EXIT_FAILURE);
4079
4080                         dev_setup(arg_directory);
4081
4082                         if (setup_propagate(arg_directory) < 0)
4083                                 _exit(EXIT_FAILURE);
4084
4085                         if (setup_seccomp() < 0)
4086                                 _exit(EXIT_FAILURE);
4087
4088                         if (setup_dev_console(arg_directory, console) < 0)
4089                                 _exit(EXIT_FAILURE);
4090
4091                         if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
4092                                 _exit(EXIT_FAILURE);
4093                         kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
4094
4095                         if (send_rtnl(rtnl_socket_pair[1]) < 0)
4096                                 _exit(EXIT_FAILURE);
4097                         rtnl_socket_pair[1] = safe_close(rtnl_socket_pair[1]);
4098
4099                         /* Tell the parent that we are ready, and that
4100                          * it can cgroupify us so that we lack access
4101                          * to certain devices and resources. */
4102                         (void) barrier_place(&barrier); /* #1 */
4103
4104                         if (setup_boot_id(arg_directory) < 0)
4105                                 _exit(EXIT_FAILURE);
4106
4107                         if (setup_timezone(arg_directory) < 0)
4108                                 _exit(EXIT_FAILURE);
4109
4110                         if (setup_resolv_conf(arg_directory) < 0)
4111                                 _exit(EXIT_FAILURE);
4112
4113                         if (setup_journal(arg_directory) < 0)
4114                                 _exit(EXIT_FAILURE);
4115
4116                         if (mount_binds(arg_directory, arg_bind, false) < 0)
4117                                 _exit(EXIT_FAILURE);
4118
4119                         if (mount_binds(arg_directory, arg_bind_ro, true) < 0)
4120                                 _exit(EXIT_FAILURE);
4121
4122                         if (mount_tmpfs(arg_directory) < 0)
4123                                 _exit(EXIT_FAILURE);
4124
4125                         /* Wait until we are cgroup-ified, so that we
4126                          * can mount the right cgroup path writable */
4127                         (void) barrier_place_and_sync(&barrier); /* #2 */
4128
4129                         if (mount_cgroup(arg_directory) < 0)
4130                                 _exit(EXIT_FAILURE);
4131
4132                         if (chdir(arg_directory) < 0) {
4133                                 log_error_errno(errno, "chdir(%s) failed: %m", arg_directory);
4134                                 _exit(EXIT_FAILURE);
4135                         }
4136
4137                         if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) {
4138                                 log_error_errno(errno, "mount(MS_MOVE) failed: %m");
4139                                 _exit(EXIT_FAILURE);
4140                         }
4141
4142                         if (chroot(".") < 0) {
4143                                 log_error_errno(errno, "chroot() failed: %m");
4144                                 _exit(EXIT_FAILURE);
4145                         }
4146
4147                         if (chdir("/") < 0) {
4148                                 log_error_errno(errno, "chdir() failed: %m");
4149                                 _exit(EXIT_FAILURE);
4150                         }
4151
4152                         if (arg_userns) {
4153                                 if (unshare(CLONE_NEWUSER) < 0) {
4154                                         log_error_errno(errno, "unshare(CLONE_NEWUSER) failed: %m");
4155                                         _exit(EXIT_FAILURE);
4156                                 }
4157
4158                                 /* Tell the parent, that it now can
4159                                  * write the UID map. */
4160                                 (void) barrier_place(&barrier); /* #3 */
4161
4162                                 /* Wait until the parent wrote the UID
4163                                  * map */
4164                                 (void) barrier_place_and_sync(&barrier); /* #4 */
4165                         }
4166
4167                         umask(0022);
4168
4169                         if (drop_capabilities() < 0) {
4170                                 log_error_errno(errno, "drop_capabilities() failed: %m");
4171                                 _exit(EXIT_FAILURE);
4172                         }
4173
4174                         setup_hostname();
4175
4176                         if (arg_personality != 0xffffffffLU) {
4177                                 if (personality(arg_personality) < 0) {
4178                                         log_error_errno(errno, "personality() failed: %m");
4179                                         _exit(EXIT_FAILURE);
4180                                 }
4181                         } else if (secondary) {
4182                                 if (personality(PER_LINUX32) < 0) {
4183                                         log_error_errno(errno, "personality() failed: %m");
4184                                         _exit(EXIT_FAILURE);
4185                                 }
4186                         }
4187
4188 #ifdef HAVE_SELINUX
4189                         if (arg_selinux_context)
4190                                 if (setexeccon((security_context_t) arg_selinux_context) < 0) {
4191                                         log_error_errno(errno, "setexeccon(\"%s\") failed: %m", arg_selinux_context);
4192                                         _exit(EXIT_FAILURE);
4193                                 }
4194 #endif
4195
4196                         r = change_uid_gid(&home);
4197                         if (r < 0)
4198                                 _exit(EXIT_FAILURE);
4199
4200                         if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
4201                             (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
4202                             (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
4203                                 log_oom();
4204                                 _exit(EXIT_FAILURE);
4205                         }
4206
4207                         if (!sd_id128_equal(arg_uuid, SD_ID128_NULL)) {
4208                                 char as_uuid[37];
4209
4210                                 if (asprintf((char**)(envp + n_env++), "container_uuid=%s", id128_format_as_uuid(arg_uuid, as_uuid)) < 0) {
4211                                         log_oom();
4212                                         _exit(EXIT_FAILURE);
4213                                 }
4214                         }
4215
4216                         if (fdset_size(fds) > 0) {
4217                                 r = fdset_cloexec(fds, false);
4218                                 if (r < 0) {
4219                                         log_error_errno(r, "Failed to unset O_CLOEXEC for file descriptors.");
4220                                         _exit(EXIT_FAILURE);
4221                                 }
4222
4223                                 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", n_fd_passed) < 0) ||
4224                                     (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0)) {
4225                                         log_oom();
4226                                         _exit(EXIT_FAILURE);
4227                                 }
4228                         }
4229
4230                         if (!strv_isempty(arg_setenv)) {
4231                                 char **n;
4232
4233                                 n = strv_env_merge(2, envp, arg_setenv);
4234                                 if (!n) {
4235                                         log_oom();
4236                                         _exit(EXIT_FAILURE);
4237                                 }
4238
4239                                 env_use = n;
4240                         } else
4241                                 env_use = (char**) envp;
4242
4243                         /* Let the parent know that we are ready and
4244                          * wait until the parent is ready with the
4245                          * setup, too... */
4246                         (void) barrier_place_and_sync(&barrier); /* #5 */
4247
4248                         if (arg_boot) {
4249                                 char **a;
4250                                 size_t l;
4251
4252                                 /* Automatically search for the init system */
4253
4254                                 l = 1 + argc - optind;
4255                                 a = newa(char*, l + 1);
4256                                 memcpy(a + 1, argv + optind, l * sizeof(char*));
4257
4258                                 a[0] = (char*) "/usr/lib/systemd/systemd";
4259                                 execve(a[0], a, env_use);
4260
4261                                 a[0] = (char*) "/lib/systemd/systemd";
4262                                 execve(a[0], a, env_use);
4263
4264                                 a[0] = (char*) "/sbin/init";
4265                                 execve(a[0], a, env_use);
4266                         } else if (argc > optind)
4267                                 execvpe(argv[optind], argv + optind, env_use);
4268                         else {
4269                                 chdir(home ? home : "/root");
4270                                 execle("/bin/bash", "-bash", NULL, env_use);
4271                                 execle("/bin/sh", "-sh", NULL, env_use);
4272                         }
4273
4274                         log_error_errno(errno, "execv() failed: %m");
4275                         _exit(EXIT_FAILURE);
4276                 }
4277
4278                 barrier_set_role(&barrier, BARRIER_PARENT);
4279                 fdset_free(fds);
4280                 fds = NULL;
4281
4282                 kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
4283                 rtnl_socket_pair[1] = safe_close(rtnl_socket_pair[1]);
4284
4285                 (void) barrier_place(&barrier); /* #1 */
4286
4287                 /* Wait for the most basic Child-setup to be done,
4288                  * before we add hardware to it, and place it in a
4289                  * cgroup. */
4290                 if (barrier_sync(&barrier)) { /* #1 */
4291                         int ifi = 0;
4292
4293                         r = move_network_interfaces(pid);
4294                         if (r < 0)
4295                                 goto finish;
4296
4297                         r = setup_veth(pid, veth_name, &ifi);
4298                         if (r < 0)
4299                                 goto finish;
4300
4301                         r = setup_bridge(veth_name, &ifi);
4302                         if (r < 0)
4303                                 goto finish;
4304
4305                         r = setup_macvlan(pid);
4306                         if (r < 0)
4307                                 goto finish;
4308
4309                         r = setup_ipvlan(pid);
4310                         if (r < 0)
4311                                 goto finish;
4312
4313                         r = register_machine(pid, ifi);
4314                         if (r < 0)
4315                                 goto finish;
4316
4317                         /* Notify the child that the parent is ready with all
4318                          * its setup, and that the child can now hand over
4319                          * control to the code to run inside the container. */
4320                         (void) barrier_place(&barrier); /* #2 */
4321
4322                         if (arg_userns) {
4323                                 char uid_map[strlen("/proc//uid_map") + DECIMAL_STR_MAX(uid_t) + 1], line[DECIMAL_STR_MAX(uid_t)*3+3+1];
4324
4325                                 (void) barrier_place_and_sync(&barrier); /* #3 */
4326
4327                                 xsprintf(uid_map, "/proc/" PID_FMT "/uid_map", pid);
4328                                 xsprintf(line, UID_FMT " " UID_FMT " " UID_FMT "\n", 0, arg_uid_shift, arg_uid_range);
4329                                 r = write_string_file(uid_map, line);
4330                                 if (r < 0) {
4331                                         log_error_errno(r, "Failed to write UID map: %m");
4332                                         goto finish;
4333                                 }
4334
4335                                 /* We always assign the same UID and GID ranges */
4336                                 xsprintf(uid_map, "/proc/" PID_FMT "/gid_map", pid);
4337                                 r = write_string_file(uid_map, line);
4338                                 if (r < 0) {
4339                                         log_error_errno(r, "Failed to write GID map: %m");
4340                                         goto finish;
4341                                 }
4342
4343                                 (void) barrier_place(&barrier); /* #4 */
4344                         }
4345
4346                         /* Block SIGCHLD here, before notifying child.
4347                          * process_pty() will handle it with the other signals. */
4348                         r = sigprocmask(SIG_BLOCK, &mask_chld, NULL);
4349                         if (r < 0)
4350                                 goto finish;
4351
4352                         /* Reset signal to default */
4353                         r = default_signals(SIGCHLD, -1);
4354                         if (r < 0)
4355                                 goto finish;
4356
4357                         /* Let the child know that we are ready and wait until the child is completely ready now. */
4358                         if (barrier_place_and_sync(&barrier)) { /* #5 */
4359                                 _cleanup_event_unref_ sd_event *event = NULL;
4360                                 _cleanup_(pty_forward_freep) PTYForward *forward = NULL;
4361                                 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
4362                                 char last_char = 0;
4363
4364                                 sd_notifyf(false,
4365                                            "READY=1\n"
4366                                            "STATUS=Container running.\n"
4367                                            "X_NSPAWN_LEADER_PID=" PID_FMT, pid);
4368
4369                                 r = sd_event_new(&event);
4370                                 if (r < 0) {
4371                                         log_error_errno(r, "Failed to get default event source: %m");
4372                                         goto finish;
4373                                 }
4374
4375                                 if (arg_kill_signal > 0) {
4376                                         /* Try to kill the init system on SIGINT or SIGTERM */
4377                                         sd_event_add_signal(event, NULL, SIGINT, on_orderly_shutdown, UINT32_TO_PTR(pid));
4378                                         sd_event_add_signal(event, NULL, SIGTERM, on_orderly_shutdown, UINT32_TO_PTR(pid));
4379                                 } else {
4380                                         /* Immediately exit */
4381                                         sd_event_add_signal(event, NULL, SIGINT, NULL, NULL);
4382                                         sd_event_add_signal(event, NULL, SIGTERM, NULL, NULL);
4383                                 }
4384
4385                                 /* simply exit on sigchld */
4386                                 sd_event_add_signal(event, NULL, SIGCHLD, NULL, NULL);
4387
4388                                 if (arg_expose_ports) {
4389                                         r = watch_rtnl(event, rtnl_socket_pair[0], &exposed, &rtnl);
4390                                         if (r < 0)
4391                                                 goto finish;
4392
4393                                         (void) expose_ports(rtnl, &exposed);
4394                                 }
4395
4396                                 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
4397
4398                                 r = pty_forward_new(event, master, true, !interactive, &forward);
4399                                 if (r < 0) {
4400                                         log_error_errno(r, "Failed to create PTY forwarder: %m");
4401                                         goto finish;
4402                                 }
4403
4404                                 r = sd_event_loop(event);
4405                                 if (r < 0) {
4406                                         log_error_errno(r, "Failed to run event loop: %m");
4407                                         goto finish;
4408                                 }
4409
4410                                 pty_forward_get_last_char(forward, &last_char);
4411
4412                                 forward = pty_forward_free(forward);
4413
4414                                 if (!arg_quiet && last_char != '\n')
4415                                         putc('\n', stdout);
4416
4417                                 /* Kill if it is not dead yet anyway */
4418                                 terminate_machine(pid);
4419                         }
4420                 }
4421
4422                 /* Normally redundant, but better safe than sorry */
4423                 kill(pid, SIGKILL);
4424
4425                 r = wait_for_container(pid, &container_status);
4426                 pid = 0;
4427
4428                 if (r < 0)
4429                         /* We failed to wait for the container, or the
4430                          * container exited abnormally */
4431                         goto finish;
4432                 else if (r > 0 || container_status == CONTAINER_TERMINATED){
4433                         /* The container exited with a non-zero
4434                          * status, or with zero status and no reboot
4435                          * was requested. */
4436                         ret = r;
4437                         break;
4438                 }
4439
4440                 /* CONTAINER_REBOOTED, loop again */
4441
4442                 if (arg_keep_unit) {
4443                         /* Special handling if we are running as a
4444                          * service: instead of simply restarting the
4445                          * machine we want to restart the entire
4446                          * service, so let's inform systemd about this
4447                          * with the special exit code 133. The service
4448                          * file uses RestartForceExitStatus=133 so
4449                          * that this results in a full nspawn
4450                          * restart. This is necessary since we might
4451                          * have cgroup parameters set we want to have
4452                          * flushed out. */
4453                         ret = 133;
4454                         r = 0;
4455                         break;
4456                 }
4457
4458                 flush_ports(&exposed);
4459         }
4460
4461 finish:
4462         sd_notify(false,
4463                   "STOPPING=1\n"
4464                   "STATUS=Terminating...");
4465
4466         loop_remove(loop_nr, &image_fd);
4467
4468         if (pid > 0)
4469                 kill(pid, SIGKILL);
4470
4471         if (remove_subvol && arg_directory) {
4472                 int k;
4473
4474                 k = btrfs_subvol_remove(arg_directory);
4475                 if (k < 0)
4476                         log_warning_errno(k, "Cannot remove subvolume '%s', ignoring: %m", arg_directory);
4477         }
4478
4479         if (arg_machine) {
4480                 const char *p;
4481
4482                 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
4483                 (void) rm_rf(p, false, true, false);
4484         }
4485
4486         free(arg_directory);
4487         free(arg_template);
4488         free(arg_image);
4489         free(arg_machine);
4490         free(arg_user);
4491         strv_free(arg_setenv);
4492         strv_free(arg_network_interfaces);
4493         strv_free(arg_network_macvlan);
4494         strv_free(arg_network_ipvlan);
4495         strv_free(arg_bind);
4496         strv_free(arg_bind_ro);
4497         strv_free(arg_tmpfs);
4498
4499         flush_ports(&exposed);
4500
4501         while (arg_expose_ports) {
4502                 ExposePort *p = arg_expose_ports;
4503                 LIST_REMOVE(ports, arg_expose_ports, p);
4504                 free(p);
4505         }
4506
4507         return r < 0 ? EXIT_FAILURE : ret;
4508 }