chiark / gitweb /
machined,machinectl: add calls for changing container/VM quotas
[elogind.git] / src / nspawn / nspawn.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4   This file is part of systemd.
5
6   Copyright 2010 Lennart Poettering
7
8   systemd is free software; you can redistribute it and/or modify it
9   under the terms of the GNU Lesser General Public License as published by
10   the Free Software Foundation; either version 2.1 of the License, or
11   (at your option) any later version.
12
13   systemd is distributed in the hope that it will be useful, but
14   WITHOUT ANY WARRANTY; without even the implied warranty of
15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16   Lesser General Public License for more details.
17
18   You should have received a copy of the GNU Lesser General Public License
19   along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <signal.h>
23 #include <sched.h>
24 #include <unistd.h>
25 #include <sys/types.h>
26 #include <sys/mount.h>
27 #include <stdlib.h>
28 #include <string.h>
29 #include <stdio.h>
30 #include <errno.h>
31 #include <sys/prctl.h>
32 #include <getopt.h>
33 #include <grp.h>
34 #include <linux/fs.h>
35 #include <sys/socket.h>
36 #include <linux/netlink.h>
37 #include <net/if.h>
38 #include <linux/veth.h>
39 #include <sys/personality.h>
40 #include <linux/loop.h>
41 #include <sys/file.h>
42
43 #ifdef HAVE_SELINUX
44 #include <selinux/selinux.h>
45 #endif
46
47 #ifdef HAVE_SECCOMP
48 #include <seccomp.h>
49 #endif
50
51 #ifdef HAVE_BLKID
52 #include <blkid/blkid.h>
53 #endif
54
55 #include "sd-daemon.h"
56 #include "sd-bus.h"
57 #include "sd-id128.h"
58 #include "sd-rtnl.h"
59 #include "log.h"
60 #include "util.h"
61 #include "mkdir.h"
62 #include "macro.h"
63 #include "missing.h"
64 #include "cgroup-util.h"
65 #include "strv.h"
66 #include "path-util.h"
67 #include "loopback-setup.h"
68 #include "dev-setup.h"
69 #include "fdset.h"
70 #include "build.h"
71 #include "fileio.h"
72 #include "bus-util.h"
73 #include "bus-error.h"
74 #include "ptyfwd.h"
75 #include "env-util.h"
76 #include "rtnl-util.h"
77 #include "udev-util.h"
78 #include "blkid-util.h"
79 #include "gpt.h"
80 #include "siphash24.h"
81 #include "copy.h"
82 #include "base-filesystem.h"
83 #include "barrier.h"
84 #include "event-util.h"
85 #include "capability.h"
86 #include "cap-list.h"
87 #include "btrfs-util.h"
88 #include "machine-image.h"
89 #include "list.h"
90 #include "in-addr-util.h"
91 #include "fw-util.h"
92 #include "local-addresses.h"
93
94 #ifdef HAVE_SECCOMP
95 #include "seccomp-util.h"
96 #endif
97
/* One port forwarding request. parse_argv() allocates an entry for every
 * -p/--port=[PROTOCOL:]HOSTPORT[:CONTAINERPORT] option and prepends it to
 * the arg_expose_ports list. */
98 typedef struct ExposePort {
        /* IPPROTO_TCP or IPPROTO_UDP; parse_argv() picks TCP when no protocol prefix is given */
99         int protocol;
        /* Port on the host side; equals container_port when only one port was specified */
100         uint16_t host_port;
        /* Port on the container side */
101         uint16_t container_port;
        /* List linkage, used with LIST_PREPEND()/LIST_FOREACH() under the name "ports" */
102         LIST_FIELDS(struct ExposePort, ports);
103 } ExposePort;
104
/* NOTE(review): presumably describes how a container run ended (plain
 * termination vs. a reboot request); the code using it is outside the
 * visible part of this file, confirm there. */
105 typedef enum ContainerStatus {
106         CONTAINER_TERMINATED,
107         CONTAINER_REBOOTED
108 } ContainerStatus;
109
/* Modes selectable with --link-journal=MODE ("Link up guest journal" in
 * help()). The "try-guest"/"try-host" spellings map to LINK_GUEST/LINK_HOST
 * as well; the "try" part is tracked separately in arg_link_journal_try. */
110 typedef enum LinkJournal {
        /* --link-journal=no */
111         LINK_NO,
        /* --link-journal=auto, also the built-in default of arg_link_journal */
112         LINK_AUTO,
        /* --link-journal=host and try-host */
113         LINK_HOST,
        /* --link-journal=guest, try-guest and -j */
114         LINK_GUEST
115 } LinkJournal;
116
/* Modes selectable with --volatile[=MODE]. */
117 typedef enum Volatile {
        /* Default, and the result of a false boolean argument */
118         VOLATILE_NO,
        /* --volatile without argument, or with a true boolean argument */
119         VOLATILE_YES,
        /* --volatile=state */
120         VOLATILE_STATE,
121 } Volatile;
122
/* Run-time settings of this program. The initializers are the built-in
 * defaults; most of these variables are overwritten by parse_argv() according
 * to the command line options listed in help(). */
123 static char *arg_directory = NULL;
124 static char *arg_template = NULL;
125 static char *arg_user = NULL;
126 static sd_id128_t arg_uuid = {};
127 static char *arg_machine = NULL;
/* The const char* settings below are pointed straight at optarg by
 * parse_argv(), i.e. they are not separately allocated copies. */
128 static const char *arg_selinux_context = NULL;
129 static const char *arg_selinux_apifs_context = NULL;
130 static const char *arg_slice = NULL;
131 static bool arg_private_network = false;
132 static bool arg_read_only = false;
133 static bool arg_boot = false;
134 static bool arg_ephemeral = false;
135 static LinkJournal arg_link_journal = LINK_AUTO;
136 static bool arg_link_journal_try = false;
/* Capability bit mask with one bit per CAP_* number. help() describes
 * --capability= and --drop-capability= as modifying "the default" set;
 * presumably this initializer is that default set -- confirm where the
 * plus/minus masks collected by parse_argv() are applied. */
137 static uint64_t arg_retain =
138         (1ULL << CAP_CHOWN) |
139         (1ULL << CAP_DAC_OVERRIDE) |
140         (1ULL << CAP_DAC_READ_SEARCH) |
141         (1ULL << CAP_FOWNER) |
142         (1ULL << CAP_FSETID) |
143         (1ULL << CAP_IPC_OWNER) |
144         (1ULL << CAP_KILL) |
145         (1ULL << CAP_LEASE) |
146         (1ULL << CAP_LINUX_IMMUTABLE) |
147         (1ULL << CAP_NET_BIND_SERVICE) |
148         (1ULL << CAP_NET_BROADCAST) |
149         (1ULL << CAP_NET_RAW) |
150         (1ULL << CAP_SETGID) |
151         (1ULL << CAP_SETFCAP) |
152         (1ULL << CAP_SETPCAP) |
153         (1ULL << CAP_SETUID) |
154         (1ULL << CAP_SYS_ADMIN) |
155         (1ULL << CAP_SYS_CHROOT) |
156         (1ULL << CAP_SYS_NICE) |
157         (1ULL << CAP_SYS_PTRACE) |
158         (1ULL << CAP_SYS_TTY_CONFIG) |
159         (1ULL << CAP_SYS_RESOURCE) |
160         (1ULL << CAP_SYS_BOOT) |
161         (1ULL << CAP_AUDIT_WRITE) |
162         (1ULL << CAP_AUDIT_CONTROL) |
163         (1ULL << CAP_MKNOD);
/* arg_bind, arg_bind_ro and arg_tmpfs are flat string lists holding pairs:
 * parse_argv() appends two consecutive entries per option (the two paths of
 * a bind mount, or the path and the mount options of a tmpfs). */
164 static char **arg_bind = NULL;
165 static char **arg_bind_ro = NULL;
166 static char **arg_tmpfs = NULL;
167 static char **arg_setenv = NULL;
168 static bool arg_quiet = false;
169 static bool arg_share_system = false;
170 static bool arg_register = true;
171 static bool arg_keep_unit = false;
172 static char **arg_network_interfaces = NULL;
173 static char **arg_network_macvlan = NULL;
174 static char **arg_network_ipvlan = NULL;
175 static bool arg_network_veth = false;
176 static const char *arg_network_bridge = NULL;
/* 0xffffffffLU is the same value parse_argv() rejects as an unknown
 * personality, so the default presumably means "no personality requested". */
177 static unsigned long arg_personality = 0xffffffffLU;
178 static char *arg_image = NULL;
179 static Volatile arg_volatile = VOLATILE_NO;
/* Head of the list of ExposePort entries collected from -p/--port= */
180 static ExposePort *arg_expose_ports = NULL;
181 static char **arg_property = NULL;
/* Filled from --private-users[=UIDBASE[:NUIDS]]: UIDBASE goes to
 * arg_uid_shift, NUIDS to arg_uid_range; arg_userns records that the option
 * was given at all. */
182 static uid_t arg_uid_shift = UID_INVALID, arg_uid_range = 0x10000U;
183 static bool arg_userns = false;
184
185 static void help(void) {
186         printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
187                "Spawn a minimal namespace container for debugging, testing and building.\n\n"
188                "  -h --help                 Show this help\n"
189                "     --version              Print version string\n"
190                "  -q --quiet                Do not show status information\n"
191                "  -D --directory=PATH       Root directory for the container\n"
192                "     --template=PATH        Initialize root directory from template directory,\n"
193                "                            if missing\n"
194                "  -x --ephemeral            Run container with snapshot of root directory, and\n"
195                "                            remove it after exit\n"
196                "  -i --image=PATH           File system device or disk image for the container\n"
197                "  -b --boot                 Boot up full system (i.e. invoke init)\n"
198                "  -u --user=USER            Run the command under specified user or uid\n"
199                "  -M --machine=NAME         Set the machine name for the container\n"
200                "     --uuid=UUID            Set a specific machine UUID for the container\n"
201                "  -S --slice=SLICE          Place the container in the specified slice\n"
202                "     --property=NAME=VALUE  Set scope unit property\n"
203                "     --private-network      Disable network in container\n"
204                "     --network-interface=INTERFACE\n"
205                "                            Assign an existing network interface to the\n"
206                "                            container\n"
207                "     --network-macvlan=INTERFACE\n"
208                "                            Create a macvlan network interface based on an\n"
209                "                            existing network interface to the container\n"
210                "     --network-ipvlan=INTERFACE\n"
211                "                            Create a ipvlan network interface based on an\n"
212                "                            existing network interface to the container\n"
213                "  -n --network-veth         Add a virtual ethernet connection between host\n"
214                "                            and container\n"
215                "     --network-bridge=INTERFACE\n"
216                "                            Add a virtual ethernet connection between host\n"
217                "                            and container and add it to an existing bridge on\n"
218                "                            the host\n"
219                "     --private-users[=UIDBASE[:NUIDS]]\n"
220                "                            Run within user namespace\n"
221                "  -p --port=[PROTOCOL:]HOSTPORT[:CONTAINERPORT]\n"
222                "                            Expose a container IP port on the host\n"
223                "  -Z --selinux-context=SECLABEL\n"
224                "                            Set the SELinux security context to be used by\n"
225                "                            processes in the container\n"
226                "  -L --selinux-apifs-context=SECLABEL\n"
227                "                            Set the SELinux security context to be used by\n"
228                "                            API/tmpfs file systems in the container\n"
229                "     --capability=CAP       In addition to the default, retain specified\n"
230                "                            capability\n"
231                "     --drop-capability=CAP  Drop the specified capability from the default set\n"
232                "     --link-journal=MODE    Link up guest journal, one of no, auto, guest, host,\n"
233                "                            try-guest, try-host\n"
234                "  -j                        Equivalent to --link-journal=try-guest\n"
235                "     --read-only            Mount the root directory read-only\n"
236                "     --bind=PATH[:PATH]     Bind mount a file or directory from the host into\n"
237                "                            the container\n"
238                "     --bind-ro=PATH[:PATH]  Similar, but creates a read-only bind mount\n"
239                "     --tmpfs=PATH:[OPTIONS] Mount an empty tmpfs to the specified directory\n"
240                "     --setenv=NAME=VALUE    Pass an environment variable to PID 1\n"
241                "     --share-system         Share system namespaces with host\n"
242                "     --register=BOOLEAN     Register container as machine\n"
243                "     --keep-unit            Do not register a scope for the machine, reuse\n"
244                "                            the service unit nspawn is running in\n"
245                "     --volatile[=MODE]      Run the system in volatile mode\n"
246                , program_invocation_short_name);
247 }
248
/* Replaces the string stored in *b with a cleaned-up form of 'path'.
 *
 * The path is first resolved with canonicalize_file_name(). If that fails
 * because the path does not exist (ENOENT), path_make_absolute_cwd() is used
 * on it instead; any other resolution failure is passed back to the caller.
 * The resulting string is run through path_kill_slashes() before it is
 * stored, and the previous value of *b is freed.
 *
 * Returns 0 on success, -ENOMEM if the fallback could not produce a string,
 * or the negated errno left behind by canonicalize_file_name(). On failure
 * *b is left untouched. */
static int set_sanitized_path(char **b, const char *path) {
        char *resolved;

        assert(b);
        assert(path);

        resolved = canonicalize_file_name(path);

        /* Only a missing path is tolerated, everything else is fatal */
        if (resolved == NULL && errno != ENOENT)
                return -errno;

        if (resolved == NULL) {
                resolved = path_make_absolute_cwd(path);
                if (resolved == NULL)
                        return -ENOMEM;
        }

        /* Drop the old value only once the new one is known to exist */
        free(*b);
        *b = path_kill_slashes(resolved);

        return 0;
}
269
270 static int parse_argv(int argc, char *argv[]) {
271
272         enum {
273                 ARG_VERSION = 0x100,
274                 ARG_PRIVATE_NETWORK,
275                 ARG_UUID,
276                 ARG_READ_ONLY,
277                 ARG_CAPABILITY,
278                 ARG_DROP_CAPABILITY,
279                 ARG_LINK_JOURNAL,
280                 ARG_BIND,
281                 ARG_BIND_RO,
282                 ARG_TMPFS,
283                 ARG_SETENV,
284                 ARG_SHARE_SYSTEM,
285                 ARG_REGISTER,
286                 ARG_KEEP_UNIT,
287                 ARG_NETWORK_INTERFACE,
288                 ARG_NETWORK_MACVLAN,
289                 ARG_NETWORK_IPVLAN,
290                 ARG_NETWORK_BRIDGE,
291                 ARG_PERSONALITY,
292                 ARG_VOLATILE,
293                 ARG_TEMPLATE,
294                 ARG_PROPERTY,
295                 ARG_PRIVATE_USERS,
296         };
297
298         static const struct option options[] = {
299                 { "help",                  no_argument,       NULL, 'h'                   },
300                 { "version",               no_argument,       NULL, ARG_VERSION           },
301                 { "directory",             required_argument, NULL, 'D'                   },
302                 { "template",              required_argument, NULL, ARG_TEMPLATE          },
303                 { "ephemeral",             no_argument,       NULL, 'x'                   },
304                 { "user",                  required_argument, NULL, 'u'                   },
305                 { "private-network",       no_argument,       NULL, ARG_PRIVATE_NETWORK   },
306                 { "boot",                  no_argument,       NULL, 'b'                   },
307                 { "uuid",                  required_argument, NULL, ARG_UUID              },
308                 { "read-only",             no_argument,       NULL, ARG_READ_ONLY         },
309                 { "capability",            required_argument, NULL, ARG_CAPABILITY        },
310                 { "drop-capability",       required_argument, NULL, ARG_DROP_CAPABILITY   },
311                 { "link-journal",          required_argument, NULL, ARG_LINK_JOURNAL      },
312                 { "bind",                  required_argument, NULL, ARG_BIND              },
313                 { "bind-ro",               required_argument, NULL, ARG_BIND_RO           },
314                 { "tmpfs",                 required_argument, NULL, ARG_TMPFS             },
315                 { "machine",               required_argument, NULL, 'M'                   },
316                 { "slice",                 required_argument, NULL, 'S'                   },
317                 { "setenv",                required_argument, NULL, ARG_SETENV            },
318                 { "selinux-context",       required_argument, NULL, 'Z'                   },
319                 { "selinux-apifs-context", required_argument, NULL, 'L'                   },
320                 { "quiet",                 no_argument,       NULL, 'q'                   },
321                 { "share-system",          no_argument,       NULL, ARG_SHARE_SYSTEM      },
322                 { "register",              required_argument, NULL, ARG_REGISTER          },
323                 { "keep-unit",             no_argument,       NULL, ARG_KEEP_UNIT         },
324                 { "network-interface",     required_argument, NULL, ARG_NETWORK_INTERFACE },
325                 { "network-macvlan",       required_argument, NULL, ARG_NETWORK_MACVLAN   },
326                 { "network-ipvlan",        required_argument, NULL, ARG_NETWORK_IPVLAN    },
327                 { "network-veth",          no_argument,       NULL, 'n'                   },
328                 { "network-bridge",        required_argument, NULL, ARG_NETWORK_BRIDGE    },
329                 { "personality",           required_argument, NULL, ARG_PERSONALITY       },
330                 { "image",                 required_argument, NULL, 'i'                   },
331                 { "volatile",              optional_argument, NULL, ARG_VOLATILE          },
332                 { "port",                  required_argument, NULL, 'p'                   },
333                 { "property",              required_argument, NULL, ARG_PROPERTY          },
334                 { "private-users",         optional_argument, NULL, ARG_PRIVATE_USERS     },
335                 {}
336         };
337
338         int c, r;
339         uint64_t plus = 0, minus = 0;
340
341         assert(argc >= 0);
342         assert(argv);
343
344         while ((c = getopt_long(argc, argv, "+hD:u:bL:M:jS:Z:qi:xp:n", options, NULL)) >= 0)
345
346                 switch (c) {
347
348                 case 'h':
349                         help();
350                         return 0;
351
352                 case ARG_VERSION:
353                         puts(PACKAGE_STRING);
354                         puts(SYSTEMD_FEATURES);
355                         return 0;
356
357                 case 'D':
358                         r = set_sanitized_path(&arg_directory, optarg);
359                         if (r < 0)
360                                 return log_error_errno(r, "Invalid root directory: %m");
361
362                         break;
363
364                 case ARG_TEMPLATE:
365                         r = set_sanitized_path(&arg_template, optarg);
366                         if (r < 0)
367                                 return log_error_errno(r, "Invalid template directory: %m");
368
369                         break;
370
371                 case 'i':
372                         r = set_sanitized_path(&arg_image, optarg);
373                         if (r < 0)
374                                 return log_error_errno(r, "Invalid image path: %m");
375
376                         break;
377
378                 case 'x':
379                         arg_ephemeral = true;
380                         break;
381
382                 case 'u':
383                         free(arg_user);
384                         arg_user = strdup(optarg);
385                         if (!arg_user)
386                                 return log_oom();
387
388                         break;
389
390                 case ARG_NETWORK_BRIDGE:
391                         arg_network_bridge = optarg;
392
393                         /* fall through */
394
395                 case 'n':
396                         arg_network_veth = true;
397                         arg_private_network = true;
398                         break;
399
400                 case ARG_NETWORK_INTERFACE:
401                         if (strv_extend(&arg_network_interfaces, optarg) < 0)
402                                 return log_oom();
403
404                         arg_private_network = true;
405                         break;
406
407                 case ARG_NETWORK_MACVLAN:
408                         if (strv_extend(&arg_network_macvlan, optarg) < 0)
409                                 return log_oom();
410
411                         arg_private_network = true;
412                         break;
413
414                 case ARG_NETWORK_IPVLAN:
415                         if (strv_extend(&arg_network_ipvlan, optarg) < 0)
416                                 return log_oom();
417
418                         /* fall through */
419
420                 case ARG_PRIVATE_NETWORK:
421                         arg_private_network = true;
422                         break;
423
424                 case 'b':
425                         arg_boot = true;
426                         break;
427
428                 case ARG_UUID:
429                         r = sd_id128_from_string(optarg, &arg_uuid);
430                         if (r < 0) {
431                                 log_error("Invalid UUID: %s", optarg);
432                                 return r;
433                         }
434                         break;
435
436                 case 'S':
437                         arg_slice = optarg;
438                         break;
439
440                 case 'M':
441                         if (isempty(optarg)) {
442                                 free(arg_machine);
443                                 arg_machine = NULL;
444                         } else {
445                                 if (!machine_name_is_valid(optarg)) {
446                                         log_error("Invalid machine name: %s", optarg);
447                                         return -EINVAL;
448                                 }
449
450                                 r = free_and_strdup(&arg_machine, optarg);
451                                 if (r < 0)
452                                         return log_oom();
453
454                                 break;
455                         }
456
457                 case 'Z':
458                         arg_selinux_context = optarg;
459                         break;
460
461                 case 'L':
462                         arg_selinux_apifs_context = optarg;
463                         break;
464
465                 case ARG_READ_ONLY:
466                         arg_read_only = true;
467                         break;
468
469                 case ARG_CAPABILITY:
470                 case ARG_DROP_CAPABILITY: {
471                         const char *state, *word;
472                         size_t length;
473
474                         FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
475                                 _cleanup_free_ char *t;
476
477                                 t = strndup(word, length);
478                                 if (!t)
479                                         return log_oom();
480
481                                 if (streq(t, "all")) {
482                                         if (c == ARG_CAPABILITY)
483                                                 plus = (uint64_t) -1;
484                                         else
485                                                 minus = (uint64_t) -1;
486                                 } else {
487                                         int cap;
488
489                                         cap = capability_from_name(t);
490                                         if (cap < 0) {
491                                                 log_error("Failed to parse capability %s.", t);
492                                                 return -EINVAL;
493                                         }
494
495                                         if (c == ARG_CAPABILITY)
496                                                 plus |= 1ULL << (uint64_t) cap;
497                                         else
498                                                 minus |= 1ULL << (uint64_t) cap;
499                                 }
500                         }
501
502                         break;
503                 }
504
505                 case 'j':
506                         arg_link_journal = LINK_GUEST;
507                         arg_link_journal_try = true;
508                         break;
509
510                 case ARG_LINK_JOURNAL:
511                         if (streq(optarg, "auto")) {
512                                 arg_link_journal = LINK_AUTO;
513                                 arg_link_journal_try = false;
514                         } else if (streq(optarg, "no")) {
515                                 arg_link_journal = LINK_NO;
516                                 arg_link_journal_try = false;
517                         } else if (streq(optarg, "guest")) {
518                                 arg_link_journal = LINK_GUEST;
519                                 arg_link_journal_try = false;
520                         } else if (streq(optarg, "host")) {
521                                 arg_link_journal = LINK_HOST;
522                                 arg_link_journal_try = false;
523                         } else if (streq(optarg, "try-guest")) {
524                                 arg_link_journal = LINK_GUEST;
525                                 arg_link_journal_try = true;
526                         } else if (streq(optarg, "try-host")) {
527                                 arg_link_journal = LINK_HOST;
528                                 arg_link_journal_try = true;
529                         } else {
530                                 log_error("Failed to parse link journal mode %s", optarg);
531                                 return -EINVAL;
532                         }
533
534                         break;
535
536                 case ARG_BIND:
537                 case ARG_BIND_RO: {
538                         _cleanup_free_ char *a = NULL, *b = NULL;
539                         char *e;
540                         char ***x;
541
542                         x = c == ARG_BIND ? &arg_bind : &arg_bind_ro;
543
544                         e = strchr(optarg, ':');
545                         if (e) {
546                                 a = strndup(optarg, e - optarg);
547                                 b = strdup(e + 1);
548                         } else {
549                                 a = strdup(optarg);
550                                 b = strdup(optarg);
551                         }
552
553                         if (!a || !b)
554                                 return log_oom();
555
556                         if (!path_is_absolute(a) || !path_is_absolute(b)) {
557                                 log_error("Invalid bind mount specification: %s", optarg);
558                                 return -EINVAL;
559                         }
560
561                         r = strv_extend(x, a);
562                         if (r < 0)
563                                 return log_oom();
564
565                         r = strv_extend(x, b);
566                         if (r < 0)
567                                 return log_oom();
568
569                         break;
570                 }
571
572                 case ARG_TMPFS: {
573                         _cleanup_free_ char *a = NULL, *b = NULL;
574                         char *e;
575
576                         e = strchr(optarg, ':');
577                         if (e) {
578                                 a = strndup(optarg, e - optarg);
579                                 b = strdup(e + 1);
580                         } else {
581                                 a = strdup(optarg);
582                                 b = strdup("mode=0755");
583                         }
584
585                         if (!a || !b)
586                                 return log_oom();
587
588                         if (!path_is_absolute(a)) {
589                                 log_error("Invalid tmpfs specification: %s", optarg);
590                                 return -EINVAL;
591                         }
592
593                         r = strv_push(&arg_tmpfs, a);
594                         if (r < 0)
595                                 return log_oom();
596
597                         a = NULL;
598
599                         r = strv_push(&arg_tmpfs, b);
600                         if (r < 0)
601                                 return log_oom();
602
603                         b = NULL;
604
605                         break;
606                 }
607
608                 case ARG_SETENV: {
609                         char **n;
610
611                         if (!env_assignment_is_valid(optarg)) {
612                                 log_error("Environment variable assignment '%s' is not valid.", optarg);
613                                 return -EINVAL;
614                         }
615
616                         n = strv_env_set(arg_setenv, optarg);
617                         if (!n)
618                                 return log_oom();
619
620                         strv_free(arg_setenv);
621                         arg_setenv = n;
622                         break;
623                 }
624
625                 case 'q':
626                         arg_quiet = true;
627                         break;
628
629                 case ARG_SHARE_SYSTEM:
630                         arg_share_system = true;
631                         break;
632
633                 case ARG_REGISTER:
634                         r = parse_boolean(optarg);
635                         if (r < 0) {
636                                 log_error("Failed to parse --register= argument: %s", optarg);
637                                 return r;
638                         }
639
640                         arg_register = r;
641                         break;
642
643                 case ARG_KEEP_UNIT:
644                         arg_keep_unit = true;
645                         break;
646
647                 case ARG_PERSONALITY:
648
649                         arg_personality = personality_from_string(optarg);
650                         if (arg_personality == 0xffffffffLU) {
651                                 log_error("Unknown or unsupported personality '%s'.", optarg);
652                                 return -EINVAL;
653                         }
654
655                         break;
656
657                 case ARG_VOLATILE:
658
659                         if (!optarg)
660                                 arg_volatile = VOLATILE_YES;
661                         else {
662                                 r = parse_boolean(optarg);
663                                 if (r < 0) {
664                                         if (streq(optarg, "state"))
665                                                 arg_volatile = VOLATILE_STATE;
666                                         else {
667                                                 log_error("Failed to parse --volatile= argument: %s", optarg);
668                                                 return r;
669                                         }
670                                 } else
671                                         arg_volatile = r ? VOLATILE_YES : VOLATILE_NO;
672                         }
673
674                         break;
675
676                 case 'p': {
677                         const char *split, *e;
678                         uint16_t container_port, host_port;
679                         int protocol;
680                         ExposePort *p;
681
682                         if ((e = startswith(optarg, "tcp:")))
683                                 protocol = IPPROTO_TCP;
684                         else if ((e = startswith(optarg, "udp:")))
685                                 protocol = IPPROTO_UDP;
686                         else {
687                                 e = optarg;
688                                 protocol = IPPROTO_TCP;
689                         }
690
691                         split = strchr(e, ':');
692                         if (split) {
693                                 char v[split - e + 1];
694
695                                 memcpy(v, e, split - e);
696                                 v[split - e] = 0;
697
698                                 r = safe_atou16(v, &host_port);
699                                 if (r < 0 || host_port <= 0) {
700                                         log_error("Failed to parse host port: %s", optarg);
701                                         return -EINVAL;
702                                 }
703
704                                 r = safe_atou16(split + 1, &container_port);
705                         } else {
706                                 r = safe_atou16(e, &container_port);
707                                 host_port = container_port;
708                         }
709
710                         if (r < 0 || container_port <= 0) {
711                                 log_error("Failed to parse host port: %s", optarg);
712                                 return -EINVAL;
713                         }
714
715                         LIST_FOREACH(ports, p, arg_expose_ports) {
716                                 if (p->protocol == protocol && p->host_port == host_port) {
717                                         log_error("Duplicate port specification: %s", optarg);
718                                         return -EINVAL;
719                                 }
720                         }
721
722                         p = new(ExposePort, 1);
723                         if (!p)
724                                 return log_oom();
725
726                         p->protocol = protocol;
727                         p->host_port = host_port;
728                         p->container_port = container_port;
729
730                         LIST_PREPEND(ports, arg_expose_ports, p);
731
732                         break;
733                 }
734
735                 case ARG_PROPERTY:
736                         if (strv_extend(&arg_property, optarg) < 0)
737                                 return log_oom();
738
739                         break;
740
741                 case ARG_PRIVATE_USERS:
742                         if (optarg) {
743                                 _cleanup_free_ char *buffer = NULL;
744                                 const char *range, *shift;
745
746                                 range = strchr(optarg, ':');
747                                 if (range) {
748                                         buffer = strndup(optarg, range - optarg);
749                                         if (!buffer)
750                                                 return log_oom();
751                                         shift = buffer;
752
753                                         range++;
754                                         if (safe_atou32(range, &arg_uid_range) < 0 || arg_uid_range <= 0) {
755                                                 log_error("Failed to parse UID range: %s", range);
756                                                 return -EINVAL;
757                                         }
758                                 } else
759                                         shift = optarg;
760
761                                 if (parse_uid(shift, &arg_uid_shift) < 0) {
762                                         log_error("Failed to parse UID: %s", optarg);
763                                         return -EINVAL;
764                                 }
765                         }
766
767                         arg_userns = true;
768                         break;
769
770                 case '?':
771                         return -EINVAL;
772
773                 default:
774                         assert_not_reached("Unhandled option");
775                 }
776
777         if (arg_share_system)
778                 arg_register = false;
779
780         if (arg_boot && arg_share_system) {
781                 log_error("--boot and --share-system may not be combined.");
782                 return -EINVAL;
783         }
784
785         if (arg_keep_unit && cg_pid_get_owner_uid(0, NULL) >= 0) {
786                 log_error("--keep-unit may not be used when invoked from a user session.");
787                 return -EINVAL;
788         }
789
790         if (arg_directory && arg_image) {
791                 log_error("--directory= and --image= may not be combined.");
792                 return -EINVAL;
793         }
794
795         if (arg_template && arg_image) {
796                 log_error("--template= and --image= may not be combined.");
797                 return -EINVAL;
798         }
799
800         if (arg_template && !(arg_directory || arg_machine)) {
801                 log_error("--template= needs --directory= or --machine=.");
802                 return -EINVAL;
803         }
804
805         if (arg_ephemeral && arg_template) {
806                 log_error("--ephemeral and --template= may not be combined.");
807                 return -EINVAL;
808         }
809
810         if (arg_ephemeral && arg_image) {
811                 log_error("--ephemeral and --image= may not be combined.");
812                 return -EINVAL;
813         }
814
815         if (arg_ephemeral && !IN_SET(arg_link_journal, LINK_NO, LINK_AUTO)) {
816                 log_error("--ephemeral and --link-journal= may not be combined.");
817                 return -EINVAL;
818         }
819
820         if (arg_volatile != VOLATILE_NO && arg_read_only) {
821                 log_error("Cannot combine --read-only with --volatile. Note that --volatile already implies a read-only base hierarchy.");
822                 return -EINVAL;
823         }
824
825         if (arg_expose_ports && !arg_private_network) {
826                 log_error("Cannot use --port= without private networking.");
827                 return -EINVAL;
828         }
829
830         arg_retain = (arg_retain | plus | (arg_private_network ? 1ULL << CAP_NET_ADMIN : 0)) & ~minus;
831
832         return 1;
833 }
834
835 static int mount_all(const char *dest) {
836
837         typedef struct MountPoint {
838                 const char *what;
839                 const char *where;
840                 const char *type;
841                 const char *options;
842                 unsigned long flags;
843                 bool fatal;
844         } MountPoint;
845
846         static const MountPoint mount_table[] = {
847                 { "proc",      "/proc",     "proc",  NULL,        MS_NOSUID|MS_NOEXEC|MS_NODEV,           true  },
848                 { "/proc/sys", "/proc/sys", NULL,    NULL,        MS_BIND,                                true  },   /* Bind mount first */
849                 { NULL,        "/proc/sys", NULL,    NULL,        MS_BIND|MS_RDONLY|MS_REMOUNT,           true  },   /* Then, make it r/o */
850                 { "sysfs",     "/sys",      "sysfs", NULL,        MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true  },
851                 { "tmpfs",     "/dev",      "tmpfs", "mode=755",  MS_NOSUID|MS_STRICTATIME,               true  },
852                 { "devpts",    "/dev/pts",  "devpts","newinstance,ptmxmode=0666,mode=620,gid=" STRINGIFY(TTY_GID), MS_NOSUID|MS_NOEXEC, true },
853                 { "tmpfs",     "/dev/shm",  "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME,      true  },
854                 { "tmpfs",     "/run",      "tmpfs", "mode=755",  MS_NOSUID|MS_NODEV|MS_STRICTATIME,      true  },
855                 { "tmpfs",     "/tmp",      "tmpfs", "mode=1777", MS_STRICTATIME,                         true  },
856 #ifdef HAVE_SELINUX
857                 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND,                              false },  /* Bind mount first */
858                 { NULL,              "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT,         false },  /* Then, make it r/o */
859 #endif
860         };
861
862         unsigned k;
863         int r = 0;
864
865         for (k = 0; k < ELEMENTSOF(mount_table); k++) {
866                 _cleanup_free_ char *where = NULL, *options = NULL;
867                 const char *o;
868                 int t;
869
870                 where = strjoin(dest, "/", mount_table[k].where, NULL);
871                 if (!where)
872                         return log_oom();
873
874                 t = path_is_mount_point(where, true);
875                 if (t < 0) {
876                         log_error_errno(t, "Failed to detect whether %s is a mount point: %m", where);
877
878                         if (r == 0)
879                                 r = t;
880
881                         continue;
882                 }
883
884                 /* Skip this entry if it is not a remount. */
885                 if (mount_table[k].what && t > 0)
886                         continue;
887
888                 t = mkdir_p(where, 0755);
889                 if (t < 0) {
890                         if (mount_table[k].fatal) {
891                                log_error_errno(t, "Failed to create directory %s: %m", where);
892
893                                 if (r == 0)
894                                         r = t;
895                         } else
896                                log_warning_errno(t, "Failed to create directory %s: %m", where);
897
898                         continue;
899                 }
900
901 #ifdef HAVE_SELINUX
902                 if (arg_selinux_apifs_context &&
903                     (streq_ptr(mount_table[k].what, "tmpfs") || streq_ptr(mount_table[k].what, "devpts"))) {
904                         options = strjoin(mount_table[k].options, ",context=\"", arg_selinux_apifs_context, "\"", NULL);
905                         if (!options)
906                                 return log_oom();
907
908                         o = options;
909                 } else
910 #endif
911                         o = mount_table[k].options;
912
913                 if (arg_userns && arg_uid_shift != UID_INVALID && streq_ptr(mount_table[k].type, "tmpfs")) {
914                         char *uid_options = NULL;
915
916                         if (o)
917                                 asprintf(&uid_options, "%s,uid=" UID_FMT ",gid=" UID_FMT, o, arg_uid_shift, arg_uid_shift);
918                         else
919                                 asprintf(&uid_options, "uid=" UID_FMT ",gid=" UID_FMT, arg_uid_shift, arg_uid_shift);
920                         if (!uid_options)
921                                 return log_oom();
922
923                         free(options);
924                         o = options = uid_options;
925                 }
926
927                 if (mount(mount_table[k].what,
928                           where,
929                           mount_table[k].type,
930                           mount_table[k].flags,
931                           o) < 0) {
932
933                         if (mount_table[k].fatal) {
934                                 log_error_errno(errno, "mount(%s) failed: %m", where);
935
936                                 if (r == 0)
937                                         r = -errno;
938                         } else
939                                 log_warning_errno(errno, "mount(%s) failed: %m", where);
940                 }
941         }
942
943         return r;
944 }
945
946 static int mount_binds(const char *dest, char **l, bool ro) {
947         char **x, **y;
948
949         STRV_FOREACH_PAIR(x, y, l) {
950                 _cleanup_free_ char *where = NULL;
951                 struct stat source_st, dest_st;
952                 int r;
953
954                 if (stat(*x, &source_st) < 0)
955                         return log_error_errno(errno, "Failed to stat %s: %m", *x);
956
957                 where = strappend(dest, *y);
958                 if (!where)
959                         return log_oom();
960
961                 r = stat(where, &dest_st);
962                 if (r == 0) {
963                         if (S_ISDIR(source_st.st_mode) && !S_ISDIR(dest_st.st_mode)) {
964                                 log_error("Cannot bind mount directory %s on file %s.", *x, where);
965                                 return -EINVAL;
966                         }
967                         if (!S_ISDIR(source_st.st_mode) && S_ISDIR(dest_st.st_mode)) {
968                                 log_error("Cannot bind mount file %s on directory %s.", *x, where);
969                                 return -EINVAL;
970                         }
971                 } else if (errno == ENOENT) {
972                         r = mkdir_parents_label(where, 0755);
973                         if (r < 0)
974                                 return log_error_errno(r, "Failed to bind mount %s: %m", *x);
975                 } else {
976                         log_error_errno(errno, "Failed to bind mount %s: %m", *x);
977                         return -errno;
978                 }
979
980                 /* Create the mount point. Any non-directory file can be
981                  * mounted on any non-directory file (regular, fifo, socket,
982                  * char, block).
983                  */
984                 if (S_ISDIR(source_st.st_mode)) {
985                         r = mkdir_label(where, 0755);
986                         if (r < 0 && errno != EEXIST)
987                                 return log_error_errno(r, "Failed to create mount point %s: %m", where);
988                 } else {
989                         r = touch(where);
990                         if (r < 0)
991                                 return log_error_errno(r, "Failed to create mount point %s: %m", where);
992                 }
993
994                 if (mount(*x, where, "bind", MS_BIND, NULL) < 0)
995                         return log_error_errno(errno, "mount(%s) failed: %m", where);
996
997                 if (ro) {
998                         r = bind_remount_recursive(where, true);
999                         if (r < 0)
1000                                 return log_error_errno(r, "Read-Only bind mount failed: %m");
1001                 }
1002         }
1003
1004         return 0;
1005 }
1006
1007 static int mount_cgroup_hierarchy(const char *dest, const char *controller, const char *hierarchy, bool read_only) {
1008         char *to;
1009         int r;
1010
1011         to = strjoina(dest, "/sys/fs/cgroup/", hierarchy);
1012
1013         r = path_is_mount_point(to, false);
1014         if (r < 0)
1015                 return log_error_errno(r, "Failed to determine if %s is mounted already: %m", to);
1016         if (r > 0)
1017                 return 0;
1018
1019         mkdir_p(to, 0755);
1020
1021         /* The superblock mount options of the mount point need to be
1022          * identical to the hosts', and hence writable... */
1023         if (mount("cgroup", to, "cgroup", MS_NOSUID|MS_NOEXEC|MS_NODEV, controller) < 0)
1024                 return log_error_errno(errno, "Failed to mount to %s: %m", to);
1025
1026         /* ... hence let's only make the bind mount read-only, not the
1027          * superblock. */
1028         if (read_only) {
1029                 if (mount(NULL, to, NULL, MS_BIND|MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_RDONLY, NULL) < 0)
1030                         return log_error_errno(errno, "Failed to remount %s read-only: %m", to);
1031         }
1032         return 1;
1033 }
1034
1035 static int mount_cgroup(const char *dest) {
1036         _cleanup_set_free_free_ Set *controllers = NULL;
1037         _cleanup_free_ char *own_cgroup_path = NULL;
1038         const char *cgroup_root, *systemd_root, *systemd_own;
1039         int r;
1040
1041         controllers = set_new(&string_hash_ops);
1042         if (!controllers)
1043                 return log_oom();
1044
1045         r = cg_kernel_controllers(controllers);
1046         if (r < 0)
1047                 return log_error_errno(r, "Failed to determine cgroup controllers: %m");
1048
1049         r = cg_pid_get_path(NULL, 0, &own_cgroup_path);
1050         if (r < 0)
1051                 return log_error_errno(r, "Failed to determine our own cgroup path: %m");
1052
1053         cgroup_root = strjoina(dest, "/sys/fs/cgroup");
1054         if (mount("tmpfs", cgroup_root, "tmpfs", MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME, "mode=755") < 0)
1055                 return log_error_errno(errno, "Failed to mount tmpfs to /sys/fs/cgroup: %m");
1056
1057         for (;;) {
1058                 _cleanup_free_ char *controller = NULL, *origin = NULL, *combined = NULL;
1059
1060                 controller = set_steal_first(controllers);
1061                 if (!controller)
1062                         break;
1063
1064                 origin = strappend("/sys/fs/cgroup/", controller);
1065                 if (!origin)
1066                         return log_oom();
1067
1068                 r = readlink_malloc(origin, &combined);
1069                 if (r == -EINVAL) {
1070                         /* Not a symbolic link, but directly a single cgroup hierarchy */
1071
1072                         r = mount_cgroup_hierarchy(dest, controller, controller, true);
1073                         if (r < 0)
1074                                 return r;
1075
1076                 } else if (r < 0)
1077                         return log_error_errno(r, "Failed to read link %s: %m", origin);
1078                 else {
1079                         _cleanup_free_ char *target = NULL;
1080
1081                         target = strjoin(dest, "/sys/fs/cgroup/", controller, NULL);
1082                         if (!target)
1083                                 return log_oom();
1084
1085                         /* A symbolic link, a combination of controllers in one hierarchy */
1086
1087                         if (!filename_is_valid(combined)) {
1088                                 log_warning("Ignoring invalid combined hierarchy %s.", combined);
1089                                 continue;
1090                         }
1091
1092                         r = mount_cgroup_hierarchy(dest, combined, combined, true);
1093                         if (r < 0)
1094                                 return r;
1095
1096                         if (symlink(combined, target) < 0)
1097                                 return log_error_errno(errno, "Failed to create symlink for combined hierarchy: %m");
1098                 }
1099         }
1100
1101         r = mount_cgroup_hierarchy(dest, "name=systemd,xattr", "systemd", false);
1102         if (r < 0)
1103                 return r;
1104
1105         /* Make our own cgroup a (writable) bind mount */
1106         systemd_own = strjoina(dest, "/sys/fs/cgroup/systemd", own_cgroup_path);
1107         if (mount(systemd_own, systemd_own,  NULL, MS_BIND, NULL) < 0)
1108                 return log_error_errno(errno, "Failed to turn %s into a bind mount: %m", own_cgroup_path);
1109
1110         /* And then remount the systemd cgroup root read-only */
1111         systemd_root = strjoina(dest, "/sys/fs/cgroup/systemd");
1112         if (mount(NULL, systemd_root, NULL, MS_BIND|MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_RDONLY, NULL) < 0)
1113                 return log_error_errno(errno, "Failed to mount cgroup root read-only: %m");
1114
1115         if (mount(NULL, cgroup_root, NULL, MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME|MS_RDONLY, "mode=755") < 0)
1116                 return log_error_errno(errno, "Failed to remount %s read-only: %m", cgroup_root);
1117
1118         return 0;
1119 }
1120
1121 static int mount_tmpfs(const char *dest) {
1122         char **i, **o;
1123
1124         STRV_FOREACH_PAIR(i, o, arg_tmpfs) {
1125                 _cleanup_free_ char *where = NULL;
1126                 int r;
1127
1128                 where = strappend(dest, *i);
1129                 if (!where)
1130                         return log_oom();
1131
1132                 r = mkdir_label(where, 0755);
1133                 if (r < 0 && r != -EEXIST)
1134                         return log_error_errno(r, "Creating mount point for tmpfs %s failed: %m", where);
1135
1136                 if (mount("tmpfs", where, "tmpfs", MS_NODEV|MS_STRICTATIME, *o) < 0)
1137                         return log_error_errno(errno, "tmpfs mount to %s failed: %m", where);
1138         }
1139
1140         return 0;
1141 }
1142
1143 static int setup_timezone(const char *dest) {
1144         _cleanup_free_ char *where = NULL, *p = NULL, *q = NULL, *check = NULL, *what = NULL;
1145         char *z, *y;
1146         int r;
1147
1148         assert(dest);
1149
1150         /* Fix the timezone, if possible */
1151         r = readlink_malloc("/etc/localtime", &p);
1152         if (r < 0) {
1153                 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
1154                 return 0;
1155         }
1156
1157         z = path_startswith(p, "../usr/share/zoneinfo/");
1158         if (!z)
1159                 z = path_startswith(p, "/usr/share/zoneinfo/");
1160         if (!z) {
1161                 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
1162                 return 0;
1163         }
1164
1165         where = strappend(dest, "/etc/localtime");
1166         if (!where)
1167                 return log_oom();
1168
1169         r = readlink_malloc(where, &q);
1170         if (r >= 0) {
1171                 y = path_startswith(q, "../usr/share/zoneinfo/");
1172                 if (!y)
1173                         y = path_startswith(q, "/usr/share/zoneinfo/");
1174
1175                 /* Already pointing to the right place? Then do nothing .. */
1176                 if (y && streq(y, z))
1177                         return 0;
1178         }
1179
1180         check = strjoin(dest, "/usr/share/zoneinfo/", z, NULL);
1181         if (!check)
1182                 return log_oom();
1183
1184         if (access(check, F_OK) < 0) {
1185                 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
1186                 return 0;
1187         }
1188
1189         what = strappend("../usr/share/zoneinfo/", z);
1190         if (!what)
1191                 return log_oom();
1192
1193         r = mkdir_parents(where, 0755);
1194         if (r < 0) {
1195                 log_error_errno(r, "Failed to create directory for timezone info %s in container: %m", where);
1196
1197                 return 0;
1198         }
1199
1200         r = unlink(where);
1201         if (r < 0 && errno != ENOENT) {
1202                 log_error_errno(errno, "Failed to remove existing timezone info %s in container: %m", where);
1203
1204                 return 0;
1205         }
1206
1207         if (symlink(what, where) < 0) {
1208                 log_error_errno(errno, "Failed to correct timezone of container: %m");
1209                 return 0;
1210         }
1211
1212         return 0;
1213 }
1214
1215 static int setup_resolv_conf(const char *dest) {
1216         _cleanup_free_ char *where = NULL;
1217         int r;
1218
1219         assert(dest);
1220
1221         if (arg_private_network)
1222                 return 0;
1223
1224         /* Fix resolv.conf, if possible */
1225         where = strappend(dest, "/etc/resolv.conf");
1226         if (!where)
1227                 return log_oom();
1228
1229         /* We don't really care for the results of this really. If it
1230          * fails, it fails, but meh... */
1231         r = mkdir_parents(where, 0755);
1232         if (r < 0) {
1233                 log_warning_errno(r, "Failed to create parent directory for resolv.conf %s: %m", where);
1234
1235                 return 0;
1236         }
1237
1238         r = copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW, 0644, 0);
1239         if (r < 0) {
1240                 log_warning_errno(r, "Failed to copy /etc/resolv.conf to %s: %m", where);
1241
1242                 return 0;
1243         }
1244
1245         return 0;
1246 }
1247
1248 static int setup_volatile_state(const char *directory) {
1249         const char *p;
1250         int r;
1251
1252         assert(directory);
1253
1254         if (arg_volatile != VOLATILE_STATE)
1255                 return 0;
1256
1257         /* --volatile=state means we simply overmount /var
1258            with a tmpfs, and the rest read-only. */
1259
1260         r = bind_remount_recursive(directory, true);
1261         if (r < 0)
1262                 return log_error_errno(r, "Failed to remount %s read-only: %m", directory);
1263
1264         p = strjoina(directory, "/var");
1265         r = mkdir(p, 0755);
1266         if (r < 0 && errno != EEXIST)
1267                 return log_error_errno(errno, "Failed to create %s: %m", directory);
1268
1269         if (mount("tmpfs", p, "tmpfs", MS_STRICTATIME, "mode=755") < 0)
1270                 return log_error_errno(errno, "Failed to mount tmpfs to /var: %m");
1271
1272         return 0;
1273 }
1274
1275 static int setup_volatile(const char *directory) {
1276         bool tmpfs_mounted = false, bind_mounted = false;
1277         char template[] = "/tmp/nspawn-volatile-XXXXXX";
1278         const char *f, *t;
1279         int r;
1280
1281         assert(directory);
1282
1283         if (arg_volatile != VOLATILE_YES)
1284                 return 0;
1285
1286         /* --volatile=yes means we mount a tmpfs to the root dir, and
1287            the original /usr to use inside it, and that read-only. */
1288
1289         if (!mkdtemp(template))
1290                 return log_error_errno(errno, "Failed to create temporary directory: %m");
1291
1292         if (mount("tmpfs", template, "tmpfs", MS_STRICTATIME, "mode=755") < 0) {
1293                 log_error_errno(errno, "Failed to mount tmpfs for root directory: %m");
1294                 r = -errno;
1295                 goto fail;
1296         }
1297
1298         tmpfs_mounted = true;
1299
1300         f = strjoina(directory, "/usr");
1301         t = strjoina(template, "/usr");
1302
1303         r = mkdir(t, 0755);
1304         if (r < 0 && errno != EEXIST) {
1305                 log_error_errno(errno, "Failed to create %s: %m", t);
1306                 r = -errno;
1307                 goto fail;
1308         }
1309
1310         if (mount(f, t, "bind", MS_BIND|MS_REC, NULL) < 0) {
1311                 log_error_errno(errno, "Failed to create /usr bind mount: %m");
1312                 r = -errno;
1313                 goto fail;
1314         }
1315
1316         bind_mounted = true;
1317
1318         r = bind_remount_recursive(t, true);
1319         if (r < 0) {
1320                 log_error_errno(r, "Failed to remount %s read-only: %m", t);
1321                 goto fail;
1322         }
1323
1324         if (mount(template, directory, NULL, MS_MOVE, NULL) < 0) {
1325                 log_error_errno(errno, "Failed to move root mount: %m");
1326                 r = -errno;
1327                 goto fail;
1328         }
1329
1330         rmdir(template);
1331
1332         return 0;
1333
1334 fail:
1335         if (bind_mounted)
1336                 umount(t);
1337         if (tmpfs_mounted)
1338                 umount(template);
1339         rmdir(template);
1340         return r;
1341 }
1342
1343 static char* id128_format_as_uuid(sd_id128_t id, char s[37]) {
1344
1345         snprintf(s, 37,
1346                  "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
1347                  SD_ID128_FORMAT_VAL(id));
1348
1349         return s;
1350 }
1351
1352 static int setup_boot_id(const char *dest) {
1353         _cleanup_free_ char *from = NULL, *to = NULL;
1354         sd_id128_t rnd = {};
1355         char as_uuid[37];
1356         int r;
1357
1358         assert(dest);
1359
1360         if (arg_share_system)
1361                 return 0;
1362
1363         /* Generate a new randomized boot ID, so that each boot-up of
1364          * the container gets a new one */
1365
1366         from = strappend(dest, "/dev/proc-sys-kernel-random-boot-id");
1367         to = strappend(dest, "/proc/sys/kernel/random/boot_id");
1368         if (!from || !to)
1369                 return log_oom();
1370
1371         r = sd_id128_randomize(&rnd);
1372         if (r < 0)
1373                 return log_error_errno(r, "Failed to generate random boot id: %m");
1374
1375         id128_format_as_uuid(rnd, as_uuid);
1376
1377         r = write_string_file(from, as_uuid);
1378         if (r < 0)
1379                 return log_error_errno(r, "Failed to write boot id: %m");
1380
1381         if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
1382                 log_error_errno(errno, "Failed to bind mount boot id: %m");
1383                 r = -errno;
1384         } else if (mount(from, to, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL))
1385                 log_warning_errno(errno, "Failed to make boot id read-only: %m");
1386
1387         unlink(from);
1388         return r;
1389 }
1390
1391 static int copy_devnodes(const char *dest) {
1392
1393         static const char devnodes[] =
1394                 "null\0"
1395                 "zero\0"
1396                 "full\0"
1397                 "random\0"
1398                 "urandom\0"
1399                 "tty\0"
1400                 "net/tun\0";
1401
1402         const char *d;
1403         int r = 0;
1404         _cleanup_umask_ mode_t u;
1405
1406         assert(dest);
1407
1408         u = umask(0000);
1409
1410         NULSTR_FOREACH(d, devnodes) {
1411                 _cleanup_free_ char *from = NULL, *to = NULL;
1412                 struct stat st;
1413
1414                 from = strappend("/dev/", d);
1415                 to = strjoin(dest, "/dev/", d, NULL);
1416                 if (!from || !to)
1417                         return log_oom();
1418
1419                 if (stat(from, &st) < 0) {
1420
1421                         if (errno != ENOENT)
1422                                 return log_error_errno(errno, "Failed to stat %s: %m", from);
1423
1424                 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
1425
1426                         log_error("%s is not a char or block device, cannot copy", from);
1427                         return -EIO;
1428
1429                 } else {
1430                         r = mkdir_parents(to, 0775);
1431                         if (r < 0) {
1432                                 log_error_errno(r, "Failed to create parent directory of %s: %m", to);
1433                                 return -r;
1434                         }
1435
1436                         if (mknod(to, st.st_mode, st.st_rdev) < 0)
1437                                 return log_error_errno(errno, "mknod(%s) failed: %m", to);
1438
1439                         if (arg_userns && arg_uid_shift != UID_INVALID)
1440                                 if (lchown(to, arg_uid_shift, arg_uid_shift) < 0)
1441                                         return log_error_errno(errno, "chown() of device node %s failed: %m", to);
1442                 }
1443         }
1444
1445         return r;
1446 }
1447
1448 static int setup_ptmx(const char *dest) {
1449         _cleanup_free_ char *p = NULL;
1450
1451         p = strappend(dest, "/dev/ptmx");
1452         if (!p)
1453                 return log_oom();
1454
1455         if (symlink("pts/ptmx", p) < 0)
1456                 return log_error_errno(errno, "Failed to create /dev/ptmx symlink: %m");
1457
1458         if (arg_userns && arg_uid_shift != UID_INVALID)
1459                 if (lchown(p, arg_uid_shift, arg_uid_shift) < 0)
1460                         return log_error_errno(errno, "lchown() of symlink %s failed: %m", p);
1461
1462         return 0;
1463 }
1464
1465 static int setup_dev_console(const char *dest, const char *console) {
1466         _cleanup_umask_ mode_t u;
1467         const char *to;
1468         struct stat st;
1469         int r;
1470
1471         assert(dest);
1472         assert(console);
1473
1474         u = umask(0000);
1475
1476         if (stat("/dev/null", &st) < 0)
1477                 return log_error_errno(errno, "Failed to stat /dev/null: %m");
1478
1479         r = chmod_and_chown(console, 0600, 0, 0);
1480         if (r < 0)
1481                 return log_error_errno(r, "Failed to correct access mode for TTY: %m");
1482
1483         /* We need to bind mount the right tty to /dev/console since
1484          * ptys can only exist on pts file systems. To have something
1485          * to bind mount things on we create a device node first, and
1486          * use /dev/null for that since we the cgroups device policy
1487          * allows us to create that freely, while we cannot create
1488          * /dev/console. (Note that the major minor doesn't actually
1489          * matter here, since we mount it over anyway). */
1490
1491         to = strjoina(dest, "/dev/console");
1492         if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0)
1493                 return log_error_errno(errno, "mknod() for /dev/console failed: %m");
1494
1495         if (mount(console, to, "bind", MS_BIND, NULL) < 0)
1496                 return log_error_errno(errno, "Bind mount for /dev/console failed: %m");
1497
1498         return 0;
1499 }
1500
1501 static int setup_kmsg(const char *dest, int kmsg_socket) {
1502         _cleanup_free_ char *from = NULL, *to = NULL;
1503         _cleanup_umask_ mode_t u;
1504         int r, fd, k;
1505         union {
1506                 struct cmsghdr cmsghdr;
1507                 uint8_t buf[CMSG_SPACE(sizeof(int))];
1508         } control = {};
1509         struct msghdr mh = {
1510                 .msg_control = &control,
1511                 .msg_controllen = sizeof(control),
1512         };
1513         struct cmsghdr *cmsg;
1514
1515         assert(dest);
1516         assert(kmsg_socket >= 0);
1517
1518         u = umask(0000);
1519
1520         /* We create the kmsg FIFO as /dev/kmsg, but immediately
1521          * delete it after bind mounting it to /proc/kmsg. While FIFOs
1522          * on the reading side behave very similar to /proc/kmsg,
1523          * their writing side behaves differently from /dev/kmsg in
1524          * that writing blocks when nothing is reading. In order to
1525          * avoid any problems with containers deadlocking due to this
1526          * we simply make /dev/kmsg unavailable to the container. */
1527         if (asprintf(&from, "%s/dev/kmsg", dest) < 0 ||
1528             asprintf(&to, "%s/proc/kmsg", dest) < 0)
1529                 return log_oom();
1530
1531         if (mkfifo(from, 0600) < 0)
1532                 return log_error_errno(errno, "mkfifo() for /dev/kmsg failed: %m");
1533
1534         r = chmod_and_chown(from, 0600, 0, 0);
1535         if (r < 0)
1536                 return log_error_errno(r, "Failed to correct access mode for /dev/kmsg: %m");
1537
1538         if (mount(from, to, "bind", MS_BIND, NULL) < 0)
1539                 return log_error_errno(errno, "Bind mount for /proc/kmsg failed: %m");
1540
1541         fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
1542         if (fd < 0)
1543                 return log_error_errno(errno, "Failed to open fifo: %m");
1544
1545         cmsg = CMSG_FIRSTHDR(&mh);
1546         cmsg->cmsg_level = SOL_SOCKET;
1547         cmsg->cmsg_type = SCM_RIGHTS;
1548         cmsg->cmsg_len = CMSG_LEN(sizeof(int));
1549         memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
1550
1551         mh.msg_controllen = cmsg->cmsg_len;
1552
1553         /* Store away the fd in the socket, so that it stays open as
1554          * long as we run the child */
1555         k = sendmsg(kmsg_socket, &mh, MSG_NOSIGNAL);
1556         safe_close(fd);
1557
1558         if (k < 0)
1559                 return log_error_errno(errno, "Failed to send FIFO fd: %m");
1560
1561         /* And now make the FIFO unavailable as /dev/kmsg... */
1562         unlink(from);
1563         return 0;
1564 }
1565
1566 static int send_rtnl(int send_fd) {
1567         union {
1568                 struct cmsghdr cmsghdr;
1569                 uint8_t buf[CMSG_SPACE(sizeof(int))];
1570         } control = {};
1571         struct msghdr mh = {
1572                 .msg_control = &control,
1573                 .msg_controllen = sizeof(control),
1574         };
1575         struct cmsghdr *cmsg;
1576         _cleanup_close_ int fd = -1;
1577         ssize_t k;
1578
1579         assert(send_fd >= 0);
1580
1581         if (!arg_expose_ports)
1582                 return 0;
1583
1584         fd = socket(PF_NETLINK, SOCK_RAW|SOCK_CLOEXEC|SOCK_NONBLOCK, NETLINK_ROUTE);
1585         if (fd < 0)
1586                 return log_error_errno(errno, "failed to allocate container netlink: %m");
1587
1588         cmsg = CMSG_FIRSTHDR(&mh);
1589         cmsg->cmsg_level = SOL_SOCKET;
1590         cmsg->cmsg_type = SCM_RIGHTS;
1591         cmsg->cmsg_len = CMSG_LEN(sizeof(int));
1592         memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
1593
1594         mh.msg_controllen = cmsg->cmsg_len;
1595
1596         /* Store away the fd in the socket, so that it stays open as
1597          * long as we run the child */
1598         k = sendmsg(send_fd, &mh, MSG_NOSIGNAL);
1599         if (k < 0)
1600                 return log_error_errno(errno, "Failed to send netlink fd: %m");
1601
1602         return 0;
1603 }
1604
1605 static int flush_ports(union in_addr_union *exposed) {
1606         ExposePort *p;
1607         int r, af = AF_INET;
1608
1609         assert(exposed);
1610
1611         if (!arg_expose_ports)
1612                 return 0;
1613
1614         if (in_addr_is_null(af, exposed))
1615                 return 0;
1616
1617         log_debug("Lost IP address.");
1618
1619         LIST_FOREACH(ports, p, arg_expose_ports) {
1620                 r = fw_add_local_dnat(false,
1621                                       af,
1622                                       p->protocol,
1623                                       NULL,
1624                                       NULL, 0,
1625                                       NULL, 0,
1626                                       p->host_port,
1627                                       exposed,
1628                                       p->container_port,
1629                                       NULL);
1630                 if (r < 0)
1631                         log_warning_errno(r, "Failed to modify firewall: %m");
1632         }
1633
1634         *exposed = IN_ADDR_NULL;
1635         return 0;
1636 }
1637
1638 static int expose_ports(sd_rtnl *rtnl, union in_addr_union *exposed) {
1639         _cleanup_free_ struct local_address *addresses = NULL;
1640         _cleanup_free_ char *pretty = NULL;
1641         union in_addr_union new_exposed;
1642         ExposePort *p;
1643         bool add;
1644         int af = AF_INET, r;
1645
1646         assert(exposed);
1647
1648         /* Invoked each time an address is added or removed inside the
1649          * container */
1650
1651         if (!arg_expose_ports)
1652                 return 0;
1653
1654         r = local_addresses(rtnl, 0, af, &addresses);
1655         if (r < 0)
1656                 return log_error_errno(r, "Failed to enumerate local addresses: %m");
1657
1658         add = r > 0 &&
1659                 addresses[0].family == af &&
1660                 addresses[0].scope < RT_SCOPE_LINK;
1661
1662         if (!add)
1663                 return flush_ports(exposed);
1664
1665         new_exposed = addresses[0].address;
1666         if (in_addr_equal(af, exposed, &new_exposed))
1667                 return 0;
1668
1669         in_addr_to_string(af, &new_exposed, &pretty);
1670         log_debug("New container IP is %s.", strna(pretty));
1671
1672         LIST_FOREACH(ports, p, arg_expose_ports) {
1673
1674                 r = fw_add_local_dnat(true,
1675                                       af,
1676                                       p->protocol,
1677                                       NULL,
1678                                       NULL, 0,
1679                                       NULL, 0,
1680                                       p->host_port,
1681                                       &new_exposed,
1682                                       p->container_port,
1683                                       in_addr_is_null(af, exposed) ? NULL : exposed);
1684                 if (r < 0)
1685                         log_warning_errno(r, "Failed to modify firewall: %m");
1686         }
1687
1688         *exposed = new_exposed;
1689         return 0;
1690 }
1691
1692 static int on_address_change(sd_rtnl *rtnl, sd_rtnl_message *m, void *userdata) {
1693         union in_addr_union *exposed = userdata;
1694
1695         assert(rtnl);
1696         assert(m);
1697         assert(exposed);
1698
1699         expose_ports(rtnl, exposed);
1700         return 0;
1701 }
1702
1703 static int watch_rtnl(sd_event *event, int recv_fd, union in_addr_union *exposed, sd_rtnl **ret) {
1704         union {
1705                 struct cmsghdr cmsghdr;
1706                 uint8_t buf[CMSG_SPACE(sizeof(int))];
1707         } control = {};
1708         struct msghdr mh = {
1709                 .msg_control = &control,
1710                 .msg_controllen = sizeof(control),
1711         };
1712         struct cmsghdr *cmsg;
1713         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1714         int fd, r;
1715         ssize_t k;
1716
1717         assert(event);
1718         assert(recv_fd >= 0);
1719         assert(ret);
1720
1721         if (!arg_expose_ports)
1722                 return 0;
1723
1724         k = recvmsg(recv_fd, &mh, MSG_NOSIGNAL);
1725         if (k < 0)
1726                 return log_error_errno(errno, "Failed to recv netlink fd: %m");
1727
1728         cmsg = CMSG_FIRSTHDR(&mh);
1729         assert(cmsg->cmsg_level == SOL_SOCKET);
1730         assert(cmsg->cmsg_type == SCM_RIGHTS);
1731         assert(cmsg->cmsg_len == CMSG_LEN(sizeof(int)));
1732         memcpy(&fd, CMSG_DATA(cmsg), sizeof(int));
1733
1734         r = sd_rtnl_open_fd(&rtnl, fd, 1, RTNLGRP_IPV4_IFADDR);
1735         if (r < 0) {
1736                 safe_close(fd);
1737                 return log_error_errno(r, "Failed to create rtnl object: %m");
1738         }
1739
1740         r = sd_rtnl_add_match(rtnl, RTM_NEWADDR, on_address_change, exposed);
1741         if (r < 0)
1742                 return log_error_errno(r, "Failed to subscribe to RTM_NEWADDR messages: %m");
1743
1744         r = sd_rtnl_add_match(rtnl, RTM_DELADDR, on_address_change, exposed);
1745         if (r < 0)
1746                 return log_error_errno(r, "Failed to subscribe to RTM_DELADDR messages: %m");
1747
1748         r = sd_rtnl_attach_event(rtnl, event, 0);
1749         if (r < 0)
1750                 return log_error_errno(r, "Failed to add to even loop: %m");
1751
1752         *ret = rtnl;
1753         rtnl = NULL;
1754
1755         return 0;
1756 }
1757
1758 static int setup_hostname(void) {
1759
1760         if (arg_share_system)
1761                 return 0;
1762
1763         if (sethostname_idempotent(arg_machine) < 0)
1764                 return -errno;
1765
1766         return 0;
1767 }
1768
1769 static int setup_journal(const char *directory) {
1770         sd_id128_t machine_id, this_id;
1771         _cleanup_free_ char *p = NULL, *b = NULL, *q = NULL, *d = NULL;
1772         char *id;
1773         int r;
1774
1775         /* Don't link journals in ephemeral mode */
1776         if (arg_ephemeral)
1777                 return 0;
1778
1779         p = strappend(directory, "/etc/machine-id");
1780         if (!p)
1781                 return log_oom();
1782
1783         r = read_one_line_file(p, &b);
1784         if (r == -ENOENT && arg_link_journal == LINK_AUTO)
1785                 return 0;
1786         else if (r < 0)
1787                 return log_error_errno(r, "Failed to read machine ID from %s: %m", p);
1788
1789         id = strstrip(b);
1790         if (isempty(id) && arg_link_journal == LINK_AUTO)
1791                 return 0;
1792
1793         /* Verify validity */
1794         r = sd_id128_from_string(id, &machine_id);
1795         if (r < 0)
1796                 return log_error_errno(r, "Failed to parse machine ID from %s: %m", p);
1797
1798         r = sd_id128_get_machine(&this_id);
1799         if (r < 0)
1800                 return log_error_errno(r, "Failed to retrieve machine ID: %m");
1801
1802         if (sd_id128_equal(machine_id, this_id)) {
1803                 log_full(arg_link_journal == LINK_AUTO ? LOG_WARNING : LOG_ERR,
1804                          "Host and machine ids are equal (%s): refusing to link journals", id);
1805                 if (arg_link_journal == LINK_AUTO)
1806                         return 0;
1807                 return -EEXIST;
1808         }
1809
1810         if (arg_link_journal == LINK_NO)
1811                 return 0;
1812
1813         free(p);
1814         p = strappend("/var/log/journal/", id);
1815         q = strjoin(directory, "/var/log/journal/", id, NULL);
1816         if (!p || !q)
1817                 return log_oom();
1818
1819         if (path_is_mount_point(p, false) > 0) {
1820                 if (arg_link_journal != LINK_AUTO) {
1821                         log_error("%s: already a mount point, refusing to use for journal", p);
1822                         return -EEXIST;
1823                 }
1824
1825                 return 0;
1826         }
1827
1828         if (path_is_mount_point(q, false) > 0) {
1829                 if (arg_link_journal != LINK_AUTO) {
1830                         log_error("%s: already a mount point, refusing to use for journal", q);
1831                         return -EEXIST;
1832                 }
1833
1834                 return 0;
1835         }
1836
1837         r = readlink_and_make_absolute(p, &d);
1838         if (r >= 0) {
1839                 if ((arg_link_journal == LINK_GUEST ||
1840                      arg_link_journal == LINK_AUTO) &&
1841                     path_equal(d, q)) {
1842
1843                         r = mkdir_p(q, 0755);
1844                         if (r < 0)
1845                                 log_warning_errno(errno, "Failed to create directory %s: %m", q);
1846                         return 0;
1847                 }
1848
1849                 if (unlink(p) < 0)
1850                         return log_error_errno(errno, "Failed to remove symlink %s: %m", p);
1851         } else if (r == -EINVAL) {
1852
1853                 if (arg_link_journal == LINK_GUEST &&
1854                     rmdir(p) < 0) {
1855
1856                         if (errno == ENOTDIR) {
1857                                 log_error("%s already exists and is neither a symlink nor a directory", p);
1858                                 return r;
1859                         } else {
1860                                 log_error_errno(errno, "Failed to remove %s: %m", p);
1861                                 return -errno;
1862                         }
1863                 }
1864         } else if (r != -ENOENT) {
1865                 log_error_errno(errno, "readlink(%s) failed: %m", p);
1866                 return r;
1867         }
1868
1869         if (arg_link_journal == LINK_GUEST) {
1870
1871                 if (symlink(q, p) < 0) {
1872                         if (arg_link_journal_try) {
1873                                 log_debug_errno(errno, "Failed to symlink %s to %s, skipping journal setup: %m", q, p);
1874                                 return 0;
1875                         } else {
1876                                 log_error_errno(errno, "Failed to symlink %s to %s: %m", q, p);
1877                                 return -errno;
1878                         }
1879                 }
1880
1881                 r = mkdir_p(q, 0755);
1882                 if (r < 0)
1883                         log_warning_errno(errno, "Failed to create directory %s: %m", q);
1884                 return 0;
1885         }
1886
1887         if (arg_link_journal == LINK_HOST) {
1888                 /* don't create parents here -- if the host doesn't have
1889                  * permanent journal set up, don't force it here */
1890                 r = mkdir(p, 0755);
1891                 if (r < 0) {
1892                         if (arg_link_journal_try) {
1893                                 log_debug_errno(errno, "Failed to create %s, skipping journal setup: %m", p);
1894                                 return 0;
1895                         } else {
1896                                 log_error_errno(errno, "Failed to create %s: %m", p);
1897                                 return r;
1898                         }
1899                 }
1900
1901         } else if (access(p, F_OK) < 0)
1902                 return 0;
1903
1904         if (dir_is_empty(q) == 0)
1905                 log_warning("%s is not empty, proceeding anyway.", q);
1906
1907         r = mkdir_p(q, 0755);
1908         if (r < 0) {
1909                 log_error_errno(errno, "Failed to create %s: %m", q);
1910                 return r;
1911         }
1912
1913         if (mount(p, q, "bind", MS_BIND, NULL) < 0)
1914                 return log_error_errno(errno, "Failed to bind mount journal from host into guest: %m");
1915
1916         return 0;
1917 }
1918
1919 static int drop_capabilities(void) {
1920         return capability_bounding_set_drop(~arg_retain, false);
1921 }
1922
1923 static int register_machine(pid_t pid, int local_ifindex) {
1924         _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1925         _cleanup_bus_close_unref_ sd_bus *bus = NULL;
1926         int r;
1927
1928         if (!arg_register)
1929                 return 0;
1930
1931         r = sd_bus_default_system(&bus);
1932         if (r < 0)
1933                 return log_error_errno(r, "Failed to open system bus: %m");
1934
1935         if (arg_keep_unit) {
1936                 r = sd_bus_call_method(
1937                                 bus,
1938                                 "org.freedesktop.machine1",
1939                                 "/org/freedesktop/machine1",
1940                                 "org.freedesktop.machine1.Manager",
1941                                 "RegisterMachineWithNetwork",
1942                                 &error,
1943                                 NULL,
1944                                 "sayssusai",
1945                                 arg_machine,
1946                                 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1947                                 "nspawn",
1948                                 "container",
1949                                 (uint32_t) pid,
1950                                 strempty(arg_directory),
1951                                 local_ifindex > 0 ? 1 : 0, local_ifindex);
1952         } else {
1953                 _cleanup_bus_message_unref_ sd_bus_message *m = NULL;
1954                 char **i;
1955
1956                 r = sd_bus_message_new_method_call(
1957                                 bus,
1958                                 &m,
1959                                 "org.freedesktop.machine1",
1960                                 "/org/freedesktop/machine1",
1961                                 "org.freedesktop.machine1.Manager",
1962                                 "CreateMachineWithNetwork");
1963                 if (r < 0)
1964                         return bus_log_create_error(r);
1965
1966                 r = sd_bus_message_append(
1967                                 m,
1968                                 "sayssusai",
1969                                 arg_machine,
1970                                 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1971                                 "nspawn",
1972                                 "container",
1973                                 (uint32_t) pid,
1974                                 strempty(arg_directory),
1975                                 local_ifindex > 0 ? 1 : 0, local_ifindex);
1976                 if (r < 0)
1977                         return bus_log_create_error(r);
1978
1979                 r = sd_bus_message_open_container(m, 'a', "(sv)");
1980                 if (r < 0)
1981                         return bus_log_create_error(r);
1982
1983                 if (!isempty(arg_slice)) {
1984                         r = sd_bus_message_append(m, "(sv)", "Slice", "s", arg_slice);
1985                         if (r < 0)
1986                                 return bus_log_create_error(r);
1987                 }
1988
1989                 r = sd_bus_message_append(m, "(sv)", "DevicePolicy", "s", "strict");
1990                 if (r < 0)
1991                         return bus_log_create_error(r);
1992
1993                 r = sd_bus_message_append(m, "(sv)", "DeviceAllow", "a(ss)", 9,
1994                                           /* Allow the container to
1995                                            * access and create the API
1996                                            * device nodes, so that
1997                                            * PrivateDevices= in the
1998                                            * container can work
1999                                            * fine */
2000                                           "/dev/null", "rwm",
2001                                           "/dev/zero", "rwm",
2002                                           "/dev/full", "rwm",
2003                                           "/dev/random", "rwm",
2004                                           "/dev/urandom", "rwm",
2005                                           "/dev/tty", "rwm",
2006                                           "/dev/net/tun", "rwm",
2007                                           /* Allow the container
2008                                            * access to ptys. However,
2009                                            * do not permit the
2010                                            * container to ever create
2011                                            * these device nodes. */
2012                                           "/dev/pts/ptmx", "rw",
2013                                           "char-pts", "rw");
2014                 if (r < 0)
2015                         return log_error_errno(r, "Failed to add device whitelist: %m");
2016
2017                 STRV_FOREACH(i, arg_property) {
2018                         r = sd_bus_message_open_container(m, 'r', "sv");
2019                         if (r < 0)
2020                                 return bus_log_create_error(r);
2021
2022                         r = bus_append_unit_property_assignment(m, *i);
2023                         if (r < 0)
2024                                 return r;
2025
2026                         r = sd_bus_message_close_container(m);
2027                         if (r < 0)
2028                                 return bus_log_create_error(r);
2029                 }
2030
2031                 r = sd_bus_message_close_container(m);
2032                 if (r < 0)
2033                         return bus_log_create_error(r);
2034
2035                 r = sd_bus_call(bus, m, 0, &error, NULL);
2036         }
2037
2038         if (r < 0) {
2039                 log_error("Failed to register machine: %s", bus_error_message(&error, r));
2040                 return r;
2041         }
2042
2043         return 0;
2044 }
2045
2046 static int terminate_machine(pid_t pid) {
2047         _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
2048         _cleanup_bus_message_unref_ sd_bus_message *reply = NULL;
2049         _cleanup_bus_close_unref_ sd_bus *bus = NULL;
2050         const char *path;
2051         int r;
2052
2053         if (!arg_register)
2054                 return 0;
2055
2056         r = sd_bus_default_system(&bus);
2057         if (r < 0)
2058                 return log_error_errno(r, "Failed to open system bus: %m");
2059
2060         r = sd_bus_call_method(
2061                         bus,
2062                         "org.freedesktop.machine1",
2063                         "/org/freedesktop/machine1",
2064                         "org.freedesktop.machine1.Manager",
2065                         "GetMachineByPID",
2066                         &error,
2067                         &reply,
2068                         "u",
2069                         (uint32_t) pid);
2070         if (r < 0) {
2071                 /* Note that the machine might already have been
2072                  * cleaned up automatically, hence don't consider it a
2073                  * failure if we cannot get the machine object. */
2074                 log_debug("Failed to get machine: %s", bus_error_message(&error, r));
2075                 return 0;
2076         }
2077
2078         r = sd_bus_message_read(reply, "o", &path);
2079         if (r < 0)
2080                 return bus_log_parse_error(r);
2081
2082         r = sd_bus_call_method(
2083                         bus,
2084                         "org.freedesktop.machine1",
2085                         path,
2086                         "org.freedesktop.machine1.Machine",
2087                         "Terminate",
2088                         &error,
2089                         NULL,
2090                         NULL);
2091         if (r < 0) {
2092                 log_debug("Failed to terminate machine: %s", bus_error_message(&error, r));
2093                 return 0;
2094         }
2095
2096         return 0;
2097 }
2098
2099 static int reset_audit_loginuid(void) {
2100         _cleanup_free_ char *p = NULL;
2101         int r;
2102
2103         if (arg_share_system)
2104                 return 0;
2105
2106         r = read_one_line_file("/proc/self/loginuid", &p);
2107         if (r == -ENOENT)
2108                 return 0;
2109         if (r < 0)
2110                 return log_error_errno(r, "Failed to read /proc/self/loginuid: %m");
2111
2112         /* Already reset? */
2113         if (streq(p, "4294967295"))
2114                 return 0;
2115
2116         r = write_string_file("/proc/self/loginuid", "4294967295");
2117         if (r < 0) {
2118                 log_error("Failed to reset audit login UID. This probably means that your kernel is too\n"
2119                           "old and you have audit enabled. Note that the auditing subsystem is known to\n"
2120                           "be incompatible with containers on old kernels. Please make sure to upgrade\n"
2121                           "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
2122                           "using systemd-nspawn. Sleeping for 5s... (%s)\n", strerror(-r));
2123
2124                 sleep(5);
2125         }
2126
2127         return 0;
2128 }
2129
2130 #define HOST_HASH_KEY SD_ID128_MAKE(1a,37,6f,c7,46,ec,45,0b,ad,a3,d5,31,06,60,5d,b1)
2131 #define CONTAINER_HASH_KEY SD_ID128_MAKE(c3,c4,f9,19,b5,57,b2,1c,e6,cf,14,27,03,9c,ee,a2)
2132 #define MACVLAN_HASH_KEY SD_ID128_MAKE(00,13,6d,bc,66,83,44,81,bb,0c,f9,51,1f,24,a6,6f)
2133
2134 static int generate_mac(struct ether_addr *mac, sd_id128_t hash_key, uint64_t idx) {
2135         uint8_t result[8];
2136         size_t l, sz;
2137         uint8_t *v, *i;
2138         int r;
2139
2140         l = strlen(arg_machine);
2141         sz = sizeof(sd_id128_t) + l;
2142         if (idx > 0)
2143                 sz += sizeof(idx);
2144
2145         v = alloca(sz);
2146
2147         /* fetch some persistent data unique to the host */
2148         r = sd_id128_get_machine((sd_id128_t*) v);
2149         if (r < 0)
2150                 return r;
2151
2152         /* combine with some data unique (on this host) to this
2153          * container instance */
2154         i = mempcpy(v + sizeof(sd_id128_t), arg_machine, l);
2155         if (idx > 0) {
2156                 idx = htole64(idx);
2157                 memcpy(i, &idx, sizeof(idx));
2158         }
2159
2160         /* Let's hash the host machine ID plus the container name. We
2161          * use a fixed, but originally randomly created hash key here. */
2162         siphash24(result, v, sz, hash_key.bytes);
2163
2164         assert_cc(ETH_ALEN <= sizeof(result));
2165         memcpy(mac->ether_addr_octet, result, ETH_ALEN);
2166
2167         /* see eth_random_addr in the kernel */
2168         mac->ether_addr_octet[0] &= 0xfe;        /* clear multicast bit */
2169         mac->ether_addr_octet[0] |= 0x02;        /* set local assignment bit (IEEE802) */
2170
2171         return 0;
2172 }
2173
2174 static int setup_veth(pid_t pid, char iface_name[IFNAMSIZ], int *ifi) {
2175         _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
2176         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
2177         struct ether_addr mac_host, mac_container;
2178         int r, i;
2179
2180         if (!arg_private_network)
2181                 return 0;
2182
2183         if (!arg_network_veth)
2184                 return 0;
2185
2186         /* Use two different interface name prefixes depending whether
2187          * we are in bridge mode or not. */
2188         snprintf(iface_name, IFNAMSIZ - 1, "%s-%s",
2189                  arg_network_bridge ? "vb" : "ve", arg_machine);
2190
2191         r = generate_mac(&mac_container, CONTAINER_HASH_KEY, 0);
2192         if (r < 0)
2193                 return log_error_errno(r, "Failed to generate predictable MAC address for container side: %m");
2194
2195         r = generate_mac(&mac_host, HOST_HASH_KEY, 0);
2196         if (r < 0)
2197                 return log_error_errno(r, "Failed to generate predictable MAC address for host side: %m");
2198
2199         r = sd_rtnl_open(&rtnl, 0);
2200         if (r < 0)
2201                 return log_error_errno(r, "Failed to connect to netlink: %m");
2202
2203         r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
2204         if (r < 0)
2205                 return log_error_errno(r, "Failed to allocate netlink message: %m");
2206
2207         r = sd_rtnl_message_append_string(m, IFLA_IFNAME, iface_name);
2208         if (r < 0)
2209                 return log_error_errno(r, "Failed to add netlink interface name: %m");
2210
2211         r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac_host);
2212         if (r < 0)
2213                 return log_error_errno(r, "Failed to add netlink MAC address: %m");
2214
2215         r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
2216         if (r < 0)
2217                 return log_error_errno(r, "Failed to open netlink container: %m");
2218
2219         r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "veth");
2220         if (r < 0)
2221                 return log_error_errno(r, "Failed to open netlink container: %m");
2222
2223         r = sd_rtnl_message_open_container(m, VETH_INFO_PEER);
2224         if (r < 0)
2225                 return log_error_errno(r, "Failed to open netlink container: %m");
2226
2227         r = sd_rtnl_message_append_string(m, IFLA_IFNAME, "host0");
2228         if (r < 0)
2229                 return log_error_errno(r, "Failed to add netlink interface name: %m");
2230
2231         r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac_container);
2232         if (r < 0)
2233                 return log_error_errno(r, "Failed to add netlink MAC address: %m");
2234
2235         r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
2236         if (r < 0)
2237                 return log_error_errno(r, "Failed to add netlink namespace field: %m");
2238
2239         r = sd_rtnl_message_close_container(m);
2240         if (r < 0)
2241                 return log_error_errno(r, "Failed to close netlink container: %m");
2242
2243         r = sd_rtnl_message_close_container(m);
2244         if (r < 0)
2245                 return log_error_errno(r, "Failed to close netlink container: %m");
2246
2247         r = sd_rtnl_message_close_container(m);
2248         if (r < 0)
2249                 return log_error_errno(r, "Failed to close netlink container: %m");
2250
2251         r = sd_rtnl_call(rtnl, m, 0, NULL);
2252         if (r < 0)
2253                 return log_error_errno(r, "Failed to add new veth interfaces: %m");
2254
2255         i = (int) if_nametoindex(iface_name);
2256         if (i <= 0)
2257                 return log_error_errno(errno, "Failed to resolve interface %s: %m", iface_name);
2258
2259         *ifi = i;
2260
2261         return 0;
2262 }
2263
2264 static int setup_bridge(const char veth_name[], int *ifi) {
2265         _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
2266         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
2267         int r, bridge;
2268
2269         if (!arg_private_network)
2270                 return 0;
2271
2272         if (!arg_network_veth)
2273                 return 0;
2274
2275         if (!arg_network_bridge)
2276                 return 0;
2277
2278         bridge = (int) if_nametoindex(arg_network_bridge);
2279         if (bridge <= 0)
2280                 return log_error_errno(errno, "Failed to resolve interface %s: %m", arg_network_bridge);
2281
2282         *ifi = bridge;
2283
2284         r = sd_rtnl_open(&rtnl, 0);
2285         if (r < 0)
2286                 return log_error_errno(r, "Failed to connect to netlink: %m");
2287
2288         r = sd_rtnl_message_new_link(rtnl, &m, RTM_SETLINK, 0);
2289         if (r < 0)
2290                 return log_error_errno(r, "Failed to allocate netlink message: %m");
2291
2292         r = sd_rtnl_message_link_set_flags(m, IFF_UP, IFF_UP);
2293         if (r < 0)
2294                 return log_error_errno(r, "Failed to set IFF_UP flag: %m");
2295
2296         r = sd_rtnl_message_append_string(m, IFLA_IFNAME, veth_name);
2297         if (r < 0)
2298                 return log_error_errno(r, "Failed to add netlink interface name field: %m");
2299
2300         r = sd_rtnl_message_append_u32(m, IFLA_MASTER, bridge);
2301         if (r < 0)
2302                 return log_error_errno(r, "Failed to add netlink master field: %m");
2303
2304         r = sd_rtnl_call(rtnl, m, 0, NULL);
2305         if (r < 0)
2306                 return log_error_errno(r, "Failed to add veth interface to bridge: %m");
2307
2308         return 0;
2309 }
2310
2311 static int parse_interface(struct udev *udev, const char *name) {
2312         _cleanup_udev_device_unref_ struct udev_device *d = NULL;
2313         char ifi_str[2 + DECIMAL_STR_MAX(int)];
2314         int ifi;
2315
2316         ifi = (int) if_nametoindex(name);
2317         if (ifi <= 0)
2318                 return log_error_errno(errno, "Failed to resolve interface %s: %m", name);
2319
2320         sprintf(ifi_str, "n%i", ifi);
2321         d = udev_device_new_from_device_id(udev, ifi_str);
2322         if (!d)
2323                 return log_error_errno(errno, "Failed to get udev device for interface %s: %m", name);
2324
2325         if (udev_device_get_is_initialized(d) <= 0) {
2326                 log_error("Network interface %s is not initialized yet.", name);
2327                 return -EBUSY;
2328         }
2329
2330         return ifi;
2331 }
2332
2333 static int move_network_interfaces(pid_t pid) {
2334         _cleanup_udev_unref_ struct udev *udev = NULL;
2335         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
2336         char **i;
2337         int r;
2338
2339         if (!arg_private_network)
2340                 return 0;
2341
2342         if (strv_isempty(arg_network_interfaces))
2343                 return 0;
2344
2345         r = sd_rtnl_open(&rtnl, 0);
2346         if (r < 0)
2347                 return log_error_errno(r, "Failed to connect to netlink: %m");
2348
2349         udev = udev_new();
2350         if (!udev) {
2351                 log_error("Failed to connect to udev.");
2352                 return -ENOMEM;
2353         }
2354
2355         STRV_FOREACH(i, arg_network_interfaces) {
2356                 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
2357                 int ifi;
2358
2359                 ifi = parse_interface(udev, *i);
2360                 if (ifi < 0)
2361                         return ifi;
2362
2363                 r = sd_rtnl_message_new_link(rtnl, &m, RTM_SETLINK, ifi);
2364                 if (r < 0)
2365                         return log_error_errno(r, "Failed to allocate netlink message: %m");
2366
2367                 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
2368                 if (r < 0)
2369                         return log_error_errno(r, "Failed to append namespace PID to netlink message: %m");
2370
2371                 r = sd_rtnl_call(rtnl, m, 0, NULL);
2372                 if (r < 0)
2373                         return log_error_errno(r, "Failed to move interface %s to namespace: %m", *i);
2374         }
2375
2376         return 0;
2377 }
2378
2379 static int setup_macvlan(pid_t pid) {
2380         _cleanup_udev_unref_ struct udev *udev = NULL;
2381         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
2382         unsigned idx = 0;
2383         char **i;
2384         int r;
2385
2386         if (!arg_private_network)
2387                 return 0;
2388
2389         if (strv_isempty(arg_network_macvlan))
2390                 return 0;
2391
2392         r = sd_rtnl_open(&rtnl, 0);
2393         if (r < 0)
2394                 return log_error_errno(r, "Failed to connect to netlink: %m");
2395
2396         udev = udev_new();
2397         if (!udev) {
2398                 log_error("Failed to connect to udev.");
2399                 return -ENOMEM;
2400         }
2401
2402         STRV_FOREACH(i, arg_network_macvlan) {
2403                 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
2404                 _cleanup_free_ char *n = NULL;
2405                 struct ether_addr mac;
2406                 int ifi;
2407
2408                 ifi = parse_interface(udev, *i);
2409                 if (ifi < 0)
2410                         return ifi;
2411
2412                 r = generate_mac(&mac, MACVLAN_HASH_KEY, idx++);
2413                 if (r < 0)
2414                         return log_error_errno(r, "Failed to create MACVLAN MAC address: %m");
2415
2416                 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
2417                 if (r < 0)
2418                         return log_error_errno(r, "Failed to allocate netlink message: %m");
2419
2420                 r = sd_rtnl_message_append_u32(m, IFLA_LINK, ifi);
2421                 if (r < 0)
2422                         return log_error_errno(r, "Failed to add netlink interface index: %m");
2423
2424                 n = strappend("mv-", *i);
2425                 if (!n)
2426                         return log_oom();
2427
2428                 strshorten(n, IFNAMSIZ-1);
2429
2430                 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, n);
2431                 if (r < 0)
2432                         return log_error_errno(r, "Failed to add netlink interface name: %m");
2433
2434                 r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac);
2435                 if (r < 0)
2436                         return log_error_errno(r, "Failed to add netlink MAC address: %m");
2437
2438                 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
2439                 if (r < 0)
2440                         return log_error_errno(r, "Failed to add netlink namespace field: %m");
2441
2442                 r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
2443                 if (r < 0)
2444                         return log_error_errno(r, "Failed to open netlink container: %m");
2445
2446                 r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "macvlan");
2447                 if (r < 0)
2448                         return log_error_errno(r, "Failed to open netlink container: %m");
2449
2450                 r = sd_rtnl_message_append_u32(m, IFLA_MACVLAN_MODE, MACVLAN_MODE_BRIDGE);
2451                 if (r < 0)
2452                         return log_error_errno(r, "Failed to append macvlan mode: %m");
2453
2454                 r = sd_rtnl_message_close_container(m);
2455                 if (r < 0)
2456                         return log_error_errno(r, "Failed to close netlink container: %m");
2457
2458                 r = sd_rtnl_message_close_container(m);
2459                 if (r < 0)
2460                         return log_error_errno(r, "Failed to close netlink container: %m");
2461
2462                 r = sd_rtnl_call(rtnl, m, 0, NULL);
2463                 if (r < 0)
2464                         return log_error_errno(r, "Failed to add new macvlan interfaces: %m");
2465         }
2466
2467         return 0;
2468 }
2469
2470 static int setup_ipvlan(pid_t pid) {
2471         _cleanup_udev_unref_ struct udev *udev = NULL;
2472         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
2473         char **i;
2474         int r;
2475
2476         if (!arg_private_network)
2477                 return 0;
2478
2479         if (strv_isempty(arg_network_ipvlan))
2480                 return 0;
2481
2482         r = sd_rtnl_open(&rtnl, 0);
2483         if (r < 0)
2484                 return log_error_errno(r, "Failed to connect to netlink: %m");
2485
2486         udev = udev_new();
2487         if (!udev) {
2488                 log_error("Failed to connect to udev.");
2489                 return -ENOMEM;
2490         }
2491
2492         STRV_FOREACH(i, arg_network_ipvlan) {
2493                 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
2494                 _cleanup_free_ char *n = NULL;
2495                 int ifi;
2496
2497                 ifi = parse_interface(udev, *i);
2498                 if (ifi < 0)
2499                         return ifi;
2500
2501                 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
2502                 if (r < 0)
2503                         return log_error_errno(r, "Failed to allocate netlink message: %m");
2504
2505                 r = sd_rtnl_message_append_u32(m, IFLA_LINK, ifi);
2506                 if (r < 0)
2507                         return log_error_errno(r, "Failed to add netlink interface index: %m");
2508
2509                 n = strappend("iv-", *i);
2510                 if (!n)
2511                         return log_oom();
2512
2513                 strshorten(n, IFNAMSIZ-1);
2514
2515                 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, n);
2516                 if (r < 0)
2517                         return log_error_errno(r, "Failed to add netlink interface name: %m");
2518
2519                 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
2520                 if (r < 0)
2521                         return log_error_errno(r, "Failed to add netlink namespace field: %m");
2522
2523                 r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
2524                 if (r < 0)
2525                         return log_error_errno(r, "Failed to open netlink container: %m");
2526
2527                 r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "ipvlan");
2528                 if (r < 0)
2529                         return log_error_errno(r, "Failed to open netlink container: %m");
2530
2531                 r = sd_rtnl_message_append_u16(m, IFLA_IPVLAN_MODE, IPVLAN_MODE_L2);
2532                 if (r < 0)
2533                         return log_error_errno(r, "Failed to add ipvlan mode: %m");
2534
2535                 r = sd_rtnl_message_close_container(m);
2536                 if (r < 0)
2537                         return log_error_errno(r, "Failed to close netlink container: %m");
2538
2539                 r = sd_rtnl_message_close_container(m);
2540                 if (r < 0)
2541                         return log_error_errno(r, "Failed to close netlink container: %m");
2542
2543                 r = sd_rtnl_call(rtnl, m, 0, NULL);
2544                 if (r < 0)
2545                         return log_error_errno(r, "Failed to add new ipvlan interfaces: %m");
2546         }
2547
2548         return 0;
2549 }
2550
#ifdef HAVE_SECCOMP
/* Adds a rule to ctx making each of the n_syscalls entries of syscalls fail
 * with EPERM. A -EFAULT result from seccomp_rule_add() is treated as "unknown
 * syscall" and skipped. Any other failure is logged and returned. Returns 0
 * once all entries have been processed. */
static int nspawn_seccomp_deny_syscalls(scmp_filter_ctx ctx, const int *syscalls, unsigned n_syscalls) {
        unsigned k;

        for (k = 0; k < n_syscalls; k++) {
                int r;

                r = seccomp_rule_add(ctx, SCMP_ACT_ERRNO(EPERM), syscalls[k], 0);
                if (r >= 0 || r == -EFAULT) /* success, or unknown syscall */
                        continue;

                log_error_errno(r, "Failed to block syscall: %m");
                return r;
        }

        return 0;
}
#endif

/* Builds and loads the seccomp filter: default action is to allow, a fixed
 * set of syscalls fails with EPERM, the kernel module syscalls do too unless
 * CAP_SYS_MODULE is in arg_retain, and creating NETLINK_AUDIT sockets fails
 * with EAFNOSUPPORT.
 *
 * Returns 0 on success (and always when built without HAVE_SECCOMP), or a
 * negative error value on failure. */
static int setup_seccomp(void) {

#ifdef HAVE_SECCOMP
        static const int blacklist[] = {
                SCMP_SYS(kexec_load),
                SCMP_SYS(open_by_handle_at),
                SCMP_SYS(iopl),
                SCMP_SYS(ioperm),
                SCMP_SYS(swapon),
                SCMP_SYS(swapoff),
        };

        static const int kmod_blacklist[] = {
                SCMP_SYS(init_module),
                SCMP_SYS(finit_module),
                SCMP_SYS(delete_module),
        };

        scmp_filter_ctx ctx;
        int r;

        ctx = seccomp_init(SCMP_ACT_ALLOW);
        if (!ctx)
                return log_oom();

        r = seccomp_add_secondary_archs(ctx);
        if (r < 0) {
                log_error_errno(r, "Failed to add secondary archs to seccomp filter: %m");
                goto release;
        }

        r = nspawn_seccomp_deny_syscalls(ctx, blacklist, ELEMENTSOF(blacklist));
        if (r < 0)
                goto release;

        /* The kmod syscalls are only left alone if the CAP_SYS_MODULE
         * capability was requested to be retained */
        if ((arg_retain & (1ULL << CAP_SYS_MODULE)) == 0) {
                r = nspawn_seccomp_deny_syscalls(ctx, kmod_blacklist, ELEMENTSOF(kmod_blacklist));
                if (r < 0)
                        goto release;
        }

        /*
           Audit is broken in containers, much of the userspace audit
           hookup will fail if running inside a container. We don't
           care and just turn off creation of audit sockets.

           This will make socket(AF_NETLINK, *, NETLINK_AUDIT) fail
           with EAFNOSUPPORT which audit userspace uses as indication
           that audit is disabled in the kernel.
         */

        r = seccomp_rule_add(
                        ctx,
                        SCMP_ACT_ERRNO(EAFNOSUPPORT),
                        SCMP_SYS(socket),
                        2,
                        SCMP_A0(SCMP_CMP_EQ, AF_NETLINK),
                        SCMP_A2(SCMP_CMP_EQ, NETLINK_AUDIT));
        if (r < 0) {
                log_error_errno(r, "Failed to add audit seccomp rule: %m");
                goto release;
        }

        r = seccomp_attr_set(ctx, SCMP_FLTATR_CTL_NNP, 0);
        if (r < 0) {
                log_error_errno(r, "Failed to unset NO_NEW_PRIVS: %m");
                goto release;
        }

        r = seccomp_load(ctx);
        if (r < 0)
                log_error_errno(r, "Failed to install seccomp audit filter: %m");

release:
        /* The filter context is released on success and failure alike */
        seccomp_release(ctx);
        return r;
#else
        return 0;
#endif

}
2647
2648 static int setup_propagate(const char *root) {
2649         const char *p, *q;
2650
2651         (void) mkdir_p("/run/systemd/nspawn/", 0755);
2652         (void) mkdir_p("/run/systemd/nspawn/propagate", 0600);
2653         p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
2654         (void) mkdir_p(p, 0600);
2655
2656         q = strjoina(root, "/run/systemd/nspawn/incoming");
2657         mkdir_parents(q, 0755);
2658         mkdir_p(q, 0600);
2659
2660         if (mount(p, q, NULL, MS_BIND, NULL) < 0)
2661                 return log_error_errno(errno, "Failed to install propagation bind mount.");
2662
2663         if (mount(NULL, q, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY, NULL) < 0)
2664                 return log_error_errno(errno, "Failed to make propagation mount read-only");
2665
2666         return 0;
2667 }
2668
2669 static int setup_image(char **device_path, int *loop_nr) {
2670         struct loop_info64 info = {
2671                 .lo_flags = LO_FLAGS_AUTOCLEAR|LO_FLAGS_PARTSCAN
2672         };
2673         _cleanup_close_ int fd = -1, control = -1, loop = -1;
2674         _cleanup_free_ char* loopdev = NULL;
2675         struct stat st;
2676         int r, nr;
2677
2678         assert(device_path);
2679         assert(loop_nr);
2680         assert(arg_image);
2681
2682         fd = open(arg_image, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
2683         if (fd < 0)
2684                 return log_error_errno(errno, "Failed to open %s: %m", arg_image);
2685
2686         if (fstat(fd, &st) < 0)
2687                 return log_error_errno(errno, "Failed to stat %s: %m", arg_image);
2688
2689         if (S_ISBLK(st.st_mode)) {
2690                 char *p;
2691
2692                 p = strdup(arg_image);
2693                 if (!p)
2694                         return log_oom();
2695
2696                 *device_path = p;
2697
2698                 *loop_nr = -1;
2699
2700                 r = fd;
2701                 fd = -1;
2702
2703                 return r;
2704         }
2705
2706         if (!S_ISREG(st.st_mode)) {
2707                 log_error_errno(errno, "%s is not a regular file or block device: %m", arg_image);
2708                 return -EINVAL;
2709         }
2710
2711         control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
2712         if (control < 0)
2713                 return log_error_errno(errno, "Failed to open /dev/loop-control: %m");
2714
2715         nr = ioctl(control, LOOP_CTL_GET_FREE);
2716         if (nr < 0)
2717                 return log_error_errno(errno, "Failed to allocate loop device: %m");
2718
2719         if (asprintf(&loopdev, "/dev/loop%i", nr) < 0)
2720                 return log_oom();
2721
2722         loop = open(loopdev, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
2723         if (loop < 0)
2724                 return log_error_errno(errno, "Failed to open loop device %s: %m", loopdev);
2725
2726         if (ioctl(loop, LOOP_SET_FD, fd) < 0)
2727                 return log_error_errno(errno, "Failed to set loopback file descriptor on %s: %m", loopdev);
2728
2729         if (arg_read_only)
2730                 info.lo_flags |= LO_FLAGS_READ_ONLY;
2731
2732         if (ioctl(loop, LOOP_SET_STATUS64, &info) < 0)
2733                 return log_error_errno(errno, "Failed to set loopback settings on %s: %m", loopdev);
2734
2735         *device_path = loopdev;
2736         loopdev = NULL;
2737
2738         *loop_nr = nr;
2739
2740         r = loop;
2741         loop = -1;
2742
2743         return r;
2744 }
2745
/* Explanatory text appended to the error messages that dissect_image() logs
 * when it cannot make sense of the image's partition table. One string
 * literal per line of output. */
#define PARTITION_TABLE_BLURB                                                                                                              \
        "Note that the disk image needs to either contain only a single MBR partition of\n"                                                \
        "type 0x83 that is marked bootable, or a single GPT partition of type 0FC63DAF-8483-4772-8E79-3D69D8477DE4 or follow\n"            \
        "    http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/\n"                                                 \
        "to be bootable with systemd-nspawn."
2752
/* Inspects the partition table of the block device open as fd and picks the
 * partitions to use for the container.
 *
 * The table is probed with libblkid and must be GPT or MBR ("dos"). The
 * function then waits, retrying up to 10 times and asking the kernel to
 * re-read the partition table in between, until udev lists as many child
 * devices as blkid found partitions, and finally classifies each partition:
 *
 *   GPT: partitions flagged GPT_FLAG_NO_AUTO are ignored. Partitions of type
 *        GPT_HOME, GPT_SRV, GPT_ROOT_NATIVE and GPT_ROOT_SECONDARY (the
 *        latter two only if defined at build time) are recorded; if several
 *        share a type, the one with the lowest partition number wins.
 *        GPT_LINUX_GENERIC partitions are recorded as generic candidates.
 *   MBR: partitions whose flags equal 0x80 (bootable) and whose type is 0x83
 *        (Linux) are recorded as generic candidates.
 *
 * The root device is the native root if found, else the secondary root
 * (*secondary is then set to true), else the generic candidate, provided
 * there is exactly one.
 *
 * On success returns 0. *root_device and *root_device_rw are always set;
 * *home_device/*home_device_rw and *srv_device/*srv_device_rw are set only
 * if a matching partition was found. The device strings are newly allocated
 * and owned by the caller. On failure returns a negative error value;
 * -EINVAL if no usable partition table or root partition could be
 * identified, -ENOTSUP if built without blkid support. */
static int dissect_image(
                int fd,
                char **root_device, bool *root_device_rw,
                char **home_device, bool *home_device_rw,
                char **srv_device, bool *srv_device_rw,
                bool *secondary) {

#ifdef HAVE_BLKID
        int home_nr = -1, srv_nr = -1;
#ifdef GPT_ROOT_NATIVE
        int root_nr = -1;
#endif
#ifdef GPT_ROOT_SECONDARY
        int secondary_root_nr = -1;
#endif
        _cleanup_free_ char *home = NULL, *root = NULL, *secondary_root = NULL, *srv = NULL, *generic = NULL;
        _cleanup_udev_enumerate_unref_ struct udev_enumerate *e = NULL;
        _cleanup_udev_device_unref_ struct udev_device *d = NULL;
        _cleanup_blkid_free_probe_ blkid_probe b = NULL;
        _cleanup_udev_unref_ struct udev *udev = NULL;
        struct udev_list_entry *first, *item;
        bool home_rw = true, root_rw = true, secondary_root_rw = true, srv_rw = true, generic_rw = true;
        bool is_gpt, is_mbr, multiple_generic = false;
        const char *pttype = NULL;
        blkid_partlist pl;
        struct stat st;
        unsigned i;
        int r;

        assert(fd >= 0);
        assert(root_device);
        assert(home_device);
        assert(srv_device);
        assert(secondary);
        assert(arg_image);

        b = blkid_new_probe();
        if (!b)
                return log_oom();

        /* errno is cleared before the blkid calls below so that a failure
         * which leaves it unset can be told apart from one that sets it */
        errno = 0;
        r = blkid_probe_set_device(b, fd, 0, 0);
        if (r != 0) {
                if (errno == 0)
                        return log_oom();

                log_error_errno(errno, "Failed to set device on blkid probe: %m");
                return -errno;
        }

        blkid_probe_enable_partitions(b, 1);
        blkid_probe_set_partitions_flags(b, BLKID_PARTS_ENTRY_DETAILS);

        errno = 0;
        r = blkid_do_safeprobe(b);
        if (r == -2 || r == 1) {
                log_error("Failed to identify any partition table on\n"
                          "    %s\n"
                          PARTITION_TABLE_BLURB, arg_image);
                return -EINVAL;
        } else if (r != 0) {
                if (errno == 0)
                        errno = EIO;
                log_error_errno(errno, "Failed to probe: %m");
                return -errno;
        }

        blkid_probe_lookup_value(b, "PTTYPE", &pttype, NULL);

        is_gpt = streq_ptr(pttype, "gpt");
        is_mbr = streq_ptr(pttype, "dos");

        if (!is_gpt && !is_mbr) {
                log_error("No GPT or MBR partition table discovered on\n"
                          "    %s\n"
                          PARTITION_TABLE_BLURB, arg_image);
                return -EINVAL;
        }

        errno = 0;
        pl = blkid_probe_get_partitions(b);
        if (!pl) {
                if (errno == 0)
                        return log_oom();

                /* Report the actual error, and take the return value from
                 * the logging call rather than re-reading errno afterwards */
                return log_error_errno(errno, "Failed to list partitions of %s: %m", arg_image);
        }

        udev = udev_new();
        if (!udev)
                return log_oom();

        if (fstat(fd, &st) < 0)
                return log_error_errno(errno, "Failed to stat block device: %m");

        d = udev_device_new_from_devnum(udev, 'b', st.st_rdev);
        if (!d)
                return log_oom();

        /* Wait until the kernel has caught up with the partition table: the
         * udev enumeration of d's children must contain one entry more than
         * blkid's partition count (the enumeration apparently includes the
         * whole-disk device itself, which is also skipped by devnum further
         * down). */
        for (i = 0;; i++) {
                int n, m;

                if (i >= 10) {
                        log_error("Kernel partitions never appeared.");
                        return -ENXIO;
                }

                e = udev_enumerate_new(udev);
                if (!e)
                        return log_oom();

                r = udev_enumerate_add_match_parent(e, d);
                if (r < 0)
                        return log_oom();

                r = udev_enumerate_scan_devices(e);
                if (r < 0)
                        return log_error_errno(r, "Failed to scan for partition devices of %s: %m", arg_image);

                /* Count the partitions enumerated by the kernel */
                n = 0;
                first = udev_enumerate_get_list_entry(e);
                udev_list_entry_foreach(item, first)
                        n++;

                /* Count the partitions enumerated by blkid */
                m = blkid_partlist_numof_partitions(pl);
                if (n == m + 1)
                        break;
                if (n > m + 1) {
                        log_error("blkid and kernel partition list do not match.");
                        return -EIO;
                }
                if (n < m + 1) {
                        unsigned j;

                        /* The kernel has probed fewer partitions than
                         * blkid? Maybe the kernel prober is still
                         * running or it got EBUSY because udev
                         * already opened the device. Let's reprobe
                         * the device, which is a synchronous call
                         * that waits until probing is complete. */

                        for (j = 0; j < 20; j++) {

                                r = ioctl(fd, BLKRRPART, 0);
                                if (r < 0)
                                        r = -errno;
                                if (r >= 0 || r != -EBUSY)
                                        break;

                                /* If something else has the device
                                 * open, such as an udev rule, the
                                 * ioctl will return EBUSY. Since
                                 * there's no way to wait until it
                                 * isn't busy anymore, let's just wait
                                 * a bit, and try again.
                                 *
                                 * This is really something they
                                 * should fix in the kernel! */

                                usleep(50 * USEC_PER_MSEC);
                        }

                        if (r < 0)
                                return log_error_errno(r, "Failed to reread partition table: %m");
                }

                /* Drop this enumeration; the next iteration starts afresh */
                e = udev_enumerate_unref(e);
        }

        first = udev_enumerate_get_list_entry(e);
        udev_list_entry_foreach(item, first) {
                /* Initialized explicitly, since the cleanup handler runs on
                 * every exit from this scope */
                _cleanup_udev_device_unref_ struct udev_device *q = NULL;
                const char *node;
                unsigned long long flags;
                blkid_partition pp;
                dev_t qn;
                int nr;

                errno = 0;
                q = udev_device_new_from_syspath(udev, udev_list_entry_get_name(item));
                if (!q) {
                        if (!errno)
                                errno = ENOMEM;

                        log_error_errno(errno, "Failed to get partition device of %s: %m", arg_image);
                        return -errno;
                }

                qn = udev_device_get_devnum(q);
                if (major(qn) == 0)
                        continue;

                /* Skip the whole-disk device itself */
                if (st.st_rdev == qn)
                        continue;

                node = udev_device_get_devnode(q);
                if (!node)
                        continue;

                pp = blkid_partlist_devno_to_partition(pl, qn);
                if (!pp)
                        continue;

                flags = blkid_partition_get_flags(pp);

                nr = blkid_partition_get_partno(pp);
                if (nr < 0)
                        continue;

                if (is_gpt) {
                        sd_id128_t type_id;
                        const char *stype;

                        if (flags & GPT_FLAG_NO_AUTO)
                                continue;

                        stype = blkid_partition_get_type_string(pp);
                        if (!stype)
                                continue;

                        if (sd_id128_from_string(stype, &type_id) < 0)
                                continue;

                        /* For each well-known type the partition with the
                         * lowest partition number wins */
                        if (sd_id128_equal(type_id, GPT_HOME)) {

                                if (home && nr >= home_nr)
                                        continue;

                                home_nr = nr;
                                home_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&home, node);
                                if (r < 0)
                                        return log_oom();

                        } else if (sd_id128_equal(type_id, GPT_SRV)) {

                                if (srv && nr >= srv_nr)
                                        continue;

                                srv_nr = nr;
                                srv_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&srv, node);
                                if (r < 0)
                                        return log_oom();
                        }
#ifdef GPT_ROOT_NATIVE
                        else if (sd_id128_equal(type_id, GPT_ROOT_NATIVE)) {

                                if (root && nr >= root_nr)
                                        continue;

                                root_nr = nr;
                                root_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&root, node);
                                if (r < 0)
                                        return log_oom();
                        }
#endif
#ifdef GPT_ROOT_SECONDARY
                        else if (sd_id128_equal(type_id, GPT_ROOT_SECONDARY)) {

                                if (secondary_root && nr >= secondary_root_nr)
                                        continue;

                                secondary_root_nr = nr;
                                secondary_root_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&secondary_root, node);
                                if (r < 0)
                                        return log_oom();
                        }
#endif
                        else if (sd_id128_equal(type_id, GPT_LINUX_GENERIC)) {

                                if (generic)
                                        multiple_generic = true;
                                else {
                                        generic_rw = !(flags & GPT_FLAG_READ_ONLY);

                                        r = free_and_strdup(&generic, node);
                                        if (r < 0)
                                                return log_oom();
                                }
                        }

                } else if (is_mbr) {
                        int type;

                        if (flags != 0x80) /* Bootable flag */
                                continue;

                        type = blkid_partition_get_type(pp);
                        if (type != 0x83) /* Linux partition */
                                continue;

                        if (generic)
                                multiple_generic = true;
                        else {
                                generic_rw = true;

                                /* Record the candidate in "generic", the
                                 * variable tested above, so that a second
                                 * bootable Linux partition is detected and
                                 * rejected below instead of silently
                                 * replacing the first one. */
                                r = free_and_strdup(&generic, node);
                                if (r < 0)
                                        return log_oom();
                        }
                }
        }

        /* Choose the root device; ownership of the chosen string passes to
         * the caller, so the local is reset to keep it from being freed */
        if (root) {
                *root_device = root;
                root = NULL;

                *root_device_rw = root_rw;
                *secondary = false;
        } else if (secondary_root) {
                *root_device = secondary_root;
                secondary_root = NULL;

                *root_device_rw = secondary_root_rw;
                *secondary = true;
        } else if (generic) {

                /* There were no partitions with precise meanings
                 * around, but we found generic partitions. In this
                 * case, if there's only one, we can go ahead and boot
                 * it, otherwise we bail out, because we really cannot
                 * make any sense of it. */

                if (multiple_generic) {
                        log_error("Identified multiple bootable Linux partitions on\n"
                                  "    %s\n"
                                  PARTITION_TABLE_BLURB, arg_image);
                        return -EINVAL;
                }

                *root_device = generic;
                generic = NULL;

                *root_device_rw = generic_rw;
                *secondary = false;
        } else {
                log_error("Failed to identify root partition in disk image\n"
                          "    %s\n"
                          PARTITION_TABLE_BLURB, arg_image);
                return -EINVAL;
        }

        if (home) {
                *home_device = home;
                home = NULL;

                *home_device_rw = home_rw;
        }

        if (srv) {
                *srv_device = srv;
                srv = NULL;

                *srv_device_rw = srv_rw;
        }

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -ENOTSUP;
#endif
}
3125
/*
 * Probe the file system type of the block device 'what' with libblkid and
 * mount it.
 *
 * what:      device node to mount (must not be NULL)
 * where:     base mount point (must not be NULL)
 * directory: optional path that is appended to 'where' to form the final
 *            mount point; NULL means mount directly on 'where'
 * rw:        mount read-write if true; forced to false when arg_read_only
 *            is set
 *
 * Returns 0 on success and a negative errno-style value on failure
 * (-EINVAL if no file system type could be determined, -ENOTSUP for LUKS
 * containers or when built without blkid support).
 */
static int mount_device(const char *what, const char *where, const char *directory, bool rw) {
#ifdef HAVE_BLKID
        _cleanup_blkid_free_probe_ blkid_probe b = NULL;
        const char *fstype, *p;
        int r;

        assert(what);
        assert(where);

        if (arg_read_only)
                rw = false;

        if (directory)
                p = strjoina(where, directory);
        else
                p = where;

        /* libblkid does not reliably set errno, so clear it first and
         * treat "still zero" as an allocation failure. */
        errno = 0;
        b = blkid_new_probe_from_filename(what);
        if (!b) {
                if (errno == 0)
                        return log_oom();
                return log_error_errno(errno, "Failed to allocate prober for %s: %m", what);
        }

        blkid_probe_enable_superblocks(b, 1);
        blkid_probe_set_superblocks_flags(b, BLKID_SUBLKS_TYPE);

        /* blkid_do_safeprobe() returns 0 on success, 1 if nothing was
         * detected, -2 if the result is ambiguous and -1 on error. Only
         * the first two non-zero cases mean "cannot tell"; -1 is a real
         * probing failure and is reported with errno. */
        errno = 0;
        r = blkid_do_safeprobe(b);
        if (r == -2 || r == 1) {
                log_error("Cannot determine file system type of %s", what);
                return -EINVAL;
        } else if (r != 0) {
                if (errno == 0)
                        errno = EIO;
                return log_error_errno(errno, "Failed to probe %s: %m", what);
        }

        errno = 0;
        if (blkid_probe_lookup_value(b, "TYPE", &fstype, NULL) < 0) {
                if (errno == 0)
                        errno = EINVAL;
                log_error("Failed to determine file system type of %s", what);
                return -errno;
        }

        if (streq(fstype, "crypto_LUKS")) {
                log_error("nspawn currently does not support LUKS disk images.");
                return -ENOTSUP;
        }

        if (mount(what, p, fstype, MS_NODEV|(rw ? 0 : MS_RDONLY), NULL) < 0)
                return log_error_errno(errno, "Failed to mount %s: %m", what);

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -ENOTSUP;
#endif
}
3189
3190 static int mount_devices(
3191                 const char *where,
3192                 const char *root_device, bool root_device_rw,
3193                 const char *home_device, bool home_device_rw,
3194                 const char *srv_device, bool srv_device_rw) {
3195         int r;
3196
3197         assert(where);
3198
3199         if (root_device) {
3200                 r = mount_device(root_device, arg_directory, NULL, root_device_rw);
3201                 if (r < 0)
3202                         return log_error_errno(r, "Failed to mount root directory: %m");
3203         }
3204
3205         if (home_device) {
3206                 r = mount_device(home_device, arg_directory, "/home", home_device_rw);
3207                 if (r < 0)
3208                         return log_error_errno(r, "Failed to mount home directory: %m");
3209         }
3210
3211         if (srv_device) {
3212                 r = mount_device(srv_device, arg_directory, "/srv", srv_device_rw);
3213                 if (r < 0)
3214                         return log_error_errno(r, "Failed to mount server data directory: %m");
3215         }
3216
3217         return 0;
3218 }
3219
3220 static void loop_remove(int nr, int *image_fd) {
3221         _cleanup_close_ int control = -1;
3222         int r;
3223
3224         if (nr < 0)
3225                 return;
3226
3227         if (image_fd && *image_fd >= 0) {
3228                 r = ioctl(*image_fd, LOOP_CLR_FD);
3229                 if (r < 0)
3230                         log_debug_errno(errno, "Failed to close loop image: %m");
3231                 *image_fd = safe_close(*image_fd);
3232         }
3233
3234         control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
3235         if (control < 0) {
3236                 log_warning_errno(errno, "Failed to open /dev/loop-control: %m");
3237                 return;
3238         }
3239
3240         r = ioctl(control, LOOP_CTL_REMOVE, nr);
3241         if (r < 0)
3242                 log_debug_errno(errno, "Failed to remove loop %d: %m", nr);
3243 }
3244
3245 static int spawn_getent(const char *database, const char *key, pid_t *rpid) {
3246         int pipe_fds[2];
3247         pid_t pid;
3248
3249         assert(database);
3250         assert(key);
3251         assert(rpid);
3252
3253         if (pipe2(pipe_fds, O_CLOEXEC) < 0)
3254                 return log_error_errno(errno, "Failed to allocate pipe: %m");
3255
3256         pid = fork();
3257         if (pid < 0)
3258                 return log_error_errno(errno, "Failed to fork getent child: %m");
3259         else if (pid == 0) {
3260                 int nullfd;
3261                 char *empty_env = NULL;
3262
3263                 if (dup3(pipe_fds[1], STDOUT_FILENO, 0) < 0)
3264                         _exit(EXIT_FAILURE);
3265
3266                 if (pipe_fds[0] > 2)
3267                         safe_close(pipe_fds[0]);
3268                 if (pipe_fds[1] > 2)
3269                         safe_close(pipe_fds[1]);
3270
3271                 nullfd = open("/dev/null", O_RDWR);
3272                 if (nullfd < 0)
3273                         _exit(EXIT_FAILURE);
3274
3275                 if (dup3(nullfd, STDIN_FILENO, 0) < 0)
3276                         _exit(EXIT_FAILURE);
3277
3278                 if (dup3(nullfd, STDERR_FILENO, 0) < 0)
3279                         _exit(EXIT_FAILURE);
3280
3281                 if (nullfd > 2)
3282                         safe_close(nullfd);
3283
3284                 reset_all_signal_handlers();
3285                 close_all_fds(NULL, 0);
3286
3287                 execle("/usr/bin/getent", "getent", database, key, NULL, &empty_env);
3288                 execle("/bin/getent", "getent", database, key, NULL, &empty_env);
3289                 _exit(EXIT_FAILURE);
3290         }
3291
3292         pipe_fds[1] = safe_close(pipe_fds[1]);
3293
3294         *rpid = pid;
3295
3296         return pipe_fds[0];
3297 }
3298
3299 static int change_uid_gid(char **_home) {
3300         char line[LINE_MAX], *x, *u, *g, *h;
3301         const char *word, *state;
3302         _cleanup_free_ uid_t *uids = NULL;
3303         _cleanup_free_ char *home = NULL;
3304         _cleanup_fclose_ FILE *f = NULL;
3305         _cleanup_close_ int fd = -1;
3306         unsigned n_uids = 0;
3307         size_t sz = 0, l;
3308         uid_t uid;
3309         gid_t gid;
3310         pid_t pid;
3311         int r;
3312
3313         assert(_home);
3314
3315         if (!arg_user || streq(arg_user, "root") || streq(arg_user, "0")) {
3316                 /* Reset everything fully to 0, just in case */
3317
3318                 if (setgroups(0, NULL) < 0)
3319                         return log_error_errno(errno, "setgroups() failed: %m");
3320
3321                 if (setresgid(0, 0, 0) < 0)
3322                         return log_error_errno(errno, "setregid() failed: %m");
3323
3324                 if (setresuid(0, 0, 0) < 0)
3325                         return log_error_errno(errno, "setreuid() failed: %m");
3326
3327                 *_home = NULL;
3328                 return 0;
3329         }
3330
3331         /* First, get user credentials */
3332         fd = spawn_getent("passwd", arg_user, &pid);
3333         if (fd < 0)
3334                 return fd;
3335
3336         f = fdopen(fd, "r");
3337         if (!f)
3338                 return log_oom();
3339         fd = -1;
3340
3341         if (!fgets(line, sizeof(line), f)) {
3342
3343                 if (!ferror(f)) {
3344                         log_error("Failed to resolve user %s.", arg_user);
3345                         return -ESRCH;
3346                 }
3347
3348                 log_error_errno(errno, "Failed to read from getent: %m");
3349                 return -errno;
3350         }
3351
3352         truncate_nl(line);
3353
3354         wait_for_terminate_and_warn("getent passwd", pid, true);
3355
3356         x = strchr(line, ':');
3357         if (!x) {
3358                 log_error("/etc/passwd entry has invalid user field.");
3359                 return -EIO;
3360         }
3361
3362         u = strchr(x+1, ':');
3363         if (!u) {
3364                 log_error("/etc/passwd entry has invalid password field.");
3365                 return -EIO;
3366         }
3367
3368         u++;
3369         g = strchr(u, ':');
3370         if (!g) {
3371                 log_error("/etc/passwd entry has invalid UID field.");
3372                 return -EIO;
3373         }
3374
3375         *g = 0;
3376         g++;
3377         x = strchr(g, ':');
3378         if (!x) {
3379                 log_error("/etc/passwd entry has invalid GID field.");
3380                 return -EIO;
3381         }
3382
3383         *x = 0;
3384         h = strchr(x+1, ':');
3385         if (!h) {
3386                 log_error("/etc/passwd entry has invalid GECOS field.");
3387                 return -EIO;
3388         }
3389
3390         h++;
3391         x = strchr(h, ':');
3392         if (!x) {
3393                 log_error("/etc/passwd entry has invalid home directory field.");
3394                 return -EIO;
3395         }
3396
3397         *x = 0;
3398
3399         r = parse_uid(u, &uid);
3400         if (r < 0) {
3401                 log_error("Failed to parse UID of user.");
3402                 return -EIO;
3403         }
3404
3405         r = parse_gid(g, &gid);
3406         if (r < 0) {
3407                 log_error("Failed to parse GID of user.");
3408                 return -EIO;
3409         }
3410
3411         home = strdup(h);
3412         if (!home)
3413                 return log_oom();
3414
3415         /* Second, get group memberships */
3416         fd = spawn_getent("initgroups", arg_user, &pid);
3417         if (fd < 0)
3418                 return fd;
3419
3420         fclose(f);
3421         f = fdopen(fd, "r");
3422         if (!f)
3423                 return log_oom();
3424         fd = -1;
3425
3426         if (!fgets(line, sizeof(line), f)) {
3427                 if (!ferror(f)) {
3428                         log_error("Failed to resolve user %s.", arg_user);
3429                         return -ESRCH;
3430                 }
3431
3432                 log_error_errno(errno, "Failed to read from getent: %m");
3433                 return -errno;
3434         }
3435
3436         truncate_nl(line);
3437
3438         wait_for_terminate_and_warn("getent initgroups", pid, true);
3439
3440         /* Skip over the username and subsequent separator whitespace */
3441         x = line;
3442         x += strcspn(x, WHITESPACE);
3443         x += strspn(x, WHITESPACE);
3444
3445         FOREACH_WORD(word, l, x, state) {
3446                 char c[l+1];
3447
3448                 memcpy(c, word, l);
3449                 c[l] = 0;
3450
3451                 if (!GREEDY_REALLOC(uids, sz, n_uids+1))
3452                         return log_oom();
3453
3454                 r = parse_uid(c, &uids[n_uids++]);
3455                 if (r < 0) {
3456                         log_error("Failed to parse group data from getent.");
3457                         return -EIO;
3458                 }
3459         }
3460
3461         r = mkdir_parents(home, 0775);
3462         if (r < 0)
3463                 return log_error_errno(r, "Failed to make home root directory: %m");
3464
3465         r = mkdir_safe(home, 0755, uid, gid);
3466         if (r < 0 && r != -EEXIST)
3467                 return log_error_errno(r, "Failed to make home directory: %m");
3468
3469         fchown(STDIN_FILENO, uid, gid);
3470         fchown(STDOUT_FILENO, uid, gid);
3471         fchown(STDERR_FILENO, uid, gid);
3472
3473         if (setgroups(n_uids, uids) < 0)
3474                 return log_error_errno(errno, "Failed to set auxiliary groups: %m");
3475
3476         if (setresgid(gid, gid, gid) < 0)
3477                 return log_error_errno(errno, "setregid() failed: %m");
3478
3479         if (setresuid(uid, uid, uid) < 0)
3480                 return log_error_errno(errno, "setreuid() failed: %m");
3481
3482         if (_home) {
3483                 *_home = home;
3484                 home = NULL;
3485         }
3486
3487         return 0;
3488 }
3489
3490 /*
3491  * Return values:
3492  * < 0 : wait_for_terminate() failed to get the state of the
3493  *       container, the container was terminated by a signal, or
3494  *       failed for an unknown reason.  No change is made to the
3495  *       container argument.
3496  * > 0 : The program executed in the container terminated with an
3497  *       error.  The exit code of the program executed in the
3498  *       container is returned.  The container argument has been set
3499  *       to CONTAINER_TERMINATED.
3500  *   0 : The container is being rebooted, has been shut down or exited
3501  *       successfully.  The container argument has been set to either
3502  *       CONTAINER_TERMINATED or CONTAINER_REBOOTED.
3503  *
3504  * That is, success is indicated by a return value of zero, and an
3505  * error is indicated by a non-zero value.
3506  */
3507 static int wait_for_container(pid_t pid, ContainerStatus *container) {
3508         siginfo_t status;
3509         int r;
3510
3511         r = wait_for_terminate(pid, &status);
3512         if (r < 0)
3513                 return log_warning_errno(r, "Failed to wait for container: %m");
3514
3515         switch (status.si_code) {
3516
3517         case CLD_EXITED:
3518                 if (status.si_status == 0) {
3519                         log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s exited successfully.", arg_machine);
3520
3521                 } else
3522                         log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s failed with error code %i.", arg_machine, status.si_status);
3523
3524                 *container = CONTAINER_TERMINATED;
3525                 return status.si_status;
3526
3527         case CLD_KILLED:
3528                 if (status.si_status == SIGINT) {
3529
3530                         log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s has been shut down.", arg_machine);
3531                         *container = CONTAINER_TERMINATED;
3532                         return 0;
3533
3534                 } else if (status.si_status == SIGHUP) {
3535
3536                         log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s is being rebooted.", arg_machine);
3537                         *container = CONTAINER_REBOOTED;
3538                         return 0;
3539                 }
3540
3541                 /* CLD_KILLED fallthrough */
3542
3543         case CLD_DUMPED:
3544                 log_error("Container %s terminated by signal %s.", arg_machine, signal_to_string(status.si_status));
3545                 return -EIO;
3546
3547         default:
3548                 log_error("Container %s failed due to unknown reason.", arg_machine);
3549                 return -EIO;
3550         }
3551
3552         return r;
3553 }
3554
/* Signal handler that intentionally does nothing with the signal. */
static void nop_handler(int sig) {
        (void) sig;
}
3556
3557 static int on_orderly_shutdown(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) {
3558         pid_t pid;
3559
3560         pid = PTR_TO_UINT32(userdata);
3561         if (pid > 0) {
3562                 if (kill(pid, SIGRTMIN+3) >= 0) {
3563                         log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
3564                         sd_event_source_set_userdata(s, NULL);
3565                         return 0;
3566                 }
3567         }
3568
3569         sd_event_exit(sd_event_source_get_event(s), 0);
3570         return 0;
3571 }
3572
3573 static int determine_names(void) {
3574         int r;
3575
3576         if (!arg_image && !arg_directory) {
3577                 if (arg_machine) {
3578                         _cleanup_(image_unrefp) Image *i = NULL;
3579
3580                         r = image_find(arg_machine, &i);
3581                         if (r < 0)
3582                                 return log_error_errno(r, "Failed to find image for machine '%s': %m", arg_machine);
3583                         else if (r == 0) {
3584                                 log_error("No image for machine '%s': %m", arg_machine);
3585                                 return -ENOENT;
3586                         }
3587
3588                         if (i->type == IMAGE_RAW)
3589                                 r = set_sanitized_path(&arg_image, i->path);
3590                         else
3591                                 r = set_sanitized_path(&arg_directory, i->path);
3592                         if (r < 0)
3593                                 return log_error_errno(r, "Invalid image directory: %m");
3594
3595                         arg_read_only = arg_read_only || i->read_only;
3596                 } else
3597                         arg_directory = get_current_dir_name();
3598
3599                 if (!arg_directory && !arg_machine) {
3600                         log_error("Failed to determine path, please use -D or -i.");
3601                         return -EINVAL;
3602                 }
3603         }
3604
3605         if (!arg_machine) {
3606                 if (arg_directory && path_equal(arg_directory, "/"))
3607                         arg_machine = gethostname_malloc();
3608                 else
3609                         arg_machine = strdup(basename(arg_image ?: arg_directory));
3610
3611                 if (!arg_machine)
3612                         return log_oom();
3613
3614                 hostname_cleanup(arg_machine, false);
3615                 if (!machine_name_is_valid(arg_machine)) {
3616                         log_error("Failed to determine machine name automatically, please use -M.");
3617                         return -EINVAL;
3618                 }
3619
3620                 if (arg_ephemeral) {
3621                         char *b;
3622
3623                         /* Add a random suffix when this is an
3624                          * ephemeral machine, so that we can run many
3625                          * instances at once without manually having
3626                          * to specify -M each time. */
3627
3628                         if (asprintf(&b, "%s-%016" PRIx64, arg_machine, random_u64()) < 0)
3629                                 return log_oom();
3630
3631                         free(arg_machine);
3632                         arg_machine = b;
3633                 }
3634         }
3635
3636         return 0;
3637 }
3638
3639 static int determine_uid_shift(void) {
3640         int r;
3641
3642         if (!arg_userns)
3643                 return 0;
3644
3645         if (arg_uid_shift == UID_INVALID) {
3646                 struct stat st;
3647
3648                 r = stat(arg_directory, &st);
3649                 if (r < 0)
3650                         return log_error_errno(errno, "Failed to determine UID base of %s: %m", arg_directory);
3651
3652                 arg_uid_shift = st.st_uid & UINT32_C(0xffff0000);
3653
3654                 if (arg_uid_shift != (st.st_gid & UINT32_C(0xffff0000))) {
3655                         log_error("UID and GID base of %s don't match.", arg_directory);
3656                         return -EINVAL;
3657                 }
3658
3659                 arg_uid_range = UINT32_C(0x10000);
3660         }
3661
3662         if (arg_uid_shift > (uid_t) -1 - arg_uid_range) {
3663                 log_error("UID base too high for UID range.");
3664                 return -EINVAL;
3665         }
3666
3667         log_info("Using user namespaces with base " UID_FMT " and range " UID_FMT ".", arg_uid_shift, arg_uid_range);
3668         return 0;
3669 }
3670
3671 int main(int argc, char *argv[]) {
3672
3673         _cleanup_free_ char *device_path = NULL, *root_device = NULL, *home_device = NULL, *srv_device = NULL, *console = NULL;
3674         bool root_device_rw = true, home_device_rw = true, srv_device_rw = true;
3675         _cleanup_close_ int master = -1, image_fd = -1;
3676         _cleanup_fdset_free_ FDSet *fds = NULL;
3677         int r, n_fd_passed, loop_nr = -1;
3678         char veth_name[IFNAMSIZ];
3679         bool secondary = false, remove_subvol = false;
3680         sigset_t mask, mask_chld;
3681         pid_t pid = 0;
3682         int ret = EXIT_SUCCESS;
3683         union in_addr_union exposed = {};
3684         _cleanup_release_lock_file_ LockFile tree_global_lock = LOCK_FILE_INIT, tree_local_lock = LOCK_FILE_INIT;
3685         bool interactive;
3686
3687         log_parse_environment();
3688         log_open();
3689
3690         r = parse_argv(argc, argv);
3691         if (r <= 0)
3692                 goto finish;
3693
3694         r = determine_names();
3695         if (r < 0)
3696                 goto finish;
3697
3698         if (geteuid() != 0) {
3699                 log_error("Need to be root.");
3700                 r = -EPERM;
3701                 goto finish;
3702         }
3703
3704         if (sd_booted() <= 0) {
3705                 log_error("Not running on a systemd system.");
3706                 r = -EINVAL;
3707                 goto finish;
3708         }
3709
3710         log_close();
3711         n_fd_passed = sd_listen_fds(false);
3712         if (n_fd_passed > 0) {
3713                 r = fdset_new_listen_fds(&fds, false);
3714                 if (r < 0) {
3715                         log_error_errno(r, "Failed to collect file descriptors: %m");
3716                         goto finish;
3717                 }
3718         }
3719         fdset_close_others(fds);
3720         log_open();
3721
3722         if (arg_directory) {
3723                 assert(!arg_image);
3724
3725                 if (path_equal(arg_directory, "/") && !arg_ephemeral) {
3726                         log_error("Spawning container on root directory is not supported. Consider using --ephemeral.");
3727                         r = -EINVAL;
3728                         goto finish;
3729                 }
3730
3731                 if (arg_ephemeral) {
3732                         char *np;
3733
3734                         /* If the specified path is a mount point we
3735                          * generate the new snapshot immediately
3736                          * inside it under a random name. However if
3737                          * the specified is not a mount point we
3738                          * create the new snapshot in the parent
3739                          * directory, just next to it. */
3740                         r = path_is_mount_point(arg_directory, false);
3741                         if (r < 0) {
3742                                 log_error_errno(r, "Failed to determine whether directory %s is mount point: %m", arg_directory);
3743                                 goto finish;
3744                         }
3745                         if (r > 0)
3746                                 r = tempfn_random_child(arg_directory, &np);
3747                         else
3748                                 r = tempfn_random(arg_directory, &np);
3749                         if (r < 0) {
3750                                 log_error_errno(r, "Failed to generate name for snapshot: %m");
3751                                 goto finish;
3752                         }
3753
3754                         r = image_path_lock(np, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3755                         if (r < 0) {
3756                                 log_error_errno(r, "Failed to lock %s: %m", np);
3757                                 goto finish;
3758                         }
3759
3760                         r = btrfs_subvol_snapshot(arg_directory, np, arg_read_only, true);
3761                         if (r < 0) {
3762                                 free(np);
3763                                 log_error_errno(r, "Failed to create snapshot %s from %s: %m", np, arg_directory);
3764                                 goto finish;
3765                         }
3766
3767                         free(arg_directory);
3768                         arg_directory = np;
3769
3770                         remove_subvol = true;
3771
3772                 } else {
3773                         r = image_path_lock(arg_directory, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3774                         if (r == -EBUSY) {
3775                                 log_error_errno(r, "Directory tree %s is currently busy.", arg_directory);
3776                                 goto finish;
3777                         }
3778                         if (r < 0) {
3779                                 log_error_errno(r, "Failed to lock %s: %m", arg_directory);
3780                                 return r;
3781                         }
3782
3783                         if (arg_template) {
3784                                 r = btrfs_subvol_snapshot(arg_template, arg_directory, arg_read_only, true);
3785                                 if (r == -EEXIST) {
3786                                         if (!arg_quiet)
3787                                                 log_info("Directory %s already exists, not populating from template %s.", arg_directory, arg_template);
3788                                 } else if (r < 0) {
3789                                         log_error_errno(r, "Couldn't create snapshot %s from %s: %m", arg_directory, arg_template);
3790                                         goto finish;
3791                                 } else {
3792                                         if (!arg_quiet)
3793                                                 log_info("Populated %s from template %s.", arg_directory, arg_template);
3794                                 }
3795                         }
3796                 }
3797
3798                 if (arg_boot) {
3799                         if (path_is_os_tree(arg_directory) <= 0) {
3800                                 log_error("Directory %s doesn't look like an OS root directory (os-release file is missing). Refusing.", arg_directory);
3801                                 r = -EINVAL;
3802                                 goto finish;
3803                         }
3804                 } else {
3805                         const char *p;
3806
3807                         p = strjoina(arg_directory,
3808                                        argc > optind && path_is_absolute(argv[optind]) ? argv[optind] : "/usr/bin/");
3809                         if (access(p, F_OK) < 0) {
3810                                 log_error("Directory %s lacks the binary to execute or doesn't look like a binary tree. Refusing.", arg_directory);
3811                                 r = -EINVAL;
3812                                 goto finish;
3813                         }
3814                 }
3815
3816         } else {
3817                 char template[] = "/tmp/nspawn-root-XXXXXX";
3818
3819                 assert(arg_image);
3820                 assert(!arg_template);
3821
3822                 r = image_path_lock(arg_image, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3823                 if (r == -EBUSY) {
3824                         r = log_error_errno(r, "Disk image %s is currently busy.", arg_image);
3825                         goto finish;
3826                 }
3827                 if (r < 0) {
3828                         r = log_error_errno(r, "Failed to create image lock: %m");
3829                         goto finish;
3830                 }
3831
3832                 if (!mkdtemp(template)) {
3833                         log_error_errno(errno, "Failed to create temporary directory: %m");
3834                         r = -errno;
3835                         goto finish;
3836                 }
3837
3838                 arg_directory = strdup(template);
3839                 if (!arg_directory) {
3840                         r = log_oom();
3841                         goto finish;
3842                 }
3843
3844                 image_fd = setup_image(&device_path, &loop_nr);
3845                 if (image_fd < 0) {
3846                         r = image_fd;
3847                         goto finish;
3848                 }
3849
3850                 r = dissect_image(image_fd,
3851                                   &root_device, &root_device_rw,
3852                                   &home_device, &home_device_rw,
3853                                   &srv_device, &srv_device_rw,
3854                                   &secondary);
3855                 if (r < 0)
3856                         goto finish;
3857         }
3858
3859         r = determine_uid_shift();
3860         if (r < 0)
3861                 goto finish;
3862
3863         interactive = isatty(STDIN_FILENO) > 0 && isatty(STDOUT_FILENO) > 0;
3864
3865         master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
3866         if (master < 0) {
3867                 r = log_error_errno(errno, "Failed to acquire pseudo tty: %m");
3868                 goto finish;
3869         }
3870
3871         r = ptsname_malloc(master, &console);
3872         if (r < 0) {
3873                 r = log_error_errno(r, "Failed to determine tty name: %m");
3874                 goto finish;
3875         }
3876
3877         if (unlockpt(master) < 0) {
3878                 r = log_error_errno(errno, "Failed to unlock tty: %m");
3879                 goto finish;
3880         }
3881
3882         if (!arg_quiet)
3883                 log_info("Spawning container %s on %s.\nPress ^] three times within 1s to kill container.",
3884                          arg_machine, arg_image ?: arg_directory);
3885
3886         assert_se(sigemptyset(&mask) == 0);
3887         sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
3888         assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
3889
3890         assert_se(sigemptyset(&mask_chld) == 0);
3891         assert_se(sigaddset(&mask_chld, SIGCHLD) == 0);
3892
3893         for (;;) {
3894                 _cleanup_close_pair_ int kmsg_socket_pair[2] = { -1, -1 }, rtnl_socket_pair[2] = { -1, -1 };
3895                 ContainerStatus container_status;
3896                 _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
3897                 struct sigaction sa = {
3898                         .sa_handler = nop_handler,
3899                         .sa_flags = SA_NOCLDSTOP,
3900                 };
3901
3902                 r = barrier_create(&barrier);
3903                 if (r < 0) {
3904                         log_error_errno(r, "Cannot initialize IPC barrier: %m");
3905                         goto finish;
3906                 }
3907
3908                 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
3909                         r = log_error_errno(errno, "Failed to create kmsg socket pair: %m");
3910                         goto finish;
3911                 }
3912
3913                 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, rtnl_socket_pair) < 0) {
3914                         r = log_error_errno(errno, "Failed to create rtnl socket pair: %m");
3915                         goto finish;
3916                 }
3917
3918                 /* Child can be killed before execv(), so handle SIGCHLD
3919                  * in order to interrupt parent's blocking calls and
3920                  * give it a chance to call wait() and terminate. */
3921                 r = sigprocmask(SIG_UNBLOCK, &mask_chld, NULL);
3922                 if (r < 0) {
3923                         r = log_error_errno(errno, "Failed to change the signal mask: %m");
3924                         goto finish;
3925                 }
3926
3927                 r = sigaction(SIGCHLD, &sa, NULL);
3928                 if (r < 0) {
3929                         r = log_error_errno(errno, "Failed to install SIGCHLD handler: %m");
3930                         goto finish;
3931                 }
3932
3933                 pid = raw_clone(SIGCHLD|CLONE_NEWNS|
3934                                 (arg_share_system ? 0 : CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS)|
3935                                 (arg_private_network ? CLONE_NEWNET : 0), NULL);
3936                 if (pid < 0) {
3937                         if (errno == EINVAL)
3938                                 r = log_error_errno(errno, "clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
3939                         else
3940                                 r = log_error_errno(errno, "clone() failed: %m");
3941
3942                         goto finish;
3943                 }
3944
3945                 if (pid == 0) {
3946                         /* child */
3947                         _cleanup_free_ char *home = NULL;
3948                         unsigned n_env = 2;
3949                         const char *envp[] = {
3950                                 "PATH=" DEFAULT_PATH_SPLIT_USR,
3951                                 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
3952                                 NULL, /* TERM */
3953                                 NULL, /* HOME */
3954                                 NULL, /* USER */
3955                                 NULL, /* LOGNAME */
3956                                 NULL, /* container_uuid */
3957                                 NULL, /* LISTEN_FDS */
3958                                 NULL, /* LISTEN_PID */
3959                                 NULL
3960                         };
3961                         char **env_use;
3962
3963                         barrier_set_role(&barrier, BARRIER_CHILD);
3964
3965                         envp[n_env] = strv_find_prefix(environ, "TERM=");
3966                         if (envp[n_env])
3967                                 n_env ++;
3968
3969                         master = safe_close(master);
3970
3971                         kmsg_socket_pair[0] = safe_close(kmsg_socket_pair[0]);
3972                         rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
3973
3974                         reset_all_signal_handlers();
3975                         reset_signal_mask();
3976
3977                         if (interactive) {
3978                                 close_nointr(STDIN_FILENO);
3979                                 close_nointr(STDOUT_FILENO);
3980                                 close_nointr(STDERR_FILENO);
3981
3982                                 r = open_terminal(console, O_RDWR);
3983                                 if (r != STDIN_FILENO) {
3984                                         if (r >= 0) {
3985                                                 safe_close(r);
3986                                                 r = -EINVAL;
3987                                         }
3988
3989                                         log_error_errno(r, "Failed to open console: %m");
3990                                         _exit(EXIT_FAILURE);
3991                                 }
3992
3993                                 if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
3994                                     dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO) {
3995                                         log_error_errno(errno, "Failed to duplicate console: %m");
3996                                         _exit(EXIT_FAILURE);
3997                                 }
3998                         }
3999
4000                         if (setsid() < 0) {
4001                                 log_error_errno(errno, "setsid() failed: %m");
4002                                 _exit(EXIT_FAILURE);
4003                         }
4004
4005                         if (reset_audit_loginuid() < 0)
4006                                 _exit(EXIT_FAILURE);
4007
4008                         if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
4009                                 log_error_errno(errno, "PR_SET_PDEATHSIG failed: %m");
4010                                 _exit(EXIT_FAILURE);
4011                         }
4012
4013                         if (arg_private_network)
4014                                 loopback_setup();
4015
4016                         /* Mark everything as slave, so that we still
4017                          * receive mounts from the real root, but don't
4018                          * propagate mounts to the real root. */
4019                         if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
4020                                 log_error_errno(errno, "MS_SLAVE|MS_REC failed: %m");
4021                                 _exit(EXIT_FAILURE);
4022                         }
4023
4024                         if (mount_devices(arg_directory,
4025                                           root_device, root_device_rw,
4026                                           home_device, home_device_rw,
4027                                           srv_device, srv_device_rw) < 0)
4028                                 _exit(EXIT_FAILURE);
4029
4030                         /* Turn directory into bind mount */
4031                         if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REC, NULL) < 0) {
4032                                 log_error_errno(errno, "Failed to make bind mount: %m");
4033                                 _exit(EXIT_FAILURE);
4034                         }
4035
4036                         r = setup_volatile(arg_directory);
4037                         if (r < 0)
4038                                 _exit(EXIT_FAILURE);
4039
4040                         if (setup_volatile_state(arg_directory) < 0)
4041                                 _exit(EXIT_FAILURE);
4042
4043                         r = base_filesystem_create(arg_directory);
4044                         if (r < 0)
4045                                 _exit(EXIT_FAILURE);
4046
4047                         if (arg_read_only) {
4048                                 r = bind_remount_recursive(arg_directory, true);
4049                                 if (r < 0) {
4050                                         log_error_errno(r, "Failed to make tree read-only: %m");
4051                                         _exit(EXIT_FAILURE);
4052                                 }
4053                         }
4054
4055                         if (mount_all(arg_directory) < 0)
4056                                 _exit(EXIT_FAILURE);
4057
4058                         if (copy_devnodes(arg_directory) < 0)
4059                                 _exit(EXIT_FAILURE);
4060
4061                         if (setup_ptmx(arg_directory) < 0)
4062                                 _exit(EXIT_FAILURE);
4063
4064                         dev_setup(arg_directory);
4065
4066                         if (setup_propagate(arg_directory) < 0)
4067                                 _exit(EXIT_FAILURE);
4068
4069                         if (setup_seccomp() < 0)
4070                                 _exit(EXIT_FAILURE);
4071
4072                         if (setup_dev_console(arg_directory, console) < 0)
4073                                 _exit(EXIT_FAILURE);
4074
4075                         if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
4076                                 _exit(EXIT_FAILURE);
4077                         kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
4078
4079                         if (send_rtnl(rtnl_socket_pair[1]) < 0)
4080                                 _exit(EXIT_FAILURE);
4081                         rtnl_socket_pair[1] = safe_close(rtnl_socket_pair[1]);
4082
4083                         /* Tell the parent that we are ready, and that
4084                          * it can cgroupify us so that we lack access
4085                          * to certain devices and resources. */
4086                         (void) barrier_place(&barrier); /* #1 */
4087
4088                         if (setup_boot_id(arg_directory) < 0)
4089                                 _exit(EXIT_FAILURE);
4090
4091                         if (setup_timezone(arg_directory) < 0)
4092                                 _exit(EXIT_FAILURE);
4093
4094                         if (setup_resolv_conf(arg_directory) < 0)
4095                                 _exit(EXIT_FAILURE);
4096
4097                         if (setup_journal(arg_directory) < 0)
4098                                 _exit(EXIT_FAILURE);
4099
4100                         if (mount_binds(arg_directory, arg_bind, false) < 0)
4101                                 _exit(EXIT_FAILURE);
4102
4103                         if (mount_binds(arg_directory, arg_bind_ro, true) < 0)
4104                                 _exit(EXIT_FAILURE);
4105
4106                         if (mount_tmpfs(arg_directory) < 0)
4107                                 _exit(EXIT_FAILURE);
4108
4109                         /* Wait until we are cgroup-ified, so that we
4110                          * can mount the right cgroup path writable */
4111                         (void) barrier_place_and_sync(&barrier); /* #2 */
4112
4113                         if (mount_cgroup(arg_directory) < 0)
4114                                 _exit(EXIT_FAILURE);
4115
4116                         if (chdir(arg_directory) < 0) {
4117                                 log_error_errno(errno, "chdir(%s) failed: %m", arg_directory);
4118                                 _exit(EXIT_FAILURE);
4119                         }
4120
4121                         if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) {
4122                                 log_error_errno(errno, "mount(MS_MOVE) failed: %m");
4123                                 _exit(EXIT_FAILURE);
4124                         }
4125
4126                         if (chroot(".") < 0) {
4127                                 log_error_errno(errno, "chroot() failed: %m");
4128                                 _exit(EXIT_FAILURE);
4129                         }
4130
4131                         if (chdir("/") < 0) {
4132                                 log_error_errno(errno, "chdir() failed: %m");
4133                                 _exit(EXIT_FAILURE);
4134                         }
4135
4136                         if (arg_userns) {
4137                                 if (unshare(CLONE_NEWUSER) < 0) {
4138                                         log_error_errno(errno, "unshare(CLONE_NEWUSER) failed: %m");
4139                                         _exit(EXIT_FAILURE);
4140                                 }
4141
4142                                 /* Tell the parent, that it now can
4143                                  * write the UID map. */
4144                                 (void) barrier_place(&barrier); /* #3 */
4145
4146                                 /* Wait until the parent wrote the UID
4147                                  * map */
4148                                 (void) barrier_place_and_sync(&barrier); /* #4 */
4149                         }
4150
4151                         umask(0022);
4152
4153                         if (drop_capabilities() < 0) {
4154                                 log_error_errno(errno, "drop_capabilities() failed: %m");
4155                                 _exit(EXIT_FAILURE);
4156                         }
4157
4158                         setup_hostname();
4159
4160                         if (arg_personality != 0xffffffffLU) {
4161                                 if (personality(arg_personality) < 0) {
4162                                         log_error_errno(errno, "personality() failed: %m");
4163                                         _exit(EXIT_FAILURE);
4164                                 }
4165                         } else if (secondary) {
4166                                 if (personality(PER_LINUX32) < 0) {
4167                                         log_error_errno(errno, "personality() failed: %m");
4168                                         _exit(EXIT_FAILURE);
4169                                 }
4170                         }
4171
4172 #ifdef HAVE_SELINUX
4173                         if (arg_selinux_context)
4174                                 if (setexeccon((security_context_t) arg_selinux_context) < 0) {
4175                                         log_error_errno(errno, "setexeccon(\"%s\") failed: %m", arg_selinux_context);
4176                                         _exit(EXIT_FAILURE);
4177                                 }
4178 #endif
4179
4180                         r = change_uid_gid(&home);
4181                         if (r < 0)
4182                                 _exit(EXIT_FAILURE);
4183
4184                         if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
4185                             (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
4186                             (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
4187                                 log_oom();
4188                                 _exit(EXIT_FAILURE);
4189                         }
4190
4191                         if (!sd_id128_equal(arg_uuid, SD_ID128_NULL)) {
4192                                 char as_uuid[37];
4193
4194                                 if (asprintf((char**)(envp + n_env++), "container_uuid=%s", id128_format_as_uuid(arg_uuid, as_uuid)) < 0) {
4195                                         log_oom();
4196                                         _exit(EXIT_FAILURE);
4197                                 }
4198                         }
4199
4200                         if (fdset_size(fds) > 0) {
4201                                 r = fdset_cloexec(fds, false);
4202                                 if (r < 0) {
4203                                         log_error_errno(r, "Failed to unset O_CLOEXEC for file descriptors.");
4204                                         _exit(EXIT_FAILURE);
4205                                 }
4206
4207                                 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", n_fd_passed) < 0) ||
4208                                     (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0)) {
4209                                         log_oom();
4210                                         _exit(EXIT_FAILURE);
4211                                 }
4212                         }
4213
4214                         if (!strv_isempty(arg_setenv)) {
4215                                 char **n;
4216
4217                                 n = strv_env_merge(2, envp, arg_setenv);
4218                                 if (!n) {
4219                                         log_oom();
4220                                         _exit(EXIT_FAILURE);
4221                                 }
4222
4223                                 env_use = n;
4224                         } else
4225                                 env_use = (char**) envp;
4226
4227                         /* Let the parent know that we are ready and
4228                          * wait until the parent is ready with the
4229                          * setup, too... */
4230                         (void) barrier_place_and_sync(&barrier); /* #5 */
4231
4232                         if (arg_boot) {
4233                                 char **a;
4234                                 size_t l;
4235
4236                                 /* Automatically search for the init system */
4237
4238                                 l = 1 + argc - optind;
4239                                 a = newa(char*, l + 1);
4240                                 memcpy(a + 1, argv + optind, l * sizeof(char*));
4241
4242                                 a[0] = (char*) "/usr/lib/systemd/systemd";
4243                                 execve(a[0], a, env_use);
4244
4245                                 a[0] = (char*) "/lib/systemd/systemd";
4246                                 execve(a[0], a, env_use);
4247
4248                                 a[0] = (char*) "/sbin/init";
4249                                 execve(a[0], a, env_use);
4250                         } else if (argc > optind)
4251                                 execvpe(argv[optind], argv + optind, env_use);
4252                         else {
4253                                 chdir(home ? home : "/root");
4254                                 execle("/bin/bash", "-bash", NULL, env_use);
4255                                 execle("/bin/sh", "-sh", NULL, env_use);
4256                         }
4257
4258                         log_error_errno(errno, "execv() failed: %m");
4259                         _exit(EXIT_FAILURE);
4260                 }
4261
4262                 barrier_set_role(&barrier, BARRIER_PARENT);
4263                 fdset_free(fds);
4264                 fds = NULL;
4265
4266                 kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
4267                 rtnl_socket_pair[1] = safe_close(rtnl_socket_pair[1]);
4268
4269                 (void) barrier_place(&barrier); /* #1 */
4270
4271                 /* Wait for the most basic Child-setup to be done,
4272                  * before we add hardware to it, and place it in a
4273                  * cgroup. */
4274                 if (barrier_sync(&barrier)) { /* #1 */
4275                         int ifi = 0;
4276
4277                         r = move_network_interfaces(pid);
4278                         if (r < 0)
4279                                 goto finish;
4280
4281                         r = setup_veth(pid, veth_name, &ifi);
4282                         if (r < 0)
4283                                 goto finish;
4284
4285                         r = setup_bridge(veth_name, &ifi);
4286                         if (r < 0)
4287                                 goto finish;
4288
4289                         r = setup_macvlan(pid);
4290                         if (r < 0)
4291                                 goto finish;
4292
4293                         r = setup_ipvlan(pid);
4294                         if (r < 0)
4295                                 goto finish;
4296
4297                         r = register_machine(pid, ifi);
4298                         if (r < 0)
4299                                 goto finish;
4300
4301                         /* Notify the child that the parent is ready with all
4302                          * its setup, and that the child can now hand over
4303                          * control to the code to run inside the container. */
4304                         (void) barrier_place(&barrier); /* #2 */
4305
4306                         if (arg_userns) {
4307                                 char uid_map[strlen("/proc//uid_map") + DECIMAL_STR_MAX(uid_t) + 1], line[DECIMAL_STR_MAX(uid_t)*3+3+1];
4308
4309                                 (void) barrier_place_and_sync(&barrier); /* #3 */
4310
4311                                 xsprintf(uid_map, "/proc/" PID_FMT "/uid_map", pid);
4312                                 xsprintf(line, UID_FMT " " UID_FMT " " UID_FMT "\n", 0, arg_uid_shift, arg_uid_range);
4313                                 r = write_string_file(uid_map, line);
4314                                 if (r < 0) {
4315                                         log_error_errno(r, "Failed to write UID map: %m");
4316                                         goto finish;
4317                                 }
4318
4319                                 /* We always assign the same UID and GID ranges */
4320                                 xsprintf(uid_map, "/proc/" PID_FMT "/gid_map", pid);
4321                                 r = write_string_file(uid_map, line);
4322                                 if (r < 0) {
4323                                         log_error_errno(r, "Failed to write GID map: %m");
4324                                         goto finish;
4325                                 }
4326
4327                                 (void) barrier_place(&barrier); /* #4 */
4328                         }
4329
4330                         /* Block SIGCHLD here, before notifying child.
4331                          * process_pty() will handle it with the other signals. */
4332                         r = sigprocmask(SIG_BLOCK, &mask_chld, NULL);
4333                         if (r < 0)
4334                                 goto finish;
4335
4336                         /* Reset signal to default */
4337                         r = default_signals(SIGCHLD, -1);
4338                         if (r < 0)
4339                                 goto finish;
4340
4341                         /* Let the child know that we are ready, and wait until the child is completely ready, too. */
4342                         if (barrier_place_and_sync(&barrier)) { /* #5 */
4343                                 _cleanup_event_unref_ sd_event *event = NULL;
4344                                 _cleanup_(pty_forward_freep) PTYForward *forward = NULL;
4345                                 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
4346                                 char last_char = 0;
4347
4348                                 sd_notifyf(false,
4349                                            "READY=1\n"
4350                                            "STATUS=Container running.\n"
4351                                            "X_NSPAWN_LEADER_PID=" PID_FMT, pid);
4352
4353                                 r = sd_event_new(&event);
4354                                 if (r < 0) {
4355                                         log_error_errno(r, "Failed to get default event source: %m");
4356                                         goto finish;
4357                                 }
4358
4359                                 if (arg_boot) {
4360                                         /* Try to kill the init system on SIGINT or SIGTERM */
4361                                         sd_event_add_signal(event, NULL, SIGINT, on_orderly_shutdown, UINT32_TO_PTR(pid));
4362                                         sd_event_add_signal(event, NULL, SIGTERM, on_orderly_shutdown, UINT32_TO_PTR(pid));
4363                                 } else {
4364                                         /* Immediately exit */
4365                                         sd_event_add_signal(event, NULL, SIGINT, NULL, NULL);
4366                                         sd_event_add_signal(event, NULL, SIGTERM, NULL, NULL);
4367                                 }
4368
4369                                 /* simply exit on sigchld */
4370                                 sd_event_add_signal(event, NULL, SIGCHLD, NULL, NULL);
4371
4372                                 if (arg_expose_ports) {
4373                                         r = watch_rtnl(event, rtnl_socket_pair[0], &exposed, &rtnl);
4374                                         if (r < 0)
4375                                                 goto finish;
4376
4377                                         (void) expose_ports(rtnl, &exposed);
4378                                 }
4379
4380                                 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
4381
4382                                 r = pty_forward_new(event, master, true, !interactive, &forward);
4383                                 if (r < 0) {
4384                                         log_error_errno(r, "Failed to create PTY forwarder: %m");
4385                                         goto finish;
4386                                 }
4387
4388                                 r = sd_event_loop(event);
4389                                 if (r < 0) {
4390                                         log_error_errno(r, "Failed to run event loop: %m");
4391                                         goto finish;
4392                                 }
4393
4394                                 pty_forward_get_last_char(forward, &last_char);
4395
4396                                 forward = pty_forward_free(forward);
4397
4398                                 if (!arg_quiet && last_char != '\n')
4399                                         putc('\n', stdout);
4400
4401                                 /* Kill if it is not dead yet anyway */
4402                                 terminate_machine(pid);
4403                         }
4404                 }
4405
4406                 /* Normally redundant, but better safe than sorry */
4407                 kill(pid, SIGKILL);
4408
4409                 r = wait_for_container(pid, &container_status);
4410                 pid = 0;
4411
4412                 if (r < 0)
4413                         /* We failed to wait for the container, or the
4414                          * container exited abnormally */
4415                         goto finish;
4416                 else if (r > 0 || container_status == CONTAINER_TERMINATED){
4417                         /* The container exited with a non-zero
4418                          * status, or with zero status and no reboot
4419                          * was requested. */
4420                         ret = r;
4421                         break;
4422                 }
4423
4424                 /* CONTAINER_REBOOTED, loop again */
4425
4426                 if (arg_keep_unit) {
4427                         /* Special handling if we are running as a
4428                          * service: instead of simply restarting the
4429                          * machine we want to restart the entire
4430                          * service, so let's inform systemd about this
4431                          * with the special exit code 133. The service
4432                          * file uses RestartForceExitStatus=133 so
4433                          * that this results in a full nspawn
4434                          * restart. This is necessary since we might
4435                          * have cgroup parameters set we want to have
4436                          * flushed out. */
4437                         ret = 133;
4438                         r = 0;
4439                         break;
4440                 }
4441
4442                 flush_ports(&exposed);
4443         }
4444
4445 finish:
4446         sd_notify(false,
4447                   "STOPPING=1\n"
4448                   "STATUS=Terminating...");
4449
4450         loop_remove(loop_nr, &image_fd);
4451
4452         if (pid > 0)
4453                 kill(pid, SIGKILL);
4454
4455         if (remove_subvol && arg_directory) {
4456                 int k;
4457
4458                 k = btrfs_subvol_remove(arg_directory);
4459                 if (k < 0)
4460                         log_warning_errno(k, "Cannot remove subvolume '%s', ignoring: %m", arg_directory);
4461         }
4462
4463         if (arg_machine) {
4464                 const char *p;
4465
4466                 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
4467                 (void) rm_rf(p, false, true, false);
4468         }
4469
4470         free(arg_directory);
4471         free(arg_template);
4472         free(arg_image);
4473         free(arg_machine);
4474         free(arg_user);
4475         strv_free(arg_setenv);
4476         strv_free(arg_network_interfaces);
4477         strv_free(arg_network_macvlan);
4478         strv_free(arg_network_ipvlan);
4479         strv_free(arg_bind);
4480         strv_free(arg_bind_ro);
4481         strv_free(arg_tmpfs);
4482
4483         flush_ports(&exposed);
4484
4485         while (arg_expose_ports) {
4486                 ExposePort *p = arg_expose_ports;
4487                 LIST_REMOVE(ports, arg_expose_ports, p);
4488                 free(p);
4489         }
4490
4491         return r < 0 ? EXIT_FAILURE : ret;
4492 }