chiark / gitweb /
2beb810e5def98b949e9305418d82878fdf8f96c
[elogind.git] / src / nspawn / nspawn.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4   This file is part of systemd.
5
6   Copyright 2010 Lennart Poettering
7
8   systemd is free software; you can redistribute it and/or modify it
9   under the terms of the GNU Lesser General Public License as published by
10   the Free Software Foundation; either version 2.1 of the License, or
11   (at your option) any later version.
12
13   systemd is distributed in the hope that it will be useful, but
14   WITHOUT ANY WARRANTY; without even the implied warranty of
15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16   Lesser General Public License for more details.
17
18   You should have received a copy of the GNU Lesser General Public License
19   along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <signal.h>
23 #include <sched.h>
24 #include <unistd.h>
25 #include <sys/types.h>
26 #include <sys/syscall.h>
27 #include <sys/mount.h>
28 #include <sys/wait.h>
29 #include <stdlib.h>
30 #include <string.h>
31 #include <stdio.h>
32 #include <errno.h>
33 #include <sys/prctl.h>
34 #include <getopt.h>
35 #include <termios.h>
36 #include <sys/signalfd.h>
37 #include <grp.h>
38 #include <linux/fs.h>
39 #include <sys/un.h>
40 #include <sys/socket.h>
41 #include <linux/netlink.h>
42 #include <net/if.h>
43 #include <linux/veth.h>
44 #include <sys/personality.h>
45 #include <linux/loop.h>
46 #include <poll.h>
47 #include <sys/file.h>
48
49 #ifdef HAVE_SELINUX
50 #include <selinux/selinux.h>
51 #endif
52
53 #ifdef HAVE_SECCOMP
54 #include <seccomp.h>
55 #endif
56
57 #ifdef HAVE_BLKID
58 #include <blkid/blkid.h>
59 #endif
60
61 #include "sd-daemon.h"
62 #include "sd-bus.h"
63 #include "sd-id128.h"
64 #include "sd-rtnl.h"
65 #include "log.h"
66 #include "util.h"
67 #include "mkdir.h"
68 #include "macro.h"
69 #include "audit.h"
70 #include "missing.h"
71 #include "cgroup-util.h"
72 #include "strv.h"
73 #include "path-util.h"
74 #include "loopback-setup.h"
75 #include "dev-setup.h"
76 #include "fdset.h"
77 #include "build.h"
78 #include "fileio.h"
79 #include "bus-util.h"
80 #include "bus-error.h"
81 #include "ptyfwd.h"
82 #include "bus-kernel.h"
83 #include "env-util.h"
84 #include "def.h"
85 #include "rtnl-util.h"
86 #include "udev-util.h"
87 #include "blkid-util.h"
88 #include "gpt.h"
89 #include "siphash24.h"
90 #include "copy.h"
91 #include "base-filesystem.h"
92 #include "barrier.h"
93 #include "event-util.h"
94 #include "capability.h"
95 #include "cap-list.h"
96 #include "btrfs-util.h"
97 #include "machine-image.h"
98 #include "list.h"
99 #include "in-addr-util.h"
100 #include "fw-util.h"
101 #include "local-addresses.h"
102
103 #ifdef HAVE_SECCOMP
104 #include "seccomp-util.h"
105 #endif
106
107 typedef struct ExposePort {
108         int protocol;
109         uint16_t host_port;
110         uint16_t container_port;
111         LIST_FIELDS(struct ExposePort, ports);
112 } ExposePort;
113
/* Ways in which a container run can end. NOTE(review): the code that
 * consumes these values is outside this excerpt; the meaning is taken
 * from the enumerator names only. */
typedef enum ContainerStatus {
        CONTAINER_TERMINATED = 0,
        CONTAINER_REBOOTED   = 1
} ContainerStatus;
118
/* Journal linking mode, selected with --link-journal= (or -j). The
 * "try-" variants of the option map to LINK_GUEST/LINK_HOST as well
 * and are distinguished by a separate flag. */
typedef enum LinkJournal {
        LINK_NO    = 0,        /* --link-journal=no */
        LINK_AUTO  = 1,        /* --link-journal=auto */
        LINK_HOST  = 2,        /* --link-journal=host, try-host */
        LINK_GUEST = 3         /* --link-journal=guest, try-guest, -j */
} LinkJournal;
125
/* Volatile mode, selected with --volatile[=MODE] */
typedef enum Volatile {
        VOLATILE_NO    = 0,    /* option absent, or a false boolean argument */
        VOLATILE_YES   = 1,    /* option without argument, or a true boolean argument */
        VOLATILE_STATE = 2,    /* --volatile=state */
} Volatile;
131
132 static char *arg_directory = NULL;
133 static char *arg_template = NULL;
134 static char *arg_user = NULL;
135 static sd_id128_t arg_uuid = {};
136 static char *arg_machine = NULL;
137 static const char *arg_selinux_context = NULL;
138 static const char *arg_selinux_apifs_context = NULL;
139 static const char *arg_slice = NULL;
140 static bool arg_private_network = false;
141 static bool arg_read_only = false;
142 static bool arg_boot = false;
143 static bool arg_ephemeral = false;
144 static LinkJournal arg_link_journal = LINK_AUTO;
145 static bool arg_link_journal_try = false;
146 static uint64_t arg_retain =
147         (1ULL << CAP_CHOWN) |
148         (1ULL << CAP_DAC_OVERRIDE) |
149         (1ULL << CAP_DAC_READ_SEARCH) |
150         (1ULL << CAP_FOWNER) |
151         (1ULL << CAP_FSETID) |
152         (1ULL << CAP_IPC_OWNER) |
153         (1ULL << CAP_KILL) |
154         (1ULL << CAP_LEASE) |
155         (1ULL << CAP_LINUX_IMMUTABLE) |
156         (1ULL << CAP_NET_BIND_SERVICE) |
157         (1ULL << CAP_NET_BROADCAST) |
158         (1ULL << CAP_NET_RAW) |
159         (1ULL << CAP_SETGID) |
160         (1ULL << CAP_SETFCAP) |
161         (1ULL << CAP_SETPCAP) |
162         (1ULL << CAP_SETUID) |
163         (1ULL << CAP_SYS_ADMIN) |
164         (1ULL << CAP_SYS_CHROOT) |
165         (1ULL << CAP_SYS_NICE) |
166         (1ULL << CAP_SYS_PTRACE) |
167         (1ULL << CAP_SYS_TTY_CONFIG) |
168         (1ULL << CAP_SYS_RESOURCE) |
169         (1ULL << CAP_SYS_BOOT) |
170         (1ULL << CAP_AUDIT_WRITE) |
171         (1ULL << CAP_AUDIT_CONTROL) |
172         (1ULL << CAP_MKNOD);
173 static char **arg_bind = NULL;
174 static char **arg_bind_ro = NULL;
175 static char **arg_tmpfs = NULL;
176 static char **arg_setenv = NULL;
177 static bool arg_quiet = false;
178 static bool arg_share_system = false;
179 static bool arg_register = true;
180 static bool arg_keep_unit = false;
181 static char **arg_network_interfaces = NULL;
182 static char **arg_network_macvlan = NULL;
183 static char **arg_network_ipvlan = NULL;
184 static bool arg_network_veth = false;
185 static const char *arg_network_bridge = NULL;
186 static unsigned long arg_personality = 0xffffffffLU;
187 static char *arg_image = NULL;
188 static Volatile arg_volatile = VOLATILE_NO;
189 static ExposePort *arg_expose_ports = NULL;
190
191 static void help(void) {
192         printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
193                "Spawn a minimal namespace container for debugging, testing and building.\n\n"
194                "  -h --help                 Show this help\n"
195                "     --version              Print version string\n"
196                "  -q --quiet                Do not show status information\n"
197                "  -D --directory=PATH       Root directory for the container\n"
198                "     --template=PATH        Initialize root directory from template directory,\n"
199                "                            if missing\n"
200                "  -x --ephemeral            Run container with snapshot of root directory, and\n"
201                "                            remove it after exit\n"
202                "  -i --image=PATH           File system device or disk image for the container\n"
203                "  -b --boot                 Boot up full system (i.e. invoke init)\n"
204                "  -u --user=USER            Run the command under specified user or uid\n"
205                "  -M --machine=NAME         Set the machine name for the container\n"
206                "     --uuid=UUID            Set a specific machine UUID for the container\n"
207                "  -S --slice=SLICE          Place the container in the specified slice\n"
208                "     --private-network      Disable network in container\n"
209                "     --network-interface=INTERFACE\n"
210                "                            Assign an existing network interface to the\n"
211                "                            container\n"
212                "     --network-macvlan=INTERFACE\n"
213                "                            Create a macvlan network interface based on an\n"
214                "                            existing network interface to the container\n"
215                "     --network-ipvlan=INTERFACE\n"
216                "                            Create a ipvlan network interface based on an\n"
217                "                            existing network interface to the container\n"
218                "  -n --network-veth         Add a virtual ethernet connection between host\n"
219                "                            and container\n"
220                "     --network-bridge=INTERFACE\n"
221                "                            Add a virtual ethernet connection between host\n"
222                "                            and container and add it to an existing bridge on\n"
223                "                            the host\n"
224                "  -p --port=[PROTOCOL:]HOSTPORT[:CONTAINERPORT]\n"
225                "                            Expose a container IP port on the host\n"
226                "  -Z --selinux-context=SECLABEL\n"
227                "                            Set the SELinux security context to be used by\n"
228                "                            processes in the container\n"
229                "  -L --selinux-apifs-context=SECLABEL\n"
230                "                            Set the SELinux security context to be used by\n"
231                "                            API/tmpfs file systems in the container\n"
232                "     --capability=CAP       In addition to the default, retain specified\n"
233                "                            capability\n"
234                "     --drop-capability=CAP  Drop the specified capability from the default set\n"
235                "     --link-journal=MODE    Link up guest journal, one of no, auto, guest, host,\n"
236                "                            try-guest, try-host\n"
237                "  -j                        Equivalent to --link-journal=try-guest\n"
238                "     --read-only            Mount the root directory read-only\n"
239                "     --bind=PATH[:PATH]     Bind mount a file or directory from the host into\n"
240                "                            the container\n"
241                "     --bind-ro=PATH[:PATH]  Similar, but creates a read-only bind mount\n"
242                "     --tmpfs=PATH:[OPTIONS] Mount an empty tmpfs to the specified directory\n"
243                "     --setenv=NAME=VALUE    Pass an environment variable to PID 1\n"
244                "     --share-system         Share system namespaces with host\n"
245                "     --register=BOOLEAN     Register container as machine\n"
246                "     --keep-unit            Do not register a scope for the machine, reuse\n"
247                "                            the service unit nspawn is running in\n"
248                "     --volatile[=MODE]      Run the system in volatile mode\n"
249                , program_invocation_short_name);
250 }
251
/* Replaces the string *b with a cleaned-up form of "path".
 *
 * The path is first resolved with canonicalize_file_name(). If that
 * fails because the path does not exist (ENOENT), the path is instead
 * made absolute with path_make_absolute_cwd(). Whichever string results
 * is passed through path_kill_slashes() and stored in *b, after the
 * previous value of *b has been freed.
 *
 * Returns 0 on success, -errno if canonicalization fails for a reason
 * other than ENOENT, and -ENOMEM if the fallback yields no string. On
 * failure *b is left untouched. */
static int set_sanitized_path(char **b, const char *path) {
        char *resolved;

        assert(b);
        assert(path);

        resolved = canonicalize_file_name(path);

        /* Any failure other than "does not exist" is reported as is */
        if (resolved == NULL && errno != ENOENT)
                return -errno;

        /* A missing path cannot be canonicalized, fall back to simply
         * making it absolute */
        if (resolved == NULL) {
                resolved = path_make_absolute_cwd(path);
                if (resolved == NULL)
                        return -ENOMEM;
        }

        free(*b);
        *b = path_kill_slashes(resolved);

        return 0;
}
272
273 static int parse_argv(int argc, char *argv[]) {
274
275         enum {
276                 ARG_VERSION = 0x100,
277                 ARG_PRIVATE_NETWORK,
278                 ARG_UUID,
279                 ARG_READ_ONLY,
280                 ARG_CAPABILITY,
281                 ARG_DROP_CAPABILITY,
282                 ARG_LINK_JOURNAL,
283                 ARG_BIND,
284                 ARG_BIND_RO,
285                 ARG_TMPFS,
286                 ARG_SETENV,
287                 ARG_SHARE_SYSTEM,
288                 ARG_REGISTER,
289                 ARG_KEEP_UNIT,
290                 ARG_NETWORK_INTERFACE,
291                 ARG_NETWORK_MACVLAN,
292                 ARG_NETWORK_IPVLAN,
293                 ARG_NETWORK_BRIDGE,
294                 ARG_PERSONALITY,
295                 ARG_VOLATILE,
296                 ARG_TEMPLATE,
297         };
298
299         static const struct option options[] = {
300                 { "help",                  no_argument,       NULL, 'h'                   },
301                 { "version",               no_argument,       NULL, ARG_VERSION           },
302                 { "directory",             required_argument, NULL, 'D'                   },
303                 { "template",              required_argument, NULL, ARG_TEMPLATE          },
304                 { "ephemeral",             no_argument,       NULL, 'x'                   },
305                 { "user",                  required_argument, NULL, 'u'                   },
306                 { "private-network",       no_argument,       NULL, ARG_PRIVATE_NETWORK   },
307                 { "boot",                  no_argument,       NULL, 'b'                   },
308                 { "uuid",                  required_argument, NULL, ARG_UUID              },
309                 { "read-only",             no_argument,       NULL, ARG_READ_ONLY         },
310                 { "capability",            required_argument, NULL, ARG_CAPABILITY        },
311                 { "drop-capability",       required_argument, NULL, ARG_DROP_CAPABILITY   },
312                 { "link-journal",          required_argument, NULL, ARG_LINK_JOURNAL      },
313                 { "bind",                  required_argument, NULL, ARG_BIND              },
314                 { "bind-ro",               required_argument, NULL, ARG_BIND_RO           },
315                 { "tmpfs",                 required_argument, NULL, ARG_TMPFS             },
316                 { "machine",               required_argument, NULL, 'M'                   },
317                 { "slice",                 required_argument, NULL, 'S'                   },
318                 { "setenv",                required_argument, NULL, ARG_SETENV            },
319                 { "selinux-context",       required_argument, NULL, 'Z'                   },
320                 { "selinux-apifs-context", required_argument, NULL, 'L'                   },
321                 { "quiet",                 no_argument,       NULL, 'q'                   },
322                 { "share-system",          no_argument,       NULL, ARG_SHARE_SYSTEM      },
323                 { "register",              required_argument, NULL, ARG_REGISTER          },
324                 { "keep-unit",             no_argument,       NULL, ARG_KEEP_UNIT         },
325                 { "network-interface",     required_argument, NULL, ARG_NETWORK_INTERFACE },
326                 { "network-macvlan",       required_argument, NULL, ARG_NETWORK_MACVLAN   },
327                 { "network-ipvlan",        required_argument, NULL, ARG_NETWORK_IPVLAN    },
328                 { "network-veth",          no_argument,       NULL, 'n'                   },
329                 { "network-bridge",        required_argument, NULL, ARG_NETWORK_BRIDGE    },
330                 { "personality",           required_argument, NULL, ARG_PERSONALITY       },
331                 { "image",                 required_argument, NULL, 'i'                   },
332                 { "volatile",              optional_argument, NULL, ARG_VOLATILE          },
333                 { "port",                  required_argument, NULL, 'p'                   },
334                 {}
335         };
336
337         int c, r;
338         uint64_t plus = 0, minus = 0;
339
340         assert(argc >= 0);
341         assert(argv);
342
343         while ((c = getopt_long(argc, argv, "+hD:u:bL:M:jS:Z:qi:xp:n", options, NULL)) >= 0)
344
345                 switch (c) {
346
347                 case 'h':
348                         help();
349                         return 0;
350
351                 case ARG_VERSION:
352                         puts(PACKAGE_STRING);
353                         puts(SYSTEMD_FEATURES);
354                         return 0;
355
356                 case 'D':
357                         r = set_sanitized_path(&arg_directory, optarg);
358                         if (r < 0)
359                                 return log_error_errno(r, "Invalid root directory: %m");
360
361                         break;
362
363                 case ARG_TEMPLATE:
364                         r = set_sanitized_path(&arg_template, optarg);
365                         if (r < 0)
366                                 return log_error_errno(r, "Invalid template directory: %m");
367
368                         break;
369
370                 case 'i':
371                         r = set_sanitized_path(&arg_image, optarg);
372                         if (r < 0)
373                                 return log_error_errno(r, "Invalid image path: %m");
374
375                         break;
376
377                 case 'x':
378                         arg_ephemeral = true;
379                         break;
380
381                 case 'u':
382                         free(arg_user);
383                         arg_user = strdup(optarg);
384                         if (!arg_user)
385                                 return log_oom();
386
387                         break;
388
389                 case ARG_NETWORK_BRIDGE:
390                         arg_network_bridge = optarg;
391
392                         /* fall through */
393
394                 case 'n':
395                         arg_network_veth = true;
396                         arg_private_network = true;
397                         break;
398
399                 case ARG_NETWORK_INTERFACE:
400                         if (strv_extend(&arg_network_interfaces, optarg) < 0)
401                                 return log_oom();
402
403                         arg_private_network = true;
404                         break;
405
406                 case ARG_NETWORK_MACVLAN:
407                         if (strv_extend(&arg_network_macvlan, optarg) < 0)
408                                 return log_oom();
409
410                         arg_private_network = true;
411                         break;
412
413                 case ARG_NETWORK_IPVLAN:
414                         if (strv_extend(&arg_network_ipvlan, optarg) < 0)
415                                 return log_oom();
416
417                         /* fall through */
418
419                 case ARG_PRIVATE_NETWORK:
420                         arg_private_network = true;
421                         break;
422
423                 case 'b':
424                         arg_boot = true;
425                         break;
426
427                 case ARG_UUID:
428                         r = sd_id128_from_string(optarg, &arg_uuid);
429                         if (r < 0) {
430                                 log_error("Invalid UUID: %s", optarg);
431                                 return r;
432                         }
433                         break;
434
435                 case 'S':
436                         arg_slice = optarg;
437                         break;
438
439                 case 'M':
440                         if (isempty(optarg)) {
441                                 free(arg_machine);
442                                 arg_machine = NULL;
443                         } else {
444                                 if (!machine_name_is_valid(optarg)) {
445                                         log_error("Invalid machine name: %s", optarg);
446                                         return -EINVAL;
447                                 }
448
449                                 r = free_and_strdup(&arg_machine, optarg);
450                                 if (r < 0)
451                                         return log_oom();
452
453                                 break;
454                         }
455
456                 case 'Z':
457                         arg_selinux_context = optarg;
458                         break;
459
460                 case 'L':
461                         arg_selinux_apifs_context = optarg;
462                         break;
463
464                 case ARG_READ_ONLY:
465                         arg_read_only = true;
466                         break;
467
468                 case ARG_CAPABILITY:
469                 case ARG_DROP_CAPABILITY: {
470                         const char *state, *word;
471                         size_t length;
472
473                         FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
474                                 _cleanup_free_ char *t;
475
476                                 t = strndup(word, length);
477                                 if (!t)
478                                         return log_oom();
479
480                                 if (streq(t, "all")) {
481                                         if (c == ARG_CAPABILITY)
482                                                 plus = (uint64_t) -1;
483                                         else
484                                                 minus = (uint64_t) -1;
485                                 } else {
486                                         int cap;
487
488                                         cap = capability_from_name(t);
489                                         if (cap < 0) {
490                                                 log_error("Failed to parse capability %s.", t);
491                                                 return -EINVAL;
492                                         }
493
494                                         if (c == ARG_CAPABILITY)
495                                                 plus |= 1ULL << (uint64_t) cap;
496                                         else
497                                                 minus |= 1ULL << (uint64_t) cap;
498                                 }
499                         }
500
501                         break;
502                 }
503
504                 case 'j':
505                         arg_link_journal = LINK_GUEST;
506                         arg_link_journal_try = true;
507                         break;
508
509                 case ARG_LINK_JOURNAL:
510                         if (streq(optarg, "auto")) {
511                                 arg_link_journal = LINK_AUTO;
512                                 arg_link_journal_try = false;
513                         } else if (streq(optarg, "no")) {
514                                 arg_link_journal = LINK_NO;
515                                 arg_link_journal_try = false;
516                         } else if (streq(optarg, "guest")) {
517                                 arg_link_journal = LINK_GUEST;
518                                 arg_link_journal_try = false;
519                         } else if (streq(optarg, "host")) {
520                                 arg_link_journal = LINK_HOST;
521                                 arg_link_journal_try = false;
522                         } else if (streq(optarg, "try-guest")) {
523                                 arg_link_journal = LINK_GUEST;
524                                 arg_link_journal_try = true;
525                         } else if (streq(optarg, "try-host")) {
526                                 arg_link_journal = LINK_HOST;
527                                 arg_link_journal_try = true;
528                         } else {
529                                 log_error("Failed to parse link journal mode %s", optarg);
530                                 return -EINVAL;
531                         }
532
533                         break;
534
535                 case ARG_BIND:
536                 case ARG_BIND_RO: {
537                         _cleanup_free_ char *a = NULL, *b = NULL;
538                         char *e;
539                         char ***x;
540
541                         x = c == ARG_BIND ? &arg_bind : &arg_bind_ro;
542
543                         e = strchr(optarg, ':');
544                         if (e) {
545                                 a = strndup(optarg, e - optarg);
546                                 b = strdup(e + 1);
547                         } else {
548                                 a = strdup(optarg);
549                                 b = strdup(optarg);
550                         }
551
552                         if (!a || !b)
553                                 return log_oom();
554
555                         if (!path_is_absolute(a) || !path_is_absolute(b)) {
556                                 log_error("Invalid bind mount specification: %s", optarg);
557                                 return -EINVAL;
558                         }
559
560                         r = strv_extend(x, a);
561                         if (r < 0)
562                                 return log_oom();
563
564                         r = strv_extend(x, b);
565                         if (r < 0)
566                                 return log_oom();
567
568                         break;
569                 }
570
571                 case ARG_TMPFS: {
572                         _cleanup_free_ char *a = NULL, *b = NULL;
573                         char *e;
574
575                         e = strchr(optarg, ':');
576                         if (e) {
577                                 a = strndup(optarg, e - optarg);
578                                 b = strdup(e + 1);
579                         } else {
580                                 a = strdup(optarg);
581                                 b = strdup("mode=0755");
582                         }
583
584                         if (!a || !b)
585                                 return log_oom();
586
587                         if (!path_is_absolute(a)) {
588                                 log_error("Invalid tmpfs specification: %s", optarg);
589                                 return -EINVAL;
590                         }
591
592                         r = strv_push(&arg_tmpfs, a);
593                         if (r < 0)
594                                 return log_oom();
595
596                         a = NULL;
597
598                         r = strv_push(&arg_tmpfs, b);
599                         if (r < 0)
600                                 return log_oom();
601
602                         b = NULL;
603
604                         break;
605                 }
606
607                 case ARG_SETENV: {
608                         char **n;
609
610                         if (!env_assignment_is_valid(optarg)) {
611                                 log_error("Environment variable assignment '%s' is not valid.", optarg);
612                                 return -EINVAL;
613                         }
614
615                         n = strv_env_set(arg_setenv, optarg);
616                         if (!n)
617                                 return log_oom();
618
619                         strv_free(arg_setenv);
620                         arg_setenv = n;
621                         break;
622                 }
623
624                 case 'q':
625                         arg_quiet = true;
626                         break;
627
628                 case ARG_SHARE_SYSTEM:
629                         arg_share_system = true;
630                         break;
631
632                 case ARG_REGISTER:
633                         r = parse_boolean(optarg);
634                         if (r < 0) {
635                                 log_error("Failed to parse --register= argument: %s", optarg);
636                                 return r;
637                         }
638
639                         arg_register = r;
640                         break;
641
642                 case ARG_KEEP_UNIT:
643                         arg_keep_unit = true;
644                         break;
645
646                 case ARG_PERSONALITY:
647
648                         arg_personality = personality_from_string(optarg);
649                         if (arg_personality == 0xffffffffLU) {
650                                 log_error("Unknown or unsupported personality '%s'.", optarg);
651                                 return -EINVAL;
652                         }
653
654                         break;
655
656                 case ARG_VOLATILE:
657
658                         if (!optarg)
659                                 arg_volatile = VOLATILE_YES;
660                         else {
661                                 r = parse_boolean(optarg);
662                                 if (r < 0) {
663                                         if (streq(optarg, "state"))
664                                                 arg_volatile = VOLATILE_STATE;
665                                         else {
666                                                 log_error("Failed to parse --volatile= argument: %s", optarg);
667                                                 return r;
668                                         }
669                                 } else
670                                         arg_volatile = r ? VOLATILE_YES : VOLATILE_NO;
671                         }
672
673                         break;
674
675                 case 'p': {
676                         const char *split, *e;
677                         uint16_t container_port, host_port;
678                         int protocol;
679                         ExposePort *p;
680
681                         if ((e = startswith(optarg, "tcp:")))
682                                 protocol = IPPROTO_TCP;
683                         else if ((e = startswith(optarg, "udp:")))
684                                 protocol = IPPROTO_UDP;
685                         else {
686                                 e = optarg;
687                                 protocol = IPPROTO_TCP;
688                         }
689
690                         split = strchr(e, ':');
691                         if (split) {
692                                 char v[split - e + 1];
693
694                                 memcpy(v, e, split - e);
695                                 v[split - e] = 0;
696
697                                 r = safe_atou16(v, &host_port);
698                                 if (r < 0 || host_port <= 0) {
699                                         log_error("Failed to parse host port: %s", optarg);
700                                         return -EINVAL;
701                                 }
702
703                                 r = safe_atou16(split + 1, &container_port);
704                         } else {
705                                 r = safe_atou16(e, &container_port);
706                                 host_port = container_port;
707                         }
708
709                         if (r < 0 || container_port <= 0) {
710                                 log_error("Failed to parse host port: %s", optarg);
711                                 return -EINVAL;
712                         }
713
714                         LIST_FOREACH(ports, p, arg_expose_ports) {
715                                 if (p->protocol == protocol && p->host_port == host_port) {
716                                         log_error("Duplicate port specification: %s", optarg);
717                                         return -EINVAL;
718                                 }
719                         }
720
721                         p = new(ExposePort, 1);
722                         if (!p)
723                                 return log_oom();
724
725                         p->protocol = protocol;
726                         p->host_port = host_port;
727                         p->container_port = container_port;
728
729                         LIST_PREPEND(ports, arg_expose_ports, p);
730
731                         break;
732                 }
733
734                 case '?':
735                         return -EINVAL;
736
737                 default:
738                         assert_not_reached("Unhandled option");
739                 }
740
741         if (arg_share_system)
742                 arg_register = false;
743
744         if (arg_boot && arg_share_system) {
745                 log_error("--boot and --share-system may not be combined.");
746                 return -EINVAL;
747         }
748
749         if (arg_keep_unit && cg_pid_get_owner_uid(0, NULL) >= 0) {
750                 log_error("--keep-unit may not be used when invoked from a user session.");
751                 return -EINVAL;
752         }
753
754         if (arg_directory && arg_image) {
755                 log_error("--directory= and --image= may not be combined.");
756                 return -EINVAL;
757         }
758
759         if (arg_template && arg_image) {
760                 log_error("--template= and --image= may not be combined.");
761                 return -EINVAL;
762         }
763
764         if (arg_template && !(arg_directory || arg_machine)) {
765                 log_error("--template= needs --directory= or --machine=.");
766                 return -EINVAL;
767         }
768
769         if (arg_ephemeral && arg_template) {
770                 log_error("--ephemeral and --template= may not be combined.");
771                 return -EINVAL;
772         }
773
774         if (arg_ephemeral && arg_image) {
775                 log_error("--ephemeral and --image= may not be combined.");
776                 return -EINVAL;
777         }
778
779         if (arg_ephemeral && !IN_SET(arg_link_journal, LINK_NO, LINK_AUTO)) {
780                 log_error("--ephemeral and --link-journal= may not be combined.");
781                 return -EINVAL;
782         }
783
784         if (arg_volatile != VOLATILE_NO && arg_read_only) {
785                 log_error("Cannot combine --read-only with --volatile. Note that --volatile already implies a read-only base hierarchy.");
786                 return -EINVAL;
787         }
788
789         if (arg_expose_ports && !arg_private_network) {
790                 log_error("Cannot use --port= without private networking.");
791                 return -EINVAL;
792         }
793
794         arg_retain = (arg_retain | plus | (arg_private_network ? 1ULL << CAP_NET_ADMIN : 0)) & ~minus;
795
796         return 1;
797 }
798
799 static int mount_all(const char *dest) {
800
801         typedef struct MountPoint {
802                 const char *what;
803                 const char *where;
804                 const char *type;
805                 const char *options;
806                 unsigned long flags;
807                 bool fatal;
808         } MountPoint;
809
810         static const MountPoint mount_table[] = {
811                 { "proc",      "/proc",     "proc",  NULL,        MS_NOSUID|MS_NOEXEC|MS_NODEV,           true  },
812                 { "/proc/sys", "/proc/sys", NULL,    NULL,        MS_BIND,                                true  },   /* Bind mount first */
813                 { NULL,        "/proc/sys", NULL,    NULL,        MS_BIND|MS_RDONLY|MS_REMOUNT,           true  },   /* Then, make it r/o */
814                 { "sysfs",     "/sys",      "sysfs", NULL,        MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true  },
815                 { "tmpfs",     "/dev",      "tmpfs", "mode=755",  MS_NOSUID|MS_STRICTATIME,               true  },
816                 { "devpts",    "/dev/pts",  "devpts","newinstance,ptmxmode=0666,mode=620,gid=" STRINGIFY(TTY_GID), MS_NOSUID|MS_NOEXEC, true },
817                 { "tmpfs",     "/dev/shm",  "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME,      true  },
818                 { "tmpfs",     "/run",      "tmpfs", "mode=755",  MS_NOSUID|MS_NODEV|MS_STRICTATIME,      true  },
819                 { "tmpfs",     "/tmp",      "tmpfs", "mode=1777", MS_STRICTATIME,                         true  },
820 #ifdef HAVE_SELINUX
821                 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND,                              false },  /* Bind mount first */
822                 { NULL,              "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT,         false },  /* Then, make it r/o */
823 #endif
824         };
825
826         unsigned k;
827         int r = 0;
828
829         for (k = 0; k < ELEMENTSOF(mount_table); k++) {
830                 _cleanup_free_ char *where = NULL;
831 #ifdef HAVE_SELINUX
832                 _cleanup_free_ char *options = NULL;
833 #endif
834                 const char *o;
835                 int t;
836
837                 where = strjoin(dest, "/", mount_table[k].where, NULL);
838                 if (!where)
839                         return log_oom();
840
841                 t = path_is_mount_point(where, true);
842                 if (t < 0) {
843                         log_error_errno(t, "Failed to detect whether %s is a mount point: %m", where);
844
845                         if (r == 0)
846                                 r = t;
847
848                         continue;
849                 }
850
851                 /* Skip this entry if it is not a remount. */
852                 if (mount_table[k].what && t > 0)
853                         continue;
854
855                 t = mkdir_p(where, 0755);
856                 if (t < 0) {
857                         if (mount_table[k].fatal) {
858                                log_error_errno(t, "Failed to create directory %s: %m", where);
859
860                                 if (r == 0)
861                                         r = t;
862                         } else
863                                log_warning_errno(t, "Failed to create directory %s: %m", where);
864
865                         continue;
866                 }
867
868 #ifdef HAVE_SELINUX
869                 if (arg_selinux_apifs_context &&
870                     (streq_ptr(mount_table[k].what, "tmpfs") || streq_ptr(mount_table[k].what, "devpts"))) {
871                         options = strjoin(mount_table[k].options, ",context=\"", arg_selinux_apifs_context, "\"", NULL);
872                         if (!options)
873                                 return log_oom();
874
875                         o = options;
876                 } else
877 #endif
878                         o = mount_table[k].options;
879
880
881                 if (mount(mount_table[k].what,
882                           where,
883                           mount_table[k].type,
884                           mount_table[k].flags,
885                           o) < 0) {
886
887                         if (mount_table[k].fatal) {
888                                 log_error_errno(errno, "mount(%s) failed: %m", where);
889
890                                 if (r == 0)
891                                         r = -errno;
892                         } else
893                                 log_warning_errno(errno, "mount(%s) failed: %m", where);
894                 }
895         }
896
897         return r;
898 }
899
900 static int mount_binds(const char *dest, char **l, bool ro) {
901         char **x, **y;
902
903         STRV_FOREACH_PAIR(x, y, l) {
904                 _cleanup_free_ char *where = NULL;
905                 struct stat source_st, dest_st;
906                 int r;
907
908                 if (stat(*x, &source_st) < 0)
909                         return log_error_errno(errno, "Failed to stat %s: %m", *x);
910
911                 where = strappend(dest, *y);
912                 if (!where)
913                         return log_oom();
914
915                 r = stat(where, &dest_st);
916                 if (r == 0) {
917                         if (S_ISDIR(source_st.st_mode) && !S_ISDIR(dest_st.st_mode)) {
918                                 log_error("Cannot bind mount directory %s on file %s.", *x, where);
919                                 return -EINVAL;
920                         }
921                         if (!S_ISDIR(source_st.st_mode) && S_ISDIR(dest_st.st_mode)) {
922                                 log_error("Cannot bind mount file %s on directory %s.", *x, where);
923                                 return -EINVAL;
924                         }
925                 } else if (errno == ENOENT) {
926                         r = mkdir_parents_label(where, 0755);
927                         if (r < 0)
928                                 return log_error_errno(r, "Failed to bind mount %s: %m", *x);
929                 } else {
930                         log_error_errno(errno, "Failed to bind mount %s: %m", *x);
931                         return -errno;
932                 }
933
934                 /* Create the mount point. Any non-directory file can be
935                  * mounted on any non-directory file (regular, fifo, socket,
936                  * char, block).
937                  */
938                 if (S_ISDIR(source_st.st_mode)) {
939                         r = mkdir_label(where, 0755);
940                         if (r < 0 && errno != EEXIST)
941                                 return log_error_errno(r, "Failed to create mount point %s: %m", where);
942                 } else {
943                         r = touch(where);
944                         if (r < 0)
945                                 return log_error_errno(r, "Failed to create mount point %s: %m", where);
946                 }
947
948                 if (mount(*x, where, "bind", MS_BIND, NULL) < 0)
949                         return log_error_errno(errno, "mount(%s) failed: %m", where);
950
951                 if (ro) {
952                         r = bind_remount_recursive(where, true);
953                         if (r < 0)
954                                 return log_error_errno(r, "Read-Only bind mount failed: %m");
955                 }
956         }
957
958         return 0;
959 }
960
961 static int mount_cgroup_hierarchy(const char *dest, const char *controller, const char *hierarchy, bool read_only) {
962         char *to;
963         int r;
964
965         to = strappenda(dest, "/sys/fs/cgroup/", hierarchy);
966
967         r = path_is_mount_point(to, false);
968         if (r < 0)
969                 return log_error_errno(r, "Failed to determine if %s is mounted already: %m", to);
970         if (r > 0)
971                 return 0;
972
973         mkdir_p(to, 0755);
974
975         /* The superblock mount options of the mount point need to be
976          * identical to the hosts', and hence writable... */
977         if (mount("cgroup", to, "cgroup", MS_NOSUID|MS_NOEXEC|MS_NODEV, controller) < 0)
978                 return log_error_errno(errno, "Failed to mount to %s: %m", to);
979
980         /* ... hence let's only make the bind mount read-only, not the
981          * superblock. */
982         if (read_only) {
983                 if (mount(NULL, to, NULL, MS_BIND|MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_RDONLY, NULL) < 0)
984                         return log_error_errno(errno, "Failed to remount %s read-only: %m", to);
985         }
986         return 1;
987 }
988
989 static int mount_cgroup(const char *dest) {
990         _cleanup_set_free_free_ Set *controllers = NULL;
991         _cleanup_free_ char *own_cgroup_path = NULL;
992         const char *cgroup_root, *systemd_root, *systemd_own;
993         int r;
994
995         controllers = set_new(&string_hash_ops);
996         if (!controllers)
997                 return log_oom();
998
999         r = cg_kernel_controllers(controllers);
1000         if (r < 0)
1001                 return log_error_errno(r, "Failed to determine cgroup controllers: %m");
1002
1003         r = cg_pid_get_path(NULL, 0, &own_cgroup_path);
1004         if (r < 0)
1005                 return log_error_errno(r, "Failed to determine our own cgroup path: %m");
1006
1007         cgroup_root = strappenda(dest, "/sys/fs/cgroup");
1008         if (mount("tmpfs", cgroup_root, "tmpfs", MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME, "mode=755") < 0)
1009                 return log_error_errno(errno, "Failed to mount tmpfs to /sys/fs/cgroup: %m");
1010
1011         for (;;) {
1012                 _cleanup_free_ char *controller = NULL, *origin = NULL, *combined = NULL;
1013
1014                 controller = set_steal_first(controllers);
1015                 if (!controller)
1016                         break;
1017
1018                 origin = strappend("/sys/fs/cgroup/", controller);
1019                 if (!origin)
1020                         return log_oom();
1021
1022                 r = readlink_malloc(origin, &combined);
1023                 if (r == -EINVAL) {
1024                         /* Not a symbolic link, but directly a single cgroup hierarchy */
1025
1026                         r = mount_cgroup_hierarchy(dest, controller, controller, true);
1027                         if (r < 0)
1028                                 return r;
1029
1030                 } else if (r < 0)
1031                         return log_error_errno(r, "Failed to read link %s: %m", origin);
1032                 else {
1033                         _cleanup_free_ char *target = NULL;
1034
1035                         target = strjoin(dest, "/sys/fs/cgroup/", controller, NULL);
1036                         if (!target)
1037                                 return log_oom();
1038
1039                         /* A symbolic link, a combination of controllers in one hierarchy */
1040
1041                         if (!filename_is_valid(combined)) {
1042                                 log_warning("Ignoring invalid combined hierarchy %s.", combined);
1043                                 continue;
1044                         }
1045
1046                         r = mount_cgroup_hierarchy(dest, combined, combined, true);
1047                         if (r < 0)
1048                                 return r;
1049
1050                         if (symlink(combined, target) < 0)
1051                                 return log_error_errno(errno, "Failed to create symlink for combined hierarchy: %m");
1052                 }
1053         }
1054
1055         r = mount_cgroup_hierarchy(dest, "name=systemd,xattr", "systemd", false);
1056         if (r < 0)
1057                 return r;
1058
1059         /* Make our own cgroup a (writable) bind mount */
1060         systemd_own = strappenda(dest, "/sys/fs/cgroup/systemd", own_cgroup_path);
1061         if (mount(systemd_own, systemd_own,  NULL, MS_BIND, NULL) < 0)
1062                 return log_error_errno(errno, "Failed to turn %s into a bind mount: %m", own_cgroup_path);
1063
1064         /* And then remount the systemd cgroup root read-only */
1065         systemd_root = strappenda(dest, "/sys/fs/cgroup/systemd");
1066         if (mount(NULL, systemd_root, NULL, MS_BIND|MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_RDONLY, NULL) < 0)
1067                 return log_error_errno(errno, "Failed to mount cgroup root read-only: %m");
1068
1069         if (mount(NULL, cgroup_root, NULL, MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME|MS_RDONLY, "mode=755") < 0)
1070                 return log_error_errno(errno, "Failed to remount %s read-only: %m", cgroup_root);
1071
1072         return 0;
1073 }
1074
1075 static int mount_tmpfs(const char *dest) {
1076         char **i, **o;
1077
1078         STRV_FOREACH_PAIR(i, o, arg_tmpfs) {
1079                 _cleanup_free_ char *where = NULL;
1080                 int r;
1081
1082                 where = strappend(dest, *i);
1083                 if (!where)
1084                         return log_oom();
1085
1086                 r = mkdir_label(where, 0755);
1087                 if (r < 0 && r != -EEXIST)
1088                         return log_error_errno(r, "Creating mount point for tmpfs %s failed: %m", where);
1089
1090                 if (mount("tmpfs", where, "tmpfs", MS_NODEV|MS_STRICTATIME, *o) < 0)
1091                         return log_error_errno(errno, "tmpfs mount to %s failed: %m", where);
1092         }
1093
1094         return 0;
1095 }
1096
1097 static int setup_timezone(const char *dest) {
1098         _cleanup_free_ char *where = NULL, *p = NULL, *q = NULL, *check = NULL, *what = NULL;
1099         char *z, *y;
1100         int r;
1101
1102         assert(dest);
1103
1104         /* Fix the timezone, if possible */
1105         r = readlink_malloc("/etc/localtime", &p);
1106         if (r < 0) {
1107                 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
1108                 return 0;
1109         }
1110
1111         z = path_startswith(p, "../usr/share/zoneinfo/");
1112         if (!z)
1113                 z = path_startswith(p, "/usr/share/zoneinfo/");
1114         if (!z) {
1115                 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
1116                 return 0;
1117         }
1118
1119         where = strappend(dest, "/etc/localtime");
1120         if (!where)
1121                 return log_oom();
1122
1123         r = readlink_malloc(where, &q);
1124         if (r >= 0) {
1125                 y = path_startswith(q, "../usr/share/zoneinfo/");
1126                 if (!y)
1127                         y = path_startswith(q, "/usr/share/zoneinfo/");
1128
1129                 /* Already pointing to the right place? Then do nothing .. */
1130                 if (y && streq(y, z))
1131                         return 0;
1132         }
1133
1134         check = strjoin(dest, "/usr/share/zoneinfo/", z, NULL);
1135         if (!check)
1136                 return log_oom();
1137
1138         if (access(check, F_OK) < 0) {
1139                 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
1140                 return 0;
1141         }
1142
1143         what = strappend("../usr/share/zoneinfo/", z);
1144         if (!what)
1145                 return log_oom();
1146
1147         r = mkdir_parents(where, 0755);
1148         if (r < 0) {
1149                 log_error_errno(r, "Failed to create directory for timezone info %s in container: %m", where);
1150
1151                 return 0;
1152         }
1153
1154         r = unlink(where);
1155         if (r < 0 && errno != ENOENT) {
1156                 log_error_errno(errno, "Failed to remove existing timezone info %s in container: %m", where);
1157
1158                 return 0;
1159         }
1160
1161         if (symlink(what, where) < 0) {
1162                 log_error_errno(errno, "Failed to correct timezone of container: %m");
1163                 return 0;
1164         }
1165
1166         return 0;
1167 }
1168
1169 static int setup_resolv_conf(const char *dest) {
1170         _cleanup_free_ char *where = NULL;
1171         int r;
1172
1173         assert(dest);
1174
1175         if (arg_private_network)
1176                 return 0;
1177
1178         /* Fix resolv.conf, if possible */
1179         where = strappend(dest, "/etc/resolv.conf");
1180         if (!where)
1181                 return log_oom();
1182
1183         /* We don't really care for the results of this really. If it
1184          * fails, it fails, but meh... */
1185         r = mkdir_parents(where, 0755);
1186         if (r < 0) {
1187                 log_warning_errno(r, "Failed to create parent directory for resolv.conf %s: %m", where);
1188
1189                 return 0;
1190         }
1191
1192         r = copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW, 0644, 0);
1193         if (r < 0) {
1194                 log_warning_errno(r, "Failed to copy /etc/resolv.conf to %s: %m", where);
1195
1196                 return 0;
1197         }
1198
1199         return 0;
1200 }
1201
1202 static int setup_volatile_state(const char *directory) {
1203         const char *p;
1204         int r;
1205
1206         assert(directory);
1207
1208         if (arg_volatile != VOLATILE_STATE)
1209                 return 0;
1210
1211         /* --volatile=state means we simply overmount /var
1212            with a tmpfs, and the rest read-only. */
1213
1214         r = bind_remount_recursive(directory, true);
1215         if (r < 0)
1216                 return log_error_errno(r, "Failed to remount %s read-only: %m", directory);
1217
1218         p = strappenda(directory, "/var");
1219         r = mkdir(p, 0755);
1220         if (r < 0 && errno != EEXIST)
1221                 return log_error_errno(errno, "Failed to create %s: %m", directory);
1222
1223         if (mount("tmpfs", p, "tmpfs", MS_STRICTATIME, "mode=755") < 0)
1224                 return log_error_errno(errno, "Failed to mount tmpfs to /var: %m");
1225
1226         return 0;
1227 }
1228
1229 static int setup_volatile(const char *directory) {
1230         bool tmpfs_mounted = false, bind_mounted = false;
1231         char template[] = "/tmp/nspawn-volatile-XXXXXX";
1232         const char *f, *t;
1233         int r;
1234
1235         assert(directory);
1236
1237         if (arg_volatile != VOLATILE_YES)
1238                 return 0;
1239
1240         /* --volatile=yes means we mount a tmpfs to the root dir, and
1241            the original /usr to use inside it, and that read-only. */
1242
1243         if (!mkdtemp(template))
1244                 return log_error_errno(errno, "Failed to create temporary directory: %m");
1245
1246         if (mount("tmpfs", template, "tmpfs", MS_STRICTATIME, "mode=755") < 0) {
1247                 log_error_errno(errno, "Failed to mount tmpfs for root directory: %m");
1248                 r = -errno;
1249                 goto fail;
1250         }
1251
1252         tmpfs_mounted = true;
1253
1254         f = strappenda(directory, "/usr");
1255         t = strappenda(template, "/usr");
1256
1257         r = mkdir(t, 0755);
1258         if (r < 0 && errno != EEXIST) {
1259                 log_error_errno(errno, "Failed to create %s: %m", t);
1260                 r = -errno;
1261                 goto fail;
1262         }
1263
1264         if (mount(f, t, "bind", MS_BIND|MS_REC, NULL) < 0) {
1265                 log_error_errno(errno, "Failed to create /usr bind mount: %m");
1266                 r = -errno;
1267                 goto fail;
1268         }
1269
1270         bind_mounted = true;
1271
1272         r = bind_remount_recursive(t, true);
1273         if (r < 0) {
1274                 log_error_errno(r, "Failed to remount %s read-only: %m", t);
1275                 goto fail;
1276         }
1277
1278         if (mount(template, directory, NULL, MS_MOVE, NULL) < 0) {
1279                 log_error_errno(errno, "Failed to move root mount: %m");
1280                 r = -errno;
1281                 goto fail;
1282         }
1283
1284         rmdir(template);
1285
1286         return 0;
1287
1288 fail:
1289         if (bind_mounted)
1290                 umount(t);
1291         if (tmpfs_mounted)
1292                 umount(template);
1293         rmdir(template);
1294         return r;
1295 }
1296
1297 static char* id128_format_as_uuid(sd_id128_t id, char s[37]) {
1298
1299         snprintf(s, 37,
1300                  "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
1301                  SD_ID128_FORMAT_VAL(id));
1302
1303         return s;
1304 }
1305
1306 static int setup_boot_id(const char *dest) {
1307         _cleanup_free_ char *from = NULL, *to = NULL;
1308         sd_id128_t rnd = {};
1309         char as_uuid[37];
1310         int r;
1311
1312         assert(dest);
1313
1314         if (arg_share_system)
1315                 return 0;
1316
1317         /* Generate a new randomized boot ID, so that each boot-up of
1318          * the container gets a new one */
1319
1320         from = strappend(dest, "/dev/proc-sys-kernel-random-boot-id");
1321         to = strappend(dest, "/proc/sys/kernel/random/boot_id");
1322         if (!from || !to)
1323                 return log_oom();
1324
1325         r = sd_id128_randomize(&rnd);
1326         if (r < 0)
1327                 return log_error_errno(r, "Failed to generate random boot id: %m");
1328
1329         id128_format_as_uuid(rnd, as_uuid);
1330
1331         r = write_string_file(from, as_uuid);
1332         if (r < 0)
1333                 return log_error_errno(r, "Failed to write boot id: %m");
1334
1335         if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
1336                 log_error_errno(errno, "Failed to bind mount boot id: %m");
1337                 r = -errno;
1338         } else if (mount(from, to, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL))
1339                 log_warning_errno(errno, "Failed to make boot id read-only: %m");
1340
1341         unlink(from);
1342         return r;
1343 }
1344
1345 static int copy_devnodes(const char *dest) {
1346
1347         static const char devnodes[] =
1348                 "null\0"
1349                 "zero\0"
1350                 "full\0"
1351                 "random\0"
1352                 "urandom\0"
1353                 "tty\0"
1354                 "net/tun\0";
1355
1356         const char *d;
1357         int r = 0;
1358         _cleanup_umask_ mode_t u;
1359
1360         assert(dest);
1361
1362         u = umask(0000);
1363
1364         NULSTR_FOREACH(d, devnodes) {
1365                 _cleanup_free_ char *from = NULL, *to = NULL;
1366                 struct stat st;
1367
1368                 from = strappend("/dev/", d);
1369                 to = strjoin(dest, "/dev/", d, NULL);
1370                 if (!from || !to)
1371                         return log_oom();
1372
1373                 if (stat(from, &st) < 0) {
1374
1375                         if (errno != ENOENT)
1376                                 return log_error_errno(errno, "Failed to stat %s: %m", from);
1377
1378                 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
1379
1380                         log_error("%s is not a char or block device, cannot copy", from);
1381                         return -EIO;
1382
1383                 } else {
1384                         r = mkdir_parents(to, 0775);
1385                         if (r < 0) {
1386                                 log_error_errno(r, "Failed to create parent directory of %s: %m", to);
1387                                 return -r;
1388                         }
1389
1390                         if (mknod(to, st.st_mode, st.st_rdev) < 0)
1391                                 return log_error_errno(errno, "mknod(%s) failed: %m", to);
1392                 }
1393         }
1394
1395         return r;
1396 }
1397
1398 static int setup_ptmx(const char *dest) {
1399         _cleanup_free_ char *p = NULL;
1400
1401         p = strappend(dest, "/dev/ptmx");
1402         if (!p)
1403                 return log_oom();
1404
1405         if (symlink("pts/ptmx", p) < 0)
1406                 return log_error_errno(errno, "Failed to create /dev/ptmx symlink: %m");
1407
1408         return 0;
1409 }
1410
1411 static int setup_dev_console(const char *dest, const char *console) {
1412         _cleanup_umask_ mode_t u;
1413         const char *to;
1414         struct stat st;
1415         int r;
1416
1417         assert(dest);
1418         assert(console);
1419
1420         u = umask(0000);
1421
1422         if (stat("/dev/null", &st) < 0)
1423                 return log_error_errno(errno, "Failed to stat /dev/null: %m");
1424
1425         r = chmod_and_chown(console, 0600, 0, 0);
1426         if (r < 0)
1427                 return log_error_errno(r, "Failed to correct access mode for TTY: %m");
1428
1429         /* We need to bind mount the right tty to /dev/console since
1430          * ptys can only exist on pts file systems. To have something
1431          * to bind mount things on we create a device node first, and
1432          * use /dev/null for that since we the cgroups device policy
1433          * allows us to create that freely, while we cannot create
1434          * /dev/console. (Note that the major minor doesn't actually
1435          * matter here, since we mount it over anyway). */
1436
1437         to = strappenda(dest, "/dev/console");
1438         if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0)
1439                 return log_error_errno(errno, "mknod() for /dev/console failed: %m");
1440
1441         if (mount(console, to, "bind", MS_BIND, NULL) < 0)
1442                 return log_error_errno(errno, "Bind mount for /dev/console failed: %m");
1443
1444         return 0;
1445 }
1446
1447 static int setup_kmsg(const char *dest, int kmsg_socket) {
1448         _cleanup_free_ char *from = NULL, *to = NULL;
1449         _cleanup_umask_ mode_t u;
1450         int r, fd, k;
1451         union {
1452                 struct cmsghdr cmsghdr;
1453                 uint8_t buf[CMSG_SPACE(sizeof(int))];
1454         } control = {};
1455         struct msghdr mh = {
1456                 .msg_control = &control,
1457                 .msg_controllen = sizeof(control),
1458         };
1459         struct cmsghdr *cmsg;
1460
1461         assert(dest);
1462         assert(kmsg_socket >= 0);
1463
1464         u = umask(0000);
1465
1466         /* We create the kmsg FIFO as /dev/kmsg, but immediately
1467          * delete it after bind mounting it to /proc/kmsg. While FIFOs
1468          * on the reading side behave very similar to /proc/kmsg,
1469          * their writing side behaves differently from /dev/kmsg in
1470          * that writing blocks when nothing is reading. In order to
1471          * avoid any problems with containers deadlocking due to this
1472          * we simply make /dev/kmsg unavailable to the container. */
1473         if (asprintf(&from, "%s/dev/kmsg", dest) < 0 ||
1474             asprintf(&to, "%s/proc/kmsg", dest) < 0)
1475                 return log_oom();
1476
1477         if (mkfifo(from, 0600) < 0)
1478                 return log_error_errno(errno, "mkfifo() for /dev/kmsg failed: %m");
1479
1480         r = chmod_and_chown(from, 0600, 0, 0);
1481         if (r < 0)
1482                 return log_error_errno(r, "Failed to correct access mode for /dev/kmsg: %m");
1483
1484         if (mount(from, to, "bind", MS_BIND, NULL) < 0)
1485                 return log_error_errno(errno, "Bind mount for /proc/kmsg failed: %m");
1486
1487         fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
1488         if (fd < 0)
1489                 return log_error_errno(errno, "Failed to open fifo: %m");
1490
1491         cmsg = CMSG_FIRSTHDR(&mh);
1492         cmsg->cmsg_level = SOL_SOCKET;
1493         cmsg->cmsg_type = SCM_RIGHTS;
1494         cmsg->cmsg_len = CMSG_LEN(sizeof(int));
1495         memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
1496
1497         mh.msg_controllen = cmsg->cmsg_len;
1498
1499         /* Store away the fd in the socket, so that it stays open as
1500          * long as we run the child */
1501         k = sendmsg(kmsg_socket, &mh, MSG_NOSIGNAL);
1502         safe_close(fd);
1503
1504         if (k < 0)
1505                 return log_error_errno(errno, "Failed to send FIFO fd: %m");
1506
1507         /* And now make the FIFO unavailable as /dev/kmsg... */
1508         unlink(from);
1509         return 0;
1510 }
1511
1512 static int send_rtnl(int send_fd) {
1513         union {
1514                 struct cmsghdr cmsghdr;
1515                 uint8_t buf[CMSG_SPACE(sizeof(int))];
1516         } control = {};
1517         struct msghdr mh = {
1518                 .msg_control = &control,
1519                 .msg_controllen = sizeof(control),
1520         };
1521         struct cmsghdr *cmsg;
1522         _cleanup_close_ int fd = -1;
1523         ssize_t k;
1524
1525         assert(send_fd >= 0);
1526
1527         if (!arg_expose_ports)
1528                 return 0;
1529
1530         fd = socket(PF_NETLINK, SOCK_RAW|SOCK_CLOEXEC|SOCK_NONBLOCK, NETLINK_ROUTE);
1531         if (fd < 0)
1532                 return log_error_errno(errno, "failed to allocate container netlink: %m");
1533
1534         cmsg = CMSG_FIRSTHDR(&mh);
1535         cmsg->cmsg_level = SOL_SOCKET;
1536         cmsg->cmsg_type = SCM_RIGHTS;
1537         cmsg->cmsg_len = CMSG_LEN(sizeof(int));
1538         memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
1539
1540         mh.msg_controllen = cmsg->cmsg_len;
1541
1542         /* Store away the fd in the socket, so that it stays open as
1543          * long as we run the child */
1544         k = sendmsg(send_fd, &mh, MSG_NOSIGNAL);
1545         if (k < 0)
1546                 return log_error_errno(errno, "Failed to send netlink fd: %m");
1547
1548         return 0;
1549 }
1550
1551 static int flush_ports(union in_addr_union *exposed) {
1552         ExposePort *p;
1553         int r, af = AF_INET;
1554
1555         assert(exposed);
1556
1557         if (!arg_expose_ports)
1558                 return 0;
1559
1560         if (in_addr_is_null(af, exposed))
1561                 return 0;
1562
1563         log_debug("Lost IP address.");
1564
1565         LIST_FOREACH(ports, p, arg_expose_ports) {
1566                 r = fw_add_local_dnat(false,
1567                                       af,
1568                                       p->protocol,
1569                                       NULL,
1570                                       NULL, 0,
1571                                       NULL, 0,
1572                                       p->host_port,
1573                                       exposed,
1574                                       p->container_port,
1575                                       NULL);
1576                 if (r < 0)
1577                         log_warning_errno(r, "Failed to modify firewall: %m");
1578         }
1579
1580         *exposed = IN_ADDR_NULL;
1581         return 0;
1582 }
1583
1584 static int expose_ports(sd_rtnl *rtnl, union in_addr_union *exposed) {
1585         _cleanup_free_ struct local_address *addresses = NULL;
1586         _cleanup_free_ char *pretty = NULL;
1587         union in_addr_union new_exposed;
1588         ExposePort *p;
1589         bool add;
1590         int af = AF_INET, r;
1591
1592         assert(exposed);
1593
1594         /* Invoked each time an address is added or removed inside the
1595          * container */
1596
1597         if (!arg_expose_ports)
1598                 return 0;
1599
1600         r = local_addresses(rtnl, 0, af, &addresses);
1601         if (r < 0)
1602                 return log_error_errno(r, "Failed to enumerate local addresses: %m");
1603
1604         add = r > 0 &&
1605                 addresses[0].family == af &&
1606                 addresses[0].scope < RT_SCOPE_LINK;
1607
1608         if (!add)
1609                 return flush_ports(exposed);
1610
1611         new_exposed = addresses[0].address;
1612         if (in_addr_equal(af, exposed, &new_exposed))
1613                 return 0;
1614
1615         in_addr_to_string(af, &new_exposed, &pretty);
1616         log_debug("New container IP is %s.", strna(pretty));
1617
1618         LIST_FOREACH(ports, p, arg_expose_ports) {
1619
1620                 r = fw_add_local_dnat(true,
1621                                       af,
1622                                       p->protocol,
1623                                       NULL,
1624                                       NULL, 0,
1625                                       NULL, 0,
1626                                       p->host_port,
1627                                       &new_exposed,
1628                                       p->container_port,
1629                                       in_addr_is_null(af, exposed) ? NULL : exposed);
1630                 if (r < 0)
1631                         log_warning_errno(r, "Failed to modify firewall: %m");
1632         }
1633
1634         *exposed = new_exposed;
1635         return 0;
1636 }
1637
1638 static int on_address_change(sd_rtnl *rtnl, sd_rtnl_message *m, void *userdata) {
1639         union in_addr_union *exposed = userdata;
1640
1641         assert(rtnl);
1642         assert(m);
1643         assert(exposed);
1644
1645         expose_ports(rtnl, exposed);
1646         return 0;
1647 }
1648
1649 static int watch_rtnl(sd_event *event, int recv_fd, union in_addr_union *exposed, sd_rtnl **ret) {
1650         union {
1651                 struct cmsghdr cmsghdr;
1652                 uint8_t buf[CMSG_SPACE(sizeof(int))];
1653         } control = {};
1654         struct msghdr mh = {
1655                 .msg_control = &control,
1656                 .msg_controllen = sizeof(control),
1657         };
1658         struct cmsghdr *cmsg;
1659         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1660         int fd, r;
1661         ssize_t k;
1662
1663         assert(event);
1664         assert(recv_fd >= 0);
1665         assert(ret);
1666
1667         if (!arg_expose_ports)
1668                 return 0;
1669
1670         k = recvmsg(recv_fd, &mh, MSG_NOSIGNAL);
1671         if (k < 0)
1672                 return log_error_errno(errno, "Failed to recv netlink fd: %m");
1673
1674         cmsg = CMSG_FIRSTHDR(&mh);
1675         assert(cmsg->cmsg_level == SOL_SOCKET);
1676         assert(cmsg->cmsg_type == SCM_RIGHTS);
1677         assert(cmsg->cmsg_len == CMSG_LEN(sizeof(int)));
1678         memcpy(&fd, CMSG_DATA(cmsg), sizeof(int));
1679
1680         r = sd_rtnl_open_fd(&rtnl, fd, 1, RTNLGRP_IPV4_IFADDR);
1681         if (r < 0) {
1682                 safe_close(fd);
1683                 return log_error_errno(r, "Failed to create rtnl object: %m");
1684         }
1685
1686         r = sd_rtnl_add_match(rtnl, RTM_NEWADDR, on_address_change, exposed);
1687         if (r < 0)
1688                 return log_error_errno(r, "Failed to subscribe to RTM_NEWADDR messages: %m");
1689
1690         r = sd_rtnl_add_match(rtnl, RTM_DELADDR, on_address_change, exposed);
1691         if (r < 0)
1692                 return log_error_errno(r, "Failed to subscribe to RTM_DELADDR messages: %m");
1693
1694         r = sd_rtnl_attach_event(rtnl, event, 0);
1695         if (r < 0)
1696                 return log_error_errno(r, "Failed to add to even loop: %m");
1697
1698         *ret = rtnl;
1699         rtnl = NULL;
1700
1701         return 0;
1702 }
1703
1704 static int setup_hostname(void) {
1705
1706         if (arg_share_system)
1707                 return 0;
1708
1709         if (sethostname_idempotent(arg_machine) < 0)
1710                 return -errno;
1711
1712         return 0;
1713 }
1714
1715 static int setup_journal(const char *directory) {
1716         sd_id128_t machine_id, this_id;
1717         _cleanup_free_ char *p = NULL, *b = NULL, *q = NULL, *d = NULL;
1718         char *id;
1719         int r;
1720
1721         /* Don't link journals in ephemeral mode */
1722         if (arg_ephemeral)
1723                 return 0;
1724
1725         p = strappend(directory, "/etc/machine-id");
1726         if (!p)
1727                 return log_oom();
1728
1729         r = read_one_line_file(p, &b);
1730         if (r == -ENOENT && arg_link_journal == LINK_AUTO)
1731                 return 0;
1732         else if (r < 0)
1733                 return log_error_errno(r, "Failed to read machine ID from %s: %m", p);
1734
1735         id = strstrip(b);
1736         if (isempty(id) && arg_link_journal == LINK_AUTO)
1737                 return 0;
1738
1739         /* Verify validity */
1740         r = sd_id128_from_string(id, &machine_id);
1741         if (r < 0)
1742                 return log_error_errno(r, "Failed to parse machine ID from %s: %m", p);
1743
1744         r = sd_id128_get_machine(&this_id);
1745         if (r < 0)
1746                 return log_error_errno(r, "Failed to retrieve machine ID: %m");
1747
1748         if (sd_id128_equal(machine_id, this_id)) {
1749                 log_full(arg_link_journal == LINK_AUTO ? LOG_WARNING : LOG_ERR,
1750                          "Host and machine ids are equal (%s): refusing to link journals", id);
1751                 if (arg_link_journal == LINK_AUTO)
1752                         return 0;
1753                 return -EEXIST;
1754         }
1755
1756         if (arg_link_journal == LINK_NO)
1757                 return 0;
1758
1759         free(p);
1760         p = strappend("/var/log/journal/", id);
1761         q = strjoin(directory, "/var/log/journal/", id, NULL);
1762         if (!p || !q)
1763                 return log_oom();
1764
1765         if (path_is_mount_point(p, false) > 0) {
1766                 if (arg_link_journal != LINK_AUTO) {
1767                         log_error("%s: already a mount point, refusing to use for journal", p);
1768                         return -EEXIST;
1769                 }
1770
1771                 return 0;
1772         }
1773
1774         if (path_is_mount_point(q, false) > 0) {
1775                 if (arg_link_journal != LINK_AUTO) {
1776                         log_error("%s: already a mount point, refusing to use for journal", q);
1777                         return -EEXIST;
1778                 }
1779
1780                 return 0;
1781         }
1782
1783         r = readlink_and_make_absolute(p, &d);
1784         if (r >= 0) {
1785                 if ((arg_link_journal == LINK_GUEST ||
1786                      arg_link_journal == LINK_AUTO) &&
1787                     path_equal(d, q)) {
1788
1789                         r = mkdir_p(q, 0755);
1790                         if (r < 0)
1791                                 log_warning_errno(errno, "Failed to create directory %s: %m", q);
1792                         return 0;
1793                 }
1794
1795                 if (unlink(p) < 0)
1796                         return log_error_errno(errno, "Failed to remove symlink %s: %m", p);
1797         } else if (r == -EINVAL) {
1798
1799                 if (arg_link_journal == LINK_GUEST &&
1800                     rmdir(p) < 0) {
1801
1802                         if (errno == ENOTDIR) {
1803                                 log_error("%s already exists and is neither a symlink nor a directory", p);
1804                                 return r;
1805                         } else {
1806                                 log_error_errno(errno, "Failed to remove %s: %m", p);
1807                                 return -errno;
1808                         }
1809                 }
1810         } else if (r != -ENOENT) {
1811                 log_error_errno(errno, "readlink(%s) failed: %m", p);
1812                 return r;
1813         }
1814
1815         if (arg_link_journal == LINK_GUEST) {
1816
1817                 if (symlink(q, p) < 0) {
1818                         if (arg_link_journal_try) {
1819                                 log_debug_errno(errno, "Failed to symlink %s to %s, skipping journal setup: %m", q, p);
1820                                 return 0;
1821                         } else {
1822                                 log_error_errno(errno, "Failed to symlink %s to %s: %m", q, p);
1823                                 return -errno;
1824                         }
1825                 }
1826
1827                 r = mkdir_p(q, 0755);
1828                 if (r < 0)
1829                         log_warning_errno(errno, "Failed to create directory %s: %m", q);
1830                 return 0;
1831         }
1832
1833         if (arg_link_journal == LINK_HOST) {
1834                 /* don't create parents here -- if the host doesn't have
1835                  * permanent journal set up, don't force it here */
1836                 r = mkdir(p, 0755);
1837                 if (r < 0) {
1838                         if (arg_link_journal_try) {
1839                                 log_debug_errno(errno, "Failed to create %s, skipping journal setup: %m", p);
1840                                 return 0;
1841                         } else {
1842                                 log_error_errno(errno, "Failed to create %s: %m", p);
1843                                 return r;
1844                         }
1845                 }
1846
1847         } else if (access(p, F_OK) < 0)
1848                 return 0;
1849
1850         if (dir_is_empty(q) == 0)
1851                 log_warning("%s is not empty, proceeding anyway.", q);
1852
1853         r = mkdir_p(q, 0755);
1854         if (r < 0) {
1855                 log_error_errno(errno, "Failed to create %s: %m", q);
1856                 return r;
1857         }
1858
1859         if (mount(p, q, "bind", MS_BIND, NULL) < 0)
1860                 return log_error_errno(errno, "Failed to bind mount journal from host into guest: %m");
1861
1862         return 0;
1863 }
1864
1865 static int drop_capabilities(void) {
1866         return capability_bounding_set_drop(~arg_retain, false);
1867 }
1868
1869 static int register_machine(pid_t pid, int local_ifindex) {
1870         _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1871         _cleanup_bus_close_unref_ sd_bus *bus = NULL;
1872         int r;
1873
1874         if (!arg_register)
1875                 return 0;
1876
1877         r = sd_bus_default_system(&bus);
1878         if (r < 0)
1879                 return log_error_errno(r, "Failed to open system bus: %m");
1880
1881         if (arg_keep_unit) {
1882                 r = sd_bus_call_method(
1883                                 bus,
1884                                 "org.freedesktop.machine1",
1885                                 "/org/freedesktop/machine1",
1886                                 "org.freedesktop.machine1.Manager",
1887                                 "RegisterMachineWithNetwork",
1888                                 &error,
1889                                 NULL,
1890                                 "sayssusai",
1891                                 arg_machine,
1892                                 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1893                                 "nspawn",
1894                                 "container",
1895                                 (uint32_t) pid,
1896                                 strempty(arg_directory),
1897                                 local_ifindex > 0 ? 1 : 0, local_ifindex);
1898         } else {
1899                 _cleanup_bus_message_unref_ sd_bus_message *m = NULL;
1900
1901                 r = sd_bus_message_new_method_call(
1902                                 bus,
1903                                 &m,
1904                                 "org.freedesktop.machine1",
1905                                 "/org/freedesktop/machine1",
1906                                 "org.freedesktop.machine1.Manager",
1907                                 "CreateMachineWithNetwork");
1908                 if (r < 0)
1909                         return log_error_errno(r, "Failed to create message: %m");
1910
1911                 r = sd_bus_message_append(
1912                                 m,
1913                                 "sayssusai",
1914                                 arg_machine,
1915                                 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1916                                 "nspawn",
1917                                 "container",
1918                                 (uint32_t) pid,
1919                                 strempty(arg_directory),
1920                                 local_ifindex > 0 ? 1 : 0, local_ifindex);
1921                 if (r < 0)
1922                         return log_error_errno(r, "Failed to append message arguments: %m");
1923
1924                 r = sd_bus_message_open_container(m, 'a', "(sv)");
1925                 if (r < 0)
1926                         return log_error_errno(r, "Failed to open container: %m");
1927
1928                 if (!isempty(arg_slice)) {
1929                         r = sd_bus_message_append(m, "(sv)", "Slice", "s", arg_slice);
1930                         if (r < 0)
1931                                 return log_error_errno(r, "Failed to append slice: %m");
1932                 }
1933
1934                 r = sd_bus_message_append(m, "(sv)", "DevicePolicy", "s", "strict");
1935                 if (r < 0)
1936                         return log_error_errno(r, "Failed to add device policy: %m");
1937
1938                 r = sd_bus_message_append(m, "(sv)", "DeviceAllow", "a(ss)", 9,
1939                                           /* Allow the container to
1940                                            * access and create the API
1941                                            * device nodes, so that
1942                                            * PrivateDevices= in the
1943                                            * container can work
1944                                            * fine */
1945                                           "/dev/null", "rwm",
1946                                           "/dev/zero", "rwm",
1947                                           "/dev/full", "rwm",
1948                                           "/dev/random", "rwm",
1949                                           "/dev/urandom", "rwm",
1950                                           "/dev/tty", "rwm",
1951                                           "/dev/net/tun", "rwm",
1952                                           /* Allow the container
1953                                            * access to ptys. However,
1954                                            * do not permit the
1955                                            * container to ever create
1956                                            * these device nodes. */
1957                                           "/dev/pts/ptmx", "rw",
1958                                           "char-pts", "rw");
1959                 if (r < 0)
1960                         return log_error_errno(r, "Failed to add device whitelist: %m");
1961
1962                 r = sd_bus_message_close_container(m);
1963                 if (r < 0)
1964                         return log_error_errno(r, "Failed to close container: %m");
1965
1966                 r = sd_bus_call(bus, m, 0, &error, NULL);
1967         }
1968
1969         if (r < 0) {
1970                 log_error("Failed to register machine: %s", bus_error_message(&error, r));
1971                 return r;
1972         }
1973
1974         return 0;
1975 }
1976
1977 static int terminate_machine(pid_t pid) {
1978         _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1979         _cleanup_bus_message_unref_ sd_bus_message *reply = NULL;
1980         _cleanup_bus_close_unref_ sd_bus *bus = NULL;
1981         const char *path;
1982         int r;
1983
1984         if (!arg_register)
1985                 return 0;
1986
1987         r = sd_bus_default_system(&bus);
1988         if (r < 0)
1989                 return log_error_errno(r, "Failed to open system bus: %m");
1990
1991         r = sd_bus_call_method(
1992                         bus,
1993                         "org.freedesktop.machine1",
1994                         "/org/freedesktop/machine1",
1995                         "org.freedesktop.machine1.Manager",
1996                         "GetMachineByPID",
1997                         &error,
1998                         &reply,
1999                         "u",
2000                         (uint32_t) pid);
2001         if (r < 0) {
2002                 /* Note that the machine might already have been
2003                  * cleaned up automatically, hence don't consider it a
2004                  * failure if we cannot get the machine object. */
2005                 log_debug("Failed to get machine: %s", bus_error_message(&error, r));
2006                 return 0;
2007         }
2008
2009         r = sd_bus_message_read(reply, "o", &path);
2010         if (r < 0)
2011                 return bus_log_parse_error(r);
2012
2013         r = sd_bus_call_method(
2014                         bus,
2015                         "org.freedesktop.machine1",
2016                         path,
2017                         "org.freedesktop.machine1.Machine",
2018                         "Terminate",
2019                         &error,
2020                         NULL,
2021                         NULL);
2022         if (r < 0) {
2023                 log_debug("Failed to terminate machine: %s", bus_error_message(&error, r));
2024                 return 0;
2025         }
2026
2027         return 0;
2028 }
2029
2030 static int reset_audit_loginuid(void) {
2031         _cleanup_free_ char *p = NULL;
2032         int r;
2033
2034         if (arg_share_system)
2035                 return 0;
2036
2037         r = read_one_line_file("/proc/self/loginuid", &p);
2038         if (r == -ENOENT)
2039                 return 0;
2040         if (r < 0)
2041                 return log_error_errno(r, "Failed to read /proc/self/loginuid: %m");
2042
2043         /* Already reset? */
2044         if (streq(p, "4294967295"))
2045                 return 0;
2046
2047         r = write_string_file("/proc/self/loginuid", "4294967295");
2048         if (r < 0) {
2049                 log_error("Failed to reset audit login UID. This probably means that your kernel is too\n"
2050                           "old and you have audit enabled. Note that the auditing subsystem is known to\n"
2051                           "be incompatible with containers on old kernels. Please make sure to upgrade\n"
2052                           "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
2053                           "using systemd-nspawn. Sleeping for 5s... (%s)\n", strerror(-r));
2054
2055                 sleep(5);
2056         }
2057
2058         return 0;
2059 }
2060
2061 #define HOST_HASH_KEY SD_ID128_MAKE(1a,37,6f,c7,46,ec,45,0b,ad,a3,d5,31,06,60,5d,b1)
2062 #define CONTAINER_HASH_KEY SD_ID128_MAKE(c3,c4,f9,19,b5,57,b2,1c,e6,cf,14,27,03,9c,ee,a2)
2063 #define MACVLAN_HASH_KEY SD_ID128_MAKE(00,13,6d,bc,66,83,44,81,bb,0c,f9,51,1f,24,a6,6f)
2064
2065 static int generate_mac(struct ether_addr *mac, sd_id128_t hash_key, uint64_t idx) {
2066         uint8_t result[8];
2067         size_t l, sz;
2068         uint8_t *v, *i;
2069         int r;
2070
2071         l = strlen(arg_machine);
2072         sz = sizeof(sd_id128_t) + l;
2073         if (idx > 0)
2074                 sz += sizeof(idx);
2075
2076         v = alloca(sz);
2077
2078         /* fetch some persistent data unique to the host */
2079         r = sd_id128_get_machine((sd_id128_t*) v);
2080         if (r < 0)
2081                 return r;
2082
2083         /* combine with some data unique (on this host) to this
2084          * container instance */
2085         i = mempcpy(v + sizeof(sd_id128_t), arg_machine, l);
2086         if (idx > 0) {
2087                 idx = htole64(idx);
2088                 memcpy(i, &idx, sizeof(idx));
2089         }
2090
2091         /* Let's hash the host machine ID plus the container name. We
2092          * use a fixed, but originally randomly created hash key here. */
2093         siphash24(result, v, sz, hash_key.bytes);
2094
2095         assert_cc(ETH_ALEN <= sizeof(result));
2096         memcpy(mac->ether_addr_octet, result, ETH_ALEN);
2097
2098         /* see eth_random_addr in the kernel */
2099         mac->ether_addr_octet[0] &= 0xfe;        /* clear multicast bit */
2100         mac->ether_addr_octet[0] |= 0x02;        /* set local assignment bit (IEEE802) */
2101
2102         return 0;
2103 }
2104
2105 static int setup_veth(pid_t pid, char iface_name[IFNAMSIZ], int *ifi) {
2106         _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
2107         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
2108         struct ether_addr mac_host, mac_container;
2109         int r, i;
2110
2111         if (!arg_private_network)
2112                 return 0;
2113
2114         if (!arg_network_veth)
2115                 return 0;
2116
2117         /* Use two different interface name prefixes depending whether
2118          * we are in bridge mode or not. */
2119         snprintf(iface_name, IFNAMSIZ - 1, "%s-%s",
2120                  arg_network_bridge ? "vb" : "ve", arg_machine);
2121
2122         r = generate_mac(&mac_container, CONTAINER_HASH_KEY, 0);
2123         if (r < 0)
2124                 return log_error_errno(r, "Failed to generate predictable MAC address for container side: %m");
2125
2126         r = generate_mac(&mac_host, HOST_HASH_KEY, 0);
2127         if (r < 0)
2128                 return log_error_errno(r, "Failed to generate predictable MAC address for host side: %m");
2129
2130         r = sd_rtnl_open(&rtnl, 0);
2131         if (r < 0)
2132                 return log_error_errno(r, "Failed to connect to netlink: %m");
2133
2134         r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
2135         if (r < 0)
2136                 return log_error_errno(r, "Failed to allocate netlink message: %m");
2137
2138         r = sd_rtnl_message_append_string(m, IFLA_IFNAME, iface_name);
2139         if (r < 0)
2140                 return log_error_errno(r, "Failed to add netlink interface name: %m");
2141
2142         r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac_host);
2143         if (r < 0)
2144                 return log_error_errno(r, "Failed to add netlink MAC address: %m");
2145
2146         r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
2147         if (r < 0)
2148                 return log_error_errno(r, "Failed to open netlink container: %m");
2149
2150         r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "veth");
2151         if (r < 0)
2152                 return log_error_errno(r, "Failed to open netlink container: %m");
2153
2154         r = sd_rtnl_message_open_container(m, VETH_INFO_PEER);
2155         if (r < 0)
2156                 return log_error_errno(r, "Failed to open netlink container: %m");
2157
2158         r = sd_rtnl_message_append_string(m, IFLA_IFNAME, "host0");
2159         if (r < 0)
2160                 return log_error_errno(r, "Failed to add netlink interface name: %m");
2161
2162         r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac_container);
2163         if (r < 0)
2164                 return log_error_errno(r, "Failed to add netlink MAC address: %m");
2165
2166         r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
2167         if (r < 0)
2168                 return log_error_errno(r, "Failed to add netlink namespace field: %m");
2169
2170         r = sd_rtnl_message_close_container(m);
2171         if (r < 0)
2172                 return log_error_errno(r, "Failed to close netlink container: %m");
2173
2174         r = sd_rtnl_message_close_container(m);
2175         if (r < 0)
2176                 return log_error_errno(r, "Failed to close netlink container: %m");
2177
2178         r = sd_rtnl_message_close_container(m);
2179         if (r < 0)
2180                 return log_error_errno(r, "Failed to close netlink container: %m");
2181
2182         r = sd_rtnl_call(rtnl, m, 0, NULL);
2183         if (r < 0)
2184                 return log_error_errno(r, "Failed to add new veth interfaces: %m");
2185
2186         i = (int) if_nametoindex(iface_name);
2187         if (i <= 0)
2188                 return log_error_errno(errno, "Failed to resolve interface %s: %m", iface_name);
2189
2190         *ifi = i;
2191
2192         return 0;
2193 }
2194
2195 static int setup_bridge(const char veth_name[], int *ifi) {
2196         _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
2197         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
2198         int r, bridge;
2199
2200         if (!arg_private_network)
2201                 return 0;
2202
2203         if (!arg_network_veth)
2204                 return 0;
2205
2206         if (!arg_network_bridge)
2207                 return 0;
2208
2209         bridge = (int) if_nametoindex(arg_network_bridge);
2210         if (bridge <= 0)
2211                 return log_error_errno(errno, "Failed to resolve interface %s: %m", arg_network_bridge);
2212
2213         *ifi = bridge;
2214
2215         r = sd_rtnl_open(&rtnl, 0);
2216         if (r < 0)
2217                 return log_error_errno(r, "Failed to connect to netlink: %m");
2218
2219         r = sd_rtnl_message_new_link(rtnl, &m, RTM_SETLINK, 0);
2220         if (r < 0)
2221                 return log_error_errno(r, "Failed to allocate netlink message: %m");
2222
2223         r = sd_rtnl_message_link_set_flags(m, IFF_UP, IFF_UP);
2224         if (r < 0)
2225                 return log_error_errno(r, "Failed to set IFF_UP flag: %m");
2226
2227         r = sd_rtnl_message_append_string(m, IFLA_IFNAME, veth_name);
2228         if (r < 0)
2229                 return log_error_errno(r, "Failed to add netlink interface name field: %m");
2230
2231         r = sd_rtnl_message_append_u32(m, IFLA_MASTER, bridge);
2232         if (r < 0)
2233                 return log_error_errno(r, "Failed to add netlink master field: %m");
2234
2235         r = sd_rtnl_call(rtnl, m, 0, NULL);
2236         if (r < 0)
2237                 return log_error_errno(r, "Failed to add veth interface to bridge: %m");
2238
2239         return 0;
2240 }
2241
2242 static int parse_interface(struct udev *udev, const char *name) {
2243         _cleanup_udev_device_unref_ struct udev_device *d = NULL;
2244         char ifi_str[2 + DECIMAL_STR_MAX(int)];
2245         int ifi;
2246
2247         ifi = (int) if_nametoindex(name);
2248         if (ifi <= 0)
2249                 return log_error_errno(errno, "Failed to resolve interface %s: %m", name);
2250
2251         sprintf(ifi_str, "n%i", ifi);
2252         d = udev_device_new_from_device_id(udev, ifi_str);
2253         if (!d)
2254                 return log_error_errno(errno, "Failed to get udev device for interface %s: %m", name);
2255
2256         if (udev_device_get_is_initialized(d) <= 0) {
2257                 log_error("Network interface %s is not initialized yet.", name);
2258                 return -EBUSY;
2259         }
2260
2261         return ifi;
2262 }
2263
2264 static int move_network_interfaces(pid_t pid) {
2265         _cleanup_udev_unref_ struct udev *udev = NULL;
2266         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
2267         char **i;
2268         int r;
2269
2270         if (!arg_private_network)
2271                 return 0;
2272
2273         if (strv_isempty(arg_network_interfaces))
2274                 return 0;
2275
2276         r = sd_rtnl_open(&rtnl, 0);
2277         if (r < 0)
2278                 return log_error_errno(r, "Failed to connect to netlink: %m");
2279
2280         udev = udev_new();
2281         if (!udev) {
2282                 log_error("Failed to connect to udev.");
2283                 return -ENOMEM;
2284         }
2285
2286         STRV_FOREACH(i, arg_network_interfaces) {
2287                 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
2288                 int ifi;
2289
2290                 ifi = parse_interface(udev, *i);
2291                 if (ifi < 0)
2292                         return ifi;
2293
2294                 r = sd_rtnl_message_new_link(rtnl, &m, RTM_SETLINK, ifi);
2295                 if (r < 0)
2296                         return log_error_errno(r, "Failed to allocate netlink message: %m");
2297
2298                 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
2299                 if (r < 0)
2300                         return log_error_errno(r, "Failed to append namespace PID to netlink message: %m");
2301
2302                 r = sd_rtnl_call(rtnl, m, 0, NULL);
2303                 if (r < 0)
2304                         return log_error_errno(r, "Failed to move interface %s to namespace: %m", *i);
2305         }
2306
2307         return 0;
2308 }
2309
2310 static int setup_macvlan(pid_t pid) {
2311         _cleanup_udev_unref_ struct udev *udev = NULL;
2312         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
2313         unsigned idx = 0;
2314         char **i;
2315         int r;
2316
2317         if (!arg_private_network)
2318                 return 0;
2319
2320         if (strv_isempty(arg_network_macvlan))
2321                 return 0;
2322
2323         r = sd_rtnl_open(&rtnl, 0);
2324         if (r < 0)
2325                 return log_error_errno(r, "Failed to connect to netlink: %m");
2326
2327         udev = udev_new();
2328         if (!udev) {
2329                 log_error("Failed to connect to udev.");
2330                 return -ENOMEM;
2331         }
2332
2333         STRV_FOREACH(i, arg_network_macvlan) {
2334                 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
2335                 _cleanup_free_ char *n = NULL;
2336                 struct ether_addr mac;
2337                 int ifi;
2338
2339                 ifi = parse_interface(udev, *i);
2340                 if (ifi < 0)
2341                         return ifi;
2342
2343                 r = generate_mac(&mac, MACVLAN_HASH_KEY, idx++);
2344                 if (r < 0)
2345                         return log_error_errno(r, "Failed to create MACVLAN MAC address: %m");
2346
2347                 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
2348                 if (r < 0)
2349                         return log_error_errno(r, "Failed to allocate netlink message: %m");
2350
2351                 r = sd_rtnl_message_append_u32(m, IFLA_LINK, ifi);
2352                 if (r < 0)
2353                         return log_error_errno(r, "Failed to add netlink interface index: %m");
2354
2355                 n = strappend("mv-", *i);
2356                 if (!n)
2357                         return log_oom();
2358
2359                 strshorten(n, IFNAMSIZ-1);
2360
2361                 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, n);
2362                 if (r < 0)
2363                         return log_error_errno(r, "Failed to add netlink interface name: %m");
2364
2365                 r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac);
2366                 if (r < 0)
2367                         return log_error_errno(r, "Failed to add netlink MAC address: %m");
2368
2369                 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
2370                 if (r < 0)
2371                         return log_error_errno(r, "Failed to add netlink namespace field: %m");
2372
2373                 r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
2374                 if (r < 0)
2375                         return log_error_errno(r, "Failed to open netlink container: %m");
2376
2377                 r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "macvlan");
2378                 if (r < 0)
2379                         return log_error_errno(r, "Failed to open netlink container: %m");
2380
2381                 r = sd_rtnl_message_append_u32(m, IFLA_MACVLAN_MODE, MACVLAN_MODE_BRIDGE);
2382                 if (r < 0)
2383                         return log_error_errno(r, "Failed to append macvlan mode: %m");
2384
2385                 r = sd_rtnl_message_close_container(m);
2386                 if (r < 0)
2387                         return log_error_errno(r, "Failed to close netlink container: %m");
2388
2389                 r = sd_rtnl_message_close_container(m);
2390                 if (r < 0)
2391                         return log_error_errno(r, "Failed to close netlink container: %m");
2392
2393                 r = sd_rtnl_call(rtnl, m, 0, NULL);
2394                 if (r < 0)
2395                         return log_error_errno(r, "Failed to add new macvlan interfaces: %m");
2396         }
2397
2398         return 0;
2399 }
2400
2401 static int setup_ipvlan(pid_t pid) {
2402         _cleanup_udev_unref_ struct udev *udev = NULL;
2403         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
2404         char **i;
2405         int r;
2406
2407         if (!arg_private_network)
2408                 return 0;
2409
2410         if (strv_isempty(arg_network_ipvlan))
2411                 return 0;
2412
2413         r = sd_rtnl_open(&rtnl, 0);
2414         if (r < 0)
2415                 return log_error_errno(r, "Failed to connect to netlink: %m");
2416
2417         udev = udev_new();
2418         if (!udev) {
2419                 log_error("Failed to connect to udev.");
2420                 return -ENOMEM;
2421         }
2422
2423         STRV_FOREACH(i, arg_network_ipvlan) {
2424                 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
2425                 _cleanup_free_ char *n = NULL;
2426                 int ifi;
2427
2428                 ifi = parse_interface(udev, *i);
2429                 if (ifi < 0)
2430                         return ifi;
2431
2432                 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
2433                 if (r < 0)
2434                         return log_error_errno(r, "Failed to allocate netlink message: %m");
2435
2436                 r = sd_rtnl_message_append_u32(m, IFLA_LINK, ifi);
2437                 if (r < 0)
2438                         return log_error_errno(r, "Failed to add netlink interface index: %m");
2439
2440                 n = strappend("iv-", *i);
2441                 if (!n)
2442                         return log_oom();
2443
2444                 strshorten(n, IFNAMSIZ-1);
2445
2446                 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, n);
2447                 if (r < 0)
2448                         return log_error_errno(r, "Failed to add netlink interface name: %m");
2449
2450                 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
2451                 if (r < 0)
2452                         return log_error_errno(r, "Failed to add netlink namespace field: %m");
2453
2454                 r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
2455                 if (r < 0)
2456                         return log_error_errno(r, "Failed to open netlink container: %m");
2457
2458                 r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "ipvlan");
2459                 if (r < 0)
2460                         return log_error_errno(r, "Failed to open netlink container: %m");
2461
2462                 r = sd_rtnl_message_append_u16(m, IFLA_IPVLAN_MODE, IPVLAN_MODE_L2);
2463                 if (r < 0)
2464                         return log_error_errno(r, "Failed to add ipvlan mode: %m");
2465
2466                 r = sd_rtnl_message_close_container(m);
2467                 if (r < 0)
2468                         return log_error_errno(r, "Failed to close netlink container: %m");
2469
2470                 r = sd_rtnl_message_close_container(m);
2471                 if (r < 0)
2472                         return log_error_errno(r, "Failed to close netlink container: %m");
2473
2474                 r = sd_rtnl_call(rtnl, m, 0, NULL);
2475                 if (r < 0)
2476                         return log_error_errno(r, "Failed to add new ipvlan interfaces: %m");
2477         }
2478
2479         return 0;
2480 }
2481
/* Installs the seccomp filter for the container payload: everything is
 * allowed by default, the system calls in the blacklist below fail with
 * EPERM, and creation of NETLINK_AUDIT sockets fails with EAFNOSUPPORT.
 *
 * Returns 0 on success (and always when built without seccomp support), a
 * negative errno-style value on failure. */
static int setup_seccomp(void) {

#ifdef HAVE_SECCOMP
        static const int blacklist[] = {
                SCMP_SYS(kexec_load),
                SCMP_SYS(open_by_handle_at),
                SCMP_SYS(init_module),
                SCMP_SYS(finit_module),
                SCMP_SYS(delete_module),
                SCMP_SYS(iopl),
                SCMP_SYS(ioperm),
                SCMP_SYS(swapon),
                SCMP_SYS(swapoff),
        };

        scmp_filter_ctx ctx;
        unsigned k;
        int r;

        ctx = seccomp_init(SCMP_ACT_ALLOW);
        if (!ctx)
                return log_oom();

        r = seccomp_add_secondary_archs(ctx);
        if (r < 0) {
                log_error_errno(r, "Failed to add secondary archs to seccomp filter: %m");
                goto finish;
        }

        for (k = 0; k < ELEMENTSOF(blacklist); k++) {
                r = seccomp_rule_add(ctx, SCMP_ACT_ERRNO(EPERM), blacklist[k], 0);

                /* -EFAULT means the syscall is unknown; that is not
                 * treated as an error, we simply move on to the next one */
                if (r < 0 && r != -EFAULT) {
                        log_error_errno(r, "Failed to block syscall: %m");
                        goto finish;
                }
        }

        /*
           Audit is broken in containers, much of the userspace audit
           hookup will fail if running inside a container. We don't
           care and just turn off creation of audit sockets.

           This will make socket(AF_NETLINK, *, NETLINK_AUDIT) fail
           with EAFNOSUPPORT which audit userspace uses as indication
           that audit is disabled in the kernel.
         */

        r = seccomp_rule_add(
                        ctx,
                        SCMP_ACT_ERRNO(EAFNOSUPPORT),
                        SCMP_SYS(socket),
                        2,
                        SCMP_A0(SCMP_CMP_EQ, AF_NETLINK),
                        SCMP_A2(SCMP_CMP_EQ, NETLINK_AUDIT));
        if (r < 0) {
                log_error_errno(r, "Failed to add audit seccomp rule: %m");
                goto finish;
        }

        /* Ask libseccomp not to set NO_NEW_PRIVS when loading the filter */
        r = seccomp_attr_set(ctx, SCMP_FLTATR_CTL_NNP, 0);
        if (r < 0) {
                log_error_errno(r, "Failed to unset NO_NEW_PRIVS: %m");
                goto finish;
        }

        r = seccomp_load(ctx);
        if (r < 0)
                log_error_errno(r, "Failed to install seccomp audit filter: %m");

finish:
        /* The filter context is released on both the success and the error path */
        seccomp_release(ctx);
        return r;
#else
        return 0;
#endif

}
2561
2562 static int setup_propagate(const char *root) {
2563         const char *p, *q;
2564
2565         (void) mkdir_p("/run/systemd/nspawn/", 0755);
2566         (void) mkdir_p("/run/systemd/nspawn/propagate", 0600);
2567         p = strappenda("/run/systemd/nspawn/propagate/", arg_machine);
2568         (void) mkdir_p(p, 0600);
2569
2570         q = strappenda(root, "/run/systemd/nspawn/incoming");
2571         mkdir_parents(q, 0755);
2572         mkdir_p(q, 0600);
2573
2574         if (mount(p, q, NULL, MS_BIND, NULL) < 0)
2575                 return log_error_errno(errno, "Failed to install propagation bind mount.");
2576
2577         if (mount(NULL, q, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY, NULL) < 0)
2578                 return log_error_errno(errno, "Failed to make propagation mount read-only");
2579
2580         return 0;
2581 }
2582
2583 static int setup_image(char **device_path, int *loop_nr) {
2584         struct loop_info64 info = {
2585                 .lo_flags = LO_FLAGS_AUTOCLEAR|LO_FLAGS_PARTSCAN
2586         };
2587         _cleanup_close_ int fd = -1, control = -1, loop = -1;
2588         _cleanup_free_ char* loopdev = NULL;
2589         struct stat st;
2590         int r, nr;
2591
2592         assert(device_path);
2593         assert(loop_nr);
2594         assert(arg_image);
2595
2596         fd = open(arg_image, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
2597         if (fd < 0)
2598                 return log_error_errno(errno, "Failed to open %s: %m", arg_image);
2599
2600         if (fstat(fd, &st) < 0)
2601                 return log_error_errno(errno, "Failed to stat %s: %m", arg_image);
2602
2603         if (S_ISBLK(st.st_mode)) {
2604                 char *p;
2605
2606                 p = strdup(arg_image);
2607                 if (!p)
2608                         return log_oom();
2609
2610                 *device_path = p;
2611
2612                 *loop_nr = -1;
2613
2614                 r = fd;
2615                 fd = -1;
2616
2617                 return r;
2618         }
2619
2620         if (!S_ISREG(st.st_mode)) {
2621                 log_error_errno(errno, "%s is not a regular file or block device: %m", arg_image);
2622                 return -EINVAL;
2623         }
2624
2625         control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
2626         if (control < 0)
2627                 return log_error_errno(errno, "Failed to open /dev/loop-control: %m");
2628
2629         nr = ioctl(control, LOOP_CTL_GET_FREE);
2630         if (nr < 0)
2631                 return log_error_errno(errno, "Failed to allocate loop device: %m");
2632
2633         if (asprintf(&loopdev, "/dev/loop%i", nr) < 0)
2634                 return log_oom();
2635
2636         loop = open(loopdev, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
2637         if (loop < 0)
2638                 return log_error_errno(errno, "Failed to open loop device %s: %m", loopdev);
2639
2640         if (ioctl(loop, LOOP_SET_FD, fd) < 0)
2641                 return log_error_errno(errno, "Failed to set loopback file descriptor on %s: %m", loopdev);
2642
2643         if (arg_read_only)
2644                 info.lo_flags |= LO_FLAGS_READ_ONLY;
2645
2646         if (ioctl(loop, LOOP_SET_STATUS64, &info) < 0)
2647                 return log_error_errno(errno, "Failed to set loopback settings on %s: %m", loopdev);
2648
2649         *device_path = loopdev;
2650         loopdev = NULL;
2651
2652         *loop_nr = nr;
2653
2654         r = loop;
2655         loop = -1;
2656
2657         return r;
2658 }
2659
/* Hint appended to the error messages about disk images whose partition
 * table cannot be used. */
#define PARTITION_TABLE_BLURB \
        "Note that the disk image needs to either contain only a single MBR partition of\n" \
        "type 0x83 that is marked bootable, or a single GPT partition of type " \
        "0FC63DAF-8483-4772-8E79-3D69D8477DE4 or follow\n" \
        "    http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/\n" \
        "to be bootable with systemd-nspawn."
2666
/* Inspects the partition table of the block device behind "fd" and picks the
 * partitions to use as root, /home and /srv.
 *
 * GPT: partitions flagged GPT_FLAG_NO_AUTO are ignored. For the home, srv,
 * native root and secondary root types the partition with the lowest
 * partition number wins. A generic Linux partition is only used as root if
 * no dedicated root partition exists and it is the only generic one.
 *
 * MBR: only partitions of type 0x83 whose flags are exactly 0x80 (bootable)
 * are considered, and there has to be exactly one of them.
 *
 * fd:              open file descriptor of the block device.
 * root_device:     receives a newly allocated device node path of the root
 *                  partition; root_device_rw tells whether it may be mounted
 *                  writable.
 * home_device,
 * srv_device:      set like root_device, but only if such a partition was
 *                  found; otherwise left untouched together with their _rw
 *                  companions.
 * secondary:       set to true if the secondary root partition type was
 *                  chosen, false otherwise.
 *
 * Returns 0 on success, a negative errno-style value on failure (-ENOTSUP
 * if built without blkid support). */
static int dissect_image(
                int fd,
                char **root_device, bool *root_device_rw,
                char **home_device, bool *home_device_rw,
                char **srv_device, bool *srv_device_rw,
                bool *secondary) {

#ifdef HAVE_BLKID
        int home_nr = -1, srv_nr = -1;
#ifdef GPT_ROOT_NATIVE
        int root_nr = -1;
#endif
#ifdef GPT_ROOT_SECONDARY
        int secondary_root_nr = -1;
#endif
        _cleanup_free_ char *home = NULL, *root = NULL, *secondary_root = NULL, *srv = NULL, *generic = NULL;
        _cleanup_udev_enumerate_unref_ struct udev_enumerate *e = NULL;
        _cleanup_udev_device_unref_ struct udev_device *d = NULL;
        _cleanup_blkid_free_probe_ blkid_probe b = NULL;
        _cleanup_udev_unref_ struct udev *udev = NULL;
        struct udev_list_entry *first, *item;
        bool home_rw = true, root_rw = true, secondary_root_rw = true, srv_rw = true, generic_rw = true;
        bool is_gpt, is_mbr, multiple_generic = false;
        const char *pttype = NULL;
        blkid_partlist pl;
        struct stat st;
        unsigned i;
        int r;

        assert(fd >= 0);
        assert(root_device);
        assert(home_device);
        assert(srv_device);
        assert(secondary);
        assert(arg_image);

        b = blkid_new_probe();
        if (!b)
                return log_oom();

        errno = 0;
        r = blkid_probe_set_device(b, fd, 0, 0);
        if (r != 0) {
                if (errno == 0)
                        return log_oom();

                log_error_errno(errno, "Failed to set device on blkid probe: %m");
                return -errno;
        }

        blkid_probe_enable_partitions(b, 1);
        blkid_probe_set_partitions_flags(b, BLKID_PARTS_ENTRY_DETAILS);

        errno = 0;
        r = blkid_do_safeprobe(b);
        if (r == -2 || r == 1) {
                /* Ambivalent result, or nothing detected at all */
                log_error("Failed to identify any partition table on\n"
                          "    %s\n"
                          PARTITION_TABLE_BLURB, arg_image);
                return -EINVAL;
        } else if (r != 0) {
                if (errno == 0)
                        errno = EIO;
                log_error_errno(errno, "Failed to probe: %m");
                return -errno;
        }

        blkid_probe_lookup_value(b, "PTTYPE", &pttype, NULL);

        is_gpt = streq_ptr(pttype, "gpt");
        is_mbr = streq_ptr(pttype, "dos");

        if (!is_gpt && !is_mbr) {
                log_error("No GPT or MBR partition table discovered on\n"
                          "    %s\n"
                          PARTITION_TABLE_BLURB, arg_image);
                return -EINVAL;
        }

        errno = 0;
        pl = blkid_probe_get_partitions(b);
        if (!pl) {
                if (errno == 0)
                        return log_oom();

                log_error("Failed to list partitions of %s", arg_image);
                return -errno;
        }

        udev = udev_new();
        if (!udev)
                return log_oom();

        if (fstat(fd, &st) < 0)
                return log_error_errno(errno, "Failed to stat block device: %m");

        d = udev_device_new_from_devnum(udev, 'b', st.st_rdev);
        if (!d)
                return log_oom();

        /* Wait until the kernel exposes as many partition devices as blkid
         * found in the partition table. Give up after 10 rounds. */
        for (i = 0;; i++) {
                int n, m;

                if (i >= 10) {
                        log_error("Kernel partitions never appeared.");
                        return -ENXIO;
                }

                e = udev_enumerate_new(udev);
                if (!e)
                        return log_oom();

                r = udev_enumerate_add_match_parent(e, d);
                if (r < 0)
                        return log_oom();

                r = udev_enumerate_scan_devices(e);
                if (r < 0)
                        return log_error_errno(r, "Failed to scan for partition devices of %s: %m", arg_image);

                /* Count the partitions enumerated by the kernel */
                n = 0;
                first = udev_enumerate_get_list_entry(e);
                udev_list_entry_foreach(item, first)
                        n++;

                /* Count the partitions enumerated by blkid. The
                 * enumeration is expected to contain one entry more
                 * than the partition list, hence the "+ 1". */
                m = blkid_partlist_numof_partitions(pl);
                if (n == m + 1)
                        break;
                if (n > m + 1) {
                        log_error("blkid and kernel partition list do not match.");
                        return -EIO;
                }
                if (n < m + 1) {
                        unsigned j;

                        /* The kernel has probed fewer partitions than
                         * blkid? Maybe the kernel prober is still
                         * running or it got EBUSY because udev
                         * already opened the device. Let's reprobe
                         * the device, which is a synchronous call
                         * that waits until probing is complete. */

                        for (j = 0; j < 20; j++) {

                                r = ioctl(fd, BLKRRPART, 0);
                                if (r < 0)
                                        r = -errno;
                                if (r >= 0 || r != -EBUSY)
                                        break;

                                /* If something else has the device
                                 * open, such as an udev rule, the
                                 * ioctl will return EBUSY. Since
                                 * there's no way to wait until it
                                 * isn't busy anymore, let's just wait
                                 * a bit, and try again.
                                 *
                                 * This is really something they
                                 * should fix in the kernel! */

                                usleep(50 * USEC_PER_MSEC);
                        }

                        if (r < 0)
                                return log_error_errno(r, "Failed to reread partition table: %m");
                }

                e = udev_enumerate_unref(e);
        }

        first = udev_enumerate_get_list_entry(e);
        udev_list_entry_foreach(item, first) {
                _cleanup_udev_device_unref_ struct udev_device *q = NULL;
                const char *node;
                unsigned long long flags;
                blkid_partition pp;
                dev_t qn;
                int nr;

                errno = 0;
                q = udev_device_new_from_syspath(udev, udev_list_entry_get_name(item));
                if (!q) {
                        if (!errno)
                                errno = ENOMEM;

                        log_error_errno(errno, "Failed to get partition device of %s: %m", arg_image);
                        return -errno;
                }

                qn = udev_device_get_devnum(q);
                if (major(qn) == 0)
                        continue;

                /* Skip the whole-disk device itself */
                if (st.st_rdev == qn)
                        continue;

                node = udev_device_get_devnode(q);
                if (!node)
                        continue;

                pp = blkid_partlist_devno_to_partition(pl, qn);
                if (!pp)
                        continue;

                flags = blkid_partition_get_flags(pp);

                nr = blkid_partition_get_partno(pp);
                if (nr < 0)
                        continue;

                if (is_gpt) {
                        sd_id128_t type_id;
                        const char *stype;

                        if (flags & GPT_FLAG_NO_AUTO)
                                continue;

                        stype = blkid_partition_get_type_string(pp);
                        if (!stype)
                                continue;

                        if (sd_id128_from_string(stype, &type_id) < 0)
                                continue;

                        if (sd_id128_equal(type_id, GPT_HOME)) {

                                /* Keep the candidate with the lowest partition number */
                                if (home && nr >= home_nr)
                                        continue;

                                home_nr = nr;
                                home_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&home, node);
                                if (r < 0)
                                        return log_oom();

                        } else if (sd_id128_equal(type_id, GPT_SRV)) {

                                if (srv && nr >= srv_nr)
                                        continue;

                                srv_nr = nr;
                                srv_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&srv, node);
                                if (r < 0)
                                        return log_oom();
                        }
#ifdef GPT_ROOT_NATIVE
                        else if (sd_id128_equal(type_id, GPT_ROOT_NATIVE)) {

                                if (root && nr >= root_nr)
                                        continue;

                                root_nr = nr;
                                root_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&root, node);
                                if (r < 0)
                                        return log_oom();
                        }
#endif
#ifdef GPT_ROOT_SECONDARY
                        else if (sd_id128_equal(type_id, GPT_ROOT_SECONDARY)) {

                                if (secondary_root && nr >= secondary_root_nr)
                                        continue;

                                secondary_root_nr = nr;
                                secondary_root_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&secondary_root, node);
                                if (r < 0)
                                        return log_oom();
                        }
#endif
                        else if (sd_id128_equal(type_id, GPT_LINUX_GENERIC)) {

                                if (generic)
                                        multiple_generic = true;
                                else {
                                        generic_rw = !(flags & GPT_FLAG_READ_ONLY);

                                        r = free_and_strdup(&generic, node);
                                        if (r < 0)
                                                return log_oom();
                                }
                        }

                } else if (is_mbr) {
                        int type;

                        if (flags != 0x80) /* Bootable flag */
                                continue;

                        type = blkid_partition_get_type(pp);
                        if (type != 0x83) /* Linux partition */
                                continue;

                        if (generic)
                                multiple_generic = true;
                        else {
                                generic_rw = true;

                                /* Track this as the generic candidate (not
                                 * as "root"), so that a second bootable
                                 * Linux partition is detected above and
                                 * rejected below. */
                                r = free_and_strdup(&generic, node);
                                if (r < 0)
                                        return log_oom();
                        }
                }
        }

        if (root) {
                *root_device = root;
                root = NULL;

                *root_device_rw = root_rw;
                *secondary = false;
        } else if (secondary_root) {
                *root_device = secondary_root;
                secondary_root = NULL;

                *root_device_rw = secondary_root_rw;
                *secondary = true;
        } else if (generic) {

                /* There were no partitions with precise meanings
                 * around, but we found generic partitions. In this
                 * case, if there's only one, we can go ahead and boot
                 * it, otherwise we bail out, because we really cannot
                 * make any sense of it. */

                if (multiple_generic) {
                        log_error("Identified multiple bootable Linux partitions on\n"
                                  "    %s\n"
                                  PARTITION_TABLE_BLURB, arg_image);
                        return -EINVAL;
                }

                *root_device = generic;
                generic = NULL;

                *root_device_rw = generic_rw;
                *secondary = false;
        } else {
                log_error("Failed to identify root partition in disk image\n"
                          "    %s\n"
                          PARTITION_TABLE_BLURB, arg_image);
                return -EINVAL;
        }

        if (home) {
                *home_device = home;
                home = NULL;

                *home_device_rw = home_rw;
        }

        if (srv) {
                *srv_device = srv;
                srv = NULL;

                *srv_device_rw = srv_rw;
        }

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -ENOTSUP;
#endif
}
3039
/* Detects the file system on block device "what" with blkid and mounts it.
 *
 * what:      device node to mount.
 * where:     base directory of the mount point.
 * directory: optional path appended to "where" to form the mount point; may
 *            be NULL, in which case "where" itself is the mount point.
 * rw:        whether to mount writable; overridden to read-only when
 *            arg_read_only is set.
 *
 * The mount always uses MS_NODEV. LUKS containers are refused. Returns 0 on
 * success, a negative errno-style value on failure (-ENOTSUP if built
 * without blkid support). */
static int mount_device(const char *what, const char *where, const char *directory, bool rw) {
#ifdef HAVE_BLKID
        _cleanup_blkid_free_probe_ blkid_probe b = NULL;
        const char *fstype = NULL, *p;
        int r;

        assert(what);
        assert(where);

        if (arg_read_only)
                rw = false;

        if (directory)
                p = strappenda(where, directory);
        else
                p = where;

        errno = 0;
        b = blkid_new_probe_from_filename(what);
        if (!b) {
                if (errno == 0)
                        return log_oom();
                log_error_errno(errno, "Failed to allocate prober for %s: %m", what);
                return -errno;
        }

        blkid_probe_enable_superblocks(b, 1);
        blkid_probe_set_superblocks_flags(b, BLKID_SUBLKS_TYPE);

        errno = 0;
        r = blkid_do_safeprobe(b);
        if (r == -2 || r == 1) {
                /* -2: ambivalent result, 1: nothing detected. Both mean we
                 * cannot tell what is on the device. A plain -1 is a real
                 * probing error and is reported as such below. */
                log_error("Cannot determine file system type of %s", what);
                return -EINVAL;
        } else if (r != 0) {
                if (errno == 0)
                        errno = EIO;
                log_error_errno(errno, "Failed to probe %s: %m", what);
                return -errno;
        }

        errno = 0;
        if (blkid_probe_lookup_value(b, "TYPE", &fstype, NULL) < 0) {
                if (errno == 0)
                        errno = EINVAL;
                log_error("Failed to determine file system type of %s", what);
                return -errno;
        }

        if (streq(fstype, "crypto_LUKS")) {
                log_error("nspawn currently does not support LUKS disk images.");
                return -ENOTSUP;
        }

        if (mount(what, p, fstype, MS_NODEV|(rw ? 0 : MS_RDONLY), NULL) < 0)
                return log_error_errno(errno, "Failed to mount %s: %m", what);

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -ENOTSUP;
#endif
}
3103
/* Mounts the partitions found in a disk image into the container tree.
 *
 * where:        directory the partitions are mounted relative to (the
 *               container root); must not be NULL.
 * root_device:  device to mount on "where" itself, or NULL to skip.
 * home_device:  device to mount at "/home" below "where", or NULL to skip.
 * srv_device:   device to mount at "/srv" below "where", or NULL to skip.
 * *_device_rw:  handed to mount_device() as its read-write flag for the
 *               respective device.
 *
 * Returns 0 on success, or the negative error of the first mount_device()
 * call that failed (later devices are not attempted). */
static int mount_devices(
                const char *where,
                const char *root_device, bool root_device_rw,
                const char *home_device, bool home_device_rw,
                const char *srv_device, bool srv_device_rw) {
        int r;

        assert(where);

        /* Honour the "where" argument rather than reaching for the global
         * arg_directory, so that the function does what its signature
         * promises. */

        if (root_device) {
                r = mount_device(root_device, where, NULL, root_device_rw);
                if (r < 0)
                        return log_error_errno(r, "Failed to mount root directory: %m");
        }

        if (home_device) {
                r = mount_device(home_device, where, "/home", home_device_rw);
                if (r < 0)
                        return log_error_errno(r, "Failed to mount home directory: %m");
        }

        if (srv_device) {
                r = mount_device(srv_device, where, "/srv", srv_device_rw);
                if (r < 0)
                        return log_error_errno(r, "Failed to mount server data directory: %m");
        }

        return 0;
}
3133
3134 static void loop_remove(int nr, int *image_fd) {
3135         _cleanup_close_ int control = -1;
3136         int r;
3137
3138         if (nr < 0)
3139                 return;
3140
3141         if (image_fd && *image_fd >= 0) {
3142                 r = ioctl(*image_fd, LOOP_CLR_FD);
3143                 if (r < 0)
3144                         log_debug_errno(errno, "Failed to close loop image: %m");
3145                 *image_fd = safe_close(*image_fd);
3146         }
3147
3148         control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
3149         if (control < 0) {
3150                 log_warning_errno(errno, "Failed to open /dev/loop-control: %m");
3151                 return;
3152         }
3153
3154         r = ioctl(control, LOOP_CTL_REMOVE, nr);
3155         if (r < 0)
3156                 log_debug_errno(errno, "Failed to remove loop %d: %m", nr);
3157 }
3158
3159 static int spawn_getent(const char *database, const char *key, pid_t *rpid) {
3160         int pipe_fds[2];
3161         pid_t pid;
3162
3163         assert(database);
3164         assert(key);
3165         assert(rpid);
3166
3167         if (pipe2(pipe_fds, O_CLOEXEC) < 0)
3168                 return log_error_errno(errno, "Failed to allocate pipe: %m");
3169
3170         pid = fork();
3171         if (pid < 0)
3172                 return log_error_errno(errno, "Failed to fork getent child: %m");
3173         else if (pid == 0) {
3174                 int nullfd;
3175                 char *empty_env = NULL;
3176
3177                 if (dup3(pipe_fds[1], STDOUT_FILENO, 0) < 0)
3178                         _exit(EXIT_FAILURE);
3179
3180                 if (pipe_fds[0] > 2)
3181                         safe_close(pipe_fds[0]);
3182                 if (pipe_fds[1] > 2)
3183                         safe_close(pipe_fds[1]);
3184
3185                 nullfd = open("/dev/null", O_RDWR);
3186                 if (nullfd < 0)
3187                         _exit(EXIT_FAILURE);
3188
3189                 if (dup3(nullfd, STDIN_FILENO, 0) < 0)
3190                         _exit(EXIT_FAILURE);
3191
3192                 if (dup3(nullfd, STDERR_FILENO, 0) < 0)
3193                         _exit(EXIT_FAILURE);
3194
3195                 if (nullfd > 2)
3196                         safe_close(nullfd);
3197
3198                 reset_all_signal_handlers();
3199                 close_all_fds(NULL, 0);
3200
3201                 execle("/usr/bin/getent", "getent", database, key, NULL, &empty_env);
3202                 execle("/bin/getent", "getent", database, key, NULL, &empty_env);
3203                 _exit(EXIT_FAILURE);
3204         }
3205
3206         pipe_fds[1] = safe_close(pipe_fds[1]);
3207
3208         *rpid = pid;
3209
3210         return pipe_fds[0];
3211 }
3212
3213 static int change_uid_gid(char **_home) {
3214         char line[LINE_MAX], *x, *u, *g, *h;
3215         const char *word, *state;
3216         _cleanup_free_ uid_t *uids = NULL;
3217         _cleanup_free_ char *home = NULL;
3218         _cleanup_fclose_ FILE *f = NULL;
3219         _cleanup_close_ int fd = -1;
3220         unsigned n_uids = 0;
3221         size_t sz = 0, l;
3222         uid_t uid;
3223         gid_t gid;
3224         pid_t pid;
3225         int r;
3226
3227         assert(_home);
3228
3229         if (!arg_user || streq(arg_user, "root") || streq(arg_user, "0")) {
3230                 /* Reset everything fully to 0, just in case */
3231
3232                 if (setgroups(0, NULL) < 0)
3233                         return log_error_errno(errno, "setgroups() failed: %m");
3234
3235                 if (setresgid(0, 0, 0) < 0)
3236                         return log_error_errno(errno, "setregid() failed: %m");
3237
3238                 if (setresuid(0, 0, 0) < 0)
3239                         return log_error_errno(errno, "setreuid() failed: %m");
3240
3241                 *_home = NULL;
3242                 return 0;
3243         }
3244
3245         /* First, get user credentials */
3246         fd = spawn_getent("passwd", arg_user, &pid);
3247         if (fd < 0)
3248                 return fd;
3249
3250         f = fdopen(fd, "r");
3251         if (!f)
3252                 return log_oom();
3253         fd = -1;
3254
3255         if (!fgets(line, sizeof(line), f)) {
3256
3257                 if (!ferror(f)) {
3258                         log_error("Failed to resolve user %s.", arg_user);
3259                         return -ESRCH;
3260                 }
3261
3262                 log_error_errno(errno, "Failed to read from getent: %m");
3263                 return -errno;
3264         }
3265
3266         truncate_nl(line);
3267
3268         wait_for_terminate_and_warn("getent passwd", pid, true);
3269
3270         x = strchr(line, ':');
3271         if (!x) {
3272                 log_error("/etc/passwd entry has invalid user field.");
3273                 return -EIO;
3274         }
3275
3276         u = strchr(x+1, ':');
3277         if (!u) {
3278                 log_error("/etc/passwd entry has invalid password field.");
3279                 return -EIO;
3280         }
3281
3282         u++;
3283         g = strchr(u, ':');
3284         if (!g) {
3285                 log_error("/etc/passwd entry has invalid UID field.");
3286                 return -EIO;
3287         }
3288
3289         *g = 0;
3290         g++;
3291         x = strchr(g, ':');
3292         if (!x) {
3293                 log_error("/etc/passwd entry has invalid GID field.");
3294                 return -EIO;
3295         }
3296
3297         *x = 0;
3298         h = strchr(x+1, ':');
3299         if (!h) {
3300                 log_error("/etc/passwd entry has invalid GECOS field.");
3301                 return -EIO;
3302         }
3303
3304         h++;
3305         x = strchr(h, ':');
3306         if (!x) {
3307                 log_error("/etc/passwd entry has invalid home directory field.");
3308                 return -EIO;
3309         }
3310
3311         *x = 0;
3312
3313         r = parse_uid(u, &uid);
3314         if (r < 0) {
3315                 log_error("Failed to parse UID of user.");
3316                 return -EIO;
3317         }
3318
3319         r = parse_gid(g, &gid);
3320         if (r < 0) {
3321                 log_error("Failed to parse GID of user.");
3322                 return -EIO;
3323         }
3324
3325         home = strdup(h);
3326         if (!home)
3327                 return log_oom();
3328
3329         /* Second, get group memberships */
3330         fd = spawn_getent("initgroups", arg_user, &pid);
3331         if (fd < 0)
3332                 return fd;
3333
3334         fclose(f);
3335         f = fdopen(fd, "r");
3336         if (!f)
3337                 return log_oom();
3338         fd = -1;
3339
3340         if (!fgets(line, sizeof(line), f)) {
3341                 if (!ferror(f)) {
3342                         log_error("Failed to resolve user %s.", arg_user);
3343                         return -ESRCH;
3344                 }
3345
3346                 log_error_errno(errno, "Failed to read from getent: %m");
3347                 return -errno;
3348         }
3349
3350         truncate_nl(line);
3351
3352         wait_for_terminate_and_warn("getent initgroups", pid, true);
3353
3354         /* Skip over the username and subsequent separator whitespace */
3355         x = line;
3356         x += strcspn(x, WHITESPACE);
3357         x += strspn(x, WHITESPACE);
3358
3359         FOREACH_WORD(word, l, x, state) {
3360                 char c[l+1];
3361
3362                 memcpy(c, word, l);
3363                 c[l] = 0;
3364
3365                 if (!GREEDY_REALLOC(uids, sz, n_uids+1))
3366                         return log_oom();
3367
3368                 r = parse_uid(c, &uids[n_uids++]);
3369                 if (r < 0) {
3370                         log_error("Failed to parse group data from getent.");
3371                         return -EIO;
3372                 }
3373         }
3374
3375         r = mkdir_parents(home, 0775);
3376         if (r < 0)
3377                 return log_error_errno(r, "Failed to make home root directory: %m");
3378
3379         r = mkdir_safe(home, 0755, uid, gid);
3380         if (r < 0 && r != -EEXIST)
3381                 return log_error_errno(r, "Failed to make home directory: %m");
3382
3383         fchown(STDIN_FILENO, uid, gid);
3384         fchown(STDOUT_FILENO, uid, gid);
3385         fchown(STDERR_FILENO, uid, gid);
3386
3387         if (setgroups(n_uids, uids) < 0)
3388                 return log_error_errno(errno, "Failed to set auxiliary groups: %m");
3389
3390         if (setresgid(gid, gid, gid) < 0)
3391                 return log_error_errno(errno, "setregid() failed: %m");
3392
3393         if (setresuid(uid, uid, uid) < 0)
3394                 return log_error_errno(errno, "setreuid() failed: %m");
3395
3396         if (_home) {
3397                 *_home = home;
3398                 home = NULL;
3399         }
3400
3401         return 0;
3402 }
3403
3404 /*
3405  * Return values:
3406  * < 0 : wait_for_terminate() failed to get the state of the
3407  *       container, the container was terminated by a signal, or
3408  *       failed for an unknown reason.  No change is made to the
3409  *       container argument.
3410  * > 0 : The program executed in the container terminated with an
3411  *       error.  The exit code of the program executed in the
3412  *       container is returned.  The container argument has been set
3413  *       to CONTAINER_TERMINATED.
3414  *   0 : The container is being rebooted, has been shut down or exited
3415  *       successfully.  The container argument has been set to either
3416  *       CONTAINER_TERMINATED or CONTAINER_REBOOTED.
3417  *
3418  * That is, success is indicated by a return value of zero, and an
3419  * error is indicated by a non-zero value.
3420  */
3421 static int wait_for_container(pid_t pid, ContainerStatus *container) {
3422         siginfo_t status;
3423         int r;
3424
3425         r = wait_for_terminate(pid, &status);
3426         if (r < 0)
3427                 return log_warning_errno(r, "Failed to wait for container: %m");
3428
3429         switch (status.si_code) {
3430
3431         case CLD_EXITED:
3432                 if (status.si_status == 0) {
3433                         log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s exited successfully.", arg_machine);
3434
3435                 } else
3436                         log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s failed with error code %i.", arg_machine, status.si_status);
3437
3438                 *container = CONTAINER_TERMINATED;
3439                 return status.si_status;
3440
3441         case CLD_KILLED:
3442                 if (status.si_status == SIGINT) {
3443
3444                         log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s has been shut down.", arg_machine);
3445                         *container = CONTAINER_TERMINATED;
3446                         return 0;
3447
3448                 } else if (status.si_status == SIGHUP) {
3449
3450                         log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s is being rebooted.", arg_machine);
3451                         *container = CONTAINER_REBOOTED;
3452                         return 0;
3453                 }
3454
3455                 /* CLD_KILLED fallthrough */
3456
3457         case CLD_DUMPED:
3458                 log_error("Container %s terminated by signal %s.", arg_machine, signal_to_string(status.si_status));
3459                 return -EIO;
3460
3461         default:
3462                 log_error("Container %s failed due to unknown reason.", arg_machine);
3463                 return -EIO;
3464         }
3465
3466         return r;
3467 }
3468
/* Signal handler that deliberately does nothing. main() installs it for
 * SIGCHLD so that the signal interrupts the parent's blocking calls
 * rather than being ignored. */
static void nop_handler(int sig) {
        /* intentionally empty */
}
3470
3471 static int on_orderly_shutdown(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) {
3472         pid_t pid;
3473
3474         pid = PTR_TO_UINT32(userdata);
3475         if (pid > 0) {
3476                 if (kill(pid, SIGRTMIN+3) >= 0) {
3477                         log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
3478                         sd_event_source_set_userdata(s, NULL);
3479                         return 0;
3480                 }
3481         }
3482
3483         sd_event_exit(sd_event_source_get_event(s), 0);
3484         return 0;
3485 }
3486
3487 static int determine_names(void) {
3488         int r;
3489
3490         if (!arg_image && !arg_directory) {
3491                 if (arg_machine) {
3492                         _cleanup_(image_unrefp) Image *i = NULL;
3493
3494                         r = image_find(arg_machine, &i);
3495                         if (r < 0)
3496                                 return log_error_errno(r, "Failed to find image for machine '%s': %m", arg_machine);
3497                         else if (r == 0) {
3498                                 log_error("No image for machine '%s': %m", arg_machine);
3499                                 return -ENOENT;
3500                         }
3501
3502                         if (i->type == IMAGE_RAW)
3503                                 r = set_sanitized_path(&arg_image, i->path);
3504                         else
3505                                 r = set_sanitized_path(&arg_directory, i->path);
3506                         if (r < 0)
3507                                 return log_error_errno(r, "Invalid image directory: %m");
3508
3509                         arg_read_only = arg_read_only || i->read_only;
3510                 } else
3511                         arg_directory = get_current_dir_name();
3512
3513                 if (!arg_directory && !arg_machine) {
3514                         log_error("Failed to determine path, please use -D or -i.");
3515                         return -EINVAL;
3516                 }
3517         }
3518
3519         if (!arg_machine) {
3520                 if (arg_directory && path_equal(arg_directory, "/"))
3521                         arg_machine = gethostname_malloc();
3522                 else
3523                         arg_machine = strdup(basename(arg_image ?: arg_directory));
3524
3525                 if (!arg_machine)
3526                         return log_oom();
3527
3528                 hostname_cleanup(arg_machine, false);
3529                 if (!machine_name_is_valid(arg_machine)) {
3530                         log_error("Failed to determine machine name automatically, please use -M.");
3531                         return -EINVAL;
3532                 }
3533
3534                 if (arg_ephemeral) {
3535                         char *b;
3536
3537                         /* Add a random suffix when this is an
3538                          * ephemeral machine, so that we can run many
3539                          * instances at once without manually having
3540                          * to specify -M each time. */
3541
3542                         if (asprintf(&b, "%s-%016" PRIx64, arg_machine, random_u64()) < 0)
3543                                 return log_oom();
3544
3545                         free(arg_machine);
3546                         arg_machine = b;
3547                 }
3548         }
3549
3550         return 0;
3551 }
3552
3553 int main(int argc, char *argv[]) {
3554
3555         _cleanup_free_ char *device_path = NULL, *root_device = NULL, *home_device = NULL, *srv_device = NULL, *console = NULL;
3556         bool root_device_rw = true, home_device_rw = true, srv_device_rw = true;
3557         _cleanup_close_ int master = -1, image_fd = -1;
3558         _cleanup_fdset_free_ FDSet *fds = NULL;
3559         int r, n_fd_passed, loop_nr = -1;
3560         char veth_name[IFNAMSIZ];
3561         bool secondary = false, remove_subvol = false;
3562         sigset_t mask, mask_chld;
3563         pid_t pid = 0;
3564         int ret = EXIT_SUCCESS;
3565         union in_addr_union exposed = {};
3566         _cleanup_release_lock_file_ LockFile tree_global_lock = LOCK_FILE_INIT, tree_local_lock = LOCK_FILE_INIT;
3567
3568         log_parse_environment();
3569         log_open();
3570
3571         r = parse_argv(argc, argv);
3572         if (r <= 0)
3573                 goto finish;
3574
3575         r = determine_names();
3576         if (r < 0)
3577                 goto finish;
3578
3579         if (geteuid() != 0) {
3580                 log_error("Need to be root.");
3581                 r = -EPERM;
3582                 goto finish;
3583         }
3584
3585         if (sd_booted() <= 0) {
3586                 log_error("Not running on a systemd system.");
3587                 r = -EINVAL;
3588                 goto finish;
3589         }
3590
3591         log_close();
3592         n_fd_passed = sd_listen_fds(false);
3593         if (n_fd_passed > 0) {
3594                 r = fdset_new_listen_fds(&fds, false);
3595                 if (r < 0) {
3596                         log_error_errno(r, "Failed to collect file descriptors: %m");
3597                         goto finish;
3598                 }
3599         }
3600         fdset_close_others(fds);
3601         log_open();
3602
3603         if (arg_directory) {
3604                 assert(!arg_image);
3605
3606                 if (path_equal(arg_directory, "/") && !arg_ephemeral) {
3607                         log_error("Spawning container on root directory is not supported. Consider using --ephemeral.");
3608                         r = -EINVAL;
3609                         goto finish;
3610                 }
3611
3612                 if (arg_ephemeral) {
3613                         _cleanup_release_lock_file_ LockFile original_lock = LOCK_FILE_INIT;
3614                         char *np;
3615
3616                         /* If the specified path is a mount point we
3617                          * generate the new snapshot immediately
3618                          * inside it under a random name. However if
3619                          * the specified is not a mount point we
3620                          * create the new snapshot in the parent
3621                          * directory, just next to it. */
3622                         r = path_is_mount_point(arg_directory, false);
3623                         if (r < 0) {
3624                                 log_error_errno(r, "Failed to determine whether directory %s is mount point: %m", arg_directory);
3625                                 goto finish;
3626                         }
3627                         if (r > 0)
3628                                 r = tempfn_random_child(arg_directory, &np);
3629                         else
3630                                 r = tempfn_random(arg_directory, &np);
3631                         if (r < 0) {
3632                                 log_error_errno(r, "Failed to generate name for snapshot: %m");
3633                                 goto finish;
3634                         }
3635
3636                         r = image_path_lock(np, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3637                         if (r < 0) {
3638                                 log_error_errno(r, "Failed to lock %s: %m", np);
3639                                 goto finish;
3640                         }
3641
3642                         r = btrfs_subvol_snapshot(arg_directory, np, arg_read_only, true);
3643                         if (r < 0) {
3644                                 free(np);
3645                                 log_error_errno(r, "Failed to create snapshot %s from %s: %m", np, arg_directory);
3646                                 goto finish;
3647                         }
3648
3649                         free(arg_directory);
3650                         arg_directory = np;
3651
3652                         remove_subvol = true;
3653
3654                 } else {
3655                         r = image_path_lock(arg_directory, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3656                         if (r == -EBUSY) {
3657                                 log_error_errno(r, "Directory tree %s is currently busy.", arg_directory);
3658                                 goto finish;
3659                         }
3660                         if (r < 0) {
3661                                 log_error_errno(r, "Failed to lock %s: %m", arg_directory);
3662                                 return r;
3663                         }
3664
3665                         if (arg_template) {
3666                                 r = btrfs_subvol_snapshot(arg_template, arg_directory, arg_read_only, true);
3667                                 if (r == -EEXIST) {
3668                                         if (!arg_quiet)
3669                                                 log_info("Directory %s already exists, not populating from template %s.", arg_directory, arg_template);
3670                                 } else if (r < 0) {
3671                                         log_error_errno(r, "Couldn't create snapshot %s from %s: %m", arg_directory, arg_template);
3672                                         goto finish;
3673                                 } else {
3674                                         if (!arg_quiet)
3675                                                 log_info("Populated %s from template %s.", arg_directory, arg_template);
3676                                 }
3677                         }
3678                 }
3679
3680                 if (arg_boot) {
3681                         if (path_is_os_tree(arg_directory) <= 0) {
3682                                 log_error("Directory %s doesn't look like an OS root directory (os-release file is missing). Refusing.", arg_directory);
3683                                 r = -EINVAL;
3684                                 goto finish;
3685                         }
3686                 } else {
3687                         const char *p;
3688
3689                         p = strappenda(arg_directory,
3690                                        argc > optind && path_is_absolute(argv[optind]) ? argv[optind] : "/usr/bin/");
3691                         if (access(p, F_OK) < 0) {
3692                                 log_error("Directory %s lacks the binary to execute or doesn't look like a binary tree. Refusing.", arg_directory);
3693                                 r = -EINVAL;
3694                                 goto finish;
3695                         }
3696                 }
3697
3698         } else {
3699                 char template[] = "/tmp/nspawn-root-XXXXXX";
3700
3701                 assert(arg_image);
3702                 assert(!arg_template);
3703
3704                 r = image_path_lock(arg_image, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3705                 if (r == -EBUSY) {
3706                         r = log_error_errno(r, "Disk image %s is currently busy.", arg_image);
3707                         goto finish;
3708                 }
3709                 if (r < 0) {
3710                         r = log_error_errno(r, "Failed to create image lock: %m");
3711                         goto finish;
3712                 }
3713
3714                 if (!mkdtemp(template)) {
3715                         log_error_errno(errno, "Failed to create temporary directory: %m");
3716                         r = -errno;
3717                         goto finish;
3718                 }
3719
3720                 arg_directory = strdup(template);
3721                 if (!arg_directory) {
3722                         r = log_oom();
3723                         goto finish;
3724                 }
3725
3726                 image_fd = setup_image(&device_path, &loop_nr);
3727                 if (image_fd < 0) {
3728                         r = image_fd;
3729                         goto finish;
3730                 }
3731
3732                 r = dissect_image(image_fd,
3733                                   &root_device, &root_device_rw,
3734                                   &home_device, &home_device_rw,
3735                                   &srv_device, &srv_device_rw,
3736                                   &secondary);
3737                 if (r < 0)
3738                         goto finish;
3739         }
3740
3741         master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
3742         if (master < 0) {
3743                 r = log_error_errno(errno, "Failed to acquire pseudo tty: %m");
3744                 goto finish;
3745         }
3746
3747         r = ptsname_malloc(master, &console);
3748         if (r < 0) {
3749                 r = log_error_errno(r, "Failed to determine tty name: %m");
3750                 goto finish;
3751         }
3752
3753         if (!arg_quiet)
3754                 log_info("Spawning container %s on %s.\nPress ^] three times within 1s to kill container.",
3755                          arg_machine, arg_image ?: arg_directory);
3756
3757         if (unlockpt(master) < 0) {
3758                 r = log_error_errno(errno, "Failed to unlock tty: %m");
3759                 goto finish;
3760         }
3761
3762         assert_se(sigemptyset(&mask) == 0);
3763         sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
3764         assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
3765
3766         assert_se(sigemptyset(&mask_chld) == 0);
3767         assert_se(sigaddset(&mask_chld, SIGCHLD) == 0);
3768
3769         for (;;) {
3770                 _cleanup_close_pair_ int kmsg_socket_pair[2] = { -1, -1 }, rtnl_socket_pair[2] = { -1, -1 };
3771                 ContainerStatus container_status;
3772                 _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
3773                 struct sigaction sa = {
3774                         .sa_handler = nop_handler,
3775                         .sa_flags = SA_NOCLDSTOP,
3776                 };
3777
3778                 r = barrier_create(&barrier);
3779                 if (r < 0) {
3780                         log_error_errno(r, "Cannot initialize IPC barrier: %m");
3781                         goto finish;
3782                 }
3783
3784                 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
3785                         r = log_error_errno(errno, "Failed to create kmsg socket pair: %m");
3786                         goto finish;
3787                 }
3788
3789                 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, rtnl_socket_pair) < 0) {
3790                         r = log_error_errno(errno, "Failed to create rtnl socket pair: %m");
3791                         goto finish;
3792                 }
3793
3794                 /* Child can be killed before execv(), so handle SIGCHLD
3795                  * in order to interrupt parent's blocking calls and
3796                  * give it a chance to call wait() and terminate. */
3797                 r = sigprocmask(SIG_UNBLOCK, &mask_chld, NULL);
3798                 if (r < 0) {
3799                         r = log_error_errno(errno, "Failed to change the signal mask: %m");
3800                         goto finish;
3801                 }
3802
3803                 r = sigaction(SIGCHLD, &sa, NULL);
3804                 if (r < 0) {
3805                         r = log_error_errno(errno, "Failed to install SIGCHLD handler: %m");
3806                         goto finish;
3807                 }
3808
3809                 pid = raw_clone(SIGCHLD|CLONE_NEWNS|
3810                                 (arg_share_system ? 0 : CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS)|
3811                                 (arg_private_network ? CLONE_NEWNET : 0), NULL);
3812                 if (pid < 0) {
3813                         if (errno == EINVAL)
3814                                 r = log_error_errno(errno, "clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
3815                         else
3816                                 r = log_error_errno(errno, "clone() failed: %m");
3817
3818                         goto finish;
3819                 }
3820
3821                 if (pid == 0) {
3822                         /* child */
3823                         _cleanup_free_ char *home = NULL;
3824                         unsigned n_env = 2;
3825                         const char *envp[] = {
3826                                 "PATH=" DEFAULT_PATH_SPLIT_USR,
3827                                 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
3828                                 NULL, /* TERM */
3829                                 NULL, /* HOME */
3830                                 NULL, /* USER */
3831                                 NULL, /* LOGNAME */
3832                                 NULL, /* container_uuid */
3833                                 NULL, /* LISTEN_FDS */
3834                                 NULL, /* LISTEN_PID */
3835                                 NULL
3836                         };
3837                         char **env_use;
3838
3839                         barrier_set_role(&barrier, BARRIER_CHILD);
3840
3841                         envp[n_env] = strv_find_prefix(environ, "TERM=");
3842                         if (envp[n_env])
3843                                 n_env ++;
3844
3845                         master = safe_close(master);
3846
3847                         close_nointr(STDIN_FILENO);
3848                         close_nointr(STDOUT_FILENO);
3849                         close_nointr(STDERR_FILENO);
3850
3851                         kmsg_socket_pair[0] = safe_close(kmsg_socket_pair[0]);
3852                         rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
3853
3854                         reset_all_signal_handlers();
3855                         reset_signal_mask();
3856
3857                         r = open_terminal(console, O_RDWR);
3858                         if (r != STDIN_FILENO) {
3859                                 if (r >= 0) {
3860                                         safe_close(r);
3861                                         r = -EINVAL;
3862                                 }
3863
3864                                 log_error_errno(r, "Failed to open console: %m");
3865                                 _exit(EXIT_FAILURE);
3866                         }
3867
3868                         if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
3869                             dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO) {
3870                                 log_error_errno(errno, "Failed to duplicate console: %m");
3871                                 _exit(EXIT_FAILURE);
3872                         }
3873
3874                         if (setsid() < 0) {
3875                                 log_error_errno(errno, "setsid() failed: %m");
3876                                 _exit(EXIT_FAILURE);
3877                         }
3878
3879                         if (reset_audit_loginuid() < 0)
3880                                 _exit(EXIT_FAILURE);
3881
3882                         if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
3883                                 log_error_errno(errno, "PR_SET_PDEATHSIG failed: %m");
3884                                 _exit(EXIT_FAILURE);
3885                         }
3886
3887                         /* Mark everything as slave, so that we still
3888                          * receive mounts from the real root, but don't
3889                          * propagate mounts to the real root. */
3890                         if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
3891                                 log_error_errno(errno, "MS_SLAVE|MS_REC failed: %m");
3892                                 _exit(EXIT_FAILURE);
3893                         }
3894
3895                         if (mount_devices(arg_directory,
3896                                           root_device, root_device_rw,
3897                                           home_device, home_device_rw,
3898                                           srv_device, srv_device_rw) < 0)
3899                                 _exit(EXIT_FAILURE);
3900
3901                         /* Turn directory into bind mount */
3902                         if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REC, NULL) < 0) {
3903                                 log_error_errno(errno, "Failed to make bind mount: %m");
3904                                 _exit(EXIT_FAILURE);
3905                         }
3906
3907                         r = setup_volatile(arg_directory);
3908                         if (r < 0)
3909                                 _exit(EXIT_FAILURE);
3910
3911                         if (setup_volatile_state(arg_directory) < 0)
3912                                 _exit(EXIT_FAILURE);
3913
3914                         r = base_filesystem_create(arg_directory);
3915                         if (r < 0)
3916                                 _exit(EXIT_FAILURE);
3917
3918                         if (arg_read_only) {
3919                                 r = bind_remount_recursive(arg_directory, true);
3920                                 if (r < 0) {
3921                                         log_error_errno(r, "Failed to make tree read-only: %m");
3922                                         _exit(EXIT_FAILURE);
3923                                 }
3924                         }
3925
3926                         if (mount_all(arg_directory) < 0)
3927                                 _exit(EXIT_FAILURE);
3928
3929                         if (copy_devnodes(arg_directory) < 0)
3930                                 _exit(EXIT_FAILURE);
3931
3932                         if (setup_ptmx(arg_directory) < 0)
3933                                 _exit(EXIT_FAILURE);
3934
3935                         dev_setup(arg_directory);
3936
3937                         if (setup_propagate(arg_directory) < 0)
3938                                 _exit(EXIT_FAILURE);
3939
3940                         if (setup_seccomp() < 0)
3941                                 _exit(EXIT_FAILURE);
3942
3943                         if (setup_dev_console(arg_directory, console) < 0)
3944                                 _exit(EXIT_FAILURE);
3945
3946                         if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
3947                                 _exit(EXIT_FAILURE);
3948                         kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
3949
3950                         if (send_rtnl(rtnl_socket_pair[1]) < 0)
3951                                 _exit(EXIT_FAILURE);
3952                         rtnl_socket_pair[1] = safe_close(rtnl_socket_pair[1]);
3953
3954                         /* Tell the parent that we are ready, and that
3955                          * it can cgroupify us so that we lack access
3956                          * to certain devices and resources. */
3957                         (void) barrier_place(&barrier);
3958
3959                         if (setup_boot_id(arg_directory) < 0)
3960                                 _exit(EXIT_FAILURE);
3961
3962                         if (setup_timezone(arg_directory) < 0)
3963                                 _exit(EXIT_FAILURE);
3964
3965                         if (setup_resolv_conf(arg_directory) < 0)
3966                                 _exit(EXIT_FAILURE);
3967
3968                         if (setup_journal(arg_directory) < 0)
3969                                 _exit(EXIT_FAILURE);
3970
3971                         if (mount_binds(arg_directory, arg_bind, false) < 0)
3972                                 _exit(EXIT_FAILURE);
3973
3974                         if (mount_binds(arg_directory, arg_bind_ro, true) < 0)
3975                                 _exit(EXIT_FAILURE);
3976
3977                         if (mount_tmpfs(arg_directory) < 0)
3978                                 _exit(EXIT_FAILURE);
3979
3980                         /* Wait until we are cgroup-ified, so that we
3981                          * can mount the right cgroup path writable */
3982                         (void) barrier_sync_next(&barrier);
3983
3984                         if (mount_cgroup(arg_directory) < 0)
3985                                 _exit(EXIT_FAILURE);
3986
3987                         if (chdir(arg_directory) < 0) {
3988                                 log_error_errno(errno, "chdir(%s) failed: %m", arg_directory);
3989                                 _exit(EXIT_FAILURE);
3990                         }
3991
3992                         if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) {
3993                                 log_error_errno(errno, "mount(MS_MOVE) failed: %m");
3994                                 _exit(EXIT_FAILURE);
3995                         }
3996
3997                         if (chroot(".") < 0) {
3998                                 log_error_errno(errno, "chroot() failed: %m");
3999                                 _exit(EXIT_FAILURE);
4000                         }
4001
4002                         if (chdir("/") < 0) {
4003                                 log_error_errno(errno, "chdir() failed: %m");
4004                                 _exit(EXIT_FAILURE);
4005                         }
4006
4007                         umask(0022);
4008
4009                         if (arg_private_network)
4010                                 loopback_setup();
4011
4012                         if (drop_capabilities() < 0) {
4013                                 log_error_errno(errno, "drop_capabilities() failed: %m");
4014                                 _exit(EXIT_FAILURE);
4015                         }
4016
4017                         r = change_uid_gid(&home);
4018                         if (r < 0)
4019                                 _exit(EXIT_FAILURE);
4020
4021                         if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
4022                             (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
4023                             (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
4024                                 log_oom();
4025                                 _exit(EXIT_FAILURE);
4026                         }
4027
4028                         if (!sd_id128_equal(arg_uuid, SD_ID128_NULL)) {
4029                                 char as_uuid[37];
4030
4031                                 if (asprintf((char**)(envp + n_env++), "container_uuid=%s", id128_format_as_uuid(arg_uuid, as_uuid)) < 0) {
4032                                         log_oom();
4033                                         _exit(EXIT_FAILURE);
4034                                 }
4035                         }
4036
4037                         if (fdset_size(fds) > 0) {
4038                                 r = fdset_cloexec(fds, false);
4039                                 if (r < 0) {
4040                                         log_error_errno(r, "Failed to unset O_CLOEXEC for file descriptors.");
4041                                         _exit(EXIT_FAILURE);
4042                                 }
4043
4044                                 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", n_fd_passed) < 0) ||
4045                                     (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0)) {
4046                                         log_oom();
4047                                         _exit(EXIT_FAILURE);
4048                                 }
4049                         }
4050
4051                         setup_hostname();
4052
4053                         if (arg_personality != 0xffffffffLU) {
4054                                 if (personality(arg_personality) < 0) {
4055                                         log_error_errno(errno, "personality() failed: %m");
4056                                         _exit(EXIT_FAILURE);
4057                                 }
4058                         } else if (secondary) {
4059                                 if (personality(PER_LINUX32) < 0) {
4060                                         log_error_errno(errno, "personality() failed: %m");
4061                                         _exit(EXIT_FAILURE);
4062                                 }
4063                         }
4064
4065 #ifdef HAVE_SELINUX
4066                         if (arg_selinux_context)
4067                                 if (setexeccon((security_context_t) arg_selinux_context) < 0) {
4068                                         log_error_errno(errno, "setexeccon(\"%s\") failed: %m", arg_selinux_context);
4069                                         _exit(EXIT_FAILURE);
4070                                 }
4071 #endif
4072
4073                         if (!strv_isempty(arg_setenv)) {
4074                                 char **n;
4075
4076                                 n = strv_env_merge(2, envp, arg_setenv);
4077                                 if (!n) {
4078                                         log_oom();
4079                                         _exit(EXIT_FAILURE);
4080                                 }
4081
4082                                 env_use = n;
4083                         } else
4084                                 env_use = (char**) envp;
4085
4086                         /* Wait until the parent is ready with the setup, too... */
4087                         if (!barrier_place_and_sync(&barrier))
4088                                 _exit(EXIT_FAILURE);
4089
4090                         if (arg_boot) {
4091                                 char **a;
4092                                 size_t l;
4093
4094                                 /* Automatically search for the init system */
4095
4096                                 l = 1 + argc - optind;
4097                                 a = newa(char*, l + 1);
4098                                 memcpy(a + 1, argv + optind, l * sizeof(char*));
4099
4100                                 a[0] = (char*) "/usr/lib/systemd/systemd";
4101                                 execve(a[0], a, env_use);
4102
4103                                 a[0] = (char*) "/lib/systemd/systemd";
4104                                 execve(a[0], a, env_use);
4105
4106                                 a[0] = (char*) "/sbin/init";
4107                                 execve(a[0], a, env_use);
4108                         } else if (argc > optind)
4109                                 execvpe(argv[optind], argv + optind, env_use);
4110                         else {
4111                                 chdir(home ? home : "/root");
4112                                 execle("/bin/bash", "-bash", NULL, env_use);
4113                                 execle("/bin/sh", "-sh", NULL, env_use);
4114                         }
4115
4116                         log_error_errno(errno, "execv() failed: %m");
4117                         _exit(EXIT_FAILURE);
4118                 }
4119
4120                 barrier_set_role(&barrier, BARRIER_PARENT);
4121                 fdset_free(fds);
4122                 fds = NULL;
4123
4124                 kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
4125                 rtnl_socket_pair[1] = safe_close(rtnl_socket_pair[1]);
4126
4127                 /* Wait for the most basic Child-setup to be done,
4128                  * before we add hardware to it, and place it in a
4129                  * cgroup. */
4130                 if (barrier_sync_next(&barrier)) {
4131                         int ifi = 0;
4132
4133                         r = move_network_interfaces(pid);
4134                         if (r < 0)
4135                                 goto finish;
4136
4137                         r = setup_veth(pid, veth_name, &ifi);
4138                         if (r < 0)
4139                                 goto finish;
4140
4141                         r = setup_bridge(veth_name, &ifi);
4142                         if (r < 0)
4143                                 goto finish;
4144
4145                         r = setup_macvlan(pid);
4146                         if (r < 0)
4147                                 goto finish;
4148
4149                         r = setup_ipvlan(pid);
4150                         if (r < 0)
4151                                 goto finish;
4152
4153                         r = register_machine(pid, ifi);
4154                         if (r < 0)
4155                                 goto finish;
4156
4157                         /* Block SIGCHLD here, before notifying child.
4158                          * process_pty() will handle it with the other signals. */
4159                         r = sigprocmask(SIG_BLOCK, &mask_chld, NULL);
4160                         if (r < 0)
4161                                 goto finish;
4162
4163                         /* Reset signal to default */
4164                         r = default_signals(SIGCHLD, -1);
4165                         if (r < 0)
4166                                 goto finish;
4167
4168                         /* Notify the child that the parent is ready with all
4169                          * its setup, and that the child can now hand over
4170                          * control to the code to run inside the container. */
4171                         (void) barrier_place(&barrier);
4172
4173                         /* And wait until the child is completely ready now. */
4174                         if (barrier_place_and_sync(&barrier)) {
4175                                 _cleanup_event_unref_ sd_event *event = NULL;
4176                                 _cleanup_(pty_forward_freep) PTYForward *forward = NULL;
4177                                 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
4178                                 char last_char = 0;
4179
4180                                 sd_notifyf(false,
4181                                            "READY=1\n"
4182                                            "STATUS=Container running.\n"
4183                                            "X_NSPAWN_LEADER_PID=" PID_FMT, pid);
4184
4185                                 r = sd_event_new(&event);
4186                                 if (r < 0) {
4187                                         log_error_errno(r, "Failed to get default event source: %m");
4188                                         goto finish;
4189                                 }
4190
4191                                 if (arg_boot) {
4192                                         /* Try to kill the init system on SIGINT or SIGTERM */
4193                                         sd_event_add_signal(event, NULL, SIGINT, on_orderly_shutdown, UINT32_TO_PTR(pid));
4194                                         sd_event_add_signal(event, NULL, SIGTERM, on_orderly_shutdown, UINT32_TO_PTR(pid));
4195                                 } else {
4196                                         /* Immediately exit */
4197                                         sd_event_add_signal(event, NULL, SIGINT, NULL, NULL);
4198                                         sd_event_add_signal(event, NULL, SIGTERM, NULL, NULL);
4199                                 }
4200
4201                                 /* simply exit on sigchld */
4202                                 sd_event_add_signal(event, NULL, SIGCHLD, NULL, NULL);
4203
4204                                 if (arg_expose_ports) {
4205                                         r = watch_rtnl(event, rtnl_socket_pair[0], &exposed, &rtnl);
4206                                         if (r < 0)
4207                                                 goto finish;
4208
4209                                         (void) expose_ports(rtnl, &exposed);
4210                                 }
4211
4212                                 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
4213
4214                                 r = pty_forward_new(event, master, true, &forward);
4215                                 if (r < 0) {
4216                                         log_error_errno(r, "Failed to create PTY forwarder: %m");
4217                                         goto finish;
4218                                 }
4219
4220                                 r = sd_event_loop(event);
4221                                 if (r < 0) {
4222                                         log_error_errno(r, "Failed to run event loop: %m");
4223                                         goto finish;
4224                                 }
4225
4226                                 pty_forward_get_last_char(forward, &last_char);
4227
4228                                 forward = pty_forward_free(forward);
4229
4230                                 if (!arg_quiet && last_char != '\n')
4231                                         putc('\n', stdout);
4232
4233                                 /* Kill if it is not dead yet anyway */
4234                                 terminate_machine(pid);
4235                         }
4236                 }
4237
4238                 /* Normally redundant, but better safe than sorry */
4239                 kill(pid, SIGKILL);
4240
4241                 r = wait_for_container(pid, &container_status);
4242                 pid = 0;
4243
4244                 if (r < 0)
4245                         /* We failed to wait for the container, or the
4246                          * container exited abnormally */
4247                         goto finish;
4248                 else if (r > 0 || container_status == CONTAINER_TERMINATED){
4249                         /* The container exited with a non-zero
4250                          * status, or with zero status and no reboot
4251                          * was requested. */
4252                         ret = r;
4253                         break;
4254                 }
4255
4256                 /* CONTAINER_REBOOTED, loop again */
4257
4258                 if (arg_keep_unit) {
4259                         /* Special handling if we are running as a
4260                          * service: instead of simply restarting the
4261                          * machine we want to restart the entire
4262                          * service, so let's inform systemd about this
4263                          * with the special exit code 133. The service
4264                          * file uses RestartForceExitStatus=133 so
4265                          * that this results in a full nspawn
4266                          * restart. This is necessary since we might
4267                          * have cgroup parameters set we want to have
4268                          * flushed out. */
4269                         ret = 133;
4270                         r = 0;
4271                         break;
4272                 }
4273
4274                 flush_ports(&exposed);
4275         }
4276
4277 finish:
4278         sd_notify(false,
4279                   "STOPPING=1\n"
4280                   "STATUS=Terminating...");
4281
4282         loop_remove(loop_nr, &image_fd);
4283
4284         if (pid > 0)
4285                 kill(pid, SIGKILL);
4286
4287         if (remove_subvol && arg_directory) {
4288                 int k;
4289
4290                 k = btrfs_subvol_remove(arg_directory);
4291                 if (k < 0)
4292                         log_warning_errno(k, "Cannot remove subvolume '%s', ignoring: %m", arg_directory);
4293         }
4294
4295         if (arg_machine) {
4296                 const char *p;
4297
4298                 p = strappenda("/run/systemd/nspawn/propagate/", arg_machine);
4299                 (void) rm_rf(p, false, true, false);
4300         }
4301
4302         free(arg_directory);
4303         free(arg_template);
4304         free(arg_image);
4305         free(arg_machine);
4306         free(arg_user);
4307         strv_free(arg_setenv);
4308         strv_free(arg_network_interfaces);
4309         strv_free(arg_network_macvlan);
4310         strv_free(arg_network_ipvlan);
4311         strv_free(arg_bind);
4312         strv_free(arg_bind_ro);
4313         strv_free(arg_tmpfs);
4314
4315         flush_ports(&exposed);
4316
4317         while (arg_expose_ports) {
4318                 ExposePort *p = arg_expose_ports;
4319                 LIST_REMOVE(ports, arg_expose_ports, p);
4320                 free(p);
4321         }
4322
4323         return r < 0 ? EXIT_FAILURE : ret;
4324 }