chiark / gitweb /
nspawn: work around kernel bug with partition table probing on loopback devices
[elogind.git] / src / nspawn / nspawn.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4   This file is part of systemd.
5
6   Copyright 2010 Lennart Poettering
7
8   systemd is free software; you can redistribute it and/or modify it
9   under the terms of the GNU Lesser General Public License as published by
10   the Free Software Foundation; either version 2.1 of the License, or
11   (at your option) any later version.
12
13   systemd is distributed in the hope that it will be useful, but
14   WITHOUT ANY WARRANTY; without even the implied warranty of
15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16   Lesser General Public License for more details.
17
18   You should have received a copy of the GNU Lesser General Public License
19   along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <signal.h>
23 #include <sched.h>
24 #include <unistd.h>
25 #include <sys/types.h>
26 #include <sys/syscall.h>
27 #include <sys/mount.h>
28 #include <sys/wait.h>
29 #include <stdlib.h>
30 #include <string.h>
31 #include <stdio.h>
32 #include <errno.h>
33 #include <sys/prctl.h>
34 #include <getopt.h>
35 #include <termios.h>
36 #include <sys/signalfd.h>
37 #include <grp.h>
38 #include <linux/fs.h>
39 #include <sys/un.h>
40 #include <sys/socket.h>
41 #include <linux/netlink.h>
42 #include <net/if.h>
43 #include <linux/veth.h>
44 #include <sys/personality.h>
45 #include <linux/loop.h>
46 #include <poll.h>
47 #include <sys/file.h>
48
49 #ifdef HAVE_SELINUX
50 #include <selinux/selinux.h>
51 #endif
52
53 #ifdef HAVE_SECCOMP
54 #include <seccomp.h>
55 #endif
56
57 #ifdef HAVE_BLKID
58 #include <blkid/blkid.h>
59 #endif
60
61 #include "sd-daemon.h"
62 #include "sd-bus.h"
63 #include "sd-id128.h"
64 #include "sd-rtnl.h"
65 #include "log.h"
66 #include "util.h"
67 #include "mkdir.h"
68 #include "macro.h"
69 #include "audit.h"
70 #include "missing.h"
71 #include "cgroup-util.h"
72 #include "strv.h"
73 #include "path-util.h"
74 #include "loopback-setup.h"
75 #include "dev-setup.h"
76 #include "fdset.h"
77 #include "build.h"
78 #include "fileio.h"
79 #include "bus-util.h"
80 #include "bus-error.h"
81 #include "ptyfwd.h"
82 #include "bus-kernel.h"
83 #include "env-util.h"
84 #include "def.h"
85 #include "rtnl-util.h"
86 #include "udev-util.h"
87 #include "blkid-util.h"
88 #include "gpt.h"
89 #include "siphash24.h"
90 #include "copy.h"
91 #include "base-filesystem.h"
92 #include "barrier.h"
93 #include "event-util.h"
94 #include "capability.h"
95 #include "cap-list.h"
96 #include "btrfs-util.h"
97 #include "machine-image.h"
98 #include "list.h"
99 #include "in-addr-util.h"
100 #include "fw-util.h"
101 #include "local-addresses.h"
102
103 #ifdef HAVE_SECCOMP
104 #include "seccomp-util.h"
105 #endif
106
107 typedef struct ExposePort {
108         int protocol;
109         uint16_t host_port;
110         uint16_t container_port;
111         LIST_FIELDS(struct ExposePort, ports);
112 } ExposePort;
113
/* Container status codes.
 * NOTE(review): the code consuming these is outside this part of the file; going
 * by the names they distinguish a plain exit from a reboot request -- confirm. */
enum ContainerStatus {
        CONTAINER_TERMINATED = 0,
        CONTAINER_REBOOTED = 1
};

typedef enum ContainerStatus ContainerStatus;
118
/* Journal linking mode selected with --link-journal= (or -j). The "try-guest" and
 * "try-host" spellings map to LINK_GUEST/LINK_HOST plus arg_link_journal_try. */
enum LinkJournal {
        LINK_NO = 0,            /* --link-journal=no */
        LINK_AUTO = 1,          /* --link-journal=auto; also the built-in default */
        LINK_HOST = 2,          /* --link-journal=host / try-host */
        LINK_GUEST = 3          /* --link-journal=guest / try-guest / -j */
};

typedef enum LinkJournal LinkJournal;
125
/* Volatile mode selected with --volatile[=MODE]. */
enum Volatile {
        VOLATILE_NO = 0,        /* default, or --volatile=<false boolean> */
        VOLATILE_YES = 1,       /* bare --volatile, or --volatile=<true boolean> */
        VOLATILE_STATE = 2,     /* --volatile=state */
};

typedef enum Volatile Volatile;
131
132 static char *arg_directory = NULL;
133 static char *arg_template = NULL;
134 static char *arg_user = NULL;
135 static sd_id128_t arg_uuid = {};
136 static char *arg_machine = NULL;
137 static const char *arg_selinux_context = NULL;
138 static const char *arg_selinux_apifs_context = NULL;
139 static const char *arg_slice = NULL;
140 static bool arg_private_network = false;
141 static bool arg_read_only = false;
142 static bool arg_boot = false;
143 static bool arg_ephemeral = false;
144 static LinkJournal arg_link_journal = LINK_AUTO;
145 static bool arg_link_journal_try = false;
146 static uint64_t arg_retain =
147         (1ULL << CAP_CHOWN) |
148         (1ULL << CAP_DAC_OVERRIDE) |
149         (1ULL << CAP_DAC_READ_SEARCH) |
150         (1ULL << CAP_FOWNER) |
151         (1ULL << CAP_FSETID) |
152         (1ULL << CAP_IPC_OWNER) |
153         (1ULL << CAP_KILL) |
154         (1ULL << CAP_LEASE) |
155         (1ULL << CAP_LINUX_IMMUTABLE) |
156         (1ULL << CAP_NET_BIND_SERVICE) |
157         (1ULL << CAP_NET_BROADCAST) |
158         (1ULL << CAP_NET_RAW) |
159         (1ULL << CAP_SETGID) |
160         (1ULL << CAP_SETFCAP) |
161         (1ULL << CAP_SETPCAP) |
162         (1ULL << CAP_SETUID) |
163         (1ULL << CAP_SYS_ADMIN) |
164         (1ULL << CAP_SYS_CHROOT) |
165         (1ULL << CAP_SYS_NICE) |
166         (1ULL << CAP_SYS_PTRACE) |
167         (1ULL << CAP_SYS_TTY_CONFIG) |
168         (1ULL << CAP_SYS_RESOURCE) |
169         (1ULL << CAP_SYS_BOOT) |
170         (1ULL << CAP_AUDIT_WRITE) |
171         (1ULL << CAP_AUDIT_CONTROL) |
172         (1ULL << CAP_MKNOD);
173 static char **arg_bind = NULL;
174 static char **arg_bind_ro = NULL;
175 static char **arg_tmpfs = NULL;
176 static char **arg_setenv = NULL;
177 static bool arg_quiet = false;
178 static bool arg_share_system = false;
179 static bool arg_register = true;
180 static bool arg_keep_unit = false;
181 static char **arg_network_interfaces = NULL;
182 static char **arg_network_macvlan = NULL;
183 static char **arg_network_ipvlan = NULL;
184 static bool arg_network_veth = false;
185 static const char *arg_network_bridge = NULL;
186 static unsigned long arg_personality = 0xffffffffLU;
187 static char *arg_image = NULL;
188 static Volatile arg_volatile = VOLATILE_NO;
189 static ExposePort *arg_expose_ports = NULL;
190
191 static void help(void) {
192         printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
193                "Spawn a minimal namespace container for debugging, testing and building.\n\n"
194                "  -h --help                 Show this help\n"
195                "     --version              Print version string\n"
196                "  -q --quiet                Do not show status information\n"
197                "  -D --directory=PATH       Root directory for the container\n"
198                "     --template=PATH        Initialize root directory from template directory,\n"
199                "                            if missing\n"
200                "  -x --ephemeral            Run container with snapshot of root directory, and\n"
201                "                            remove it after exit\n"
202                "  -i --image=PATH           File system device or disk image for the container\n"
203                "  -b --boot                 Boot up full system (i.e. invoke init)\n"
204                "  -u --user=USER            Run the command under specified user or uid\n"
205                "  -M --machine=NAME         Set the machine name for the container\n"
206                "     --uuid=UUID            Set a specific machine UUID for the container\n"
207                "  -S --slice=SLICE          Place the container in the specified slice\n"
208                "     --private-network      Disable network in container\n"
209                "     --network-interface=INTERFACE\n"
210                "                            Assign an existing network interface to the\n"
211                "                            container\n"
212                "     --network-macvlan=INTERFACE\n"
213                "                            Create a macvlan network interface based on an\n"
214                "                            existing network interface to the container\n"
215                "     --network-ipvlan=INTERFACE\n"
216                "                            Create a ipvlan network interface based on an\n"
217                "                            existing network interface to the container\n"
218                "  -n --network-veth         Add a virtual ethernet connection between host\n"
219                "                            and container\n"
220                "     --network-bridge=INTERFACE\n"
221                "                            Add a virtual ethernet connection between host\n"
222                "                            and container and add it to an existing bridge on\n"
223                "                            the host\n"
224                "  -p --port=[PROTOCOL:]HOSTPORT[:CONTAINERPORT]\n"
225                "                            Expose a container IP port on the host\n"
226                "  -Z --selinux-context=SECLABEL\n"
227                "                            Set the SELinux security context to be used by\n"
228                "                            processes in the container\n"
229                "  -L --selinux-apifs-context=SECLABEL\n"
230                "                            Set the SELinux security context to be used by\n"
231                "                            API/tmpfs file systems in the container\n"
232                "     --capability=CAP       In addition to the default, retain specified\n"
233                "                            capability\n"
234                "     --drop-capability=CAP  Drop the specified capability from the default set\n"
235                "     --link-journal=MODE    Link up guest journal, one of no, auto, guest, host,\n"
236                "                            try-guest, try-host\n"
237                "  -j                        Equivalent to --link-journal=try-guest\n"
238                "     --read-only            Mount the root directory read-only\n"
239                "     --bind=PATH[:PATH]     Bind mount a file or directory from the host into\n"
240                "                            the container\n"
241                "     --bind-ro=PATH[:PATH]  Similar, but creates a read-only bind mount\n"
242                "     --tmpfs=PATH:[OPTIONS] Mount an empty tmpfs to the specified directory\n"
243                "     --setenv=NAME=VALUE    Pass an environment variable to PID 1\n"
244                "     --share-system         Share system namespaces with host\n"
245                "     --register=BOOLEAN     Register container as machine\n"
246                "     --keep-unit            Do not register a scope for the machine, reuse\n"
247                "                            the service unit nspawn is running in\n"
248                "     --volatile[=MODE]      Run the system in volatile mode\n"
249                , program_invocation_short_name);
250 }
251
/* Replaces *b with a cleaned-up, absolute form of "path".
 *
 * The path is first resolved with canonicalize_file_name(). If that fails
 * because the path does not exist (ENOENT), it is instead made absolute
 * relative to the current working directory. The previous value of *b is
 * freed, and *b is set to what path_kill_slashes() returns for the new string.
 *
 * Returns 0 on success, the negated errno of canonicalize_file_name() for any
 * failure other than ENOENT, or -ENOMEM if path_make_absolute_cwd() returns
 * NULL. On failure *b is left untouched. */
static int set_sanitized_path(char **b, const char *path) {
        char *resolved;

        assert(b);
        assert(path);

        resolved = canonicalize_file_name(path);

        /* Anything but "does not exist" is a hard error. */
        if (!resolved && errno != ENOENT)
                return -errno;

        if (!resolved) {
                /* Not there (yet): settle for an absolute, unresolved path. */
                resolved = path_make_absolute_cwd(path);
                if (!resolved)
                        return -ENOMEM;
        }

        free(*b);
        *b = path_kill_slashes(resolved);

        return 0;
}
272
273 static int parse_argv(int argc, char *argv[]) {
274
275         enum {
276                 ARG_VERSION = 0x100,
277                 ARG_PRIVATE_NETWORK,
278                 ARG_UUID,
279                 ARG_READ_ONLY,
280                 ARG_CAPABILITY,
281                 ARG_DROP_CAPABILITY,
282                 ARG_LINK_JOURNAL,
283                 ARG_BIND,
284                 ARG_BIND_RO,
285                 ARG_TMPFS,
286                 ARG_SETENV,
287                 ARG_SHARE_SYSTEM,
288                 ARG_REGISTER,
289                 ARG_KEEP_UNIT,
290                 ARG_NETWORK_INTERFACE,
291                 ARG_NETWORK_MACVLAN,
292                 ARG_NETWORK_IPVLAN,
293                 ARG_NETWORK_BRIDGE,
294                 ARG_PERSONALITY,
295                 ARG_VOLATILE,
296                 ARG_TEMPLATE,
297         };
298
299         static const struct option options[] = {
300                 { "help",                  no_argument,       NULL, 'h'                   },
301                 { "version",               no_argument,       NULL, ARG_VERSION           },
302                 { "directory",             required_argument, NULL, 'D'                   },
303                 { "template",              required_argument, NULL, ARG_TEMPLATE          },
304                 { "ephemeral",             no_argument,       NULL, 'x'                   },
305                 { "user",                  required_argument, NULL, 'u'                   },
306                 { "private-network",       no_argument,       NULL, ARG_PRIVATE_NETWORK   },
307                 { "boot",                  no_argument,       NULL, 'b'                   },
308                 { "uuid",                  required_argument, NULL, ARG_UUID              },
309                 { "read-only",             no_argument,       NULL, ARG_READ_ONLY         },
310                 { "capability",            required_argument, NULL, ARG_CAPABILITY        },
311                 { "drop-capability",       required_argument, NULL, ARG_DROP_CAPABILITY   },
312                 { "link-journal",          required_argument, NULL, ARG_LINK_JOURNAL      },
313                 { "bind",                  required_argument, NULL, ARG_BIND              },
314                 { "bind-ro",               required_argument, NULL, ARG_BIND_RO           },
315                 { "tmpfs",                 required_argument, NULL, ARG_TMPFS             },
316                 { "machine",               required_argument, NULL, 'M'                   },
317                 { "slice",                 required_argument, NULL, 'S'                   },
318                 { "setenv",                required_argument, NULL, ARG_SETENV            },
319                 { "selinux-context",       required_argument, NULL, 'Z'                   },
320                 { "selinux-apifs-context", required_argument, NULL, 'L'                   },
321                 { "quiet",                 no_argument,       NULL, 'q'                   },
322                 { "share-system",          no_argument,       NULL, ARG_SHARE_SYSTEM      },
323                 { "register",              required_argument, NULL, ARG_REGISTER          },
324                 { "keep-unit",             no_argument,       NULL, ARG_KEEP_UNIT         },
325                 { "network-interface",     required_argument, NULL, ARG_NETWORK_INTERFACE },
326                 { "network-macvlan",       required_argument, NULL, ARG_NETWORK_MACVLAN   },
327                 { "network-ipvlan",        required_argument, NULL, ARG_NETWORK_IPVLAN    },
328                 { "network-veth",          no_argument,       NULL, 'n'                   },
329                 { "network-bridge",        required_argument, NULL, ARG_NETWORK_BRIDGE    },
330                 { "personality",           required_argument, NULL, ARG_PERSONALITY       },
331                 { "image",                 required_argument, NULL, 'i'                   },
332                 { "volatile",              optional_argument, NULL, ARG_VOLATILE          },
333                 { "port",                  required_argument, NULL, 'p'                   },
334                 {}
335         };
336
337         int c, r;
338         uint64_t plus = 0, minus = 0;
339
340         assert(argc >= 0);
341         assert(argv);
342
343         while ((c = getopt_long(argc, argv, "+hD:u:bL:M:jS:Z:qi:xp:n", options, NULL)) >= 0)
344
345                 switch (c) {
346
347                 case 'h':
348                         help();
349                         return 0;
350
351                 case ARG_VERSION:
352                         puts(PACKAGE_STRING);
353                         puts(SYSTEMD_FEATURES);
354                         return 0;
355
356                 case 'D':
357                         r = set_sanitized_path(&arg_directory, optarg);
358                         if (r < 0)
359                                 return log_error_errno(r, "Invalid root directory: %m");
360
361                         break;
362
363                 case ARG_TEMPLATE:
364                         r = set_sanitized_path(&arg_template, optarg);
365                         if (r < 0)
366                                 return log_error_errno(r, "Invalid template directory: %m");
367
368                         break;
369
370                 case 'i':
371                         r = set_sanitized_path(&arg_image, optarg);
372                         if (r < 0)
373                                 return log_error_errno(r, "Invalid image path: %m");
374
375                         break;
376
377                 case 'x':
378                         arg_ephemeral = true;
379                         break;
380
381                 case 'u':
382                         free(arg_user);
383                         arg_user = strdup(optarg);
384                         if (!arg_user)
385                                 return log_oom();
386
387                         break;
388
389                 case ARG_NETWORK_BRIDGE:
390                         arg_network_bridge = optarg;
391
392                         /* fall through */
393
394                 case 'n':
395                         arg_network_veth = true;
396                         arg_private_network = true;
397                         break;
398
399                 case ARG_NETWORK_INTERFACE:
400                         if (strv_extend(&arg_network_interfaces, optarg) < 0)
401                                 return log_oom();
402
403                         arg_private_network = true;
404                         break;
405
406                 case ARG_NETWORK_MACVLAN:
407                         if (strv_extend(&arg_network_macvlan, optarg) < 0)
408                                 return log_oom();
409
410                         arg_private_network = true;
411                         break;
412
413                 case ARG_NETWORK_IPVLAN:
414                         if (strv_extend(&arg_network_ipvlan, optarg) < 0)
415                                 return log_oom();
416
417                         /* fall through */
418
419                 case ARG_PRIVATE_NETWORK:
420                         arg_private_network = true;
421                         break;
422
423                 case 'b':
424                         arg_boot = true;
425                         break;
426
427                 case ARG_UUID:
428                         r = sd_id128_from_string(optarg, &arg_uuid);
429                         if (r < 0) {
430                                 log_error("Invalid UUID: %s", optarg);
431                                 return r;
432                         }
433                         break;
434
435                 case 'S':
436                         arg_slice = optarg;
437                         break;
438
439                 case 'M':
440                         if (isempty(optarg)) {
441                                 free(arg_machine);
442                                 arg_machine = NULL;
443                         } else {
444                                 if (!machine_name_is_valid(optarg)) {
445                                         log_error("Invalid machine name: %s", optarg);
446                                         return -EINVAL;
447                                 }
448
449                                 r = free_and_strdup(&arg_machine, optarg);
450                                 if (r < 0)
451                                         return log_oom();
452
453                                 break;
454                         }
455
456                 case 'Z':
457                         arg_selinux_context = optarg;
458                         break;
459
460                 case 'L':
461                         arg_selinux_apifs_context = optarg;
462                         break;
463
464                 case ARG_READ_ONLY:
465                         arg_read_only = true;
466                         break;
467
468                 case ARG_CAPABILITY:
469                 case ARG_DROP_CAPABILITY: {
470                         const char *state, *word;
471                         size_t length;
472
473                         FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
474                                 _cleanup_free_ char *t;
475
476                                 t = strndup(word, length);
477                                 if (!t)
478                                         return log_oom();
479
480                                 if (streq(t, "all")) {
481                                         if (c == ARG_CAPABILITY)
482                                                 plus = (uint64_t) -1;
483                                         else
484                                                 minus = (uint64_t) -1;
485                                 } else {
486                                         int cap;
487
488                                         cap = capability_from_name(t);
489                                         if (cap < 0) {
490                                                 log_error("Failed to parse capability %s.", t);
491                                                 return -EINVAL;
492                                         }
493
494                                         if (c == ARG_CAPABILITY)
495                                                 plus |= 1ULL << (uint64_t) cap;
496                                         else
497                                                 minus |= 1ULL << (uint64_t) cap;
498                                 }
499                         }
500
501                         break;
502                 }
503
504                 case 'j':
505                         arg_link_journal = LINK_GUEST;
506                         arg_link_journal_try = true;
507                         break;
508
509                 case ARG_LINK_JOURNAL:
510                         if (streq(optarg, "auto")) {
511                                 arg_link_journal = LINK_AUTO;
512                                 arg_link_journal_try = false;
513                         } else if (streq(optarg, "no")) {
514                                 arg_link_journal = LINK_NO;
515                                 arg_link_journal_try = false;
516                         } else if (streq(optarg, "guest")) {
517                                 arg_link_journal = LINK_GUEST;
518                                 arg_link_journal_try = false;
519                         } else if (streq(optarg, "host")) {
520                                 arg_link_journal = LINK_HOST;
521                                 arg_link_journal_try = false;
522                         } else if (streq(optarg, "try-guest")) {
523                                 arg_link_journal = LINK_GUEST;
524                                 arg_link_journal_try = true;
525                         } else if (streq(optarg, "try-host")) {
526                                 arg_link_journal = LINK_HOST;
527                                 arg_link_journal_try = true;
528                         } else {
529                                 log_error("Failed to parse link journal mode %s", optarg);
530                                 return -EINVAL;
531                         }
532
533                         break;
534
535                 case ARG_BIND:
536                 case ARG_BIND_RO: {
537                         _cleanup_free_ char *a = NULL, *b = NULL;
538                         char *e;
539                         char ***x;
540
541                         x = c == ARG_BIND ? &arg_bind : &arg_bind_ro;
542
543                         e = strchr(optarg, ':');
544                         if (e) {
545                                 a = strndup(optarg, e - optarg);
546                                 b = strdup(e + 1);
547                         } else {
548                                 a = strdup(optarg);
549                                 b = strdup(optarg);
550                         }
551
552                         if (!a || !b)
553                                 return log_oom();
554
555                         if (!path_is_absolute(a) || !path_is_absolute(b)) {
556                                 log_error("Invalid bind mount specification: %s", optarg);
557                                 return -EINVAL;
558                         }
559
560                         r = strv_extend(x, a);
561                         if (r < 0)
562                                 return log_oom();
563
564                         r = strv_extend(x, b);
565                         if (r < 0)
566                                 return log_oom();
567
568                         break;
569                 }
570
571                 case ARG_TMPFS: {
572                         _cleanup_free_ char *a = NULL, *b = NULL;
573                         char *e;
574
575                         e = strchr(optarg, ':');
576                         if (e) {
577                                 a = strndup(optarg, e - optarg);
578                                 b = strdup(e + 1);
579                         } else {
580                                 a = strdup(optarg);
581                                 b = strdup("mode=0755");
582                         }
583
584                         if (!a || !b)
585                                 return log_oom();
586
587                         if (!path_is_absolute(a)) {
588                                 log_error("Invalid tmpfs specification: %s", optarg);
589                                 return -EINVAL;
590                         }
591
592                         r = strv_push(&arg_tmpfs, a);
593                         if (r < 0)
594                                 return log_oom();
595
596                         a = NULL;
597
598                         r = strv_push(&arg_tmpfs, b);
599                         if (r < 0)
600                                 return log_oom();
601
602                         b = NULL;
603
604                         break;
605                 }
606
607                 case ARG_SETENV: {
608                         char **n;
609
610                         if (!env_assignment_is_valid(optarg)) {
611                                 log_error("Environment variable assignment '%s' is not valid.", optarg);
612                                 return -EINVAL;
613                         }
614
615                         n = strv_env_set(arg_setenv, optarg);
616                         if (!n)
617                                 return log_oom();
618
619                         strv_free(arg_setenv);
620                         arg_setenv = n;
621                         break;
622                 }
623
624                 case 'q':
625                         arg_quiet = true;
626                         break;
627
628                 case ARG_SHARE_SYSTEM:
629                         arg_share_system = true;
630                         break;
631
632                 case ARG_REGISTER:
633                         r = parse_boolean(optarg);
634                         if (r < 0) {
635                                 log_error("Failed to parse --register= argument: %s", optarg);
636                                 return r;
637                         }
638
639                         arg_register = r;
640                         break;
641
642                 case ARG_KEEP_UNIT:
643                         arg_keep_unit = true;
644                         break;
645
646                 case ARG_PERSONALITY:
647
648                         arg_personality = personality_from_string(optarg);
649                         if (arg_personality == 0xffffffffLU) {
650                                 log_error("Unknown or unsupported personality '%s'.", optarg);
651                                 return -EINVAL;
652                         }
653
654                         break;
655
656                 case ARG_VOLATILE:
657
658                         if (!optarg)
659                                 arg_volatile = VOLATILE_YES;
660                         else {
661                                 r = parse_boolean(optarg);
662                                 if (r < 0) {
663                                         if (streq(optarg, "state"))
664                                                 arg_volatile = VOLATILE_STATE;
665                                         else {
666                                                 log_error("Failed to parse --volatile= argument: %s", optarg);
667                                                 return r;
668                                         }
669                                 } else
670                                         arg_volatile = r ? VOLATILE_YES : VOLATILE_NO;
671                         }
672
673                         break;
674
675                 case 'p': {
676                         const char *split, *e;
677                         uint16_t container_port, host_port;
678                         int protocol;
679                         ExposePort *p;
680
681                         if ((e = startswith(optarg, "tcp:")))
682                                 protocol = IPPROTO_TCP;
683                         else if ((e = startswith(optarg, "udp:")))
684                                 protocol = IPPROTO_UDP;
685                         else {
686                                 e = optarg;
687                                 protocol = IPPROTO_TCP;
688                         }
689
690                         split = strchr(e, ':');
691                         if (split) {
692                                 char v[split - e + 1];
693
694                                 memcpy(v, e, split - e);
695                                 v[split - e] = 0;
696
697                                 r = safe_atou16(v, &host_port);
698                                 if (r < 0 || host_port <= 0) {
699                                         log_error("Failed to parse host port: %s", optarg);
700                                         return -EINVAL;
701                                 }
702
703                                 r = safe_atou16(split + 1, &container_port);
704                         } else {
705                                 r = safe_atou16(e, &container_port);
706                                 host_port = container_port;
707                         }
708
709                         if (r < 0 || container_port <= 0) {
710                                 log_error("Failed to parse host port: %s", optarg);
711                                 return -EINVAL;
712                         }
713
714                         LIST_FOREACH(ports, p, arg_expose_ports) {
715                                 if (p->protocol == protocol && p->host_port == host_port) {
716                                         log_error("Duplicate port specification: %s", optarg);
717                                         return -EINVAL;
718                                 }
719                         }
720
721                         p = new(ExposePort, 1);
722                         if (!p)
723                                 return log_oom();
724
725                         p->protocol = protocol;
726                         p->host_port = host_port;
727                         p->container_port = container_port;
728
729                         LIST_PREPEND(ports, arg_expose_ports, p);
730
731                         break;
732                 }
733
734                 case '?':
735                         return -EINVAL;
736
737                 default:
738                         assert_not_reached("Unhandled option");
739                 }
740
741         if (arg_share_system)
742                 arg_register = false;
743
744         if (arg_boot && arg_share_system) {
745                 log_error("--boot and --share-system may not be combined.");
746                 return -EINVAL;
747         }
748
749         if (arg_keep_unit && cg_pid_get_owner_uid(0, NULL) >= 0) {
750                 log_error("--keep-unit may not be used when invoked from a user session.");
751                 return -EINVAL;
752         }
753
754         if (arg_directory && arg_image) {
755                 log_error("--directory= and --image= may not be combined.");
756                 return -EINVAL;
757         }
758
759         if (arg_template && arg_image) {
760                 log_error("--template= and --image= may not be combined.");
761                 return -EINVAL;
762         }
763
764         if (arg_template && !(arg_directory || arg_machine)) {
765                 log_error("--template= needs --directory= or --machine=.");
766                 return -EINVAL;
767         }
768
769         if (arg_ephemeral && arg_template) {
770                 log_error("--ephemeral and --template= may not be combined.");
771                 return -EINVAL;
772         }
773
774         if (arg_ephemeral && arg_image) {
775                 log_error("--ephemeral and --image= may not be combined.");
776                 return -EINVAL;
777         }
778
779         if (arg_ephemeral && !IN_SET(arg_link_journal, LINK_NO, LINK_AUTO)) {
780                 log_error("--ephemeral and --link-journal= may not be combined.");
781                 return -EINVAL;
782         }
783
784         if (arg_volatile != VOLATILE_NO && arg_read_only) {
785                 log_error("Cannot combine --read-only with --volatile. Note that --volatile already implies a read-only base hierarchy.");
786                 return -EINVAL;
787         }
788
789         if (arg_expose_ports && !arg_private_network) {
790                 log_error("Cannot use --port= without private networking.");
791                 return -EINVAL;
792         }
793
794         arg_retain = (arg_retain | plus | (arg_private_network ? 1ULL << CAP_NET_ADMIN : 0)) & ~minus;
795
796         return 1;
797 }
798
799 static int mount_all(const char *dest) {
800
801         typedef struct MountPoint {
802                 const char *what;
803                 const char *where;
804                 const char *type;
805                 const char *options;
806                 unsigned long flags;
807                 bool fatal;
808         } MountPoint;
809
810         static const MountPoint mount_table[] = {
811                 { "proc",      "/proc",     "proc",  NULL,        MS_NOSUID|MS_NOEXEC|MS_NODEV,           true  },
812                 { "/proc/sys", "/proc/sys", NULL,    NULL,        MS_BIND,                                true  },   /* Bind mount first */
813                 { NULL,        "/proc/sys", NULL,    NULL,        MS_BIND|MS_RDONLY|MS_REMOUNT,           true  },   /* Then, make it r/o */
814                 { "sysfs",     "/sys",      "sysfs", NULL,        MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true  },
815                 { "tmpfs",     "/dev",      "tmpfs", "mode=755",  MS_NOSUID|MS_STRICTATIME,               true  },
816                 { "devpts",    "/dev/pts",  "devpts","newinstance,ptmxmode=0666,mode=620,gid=" STRINGIFY(TTY_GID), MS_NOSUID|MS_NOEXEC, true },
817                 { "tmpfs",     "/dev/shm",  "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME,      true  },
818                 { "tmpfs",     "/run",      "tmpfs", "mode=755",  MS_NOSUID|MS_NODEV|MS_STRICTATIME,      true  },
819 #ifdef HAVE_SELINUX
820                 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND,                              false },  /* Bind mount first */
821                 { NULL,              "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT,         false },  /* Then, make it r/o */
822 #endif
823         };
824
825         unsigned k;
826         int r = 0;
827
828         for (k = 0; k < ELEMENTSOF(mount_table); k++) {
829                 _cleanup_free_ char *where = NULL;
830 #ifdef HAVE_SELINUX
831                 _cleanup_free_ char *options = NULL;
832 #endif
833                 const char *o;
834                 int t;
835
836                 where = strjoin(dest, "/", mount_table[k].where, NULL);
837                 if (!where)
838                         return log_oom();
839
840                 t = path_is_mount_point(where, true);
841                 if (t < 0) {
842                         log_error_errno(t, "Failed to detect whether %s is a mount point: %m", where);
843
844                         if (r == 0)
845                                 r = t;
846
847                         continue;
848                 }
849
850                 /* Skip this entry if it is not a remount. */
851                 if (mount_table[k].what && t > 0)
852                         continue;
853
854                 t = mkdir_p(where, 0755);
855                 if (t < 0) {
856                         if (mount_table[k].fatal) {
857                                log_error_errno(t, "Failed to create directory %s: %m", where);
858
859                                 if (r == 0)
860                                         r = t;
861                         } else
862                                log_warning_errno(t, "Failed to create directory %s: %m", where);
863
864                         continue;
865                 }
866
867 #ifdef HAVE_SELINUX
868                 if (arg_selinux_apifs_context &&
869                     (streq_ptr(mount_table[k].what, "tmpfs") || streq_ptr(mount_table[k].what, "devpts"))) {
870                         options = strjoin(mount_table[k].options, ",context=\"", arg_selinux_apifs_context, "\"", NULL);
871                         if (!options)
872                                 return log_oom();
873
874                         o = options;
875                 } else
876 #endif
877                         o = mount_table[k].options;
878
879
880                 if (mount(mount_table[k].what,
881                           where,
882                           mount_table[k].type,
883                           mount_table[k].flags,
884                           o) < 0) {
885
886                         if (mount_table[k].fatal) {
887                                 log_error_errno(errno, "mount(%s) failed: %m", where);
888
889                                 if (r == 0)
890                                         r = -errno;
891                         } else
892                                 log_warning_errno(errno, "mount(%s) failed: %m", where);
893                 }
894         }
895
896         return r;
897 }
898
899 static int mount_binds(const char *dest, char **l, bool ro) {
900         char **x, **y;
901
902         STRV_FOREACH_PAIR(x, y, l) {
903                 _cleanup_free_ char *where = NULL;
904                 struct stat source_st, dest_st;
905                 int r;
906
907                 if (stat(*x, &source_st) < 0)
908                         return log_error_errno(errno, "Failed to stat %s: %m", *x);
909
910                 where = strappend(dest, *y);
911                 if (!where)
912                         return log_oom();
913
914                 r = stat(where, &dest_st);
915                 if (r == 0) {
916                         if ((source_st.st_mode & S_IFMT) != (dest_st.st_mode & S_IFMT)) {
917                                 log_error("The file types of %s and %s do not match. Refusing bind mount", *x, where);
918                                 return -EINVAL;
919                         }
920                 } else if (errno == ENOENT) {
921                         r = mkdir_parents_label(where, 0755);
922                         if (r < 0)
923                                 return log_error_errno(r, "Failed to bind mount %s: %m", *x);
924                 } else {
925                         log_error_errno(errno, "Failed to bind mount %s: %m", *x);
926                         return -errno;
927                 }
928
929                 /* Create the mount point, but be conservative -- refuse to create block
930                  * and char devices. */
931                 if (S_ISDIR(source_st.st_mode)) {
932                         r = mkdir_label(where, 0755);
933                         if (r < 0 && errno != EEXIST)
934                                 return log_error_errno(r, "Failed to create mount point %s: %m", where);
935                 } else if (S_ISFIFO(source_st.st_mode)) {
936                         r = mkfifo(where, 0644);
937                         if (r < 0 && errno != EEXIST)
938                                 return log_error_errno(errno, "Failed to create mount point %s: %m", where);
939                 } else if (S_ISSOCK(source_st.st_mode)) {
940                         r = mknod(where, 0644 | S_IFSOCK, 0);
941                         if (r < 0 && errno != EEXIST)
942                                 return log_error_errno(errno, "Failed to create mount point %s: %m", where);
943                 } else if (S_ISREG(source_st.st_mode)) {
944                         r = touch(where);
945                         if (r < 0)
946                                 return log_error_errno(r, "Failed to create mount point %s: %m", where);
947                 } else {
948                         log_error("Refusing to create mountpoint for file: %s", *x);
949                         return -ENOTSUP;
950                 }
951
952                 if (mount(*x, where, "bind", MS_BIND, NULL) < 0)
953                         return log_error_errno(errno, "mount(%s) failed: %m", where);
954
955                 if (ro) {
956                         r = bind_remount_recursive(where, true);
957                         if (r < 0)
958                                 return log_error_errno(r, "Read-Only bind mount failed: %m");
959                 }
960         }
961
962         return 0;
963 }
964
/* Mounts a single cgroup hierarchy at <dest>/sys/fs/cgroup/<hierarchy>,
 * passing controller as the mount options. If read_only is set the
 * hierarchy is mounted with MS_RDONLY.
 *
 * Returns 1 if the hierarchy was mounted, 0 if the target already is a
 * mount point (nothing is done then), and a negative errno-style value
 * on failure. */
static int mount_cgroup_hierarchy(const char *dest, const char *controller, const char *hierarchy, bool read_only) {
        unsigned long mount_flags = MS_NOSUID|MS_NOEXEC|MS_NODEV;
        char *mount_point;
        int mounted;

        mount_point = strappenda(dest, "/sys/fs/cgroup/", hierarchy);

        mounted = path_is_mount_point(mount_point, false);
        if (mounted < 0)
                return log_error_errno(mounted, "Failed to determine if %s is mounted already: %m", mount_point);
        if (mounted > 0)
                return 0;

        if (read_only)
                mount_flags |= MS_RDONLY;

        /* The result is deliberately not checked here; if the
         * directory is missing the mount() below fails and reports
         * the problem. */
        mkdir_p(mount_point, 0755);

        if (mount("cgroup", mount_point, "cgroup", mount_flags, controller) < 0)
                return log_error_errno(errno, "Failed to mount to %s: %m", mount_point);

        return 1;
}
984
985 static int mount_cgroup(const char *dest) {
986         _cleanup_set_free_free_ Set *controllers = NULL;
987         _cleanup_free_ char *own_cgroup_path = NULL;
988         const char *cgroup_root, *systemd_root, *systemd_own;
989         int r;
990
991         controllers = set_new(&string_hash_ops);
992         if (!controllers)
993                 return log_oom();
994
995         r = cg_kernel_controllers(controllers);
996         if (r < 0)
997                 return log_error_errno(r, "Failed to determine cgroup controllers: %m");
998
999         r = cg_pid_get_path(NULL, 0, &own_cgroup_path);
1000         if (r < 0)
1001                 return log_error_errno(r, "Failed to determine our own cgroup path: %m");
1002
1003         cgroup_root = strappenda(dest, "/sys/fs/cgroup");
1004         if (mount("tmpfs", cgroup_root, "tmpfs", MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME, "mode=755") < 0)
1005                 return log_error_errno(errno, "Failed to mount tmpfs to /sys/fs/cgroup: %m");
1006
1007         for (;;) {
1008                 _cleanup_free_ char *controller = NULL, *origin = NULL, *combined = NULL;
1009
1010                 controller = set_steal_first(controllers);
1011                 if (!controller)
1012                         break;
1013
1014                 origin = strappend("/sys/fs/cgroup/", controller);
1015                 if (!origin)
1016                         return log_oom();
1017
1018                 r = readlink_malloc(origin, &combined);
1019                 if (r == -EINVAL) {
1020                         /* Not a symbolic link, but directly a single cgroup hierarchy */
1021
1022                         r = mount_cgroup_hierarchy(dest, controller, controller, true);
1023                         if (r < 0)
1024                                 return r;
1025
1026                 } else if (r < 0)
1027                         return log_error_errno(r, "Failed to read link %s: %m", origin);
1028                 else {
1029                         _cleanup_free_ char *target = NULL;
1030
1031                         target = strjoin(dest, "/sys/fs/cgroup/", controller, NULL);
1032                         if (!target)
1033                                 return log_oom();
1034
1035                         /* A symbolic link, a combination of controllers in one hierarchy */
1036
1037                         if (!filename_is_valid(combined)) {
1038                                 log_warning("Ignoring invalid combined hierarchy %s.", combined);
1039                                 continue;
1040                         }
1041
1042                         r = mount_cgroup_hierarchy(dest, combined, combined, true);
1043                         if (r < 0)
1044                                 return r;
1045
1046                         if (symlink(combined, target) < 0)
1047                                 return log_error_errno(errno, "Failed to create symlink for combined hierarchy: %m");
1048                 }
1049         }
1050
1051         r = mount_cgroup_hierarchy(dest, "name=systemd", "systemd", false);
1052         if (r < 0)
1053                 return r;
1054
1055         /* Make our own cgroup a (writable) bind mount */
1056         systemd_own = strappenda(dest, "/sys/fs/cgroup/systemd", own_cgroup_path);
1057         if (mount(systemd_own, systemd_own,  NULL, MS_BIND, NULL) < 0)
1058                 return log_error_errno(errno, "Failed to turn %s into a bind mount: %m", own_cgroup_path);
1059
1060         /* And then remount the systemd cgroup root read-only */
1061         systemd_root = strappenda(dest, "/sys/fs/cgroup/systemd");
1062         if (mount(NULL, systemd_root, NULL, MS_BIND|MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_RDONLY, NULL) < 0)
1063                 return log_error_errno(errno, "Failed to mount cgroup root read-only: %m");
1064
1065         if (mount(NULL, cgroup_root, NULL, MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME|MS_RDONLY, "mode=755") < 0)
1066                 return log_error_errno(errno, "Failed to remount %s read-only: %m", cgroup_root);
1067
1068         return 0;
1069 }
1070
1071 static int mount_tmpfs(const char *dest) {
1072         char **i, **o;
1073
1074         STRV_FOREACH_PAIR(i, o, arg_tmpfs) {
1075                 _cleanup_free_ char *where = NULL;
1076                 int r;
1077
1078                 where = strappend(dest, *i);
1079                 if (!where)
1080                         return log_oom();
1081
1082                 r = mkdir_label(where, 0755);
1083                 if (r < 0 && r != -EEXIST)
1084                         return log_error_errno(r, "Creating mount point for tmpfs %s failed: %m", where);
1085
1086                 if (mount("tmpfs", where, "tmpfs", MS_NODEV|MS_STRICTATIME, *o) < 0)
1087                         return log_error_errno(errno, "tmpfs mount to %s failed: %m", where);
1088         }
1089
1090         return 0;
1091 }
1092
1093 static int setup_timezone(const char *dest) {
1094         _cleanup_free_ char *where = NULL, *p = NULL, *q = NULL, *check = NULL, *what = NULL;
1095         char *z, *y;
1096         int r;
1097
1098         assert(dest);
1099
1100         /* Fix the timezone, if possible */
1101         r = readlink_malloc("/etc/localtime", &p);
1102         if (r < 0) {
1103                 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
1104                 return 0;
1105         }
1106
1107         z = path_startswith(p, "../usr/share/zoneinfo/");
1108         if (!z)
1109                 z = path_startswith(p, "/usr/share/zoneinfo/");
1110         if (!z) {
1111                 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
1112                 return 0;
1113         }
1114
1115         where = strappend(dest, "/etc/localtime");
1116         if (!where)
1117                 return log_oom();
1118
1119         r = readlink_malloc(where, &q);
1120         if (r >= 0) {
1121                 y = path_startswith(q, "../usr/share/zoneinfo/");
1122                 if (!y)
1123                         y = path_startswith(q, "/usr/share/zoneinfo/");
1124
1125                 /* Already pointing to the right place? Then do nothing .. */
1126                 if (y && streq(y, z))
1127                         return 0;
1128         }
1129
1130         check = strjoin(dest, "/usr/share/zoneinfo/", z, NULL);
1131         if (!check)
1132                 return log_oom();
1133
1134         if (access(check, F_OK) < 0) {
1135                 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
1136                 return 0;
1137         }
1138
1139         what = strappend("../usr/share/zoneinfo/", z);
1140         if (!what)
1141                 return log_oom();
1142
1143         r = mkdir_parents(where, 0755);
1144         if (r < 0) {
1145                 log_error_errno(r, "Failed to create directory for timezone info %s in container: %m", where);
1146
1147                 return 0;
1148         }
1149
1150         r = unlink(where);
1151         if (r < 0 && errno != ENOENT) {
1152                 log_error_errno(errno, "Failed to remove existing timezone info %s in container: %m", where);
1153
1154                 return 0;
1155         }
1156
1157         if (symlink(what, where) < 0) {
1158                 log_error_errno(errno, "Failed to correct timezone of container: %m");
1159                 return 0;
1160         }
1161
1162         return 0;
1163 }
1164
1165 static int setup_resolv_conf(const char *dest) {
1166         _cleanup_free_ char *where = NULL;
1167         int r;
1168
1169         assert(dest);
1170
1171         if (arg_private_network)
1172                 return 0;
1173
1174         /* Fix resolv.conf, if possible */
1175         where = strappend(dest, "/etc/resolv.conf");
1176         if (!where)
1177                 return log_oom();
1178
1179         /* We don't really care for the results of this really. If it
1180          * fails, it fails, but meh... */
1181         r = mkdir_parents(where, 0755);
1182         if (r < 0) {
1183                 log_warning_errno(r, "Failed to create parent directory for resolv.conf %s: %m", where);
1184
1185                 return 0;
1186         }
1187
1188         r = copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW, 0644, 0);
1189         if (r < 0) {
1190                 log_warning_errno(r, "Failed to copy /etc/resolv.conf to %s: %m", where);
1191
1192                 return 0;
1193         }
1194
1195         return 0;
1196 }
1197
1198 static int setup_volatile_state(const char *directory) {
1199         const char *p;
1200         int r;
1201
1202         assert(directory);
1203
1204         if (arg_volatile != VOLATILE_STATE)
1205                 return 0;
1206
1207         /* --volatile=state means we simply overmount /var
1208            with a tmpfs, and the rest read-only. */
1209
1210         r = bind_remount_recursive(directory, true);
1211         if (r < 0)
1212                 return log_error_errno(r, "Failed to remount %s read-only: %m", directory);
1213
1214         p = strappenda(directory, "/var");
1215         r = mkdir(p, 0755);
1216         if (r < 0 && errno != EEXIST)
1217                 return log_error_errno(errno, "Failed to create %s: %m", directory);
1218
1219         if (mount("tmpfs", p, "tmpfs", MS_STRICTATIME, "mode=755") < 0)
1220                 return log_error_errno(errno, "Failed to mount tmpfs to /var: %m");
1221
1222         return 0;
1223 }
1224
1225 static int setup_volatile(const char *directory) {
1226         bool tmpfs_mounted = false, bind_mounted = false;
1227         char template[] = "/tmp/nspawn-volatile-XXXXXX";
1228         const char *f, *t;
1229         int r;
1230
1231         assert(directory);
1232
1233         if (arg_volatile != VOLATILE_YES)
1234                 return 0;
1235
1236         /* --volatile=yes means we mount a tmpfs to the root dir, and
1237            the original /usr to use inside it, and that read-only. */
1238
1239         if (!mkdtemp(template))
1240                 return log_error_errno(errno, "Failed to create temporary directory: %m");
1241
1242         if (mount("tmpfs", template, "tmpfs", MS_STRICTATIME, "mode=755") < 0) {
1243                 log_error_errno(errno, "Failed to mount tmpfs for root directory: %m");
1244                 r = -errno;
1245                 goto fail;
1246         }
1247
1248         tmpfs_mounted = true;
1249
1250         f = strappenda(directory, "/usr");
1251         t = strappenda(template, "/usr");
1252
1253         r = mkdir(t, 0755);
1254         if (r < 0 && errno != EEXIST) {
1255                 log_error_errno(errno, "Failed to create %s: %m", t);
1256                 r = -errno;
1257                 goto fail;
1258         }
1259
1260         if (mount(f, t, "bind", MS_BIND|MS_REC, NULL) < 0) {
1261                 log_error_errno(errno, "Failed to create /usr bind mount: %m");
1262                 r = -errno;
1263                 goto fail;
1264         }
1265
1266         bind_mounted = true;
1267
1268         r = bind_remount_recursive(t, true);
1269         if (r < 0) {
1270                 log_error_errno(r, "Failed to remount %s read-only: %m", t);
1271                 goto fail;
1272         }
1273
1274         if (mount(template, directory, NULL, MS_MOVE, NULL) < 0) {
1275                 log_error_errno(errno, "Failed to move root mount: %m");
1276                 r = -errno;
1277                 goto fail;
1278         }
1279
1280         rmdir(template);
1281
1282         return 0;
1283
1284 fail:
1285         if (bind_mounted)
1286                 umount(t);
1287         if (tmpfs_mounted)
1288                 umount(template);
1289         rmdir(template);
1290         return r;
1291 }
1292
1293 static char* id128_format_as_uuid(sd_id128_t id, char s[37]) {
1294
1295         snprintf(s, 37,
1296                  "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
1297                  SD_ID128_FORMAT_VAL(id));
1298
1299         return s;
1300 }
1301
1302 static int setup_boot_id(const char *dest) {
1303         _cleanup_free_ char *from = NULL, *to = NULL;
1304         sd_id128_t rnd = {};
1305         char as_uuid[37];
1306         int r;
1307
1308         assert(dest);
1309
1310         if (arg_share_system)
1311                 return 0;
1312
1313         /* Generate a new randomized boot ID, so that each boot-up of
1314          * the container gets a new one */
1315
1316         from = strappend(dest, "/dev/proc-sys-kernel-random-boot-id");
1317         to = strappend(dest, "/proc/sys/kernel/random/boot_id");
1318         if (!from || !to)
1319                 return log_oom();
1320
1321         r = sd_id128_randomize(&rnd);
1322         if (r < 0)
1323                 return log_error_errno(r, "Failed to generate random boot id: %m");
1324
1325         id128_format_as_uuid(rnd, as_uuid);
1326
1327         r = write_string_file(from, as_uuid);
1328         if (r < 0)
1329                 return log_error_errno(r, "Failed to write boot id: %m");
1330
1331         if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
1332                 log_error_errno(errno, "Failed to bind mount boot id: %m");
1333                 r = -errno;
1334         } else if (mount(from, to, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL))
1335                 log_warning_errno(errno, "Failed to make boot id read-only: %m");
1336
1337         unlink(from);
1338         return r;
1339 }
1340
1341 static int copy_devnodes(const char *dest) {
1342
1343         static const char devnodes[] =
1344                 "null\0"
1345                 "zero\0"
1346                 "full\0"
1347                 "random\0"
1348                 "urandom\0"
1349                 "tty\0"
1350                 "net/tun\0";
1351
1352         const char *d;
1353         int r = 0;
1354         _cleanup_umask_ mode_t u;
1355
1356         assert(dest);
1357
1358         u = umask(0000);
1359
1360         NULSTR_FOREACH(d, devnodes) {
1361                 _cleanup_free_ char *from = NULL, *to = NULL;
1362                 struct stat st;
1363
1364                 from = strappend("/dev/", d);
1365                 to = strjoin(dest, "/dev/", d, NULL);
1366                 if (!from || !to)
1367                         return log_oom();
1368
1369                 if (stat(from, &st) < 0) {
1370
1371                         if (errno != ENOENT)
1372                                 return log_error_errno(errno, "Failed to stat %s: %m", from);
1373
1374                 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
1375
1376                         log_error("%s is not a char or block device, cannot copy", from);
1377                         return -EIO;
1378
1379                 } else {
1380                         r = mkdir_parents(to, 0775);
1381                         if (r < 0) {
1382                                 log_error_errno(r, "Failed to create parent directory of %s: %m", to);
1383                                 return -r;
1384                         }
1385
1386                         if (mknod(to, st.st_mode, st.st_rdev) < 0)
1387                                 return log_error_errno(errno, "mknod(%s) failed: %m", to);
1388                 }
1389         }
1390
1391         return r;
1392 }
1393
1394 static int setup_ptmx(const char *dest) {
1395         _cleanup_free_ char *p = NULL;
1396
1397         p = strappend(dest, "/dev/ptmx");
1398         if (!p)
1399                 return log_oom();
1400
1401         if (symlink("pts/ptmx", p) < 0)
1402                 return log_error_errno(errno, "Failed to create /dev/ptmx symlink: %m");
1403
1404         return 0;
1405 }
1406
1407 static int setup_dev_console(const char *dest, const char *console) {
1408         _cleanup_umask_ mode_t u;
1409         const char *to;
1410         struct stat st;
1411         int r;
1412
1413         assert(dest);
1414         assert(console);
1415
1416         u = umask(0000);
1417
1418         if (stat("/dev/null", &st) < 0)
1419                 return log_error_errno(errno, "Failed to stat /dev/null: %m");
1420
1421         r = chmod_and_chown(console, 0600, 0, 0);
1422         if (r < 0)
1423                 return log_error_errno(r, "Failed to correct access mode for TTY: %m");
1424
1425         /* We need to bind mount the right tty to /dev/console since
1426          * ptys can only exist on pts file systems. To have something
1427          * to bind mount things on we create a device node first, and
1428          * use /dev/null for that since we the cgroups device policy
1429          * allows us to create that freely, while we cannot create
1430          * /dev/console. (Note that the major minor doesn't actually
1431          * matter here, since we mount it over anyway). */
1432
1433         to = strappenda(dest, "/dev/console");
1434         if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0)
1435                 return log_error_errno(errno, "mknod() for /dev/console failed: %m");
1436
1437         if (mount(console, to, "bind", MS_BIND, NULL) < 0)
1438                 return log_error_errno(errno, "Bind mount for /dev/console failed: %m");
1439
1440         return 0;
1441 }
1442
1443 static int setup_kmsg(const char *dest, int kmsg_socket) {
1444         _cleanup_free_ char *from = NULL, *to = NULL;
1445         _cleanup_umask_ mode_t u;
1446         int r, fd, k;
1447         union {
1448                 struct cmsghdr cmsghdr;
1449                 uint8_t buf[CMSG_SPACE(sizeof(int))];
1450         } control = {};
1451         struct msghdr mh = {
1452                 .msg_control = &control,
1453                 .msg_controllen = sizeof(control),
1454         };
1455         struct cmsghdr *cmsg;
1456
1457         assert(dest);
1458         assert(kmsg_socket >= 0);
1459
1460         u = umask(0000);
1461
1462         /* We create the kmsg FIFO as /dev/kmsg, but immediately
1463          * delete it after bind mounting it to /proc/kmsg. While FIFOs
1464          * on the reading side behave very similar to /proc/kmsg,
1465          * their writing side behaves differently from /dev/kmsg in
1466          * that writing blocks when nothing is reading. In order to
1467          * avoid any problems with containers deadlocking due to this
1468          * we simply make /dev/kmsg unavailable to the container. */
1469         if (asprintf(&from, "%s/dev/kmsg", dest) < 0 ||
1470             asprintf(&to, "%s/proc/kmsg", dest) < 0)
1471                 return log_oom();
1472
1473         if (mkfifo(from, 0600) < 0)
1474                 return log_error_errno(errno, "mkfifo() for /dev/kmsg failed: %m");
1475
1476         r = chmod_and_chown(from, 0600, 0, 0);
1477         if (r < 0)
1478                 return log_error_errno(r, "Failed to correct access mode for /dev/kmsg: %m");
1479
1480         if (mount(from, to, "bind", MS_BIND, NULL) < 0)
1481                 return log_error_errno(errno, "Bind mount for /proc/kmsg failed: %m");
1482
1483         fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
1484         if (fd < 0)
1485                 return log_error_errno(errno, "Failed to open fifo: %m");
1486
1487         cmsg = CMSG_FIRSTHDR(&mh);
1488         cmsg->cmsg_level = SOL_SOCKET;
1489         cmsg->cmsg_type = SCM_RIGHTS;
1490         cmsg->cmsg_len = CMSG_LEN(sizeof(int));
1491         memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
1492
1493         mh.msg_controllen = cmsg->cmsg_len;
1494
1495         /* Store away the fd in the socket, so that it stays open as
1496          * long as we run the child */
1497         k = sendmsg(kmsg_socket, &mh, MSG_NOSIGNAL);
1498         safe_close(fd);
1499
1500         if (k < 0)
1501                 return log_error_errno(errno, "Failed to send FIFO fd: %m");
1502
1503         /* And now make the FIFO unavailable as /dev/kmsg... */
1504         unlink(from);
1505         return 0;
1506 }
1507
1508 static int send_rtnl(int send_fd) {
1509         union {
1510                 struct cmsghdr cmsghdr;
1511                 uint8_t buf[CMSG_SPACE(sizeof(int))];
1512         } control = {};
1513         struct msghdr mh = {
1514                 .msg_control = &control,
1515                 .msg_controllen = sizeof(control),
1516         };
1517         struct cmsghdr *cmsg;
1518         _cleanup_close_ int fd = -1;
1519         ssize_t k;
1520
1521         assert(send_fd >= 0);
1522
1523         if (!arg_expose_ports)
1524                 return 0;
1525
1526         fd = socket(PF_NETLINK, SOCK_RAW|SOCK_CLOEXEC|SOCK_NONBLOCK, NETLINK_ROUTE);
1527         if (fd < 0)
1528                 return log_error_errno(errno, "failed to allocate container netlink: %m");
1529
1530         cmsg = CMSG_FIRSTHDR(&mh);
1531         cmsg->cmsg_level = SOL_SOCKET;
1532         cmsg->cmsg_type = SCM_RIGHTS;
1533         cmsg->cmsg_len = CMSG_LEN(sizeof(int));
1534         memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
1535
1536         mh.msg_controllen = cmsg->cmsg_len;
1537
1538         /* Store away the fd in the socket, so that it stays open as
1539          * long as we run the child */
1540         k = sendmsg(send_fd, &mh, MSG_NOSIGNAL);
1541         if (k < 0)
1542                 return log_error_errno(errno, "Failed to send netlink fd: %m");
1543
1544         return 0;
1545 }
1546
1547 static int flush_ports(union in_addr_union *exposed) {
1548         ExposePort *p;
1549         int r, af = AF_INET;
1550
1551         assert(exposed);
1552
1553         if (!arg_expose_ports)
1554                 return 0;
1555
1556         if (in_addr_is_null(af, exposed))
1557                 return 0;
1558
1559         log_debug("Lost IP address.");
1560
1561         LIST_FOREACH(ports, p, arg_expose_ports) {
1562                 r = fw_add_local_dnat(false,
1563                                       af,
1564                                       p->protocol,
1565                                       NULL,
1566                                       NULL, 0,
1567                                       NULL, 0,
1568                                       p->host_port,
1569                                       exposed,
1570                                       p->container_port,
1571                                       NULL);
1572                 if (r < 0)
1573                         log_warning_errno(r, "Failed to modify firewall: %m");
1574         }
1575
1576         *exposed = IN_ADDR_NULL;
1577         return 0;
1578 }
1579
1580 static int expose_ports(sd_rtnl *rtnl, union in_addr_union *exposed) {
1581         _cleanup_free_ struct local_address *addresses = NULL;
1582         _cleanup_free_ char *pretty = NULL;
1583         union in_addr_union new_exposed;
1584         ExposePort *p;
1585         bool add;
1586         int af = AF_INET, r;
1587
1588         assert(exposed);
1589
1590         /* Invoked each time an address is added or removed inside the
1591          * container */
1592
1593         if (!arg_expose_ports)
1594                 return 0;
1595
1596         r = local_addresses(rtnl, 0, af, &addresses);
1597         if (r < 0)
1598                 return log_error_errno(r, "Failed to enumerate local addresses: %m");
1599
1600         add = r > 0 &&
1601                 addresses[0].family == af &&
1602                 addresses[0].scope < RT_SCOPE_LINK;
1603
1604         if (!add)
1605                 return flush_ports(exposed);
1606
1607         new_exposed = addresses[0].address;
1608         if (in_addr_equal(af, exposed, &new_exposed))
1609                 return 0;
1610
1611         in_addr_to_string(af, &new_exposed, &pretty);
1612         log_debug("New container IP is %s.", strna(pretty));
1613
1614         LIST_FOREACH(ports, p, arg_expose_ports) {
1615
1616                 r = fw_add_local_dnat(true,
1617                                       af,
1618                                       p->protocol,
1619                                       NULL,
1620                                       NULL, 0,
1621                                       NULL, 0,
1622                                       p->host_port,
1623                                       &new_exposed,
1624                                       p->container_port,
1625                                       in_addr_is_null(af, exposed) ? NULL : exposed);
1626                 if (r < 0)
1627                         log_warning_errno(r, "Failed to modify firewall: %m");
1628         }
1629
1630         *exposed = new_exposed;
1631         return 0;
1632 }
1633
1634 static int on_address_change(sd_rtnl *rtnl, sd_rtnl_message *m, void *userdata) {
1635         union in_addr_union *exposed = userdata;
1636
1637         assert(rtnl);
1638         assert(m);
1639         assert(exposed);
1640
1641         expose_ports(rtnl, exposed);
1642         return 0;
1643 }
1644
1645 static int watch_rtnl(sd_event *event, int recv_fd, union in_addr_union *exposed, sd_rtnl **ret) {
1646         union {
1647                 struct cmsghdr cmsghdr;
1648                 uint8_t buf[CMSG_SPACE(sizeof(int))];
1649         } control = {};
1650         struct msghdr mh = {
1651                 .msg_control = &control,
1652                 .msg_controllen = sizeof(control),
1653         };
1654         struct cmsghdr *cmsg;
1655         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1656         int fd, r;
1657         ssize_t k;
1658
1659         assert(event);
1660         assert(recv_fd >= 0);
1661         assert(ret);
1662
1663         if (!arg_expose_ports)
1664                 return 0;
1665
1666         k = recvmsg(recv_fd, &mh, MSG_NOSIGNAL);
1667         if (k < 0)
1668                 return log_error_errno(errno, "Failed to recv netlink fd: %m");
1669
1670         cmsg = CMSG_FIRSTHDR(&mh);
1671         assert(cmsg->cmsg_level == SOL_SOCKET);
1672         assert(cmsg->cmsg_type == SCM_RIGHTS);
1673         assert(cmsg->cmsg_len == CMSG_LEN(sizeof(int)));
1674         memcpy(&fd, CMSG_DATA(cmsg), sizeof(int));
1675
1676         r = sd_rtnl_open_fd(&rtnl, fd, 1, RTNLGRP_IPV4_IFADDR);
1677         if (r < 0) {
1678                 safe_close(fd);
1679                 return log_error_errno(r, "Failed to create rtnl object: %m");
1680         }
1681
1682         r = sd_rtnl_add_match(rtnl, RTM_NEWADDR, on_address_change, exposed);
1683         if (r < 0)
1684                 return log_error_errno(r, "Failed to subscribe to RTM_NEWADDR messages: %m");
1685
1686         r = sd_rtnl_add_match(rtnl, RTM_DELADDR, on_address_change, exposed);
1687         if (r < 0)
1688                 return log_error_errno(r, "Failed to subscribe to RTM_DELADDR messages: %m");
1689
1690         r = sd_rtnl_attach_event(rtnl, event, 0);
1691         if (r < 0)
1692                 return log_error_errno(r, "Failed to add to even loop: %m");
1693
1694         *ret = rtnl;
1695         rtnl = NULL;
1696
1697         return 0;
1698 }
1699
1700 static int setup_hostname(void) {
1701
1702         if (arg_share_system)
1703                 return 0;
1704
1705         if (sethostname_idempotent(arg_machine) < 0)
1706                 return -errno;
1707
1708         return 0;
1709 }
1710
1711 static int setup_journal(const char *directory) {
1712         sd_id128_t machine_id, this_id;
1713         _cleanup_free_ char *p = NULL, *b = NULL, *q = NULL, *d = NULL;
1714         char *id;
1715         int r;
1716
1717         /* Don't link journals in ephemeral mode */
1718         if (arg_ephemeral)
1719                 return 0;
1720
1721         p = strappend(directory, "/etc/machine-id");
1722         if (!p)
1723                 return log_oom();
1724
1725         r = read_one_line_file(p, &b);
1726         if (r == -ENOENT && arg_link_journal == LINK_AUTO)
1727                 return 0;
1728         else if (r < 0)
1729                 return log_error_errno(r, "Failed to read machine ID from %s: %m", p);
1730
1731         id = strstrip(b);
1732         if (isempty(id) && arg_link_journal == LINK_AUTO)
1733                 return 0;
1734
1735         /* Verify validity */
1736         r = sd_id128_from_string(id, &machine_id);
1737         if (r < 0)
1738                 return log_error_errno(r, "Failed to parse machine ID from %s: %m", p);
1739
1740         r = sd_id128_get_machine(&this_id);
1741         if (r < 0)
1742                 return log_error_errno(r, "Failed to retrieve machine ID: %m");
1743
1744         if (sd_id128_equal(machine_id, this_id)) {
1745                 log_full(arg_link_journal == LINK_AUTO ? LOG_WARNING : LOG_ERR,
1746                          "Host and machine ids are equal (%s): refusing to link journals", id);
1747                 if (arg_link_journal == LINK_AUTO)
1748                         return 0;
1749                 return -EEXIST;
1750         }
1751
1752         if (arg_link_journal == LINK_NO)
1753                 return 0;
1754
1755         free(p);
1756         p = strappend("/var/log/journal/", id);
1757         q = strjoin(directory, "/var/log/journal/", id, NULL);
1758         if (!p || !q)
1759                 return log_oom();
1760
1761         if (path_is_mount_point(p, false) > 0) {
1762                 if (arg_link_journal != LINK_AUTO) {
1763                         log_error("%s: already a mount point, refusing to use for journal", p);
1764                         return -EEXIST;
1765                 }
1766
1767                 return 0;
1768         }
1769
1770         if (path_is_mount_point(q, false) > 0) {
1771                 if (arg_link_journal != LINK_AUTO) {
1772                         log_error("%s: already a mount point, refusing to use for journal", q);
1773                         return -EEXIST;
1774                 }
1775
1776                 return 0;
1777         }
1778
1779         r = readlink_and_make_absolute(p, &d);
1780         if (r >= 0) {
1781                 if ((arg_link_journal == LINK_GUEST ||
1782                      arg_link_journal == LINK_AUTO) &&
1783                     path_equal(d, q)) {
1784
1785                         r = mkdir_p(q, 0755);
1786                         if (r < 0)
1787                                 log_warning_errno(errno, "Failed to create directory %s: %m", q);
1788                         return 0;
1789                 }
1790
1791                 if (unlink(p) < 0)
1792                         return log_error_errno(errno, "Failed to remove symlink %s: %m", p);
1793         } else if (r == -EINVAL) {
1794
1795                 if (arg_link_journal == LINK_GUEST &&
1796                     rmdir(p) < 0) {
1797
1798                         if (errno == ENOTDIR) {
1799                                 log_error("%s already exists and is neither a symlink nor a directory", p);
1800                                 return r;
1801                         } else {
1802                                 log_error_errno(errno, "Failed to remove %s: %m", p);
1803                                 return -errno;
1804                         }
1805                 }
1806         } else if (r != -ENOENT) {
1807                 log_error_errno(errno, "readlink(%s) failed: %m", p);
1808                 return r;
1809         }
1810
1811         if (arg_link_journal == LINK_GUEST) {
1812
1813                 if (symlink(q, p) < 0) {
1814                         if (arg_link_journal_try) {
1815                                 log_debug_errno(errno, "Failed to symlink %s to %s, skipping journal setup: %m", q, p);
1816                                 return 0;
1817                         } else {
1818                                 log_error_errno(errno, "Failed to symlink %s to %s: %m", q, p);
1819                                 return -errno;
1820                         }
1821                 }
1822
1823                 r = mkdir_p(q, 0755);
1824                 if (r < 0)
1825                         log_warning_errno(errno, "Failed to create directory %s: %m", q);
1826                 return 0;
1827         }
1828
1829         if (arg_link_journal == LINK_HOST) {
1830                 /* don't create parents here -- if the host doesn't have
1831                  * permanent journal set up, don't force it here */
1832                 r = mkdir(p, 0755);
1833                 if (r < 0) {
1834                         if (arg_link_journal_try) {
1835                                 log_debug_errno(errno, "Failed to create %s, skipping journal setup: %m", p);
1836                                 return 0;
1837                         } else {
1838                                 log_error_errno(errno, "Failed to create %s: %m", p);
1839                                 return r;
1840                         }
1841                 }
1842
1843         } else if (access(p, F_OK) < 0)
1844                 return 0;
1845
1846         if (dir_is_empty(q) == 0)
1847                 log_warning("%s is not empty, proceeding anyway.", q);
1848
1849         r = mkdir_p(q, 0755);
1850         if (r < 0) {
1851                 log_error_errno(errno, "Failed to create %s: %m", q);
1852                 return r;
1853         }
1854
1855         if (mount(p, q, "bind", MS_BIND, NULL) < 0)
1856                 return log_error_errno(errno, "Failed to bind mount journal from host into guest: %m");
1857
1858         return 0;
1859 }
1860
1861 static int drop_capabilities(void) {
1862         return capability_bounding_set_drop(~arg_retain, false);
1863 }
1864
1865 static int register_machine(pid_t pid, int local_ifindex) {
1866         _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1867         _cleanup_bus_close_unref_ sd_bus *bus = NULL;
1868         int r;
1869
1870         if (!arg_register)
1871                 return 0;
1872
1873         r = sd_bus_default_system(&bus);
1874         if (r < 0)
1875                 return log_error_errno(r, "Failed to open system bus: %m");
1876
1877         if (arg_keep_unit) {
1878                 r = sd_bus_call_method(
1879                                 bus,
1880                                 "org.freedesktop.machine1",
1881                                 "/org/freedesktop/machine1",
1882                                 "org.freedesktop.machine1.Manager",
1883                                 "RegisterMachineWithNetwork",
1884                                 &error,
1885                                 NULL,
1886                                 "sayssusai",
1887                                 arg_machine,
1888                                 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1889                                 "nspawn",
1890                                 "container",
1891                                 (uint32_t) pid,
1892                                 strempty(arg_directory),
1893                                 local_ifindex > 0 ? 1 : 0, local_ifindex);
1894         } else {
1895                 _cleanup_bus_message_unref_ sd_bus_message *m = NULL;
1896
1897                 r = sd_bus_message_new_method_call(
1898                                 bus,
1899                                 &m,
1900                                 "org.freedesktop.machine1",
1901                                 "/org/freedesktop/machine1",
1902                                 "org.freedesktop.machine1.Manager",
1903                                 "CreateMachineWithNetwork");
1904                 if (r < 0)
1905                         return log_error_errno(r, "Failed to create message: %m");
1906
1907                 r = sd_bus_message_append(
1908                                 m,
1909                                 "sayssusai",
1910                                 arg_machine,
1911                                 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1912                                 "nspawn",
1913                                 "container",
1914                                 (uint32_t) pid,
1915                                 strempty(arg_directory),
1916                                 local_ifindex > 0 ? 1 : 0, local_ifindex);
1917                 if (r < 0)
1918                         return log_error_errno(r, "Failed to append message arguments: %m");
1919
1920                 r = sd_bus_message_open_container(m, 'a', "(sv)");
1921                 if (r < 0)
1922                         return log_error_errno(r, "Failed to open container: %m");
1923
1924                 if (!isempty(arg_slice)) {
1925                         r = sd_bus_message_append(m, "(sv)", "Slice", "s", arg_slice);
1926                         if (r < 0)
1927                                 return log_error_errno(r, "Failed to append slice: %m");
1928                 }
1929
1930                 r = sd_bus_message_append(m, "(sv)", "DevicePolicy", "s", "strict");
1931                 if (r < 0)
1932                         return log_error_errno(r, "Failed to add device policy: %m");
1933
1934                 r = sd_bus_message_append(m, "(sv)", "DeviceAllow", "a(ss)", 9,
1935                                           /* Allow the container to
1936                                            * access and create the API
1937                                            * device nodes, so that
1938                                            * PrivateDevices= in the
1939                                            * container can work
1940                                            * fine */
1941                                           "/dev/null", "rwm",
1942                                           "/dev/zero", "rwm",
1943                                           "/dev/full", "rwm",
1944                                           "/dev/random", "rwm",
1945                                           "/dev/urandom", "rwm",
1946                                           "/dev/tty", "rwm",
1947                                           "/dev/net/tun", "rwm",
1948                                           /* Allow the container
1949                                            * access to ptys. However,
1950                                            * do not permit the
1951                                            * container to ever create
1952                                            * these device nodes. */
1953                                           "/dev/pts/ptmx", "rw",
1954                                           "char-pts", "rw");
1955                 if (r < 0)
1956                         return log_error_errno(r, "Failed to add device whitelist: %m");
1957
1958                 r = sd_bus_message_close_container(m);
1959                 if (r < 0)
1960                         return log_error_errno(r, "Failed to close container: %m");
1961
1962                 r = sd_bus_call(bus, m, 0, &error, NULL);
1963         }
1964
1965         if (r < 0) {
1966                 log_error("Failed to register machine: %s", bus_error_message(&error, r));
1967                 return r;
1968         }
1969
1970         return 0;
1971 }
1972
1973 static int terminate_machine(pid_t pid) {
1974         _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1975         _cleanup_bus_message_unref_ sd_bus_message *reply = NULL;
1976         _cleanup_bus_close_unref_ sd_bus *bus = NULL;
1977         const char *path;
1978         int r;
1979
1980         if (!arg_register)
1981                 return 0;
1982
1983         r = sd_bus_default_system(&bus);
1984         if (r < 0)
1985                 return log_error_errno(r, "Failed to open system bus: %m");
1986
1987         r = sd_bus_call_method(
1988                         bus,
1989                         "org.freedesktop.machine1",
1990                         "/org/freedesktop/machine1",
1991                         "org.freedesktop.machine1.Manager",
1992                         "GetMachineByPID",
1993                         &error,
1994                         &reply,
1995                         "u",
1996                         (uint32_t) pid);
1997         if (r < 0) {
1998                 /* Note that the machine might already have been
1999                  * cleaned up automatically, hence don't consider it a
2000                  * failure if we cannot get the machine object. */
2001                 log_debug("Failed to get machine: %s", bus_error_message(&error, r));
2002                 return 0;
2003         }
2004
2005         r = sd_bus_message_read(reply, "o", &path);
2006         if (r < 0)
2007                 return bus_log_parse_error(r);
2008
2009         r = sd_bus_call_method(
2010                         bus,
2011                         "org.freedesktop.machine1",
2012                         path,
2013                         "org.freedesktop.machine1.Machine",
2014                         "Terminate",
2015                         &error,
2016                         NULL,
2017                         NULL);
2018         if (r < 0) {
2019                 log_debug("Failed to terminate machine: %s", bus_error_message(&error, r));
2020                 return 0;
2021         }
2022
2023         return 0;
2024 }
2025
2026 static int reset_audit_loginuid(void) {
2027         _cleanup_free_ char *p = NULL;
2028         int r;
2029
2030         if (arg_share_system)
2031                 return 0;
2032
2033         r = read_one_line_file("/proc/self/loginuid", &p);
2034         if (r == -ENOENT)
2035                 return 0;
2036         if (r < 0)
2037                 return log_error_errno(r, "Failed to read /proc/self/loginuid: %m");
2038
2039         /* Already reset? */
2040         if (streq(p, "4294967295"))
2041                 return 0;
2042
2043         r = write_string_file("/proc/self/loginuid", "4294967295");
2044         if (r < 0) {
2045                 log_error("Failed to reset audit login UID. This probably means that your kernel is too\n"
2046                           "old and you have audit enabled. Note that the auditing subsystem is known to\n"
2047                           "be incompatible with containers on old kernels. Please make sure to upgrade\n"
2048                           "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
2049                           "using systemd-nspawn. Sleeping for 5s... (%s)\n", strerror(-r));
2050
2051                 sleep(5);
2052         }
2053
2054         return 0;
2055 }
2056
2057 #define HOST_HASH_KEY SD_ID128_MAKE(1a,37,6f,c7,46,ec,45,0b,ad,a3,d5,31,06,60,5d,b1)
2058 #define CONTAINER_HASH_KEY SD_ID128_MAKE(c3,c4,f9,19,b5,57,b2,1c,e6,cf,14,27,03,9c,ee,a2)
2059 #define MACVLAN_HASH_KEY SD_ID128_MAKE(00,13,6d,bc,66,83,44,81,bb,0c,f9,51,1f,24,a6,6f)
2060
2061 static int generate_mac(struct ether_addr *mac, sd_id128_t hash_key, uint64_t idx) {
2062         uint8_t result[8];
2063         size_t l, sz;
2064         uint8_t *v, *i;
2065         int r;
2066
2067         l = strlen(arg_machine);
2068         sz = sizeof(sd_id128_t) + l;
2069         if (idx > 0)
2070                 sz += sizeof(idx);
2071
2072         v = alloca(sz);
2073
2074         /* fetch some persistent data unique to the host */
2075         r = sd_id128_get_machine((sd_id128_t*) v);
2076         if (r < 0)
2077                 return r;
2078
2079         /* combine with some data unique (on this host) to this
2080          * container instance */
2081         i = mempcpy(v + sizeof(sd_id128_t), arg_machine, l);
2082         if (idx > 0) {
2083                 idx = htole64(idx);
2084                 memcpy(i, &idx, sizeof(idx));
2085         }
2086
2087         /* Let's hash the host machine ID plus the container name. We
2088          * use a fixed, but originally randomly created hash key here. */
2089         siphash24(result, v, sz, hash_key.bytes);
2090
2091         assert_cc(ETH_ALEN <= sizeof(result));
2092         memcpy(mac->ether_addr_octet, result, ETH_ALEN);
2093
2094         /* see eth_random_addr in the kernel */
2095         mac->ether_addr_octet[0] &= 0xfe;        /* clear multicast bit */
2096         mac->ether_addr_octet[0] |= 0x02;        /* set local assignment bit (IEEE802) */
2097
2098         return 0;
2099 }
2100
2101 static int setup_veth(pid_t pid, char iface_name[IFNAMSIZ], int *ifi) {
2102         _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
2103         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
2104         struct ether_addr mac_host, mac_container;
2105         int r, i;
2106
2107         if (!arg_private_network)
2108                 return 0;
2109
2110         if (!arg_network_veth)
2111                 return 0;
2112
2113         /* Use two different interface name prefixes depending whether
2114          * we are in bridge mode or not. */
2115         snprintf(iface_name, IFNAMSIZ - 1, "%s-%s",
2116                  arg_network_bridge ? "vb" : "ve", arg_machine);
2117
2118         r = generate_mac(&mac_container, CONTAINER_HASH_KEY, 0);
2119         if (r < 0)
2120                 return log_error_errno(r, "Failed to generate predictable MAC address for container side: %m");
2121
2122         r = generate_mac(&mac_host, HOST_HASH_KEY, 0);
2123         if (r < 0)
2124                 return log_error_errno(r, "Failed to generate predictable MAC address for host side: %m");
2125
2126         r = sd_rtnl_open(&rtnl, 0);
2127         if (r < 0)
2128                 return log_error_errno(r, "Failed to connect to netlink: %m");
2129
2130         r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
2131         if (r < 0)
2132                 return log_error_errno(r, "Failed to allocate netlink message: %m");
2133
2134         r = sd_rtnl_message_append_string(m, IFLA_IFNAME, iface_name);
2135         if (r < 0)
2136                 return log_error_errno(r, "Failed to add netlink interface name: %m");
2137
2138         r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac_host);
2139         if (r < 0)
2140                 return log_error_errno(r, "Failed to add netlink MAC address: %m");
2141
2142         r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
2143         if (r < 0)
2144                 return log_error_errno(r, "Failed to open netlink container: %m");
2145
2146         r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "veth");
2147         if (r < 0)
2148                 return log_error_errno(r, "Failed to open netlink container: %m");
2149
2150         r = sd_rtnl_message_open_container(m, VETH_INFO_PEER);
2151         if (r < 0)
2152                 return log_error_errno(r, "Failed to open netlink container: %m");
2153
2154         r = sd_rtnl_message_append_string(m, IFLA_IFNAME, "host0");
2155         if (r < 0)
2156                 return log_error_errno(r, "Failed to add netlink interface name: %m");
2157
2158         r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac_container);
2159         if (r < 0)
2160                 return log_error_errno(r, "Failed to add netlink MAC address: %m");
2161
2162         r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
2163         if (r < 0)
2164                 return log_error_errno(r, "Failed to add netlink namespace field: %m");
2165
2166         r = sd_rtnl_message_close_container(m);
2167         if (r < 0)
2168                 return log_error_errno(r, "Failed to close netlink container: %m");
2169
2170         r = sd_rtnl_message_close_container(m);
2171         if (r < 0)
2172                 return log_error_errno(r, "Failed to close netlink container: %m");
2173
2174         r = sd_rtnl_message_close_container(m);
2175         if (r < 0)
2176                 return log_error_errno(r, "Failed to close netlink container: %m");
2177
2178         r = sd_rtnl_call(rtnl, m, 0, NULL);
2179         if (r < 0)
2180                 return log_error_errno(r, "Failed to add new veth interfaces: %m");
2181
2182         i = (int) if_nametoindex(iface_name);
2183         if (i <= 0)
2184                 return log_error_errno(errno, "Failed to resolve interface %s: %m", iface_name);
2185
2186         *ifi = i;
2187
2188         return 0;
2189 }
2190
2191 static int setup_bridge(const char veth_name[], int *ifi) {
2192         _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
2193         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
2194         int r, bridge;
2195
2196         if (!arg_private_network)
2197                 return 0;
2198
2199         if (!arg_network_veth)
2200                 return 0;
2201
2202         if (!arg_network_bridge)
2203                 return 0;
2204
2205         bridge = (int) if_nametoindex(arg_network_bridge);
2206         if (bridge <= 0)
2207                 return log_error_errno(errno, "Failed to resolve interface %s: %m", arg_network_bridge);
2208
2209         *ifi = bridge;
2210
2211         r = sd_rtnl_open(&rtnl, 0);
2212         if (r < 0)
2213                 return log_error_errno(r, "Failed to connect to netlink: %m");
2214
2215         r = sd_rtnl_message_new_link(rtnl, &m, RTM_SETLINK, 0);
2216         if (r < 0)
2217                 return log_error_errno(r, "Failed to allocate netlink message: %m");
2218
2219         r = sd_rtnl_message_link_set_flags(m, IFF_UP, IFF_UP);
2220         if (r < 0)
2221                 return log_error_errno(r, "Failed to set IFF_UP flag: %m");
2222
2223         r = sd_rtnl_message_append_string(m, IFLA_IFNAME, veth_name);
2224         if (r < 0)
2225                 return log_error_errno(r, "Failed to add netlink interface name field: %m");
2226
2227         r = sd_rtnl_message_append_u32(m, IFLA_MASTER, bridge);
2228         if (r < 0)
2229                 return log_error_errno(r, "Failed to add netlink master field: %m");
2230
2231         r = sd_rtnl_call(rtnl, m, 0, NULL);
2232         if (r < 0)
2233                 return log_error_errno(r, "Failed to add veth interface to bridge: %m");
2234
2235         return 0;
2236 }
2237
2238 static int parse_interface(struct udev *udev, const char *name) {
2239         _cleanup_udev_device_unref_ struct udev_device *d = NULL;
2240         char ifi_str[2 + DECIMAL_STR_MAX(int)];
2241         int ifi;
2242
2243         ifi = (int) if_nametoindex(name);
2244         if (ifi <= 0)
2245                 return log_error_errno(errno, "Failed to resolve interface %s: %m", name);
2246
2247         sprintf(ifi_str, "n%i", ifi);
2248         d = udev_device_new_from_device_id(udev, ifi_str);
2249         if (!d)
2250                 return log_error_errno(errno, "Failed to get udev device for interface %s: %m", name);
2251
2252         if (udev_device_get_is_initialized(d) <= 0) {
2253                 log_error("Network interface %s is not initialized yet.", name);
2254                 return -EBUSY;
2255         }
2256
2257         return ifi;
2258 }
2259
2260 static int move_network_interfaces(pid_t pid) {
2261         _cleanup_udev_unref_ struct udev *udev = NULL;
2262         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
2263         char **i;
2264         int r;
2265
2266         if (!arg_private_network)
2267                 return 0;
2268
2269         if (strv_isempty(arg_network_interfaces))
2270                 return 0;
2271
2272         r = sd_rtnl_open(&rtnl, 0);
2273         if (r < 0)
2274                 return log_error_errno(r, "Failed to connect to netlink: %m");
2275
2276         udev = udev_new();
2277         if (!udev) {
2278                 log_error("Failed to connect to udev.");
2279                 return -ENOMEM;
2280         }
2281
2282         STRV_FOREACH(i, arg_network_interfaces) {
2283                 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
2284                 int ifi;
2285
2286                 ifi = parse_interface(udev, *i);
2287                 if (ifi < 0)
2288                         return ifi;
2289
2290                 r = sd_rtnl_message_new_link(rtnl, &m, RTM_SETLINK, ifi);
2291                 if (r < 0)
2292                         return log_error_errno(r, "Failed to allocate netlink message: %m");
2293
2294                 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
2295                 if (r < 0)
2296                         return log_error_errno(r, "Failed to append namespace PID to netlink message: %m");
2297
2298                 r = sd_rtnl_call(rtnl, m, 0, NULL);
2299                 if (r < 0)
2300                         return log_error_errno(r, "Failed to move interface %s to namespace: %m", *i);
2301         }
2302
2303         return 0;
2304 }
2305
2306 static int setup_macvlan(pid_t pid) {
2307         _cleanup_udev_unref_ struct udev *udev = NULL;
2308         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
2309         unsigned idx = 0;
2310         char **i;
2311         int r;
2312
2313         if (!arg_private_network)
2314                 return 0;
2315
2316         if (strv_isempty(arg_network_macvlan))
2317                 return 0;
2318
2319         r = sd_rtnl_open(&rtnl, 0);
2320         if (r < 0)
2321                 return log_error_errno(r, "Failed to connect to netlink: %m");
2322
2323         udev = udev_new();
2324         if (!udev) {
2325                 log_error("Failed to connect to udev.");
2326                 return -ENOMEM;
2327         }
2328
2329         STRV_FOREACH(i, arg_network_macvlan) {
2330                 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
2331                 _cleanup_free_ char *n = NULL;
2332                 struct ether_addr mac;
2333                 int ifi;
2334
2335                 ifi = parse_interface(udev, *i);
2336                 if (ifi < 0)
2337                         return ifi;
2338
2339                 r = generate_mac(&mac, MACVLAN_HASH_KEY, idx++);
2340                 if (r < 0)
2341                         return log_error_errno(r, "Failed to create MACVLAN MAC address: %m");
2342
2343                 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
2344                 if (r < 0)
2345                         return log_error_errno(r, "Failed to allocate netlink message: %m");
2346
2347                 r = sd_rtnl_message_append_u32(m, IFLA_LINK, ifi);
2348                 if (r < 0)
2349                         return log_error_errno(r, "Failed to add netlink interface index: %m");
2350
2351                 n = strappend("mv-", *i);
2352                 if (!n)
2353                         return log_oom();
2354
2355                 strshorten(n, IFNAMSIZ-1);
2356
2357                 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, n);
2358                 if (r < 0)
2359                         return log_error_errno(r, "Failed to add netlink interface name: %m");
2360
2361                 r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac);
2362                 if (r < 0)
2363                         return log_error_errno(r, "Failed to add netlink MAC address: %m");
2364
2365                 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
2366                 if (r < 0)
2367                         return log_error_errno(r, "Failed to add netlink namespace field: %m");
2368
2369                 r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
2370                 if (r < 0)
2371                         return log_error_errno(r, "Failed to open netlink container: %m");
2372
2373                 r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "macvlan");
2374                 if (r < 0)
2375                         return log_error_errno(r, "Failed to open netlink container: %m");
2376
2377                 r = sd_rtnl_message_append_u32(m, IFLA_MACVLAN_MODE, MACVLAN_MODE_BRIDGE);
2378                 if (r < 0)
2379                         return log_error_errno(r, "Failed to append macvlan mode: %m");
2380
2381                 r = sd_rtnl_message_close_container(m);
2382                 if (r < 0)
2383                         return log_error_errno(r, "Failed to close netlink container: %m");
2384
2385                 r = sd_rtnl_message_close_container(m);
2386                 if (r < 0)
2387                         return log_error_errno(r, "Failed to close netlink container: %m");
2388
2389                 r = sd_rtnl_call(rtnl, m, 0, NULL);
2390                 if (r < 0)
2391                         return log_error_errno(r, "Failed to add new macvlan interfaces: %m");
2392         }
2393
2394         return 0;
2395 }
2396
2397 static int setup_ipvlan(pid_t pid) {
2398         _cleanup_udev_unref_ struct udev *udev = NULL;
2399         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
2400         char **i;
2401         int r;
2402
2403         if (!arg_private_network)
2404                 return 0;
2405
2406         if (strv_isempty(arg_network_ipvlan))
2407                 return 0;
2408
2409         r = sd_rtnl_open(&rtnl, 0);
2410         if (r < 0)
2411                 return log_error_errno(r, "Failed to connect to netlink: %m");
2412
2413         udev = udev_new();
2414         if (!udev) {
2415                 log_error("Failed to connect to udev.");
2416                 return -ENOMEM;
2417         }
2418
2419         STRV_FOREACH(i, arg_network_ipvlan) {
2420                 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
2421                 _cleanup_free_ char *n = NULL;
2422                 int ifi;
2423
2424                 ifi = parse_interface(udev, *i);
2425                 if (ifi < 0)
2426                         return ifi;
2427
2428                 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
2429                 if (r < 0)
2430                         return log_error_errno(r, "Failed to allocate netlink message: %m");
2431
2432                 r = sd_rtnl_message_append_u32(m, IFLA_LINK, ifi);
2433                 if (r < 0)
2434                         return log_error_errno(r, "Failed to add netlink interface index: %m");
2435
2436                 n = strappend("iv-", *i);
2437                 if (!n)
2438                         return log_oom();
2439
2440                 strshorten(n, IFNAMSIZ-1);
2441
2442                 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, n);
2443                 if (r < 0)
2444                         return log_error_errno(r, "Failed to add netlink interface name: %m");
2445
2446                 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
2447                 if (r < 0)
2448                         return log_error_errno(r, "Failed to add netlink namespace field: %m");
2449
2450                 r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
2451                 if (r < 0)
2452                         return log_error_errno(r, "Failed to open netlink container: %m");
2453
2454                 r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "ipvlan");
2455                 if (r < 0)
2456                         return log_error_errno(r, "Failed to open netlink container: %m");
2457
2458                 r = sd_rtnl_message_append_u16(m, IFLA_IPVLAN_MODE, IPVLAN_MODE_L2);
2459                 if (r < 0)
2460                         return log_error_errno(r, "Failed to add ipvlan mode: %m");
2461
2462                 r = sd_rtnl_message_close_container(m);
2463                 if (r < 0)
2464                         return log_error_errno(r, "Failed to close netlink container: %m");
2465
2466                 r = sd_rtnl_message_close_container(m);
2467                 if (r < 0)
2468                         return log_error_errno(r, "Failed to close netlink container: %m");
2469
2470                 r = sd_rtnl_call(rtnl, m, 0, NULL);
2471                 if (r < 0)
2472                         return log_error_errno(r, "Failed to add new ipvlan interfaces: %m");
2473         }
2474
2475         return 0;
2476 }
2477
/* Builds and loads the seccomp filter for the container payload.
 *
 * The filter allows everything by default, makes a fixed list of syscalls
 * fail with EPERM, and makes socket(AF_NETLINK, *, NETLINK_AUDIT) fail with
 * EAFNOSUPPORT. NO_NEW_PRIVS is explicitly not requested.
 *
 * Returns the result of the last libseccomp call (negative on failure), or 0
 * when built without seccomp support. */
static int setup_seccomp(void) {

#ifdef HAVE_SECCOMP
        static const int denied_syscalls[] = {
                SCMP_SYS(kexec_load),
                SCMP_SYS(open_by_handle_at),
                SCMP_SYS(init_module),
                SCMP_SYS(finit_module),
                SCMP_SYS(delete_module),
                SCMP_SYS(iopl),
                SCMP_SYS(ioperm),
                SCMP_SYS(swapon),
                SCMP_SYS(swapoff),
        };

        scmp_filter_ctx ctx;
        unsigned k;
        int r;

        ctx = seccomp_init(SCMP_ACT_ALLOW);
        if (!ctx)
                return log_oom();

        r = seccomp_add_secondary_archs(ctx);
        if (r < 0) {
                log_error_errno(r, "Failed to add secondary archs to seccomp filter: %m");
                goto finish;
        }

        for (k = 0; k < ELEMENTSOF(denied_syscalls); k++) {
                r = seccomp_rule_add(ctx, SCMP_ACT_ERRNO(EPERM), denied_syscalls[k], 0);

                /* -EFAULT means the syscall is unknown; that is not an
                 * error for us, simply move on to the next entry. */
                if (r < 0 && r != -EFAULT) {
                        log_error_errno(r, "Failed to block syscall: %m");
                        goto finish;
                }
        }

        /* Audit does not work inside containers, and much of the userspace
         * audit hookup fails there. Rather than caring about that, refuse
         * the creation of audit sockets altogether:
         * socket(AF_NETLINK, *, NETLINK_AUDIT) then fails with EAFNOSUPPORT,
         * which audit userspace takes as "audit is disabled in the
         * kernel". */
        r = seccomp_rule_add(
                        ctx,
                        SCMP_ACT_ERRNO(EAFNOSUPPORT),
                        SCMP_SYS(socket),
                        2,
                        SCMP_A0(SCMP_CMP_EQ, AF_NETLINK),
                        SCMP_A2(SCMP_CMP_EQ, NETLINK_AUDIT));
        if (r < 0) {
                log_error_errno(r, "Failed to add audit seccomp rule: %m");
                goto finish;
        }

        r = seccomp_attr_set(ctx, SCMP_FLTATR_CTL_NNP, 0);
        if (r < 0) {
                log_error_errno(r, "Failed to unset NO_NEW_PRIVS: %m");
                goto finish;
        }

        r = seccomp_load(ctx);
        if (r < 0)
                log_error_errno(r, "Failed to install seccomp audit filter: %m");

finish:
        /* The context is released on every path past seccomp_init() */
        seccomp_release(ctx);
        return r;
#else
        return 0;
#endif

}
2557
2558 static int setup_propagate(const char *root) {
2559         const char *p, *q;
2560
2561         (void) mkdir_p("/run/systemd/nspawn/", 0755);
2562         (void) mkdir_p("/run/systemd/nspawn/propagate", 0600);
2563         p = strappenda("/run/systemd/nspawn/propagate/", arg_machine);
2564         (void) mkdir_p(p, 0600);
2565
2566         q = strappenda(root, "/run/systemd/nspawn/incoming");
2567         mkdir_parents(q, 0755);
2568         mkdir_p(q, 0600);
2569
2570         if (mount(p, q, NULL, MS_BIND, NULL) < 0)
2571                 return log_error_errno(errno, "Failed to install propagation bind mount.");
2572
2573         if (mount(NULL, q, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY, NULL) < 0)
2574                 return log_error_errno(errno, "Failed to make propagation mount read-only");
2575
2576         return 0;
2577 }
2578
2579 static int setup_image(char **device_path, int *loop_nr) {
2580         struct loop_info64 info = {
2581                 .lo_flags = LO_FLAGS_AUTOCLEAR|LO_FLAGS_PARTSCAN
2582         };
2583         _cleanup_close_ int fd = -1, control = -1, loop = -1;
2584         _cleanup_free_ char* loopdev = NULL;
2585         struct stat st;
2586         int r, nr;
2587
2588         assert(device_path);
2589         assert(loop_nr);
2590         assert(arg_image);
2591
2592         fd = open(arg_image, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
2593         if (fd < 0)
2594                 return log_error_errno(errno, "Failed to open %s: %m", arg_image);
2595
2596         if (fstat(fd, &st) < 0)
2597                 return log_error_errno(errno, "Failed to stat %s: %m", arg_image);
2598
2599         if (S_ISBLK(st.st_mode)) {
2600                 char *p;
2601
2602                 p = strdup(arg_image);
2603                 if (!p)
2604                         return log_oom();
2605
2606                 *device_path = p;
2607
2608                 *loop_nr = -1;
2609
2610                 r = fd;
2611                 fd = -1;
2612
2613                 return r;
2614         }
2615
2616         if (!S_ISREG(st.st_mode)) {
2617                 log_error_errno(errno, "%s is not a regular file or block device: %m", arg_image);
2618                 return -EINVAL;
2619         }
2620
2621         control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
2622         if (control < 0)
2623                 return log_error_errno(errno, "Failed to open /dev/loop-control: %m");
2624
2625         nr = ioctl(control, LOOP_CTL_GET_FREE);
2626         if (nr < 0)
2627                 return log_error_errno(errno, "Failed to allocate loop device: %m");
2628
2629         if (asprintf(&loopdev, "/dev/loop%i", nr) < 0)
2630                 return log_oom();
2631
2632         loop = open(loopdev, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
2633         if (loop < 0)
2634                 return log_error_errno(errno, "Failed to open loop device %s: %m", loopdev);
2635
2636         if (ioctl(loop, LOOP_SET_FD, fd) < 0)
2637                 return log_error_errno(errno, "Failed to set loopback file descriptor on %s: %m", loopdev);
2638
2639         if (arg_read_only)
2640                 info.lo_flags |= LO_FLAGS_READ_ONLY;
2641
2642         if (ioctl(loop, LOOP_SET_STATUS64, &info) < 0)
2643                 return log_error_errno(errno, "Failed to set loopback settings on %s: %m", loopdev);
2644
2645         *device_path = loopdev;
2646         loopdev = NULL;
2647
2648         *loop_nr = nr;
2649
2650         r = loop;
2651         loop = -1;
2652
2653         return r;
2654 }
2655
/* Explanatory text appended to error messages about disk images whose
 * partition table cannot be used. */
#define PARTITION_TABLE_BLURB \
        "Note that the disk image needs to either contain only a single MBR partition of\n" \
        "type 0x83 that is marked bootable, or a single GPT partition of type " \
        "0FC63DAF-8483-4772-8E79-3D69D8477DE4 or follow\n" \
        "    http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/\n" \
        "to be bootable with systemd-nspawn."
2662
/* Inspects the partition table of the block device open on "fd" and decides
 * which partitions shall be used as root, /home and /srv.
 *
 * Only GPT and MBR ("dos") partition tables are accepted.
 *
 *   GPT: partitions flagged GPT_FLAG_NO_AUTO are ignored. For the types
 *        GPT_HOME, GPT_SRV, GPT_ROOT_NATIVE and GPT_ROOT_SECONDARY (the
 *        latter two only where defined) the partition with the lowest
 *        partition number wins. GPT_LINUX_GENERIC partitions are remembered
 *        as fallback root.
 *   MBR: only partitions whose flags are exactly 0x80 (bootable) and whose
 *        type is 0x83 (Linux) are considered, as fallback root.
 *
 * The root device is picked in this order: native root, secondary root,
 * generic fallback. The generic fallback is only accepted if exactly one
 * candidate was found.
 *
 * On success returns 0, stores newly allocated device node paths in
 * *root_device and, where found, *home_device and *srv_device (ownership
 * passes to the caller), stores the matching writability flags in the *_rw
 * arguments, and sets *secondary to true if the secondary architecture root
 * was chosen. *home_device/*srv_device and their flags are left untouched if
 * no such partition exists. On failure returns a negative errno-style value;
 * -ENOTSUP if compiled without blkid support. */
static int dissect_image(
                int fd,
                char **root_device, bool *root_device_rw,
                char **home_device, bool *home_device_rw,
                char **srv_device, bool *srv_device_rw,
                bool *secondary) {

#ifdef HAVE_BLKID
        int home_nr = -1, srv_nr = -1;
#ifdef GPT_ROOT_NATIVE
        int root_nr = -1;
#endif
#ifdef GPT_ROOT_SECONDARY
        int secondary_root_nr = -1;
#endif
        _cleanup_free_ char *home = NULL, *root = NULL, *secondary_root = NULL, *srv = NULL, *generic = NULL;
        _cleanup_udev_enumerate_unref_ struct udev_enumerate *e = NULL;
        _cleanup_udev_device_unref_ struct udev_device *d = NULL;
        _cleanup_blkid_free_probe_ blkid_probe b = NULL;
        _cleanup_udev_unref_ struct udev *udev = NULL;
        struct udev_list_entry *first, *item;
        bool home_rw = true, root_rw = true, secondary_root_rw = true, srv_rw = true, generic_rw = true;
        bool is_gpt, is_mbr, multiple_generic = false;
        const char *pttype = NULL;
        blkid_partlist pl;
        struct stat st;
        unsigned i;
        int r;

        assert(fd >= 0);
        assert(root_device);
        assert(home_device);
        assert(srv_device);
        assert(secondary);
        assert(arg_image);

        b = blkid_new_probe();
        if (!b)
                return log_oom();

        errno = 0;
        r = blkid_probe_set_device(b, fd, 0, 0);
        if (r != 0) {
                if (errno == 0)
                        return log_oom();

                log_error_errno(errno, "Failed to set device on blkid probe: %m");
                return -errno;
        }

        blkid_probe_enable_partitions(b, 1);
        blkid_probe_set_partitions_flags(b, BLKID_PARTS_ENTRY_DETAILS);

        errno = 0;
        r = blkid_do_safeprobe(b);
        if (r == -2 || r == 1) {
                /* Ambivalent result, or nothing detected */
                log_error("Failed to identify any partition table on\n"
                          "    %s\n"
                          PARTITION_TABLE_BLURB, arg_image);
                return -EINVAL;
        } else if (r != 0) {
                if (errno == 0)
                        errno = EIO;
                log_error_errno(errno, "Failed to probe: %m");
                return -errno;
        }

        blkid_probe_lookup_value(b, "PTTYPE", &pttype, NULL);

        is_gpt = streq_ptr(pttype, "gpt");
        is_mbr = streq_ptr(pttype, "dos");

        if (!is_gpt && !is_mbr) {
                log_error("No GPT or MBR partition table discovered on\n"
                          "    %s\n"
                          PARTITION_TABLE_BLURB, arg_image);
                return -EINVAL;
        }

        errno = 0;
        pl = blkid_probe_get_partitions(b);
        if (!pl) {
                if (errno == 0)
                        return log_oom();

                log_error("Failed to list partitions of %s", arg_image);
                return -errno;
        }

        udev = udev_new();
        if (!udev)
                return log_oom();

        if (fstat(fd, &st) < 0)
                return log_error_errno(errno, "Failed to stat block device: %m");

        d = udev_device_new_from_devnum(udev, 'b', st.st_rdev);
        if (!d)
                return log_oom();

        /* Wait until the kernel exposes as many partition devices as blkid
         * found in the partition table, giving up after 10 rounds. On
         * leaving the loop via "break", "e" holds the final enumeration. */
        for (i = 0;; i++) {
                int n, m;

                if (i >= 10) {
                        log_error("Kernel partitions never appeared.");
                        return -ENXIO;
                }

                e = udev_enumerate_new(udev);
                if (!e)
                        return log_oom();

                r = udev_enumerate_add_match_parent(e, d);
                if (r < 0)
                        return log_oom();

                r = udev_enumerate_scan_devices(e);
                if (r < 0)
                        return log_error_errno(r, "Failed to scan for partition devices of %s: %m", arg_image);

                /* Count the partitions enumerated by the kernel */
                n = 0;
                first = udev_enumerate_get_list_entry(e);
                udev_list_entry_foreach(item, first)
                        n++;

                /* Count the partitions enumerated by blkid. The udev list
                 * is expected to hold one entry more than blkid reports
                 * (the whole device itself is skipped again further down). */
                m = blkid_partlist_numof_partitions(pl);
                if (n == m + 1)
                        break;
                if (n > m + 1) {
                        log_error("blkid and kernel partition list do not match.");
                        return -EIO;
                }
                if (n < m + 1) {
                        unsigned j;

                        /* The kernel has probed fewer partitions than
                         * blkid? Maybe the kernel prober is still
                         * running or it got EBUSY because udev
                         * already opened the device. Let's reprobe
                         * the device, which is a synchronous call
                         * that waits until probing is complete. */

                        for (j = 0; j < 20; j++) {

                                r = ioctl(fd, BLKRRPART, 0);
                                if (r < 0)
                                        r = -errno;
                                if (r >= 0 || r != -EBUSY)
                                        break;

                                /* If something else has the device
                                 * open, such as an udev rule, the
                                 * ioctl will return EBUSY. Since
                                 * there's no way to wait until it
                                 * isn't busy anymore, let's just wait
                                 * a bit, and try again.
                                 *
                                 * This is really something they
                                 * should fix in the kernel! */

                                usleep(50 * USEC_PER_MSEC);
                        }

                        if (r < 0)
                                return log_error_errno(r, "Failed to reread partition table: %m");
                }

                e = udev_enumerate_unref(e);
        }

        first = udev_enumerate_get_list_entry(e);
        udev_list_entry_foreach(item, first) {
                _cleanup_udev_device_unref_ struct udev_device *q;
                const char *node;
                unsigned long long flags;
                blkid_partition pp;
                dev_t qn;
                int nr;

                errno = 0;
                q = udev_device_new_from_syspath(udev, udev_list_entry_get_name(item));
                if (!q) {
                        if (!errno)
                                errno = ENOMEM;

                        log_error_errno(errno, "Failed to get partition device of %s: %m", arg_image);
                        return -errno;
                }

                qn = udev_device_get_devnum(q);
                if (major(qn) == 0)
                        continue;

                /* Skip the whole block device itself */
                if (st.st_rdev == qn)
                        continue;

                node = udev_device_get_devnode(q);
                if (!node)
                        continue;

                pp = blkid_partlist_devno_to_partition(pl, qn);
                if (!pp)
                        continue;

                flags = blkid_partition_get_flags(pp);

                nr = blkid_partition_get_partno(pp);
                if (nr < 0)
                        continue;

                if (is_gpt) {
                        sd_id128_t type_id;
                        const char *stype;

                        if (flags & GPT_FLAG_NO_AUTO)
                                continue;

                        stype = blkid_partition_get_type_string(pp);
                        if (!stype)
                                continue;

                        if (sd_id128_from_string(stype, &type_id) < 0)
                                continue;

                        if (sd_id128_equal(type_id, GPT_HOME)) {

                                /* Keep the candidate with the lowest partition number */
                                if (home && nr >= home_nr)
                                        continue;

                                home_nr = nr;
                                home_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&home, node);
                                if (r < 0)
                                        return log_oom();

                        } else if (sd_id128_equal(type_id, GPT_SRV)) {

                                if (srv && nr >= srv_nr)
                                        continue;

                                srv_nr = nr;
                                srv_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&srv, node);
                                if (r < 0)
                                        return log_oom();
                        }
#ifdef GPT_ROOT_NATIVE
                        else if (sd_id128_equal(type_id, GPT_ROOT_NATIVE)) {

                                if (root && nr >= root_nr)
                                        continue;

                                root_nr = nr;
                                root_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&root, node);
                                if (r < 0)
                                        return log_oom();
                        }
#endif
#ifdef GPT_ROOT_SECONDARY
                        else if (sd_id128_equal(type_id, GPT_ROOT_SECONDARY)) {

                                if (secondary_root && nr >= secondary_root_nr)
                                        continue;

                                secondary_root_nr = nr;
                                secondary_root_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&secondary_root, node);
                                if (r < 0)
                                        return log_oom();
                        }
#endif
                        else if (sd_id128_equal(type_id, GPT_LINUX_GENERIC)) {

                                if (generic)
                                        multiple_generic = true;
                                else {
                                        generic_rw = !(flags & GPT_FLAG_READ_ONLY);

                                        r = free_and_strdup(&generic, node);
                                        if (r < 0)
                                                return log_oom();
                                }
                        }

                } else if (is_mbr) {
                        int type;

                        if (flags != 0x80) /* Bootable flag */
                                continue;

                        type = blkid_partition_get_type(pp);
                        if (type != 0x83) /* Linux partition */
                                continue;

                        if (generic)
                                multiple_generic = true;
                        else {
                                generic_rw = true;

                                /* This must be stored in "generic", not in
                                 * "root": otherwise "generic" stays NULL,
                                 * a second bootable Linux partition is
                                 * never noticed and silently replaces the
                                 * first one. */
                                r = free_and_strdup(&generic, node);
                                if (r < 0)
                                        return log_oom();
                        }
                }
        }

        if (root) {
                *root_device = root;
                root = NULL;

                *root_device_rw = root_rw;
                *secondary = false;
        } else if (secondary_root) {
                *root_device = secondary_root;
                secondary_root = NULL;

                *root_device_rw = secondary_root_rw;
                *secondary = true;
        } else if (generic) {

                /* There were no partitions with precise meanings
                 * around, but we found generic partitions. In this
                 * case, if there's only one, we can go ahead and boot
                 * it, otherwise we bail out, because we really cannot
                 * make any sense of it. */

                if (multiple_generic) {
                        log_error("Identified multiple bootable Linux partitions on\n"
                                  "    %s\n"
                                  PARTITION_TABLE_BLURB, arg_image);
                        return -EINVAL;
                }

                *root_device = generic;
                generic = NULL;

                *root_device_rw = generic_rw;
                *secondary = false;
        } else {
                log_error("Failed to identify root partition in disk image\n"
                          "    %s\n"
                          PARTITION_TABLE_BLURB, arg_image);
                return -EINVAL;
        }

        if (home) {
                *home_device = home;
                home = NULL;

                *home_device_rw = home_rw;
        }

        if (srv) {
                *srv_device = srv;
                srv = NULL;

                *srv_device_rw = srv_rw;
        }

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -ENOTSUP;
#endif
}
3035
/* Mounts the block device "what" below "where".
 *
 * The mount point is "where" with "directory" appended, or just "where" if
 * "directory" is NULL. The file system type is detected with blkid. The
 * mount is done with MS_NODEV, and read-only unless "rw" is true and
 * arg_read_only is unset. Devices detected as "crypto_LUKS" are refused.
 *
 * Returns 0 on success, a negative errno-style value on failure; -ENOTSUP
 * for LUKS images or if compiled without blkid support. */
static int mount_device(const char *what, const char *where, const char *directory, bool rw) {
#ifdef HAVE_BLKID
        _cleanup_blkid_free_probe_ blkid_probe b = NULL;
        const char *fstype, *p;
        int r;

        assert(what);
        assert(where);

        /* A read-only container overrides the per-partition setting */
        if (arg_read_only)
                rw = false;

        if (directory)
                p = strappenda(where, directory);
        else
                p = where;

        errno = 0;
        b = blkid_new_probe_from_filename(what);
        if (!b) {
                if (errno == 0)
                        return log_oom();
                log_error_errno(errno, "Failed to allocate prober for %s: %m", what);
                return -errno;
        }

        blkid_probe_enable_superblocks(b, 1);
        blkid_probe_set_superblocks_flags(b, BLKID_SUBLKS_TYPE);

        errno = 0;
        r = blkid_do_safeprobe(b);
        if (r == -2 || r == 1) {
                /* -2: ambivalent result, 1: nothing detected. Actual probe
                 * errors (-1) are handled below, together with errno. */
                log_error("Cannot determine file system type of %s", what);
                return -EINVAL;
        } else if (r != 0) {
                if (errno == 0)
                        errno = EIO;
                log_error_errno(errno, "Failed to probe %s: %m", what);
                return -errno;
        }

        errno = 0;
        if (blkid_probe_lookup_value(b, "TYPE", &fstype, NULL) < 0) {
                if (errno == 0)
                        errno = EINVAL;
                log_error("Failed to determine file system type of %s", what);
                return -errno;
        }

        if (streq(fstype, "crypto_LUKS")) {
                log_error("nspawn currently does not support LUKS disk images.");
                return -ENOTSUP;
        }

        if (mount(what, p, fstype, MS_NODEV|(rw ? 0 : MS_RDONLY), NULL) < 0)
                return log_error_errno(errno, "Failed to mount %s: %m", what);

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -ENOTSUP;
#endif
}
3099
/* Mounts the optional root, /home and /srv devices below the directory
 * "where". A NULL device is skipped. Each *_rw flag is forwarded
 * unchanged to mount_device() together with its device.
 *
 * Returns 0 on success. On the first failing mount_device() call the
 * error is logged and the value of log_error_errno() is returned; the
 * remaining devices are not attempted. */
static int mount_devices(
                const char *where,
                const char *root_device, bool root_device_rw,
                const char *home_device, bool home_device_rw,
                const char *srv_device, bool srv_device_rw) {
        int r;

        assert(where);

        /* Use the caller-supplied target directory rather than silently
         * reaching for the arg_directory global, so the parameter that
         * is asserted above is actually honoured. */

        if (root_device) {
                r = mount_device(root_device, where, NULL, root_device_rw);
                if (r < 0)
                        return log_error_errno(r, "Failed to mount root directory: %m");
        }

        if (home_device) {
                r = mount_device(home_device, where, "/home", home_device_rw);
                if (r < 0)
                        return log_error_errno(r, "Failed to mount home directory: %m");
        }

        if (srv_device) {
                r = mount_device(srv_device, where, "/srv", srv_device_rw);
                if (r < 0)
                        return log_error_errno(r, "Failed to mount server data directory: %m");
        }

        return 0;
}
3129
3130 static void loop_remove(int nr, int *image_fd) {
3131         _cleanup_close_ int control = -1;
3132         int r;
3133
3134         if (nr < 0)
3135                 return;
3136
3137         if (image_fd && *image_fd >= 0) {
3138                 r = ioctl(*image_fd, LOOP_CLR_FD);
3139                 if (r < 0)
3140                         log_debug_errno(errno, "Failed to close loop image: %m");
3141                 *image_fd = safe_close(*image_fd);
3142         }
3143
3144         control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
3145         if (control < 0) {
3146                 log_warning_errno(errno, "Failed to open /dev/loop-control: %m");
3147                 return;
3148         }
3149
3150         r = ioctl(control, LOOP_CTL_REMOVE, nr);
3151         if (r < 0)
3152                 log_debug_errno(errno, "Failed to remove loop %d: %m", nr);
3153 }
3154
3155 static int spawn_getent(const char *database, const char *key, pid_t *rpid) {
3156         int pipe_fds[2];
3157         pid_t pid;
3158
3159         assert(database);
3160         assert(key);
3161         assert(rpid);
3162
3163         if (pipe2(pipe_fds, O_CLOEXEC) < 0)
3164                 return log_error_errno(errno, "Failed to allocate pipe: %m");
3165
3166         pid = fork();
3167         if (pid < 0)
3168                 return log_error_errno(errno, "Failed to fork getent child: %m");
3169         else if (pid == 0) {
3170                 int nullfd;
3171                 char *empty_env = NULL;
3172
3173                 if (dup3(pipe_fds[1], STDOUT_FILENO, 0) < 0)
3174                         _exit(EXIT_FAILURE);
3175
3176                 if (pipe_fds[0] > 2)
3177                         safe_close(pipe_fds[0]);
3178                 if (pipe_fds[1] > 2)
3179                         safe_close(pipe_fds[1]);
3180
3181                 nullfd = open("/dev/null", O_RDWR);
3182                 if (nullfd < 0)
3183                         _exit(EXIT_FAILURE);
3184
3185                 if (dup3(nullfd, STDIN_FILENO, 0) < 0)
3186                         _exit(EXIT_FAILURE);
3187
3188                 if (dup3(nullfd, STDERR_FILENO, 0) < 0)
3189                         _exit(EXIT_FAILURE);
3190
3191                 if (nullfd > 2)
3192                         safe_close(nullfd);
3193
3194                 reset_all_signal_handlers();
3195                 close_all_fds(NULL, 0);
3196
3197                 execle("/usr/bin/getent", "getent", database, key, NULL, &empty_env);
3198                 execle("/bin/getent", "getent", database, key, NULL, &empty_env);
3199                 _exit(EXIT_FAILURE);
3200         }
3201
3202         pipe_fds[1] = safe_close(pipe_fds[1]);
3203
3204         *rpid = pid;
3205
3206         return pipe_fds[0];
3207 }
3208
3209 static int change_uid_gid(char **_home) {
3210         char line[LINE_MAX], *x, *u, *g, *h;
3211         const char *word, *state;
3212         _cleanup_free_ uid_t *uids = NULL;
3213         _cleanup_free_ char *home = NULL;
3214         _cleanup_fclose_ FILE *f = NULL;
3215         _cleanup_close_ int fd = -1;
3216         unsigned n_uids = 0;
3217         size_t sz = 0, l;
3218         uid_t uid;
3219         gid_t gid;
3220         pid_t pid;
3221         int r;
3222
3223         assert(_home);
3224
3225         if (!arg_user || streq(arg_user, "root") || streq(arg_user, "0")) {
3226                 /* Reset everything fully to 0, just in case */
3227
3228                 if (setgroups(0, NULL) < 0)
3229                         return log_error_errno(errno, "setgroups() failed: %m");
3230
3231                 if (setresgid(0, 0, 0) < 0)
3232                         return log_error_errno(errno, "setregid() failed: %m");
3233
3234                 if (setresuid(0, 0, 0) < 0)
3235                         return log_error_errno(errno, "setreuid() failed: %m");
3236
3237                 *_home = NULL;
3238                 return 0;
3239         }
3240
3241         /* First, get user credentials */
3242         fd = spawn_getent("passwd", arg_user, &pid);
3243         if (fd < 0)
3244                 return fd;
3245
3246         f = fdopen(fd, "r");
3247         if (!f)
3248                 return log_oom();
3249         fd = -1;
3250
3251         if (!fgets(line, sizeof(line), f)) {
3252
3253                 if (!ferror(f)) {
3254                         log_error("Failed to resolve user %s.", arg_user);
3255                         return -ESRCH;
3256                 }
3257
3258                 log_error_errno(errno, "Failed to read from getent: %m");
3259                 return -errno;
3260         }
3261
3262         truncate_nl(line);
3263
3264         wait_for_terminate_and_warn("getent passwd", pid, true);
3265
3266         x = strchr(line, ':');
3267         if (!x) {
3268                 log_error("/etc/passwd entry has invalid user field.");
3269                 return -EIO;
3270         }
3271
3272         u = strchr(x+1, ':');
3273         if (!u) {
3274                 log_error("/etc/passwd entry has invalid password field.");
3275                 return -EIO;
3276         }
3277
3278         u++;
3279         g = strchr(u, ':');
3280         if (!g) {
3281                 log_error("/etc/passwd entry has invalid UID field.");
3282                 return -EIO;
3283         }
3284
3285         *g = 0;
3286         g++;
3287         x = strchr(g, ':');
3288         if (!x) {
3289                 log_error("/etc/passwd entry has invalid GID field.");
3290                 return -EIO;
3291         }
3292
3293         *x = 0;
3294         h = strchr(x+1, ':');
3295         if (!h) {
3296                 log_error("/etc/passwd entry has invalid GECOS field.");
3297                 return -EIO;
3298         }
3299
3300         h++;
3301         x = strchr(h, ':');
3302         if (!x) {
3303                 log_error("/etc/passwd entry has invalid home directory field.");
3304                 return -EIO;
3305         }
3306
3307         *x = 0;
3308
3309         r = parse_uid(u, &uid);
3310         if (r < 0) {
3311                 log_error("Failed to parse UID of user.");
3312                 return -EIO;
3313         }
3314
3315         r = parse_gid(g, &gid);
3316         if (r < 0) {
3317                 log_error("Failed to parse GID of user.");
3318                 return -EIO;
3319         }
3320
3321         home = strdup(h);
3322         if (!home)
3323                 return log_oom();
3324
3325         /* Second, get group memberships */
3326         fd = spawn_getent("initgroups", arg_user, &pid);
3327         if (fd < 0)
3328                 return fd;
3329
3330         fclose(f);
3331         f = fdopen(fd, "r");
3332         if (!f)
3333                 return log_oom();
3334         fd = -1;
3335
3336         if (!fgets(line, sizeof(line), f)) {
3337                 if (!ferror(f)) {
3338                         log_error("Failed to resolve user %s.", arg_user);
3339                         return -ESRCH;
3340                 }
3341
3342                 log_error_errno(errno, "Failed to read from getent: %m");
3343                 return -errno;
3344         }
3345
3346         truncate_nl(line);
3347
3348         wait_for_terminate_and_warn("getent initgroups", pid, true);
3349
3350         /* Skip over the username and subsequent separator whitespace */
3351         x = line;
3352         x += strcspn(x, WHITESPACE);
3353         x += strspn(x, WHITESPACE);
3354
3355         FOREACH_WORD(word, l, x, state) {
3356                 char c[l+1];
3357
3358                 memcpy(c, word, l);
3359                 c[l] = 0;
3360
3361                 if (!GREEDY_REALLOC(uids, sz, n_uids+1))
3362                         return log_oom();
3363
3364                 r = parse_uid(c, &uids[n_uids++]);
3365                 if (r < 0) {
3366                         log_error("Failed to parse group data from getent.");
3367                         return -EIO;
3368                 }
3369         }
3370
3371         r = mkdir_parents(home, 0775);
3372         if (r < 0)
3373                 return log_error_errno(r, "Failed to make home root directory: %m");
3374
3375         r = mkdir_safe(home, 0755, uid, gid);
3376         if (r < 0 && r != -EEXIST)
3377                 return log_error_errno(r, "Failed to make home directory: %m");
3378
3379         fchown(STDIN_FILENO, uid, gid);
3380         fchown(STDOUT_FILENO, uid, gid);
3381         fchown(STDERR_FILENO, uid, gid);
3382
3383         if (setgroups(n_uids, uids) < 0)
3384                 return log_error_errno(errno, "Failed to set auxiliary groups: %m");
3385
3386         if (setresgid(gid, gid, gid) < 0)
3387                 return log_error_errno(errno, "setregid() failed: %m");
3388
3389         if (setresuid(uid, uid, uid) < 0)
3390                 return log_error_errno(errno, "setreuid() failed: %m");
3391
3392         if (_home) {
3393                 *_home = home;
3394                 home = NULL;
3395         }
3396
3397         return 0;
3398 }
3399
3400 /*
3401  * Return values:
3402  * < 0 : wait_for_terminate() failed to get the state of the
3403  *       container, the container was terminated by a signal, or
3404  *       failed for an unknown reason.  No change is made to the
3405  *       container argument.
3406  * > 0 : The program executed in the container terminated with an
3407  *       error.  The exit code of the program executed in the
3408  *       container is returned.  The container argument has been set
3409  *       to CONTAINER_TERMINATED.
3410  *   0 : The container is being rebooted, has been shut down or exited
3411  *       successfully.  The container argument has been set to either
3412  *       CONTAINER_TERMINATED or CONTAINER_REBOOTED.
3413  *
3414  * That is, success is indicated by a return value of zero, and an
3415  * error is indicated by a non-zero value.
3416  */
3417 static int wait_for_container(pid_t pid, ContainerStatus *container) {
3418         siginfo_t status;
3419         int r;
3420
3421         r = wait_for_terminate(pid, &status);
3422         if (r < 0)
3423                 return log_warning_errno(r, "Failed to wait for container: %m");
3424
3425         switch (status.si_code) {
3426
3427         case CLD_EXITED:
3428                 if (status.si_status == 0) {
3429                         log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s exited successfully.", arg_machine);
3430
3431                 } else
3432                         log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s failed with error code %i.", arg_machine, status.si_status);
3433
3434                 *container = CONTAINER_TERMINATED;
3435                 return status.si_status;
3436
3437         case CLD_KILLED:
3438                 if (status.si_status == SIGINT) {
3439
3440                         log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s has been shut down.", arg_machine);
3441                         *container = CONTAINER_TERMINATED;
3442                         return 0;
3443
3444                 } else if (status.si_status == SIGHUP) {
3445
3446                         log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s is being rebooted.", arg_machine);
3447                         *container = CONTAINER_REBOOTED;
3448                         return 0;
3449                 }
3450
3451                 /* CLD_KILLED fallthrough */
3452
3453         case CLD_DUMPED:
3454                 log_error("Container %s terminated by signal %s.", arg_machine, signal_to_string(status.si_status));
3455                 return -EIO;
3456
3457         default:
3458                 log_error("Container %s failed due to unknown reason.", arg_machine);
3459                 return -EIO;
3460         }
3461
3462         return r;
3463 }
3464
/* Signal handler that deliberately does nothing. */
static void nop_handler(int sig) {
        /* intentionally empty */
}
3466
3467 static int on_orderly_shutdown(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) {
3468         pid_t pid;
3469
3470         pid = PTR_TO_UINT32(userdata);
3471         if (pid > 0) {
3472                 if (kill(pid, SIGRTMIN+3) >= 0) {
3473                         log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
3474                         sd_event_source_set_userdata(s, NULL);
3475                         return 0;
3476                 }
3477         }
3478
3479         sd_event_exit(sd_event_source_get_event(s), 0);
3480         return 0;
3481 }
3482
3483 static int determine_names(void) {
3484         int r;
3485
3486         if (!arg_image && !arg_directory) {
3487                 if (arg_machine) {
3488                         _cleanup_(image_unrefp) Image *i = NULL;
3489
3490                         r = image_find(arg_machine, &i);
3491                         if (r < 0)
3492                                 return log_error_errno(r, "Failed to find image for machine '%s': %m", arg_machine);
3493                         else if (r == 0) {
3494                                 log_error("No image for machine '%s': %m", arg_machine);
3495                                 return -ENOENT;
3496                         }
3497
3498                         if (i->type == IMAGE_RAW)
3499                                 r = set_sanitized_path(&arg_image, i->path);
3500                         else
3501                                 r = set_sanitized_path(&arg_directory, i->path);
3502                         if (r < 0)
3503                                 return log_error_errno(r, "Invalid image directory: %m");
3504
3505                         arg_read_only = arg_read_only || i->read_only;
3506                 } else
3507                         arg_directory = get_current_dir_name();
3508
3509                 if (!arg_directory && !arg_machine) {
3510                         log_error("Failed to determine path, please use -D or -i.");
3511                         return -EINVAL;
3512                 }
3513         }
3514
3515         if (!arg_machine) {
3516                 if (arg_directory && path_equal(arg_directory, "/"))
3517                         arg_machine = gethostname_malloc();
3518                 else
3519                         arg_machine = strdup(basename(arg_image ?: arg_directory));
3520
3521                 if (!arg_machine)
3522                         return log_oom();
3523
3524                 hostname_cleanup(arg_machine, false);
3525                 if (!machine_name_is_valid(arg_machine)) {
3526                         log_error("Failed to determine machine name automatically, please use -M.");
3527                         return -EINVAL;
3528                 }
3529
3530                 if (arg_ephemeral) {
3531                         char *b;
3532
3533                         /* Add a random suffix when this is an
3534                          * ephemeral machine, so that we can run many
3535                          * instances at once without manually having
3536                          * to specify -M each time. */
3537
3538                         if (asprintf(&b, "%s-%016" PRIx64, arg_machine, random_u64()) < 0)
3539                                 return log_oom();
3540
3541                         free(arg_machine);
3542                         arg_machine = b;
3543                 }
3544         }
3545
3546         return 0;
3547 }
3548
3549 int main(int argc, char *argv[]) {
3550
3551         _cleanup_free_ char *device_path = NULL, *root_device = NULL, *home_device = NULL, *srv_device = NULL, *console = NULL;
3552         bool root_device_rw = true, home_device_rw = true, srv_device_rw = true;
3553         _cleanup_close_ int master = -1, image_fd = -1;
3554         _cleanup_fdset_free_ FDSet *fds = NULL;
3555         int r, n_fd_passed, loop_nr = -1;
3556         char veth_name[IFNAMSIZ];
3557         bool secondary = false, remove_subvol = false;
3558         sigset_t mask, mask_chld;
3559         pid_t pid = 0;
3560         int ret = EXIT_SUCCESS;
3561         union in_addr_union exposed = {};
3562         _cleanup_release_lock_file_ LockFile tree_global_lock = LOCK_FILE_INIT, tree_local_lock = LOCK_FILE_INIT;
3563
3564         log_parse_environment();
3565         log_open();
3566
3567         r = parse_argv(argc, argv);
3568         if (r <= 0)
3569                 goto finish;
3570
3571         r = determine_names();
3572         if (r < 0)
3573                 goto finish;
3574
3575         if (geteuid() != 0) {
3576                 log_error("Need to be root.");
3577                 r = -EPERM;
3578                 goto finish;
3579         }
3580
3581         if (sd_booted() <= 0) {
3582                 log_error("Not running on a systemd system.");
3583                 r = -EINVAL;
3584                 goto finish;
3585         }
3586
3587         log_close();
3588         n_fd_passed = sd_listen_fds(false);
3589         if (n_fd_passed > 0) {
3590                 r = fdset_new_listen_fds(&fds, false);
3591                 if (r < 0) {
3592                         log_error_errno(r, "Failed to collect file descriptors: %m");
3593                         goto finish;
3594                 }
3595         }
3596         fdset_close_others(fds);
3597         log_open();
3598
3599         if (arg_directory) {
3600                 assert(!arg_image);
3601
3602                 if (path_equal(arg_directory, "/") && !arg_ephemeral) {
3603                         log_error("Spawning container on root directory is not supported. Consider using --ephemeral.");
3604                         r = -EINVAL;
3605                         goto finish;
3606                 }
3607
3608                 if (arg_ephemeral) {
3609                         _cleanup_release_lock_file_ LockFile original_lock = LOCK_FILE_INIT;
3610                         char *np;
3611
3612                         /* If the specified path is a mount point we
3613                          * generate the new snapshot immediately
3614                          * inside it under a random name. However if
3615                          * the specified is not a mount point we
3616                          * create the new snapshot in the parent
3617                          * directory, just next to it. */
3618                         r = path_is_mount_point(arg_directory, false);
3619                         if (r < 0) {
3620                                 log_error_errno(r, "Failed to determine whether directory %s is mount point: %m", arg_directory);
3621                                 goto finish;
3622                         }
3623                         if (r > 0)
3624                                 r = tempfn_random_child(arg_directory, &np);
3625                         else
3626                                 r = tempfn_random(arg_directory, &np);
3627                         if (r < 0) {
3628                                 log_error_errno(r, "Failed to generate name for snapshot: %m");
3629                                 goto finish;
3630                         }
3631
3632                         r = image_path_lock(np, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3633                         if (r < 0) {
3634                                 log_error_errno(r, "Failed to lock %s: %m", np);
3635                                 goto finish;
3636                         }
3637
3638                         r = btrfs_subvol_snapshot(arg_directory, np, arg_read_only, true);
3639                         if (r < 0) {
3640                                 free(np);
3641                                 log_error_errno(r, "Failed to create snapshot %s from %s: %m", np, arg_directory);
3642                                 goto finish;
3643                         }
3644
3645                         free(arg_directory);
3646                         arg_directory = np;
3647
3648                         remove_subvol = true;
3649
3650                 } else {
3651                         r = image_path_lock(arg_directory, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3652                         if (r == -EBUSY) {
3653                                 log_error_errno(r, "Directory tree %s is currently busy.", arg_directory);
3654                                 goto finish;
3655                         }
3656                         if (r < 0) {
3657                                 log_error_errno(r, "Failed to lock %s: %m", arg_directory);
3658                                 return r;
3659                         }
3660
3661                         if (arg_template) {
3662                                 r = btrfs_subvol_snapshot(arg_template, arg_directory, arg_read_only, true);
3663                                 if (r == -EEXIST) {
3664                                         if (!arg_quiet)
3665                                                 log_info("Directory %s already exists, not populating from template %s.", arg_directory, arg_template);
3666                                 } else if (r < 0) {
3667                                         log_error_errno(r, "Couldn't create snapshot %s from %s: %m", arg_directory, arg_template);
3668                                         goto finish;
3669                                 } else {
3670                                         if (!arg_quiet)
3671                                                 log_info("Populated %s from template %s.", arg_directory, arg_template);
3672                                 }
3673                         }
3674                 }
3675
3676                 if (arg_boot) {
3677                         if (path_is_os_tree(arg_directory) <= 0) {
3678                                 log_error("Directory %s doesn't look like an OS root directory (os-release file is missing). Refusing.", arg_directory);
3679                                 r = -EINVAL;
3680                                 goto finish;
3681                         }
3682                 } else {
3683                         const char *p;
3684
3685                         p = strappenda(arg_directory,
3686                                        argc > optind && path_is_absolute(argv[optind]) ? argv[optind] : "/usr/bin/");
3687                         if (access(p, F_OK) < 0) {
3688                                 log_error("Directory %s lacks the binary to execute or doesn't look like a binary tree. Refusing.", arg_directory);
3689                                 r = -EINVAL;
3690                                 goto finish;
3691                         }
3692                 }
3693
3694         } else {
3695                 char template[] = "/tmp/nspawn-root-XXXXXX";
3696
3697                 assert(arg_image);
3698                 assert(!arg_template);
3699
3700                 r = image_path_lock(arg_image, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3701                 if (r == -EBUSY) {
3702                         r = log_error_errno(r, "Disk image %s is currently busy.", arg_image);
3703                         goto finish;
3704                 }
3705                 if (r < 0) {
3706                         r = log_error_errno(r, "Failed to create image lock: %m");
3707                         goto finish;
3708                 }
3709
3710                 if (!mkdtemp(template)) {
3711                         log_error_errno(errno, "Failed to create temporary directory: %m");
3712                         r = -errno;
3713                         goto finish;
3714                 }
3715
3716                 arg_directory = strdup(template);
3717                 if (!arg_directory) {
3718                         r = log_oom();
3719                         goto finish;
3720                 }
3721
3722                 image_fd = setup_image(&device_path, &loop_nr);
3723                 if (image_fd < 0) {
3724                         r = image_fd;
3725                         goto finish;
3726                 }
3727
3728                 r = dissect_image(image_fd,
3729                                   &root_device, &root_device_rw,
3730                                   &home_device, &home_device_rw,
3731                                   &srv_device, &srv_device_rw,
3732                                   &secondary);
3733                 if (r < 0)
3734                         goto finish;
3735         }
3736
3737         master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
3738         if (master < 0) {
3739                 r = log_error_errno(errno, "Failed to acquire pseudo tty: %m");
3740                 goto finish;
3741         }
3742
3743         r = ptsname_malloc(master, &console);
3744         if (r < 0) {
3745                 r = log_error_errno(r, "Failed to determine tty name: %m");
3746                 goto finish;
3747         }
3748
3749         if (!arg_quiet)
3750                 log_info("Spawning container %s on %s.\nPress ^] three times within 1s to kill container.",
3751                          arg_machine, arg_image ?: arg_directory);
3752
3753         if (unlockpt(master) < 0) {
3754                 r = log_error_errno(errno, "Failed to unlock tty: %m");
3755                 goto finish;
3756         }
3757
3758         assert_se(sigemptyset(&mask) == 0);
3759         sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
3760         assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
3761
3762         assert_se(sigemptyset(&mask_chld) == 0);
3763         assert_se(sigaddset(&mask_chld, SIGCHLD) == 0);
3764
3765         for (;;) {
3766                 _cleanup_close_pair_ int kmsg_socket_pair[2] = { -1, -1 }, rtnl_socket_pair[2] = { -1, -1 };
3767                 ContainerStatus container_status;
3768                 _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
3769                 struct sigaction sa = {
3770                         .sa_handler = nop_handler,
3771                         .sa_flags = SA_NOCLDSTOP,
3772                 };
3773
3774                 r = barrier_create(&barrier);
3775                 if (r < 0) {
3776                         log_error_errno(r, "Cannot initialize IPC barrier: %m");
3777                         goto finish;
3778                 }
3779
3780                 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
3781                         r = log_error_errno(errno, "Failed to create kmsg socket pair: %m");
3782                         goto finish;
3783                 }
3784
3785                 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, rtnl_socket_pair) < 0) {
3786                         r = log_error_errno(errno, "Failed to create rtnl socket pair: %m");
3787                         goto finish;
3788                 }
3789
3790                 /* Child can be killed before execv(), so handle SIGCHLD
3791                  * in order to interrupt parent's blocking calls and
3792                  * give it a chance to call wait() and terminate. */
3793                 r = sigprocmask(SIG_UNBLOCK, &mask_chld, NULL);
3794                 if (r < 0) {
3795                         r = log_error_errno(errno, "Failed to change the signal mask: %m");
3796                         goto finish;
3797                 }
3798
3799                 r = sigaction(SIGCHLD, &sa, NULL);
3800                 if (r < 0) {
3801                         r = log_error_errno(errno, "Failed to install SIGCHLD handler: %m");
3802                         goto finish;
3803                 }
3804
3805                 pid = raw_clone(SIGCHLD|CLONE_NEWNS|
3806                                 (arg_share_system ? 0 : CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS)|
3807                                 (arg_private_network ? CLONE_NEWNET : 0), NULL);
3808                 if (pid < 0) {
3809                         if (errno == EINVAL)
3810                                 r = log_error_errno(errno, "clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
3811                         else
3812                                 r = log_error_errno(errno, "clone() failed: %m");
3813
3814                         goto finish;
3815                 }
3816
3817                 if (pid == 0) {
3818                         /* child */
3819                         _cleanup_free_ char *home = NULL;
3820                         unsigned n_env = 2;
3821                         const char *envp[] = {
3822                                 "PATH=" DEFAULT_PATH_SPLIT_USR,
3823                                 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
3824                                 NULL, /* TERM */
3825                                 NULL, /* HOME */
3826                                 NULL, /* USER */
3827                                 NULL, /* LOGNAME */
3828                                 NULL, /* container_uuid */
3829                                 NULL, /* LISTEN_FDS */
3830                                 NULL, /* LISTEN_PID */
3831                                 NULL
3832                         };
3833                         char **env_use;
3834
3835                         barrier_set_role(&barrier, BARRIER_CHILD);
3836
3837                         envp[n_env] = strv_find_prefix(environ, "TERM=");
3838                         if (envp[n_env])
3839                                 n_env ++;
3840
3841                         master = safe_close(master);
3842
3843                         close_nointr(STDIN_FILENO);
3844                         close_nointr(STDOUT_FILENO);
3845                         close_nointr(STDERR_FILENO);
3846
3847                         kmsg_socket_pair[0] = safe_close(kmsg_socket_pair[0]);
3848                         rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
3849
3850                         reset_all_signal_handlers();
3851                         reset_signal_mask();
3852
3853                         r = open_terminal(console, O_RDWR);
3854                         if (r != STDIN_FILENO) {
3855                                 if (r >= 0) {
3856                                         safe_close(r);
3857                                         r = -EINVAL;
3858                                 }
3859
3860                                 log_error_errno(r, "Failed to open console: %m");
3861                                 _exit(EXIT_FAILURE);
3862                         }
3863
3864                         if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
3865                             dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO) {
3866                                 log_error_errno(errno, "Failed to duplicate console: %m");
3867                                 _exit(EXIT_FAILURE);
3868                         }
3869
3870                         if (setsid() < 0) {
3871                                 log_error_errno(errno, "setsid() failed: %m");
3872                                 _exit(EXIT_FAILURE);
3873                         }
3874
3875                         if (reset_audit_loginuid() < 0)
3876                                 _exit(EXIT_FAILURE);
3877
3878                         if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
3879                                 log_error_errno(errno, "PR_SET_PDEATHSIG failed: %m");
3880                                 _exit(EXIT_FAILURE);
3881                         }
3882
3883                         /* Mark everything as slave, so that we still
3884                          * receive mounts from the real root, but don't
3885                          * propagate mounts to the real root. */
3886                         if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
3887                                 log_error_errno(errno, "MS_SLAVE|MS_REC failed: %m");
3888                                 _exit(EXIT_FAILURE);
3889                         }
3890
3891                         if (mount_devices(arg_directory,
3892                                           root_device, root_device_rw,
3893                                           home_device, home_device_rw,
3894                                           srv_device, srv_device_rw) < 0)
3895                                 _exit(EXIT_FAILURE);
3896
3897                         /* Turn directory into bind mount */
3898                         if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REC, NULL) < 0) {
3899                                 log_error_errno(errno, "Failed to make bind mount: %m");
3900                                 _exit(EXIT_FAILURE);
3901                         }
3902
3903                         r = setup_volatile(arg_directory);
3904                         if (r < 0)
3905                                 _exit(EXIT_FAILURE);
3906
3907                         if (setup_volatile_state(arg_directory) < 0)
3908                                 _exit(EXIT_FAILURE);
3909
3910                         r = base_filesystem_create(arg_directory);
3911                         if (r < 0)
3912                                 _exit(EXIT_FAILURE);
3913
3914                         if (arg_read_only) {
3915                                 r = bind_remount_recursive(arg_directory, true);
3916                                 if (r < 0) {
3917                                         log_error_errno(r, "Failed to make tree read-only: %m");
3918                                         _exit(EXIT_FAILURE);
3919                                 }
3920                         }
3921
3922                         if (mount_all(arg_directory) < 0)
3923                                 _exit(EXIT_FAILURE);
3924
3925                         if (copy_devnodes(arg_directory) < 0)
3926                                 _exit(EXIT_FAILURE);
3927
3928                         if (setup_ptmx(arg_directory) < 0)
3929                                 _exit(EXIT_FAILURE);
3930
3931                         dev_setup(arg_directory);
3932
3933                         if (setup_propagate(arg_directory) < 0)
3934                                 _exit(EXIT_FAILURE);
3935
3936                         if (setup_seccomp() < 0)
3937                                 _exit(EXIT_FAILURE);
3938
3939                         if (setup_dev_console(arg_directory, console) < 0)
3940                                 _exit(EXIT_FAILURE);
3941
3942                         if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
3943                                 _exit(EXIT_FAILURE);
3944                         kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
3945
3946                         if (send_rtnl(rtnl_socket_pair[1]) < 0)
3947                                 _exit(EXIT_FAILURE);
3948                         rtnl_socket_pair[1] = safe_close(rtnl_socket_pair[1]);
3949
3950                         /* Tell the parent that we are ready, and that
3951                          * it can cgroupify us so that we lack access
3952                          * to certain devices and resources. */
3953                         (void) barrier_place(&barrier);
3954
3955                         if (setup_boot_id(arg_directory) < 0)
3956                                 _exit(EXIT_FAILURE);
3957
3958                         if (setup_timezone(arg_directory) < 0)
3959                                 _exit(EXIT_FAILURE);
3960
3961                         if (setup_resolv_conf(arg_directory) < 0)
3962                                 _exit(EXIT_FAILURE);
3963
3964                         if (setup_journal(arg_directory) < 0)
3965                                 _exit(EXIT_FAILURE);
3966
3967                         if (mount_binds(arg_directory, arg_bind, false) < 0)
3968                                 _exit(EXIT_FAILURE);
3969
3970                         if (mount_binds(arg_directory, arg_bind_ro, true) < 0)
3971                                 _exit(EXIT_FAILURE);
3972
3973                         if (mount_tmpfs(arg_directory) < 0)
3974                                 _exit(EXIT_FAILURE);
3975
3976                         /* Wait until we are cgroup-ified, so that we
3977                          * can mount the right cgroup path writable */
3978                         (void) barrier_sync_next(&barrier);
3979
3980                         if (mount_cgroup(arg_directory) < 0)
3981                                 _exit(EXIT_FAILURE);
3982
3983                         if (chdir(arg_directory) < 0) {
3984                                 log_error_errno(errno, "chdir(%s) failed: %m", arg_directory);
3985                                 _exit(EXIT_FAILURE);
3986                         }
3987
3988                         if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) {
3989                                 log_error_errno(errno, "mount(MS_MOVE) failed: %m");
3990                                 _exit(EXIT_FAILURE);
3991                         }
3992
3993                         if (chroot(".") < 0) {
3994                                 log_error_errno(errno, "chroot() failed: %m");
3995                                 _exit(EXIT_FAILURE);
3996                         }
3997
3998                         if (chdir("/") < 0) {
3999                                 log_error_errno(errno, "chdir() failed: %m");
4000                                 _exit(EXIT_FAILURE);
4001                         }
4002
4003                         umask(0022);
4004
4005                         if (arg_private_network)
4006                                 loopback_setup();
4007
4008                         if (drop_capabilities() < 0) {
4009                                 log_error_errno(errno, "drop_capabilities() failed: %m");
4010                                 _exit(EXIT_FAILURE);
4011                         }
4012
4013                         r = change_uid_gid(&home);
4014                         if (r < 0)
4015                                 _exit(EXIT_FAILURE);
4016
4017                         if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
4018                             (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
4019                             (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
4020                                 log_oom();
4021                                 _exit(EXIT_FAILURE);
4022                         }
4023
4024                         if (!sd_id128_equal(arg_uuid, SD_ID128_NULL)) {
4025                                 char as_uuid[37];
4026
4027                                 if (asprintf((char**)(envp + n_env++), "container_uuid=%s", id128_format_as_uuid(arg_uuid, as_uuid)) < 0) {
4028                                         log_oom();
4029                                         _exit(EXIT_FAILURE);
4030                                 }
4031                         }
4032
4033                         if (fdset_size(fds) > 0) {
4034                                 r = fdset_cloexec(fds, false);
4035                                 if (r < 0) {
4036                                         log_error_errno(r, "Failed to unset O_CLOEXEC for file descriptors.");
4037                                         _exit(EXIT_FAILURE);
4038                                 }
4039
4040                                 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", n_fd_passed) < 0) ||
4041                                     (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0)) {
4042                                         log_oom();
4043                                         _exit(EXIT_FAILURE);
4044                                 }
4045                         }
4046
4047                         setup_hostname();
4048
4049                         if (arg_personality != 0xffffffffLU) {
4050                                 if (personality(arg_personality) < 0) {
4051                                         log_error_errno(errno, "personality() failed: %m");
4052                                         _exit(EXIT_FAILURE);
4053                                 }
4054                         } else if (secondary) {
4055                                 if (personality(PER_LINUX32) < 0) {
4056                                         log_error_errno(errno, "personality() failed: %m");
4057                                         _exit(EXIT_FAILURE);
4058                                 }
4059                         }
4060
4061 #ifdef HAVE_SELINUX
4062                         if (arg_selinux_context)
4063                                 if (setexeccon((security_context_t) arg_selinux_context) < 0) {
4064                                         log_error_errno(errno, "setexeccon(\"%s\") failed: %m", arg_selinux_context);
4065                                         _exit(EXIT_FAILURE);
4066                                 }
4067 #endif
4068
4069                         if (!strv_isempty(arg_setenv)) {
4070                                 char **n;
4071
4072                                 n = strv_env_merge(2, envp, arg_setenv);
4073                                 if (!n) {
4074                                         log_oom();
4075                                         _exit(EXIT_FAILURE);
4076                                 }
4077
4078                                 env_use = n;
4079                         } else
4080                                 env_use = (char**) envp;
4081
4082                         /* Wait until the parent is ready with the setup, too... */
4083                         if (!barrier_place_and_sync(&barrier))
4084                                 _exit(EXIT_FAILURE);
4085
4086                         if (arg_boot) {
4087                                 char **a;
4088                                 size_t l;
4089
4090                                 /* Automatically search for the init system */
4091
4092                                 l = 1 + argc - optind;
4093                                 a = newa(char*, l + 1);
4094                                 memcpy(a + 1, argv + optind, l * sizeof(char*));
4095
4096                                 a[0] = (char*) "/usr/lib/systemd/systemd";
4097                                 execve(a[0], a, env_use);
4098
4099                                 a[0] = (char*) "/lib/systemd/systemd";
4100                                 execve(a[0], a, env_use);
4101
4102                                 a[0] = (char*) "/sbin/init";
4103                                 execve(a[0], a, env_use);
4104                         } else if (argc > optind)
4105                                 execvpe(argv[optind], argv + optind, env_use);
4106                         else {
4107                                 chdir(home ? home : "/root");
4108                                 execle("/bin/bash", "-bash", NULL, env_use);
4109                                 execle("/bin/sh", "-sh", NULL, env_use);
4110                         }
4111
4112                         log_error_errno(errno, "execv() failed: %m");
4113                         _exit(EXIT_FAILURE);
4114                 }
4115
4116                 barrier_set_role(&barrier, BARRIER_PARENT);
4117                 fdset_free(fds);
4118                 fds = NULL;
4119
4120                 kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
4121                 rtnl_socket_pair[1] = safe_close(rtnl_socket_pair[1]);
4122
4123                 /* Wait for the most basic Child-setup to be done,
4124                  * before we add hardware to it, and place it in a
4125                  * cgroup. */
4126                 if (barrier_sync_next(&barrier)) {
4127                         int ifi = 0;
4128
4129                         r = move_network_interfaces(pid);
4130                         if (r < 0)
4131                                 goto finish;
4132
4133                         r = setup_veth(pid, veth_name, &ifi);
4134                         if (r < 0)
4135                                 goto finish;
4136
4137                         r = setup_bridge(veth_name, &ifi);
4138                         if (r < 0)
4139                                 goto finish;
4140
4141                         r = setup_macvlan(pid);
4142                         if (r < 0)
4143                                 goto finish;
4144
4145                         r = setup_ipvlan(pid);
4146                         if (r < 0)
4147                                 goto finish;
4148
4149                         r = register_machine(pid, ifi);
4150                         if (r < 0)
4151                                 goto finish;
4152
4153                         /* Block SIGCHLD here, before notifying child.
4154                          * process_pty() will handle it with the other signals. */
4155                         r = sigprocmask(SIG_BLOCK, &mask_chld, NULL);
4156                         if (r < 0)
4157                                 goto finish;
4158
4159                         /* Reset signal to default */
4160                         r = default_signals(SIGCHLD, -1);
4161                         if (r < 0)
4162                                 goto finish;
4163
4164                         /* Notify the child that the parent is ready with all
4165                          * its setup, and that the child can now hand over
4166                          * control to the code to run inside the container. */
4167                         (void) barrier_place(&barrier);
4168
4169                         /* And wait until the child is completely ready now. */
4170                         if (barrier_place_and_sync(&barrier)) {
4171                                 _cleanup_event_unref_ sd_event *event = NULL;
4172                                 _cleanup_(pty_forward_freep) PTYForward *forward = NULL;
4173                                 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
4174                                 char last_char = 0;
4175
4176                                 sd_notifyf(false,
4177                                            "READY=1\n"
4178                                            "STATUS=Container running.\n"
4179                                            "X_NSPAWN_LEADER_PID=" PID_FMT, pid);
4180
4181                                 r = sd_event_new(&event);
4182                                 if (r < 0) {
4183                                         log_error_errno(r, "Failed to get default event source: %m");
4184                                         goto finish;
4185                                 }
4186
4187                                 if (arg_boot) {
4188                                         /* Try to kill the init system on SIGINT or SIGTERM */
4189                                         sd_event_add_signal(event, NULL, SIGINT, on_orderly_shutdown, UINT32_TO_PTR(pid));
4190                                         sd_event_add_signal(event, NULL, SIGTERM, on_orderly_shutdown, UINT32_TO_PTR(pid));
4191                                 } else {
4192                                         /* Immediately exit */
4193                                         sd_event_add_signal(event, NULL, SIGINT, NULL, NULL);
4194                                         sd_event_add_signal(event, NULL, SIGTERM, NULL, NULL);
4195                                 }
4196
4197                                 /* simply exit on sigchld */
4198                                 sd_event_add_signal(event, NULL, SIGCHLD, NULL, NULL);
4199
4200                                 if (arg_expose_ports) {
4201                                         r = watch_rtnl(event, rtnl_socket_pair[0], &exposed, &rtnl);
4202                                         if (r < 0)
4203                                                 goto finish;
4204
4205                                         (void) expose_ports(rtnl, &exposed);
4206                                 }
4207
4208                                 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
4209
4210                                 r = pty_forward_new(event, master, true, &forward);
4211                                 if (r < 0) {
4212                                         log_error_errno(r, "Failed to create PTY forwarder: %m");
4213                                         goto finish;
4214                                 }
4215
4216                                 r = sd_event_loop(event);
4217                                 if (r < 0) {
4218                                         log_error_errno(r, "Failed to run event loop: %m");
4219                                         goto finish;
4220                                 }
4221
4222                                 pty_forward_get_last_char(forward, &last_char);
4223
4224                                 forward = pty_forward_free(forward);
4225
4226                                 if (!arg_quiet && last_char != '\n')
4227                                         putc('\n', stdout);
4228
4229                                 /* Kill if it is not dead yet anyway */
4230                                 terminate_machine(pid);
4231                         }
4232                 }
4233
4234                 /* Normally redundant, but better safe than sorry */
4235                 kill(pid, SIGKILL);
4236
4237                 r = wait_for_container(pid, &container_status);
4238                 pid = 0;
4239
4240                 if (r < 0)
4241                         /* We failed to wait for the container, or the
4242                          * container exited abnormally */
4243                         goto finish;
4244                 else if (r > 0 || container_status == CONTAINER_TERMINATED){
4245                         /* The container exited with a non-zero
4246                          * status, or with zero status and no reboot
4247                          * was requested. */
4248                         ret = r;
4249                         break;
4250                 }
4251
4252                 /* CONTAINER_REBOOTED, loop again */
4253
4254                 if (arg_keep_unit) {
4255                         /* Special handling if we are running as a
4256                          * service: instead of simply restarting the
4257                          * machine we want to restart the entire
4258                          * service, so let's inform systemd about this
4259                          * with the special exit code 133. The service
4260                          * file uses RestartForceExitStatus=133 so
4261                          * that this results in a full nspawn
4262                          * restart. This is necessary since we might
4263                          * have cgroup parameters set we want to have
4264                          * flushed out. */
4265                         ret = 133;
4266                         r = 0;
4267                         break;
4268                 }
4269
4270                 flush_ports(&exposed);
4271         }
4272
4273 finish:
4274         sd_notify(false,
4275                   "STOPPING=1\n"
4276                   "STATUS=Terminating...");
4277
4278         loop_remove(loop_nr, &image_fd);
4279
4280         if (pid > 0)
4281                 kill(pid, SIGKILL);
4282
4283         if (remove_subvol && arg_directory) {
4284                 int k;
4285
4286                 k = btrfs_subvol_remove(arg_directory);
4287                 if (k < 0)
4288                         log_warning_errno(k, "Cannot remove subvolume '%s', ignoring: %m", arg_directory);
4289         }
4290
4291         if (arg_machine) {
4292                 const char *p;
4293
4294                 p = strappenda("/run/systemd/nspawn/propagate/", arg_machine);
4295                 (void) rm_rf(p, false, true, false);
4296         }
4297
4298         free(arg_directory);
4299         free(arg_template);
4300         free(arg_image);
4301         free(arg_machine);
4302         free(arg_user);
4303         strv_free(arg_setenv);
4304         strv_free(arg_network_interfaces);
4305         strv_free(arg_network_macvlan);
4306         strv_free(arg_network_ipvlan);
4307         strv_free(arg_bind);
4308         strv_free(arg_bind_ro);
4309         strv_free(arg_tmpfs);
4310
4311         flush_ports(&exposed);
4312
4313         while (arg_expose_ports) {
4314                 ExposePort *p = arg_expose_ports;
4315                 LIST_REMOVE(ports, arg_expose_ports, p);
4316                 free(p);
4317         }
4318
4319         return r < 0 ? EXIT_FAILURE : ret;
4320 }