chiark / gitweb /
fb672510b4ff6e718a71b581d4350c48ba46d577
[elogind.git] / src / nspawn / nspawn.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4   This file is part of systemd.
5
6   Copyright 2010 Lennart Poettering
7
8   systemd is free software; you can redistribute it and/or modify it
9   under the terms of the GNU Lesser General Public License as published by
10   the Free Software Foundation; either version 2.1 of the License, or
11   (at your option) any later version.
12
13   systemd is distributed in the hope that it will be useful, but
14   WITHOUT ANY WARRANTY; without even the implied warranty of
15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16   Lesser General Public License for more details.
17
18   You should have received a copy of the GNU Lesser General Public License
19   along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <signal.h>
23 #include <sched.h>
24 #include <unistd.h>
25 #include <sys/types.h>
26 #include <sys/syscall.h>
27 #include <sys/mount.h>
28 #include <sys/wait.h>
29 #include <stdlib.h>
30 #include <string.h>
31 #include <stdio.h>
32 #include <errno.h>
33 #include <sys/prctl.h>
34 #include <getopt.h>
35 #include <termios.h>
36 #include <sys/signalfd.h>
37 #include <grp.h>
38 #include <linux/fs.h>
39 #include <sys/un.h>
40 #include <sys/socket.h>
41 #include <linux/netlink.h>
42 #include <net/if.h>
43 #include <linux/veth.h>
44 #include <sys/personality.h>
45 #include <linux/loop.h>
46 #include <poll.h>
47 #include <sys/file.h>
48
49 #ifdef HAVE_SELINUX
50 #include <selinux/selinux.h>
51 #endif
52
53 #ifdef HAVE_SECCOMP
54 #include <seccomp.h>
55 #endif
56
57 #ifdef HAVE_BLKID
58 #include <blkid/blkid.h>
59 #endif
60
61 #include "sd-daemon.h"
62 #include "sd-bus.h"
63 #include "sd-id128.h"
64 #include "sd-rtnl.h"
65 #include "log.h"
66 #include "util.h"
67 #include "mkdir.h"
68 #include "macro.h"
69 #include "audit.h"
70 #include "missing.h"
71 #include "cgroup-util.h"
72 #include "strv.h"
73 #include "path-util.h"
74 #include "loopback-setup.h"
75 #include "dev-setup.h"
76 #include "fdset.h"
77 #include "build.h"
78 #include "fileio.h"
79 #include "bus-util.h"
80 #include "bus-error.h"
81 #include "ptyfwd.h"
82 #include "bus-kernel.h"
83 #include "env-util.h"
84 #include "def.h"
85 #include "rtnl-util.h"
86 #include "udev-util.h"
87 #include "blkid-util.h"
88 #include "gpt.h"
89 #include "siphash24.h"
90 #include "copy.h"
91 #include "base-filesystem.h"
92 #include "barrier.h"
93 #include "event-util.h"
94 #include "capability.h"
95 #include "cap-list.h"
96 #include "btrfs-util.h"
97 #include "machine-image.h"
98 #include "list.h"
99 #include "in-addr-util.h"
100 #include "fw-util.h"
101 #include "local-addresses.h"
102
103 #ifdef HAVE_SECCOMP
104 #include "seccomp-util.h"
105 #endif
106
/* One port forwarding request as given with -p/--port=. Records are chained
 * into a list (see arg_expose_ports) through the link fields named "ports". */
typedef struct ExposePort {
        /* IPPROTO_TCP or IPPROTO_UDP; the option parser defaults to TCP when no
         * "tcp:"/"udp:" prefix is given */
        int protocol;
        /* Port number on the host side; the parser rejects 0 */
        uint16_t host_port;
        /* Port number inside the container; equals host_port when the option
         * carried only a single port number */
        uint16_t container_port;
        /* List linkage, used with LIST_PREPEND()/LIST_FOREACH() under the name "ports" */
        LIST_FIELDS(struct ExposePort, ports);
} ExposePort;
113
/* Outcome of a container run. NOTE(review): the code that produces and
 * consumes these values is outside the visible part of the file; judging by
 * the names they distinguish a final shutdown from a reboot request -- confirm
 * against the wait/restart logic. */
typedef enum ContainerStatus {
        CONTAINER_TERMINATED,
        CONTAINER_REBOOTED
} ContainerStatus;
118
/* Journal linking mode selected with --link-journal= (or -j). The "try-guest"
 * and "try-host" spellings map to LINK_GUEST/LINK_HOST as well and are told
 * apart by the separate arg_link_journal_try flag. */
typedef enum LinkJournal {
        LINK_NO,        /* --link-journal=no */
        LINK_AUTO,      /* --link-journal=auto; also the built-in default */
        LINK_HOST,      /* --link-journal=host or try-host */
        LINK_GUEST      /* --link-journal=guest or try-guest, and -j */
} LinkJournal;
125
/* Volatile mode selected with --volatile[=MODE]. */
typedef enum Volatile {
        VOLATILE_NO,    /* default, or an explicit boolean false argument */
        VOLATILE_YES,   /* --volatile without argument, or a boolean true argument */
        VOLATILE_STATE, /* --volatile=state */
} Volatile;
131
/* Command line settings. The initializers below are the built-in defaults;
 * parse_argv() overwrites them as the corresponding options are seen. */

/* -D/--directory= and --template=: paths stored via set_sanitized_path(),
 * i.e. heap-allocated copies owned by these variables */
static char *arg_directory = NULL;
static char *arg_template = NULL;
/* -u/--user=: heap-allocated copy of the option argument */
static char *arg_user = NULL;
/* --uuid=: parsed with sd_id128_from_string() */
static sd_id128_t arg_uuid = {};
/* -M/--machine=: validated, heap-allocated copy of the machine name */
static char *arg_machine = NULL;
/* -Z, -L and -S: these point straight at the option argument (optarg); they
 * are not copied and must not be freed */
static const char *arg_selinux_context = NULL;
static const char *arg_selinux_apifs_context = NULL;
static const char *arg_slice = NULL;
/* --private-network; also switched on by every --network-* option and by -n */
static bool arg_private_network = false;
/* --read-only */
static bool arg_read_only = false;
/* -b/--boot */
static bool arg_boot = false;
/* -x/--ephemeral */
static bool arg_ephemeral = false;
/* --link-journal=/-j: the mode, plus whether one of the "try-" spellings was used */
static LinkJournal arg_link_journal = LINK_AUTO;
static bool arg_link_journal_try = false;
/* Capability bit mask, bit N standing for capability number N. The set below
 * is presumably the "default" that the --capability=/--drop-capability= help
 * text refers to; the place where the parsed additions and removals are merged
 * into it is outside the visible part of the file -- confirm there. */
static uint64_t arg_retain =
        (1ULL << CAP_CHOWN) |
        (1ULL << CAP_DAC_OVERRIDE) |
        (1ULL << CAP_DAC_READ_SEARCH) |
        (1ULL << CAP_FOWNER) |
        (1ULL << CAP_FSETID) |
        (1ULL << CAP_IPC_OWNER) |
        (1ULL << CAP_KILL) |
        (1ULL << CAP_LEASE) |
        (1ULL << CAP_LINUX_IMMUTABLE) |
        (1ULL << CAP_NET_BIND_SERVICE) |
        (1ULL << CAP_NET_BROADCAST) |
        (1ULL << CAP_NET_RAW) |
        (1ULL << CAP_SETGID) |
        (1ULL << CAP_SETFCAP) |
        (1ULL << CAP_SETPCAP) |
        (1ULL << CAP_SETUID) |
        (1ULL << CAP_SYS_ADMIN) |
        (1ULL << CAP_SYS_CHROOT) |
        (1ULL << CAP_SYS_NICE) |
        (1ULL << CAP_SYS_PTRACE) |
        (1ULL << CAP_SYS_TTY_CONFIG) |
        (1ULL << CAP_SYS_RESOURCE) |
        (1ULL << CAP_SYS_BOOT) |
        (1ULL << CAP_AUDIT_WRITE) |
        (1ULL << CAP_AUDIT_CONTROL) |
        (1ULL << CAP_MKNOD);
/* --bind=/--bind-ro=: string vectors holding two entries per option, the part
 * before the ':' followed by the part after it (the same path twice when no
 * ':' was given) */
static char **arg_bind = NULL;
static char **arg_bind_ro = NULL;
/* --tmpfs=: string vector holding two entries per option, the path followed by
 * the mount options ("mode=0755" when none were given) */
static char **arg_tmpfs = NULL;
/* --setenv=: validated NAME=VALUE assignments, maintained with strv_env_set() */
static char **arg_setenv = NULL;
/* -q/--quiet */
static bool arg_quiet = false;
/* --share-system; when set, parse_argv() also forces arg_register to false */
static bool arg_share_system = false;
/* --register=BOOLEAN */
static bool arg_register = true;
/* --keep-unit */
static bool arg_keep_unit = false;
/* --network-interface=, --network-macvlan=, --network-ipvlan=: one vector
 * entry per occurrence of the option */
static char **arg_network_interfaces = NULL;
static char **arg_network_macvlan = NULL;
static char **arg_network_ipvlan = NULL;
/* -n/--network-veth; also implied by --network-bridge= */
static bool arg_network_veth = false;
/* --network-bridge=: points at the option argument (optarg), not a copy */
static const char *arg_network_bridge = NULL;
/* --personality=: parse_argv() treats 0xffffffffLU coming back from
 * personality_from_string() as "unknown", so the initial value here presumably
 * means "no personality requested" -- confirm at the point of use */
static unsigned long arg_personality = 0xffffffffLU;
/* -i/--image=: path stored via set_sanitized_path() */
static char *arg_image = NULL;
/* --volatile[=MODE] */
static Volatile arg_volatile = VOLATILE_NO;
/* -p/--port=: head of the ExposePort list; new entries are prepended */
static ExposePort *arg_expose_ports = NULL;
190
191 static void help(void) {
192         printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
193                "Spawn a minimal namespace container for debugging, testing and building.\n\n"
194                "  -h --help                 Show this help\n"
195                "     --version              Print version string\n"
196                "  -q --quiet                Do not show status information\n"
197                "  -D --directory=PATH       Root directory for the container\n"
198                "     --template=PATH        Initialize root directory from template directory,\n"
199                "                            if missing\n"
200                "  -x --ephemeral            Run container with snapshot of root directory, and\n"
201                "                            remove it after exit\n"
202                "  -i --image=PATH           File system device or disk image for the container\n"
203                "  -b --boot                 Boot up full system (i.e. invoke init)\n"
204                "  -u --user=USER            Run the command under specified user or uid\n"
205                "  -M --machine=NAME         Set the machine name for the container\n"
206                "     --uuid=UUID            Set a specific machine UUID for the container\n"
207                "  -S --slice=SLICE          Place the container in the specified slice\n"
208                "     --private-network      Disable network in container\n"
209                "     --network-interface=INTERFACE\n"
210                "                            Assign an existing network interface to the\n"
211                "                            container\n"
212                "     --network-macvlan=INTERFACE\n"
213                "                            Create a macvlan network interface based on an\n"
214                "                            existing network interface to the container\n"
215                "     --network-ipvlan=INTERFACE\n"
216                "                            Create a ipvlan network interface based on an\n"
217                "                            existing network interface to the container\n"
218                "  -n --network-veth         Add a virtual ethernet connection between host\n"
219                "                            and container\n"
220                "     --network-bridge=INTERFACE\n"
221                "                            Add a virtual ethernet connection between host\n"
222                "                            and container and add it to an existing bridge on\n"
223                "                            the host\n"
224                "  -p --port=[PROTOCOL:]HOSTPORT[:CONTAINERPORT]\n"
225                "                            Expose a container IP port on the host\n"
226                "  -Z --selinux-context=SECLABEL\n"
227                "                            Set the SELinux security context to be used by\n"
228                "                            processes in the container\n"
229                "  -L --selinux-apifs-context=SECLABEL\n"
230                "                            Set the SELinux security context to be used by\n"
231                "                            API/tmpfs file systems in the container\n"
232                "     --capability=CAP       In addition to the default, retain specified\n"
233                "                            capability\n"
234                "     --drop-capability=CAP  Drop the specified capability from the default set\n"
235                "     --link-journal=MODE    Link up guest journal, one of no, auto, guest, host,\n"
236                "                            try-guest, try-host\n"
237                "  -j                        Equivalent to --link-journal=try-guest\n"
238                "     --read-only            Mount the root directory read-only\n"
239                "     --bind=PATH[:PATH]     Bind mount a file or directory from the host into\n"
240                "                            the container\n"
241                "     --bind-ro=PATH[:PATH]  Similar, but creates a read-only bind mount\n"
242                "     --tmpfs=PATH:[OPTIONS] Mount an empty tmpfs to the specified directory\n"
243                "     --setenv=NAME=VALUE    Pass an environment variable to PID 1\n"
244                "     --share-system         Share system namespaces with host\n"
245                "     --register=BOOLEAN     Register container as machine\n"
246                "     --keep-unit            Do not register a scope for the machine, reuse\n"
247                "                            the service unit nspawn is running in\n"
248                "     --volatile[=MODE]      Run the system in volatile mode\n"
249                , program_invocation_short_name);
250 }
251
/* Replaces the string held in *b with a cleaned-up, absolute form of path.
 *
 * The path is first resolved with canonicalize_file_name(). If that fails
 * only because the path does not exist (ENOENT), it is instead made absolute
 * relative to the current working directory. The result is passed through
 * path_kill_slashes() before being stored.
 *
 * b:    location of the string to replace; the previous value is freed
 * path: the path as given by the user
 *
 * Returns 0 on success, the negated errno of the failed resolution, or
 * -ENOMEM if the fallback could not produce a path. On failure *b is left
 * untouched. */
static int set_sanitized_path(char **b, const char *path) {
        char *resolved;

        assert(b);
        assert(path);

        resolved = canonicalize_file_name(path);
        if (resolved == NULL) {
                /* Only a non-existing path is tolerated; everything else is reported as is */
                if (errno != ENOENT)
                        return -errno;

                resolved = path_make_absolute_cwd(path);
                if (resolved == NULL)
                        return -ENOMEM;
        }

        /* Drop the old value only once the new one is known to be available */
        free(*b);
        *b = path_kill_slashes(resolved);

        return 0;
}
272
273 static int parse_argv(int argc, char *argv[]) {
274
275         enum {
276                 ARG_VERSION = 0x100,
277                 ARG_PRIVATE_NETWORK,
278                 ARG_UUID,
279                 ARG_READ_ONLY,
280                 ARG_CAPABILITY,
281                 ARG_DROP_CAPABILITY,
282                 ARG_LINK_JOURNAL,
283                 ARG_BIND,
284                 ARG_BIND_RO,
285                 ARG_TMPFS,
286                 ARG_SETENV,
287                 ARG_SHARE_SYSTEM,
288                 ARG_REGISTER,
289                 ARG_KEEP_UNIT,
290                 ARG_NETWORK_INTERFACE,
291                 ARG_NETWORK_MACVLAN,
292                 ARG_NETWORK_IPVLAN,
293                 ARG_NETWORK_BRIDGE,
294                 ARG_PERSONALITY,
295                 ARG_VOLATILE,
296                 ARG_TEMPLATE,
297         };
298
299         static const struct option options[] = {
300                 { "help",                  no_argument,       NULL, 'h'                   },
301                 { "version",               no_argument,       NULL, ARG_VERSION           },
302                 { "directory",             required_argument, NULL, 'D'                   },
303                 { "template",              required_argument, NULL, ARG_TEMPLATE          },
304                 { "ephemeral",             no_argument,       NULL, 'x'                   },
305                 { "user",                  required_argument, NULL, 'u'                   },
306                 { "private-network",       no_argument,       NULL, ARG_PRIVATE_NETWORK   },
307                 { "boot",                  no_argument,       NULL, 'b'                   },
308                 { "uuid",                  required_argument, NULL, ARG_UUID              },
309                 { "read-only",             no_argument,       NULL, ARG_READ_ONLY         },
310                 { "capability",            required_argument, NULL, ARG_CAPABILITY        },
311                 { "drop-capability",       required_argument, NULL, ARG_DROP_CAPABILITY   },
312                 { "link-journal",          required_argument, NULL, ARG_LINK_JOURNAL      },
313                 { "bind",                  required_argument, NULL, ARG_BIND              },
314                 { "bind-ro",               required_argument, NULL, ARG_BIND_RO           },
315                 { "tmpfs",                 required_argument, NULL, ARG_TMPFS             },
316                 { "machine",               required_argument, NULL, 'M'                   },
317                 { "slice",                 required_argument, NULL, 'S'                   },
318                 { "setenv",                required_argument, NULL, ARG_SETENV            },
319                 { "selinux-context",       required_argument, NULL, 'Z'                   },
320                 { "selinux-apifs-context", required_argument, NULL, 'L'                   },
321                 { "quiet",                 no_argument,       NULL, 'q'                   },
322                 { "share-system",          no_argument,       NULL, ARG_SHARE_SYSTEM      },
323                 { "register",              required_argument, NULL, ARG_REGISTER          },
324                 { "keep-unit",             no_argument,       NULL, ARG_KEEP_UNIT         },
325                 { "network-interface",     required_argument, NULL, ARG_NETWORK_INTERFACE },
326                 { "network-macvlan",       required_argument, NULL, ARG_NETWORK_MACVLAN   },
327                 { "network-ipvlan",        required_argument, NULL, ARG_NETWORK_IPVLAN    },
328                 { "network-veth",          no_argument,       NULL, 'n'                   },
329                 { "network-bridge",        required_argument, NULL, ARG_NETWORK_BRIDGE    },
330                 { "personality",           required_argument, NULL, ARG_PERSONALITY       },
331                 { "image",                 required_argument, NULL, 'i'                   },
332                 { "volatile",              optional_argument, NULL, ARG_VOLATILE          },
333                 { "port",                  required_argument, NULL, 'p'                   },
334                 {}
335         };
336
337         int c, r;
338         uint64_t plus = 0, minus = 0;
339
340         assert(argc >= 0);
341         assert(argv);
342
343         while ((c = getopt_long(argc, argv, "+hD:u:bL:M:jS:Z:qi:xp:n", options, NULL)) >= 0)
344
345                 switch (c) {
346
347                 case 'h':
348                         help();
349                         return 0;
350
351                 case ARG_VERSION:
352                         puts(PACKAGE_STRING);
353                         puts(SYSTEMD_FEATURES);
354                         return 0;
355
356                 case 'D':
357                         r = set_sanitized_path(&arg_directory, optarg);
358                         if (r < 0)
359                                 return log_error_errno(r, "Invalid root directory: %m");
360
361                         break;
362
363                 case ARG_TEMPLATE:
364                         r = set_sanitized_path(&arg_template, optarg);
365                         if (r < 0)
366                                 return log_error_errno(r, "Invalid template directory: %m");
367
368                         break;
369
370                 case 'i':
371                         r = set_sanitized_path(&arg_image, optarg);
372                         if (r < 0)
373                                 return log_error_errno(r, "Invalid image path: %m");
374
375                         break;
376
377                 case 'x':
378                         arg_ephemeral = true;
379                         break;
380
381                 case 'u':
382                         free(arg_user);
383                         arg_user = strdup(optarg);
384                         if (!arg_user)
385                                 return log_oom();
386
387                         break;
388
389                 case ARG_NETWORK_BRIDGE:
390                         arg_network_bridge = optarg;
391
392                         /* fall through */
393
394                 case 'n':
395                         arg_network_veth = true;
396                         arg_private_network = true;
397                         break;
398
399                 case ARG_NETWORK_INTERFACE:
400                         if (strv_extend(&arg_network_interfaces, optarg) < 0)
401                                 return log_oom();
402
403                         arg_private_network = true;
404                         break;
405
406                 case ARG_NETWORK_MACVLAN:
407                         if (strv_extend(&arg_network_macvlan, optarg) < 0)
408                                 return log_oom();
409
410                         arg_private_network = true;
411                         break;
412
413                 case ARG_NETWORK_IPVLAN:
414                         if (strv_extend(&arg_network_ipvlan, optarg) < 0)
415                                 return log_oom();
416
417                         /* fall through */
418
419                 case ARG_PRIVATE_NETWORK:
420                         arg_private_network = true;
421                         break;
422
423                 case 'b':
424                         arg_boot = true;
425                         break;
426
427                 case ARG_UUID:
428                         r = sd_id128_from_string(optarg, &arg_uuid);
429                         if (r < 0) {
430                                 log_error("Invalid UUID: %s", optarg);
431                                 return r;
432                         }
433                         break;
434
435                 case 'S':
436                         arg_slice = optarg;
437                         break;
438
439                 case 'M':
440                         if (isempty(optarg)) {
441                                 free(arg_machine);
442                                 arg_machine = NULL;
443                         } else {
444                                 if (!machine_name_is_valid(optarg)) {
445                                         log_error("Invalid machine name: %s", optarg);
446                                         return -EINVAL;
447                                 }
448
449                                 r = free_and_strdup(&arg_machine, optarg);
450                                 if (r < 0)
451                                         return log_oom();
452
453                                 break;
454                         }
455
456                 case 'Z':
457                         arg_selinux_context = optarg;
458                         break;
459
460                 case 'L':
461                         arg_selinux_apifs_context = optarg;
462                         break;
463
464                 case ARG_READ_ONLY:
465                         arg_read_only = true;
466                         break;
467
468                 case ARG_CAPABILITY:
469                 case ARG_DROP_CAPABILITY: {
470                         const char *state, *word;
471                         size_t length;
472
473                         FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
474                                 _cleanup_free_ char *t;
475
476                                 t = strndup(word, length);
477                                 if (!t)
478                                         return log_oom();
479
480                                 if (streq(t, "all")) {
481                                         if (c == ARG_CAPABILITY)
482                                                 plus = (uint64_t) -1;
483                                         else
484                                                 minus = (uint64_t) -1;
485                                 } else {
486                                         int cap;
487
488                                         cap = capability_from_name(t);
489                                         if (cap < 0) {
490                                                 log_error("Failed to parse capability %s.", t);
491                                                 return -EINVAL;
492                                         }
493
494                                         if (c == ARG_CAPABILITY)
495                                                 plus |= 1ULL << (uint64_t) cap;
496                                         else
497                                                 minus |= 1ULL << (uint64_t) cap;
498                                 }
499                         }
500
501                         break;
502                 }
503
504                 case 'j':
505                         arg_link_journal = LINK_GUEST;
506                         arg_link_journal_try = true;
507                         break;
508
509                 case ARG_LINK_JOURNAL:
510                         if (streq(optarg, "auto")) {
511                                 arg_link_journal = LINK_AUTO;
512                                 arg_link_journal_try = false;
513                         } else if (streq(optarg, "no")) {
514                                 arg_link_journal = LINK_NO;
515                                 arg_link_journal_try = false;
516                         } else if (streq(optarg, "guest")) {
517                                 arg_link_journal = LINK_GUEST;
518                                 arg_link_journal_try = false;
519                         } else if (streq(optarg, "host")) {
520                                 arg_link_journal = LINK_HOST;
521                                 arg_link_journal_try = false;
522                         } else if (streq(optarg, "try-guest")) {
523                                 arg_link_journal = LINK_GUEST;
524                                 arg_link_journal_try = true;
525                         } else if (streq(optarg, "try-host")) {
526                                 arg_link_journal = LINK_HOST;
527                                 arg_link_journal_try = true;
528                         } else {
529                                 log_error("Failed to parse link journal mode %s", optarg);
530                                 return -EINVAL;
531                         }
532
533                         break;
534
535                 case ARG_BIND:
536                 case ARG_BIND_RO: {
537                         _cleanup_free_ char *a = NULL, *b = NULL;
538                         char *e;
539                         char ***x;
540
541                         x = c == ARG_BIND ? &arg_bind : &arg_bind_ro;
542
543                         e = strchr(optarg, ':');
544                         if (e) {
545                                 a = strndup(optarg, e - optarg);
546                                 b = strdup(e + 1);
547                         } else {
548                                 a = strdup(optarg);
549                                 b = strdup(optarg);
550                         }
551
552                         if (!a || !b)
553                                 return log_oom();
554
555                         if (!path_is_absolute(a) || !path_is_absolute(b)) {
556                                 log_error("Invalid bind mount specification: %s", optarg);
557                                 return -EINVAL;
558                         }
559
560                         r = strv_extend(x, a);
561                         if (r < 0)
562                                 return log_oom();
563
564                         r = strv_extend(x, b);
565                         if (r < 0)
566                                 return log_oom();
567
568                         break;
569                 }
570
571                 case ARG_TMPFS: {
572                         _cleanup_free_ char *a = NULL, *b = NULL;
573                         char *e;
574
575                         e = strchr(optarg, ':');
576                         if (e) {
577                                 a = strndup(optarg, e - optarg);
578                                 b = strdup(e + 1);
579                         } else {
580                                 a = strdup(optarg);
581                                 b = strdup("mode=0755");
582                         }
583
584                         if (!a || !b)
585                                 return log_oom();
586
587                         if (!path_is_absolute(a)) {
588                                 log_error("Invalid tmpfs specification: %s", optarg);
589                                 return -EINVAL;
590                         }
591
592                         r = strv_push(&arg_tmpfs, a);
593                         if (r < 0)
594                                 return log_oom();
595
596                         a = NULL;
597
598                         r = strv_push(&arg_tmpfs, b);
599                         if (r < 0)
600                                 return log_oom();
601
602                         b = NULL;
603
604                         break;
605                 }
606
607                 case ARG_SETENV: {
608                         char **n;
609
610                         if (!env_assignment_is_valid(optarg)) {
611                                 log_error("Environment variable assignment '%s' is not valid.", optarg);
612                                 return -EINVAL;
613                         }
614
615                         n = strv_env_set(arg_setenv, optarg);
616                         if (!n)
617                                 return log_oom();
618
619                         strv_free(arg_setenv);
620                         arg_setenv = n;
621                         break;
622                 }
623
624                 case 'q':
625                         arg_quiet = true;
626                         break;
627
628                 case ARG_SHARE_SYSTEM:
629                         arg_share_system = true;
630                         break;
631
632                 case ARG_REGISTER:
633                         r = parse_boolean(optarg);
634                         if (r < 0) {
635                                 log_error("Failed to parse --register= argument: %s", optarg);
636                                 return r;
637                         }
638
639                         arg_register = r;
640                         break;
641
642                 case ARG_KEEP_UNIT:
643                         arg_keep_unit = true;
644                         break;
645
646                 case ARG_PERSONALITY:
647
648                         arg_personality = personality_from_string(optarg);
649                         if (arg_personality == 0xffffffffLU) {
650                                 log_error("Unknown or unsupported personality '%s'.", optarg);
651                                 return -EINVAL;
652                         }
653
654                         break;
655
656                 case ARG_VOLATILE:
657
658                         if (!optarg)
659                                 arg_volatile = VOLATILE_YES;
660                         else {
661                                 r = parse_boolean(optarg);
662                                 if (r < 0) {
663                                         if (streq(optarg, "state"))
664                                                 arg_volatile = VOLATILE_STATE;
665                                         else {
666                                                 log_error("Failed to parse --volatile= argument: %s", optarg);
667                                                 return r;
668                                         }
669                                 } else
670                                         arg_volatile = r ? VOLATILE_YES : VOLATILE_NO;
671                         }
672
673                         break;
674
675                 case 'p': {
676                         const char *split, *e;
677                         uint16_t container_port, host_port;
678                         int protocol;
679                         ExposePort *p;
680
681                         if ((e = startswith(optarg, "tcp:")))
682                                 protocol = IPPROTO_TCP;
683                         else if ((e = startswith(optarg, "udp:")))
684                                 protocol = IPPROTO_UDP;
685                         else {
686                                 e = optarg;
687                                 protocol = IPPROTO_TCP;
688                         }
689
690                         split = strchr(e, ':');
691                         if (split) {
692                                 char v[split - e + 1];
693
694                                 memcpy(v, e, split - e);
695                                 v[split - e] = 0;
696
697                                 r = safe_atou16(v, &host_port);
698                                 if (r < 0 || host_port <= 0) {
699                                         log_error("Failed to parse host port: %s", optarg);
700                                         return -EINVAL;
701                                 }
702
703                                 r = safe_atou16(split + 1, &container_port);
704                         } else {
705                                 r = safe_atou16(e, &container_port);
706                                 host_port = container_port;
707                         }
708
709                         if (r < 0 || container_port <= 0) {
710                                 log_error("Failed to parse host port: %s", optarg);
711                                 return -EINVAL;
712                         }
713
714                         LIST_FOREACH(ports, p, arg_expose_ports) {
715                                 if (p->protocol == protocol && p->host_port == host_port) {
716                                         log_error("Duplicate port specification: %s", optarg);
717                                         return -EINVAL;
718                                 }
719                         }
720
721                         p = new(ExposePort, 1);
722                         if (!p)
723                                 return log_oom();
724
725                         p->protocol = protocol;
726                         p->host_port = host_port;
727                         p->container_port = container_port;
728
729                         LIST_PREPEND(ports, arg_expose_ports, p);
730
731                         break;
732                 }
733
734                 case '?':
735                         return -EINVAL;
736
737                 default:
738                         assert_not_reached("Unhandled option");
739                 }
740
741         if (arg_share_system)
742                 arg_register = false;
743
744         if (arg_boot && arg_share_system) {
745                 log_error("--boot and --share-system may not be combined.");
746                 return -EINVAL;
747         }
748
749         if (arg_keep_unit && cg_pid_get_owner_uid(0, NULL) >= 0) {
750                 log_error("--keep-unit may not be used when invoked from a user session.");
751                 return -EINVAL;
752         }
753
754         if (arg_directory && arg_image) {
755                 log_error("--directory= and --image= may not be combined.");
756                 return -EINVAL;
757         }
758
759         if (arg_template && arg_image) {
760                 log_error("--template= and --image= may not be combined.");
761                 return -EINVAL;
762         }
763
764         if (arg_template && !(arg_directory || arg_machine)) {
765                 log_error("--template= needs --directory= or --machine=.");
766                 return -EINVAL;
767         }
768
769         if (arg_ephemeral && arg_template) {
770                 log_error("--ephemeral and --template= may not be combined.");
771                 return -EINVAL;
772         }
773
774         if (arg_ephemeral && arg_image) {
775                 log_error("--ephemeral and --image= may not be combined.");
776                 return -EINVAL;
777         }
778
779         if (arg_ephemeral && !IN_SET(arg_link_journal, LINK_NO, LINK_AUTO)) {
780                 log_error("--ephemeral and --link-journal= may not be combined.");
781                 return -EINVAL;
782         }
783
784         if (arg_volatile != VOLATILE_NO && arg_read_only) {
785                 log_error("Cannot combine --read-only with --volatile. Note that --volatile already implies a read-only base hierarchy.");
786                 return -EINVAL;
787         }
788
789         if (arg_expose_ports && !arg_private_network) {
790                 log_error("Cannot use --port= without private networking.");
791                 return -EINVAL;
792         }
793
794         arg_retain = (arg_retain | plus | (arg_private_network ? 1ULL << CAP_NET_ADMIN : 0)) & ~minus;
795
796         return 1;
797 }
798
799 static int mount_all(const char *dest) {
800
801         typedef struct MountPoint {
802                 const char *what;
803                 const char *where;
804                 const char *type;
805                 const char *options;
806                 unsigned long flags;
807                 bool fatal;
808         } MountPoint;
809
810         static const MountPoint mount_table[] = {
811                 { "proc",      "/proc",     "proc",  NULL,        MS_NOSUID|MS_NOEXEC|MS_NODEV,           true  },
812                 { "/proc/sys", "/proc/sys", NULL,    NULL,        MS_BIND,                                true  },   /* Bind mount first */
813                 { NULL,        "/proc/sys", NULL,    NULL,        MS_BIND|MS_RDONLY|MS_REMOUNT,           true  },   /* Then, make it r/o */
814                 { "sysfs",     "/sys",      "sysfs", NULL,        MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true  },
815                 { "tmpfs",     "/dev",      "tmpfs", "mode=755",  MS_NOSUID|MS_STRICTATIME,               true  },
816                 { "devpts",    "/dev/pts",  "devpts","newinstance,ptmxmode=0666,mode=620,gid=" STRINGIFY(TTY_GID), MS_NOSUID|MS_NOEXEC, true },
817                 { "tmpfs",     "/dev/shm",  "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME,      true  },
818                 { "tmpfs",     "/run",      "tmpfs", "mode=755",  MS_NOSUID|MS_NODEV|MS_STRICTATIME,      true  },
819                 { "tmpfs",     "/tmp",      "tmpfs", "mode=1777", MS_STRICTATIME,                         true  },
820 #ifdef HAVE_SELINUX
821                 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND,                              false },  /* Bind mount first */
822                 { NULL,              "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT,         false },  /* Then, make it r/o */
823 #endif
824         };
825
826         unsigned k;
827         int r = 0;
828
829         for (k = 0; k < ELEMENTSOF(mount_table); k++) {
830                 _cleanup_free_ char *where = NULL;
831 #ifdef HAVE_SELINUX
832                 _cleanup_free_ char *options = NULL;
833 #endif
834                 const char *o;
835                 int t;
836
837                 where = strjoin(dest, "/", mount_table[k].where, NULL);
838                 if (!where)
839                         return log_oom();
840
841                 t = path_is_mount_point(where, true);
842                 if (t < 0) {
843                         log_error_errno(t, "Failed to detect whether %s is a mount point: %m", where);
844
845                         if (r == 0)
846                                 r = t;
847
848                         continue;
849                 }
850
851                 /* Skip this entry if it is not a remount. */
852                 if (mount_table[k].what && t > 0)
853                         continue;
854
855                 t = mkdir_p(where, 0755);
856                 if (t < 0) {
857                         if (mount_table[k].fatal) {
858                                log_error_errno(t, "Failed to create directory %s: %m", where);
859
860                                 if (r == 0)
861                                         r = t;
862                         } else
863                                log_warning_errno(t, "Failed to create directory %s: %m", where);
864
865                         continue;
866                 }
867
868 #ifdef HAVE_SELINUX
869                 if (arg_selinux_apifs_context &&
870                     (streq_ptr(mount_table[k].what, "tmpfs") || streq_ptr(mount_table[k].what, "devpts"))) {
871                         options = strjoin(mount_table[k].options, ",context=\"", arg_selinux_apifs_context, "\"", NULL);
872                         if (!options)
873                                 return log_oom();
874
875                         o = options;
876                 } else
877 #endif
878                         o = mount_table[k].options;
879
880
881                 if (mount(mount_table[k].what,
882                           where,
883                           mount_table[k].type,
884                           mount_table[k].flags,
885                           o) < 0) {
886
887                         if (mount_table[k].fatal) {
888                                 log_error_errno(errno, "mount(%s) failed: %m", where);
889
890                                 if (r == 0)
891                                         r = -errno;
892                         } else
893                                 log_warning_errno(errno, "mount(%s) failed: %m", where);
894                 }
895         }
896
897         return r;
898 }
899
900 static int mount_binds(const char *dest, char **l, bool ro) {
901         char **x, **y;
902
903         STRV_FOREACH_PAIR(x, y, l) {
904                 _cleanup_free_ char *where = NULL;
905                 struct stat source_st, dest_st;
906                 int r;
907
908                 if (stat(*x, &source_st) < 0)
909                         return log_error_errno(errno, "Failed to stat %s: %m", *x);
910
911                 where = strappend(dest, *y);
912                 if (!where)
913                         return log_oom();
914
915                 r = stat(where, &dest_st);
916                 if (r == 0) {
917                         if (S_ISDIR(source_st.st_mode) && !S_ISDIR(dest_st.st_mode)) {
918                                 log_error("Cannot bind mount directory %s on file %s.", *x, where);
919                                 return -EINVAL;
920                         }
921                         if (!S_ISDIR(source_st.st_mode) && S_ISDIR(dest_st.st_mode)) {
922                                 log_error("Cannot bind mount file %s on directory %s.", *x, where);
923                                 return -EINVAL;
924                         }
925                 } else if (errno == ENOENT) {
926                         r = mkdir_parents_label(where, 0755);
927                         if (r < 0)
928                                 return log_error_errno(r, "Failed to bind mount %s: %m", *x);
929                 } else {
930                         log_error_errno(errno, "Failed to bind mount %s: %m", *x);
931                         return -errno;
932                 }
933
934                 /* Create the mount point. Any non-directory file can be
935                  * mounted on any non-directory file (regular, fifo, socket,
936                  * char, block).
937                  */
938                 if (S_ISDIR(source_st.st_mode)) {
939                         r = mkdir_label(where, 0755);
940                         if (r < 0 && errno != EEXIST)
941                                 return log_error_errno(r, "Failed to create mount point %s: %m", where);
942                 } else {
943                         r = touch(where);
944                         if (r < 0)
945                                 return log_error_errno(r, "Failed to create mount point %s: %m", where);
946                 }
947
948                 if (mount(*x, where, "bind", MS_BIND, NULL) < 0)
949                         return log_error_errno(errno, "mount(%s) failed: %m", where);
950
951                 if (ro) {
952                         r = bind_remount_recursive(where, true);
953                         if (r < 0)
954                                 return log_error_errno(r, "Read-Only bind mount failed: %m");
955                 }
956         }
957
958         return 0;
959 }
960
/* Mounts one cgroup hierarchy at <dest>/sys/fs/cgroup/<hierarchy>,
 * passing "controller" as the mount options string.
 *
 * Returns 0 if the location is a mount point already, 1 if a new mount
 * was established, and a negative errno-style value on failure. If
 * "read_only" is set the freshly created mount is additionally
 * remounted read-only. */
static int mount_cgroup_hierarchy(const char *dest, const char *controller, const char *hierarchy, bool read_only) {
        char *path;
        int q;

        path = strjoina(dest, "/sys/fs/cgroup/", hierarchy);

        q = path_is_mount_point(path, false);
        if (q < 0)
                return log_error_errno(q, "Failed to determine if %s is mounted already: %m", path);
        if (q > 0)
                return 0;

        /* The result is not checked here; a missing directory shows
         * up as a failure of the mount() call right below. */
        mkdir_p(path, 0755);

        /* The superblock has to carry the same options as on the
         * host, which means it must stay writable ... */
        if (mount("cgroup", path, "cgroup", MS_NOSUID|MS_NOEXEC|MS_NODEV, controller) < 0)
                return log_error_errno(errno, "Failed to mount to %s: %m", path);

        if (!read_only)
                return 1;

        /* ... so read-only is applied to the bind mount only, never
         * to the superblock itself. */
        if (mount(NULL, path, NULL, MS_BIND|MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_RDONLY, NULL) < 0)
                return log_error_errno(errno, "Failed to remount %s read-only: %m", path);

        return 1;
}
988
989 static int mount_cgroup(const char *dest) {
990         _cleanup_set_free_free_ Set *controllers = NULL;
991         _cleanup_free_ char *own_cgroup_path = NULL;
992         const char *cgroup_root, *systemd_root, *systemd_own;
993         int r;
994
995         controllers = set_new(&string_hash_ops);
996         if (!controllers)
997                 return log_oom();
998
999         r = cg_kernel_controllers(controllers);
1000         if (r < 0)
1001                 return log_error_errno(r, "Failed to determine cgroup controllers: %m");
1002
1003         r = cg_pid_get_path(NULL, 0, &own_cgroup_path);
1004         if (r < 0)
1005                 return log_error_errno(r, "Failed to determine our own cgroup path: %m");
1006
1007         cgroup_root = strjoina(dest, "/sys/fs/cgroup");
1008         if (mount("tmpfs", cgroup_root, "tmpfs", MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME, "mode=755") < 0)
1009                 return log_error_errno(errno, "Failed to mount tmpfs to /sys/fs/cgroup: %m");
1010
1011         for (;;) {
1012                 _cleanup_free_ char *controller = NULL, *origin = NULL, *combined = NULL;
1013
1014                 controller = set_steal_first(controllers);
1015                 if (!controller)
1016                         break;
1017
1018                 origin = strappend("/sys/fs/cgroup/", controller);
1019                 if (!origin)
1020                         return log_oom();
1021
1022                 r = readlink_malloc(origin, &combined);
1023                 if (r == -EINVAL) {
1024                         /* Not a symbolic link, but directly a single cgroup hierarchy */
1025
1026                         r = mount_cgroup_hierarchy(dest, controller, controller, true);
1027                         if (r < 0)
1028                                 return r;
1029
1030                 } else if (r < 0)
1031                         return log_error_errno(r, "Failed to read link %s: %m", origin);
1032                 else {
1033                         _cleanup_free_ char *target = NULL;
1034
1035                         target = strjoin(dest, "/sys/fs/cgroup/", controller, NULL);
1036                         if (!target)
1037                                 return log_oom();
1038
1039                         /* A symbolic link, a combination of controllers in one hierarchy */
1040
1041                         if (!filename_is_valid(combined)) {
1042                                 log_warning("Ignoring invalid combined hierarchy %s.", combined);
1043                                 continue;
1044                         }
1045
1046                         r = mount_cgroup_hierarchy(dest, combined, combined, true);
1047                         if (r < 0)
1048                                 return r;
1049
1050                         if (symlink(combined, target) < 0)
1051                                 return log_error_errno(errno, "Failed to create symlink for combined hierarchy: %m");
1052                 }
1053         }
1054
1055         r = mount_cgroup_hierarchy(dest, "name=systemd,xattr", "systemd", false);
1056         if (r < 0)
1057                 return r;
1058
1059         /* Make our own cgroup a (writable) bind mount */
1060         systemd_own = strjoina(dest, "/sys/fs/cgroup/systemd", own_cgroup_path);
1061         if (mount(systemd_own, systemd_own,  NULL, MS_BIND, NULL) < 0)
1062                 return log_error_errno(errno, "Failed to turn %s into a bind mount: %m", own_cgroup_path);
1063
1064         /* And then remount the systemd cgroup root read-only */
1065         systemd_root = strjoina(dest, "/sys/fs/cgroup/systemd");
1066         if (mount(NULL, systemd_root, NULL, MS_BIND|MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_RDONLY, NULL) < 0)
1067                 return log_error_errno(errno, "Failed to mount cgroup root read-only: %m");
1068
1069         if (mount(NULL, cgroup_root, NULL, MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME|MS_RDONLY, "mode=755") < 0)
1070                 return log_error_errno(errno, "Failed to remount %s read-only: %m", cgroup_root);
1071
1072         return 0;
1073 }
1074
1075 static int mount_tmpfs(const char *dest) {
1076         char **i, **o;
1077
1078         STRV_FOREACH_PAIR(i, o, arg_tmpfs) {
1079                 _cleanup_free_ char *where = NULL;
1080                 int r;
1081
1082                 where = strappend(dest, *i);
1083                 if (!where)
1084                         return log_oom();
1085
1086                 r = mkdir_label(where, 0755);
1087                 if (r < 0 && r != -EEXIST)
1088                         return log_error_errno(r, "Creating mount point for tmpfs %s failed: %m", where);
1089
1090                 if (mount("tmpfs", where, "tmpfs", MS_NODEV|MS_STRICTATIME, *o) < 0)
1091                         return log_error_errno(errno, "tmpfs mount to %s failed: %m", where);
1092         }
1093
1094         return 0;
1095 }
1096
1097 static int setup_timezone(const char *dest) {
1098         _cleanup_free_ char *where = NULL, *p = NULL, *q = NULL, *check = NULL, *what = NULL;
1099         char *z, *y;
1100         int r;
1101
1102         assert(dest);
1103
1104         /* Fix the timezone, if possible */
1105         r = readlink_malloc("/etc/localtime", &p);
1106         if (r < 0) {
1107                 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
1108                 return 0;
1109         }
1110
1111         z = path_startswith(p, "../usr/share/zoneinfo/");
1112         if (!z)
1113                 z = path_startswith(p, "/usr/share/zoneinfo/");
1114         if (!z) {
1115                 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
1116                 return 0;
1117         }
1118
1119         where = strappend(dest, "/etc/localtime");
1120         if (!where)
1121                 return log_oom();
1122
1123         r = readlink_malloc(where, &q);
1124         if (r >= 0) {
1125                 y = path_startswith(q, "../usr/share/zoneinfo/");
1126                 if (!y)
1127                         y = path_startswith(q, "/usr/share/zoneinfo/");
1128
1129                 /* Already pointing to the right place? Then do nothing .. */
1130                 if (y && streq(y, z))
1131                         return 0;
1132         }
1133
1134         check = strjoin(dest, "/usr/share/zoneinfo/", z, NULL);
1135         if (!check)
1136                 return log_oom();
1137
1138         if (access(check, F_OK) < 0) {
1139                 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
1140                 return 0;
1141         }
1142
1143         what = strappend("../usr/share/zoneinfo/", z);
1144         if (!what)
1145                 return log_oom();
1146
1147         r = mkdir_parents(where, 0755);
1148         if (r < 0) {
1149                 log_error_errno(r, "Failed to create directory for timezone info %s in container: %m", where);
1150
1151                 return 0;
1152         }
1153
1154         r = unlink(where);
1155         if (r < 0 && errno != ENOENT) {
1156                 log_error_errno(errno, "Failed to remove existing timezone info %s in container: %m", where);
1157
1158                 return 0;
1159         }
1160
1161         if (symlink(what, where) < 0) {
1162                 log_error_errno(errno, "Failed to correct timezone of container: %m");
1163                 return 0;
1164         }
1165
1166         return 0;
1167 }
1168
1169 static int setup_resolv_conf(const char *dest) {
1170         _cleanup_free_ char *where = NULL;
1171         int r;
1172
1173         assert(dest);
1174
1175         if (arg_private_network)
1176                 return 0;
1177
1178         /* Fix resolv.conf, if possible */
1179         where = strappend(dest, "/etc/resolv.conf");
1180         if (!where)
1181                 return log_oom();
1182
1183         /* We don't really care for the results of this really. If it
1184          * fails, it fails, but meh... */
1185         r = mkdir_parents(where, 0755);
1186         if (r < 0) {
1187                 log_warning_errno(r, "Failed to create parent directory for resolv.conf %s: %m", where);
1188
1189                 return 0;
1190         }
1191
1192         r = copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW, 0644, 0);
1193         if (r < 0) {
1194                 log_warning_errno(r, "Failed to copy /etc/resolv.conf to %s: %m", where);
1195
1196                 return 0;
1197         }
1198
1199         return 0;
1200 }
1201
1202 static int setup_volatile_state(const char *directory) {
1203         const char *p;
1204         int r;
1205
1206         assert(directory);
1207
1208         if (arg_volatile != VOLATILE_STATE)
1209                 return 0;
1210
1211         /* --volatile=state means we simply overmount /var
1212            with a tmpfs, and the rest read-only. */
1213
1214         r = bind_remount_recursive(directory, true);
1215         if (r < 0)
1216                 return log_error_errno(r, "Failed to remount %s read-only: %m", directory);
1217
1218         p = strjoina(directory, "/var");
1219         r = mkdir(p, 0755);
1220         if (r < 0 && errno != EEXIST)
1221                 return log_error_errno(errno, "Failed to create %s: %m", directory);
1222
1223         if (mount("tmpfs", p, "tmpfs", MS_STRICTATIME, "mode=755") < 0)
1224                 return log_error_errno(errno, "Failed to mount tmpfs to /var: %m");
1225
1226         return 0;
1227 }
1228
1229 static int setup_volatile(const char *directory) {
1230         bool tmpfs_mounted = false, bind_mounted = false;
1231         char template[] = "/tmp/nspawn-volatile-XXXXXX";
1232         const char *f, *t;
1233         int r;
1234
1235         assert(directory);
1236
1237         if (arg_volatile != VOLATILE_YES)
1238                 return 0;
1239
1240         /* --volatile=yes means we mount a tmpfs to the root dir, and
1241            the original /usr to use inside it, and that read-only. */
1242
1243         if (!mkdtemp(template))
1244                 return log_error_errno(errno, "Failed to create temporary directory: %m");
1245
1246         if (mount("tmpfs", template, "tmpfs", MS_STRICTATIME, "mode=755") < 0) {
1247                 log_error_errno(errno, "Failed to mount tmpfs for root directory: %m");
1248                 r = -errno;
1249                 goto fail;
1250         }
1251
1252         tmpfs_mounted = true;
1253
1254         f = strjoina(directory, "/usr");
1255         t = strjoina(template, "/usr");
1256
1257         r = mkdir(t, 0755);
1258         if (r < 0 && errno != EEXIST) {
1259                 log_error_errno(errno, "Failed to create %s: %m", t);
1260                 r = -errno;
1261                 goto fail;
1262         }
1263
1264         if (mount(f, t, "bind", MS_BIND|MS_REC, NULL) < 0) {
1265                 log_error_errno(errno, "Failed to create /usr bind mount: %m");
1266                 r = -errno;
1267                 goto fail;
1268         }
1269
1270         bind_mounted = true;
1271
1272         r = bind_remount_recursive(t, true);
1273         if (r < 0) {
1274                 log_error_errno(r, "Failed to remount %s read-only: %m", t);
1275                 goto fail;
1276         }
1277
1278         if (mount(template, directory, NULL, MS_MOVE, NULL) < 0) {
1279                 log_error_errno(errno, "Failed to move root mount: %m");
1280                 r = -errno;
1281                 goto fail;
1282         }
1283
1284         rmdir(template);
1285
1286         return 0;
1287
1288 fail:
1289         if (bind_mounted)
1290                 umount(t);
1291         if (tmpfs_mounted)
1292                 umount(template);
1293         rmdir(template);
1294         return r;
1295 }
1296
1297 static char* id128_format_as_uuid(sd_id128_t id, char s[37]) {
1298
1299         snprintf(s, 37,
1300                  "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
1301                  SD_ID128_FORMAT_VAL(id));
1302
1303         return s;
1304 }
1305
1306 static int setup_boot_id(const char *dest) {
1307         _cleanup_free_ char *from = NULL, *to = NULL;
1308         sd_id128_t rnd = {};
1309         char as_uuid[37];
1310         int r;
1311
1312         assert(dest);
1313
1314         if (arg_share_system)
1315                 return 0;
1316
1317         /* Generate a new randomized boot ID, so that each boot-up of
1318          * the container gets a new one */
1319
1320         from = strappend(dest, "/dev/proc-sys-kernel-random-boot-id");
1321         to = strappend(dest, "/proc/sys/kernel/random/boot_id");
1322         if (!from || !to)
1323                 return log_oom();
1324
1325         r = sd_id128_randomize(&rnd);
1326         if (r < 0)
1327                 return log_error_errno(r, "Failed to generate random boot id: %m");
1328
1329         id128_format_as_uuid(rnd, as_uuid);
1330
1331         r = write_string_file(from, as_uuid);
1332         if (r < 0)
1333                 return log_error_errno(r, "Failed to write boot id: %m");
1334
1335         if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
1336                 log_error_errno(errno, "Failed to bind mount boot id: %m");
1337                 r = -errno;
1338         } else if (mount(from, to, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL))
1339                 log_warning_errno(errno, "Failed to make boot id read-only: %m");
1340
1341         unlink(from);
1342         return r;
1343 }
1344
1345 static int copy_devnodes(const char *dest) {
1346
1347         static const char devnodes[] =
1348                 "null\0"
1349                 "zero\0"
1350                 "full\0"
1351                 "random\0"
1352                 "urandom\0"
1353                 "tty\0"
1354                 "net/tun\0";
1355
1356         const char *d;
1357         int r = 0;
1358         _cleanup_umask_ mode_t u;
1359
1360         assert(dest);
1361
1362         u = umask(0000);
1363
1364         NULSTR_FOREACH(d, devnodes) {
1365                 _cleanup_free_ char *from = NULL, *to = NULL;
1366                 struct stat st;
1367
1368                 from = strappend("/dev/", d);
1369                 to = strjoin(dest, "/dev/", d, NULL);
1370                 if (!from || !to)
1371                         return log_oom();
1372
1373                 if (stat(from, &st) < 0) {
1374
1375                         if (errno != ENOENT)
1376                                 return log_error_errno(errno, "Failed to stat %s: %m", from);
1377
1378                 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
1379
1380                         log_error("%s is not a char or block device, cannot copy", from);
1381                         return -EIO;
1382
1383                 } else {
1384                         r = mkdir_parents(to, 0775);
1385                         if (r < 0) {
1386                                 log_error_errno(r, "Failed to create parent directory of %s: %m", to);
1387                                 return -r;
1388                         }
1389
1390                         if (mknod(to, st.st_mode, st.st_rdev) < 0)
1391                                 return log_error_errno(errno, "mknod(%s) failed: %m", to);
1392                 }
1393         }
1394
1395         return r;
1396 }
1397
1398 static int setup_ptmx(const char *dest) {
1399         _cleanup_free_ char *p = NULL;
1400
1401         p = strappend(dest, "/dev/ptmx");
1402         if (!p)
1403                 return log_oom();
1404
1405         if (symlink("pts/ptmx", p) < 0)
1406                 return log_error_errno(errno, "Failed to create /dev/ptmx symlink: %m");
1407
1408         return 0;
1409 }
1410
1411 static int setup_dev_console(const char *dest, const char *console) {
1412         _cleanup_umask_ mode_t u;
1413         const char *to;
1414         struct stat st;
1415         int r;
1416
1417         assert(dest);
1418         assert(console);
1419
1420         u = umask(0000);
1421
1422         if (stat("/dev/null", &st) < 0)
1423                 return log_error_errno(errno, "Failed to stat /dev/null: %m");
1424
1425         r = chmod_and_chown(console, 0600, 0, 0);
1426         if (r < 0)
1427                 return log_error_errno(r, "Failed to correct access mode for TTY: %m");
1428
1429         /* We need to bind mount the right tty to /dev/console since
1430          * ptys can only exist on pts file systems. To have something
1431          * to bind mount things on we create a device node first, and
1432          * use /dev/null for that since we the cgroups device policy
1433          * allows us to create that freely, while we cannot create
1434          * /dev/console. (Note that the major minor doesn't actually
1435          * matter here, since we mount it over anyway). */
1436
1437         to = strjoina(dest, "/dev/console");
1438         if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0)
1439                 return log_error_errno(errno, "mknod() for /dev/console failed: %m");
1440
1441         if (mount(console, to, "bind", MS_BIND, NULL) < 0)
1442                 return log_error_errno(errno, "Bind mount for /dev/console failed: %m");
1443
1444         return 0;
1445 }
1446
1447 static int setup_kmsg(const char *dest, int kmsg_socket) {
1448         _cleanup_free_ char *from = NULL, *to = NULL;
1449         _cleanup_umask_ mode_t u;
1450         int r, fd, k;
1451         union {
1452                 struct cmsghdr cmsghdr;
1453                 uint8_t buf[CMSG_SPACE(sizeof(int))];
1454         } control = {};
1455         struct msghdr mh = {
1456                 .msg_control = &control,
1457                 .msg_controllen = sizeof(control),
1458         };
1459         struct cmsghdr *cmsg;
1460
1461         assert(dest);
1462         assert(kmsg_socket >= 0);
1463
1464         u = umask(0000);
1465
1466         /* We create the kmsg FIFO as /dev/kmsg, but immediately
1467          * delete it after bind mounting it to /proc/kmsg. While FIFOs
1468          * on the reading side behave very similar to /proc/kmsg,
1469          * their writing side behaves differently from /dev/kmsg in
1470          * that writing blocks when nothing is reading. In order to
1471          * avoid any problems with containers deadlocking due to this
1472          * we simply make /dev/kmsg unavailable to the container. */
1473         if (asprintf(&from, "%s/dev/kmsg", dest) < 0 ||
1474             asprintf(&to, "%s/proc/kmsg", dest) < 0)
1475                 return log_oom();
1476
1477         if (mkfifo(from, 0600) < 0)
1478                 return log_error_errno(errno, "mkfifo() for /dev/kmsg failed: %m");
1479
1480         r = chmod_and_chown(from, 0600, 0, 0);
1481         if (r < 0)
1482                 return log_error_errno(r, "Failed to correct access mode for /dev/kmsg: %m");
1483
1484         if (mount(from, to, "bind", MS_BIND, NULL) < 0)
1485                 return log_error_errno(errno, "Bind mount for /proc/kmsg failed: %m");
1486
1487         fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
1488         if (fd < 0)
1489                 return log_error_errno(errno, "Failed to open fifo: %m");
1490
1491         cmsg = CMSG_FIRSTHDR(&mh);
1492         cmsg->cmsg_level = SOL_SOCKET;
1493         cmsg->cmsg_type = SCM_RIGHTS;
1494         cmsg->cmsg_len = CMSG_LEN(sizeof(int));
1495         memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
1496
1497         mh.msg_controllen = cmsg->cmsg_len;
1498
1499         /* Store away the fd in the socket, so that it stays open as
1500          * long as we run the child */
1501         k = sendmsg(kmsg_socket, &mh, MSG_NOSIGNAL);
1502         safe_close(fd);
1503
1504         if (k < 0)
1505                 return log_error_errno(errno, "Failed to send FIFO fd: %m");
1506
1507         /* And now make the FIFO unavailable as /dev/kmsg... */
1508         unlink(from);
1509         return 0;
1510 }
1511
1512 static int send_rtnl(int send_fd) {
1513         union {
1514                 struct cmsghdr cmsghdr;
1515                 uint8_t buf[CMSG_SPACE(sizeof(int))];
1516         } control = {};
1517         struct msghdr mh = {
1518                 .msg_control = &control,
1519                 .msg_controllen = sizeof(control),
1520         };
1521         struct cmsghdr *cmsg;
1522         _cleanup_close_ int fd = -1;
1523         ssize_t k;
1524
1525         assert(send_fd >= 0);
1526
1527         if (!arg_expose_ports)
1528                 return 0;
1529
1530         fd = socket(PF_NETLINK, SOCK_RAW|SOCK_CLOEXEC|SOCK_NONBLOCK, NETLINK_ROUTE);
1531         if (fd < 0)
1532                 return log_error_errno(errno, "failed to allocate container netlink: %m");
1533
1534         cmsg = CMSG_FIRSTHDR(&mh);
1535         cmsg->cmsg_level = SOL_SOCKET;
1536         cmsg->cmsg_type = SCM_RIGHTS;
1537         cmsg->cmsg_len = CMSG_LEN(sizeof(int));
1538         memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
1539
1540         mh.msg_controllen = cmsg->cmsg_len;
1541
1542         /* Store away the fd in the socket, so that it stays open as
1543          * long as we run the child */
1544         k = sendmsg(send_fd, &mh, MSG_NOSIGNAL);
1545         if (k < 0)
1546                 return log_error_errno(errno, "Failed to send netlink fd: %m");
1547
1548         return 0;
1549 }
1550
1551 static int flush_ports(union in_addr_union *exposed) {
1552         ExposePort *p;
1553         int r, af = AF_INET;
1554
1555         assert(exposed);
1556
1557         if (!arg_expose_ports)
1558                 return 0;
1559
1560         if (in_addr_is_null(af, exposed))
1561                 return 0;
1562
1563         log_debug("Lost IP address.");
1564
1565         LIST_FOREACH(ports, p, arg_expose_ports) {
1566                 r = fw_add_local_dnat(false,
1567                                       af,
1568                                       p->protocol,
1569                                       NULL,
1570                                       NULL, 0,
1571                                       NULL, 0,
1572                                       p->host_port,
1573                                       exposed,
1574                                       p->container_port,
1575                                       NULL);
1576                 if (r < 0)
1577                         log_warning_errno(r, "Failed to modify firewall: %m");
1578         }
1579
1580         *exposed = IN_ADDR_NULL;
1581         return 0;
1582 }
1583
1584 static int expose_ports(sd_rtnl *rtnl, union in_addr_union *exposed) {
1585         _cleanup_free_ struct local_address *addresses = NULL;
1586         _cleanup_free_ char *pretty = NULL;
1587         union in_addr_union new_exposed;
1588         ExposePort *p;
1589         bool add;
1590         int af = AF_INET, r;
1591
1592         assert(exposed);
1593
1594         /* Invoked each time an address is added or removed inside the
1595          * container */
1596
1597         if (!arg_expose_ports)
1598                 return 0;
1599
1600         r = local_addresses(rtnl, 0, af, &addresses);
1601         if (r < 0)
1602                 return log_error_errno(r, "Failed to enumerate local addresses: %m");
1603
1604         add = r > 0 &&
1605                 addresses[0].family == af &&
1606                 addresses[0].scope < RT_SCOPE_LINK;
1607
1608         if (!add)
1609                 return flush_ports(exposed);
1610
1611         new_exposed = addresses[0].address;
1612         if (in_addr_equal(af, exposed, &new_exposed))
1613                 return 0;
1614
1615         in_addr_to_string(af, &new_exposed, &pretty);
1616         log_debug("New container IP is %s.", strna(pretty));
1617
1618         LIST_FOREACH(ports, p, arg_expose_ports) {
1619
1620                 r = fw_add_local_dnat(true,
1621                                       af,
1622                                       p->protocol,
1623                                       NULL,
1624                                       NULL, 0,
1625                                       NULL, 0,
1626                                       p->host_port,
1627                                       &new_exposed,
1628                                       p->container_port,
1629                                       in_addr_is_null(af, exposed) ? NULL : exposed);
1630                 if (r < 0)
1631                         log_warning_errno(r, "Failed to modify firewall: %m");
1632         }
1633
1634         *exposed = new_exposed;
1635         return 0;
1636 }
1637
1638 static int on_address_change(sd_rtnl *rtnl, sd_rtnl_message *m, void *userdata) {
1639         union in_addr_union *exposed = userdata;
1640
1641         assert(rtnl);
1642         assert(m);
1643         assert(exposed);
1644
1645         expose_ports(rtnl, exposed);
1646         return 0;
1647 }
1648
1649 static int watch_rtnl(sd_event *event, int recv_fd, union in_addr_union *exposed, sd_rtnl **ret) {
1650         union {
1651                 struct cmsghdr cmsghdr;
1652                 uint8_t buf[CMSG_SPACE(sizeof(int))];
1653         } control = {};
1654         struct msghdr mh = {
1655                 .msg_control = &control,
1656                 .msg_controllen = sizeof(control),
1657         };
1658         struct cmsghdr *cmsg;
1659         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1660         int fd, r;
1661         ssize_t k;
1662
1663         assert(event);
1664         assert(recv_fd >= 0);
1665         assert(ret);
1666
1667         if (!arg_expose_ports)
1668                 return 0;
1669
1670         k = recvmsg(recv_fd, &mh, MSG_NOSIGNAL);
1671         if (k < 0)
1672                 return log_error_errno(errno, "Failed to recv netlink fd: %m");
1673
1674         cmsg = CMSG_FIRSTHDR(&mh);
1675         assert(cmsg->cmsg_level == SOL_SOCKET);
1676         assert(cmsg->cmsg_type == SCM_RIGHTS);
1677         assert(cmsg->cmsg_len == CMSG_LEN(sizeof(int)));
1678         memcpy(&fd, CMSG_DATA(cmsg), sizeof(int));
1679
1680         r = sd_rtnl_open_fd(&rtnl, fd, 1, RTNLGRP_IPV4_IFADDR);
1681         if (r < 0) {
1682                 safe_close(fd);
1683                 return log_error_errno(r, "Failed to create rtnl object: %m");
1684         }
1685
1686         r = sd_rtnl_add_match(rtnl, RTM_NEWADDR, on_address_change, exposed);
1687         if (r < 0)
1688                 return log_error_errno(r, "Failed to subscribe to RTM_NEWADDR messages: %m");
1689
1690         r = sd_rtnl_add_match(rtnl, RTM_DELADDR, on_address_change, exposed);
1691         if (r < 0)
1692                 return log_error_errno(r, "Failed to subscribe to RTM_DELADDR messages: %m");
1693
1694         r = sd_rtnl_attach_event(rtnl, event, 0);
1695         if (r < 0)
1696                 return log_error_errno(r, "Failed to add to even loop: %m");
1697
1698         *ret = rtnl;
1699         rtnl = NULL;
1700
1701         return 0;
1702 }
1703
1704 static int setup_hostname(void) {
1705
1706         if (arg_share_system)
1707                 return 0;
1708
1709         if (sethostname_idempotent(arg_machine) < 0)
1710                 return -errno;
1711
1712         return 0;
1713 }
1714
1715 static int setup_journal(const char *directory) {
1716         sd_id128_t machine_id, this_id;
1717         _cleanup_free_ char *p = NULL, *b = NULL, *q = NULL, *d = NULL;
1718         char *id;
1719         int r;
1720
1721         /* Don't link journals in ephemeral mode */
1722         if (arg_ephemeral)
1723                 return 0;
1724
1725         p = strappend(directory, "/etc/machine-id");
1726         if (!p)
1727                 return log_oom();
1728
1729         r = read_one_line_file(p, &b);
1730         if (r == -ENOENT && arg_link_journal == LINK_AUTO)
1731                 return 0;
1732         else if (r < 0)
1733                 return log_error_errno(r, "Failed to read machine ID from %s: %m", p);
1734
1735         id = strstrip(b);
1736         if (isempty(id) && arg_link_journal == LINK_AUTO)
1737                 return 0;
1738
1739         /* Verify validity */
1740         r = sd_id128_from_string(id, &machine_id);
1741         if (r < 0)
1742                 return log_error_errno(r, "Failed to parse machine ID from %s: %m", p);
1743
1744         r = sd_id128_get_machine(&this_id);
1745         if (r < 0)
1746                 return log_error_errno(r, "Failed to retrieve machine ID: %m");
1747
1748         if (sd_id128_equal(machine_id, this_id)) {
1749                 log_full(arg_link_journal == LINK_AUTO ? LOG_WARNING : LOG_ERR,
1750                          "Host and machine ids are equal (%s): refusing to link journals", id);
1751                 if (arg_link_journal == LINK_AUTO)
1752                         return 0;
1753                 return -EEXIST;
1754         }
1755
1756         if (arg_link_journal == LINK_NO)
1757                 return 0;
1758
1759         free(p);
1760         p = strappend("/var/log/journal/", id);
1761         q = strjoin(directory, "/var/log/journal/", id, NULL);
1762         if (!p || !q)
1763                 return log_oom();
1764
1765         if (path_is_mount_point(p, false) > 0) {
1766                 if (arg_link_journal != LINK_AUTO) {
1767                         log_error("%s: already a mount point, refusing to use for journal", p);
1768                         return -EEXIST;
1769                 }
1770
1771                 return 0;
1772         }
1773
1774         if (path_is_mount_point(q, false) > 0) {
1775                 if (arg_link_journal != LINK_AUTO) {
1776                         log_error("%s: already a mount point, refusing to use for journal", q);
1777                         return -EEXIST;
1778                 }
1779
1780                 return 0;
1781         }
1782
1783         r = readlink_and_make_absolute(p, &d);
1784         if (r >= 0) {
1785                 if ((arg_link_journal == LINK_GUEST ||
1786                      arg_link_journal == LINK_AUTO) &&
1787                     path_equal(d, q)) {
1788
1789                         r = mkdir_p(q, 0755);
1790                         if (r < 0)
1791                                 log_warning_errno(errno, "Failed to create directory %s: %m", q);
1792                         return 0;
1793                 }
1794
1795                 if (unlink(p) < 0)
1796                         return log_error_errno(errno, "Failed to remove symlink %s: %m", p);
1797         } else if (r == -EINVAL) {
1798
1799                 if (arg_link_journal == LINK_GUEST &&
1800                     rmdir(p) < 0) {
1801
1802                         if (errno == ENOTDIR) {
1803                                 log_error("%s already exists and is neither a symlink nor a directory", p);
1804                                 return r;
1805                         } else {
1806                                 log_error_errno(errno, "Failed to remove %s: %m", p);
1807                                 return -errno;
1808                         }
1809                 }
1810         } else if (r != -ENOENT) {
1811                 log_error_errno(errno, "readlink(%s) failed: %m", p);
1812                 return r;
1813         }
1814
1815         if (arg_link_journal == LINK_GUEST) {
1816
1817                 if (symlink(q, p) < 0) {
1818                         if (arg_link_journal_try) {
1819                                 log_debug_errno(errno, "Failed to symlink %s to %s, skipping journal setup: %m", q, p);
1820                                 return 0;
1821                         } else {
1822                                 log_error_errno(errno, "Failed to symlink %s to %s: %m", q, p);
1823                                 return -errno;
1824                         }
1825                 }
1826
1827                 r = mkdir_p(q, 0755);
1828                 if (r < 0)
1829                         log_warning_errno(errno, "Failed to create directory %s: %m", q);
1830                 return 0;
1831         }
1832
1833         if (arg_link_journal == LINK_HOST) {
1834                 /* don't create parents here -- if the host doesn't have
1835                  * permanent journal set up, don't force it here */
1836                 r = mkdir(p, 0755);
1837                 if (r < 0) {
1838                         if (arg_link_journal_try) {
1839                                 log_debug_errno(errno, "Failed to create %s, skipping journal setup: %m", p);
1840                                 return 0;
1841                         } else {
1842                                 log_error_errno(errno, "Failed to create %s: %m", p);
1843                                 return r;
1844                         }
1845                 }
1846
1847         } else if (access(p, F_OK) < 0)
1848                 return 0;
1849
1850         if (dir_is_empty(q) == 0)
1851                 log_warning("%s is not empty, proceeding anyway.", q);
1852
1853         r = mkdir_p(q, 0755);
1854         if (r < 0) {
1855                 log_error_errno(errno, "Failed to create %s: %m", q);
1856                 return r;
1857         }
1858
1859         if (mount(p, q, "bind", MS_BIND, NULL) < 0)
1860                 return log_error_errno(errno, "Failed to bind mount journal from host into guest: %m");
1861
1862         return 0;
1863 }
1864
1865 static int drop_capabilities(void) {
1866         return capability_bounding_set_drop(~arg_retain, false);
1867 }
1868
1869 static int register_machine(pid_t pid, int local_ifindex) {
1870         _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1871         _cleanup_bus_close_unref_ sd_bus *bus = NULL;
1872         int r;
1873
1874         if (!arg_register)
1875                 return 0;
1876
1877         r = sd_bus_default_system(&bus);
1878         if (r < 0)
1879                 return log_error_errno(r, "Failed to open system bus: %m");
1880
1881         if (arg_keep_unit) {
1882                 r = sd_bus_call_method(
1883                                 bus,
1884                                 "org.freedesktop.machine1",
1885                                 "/org/freedesktop/machine1",
1886                                 "org.freedesktop.machine1.Manager",
1887                                 "RegisterMachineWithNetwork",
1888                                 &error,
1889                                 NULL,
1890                                 "sayssusai",
1891                                 arg_machine,
1892                                 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1893                                 "nspawn",
1894                                 "container",
1895                                 (uint32_t) pid,
1896                                 strempty(arg_directory),
1897                                 local_ifindex > 0 ? 1 : 0, local_ifindex);
1898         } else {
1899                 _cleanup_bus_message_unref_ sd_bus_message *m = NULL;
1900
1901                 r = sd_bus_message_new_method_call(
1902                                 bus,
1903                                 &m,
1904                                 "org.freedesktop.machine1",
1905                                 "/org/freedesktop/machine1",
1906                                 "org.freedesktop.machine1.Manager",
1907                                 "CreateMachineWithNetwork");
1908                 if (r < 0)
1909                         return log_error_errno(r, "Failed to create message: %m");
1910
1911                 r = sd_bus_message_append(
1912                                 m,
1913                                 "sayssusai",
1914                                 arg_machine,
1915                                 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1916                                 "nspawn",
1917                                 "container",
1918                                 (uint32_t) pid,
1919                                 strempty(arg_directory),
1920                                 local_ifindex > 0 ? 1 : 0, local_ifindex);
1921                 if (r < 0)
1922                         return log_error_errno(r, "Failed to append message arguments: %m");
1923
1924                 r = sd_bus_message_open_container(m, 'a', "(sv)");
1925                 if (r < 0)
1926                         return log_error_errno(r, "Failed to open container: %m");
1927
1928                 if (!isempty(arg_slice)) {
1929                         r = sd_bus_message_append(m, "(sv)", "Slice", "s", arg_slice);
1930                         if (r < 0)
1931                                 return log_error_errno(r, "Failed to append slice: %m");
1932                 }
1933
1934                 r = sd_bus_message_append(m, "(sv)", "DevicePolicy", "s", "strict");
1935                 if (r < 0)
1936                         return log_error_errno(r, "Failed to add device policy: %m");
1937
1938                 r = sd_bus_message_append(m, "(sv)", "DeviceAllow", "a(ss)", 9,
1939                                           /* Allow the container to
1940                                            * access and create the API
1941                                            * device nodes, so that
1942                                            * PrivateDevices= in the
1943                                            * container can work
1944                                            * fine */
1945                                           "/dev/null", "rwm",
1946                                           "/dev/zero", "rwm",
1947                                           "/dev/full", "rwm",
1948                                           "/dev/random", "rwm",
1949                                           "/dev/urandom", "rwm",
1950                                           "/dev/tty", "rwm",
1951                                           "/dev/net/tun", "rwm",
1952                                           /* Allow the container
1953                                            * access to ptys. However,
1954                                            * do not permit the
1955                                            * container to ever create
1956                                            * these device nodes. */
1957                                           "/dev/pts/ptmx", "rw",
1958                                           "char-pts", "rw");
1959                 if (r < 0)
1960                         return log_error_errno(r, "Failed to add device whitelist: %m");
1961
1962                 r = sd_bus_message_close_container(m);
1963                 if (r < 0)
1964                         return log_error_errno(r, "Failed to close container: %m");
1965
1966                 r = sd_bus_call(bus, m, 0, &error, NULL);
1967         }
1968
1969         if (r < 0) {
1970                 log_error("Failed to register machine: %s", bus_error_message(&error, r));
1971                 return r;
1972         }
1973
1974         return 0;
1975 }
1976
1977 static int terminate_machine(pid_t pid) {
1978         _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1979         _cleanup_bus_message_unref_ sd_bus_message *reply = NULL;
1980         _cleanup_bus_close_unref_ sd_bus *bus = NULL;
1981         const char *path;
1982         int r;
1983
1984         if (!arg_register)
1985                 return 0;
1986
1987         r = sd_bus_default_system(&bus);
1988         if (r < 0)
1989                 return log_error_errno(r, "Failed to open system bus: %m");
1990
1991         r = sd_bus_call_method(
1992                         bus,
1993                         "org.freedesktop.machine1",
1994                         "/org/freedesktop/machine1",
1995                         "org.freedesktop.machine1.Manager",
1996                         "GetMachineByPID",
1997                         &error,
1998                         &reply,
1999                         "u",
2000                         (uint32_t) pid);
2001         if (r < 0) {
2002                 /* Note that the machine might already have been
2003                  * cleaned up automatically, hence don't consider it a
2004                  * failure if we cannot get the machine object. */
2005                 log_debug("Failed to get machine: %s", bus_error_message(&error, r));
2006                 return 0;
2007         }
2008
2009         r = sd_bus_message_read(reply, "o", &path);
2010         if (r < 0)
2011                 return bus_log_parse_error(r);
2012
2013         r = sd_bus_call_method(
2014                         bus,
2015                         "org.freedesktop.machine1",
2016                         path,
2017                         "org.freedesktop.machine1.Machine",
2018                         "Terminate",
2019                         &error,
2020                         NULL,
2021                         NULL);
2022         if (r < 0) {
2023                 log_debug("Failed to terminate machine: %s", bus_error_message(&error, r));
2024                 return 0;
2025         }
2026
2027         return 0;
2028 }
2029
2030 static int reset_audit_loginuid(void) {
2031         _cleanup_free_ char *p = NULL;
2032         int r;
2033
2034         if (arg_share_system)
2035                 return 0;
2036
2037         r = read_one_line_file("/proc/self/loginuid", &p);
2038         if (r == -ENOENT)
2039                 return 0;
2040         if (r < 0)
2041                 return log_error_errno(r, "Failed to read /proc/self/loginuid: %m");
2042
2043         /* Already reset? */
2044         if (streq(p, "4294967295"))
2045                 return 0;
2046
2047         r = write_string_file("/proc/self/loginuid", "4294967295");
2048         if (r < 0) {
2049                 log_error("Failed to reset audit login UID. This probably means that your kernel is too\n"
2050                           "old and you have audit enabled. Note that the auditing subsystem is known to\n"
2051                           "be incompatible with containers on old kernels. Please make sure to upgrade\n"
2052                           "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
2053                           "using systemd-nspawn. Sleeping for 5s... (%s)\n", strerror(-r));
2054
2055                 sleep(5);
2056         }
2057
2058         return 0;
2059 }
2060
2061 #define HOST_HASH_KEY SD_ID128_MAKE(1a,37,6f,c7,46,ec,45,0b,ad,a3,d5,31,06,60,5d,b1)
2062 #define CONTAINER_HASH_KEY SD_ID128_MAKE(c3,c4,f9,19,b5,57,b2,1c,e6,cf,14,27,03,9c,ee,a2)
2063 #define MACVLAN_HASH_KEY SD_ID128_MAKE(00,13,6d,bc,66,83,44,81,bb,0c,f9,51,1f,24,a6,6f)
2064
2065 static int generate_mac(struct ether_addr *mac, sd_id128_t hash_key, uint64_t idx) {
2066         uint8_t result[8];
2067         size_t l, sz;
2068         uint8_t *v, *i;
2069         int r;
2070
2071         l = strlen(arg_machine);
2072         sz = sizeof(sd_id128_t) + l;
2073         if (idx > 0)
2074                 sz += sizeof(idx);
2075
2076         v = alloca(sz);
2077
2078         /* fetch some persistent data unique to the host */
2079         r = sd_id128_get_machine((sd_id128_t*) v);
2080         if (r < 0)
2081                 return r;
2082
2083         /* combine with some data unique (on this host) to this
2084          * container instance */
2085         i = mempcpy(v + sizeof(sd_id128_t), arg_machine, l);
2086         if (idx > 0) {
2087                 idx = htole64(idx);
2088                 memcpy(i, &idx, sizeof(idx));
2089         }
2090
2091         /* Let's hash the host machine ID plus the container name. We
2092          * use a fixed, but originally randomly created hash key here. */
2093         siphash24(result, v, sz, hash_key.bytes);
2094
2095         assert_cc(ETH_ALEN <= sizeof(result));
2096         memcpy(mac->ether_addr_octet, result, ETH_ALEN);
2097
2098         /* see eth_random_addr in the kernel */
2099         mac->ether_addr_octet[0] &= 0xfe;        /* clear multicast bit */
2100         mac->ether_addr_octet[0] |= 0x02;        /* set local assignment bit (IEEE802) */
2101
2102         return 0;
2103 }
2104
2105 static int setup_veth(pid_t pid, char iface_name[IFNAMSIZ], int *ifi) {
2106         _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
2107         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
2108         struct ether_addr mac_host, mac_container;
2109         int r, i;
2110
2111         if (!arg_private_network)
2112                 return 0;
2113
2114         if (!arg_network_veth)
2115                 return 0;
2116
2117         /* Use two different interface name prefixes depending whether
2118          * we are in bridge mode or not. */
2119         snprintf(iface_name, IFNAMSIZ - 1, "%s-%s",
2120                  arg_network_bridge ? "vb" : "ve", arg_machine);
2121
2122         r = generate_mac(&mac_container, CONTAINER_HASH_KEY, 0);
2123         if (r < 0)
2124                 return log_error_errno(r, "Failed to generate predictable MAC address for container side: %m");
2125
2126         r = generate_mac(&mac_host, HOST_HASH_KEY, 0);
2127         if (r < 0)
2128                 return log_error_errno(r, "Failed to generate predictable MAC address for host side: %m");
2129
2130         r = sd_rtnl_open(&rtnl, 0);
2131         if (r < 0)
2132                 return log_error_errno(r, "Failed to connect to netlink: %m");
2133
2134         r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
2135         if (r < 0)
2136                 return log_error_errno(r, "Failed to allocate netlink message: %m");
2137
2138         r = sd_rtnl_message_append_string(m, IFLA_IFNAME, iface_name);
2139         if (r < 0)
2140                 return log_error_errno(r, "Failed to add netlink interface name: %m");
2141
2142         r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac_host);
2143         if (r < 0)
2144                 return log_error_errno(r, "Failed to add netlink MAC address: %m");
2145
2146         r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
2147         if (r < 0)
2148                 return log_error_errno(r, "Failed to open netlink container: %m");
2149
2150         r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "veth");
2151         if (r < 0)
2152                 return log_error_errno(r, "Failed to open netlink container: %m");
2153
2154         r = sd_rtnl_message_open_container(m, VETH_INFO_PEER);
2155         if (r < 0)
2156                 return log_error_errno(r, "Failed to open netlink container: %m");
2157
2158         r = sd_rtnl_message_append_string(m, IFLA_IFNAME, "host0");
2159         if (r < 0)
2160                 return log_error_errno(r, "Failed to add netlink interface name: %m");
2161
2162         r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac_container);
2163         if (r < 0)
2164                 return log_error_errno(r, "Failed to add netlink MAC address: %m");
2165
2166         r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
2167         if (r < 0)
2168                 return log_error_errno(r, "Failed to add netlink namespace field: %m");
2169
2170         r = sd_rtnl_message_close_container(m);
2171         if (r < 0)
2172                 return log_error_errno(r, "Failed to close netlink container: %m");
2173
2174         r = sd_rtnl_message_close_container(m);
2175         if (r < 0)
2176                 return log_error_errno(r, "Failed to close netlink container: %m");
2177
2178         r = sd_rtnl_message_close_container(m);
2179         if (r < 0)
2180                 return log_error_errno(r, "Failed to close netlink container: %m");
2181
2182         r = sd_rtnl_call(rtnl, m, 0, NULL);
2183         if (r < 0)
2184                 return log_error_errno(r, "Failed to add new veth interfaces: %m");
2185
2186         i = (int) if_nametoindex(iface_name);
2187         if (i <= 0)
2188                 return log_error_errno(errno, "Failed to resolve interface %s: %m", iface_name);
2189
2190         *ifi = i;
2191
2192         return 0;
2193 }
2194
2195 static int setup_bridge(const char veth_name[], int *ifi) {
2196         _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
2197         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
2198         int r, bridge;
2199
2200         if (!arg_private_network)
2201                 return 0;
2202
2203         if (!arg_network_veth)
2204                 return 0;
2205
2206         if (!arg_network_bridge)
2207                 return 0;
2208
2209         bridge = (int) if_nametoindex(arg_network_bridge);
2210         if (bridge <= 0)
2211                 return log_error_errno(errno, "Failed to resolve interface %s: %m", arg_network_bridge);
2212
2213         *ifi = bridge;
2214
2215         r = sd_rtnl_open(&rtnl, 0);
2216         if (r < 0)
2217                 return log_error_errno(r, "Failed to connect to netlink: %m");
2218
2219         r = sd_rtnl_message_new_link(rtnl, &m, RTM_SETLINK, 0);
2220         if (r < 0)
2221                 return log_error_errno(r, "Failed to allocate netlink message: %m");
2222
2223         r = sd_rtnl_message_link_set_flags(m, IFF_UP, IFF_UP);
2224         if (r < 0)
2225                 return log_error_errno(r, "Failed to set IFF_UP flag: %m");
2226
2227         r = sd_rtnl_message_append_string(m, IFLA_IFNAME, veth_name);
2228         if (r < 0)
2229                 return log_error_errno(r, "Failed to add netlink interface name field: %m");
2230
2231         r = sd_rtnl_message_append_u32(m, IFLA_MASTER, bridge);
2232         if (r < 0)
2233                 return log_error_errno(r, "Failed to add netlink master field: %m");
2234
2235         r = sd_rtnl_call(rtnl, m, 0, NULL);
2236         if (r < 0)
2237                 return log_error_errno(r, "Failed to add veth interface to bridge: %m");
2238
2239         return 0;
2240 }
2241
2242 static int parse_interface(struct udev *udev, const char *name) {
2243         _cleanup_udev_device_unref_ struct udev_device *d = NULL;
2244         char ifi_str[2 + DECIMAL_STR_MAX(int)];
2245         int ifi;
2246
2247         ifi = (int) if_nametoindex(name);
2248         if (ifi <= 0)
2249                 return log_error_errno(errno, "Failed to resolve interface %s: %m", name);
2250
2251         sprintf(ifi_str, "n%i", ifi);
2252         d = udev_device_new_from_device_id(udev, ifi_str);
2253         if (!d)
2254                 return log_error_errno(errno, "Failed to get udev device for interface %s: %m", name);
2255
2256         if (udev_device_get_is_initialized(d) <= 0) {
2257                 log_error("Network interface %s is not initialized yet.", name);
2258                 return -EBUSY;
2259         }
2260
2261         return ifi;
2262 }
2263
2264 static int move_network_interfaces(pid_t pid) {
2265         _cleanup_udev_unref_ struct udev *udev = NULL;
2266         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
2267         char **i;
2268         int r;
2269
2270         if (!arg_private_network)
2271                 return 0;
2272
2273         if (strv_isempty(arg_network_interfaces))
2274                 return 0;
2275
2276         r = sd_rtnl_open(&rtnl, 0);
2277         if (r < 0)
2278                 return log_error_errno(r, "Failed to connect to netlink: %m");
2279
2280         udev = udev_new();
2281         if (!udev) {
2282                 log_error("Failed to connect to udev.");
2283                 return -ENOMEM;
2284         }
2285
2286         STRV_FOREACH(i, arg_network_interfaces) {
2287                 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
2288                 int ifi;
2289
2290                 ifi = parse_interface(udev, *i);
2291                 if (ifi < 0)
2292                         return ifi;
2293
2294                 r = sd_rtnl_message_new_link(rtnl, &m, RTM_SETLINK, ifi);
2295                 if (r < 0)
2296                         return log_error_errno(r, "Failed to allocate netlink message: %m");
2297
2298                 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
2299                 if (r < 0)
2300                         return log_error_errno(r, "Failed to append namespace PID to netlink message: %m");
2301
2302                 r = sd_rtnl_call(rtnl, m, 0, NULL);
2303                 if (r < 0)
2304                         return log_error_errno(r, "Failed to move interface %s to namespace: %m", *i);
2305         }
2306
2307         return 0;
2308 }
2309
2310 static int setup_macvlan(pid_t pid) {
2311         _cleanup_udev_unref_ struct udev *udev = NULL;
2312         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
2313         unsigned idx = 0;
2314         char **i;
2315         int r;
2316
2317         if (!arg_private_network)
2318                 return 0;
2319
2320         if (strv_isempty(arg_network_macvlan))
2321                 return 0;
2322
2323         r = sd_rtnl_open(&rtnl, 0);
2324         if (r < 0)
2325                 return log_error_errno(r, "Failed to connect to netlink: %m");
2326
2327         udev = udev_new();
2328         if (!udev) {
2329                 log_error("Failed to connect to udev.");
2330                 return -ENOMEM;
2331         }
2332
2333         STRV_FOREACH(i, arg_network_macvlan) {
2334                 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
2335                 _cleanup_free_ char *n = NULL;
2336                 struct ether_addr mac;
2337                 int ifi;
2338
2339                 ifi = parse_interface(udev, *i);
2340                 if (ifi < 0)
2341                         return ifi;
2342
2343                 r = generate_mac(&mac, MACVLAN_HASH_KEY, idx++);
2344                 if (r < 0)
2345                         return log_error_errno(r, "Failed to create MACVLAN MAC address: %m");
2346
2347                 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
2348                 if (r < 0)
2349                         return log_error_errno(r, "Failed to allocate netlink message: %m");
2350
2351                 r = sd_rtnl_message_append_u32(m, IFLA_LINK, ifi);
2352                 if (r < 0)
2353                         return log_error_errno(r, "Failed to add netlink interface index: %m");
2354
2355                 n = strappend("mv-", *i);
2356                 if (!n)
2357                         return log_oom();
2358
2359                 strshorten(n, IFNAMSIZ-1);
2360
2361                 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, n);
2362                 if (r < 0)
2363                         return log_error_errno(r, "Failed to add netlink interface name: %m");
2364
2365                 r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac);
2366                 if (r < 0)
2367                         return log_error_errno(r, "Failed to add netlink MAC address: %m");
2368
2369                 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
2370                 if (r < 0)
2371                         return log_error_errno(r, "Failed to add netlink namespace field: %m");
2372
2373                 r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
2374                 if (r < 0)
2375                         return log_error_errno(r, "Failed to open netlink container: %m");
2376
2377                 r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "macvlan");
2378                 if (r < 0)
2379                         return log_error_errno(r, "Failed to open netlink container: %m");
2380
2381                 r = sd_rtnl_message_append_u32(m, IFLA_MACVLAN_MODE, MACVLAN_MODE_BRIDGE);
2382                 if (r < 0)
2383                         return log_error_errno(r, "Failed to append macvlan mode: %m");
2384
2385                 r = sd_rtnl_message_close_container(m);
2386                 if (r < 0)
2387                         return log_error_errno(r, "Failed to close netlink container: %m");
2388
2389                 r = sd_rtnl_message_close_container(m);
2390                 if (r < 0)
2391                         return log_error_errno(r, "Failed to close netlink container: %m");
2392
2393                 r = sd_rtnl_call(rtnl, m, 0, NULL);
2394                 if (r < 0)
2395                         return log_error_errno(r, "Failed to add new macvlan interfaces: %m");
2396         }
2397
2398         return 0;
2399 }
2400
2401 static int setup_ipvlan(pid_t pid) {
2402         _cleanup_udev_unref_ struct udev *udev = NULL;
2403         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
2404         char **i;
2405         int r;
2406
2407         if (!arg_private_network)
2408                 return 0;
2409
2410         if (strv_isempty(arg_network_ipvlan))
2411                 return 0;
2412
2413         r = sd_rtnl_open(&rtnl, 0);
2414         if (r < 0)
2415                 return log_error_errno(r, "Failed to connect to netlink: %m");
2416
2417         udev = udev_new();
2418         if (!udev) {
2419                 log_error("Failed to connect to udev.");
2420                 return -ENOMEM;
2421         }
2422
2423         STRV_FOREACH(i, arg_network_ipvlan) {
2424                 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
2425                 _cleanup_free_ char *n = NULL;
2426                 int ifi;
2427
2428                 ifi = parse_interface(udev, *i);
2429                 if (ifi < 0)
2430                         return ifi;
2431
2432                 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
2433                 if (r < 0)
2434                         return log_error_errno(r, "Failed to allocate netlink message: %m");
2435
2436                 r = sd_rtnl_message_append_u32(m, IFLA_LINK, ifi);
2437                 if (r < 0)
2438                         return log_error_errno(r, "Failed to add netlink interface index: %m");
2439
2440                 n = strappend("iv-", *i);
2441                 if (!n)
2442                         return log_oom();
2443
2444                 strshorten(n, IFNAMSIZ-1);
2445
2446                 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, n);
2447                 if (r < 0)
2448                         return log_error_errno(r, "Failed to add netlink interface name: %m");
2449
2450                 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
2451                 if (r < 0)
2452                         return log_error_errno(r, "Failed to add netlink namespace field: %m");
2453
2454                 r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
2455                 if (r < 0)
2456                         return log_error_errno(r, "Failed to open netlink container: %m");
2457
2458                 r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "ipvlan");
2459                 if (r < 0)
2460                         return log_error_errno(r, "Failed to open netlink container: %m");
2461
2462                 r = sd_rtnl_message_append_u16(m, IFLA_IPVLAN_MODE, IPVLAN_MODE_L2);
2463                 if (r < 0)
2464                         return log_error_errno(r, "Failed to add ipvlan mode: %m");
2465
2466                 r = sd_rtnl_message_close_container(m);
2467                 if (r < 0)
2468                         return log_error_errno(r, "Failed to close netlink container: %m");
2469
2470                 r = sd_rtnl_message_close_container(m);
2471                 if (r < 0)
2472                         return log_error_errno(r, "Failed to close netlink container: %m");
2473
2474                 r = sd_rtnl_call(rtnl, m, 0, NULL);
2475                 if (r < 0)
2476                         return log_error_errno(r, "Failed to add new ipvlan interfaces: %m");
2477         }
2478
2479         return 0;
2480 }
2481
#ifdef HAVE_SECCOMP
/* Adds a "fail with EPERM" rule to "ctx" for each of the "n" syscall
 * numbers in "syscalls". A syscall for which seccomp_rule_add() reports
 * -EFAULT is skipped. Returns 0 on success; on any other failure the error
 * is logged and the negative value is returned. */
static int seccomp_deny_syscalls(scmp_filter_ctx ctx, const int *syscalls, unsigned n) {
        unsigned k;

        for (k = 0; k < n; k++) {
                int q;

                q = seccomp_rule_add(ctx, SCMP_ACT_ERRNO(EPERM), syscalls[k], 0);
                if (q == -EFAULT)
                        continue; /* unknown syscall */
                if (q < 0)
                        return log_error_errno(q, "Failed to block syscall: %m");
        }

        return 0;
}
#endif

/* Builds and loads the seccomp filter: everything is allowed by default,
 * a fixed set of syscalls is answered with EPERM, the kernel module
 * syscalls are answered with EPERM too unless CAP_SYS_MODULE is part of
 * arg_retain, and socket(AF_NETLINK, *, NETLINK_AUDIT) is answered with
 * EAFNOSUPPORT.
 *
 * Without HAVE_SECCOMP this is a no-op returning 0. Otherwise it returns
 * the (non-negative) result of seccomp_load() on success or a negative
 * value on failure. */
static int setup_seccomp(void) {

#ifdef HAVE_SECCOMP
        static const int always_denied[] = {
                SCMP_SYS(kexec_load),
                SCMP_SYS(open_by_handle_at),
                SCMP_SYS(iopl),
                SCMP_SYS(ioperm),
                SCMP_SYS(swapon),
                SCMP_SYS(swapoff),
        };

        static const int kmod_denied[] = {
                SCMP_SYS(init_module),
                SCMP_SYS(finit_module),
                SCMP_SYS(delete_module),
        };

        scmp_filter_ctx ctx;
        int r;

        ctx = seccomp_init(SCMP_ACT_ALLOW);
        if (!ctx)
                return log_oom();

        r = seccomp_add_secondary_archs(ctx);
        if (r < 0) {
                log_error_errno(r, "Failed to add secondary archs to seccomp filter: %m");
                goto finish;
        }

        r = seccomp_deny_syscalls(ctx, always_denied, ELEMENTSOF(always_denied));
        if (r < 0)
                goto finish;

        /* The kmod syscalls are only left alone when the
         * CAP_SYS_MODULE capability was requested */
        if ((arg_retain & (1ULL << CAP_SYS_MODULE)) == 0) {
                r = seccomp_deny_syscalls(ctx, kmod_denied, ELEMENTSOF(kmod_denied));
                if (r < 0)
                        goto finish;
        }

        /*
           Audit is broken in containers, much of the userspace audit
           hookup will fail if running inside a container. We don't
           care and just turn off creation of audit sockets.

           This will make socket(AF_NETLINK, *, NETLINK_AUDIT) fail
           with EAFNOSUPPORT which audit userspace uses as indication
           that audit is disabled in the kernel.
         */

        r = seccomp_rule_add(
                        ctx,
                        SCMP_ACT_ERRNO(EAFNOSUPPORT),
                        SCMP_SYS(socket),
                        2,
                        SCMP_A0(SCMP_CMP_EQ, AF_NETLINK),
                        SCMP_A2(SCMP_CMP_EQ, NETLINK_AUDIT));
        if (r < 0) {
                log_error_errno(r, "Failed to add audit seccomp rule: %m");
                goto finish;
        }

        /* Clear the filter's NNP attribute before loading it */
        r = seccomp_attr_set(ctx, SCMP_FLTATR_CTL_NNP, 0);
        if (r < 0) {
                log_error_errno(r, "Failed to unset NO_NEW_PRIVS: %m");
                goto finish;
        }

        r = seccomp_load(ctx);
        if (r < 0)
                log_error_errno(r, "Failed to install seccomp audit filter: %m");

finish:
        /* The filter context is released on every path, success or not */
        seccomp_release(ctx);
        return r;
#else
        return 0;
#endif

}
2578
2579 static int setup_propagate(const char *root) {
2580         const char *p, *q;
2581
2582         (void) mkdir_p("/run/systemd/nspawn/", 0755);
2583         (void) mkdir_p("/run/systemd/nspawn/propagate", 0600);
2584         p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
2585         (void) mkdir_p(p, 0600);
2586
2587         q = strjoina(root, "/run/systemd/nspawn/incoming");
2588         mkdir_parents(q, 0755);
2589         mkdir_p(q, 0600);
2590
2591         if (mount(p, q, NULL, MS_BIND, NULL) < 0)
2592                 return log_error_errno(errno, "Failed to install propagation bind mount.");
2593
2594         if (mount(NULL, q, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY, NULL) < 0)
2595                 return log_error_errno(errno, "Failed to make propagation mount read-only");
2596
2597         return 0;
2598 }
2599
2600 static int setup_image(char **device_path, int *loop_nr) {
2601         struct loop_info64 info = {
2602                 .lo_flags = LO_FLAGS_AUTOCLEAR|LO_FLAGS_PARTSCAN
2603         };
2604         _cleanup_close_ int fd = -1, control = -1, loop = -1;
2605         _cleanup_free_ char* loopdev = NULL;
2606         struct stat st;
2607         int r, nr;
2608
2609         assert(device_path);
2610         assert(loop_nr);
2611         assert(arg_image);
2612
2613         fd = open(arg_image, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
2614         if (fd < 0)
2615                 return log_error_errno(errno, "Failed to open %s: %m", arg_image);
2616
2617         if (fstat(fd, &st) < 0)
2618                 return log_error_errno(errno, "Failed to stat %s: %m", arg_image);
2619
2620         if (S_ISBLK(st.st_mode)) {
2621                 char *p;
2622
2623                 p = strdup(arg_image);
2624                 if (!p)
2625                         return log_oom();
2626
2627                 *device_path = p;
2628
2629                 *loop_nr = -1;
2630
2631                 r = fd;
2632                 fd = -1;
2633
2634                 return r;
2635         }
2636
2637         if (!S_ISREG(st.st_mode)) {
2638                 log_error_errno(errno, "%s is not a regular file or block device: %m", arg_image);
2639                 return -EINVAL;
2640         }
2641
2642         control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
2643         if (control < 0)
2644                 return log_error_errno(errno, "Failed to open /dev/loop-control: %m");
2645
2646         nr = ioctl(control, LOOP_CTL_GET_FREE);
2647         if (nr < 0)
2648                 return log_error_errno(errno, "Failed to allocate loop device: %m");
2649
2650         if (asprintf(&loopdev, "/dev/loop%i", nr) < 0)
2651                 return log_oom();
2652
2653         loop = open(loopdev, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
2654         if (loop < 0)
2655                 return log_error_errno(errno, "Failed to open loop device %s: %m", loopdev);
2656
2657         if (ioctl(loop, LOOP_SET_FD, fd) < 0)
2658                 return log_error_errno(errno, "Failed to set loopback file descriptor on %s: %m", loopdev);
2659
2660         if (arg_read_only)
2661                 info.lo_flags |= LO_FLAGS_READ_ONLY;
2662
2663         if (ioctl(loop, LOOP_SET_STATUS64, &info) < 0)
2664                 return log_error_errno(errno, "Failed to set loopback settings on %s: %m", loopdev);
2665
2666         *device_path = loopdev;
2667         loopdev = NULL;
2668
2669         *loop_nr = nr;
2670
2671         r = loop;
2672         loop = -1;
2673
2674         return r;
2675 }
2676
/* Explanatory text appended to the error messages emitted when a disk
 * image's partition table cannot be used. */
#define PARTITION_TABLE_BLURB \
        "Note that the disk image needs to either contain only a single MBR partition of\n" \
        "type 0x83 that is marked bootable, or a single GPT partition of type " \
        "0FC63DAF-8483-4772-8E79-3D69D8477DE4 or follow\n" \
        "    http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/\n" \
        "to be bootable with systemd-nspawn."
2683
/* Inspects the partition table of the block device open at "fd" and picks
 * the partitions to use as root, /home and /srv.
 *
 * On GPT, partitions are selected by type UUID (GPT_HOME, GPT_SRV,
 * GPT_ROOT_NATIVE, GPT_ROOT_SECONDARY, GPT_LINUX_GENERIC); partitions with
 * GPT_FLAG_NO_AUTO are ignored and, among several candidates of the same
 * type, the one with the lowest partition number wins. On MBR, only
 * partitions of type 0x83 whose flags equal 0x80 (bootable) are considered,
 * and they are treated as "generic" candidates.
 *
 * The root device is chosen in this order: native root, secondary root,
 * the single generic partition. More than one generic candidate (with no
 * dedicated root partition) is an error.
 *
 * On success 0 is returned, *root_device/*root_device_rw/*secondary are
 * always filled in, and *home_device/*home_device_rw respectively
 * *srv_device/*srv_device_rw are filled in only if such a partition was
 * found. The device strings are allocated and owned by the caller. On
 * failure a negative value is returned.
 *
 * Without HAVE_BLKID this always fails with -ENOTSUP. */
static int dissect_image(
                int fd,
                char **root_device, bool *root_device_rw,
                char **home_device, bool *home_device_rw,
                char **srv_device, bool *srv_device_rw,
                bool *secondary) {

#ifdef HAVE_BLKID
        int home_nr = -1, srv_nr = -1;
#ifdef GPT_ROOT_NATIVE
        int root_nr = -1;
#endif
#ifdef GPT_ROOT_SECONDARY
        int secondary_root_nr = -1;
#endif
        _cleanup_free_ char *home = NULL, *root = NULL, *secondary_root = NULL, *srv = NULL, *generic = NULL;
        _cleanup_udev_enumerate_unref_ struct udev_enumerate *e = NULL;
        _cleanup_udev_device_unref_ struct udev_device *d = NULL;
        _cleanup_blkid_free_probe_ blkid_probe b = NULL;
        _cleanup_udev_unref_ struct udev *udev = NULL;
        struct udev_list_entry *first, *item;
        bool home_rw = true, root_rw = true, secondary_root_rw = true, srv_rw = true, generic_rw = true;
        bool is_gpt, is_mbr, multiple_generic = false;
        const char *pttype = NULL;
        blkid_partlist pl;
        struct stat st;
        unsigned i;
        int r;

        assert(fd >= 0);
        assert(root_device);
        assert(home_device);
        assert(srv_device);
        assert(secondary);
        assert(arg_image);

        b = blkid_new_probe();
        if (!b)
                return log_oom();

        /* errno is cleared first so that a failure which leaves errno
         * untouched can be told apart and reported as OOM. */
        errno = 0;
        r = blkid_probe_set_device(b, fd, 0, 0);
        if (r != 0) {
                if (errno == 0)
                        return log_oom();

                log_error_errno(errno, "Failed to set device on blkid probe: %m");
                return -errno;
        }

        blkid_probe_enable_partitions(b, 1);
        blkid_probe_set_partitions_flags(b, BLKID_PARTS_ENTRY_DETAILS);

        errno = 0;
        r = blkid_do_safeprobe(b);
        if (r == -2 || r == 1) {
                /* Ambivalent result or nothing detected */
                log_error("Failed to identify any partition table on\n"
                          "    %s\n"
                          PARTITION_TABLE_BLURB, arg_image);
                return -EINVAL;
        } else if (r != 0) {
                if (errno == 0)
                        errno = EIO;
                log_error_errno(errno, "Failed to probe: %m");
                return -errno;
        }

        blkid_probe_lookup_value(b, "PTTYPE", &pttype, NULL);

        is_gpt = streq_ptr(pttype, "gpt");
        is_mbr = streq_ptr(pttype, "dos");

        if (!is_gpt && !is_mbr) {
                log_error("No GPT or MBR partition table discovered on\n"
                          "    %s\n"
                          PARTITION_TABLE_BLURB, arg_image);
                return -EINVAL;
        }

        errno = 0;
        pl = blkid_probe_get_partitions(b);
        if (!pl) {
                if (errno == 0)
                        return log_oom();

                log_error("Failed to list partitions of %s", arg_image);
                return -errno;
        }

        udev = udev_new();
        if (!udev)
                return log_oom();

        if (fstat(fd, &st) < 0)
                return log_error_errno(errno, "Failed to stat block device: %m");

        d = udev_device_new_from_devnum(udev, 'b', st.st_rdev);
        if (!d)
                return log_oom();

        /* Wait until udev lists as many devices below the whole-disk device
         * as blkid found partitions (plus one for the disk itself). Gives
         * up after 10 rounds. */
        for (i = 0;; i++) {
                int n, m;

                if (i >= 10) {
                        log_error("Kernel partitions never appeared.");
                        return -ENXIO;
                }

                e = udev_enumerate_new(udev);
                if (!e)
                        return log_oom();

                r = udev_enumerate_add_match_parent(e, d);
                if (r < 0)
                        return log_oom();

                r = udev_enumerate_scan_devices(e);
                if (r < 0)
                        return log_error_errno(r, "Failed to scan for partition devices of %s: %m", arg_image);

                /* Count the partitions enumerated by the kernel */
                n = 0;
                first = udev_enumerate_get_list_entry(e);
                udev_list_entry_foreach(item, first)
                        n++;

                /* Count the partitions enumerated by blkid */
                m = blkid_partlist_numof_partitions(pl);
                if (n == m + 1)
                        break;
                if (n > m + 1) {
                        log_error("blkid and kernel partition list do not match.");
                        return -EIO;
                }
                if (n < m + 1) {
                        unsigned j;

                        /* The kernel has probed fewer partitions than
                         * blkid? Maybe the kernel prober is still
                         * running or it got EBUSY because udev
                         * already opened the device. Let's reprobe
                         * the device, which is a synchronous call
                         * that waits until probing is complete. */

                        for (j = 0; j < 20; j++) {

                                r = ioctl(fd, BLKRRPART, 0);
                                if (r < 0)
                                        r = -errno;
                                if (r >= 0 || r != -EBUSY)
                                        break;

                                /* If something else has the device
                                 * open, such as an udev rule, the
                                 * ioctl will return EBUSY. Since
                                 * there's no way to wait until it
                                 * isn't busy anymore, let's just wait
                                 * a bit, and try again.
                                 *
                                 * This is really something they
                                 * should fix in the kernel! */

                                usleep(50 * USEC_PER_MSEC);
                        }

                        if (r < 0)
                                return log_error_errno(r, "Failed to reread partition table: %m");
                }

                /* Drop this enumeration; the next round starts afresh */
                e = udev_enumerate_unref(e);
        }

        first = udev_enumerate_get_list_entry(e);
        udev_list_entry_foreach(item, first) {
                _cleanup_udev_device_unref_ struct udev_device *q = NULL;
                const char *node;
                unsigned long long flags;
                blkid_partition pp;
                dev_t qn;
                int nr;

                errno = 0;
                q = udev_device_new_from_syspath(udev, udev_list_entry_get_name(item));
                if (!q) {
                        if (!errno)
                                errno = ENOMEM;

                        log_error_errno(errno, "Failed to get partition device of %s: %m", arg_image);
                        return -errno;
                }

                qn = udev_device_get_devnum(q);
                if (major(qn) == 0)
                        continue;

                /* Skip the whole-disk device itself */
                if (st.st_rdev == qn)
                        continue;

                node = udev_device_get_devnode(q);
                if (!node)
                        continue;

                pp = blkid_partlist_devno_to_partition(pl, qn);
                if (!pp)
                        continue;

                flags = blkid_partition_get_flags(pp);

                nr = blkid_partition_get_partno(pp);
                if (nr < 0)
                        continue;

                if (is_gpt) {
                        sd_id128_t type_id;
                        const char *stype;

                        if (flags & GPT_FLAG_NO_AUTO)
                                continue;

                        stype = blkid_partition_get_type_string(pp);
                        if (!stype)
                                continue;

                        if (sd_id128_from_string(stype, &type_id) < 0)
                                continue;

                        if (sd_id128_equal(type_id, GPT_HOME)) {

                                /* Keep the candidate with the lowest
                                 * partition number */
                                if (home && nr >= home_nr)
                                        continue;

                                home_nr = nr;
                                home_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&home, node);
                                if (r < 0)
                                        return log_oom();

                        } else if (sd_id128_equal(type_id, GPT_SRV)) {

                                if (srv && nr >= srv_nr)
                                        continue;

                                srv_nr = nr;
                                srv_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&srv, node);
                                if (r < 0)
                                        return log_oom();
                        }
#ifdef GPT_ROOT_NATIVE
                        else if (sd_id128_equal(type_id, GPT_ROOT_NATIVE)) {

                                if (root && nr >= root_nr)
                                        continue;

                                root_nr = nr;
                                root_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&root, node);
                                if (r < 0)
                                        return log_oom();
                        }
#endif
#ifdef GPT_ROOT_SECONDARY
                        else if (sd_id128_equal(type_id, GPT_ROOT_SECONDARY)) {

                                if (secondary_root && nr >= secondary_root_nr)
                                        continue;

                                secondary_root_nr = nr;
                                secondary_root_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&secondary_root, node);
                                if (r < 0)
                                        return log_oom();
                        }
#endif
                        else if (sd_id128_equal(type_id, GPT_LINUX_GENERIC)) {

                                if (generic)
                                        multiple_generic = true;
                                else {
                                        generic_rw = !(flags & GPT_FLAG_READ_ONLY);

                                        r = free_and_strdup(&generic, node);
                                        if (r < 0)
                                                return log_oom();
                                }
                        }

                } else if (is_mbr) {
                        int type;

                        if (flags != 0x80) /* Bootable flag */
                                continue;

                        type = blkid_partition_get_type(pp);
                        if (type != 0x83) /* Linux partition */
                                continue;

                        if (generic)
                                multiple_generic = true;
                        else {
                                generic_rw = true;

                                /* Record the candidate as "generic", not
                                 * as "root": only then can a second
                                 * matching partition be noticed above and
                                 * rejected below, instead of silently
                                 * replacing the first one. */
                                r = free_and_strdup(&generic, node);
                                if (r < 0)
                                        return log_oom();
                        }
                }
        }

        if (root) {
                *root_device = root;
                root = NULL;

                *root_device_rw = root_rw;
                *secondary = false;
        } else if (secondary_root) {
                *root_device = secondary_root;
                secondary_root = NULL;

                *root_device_rw = secondary_root_rw;
                *secondary = true;
        } else if (generic) {

                /* There were no partitions with precise meanings
                 * around, but we found generic partitions. In this
                 * case, if there's only one, we can go ahead and boot
                 * it, otherwise we bail out, because we really cannot
                 * make any sense of it. */

                if (multiple_generic) {
                        log_error("Identified multiple bootable Linux partitions on\n"
                                  "    %s\n"
                                  PARTITION_TABLE_BLURB, arg_image);
                        return -EINVAL;
                }

                *root_device = generic;
                generic = NULL;

                *root_device_rw = generic_rw;
                *secondary = false;
        } else {
                log_error("Failed to identify root partition in disk image\n"
                          "    %s\n"
                          PARTITION_TABLE_BLURB, arg_image);
                return -EINVAL;
        }

        if (home) {
                *home_device = home;
                home = NULL;

                *home_device_rw = home_rw;
        }

        if (srv) {
                *srv_device = srv;
                srv = NULL;

                *srv_device_rw = srv_rw;
        }

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -ENOTSUP;
#endif
}
3056
/* Probes the file system type of the block device "what" and mounts it
 * with MS_NODEV on "where", or on "where" with "directory" appended when
 * "directory" is non-NULL.
 *
 * The mount is read-only if "rw" is false or if arg_read_only is set.
 * Devices whose detected type is "crypto_LUKS" are refused with -ENOTSUP.
 *
 * Returns 0 on success and a negative value on failure. Without
 * HAVE_BLKID this always fails with -ENOTSUP. */
static int mount_device(const char *what, const char *where, const char *directory, bool rw) {
#ifdef HAVE_BLKID
        _cleanup_blkid_free_probe_ blkid_probe b = NULL;
        const char *fstype, *p;
        int r;

        assert(what);
        assert(where);

        if (arg_read_only)
                rw = false;

        if (directory)
                p = strjoina(where, directory);
        else
                p = where;

        /* errno is cleared first so that a failure which leaves errno
         * untouched can be told apart and reported as OOM. */
        errno = 0;
        b = blkid_new_probe_from_filename(what);
        if (!b) {
                if (errno == 0)
                        return log_oom();
                log_error_errno(errno, "Failed to allocate prober for %s: %m", what);
                return -errno;
        }

        blkid_probe_enable_superblocks(b, 1);
        blkid_probe_set_superblocks_flags(b, BLKID_SUBLKS_TYPE);

        errno = 0;
        r = blkid_do_safeprobe(b);
        if (r == -2 || r == 1) {
                /* -2: ambivalent result, 1: nothing detected. A return of
                 * -1 is a genuine probing error and is handled below. */
                log_error("Cannot determine file system type of %s", what);
                return -EINVAL;
        } else if (r != 0) {
                if (errno == 0)
                        errno = EIO;
                log_error_errno(errno, "Failed to probe %s: %m", what);
                return -errno;
        }

        errno = 0;
        if (blkid_probe_lookup_value(b, "TYPE", &fstype, NULL) < 0) {
                if (errno == 0)
                        errno = EINVAL;
                log_error("Failed to determine file system type of %s", what);
                return -errno;
        }

        if (streq(fstype, "crypto_LUKS")) {
                log_error("nspawn currently does not support LUKS disk images.");
                return -ENOTSUP;
        }

        if (mount(what, p, fstype, MS_NODEV|(rw ? 0 : MS_RDONLY), NULL) < 0)
                return log_error_errno(errno, "Failed to mount %s: %m", what);

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -ENOTSUP;
#endif
}
3120
/* Mounts the partitions found in a disk image below the container root.
 *
 * where:       directory the root partition is mounted on; the home and
 *              server data partitions are mounted at "/home" and "/srv"
 *              relative to it.
 * *_device:    device node to mount, or NULL to skip that partition.
 * *_device_rw: forwarded to mount_device(); false requests a read-only
 *              mount.
 *
 * Returns 0 on success. On the first failing mount the error is logged
 * and a negative errno-style value is returned; later partitions are not
 * attempted. */
static int mount_devices(
                const char *where,
                const char *root_device, bool root_device_rw,
                const char *home_device, bool home_device_rw,
                const char *srv_device, bool srv_device_rw) {
        int r;

        assert(where);

        /* Honour the caller-supplied target directory rather than
         * reaching for the arg_directory global. */

        if (root_device) {
                r = mount_device(root_device, where, NULL, root_device_rw);
                if (r < 0)
                        return log_error_errno(r, "Failed to mount root directory: %m");
        }

        if (home_device) {
                r = mount_device(home_device, where, "/home", home_device_rw);
                if (r < 0)
                        return log_error_errno(r, "Failed to mount home directory: %m");
        }

        if (srv_device) {
                r = mount_device(srv_device, where, "/srv", srv_device_rw);
                if (r < 0)
                        return log_error_errno(r, "Failed to mount server data directory: %m");
        }

        return 0;
}
3150
3151 static void loop_remove(int nr, int *image_fd) {
3152         _cleanup_close_ int control = -1;
3153         int r;
3154
3155         if (nr < 0)
3156                 return;
3157
3158         if (image_fd && *image_fd >= 0) {
3159                 r = ioctl(*image_fd, LOOP_CLR_FD);
3160                 if (r < 0)
3161                         log_debug_errno(errno, "Failed to close loop image: %m");
3162                 *image_fd = safe_close(*image_fd);
3163         }
3164
3165         control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
3166         if (control < 0) {
3167                 log_warning_errno(errno, "Failed to open /dev/loop-control: %m");
3168                 return;
3169         }
3170
3171         r = ioctl(control, LOOP_CTL_REMOVE, nr);
3172         if (r < 0)
3173                 log_debug_errno(errno, "Failed to remove loop %d: %m", nr);
3174 }
3175
3176 static int spawn_getent(const char *database, const char *key, pid_t *rpid) {
3177         int pipe_fds[2];
3178         pid_t pid;
3179
3180         assert(database);
3181         assert(key);
3182         assert(rpid);
3183
3184         if (pipe2(pipe_fds, O_CLOEXEC) < 0)
3185                 return log_error_errno(errno, "Failed to allocate pipe: %m");
3186
3187         pid = fork();
3188         if (pid < 0)
3189                 return log_error_errno(errno, "Failed to fork getent child: %m");
3190         else if (pid == 0) {
3191                 int nullfd;
3192                 char *empty_env = NULL;
3193
3194                 if (dup3(pipe_fds[1], STDOUT_FILENO, 0) < 0)
3195                         _exit(EXIT_FAILURE);
3196
3197                 if (pipe_fds[0] > 2)
3198                         safe_close(pipe_fds[0]);
3199                 if (pipe_fds[1] > 2)
3200                         safe_close(pipe_fds[1]);
3201
3202                 nullfd = open("/dev/null", O_RDWR);
3203                 if (nullfd < 0)
3204                         _exit(EXIT_FAILURE);
3205
3206                 if (dup3(nullfd, STDIN_FILENO, 0) < 0)
3207                         _exit(EXIT_FAILURE);
3208
3209                 if (dup3(nullfd, STDERR_FILENO, 0) < 0)
3210                         _exit(EXIT_FAILURE);
3211
3212                 if (nullfd > 2)
3213                         safe_close(nullfd);
3214
3215                 reset_all_signal_handlers();
3216                 close_all_fds(NULL, 0);
3217
3218                 execle("/usr/bin/getent", "getent", database, key, NULL, &empty_env);
3219                 execle("/bin/getent", "getent", database, key, NULL, &empty_env);
3220                 _exit(EXIT_FAILURE);
3221         }
3222
3223         pipe_fds[1] = safe_close(pipe_fds[1]);
3224
3225         *rpid = pid;
3226
3227         return pipe_fds[0];
3228 }
3229
3230 static int change_uid_gid(char **_home) {
3231         char line[LINE_MAX], *x, *u, *g, *h;
3232         const char *word, *state;
3233         _cleanup_free_ uid_t *uids = NULL;
3234         _cleanup_free_ char *home = NULL;
3235         _cleanup_fclose_ FILE *f = NULL;
3236         _cleanup_close_ int fd = -1;
3237         unsigned n_uids = 0;
3238         size_t sz = 0, l;
3239         uid_t uid;
3240         gid_t gid;
3241         pid_t pid;
3242         int r;
3243
3244         assert(_home);
3245
3246         if (!arg_user || streq(arg_user, "root") || streq(arg_user, "0")) {
3247                 /* Reset everything fully to 0, just in case */
3248
3249                 if (setgroups(0, NULL) < 0)
3250                         return log_error_errno(errno, "setgroups() failed: %m");
3251
3252                 if (setresgid(0, 0, 0) < 0)
3253                         return log_error_errno(errno, "setregid() failed: %m");
3254
3255                 if (setresuid(0, 0, 0) < 0)
3256                         return log_error_errno(errno, "setreuid() failed: %m");
3257
3258                 *_home = NULL;
3259                 return 0;
3260         }
3261
3262         /* First, get user credentials */
3263         fd = spawn_getent("passwd", arg_user, &pid);
3264         if (fd < 0)
3265                 return fd;
3266
3267         f = fdopen(fd, "r");
3268         if (!f)
3269                 return log_oom();
3270         fd = -1;
3271
3272         if (!fgets(line, sizeof(line), f)) {
3273
3274                 if (!ferror(f)) {
3275                         log_error("Failed to resolve user %s.", arg_user);
3276                         return -ESRCH;
3277                 }
3278
3279                 log_error_errno(errno, "Failed to read from getent: %m");
3280                 return -errno;
3281         }
3282
3283         truncate_nl(line);
3284
3285         wait_for_terminate_and_warn("getent passwd", pid, true);
3286
3287         x = strchr(line, ':');
3288         if (!x) {
3289                 log_error("/etc/passwd entry has invalid user field.");
3290                 return -EIO;
3291         }
3292
3293         u = strchr(x+1, ':');
3294         if (!u) {
3295                 log_error("/etc/passwd entry has invalid password field.");
3296                 return -EIO;
3297         }
3298
3299         u++;
3300         g = strchr(u, ':');
3301         if (!g) {
3302                 log_error("/etc/passwd entry has invalid UID field.");
3303                 return -EIO;
3304         }
3305
3306         *g = 0;
3307         g++;
3308         x = strchr(g, ':');
3309         if (!x) {
3310                 log_error("/etc/passwd entry has invalid GID field.");
3311                 return -EIO;
3312         }
3313
3314         *x = 0;
3315         h = strchr(x+1, ':');
3316         if (!h) {
3317                 log_error("/etc/passwd entry has invalid GECOS field.");
3318                 return -EIO;
3319         }
3320
3321         h++;
3322         x = strchr(h, ':');
3323         if (!x) {
3324                 log_error("/etc/passwd entry has invalid home directory field.");
3325                 return -EIO;
3326         }
3327
3328         *x = 0;
3329
3330         r = parse_uid(u, &uid);
3331         if (r < 0) {
3332                 log_error("Failed to parse UID of user.");
3333                 return -EIO;
3334         }
3335
3336         r = parse_gid(g, &gid);
3337         if (r < 0) {
3338                 log_error("Failed to parse GID of user.");
3339                 return -EIO;
3340         }
3341
3342         home = strdup(h);
3343         if (!home)
3344                 return log_oom();
3345
3346         /* Second, get group memberships */
3347         fd = spawn_getent("initgroups", arg_user, &pid);
3348         if (fd < 0)
3349                 return fd;
3350
3351         fclose(f);
3352         f = fdopen(fd, "r");
3353         if (!f)
3354                 return log_oom();
3355         fd = -1;
3356
3357         if (!fgets(line, sizeof(line), f)) {
3358                 if (!ferror(f)) {
3359                         log_error("Failed to resolve user %s.", arg_user);
3360                         return -ESRCH;
3361                 }
3362
3363                 log_error_errno(errno, "Failed to read from getent: %m");
3364                 return -errno;
3365         }
3366
3367         truncate_nl(line);
3368
3369         wait_for_terminate_and_warn("getent initgroups", pid, true);
3370
3371         /* Skip over the username and subsequent separator whitespace */
3372         x = line;
3373         x += strcspn(x, WHITESPACE);
3374         x += strspn(x, WHITESPACE);
3375
3376         FOREACH_WORD(word, l, x, state) {
3377                 char c[l+1];
3378
3379                 memcpy(c, word, l);
3380                 c[l] = 0;
3381
3382                 if (!GREEDY_REALLOC(uids, sz, n_uids+1))
3383                         return log_oom();
3384
3385                 r = parse_uid(c, &uids[n_uids++]);
3386                 if (r < 0) {
3387                         log_error("Failed to parse group data from getent.");
3388                         return -EIO;
3389                 }
3390         }
3391
3392         r = mkdir_parents(home, 0775);
3393         if (r < 0)
3394                 return log_error_errno(r, "Failed to make home root directory: %m");
3395
3396         r = mkdir_safe(home, 0755, uid, gid);
3397         if (r < 0 && r != -EEXIST)
3398                 return log_error_errno(r, "Failed to make home directory: %m");
3399
3400         fchown(STDIN_FILENO, uid, gid);
3401         fchown(STDOUT_FILENO, uid, gid);
3402         fchown(STDERR_FILENO, uid, gid);
3403
3404         if (setgroups(n_uids, uids) < 0)
3405                 return log_error_errno(errno, "Failed to set auxiliary groups: %m");
3406
3407         if (setresgid(gid, gid, gid) < 0)
3408                 return log_error_errno(errno, "setregid() failed: %m");
3409
3410         if (setresuid(uid, uid, uid) < 0)
3411                 return log_error_errno(errno, "setreuid() failed: %m");
3412
3413         if (_home) {
3414                 *_home = home;
3415                 home = NULL;
3416         }
3417
3418         return 0;
3419 }
3420
3421 /*
3422  * Return values:
3423  * < 0 : wait_for_terminate() failed to get the state of the
3424  *       container, the container was terminated by a signal, or
3425  *       failed for an unknown reason.  No change is made to the
3426  *       container argument.
3427  * > 0 : The program executed in the container terminated with an
3428  *       error.  The exit code of the program executed in the
3429  *       container is returned.  The container argument has been set
3430  *       to CONTAINER_TERMINATED.
3431  *   0 : The container is being rebooted, has been shut down or exited
3432  *       successfully.  The container argument has been set to either
3433  *       CONTAINER_TERMINATED or CONTAINER_REBOOTED.
3434  *
3435  * That is, success is indicated by a return value of zero, and an
3436  * error is indicated by a non-zero value.
3437  */
3438 static int wait_for_container(pid_t pid, ContainerStatus *container) {
3439         siginfo_t status;
3440         int r;
3441
3442         r = wait_for_terminate(pid, &status);
3443         if (r < 0)
3444                 return log_warning_errno(r, "Failed to wait for container: %m");
3445
3446         switch (status.si_code) {
3447
3448         case CLD_EXITED:
3449                 if (status.si_status == 0) {
3450                         log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s exited successfully.", arg_machine);
3451
3452                 } else
3453                         log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s failed with error code %i.", arg_machine, status.si_status);
3454
3455                 *container = CONTAINER_TERMINATED;
3456                 return status.si_status;
3457
3458         case CLD_KILLED:
3459                 if (status.si_status == SIGINT) {
3460
3461                         log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s has been shut down.", arg_machine);
3462                         *container = CONTAINER_TERMINATED;
3463                         return 0;
3464
3465                 } else if (status.si_status == SIGHUP) {
3466
3467                         log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s is being rebooted.", arg_machine);
3468                         *container = CONTAINER_REBOOTED;
3469                         return 0;
3470                 }
3471
3472                 /* CLD_KILLED fallthrough */
3473
3474         case CLD_DUMPED:
3475                 log_error("Container %s terminated by signal %s.", arg_machine, signal_to_string(status.si_status));
3476                 return -EIO;
3477
3478         default:
3479                 log_error("Container %s failed due to unknown reason.", arg_machine);
3480                 return -EIO;
3481         }
3482
3483         return r;
3484 }
3485
/* Signal handler that deliberately does nothing; main() installs it for
 * SIGCHLD. */
static void nop_handler(int sig) {
}
3487
3488 static int on_orderly_shutdown(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) {
3489         pid_t pid;
3490
3491         pid = PTR_TO_UINT32(userdata);
3492         if (pid > 0) {
3493                 if (kill(pid, SIGRTMIN+3) >= 0) {
3494                         log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
3495                         sd_event_source_set_userdata(s, NULL);
3496                         return 0;
3497                 }
3498         }
3499
3500         sd_event_exit(sd_event_source_get_event(s), 0);
3501         return 0;
3502 }
3503
3504 static int determine_names(void) {
3505         int r;
3506
3507         if (!arg_image && !arg_directory) {
3508                 if (arg_machine) {
3509                         _cleanup_(image_unrefp) Image *i = NULL;
3510
3511                         r = image_find(arg_machine, &i);
3512                         if (r < 0)
3513                                 return log_error_errno(r, "Failed to find image for machine '%s': %m", arg_machine);
3514                         else if (r == 0) {
3515                                 log_error("No image for machine '%s': %m", arg_machine);
3516                                 return -ENOENT;
3517                         }
3518
3519                         if (i->type == IMAGE_RAW)
3520                                 r = set_sanitized_path(&arg_image, i->path);
3521                         else
3522                                 r = set_sanitized_path(&arg_directory, i->path);
3523                         if (r < 0)
3524                                 return log_error_errno(r, "Invalid image directory: %m");
3525
3526                         arg_read_only = arg_read_only || i->read_only;
3527                 } else
3528                         arg_directory = get_current_dir_name();
3529
3530                 if (!arg_directory && !arg_machine) {
3531                         log_error("Failed to determine path, please use -D or -i.");
3532                         return -EINVAL;
3533                 }
3534         }
3535
3536         if (!arg_machine) {
3537                 if (arg_directory && path_equal(arg_directory, "/"))
3538                         arg_machine = gethostname_malloc();
3539                 else
3540                         arg_machine = strdup(basename(arg_image ?: arg_directory));
3541
3542                 if (!arg_machine)
3543                         return log_oom();
3544
3545                 hostname_cleanup(arg_machine, false);
3546                 if (!machine_name_is_valid(arg_machine)) {
3547                         log_error("Failed to determine machine name automatically, please use -M.");
3548                         return -EINVAL;
3549                 }
3550
3551                 if (arg_ephemeral) {
3552                         char *b;
3553
3554                         /* Add a random suffix when this is an
3555                          * ephemeral machine, so that we can run many
3556                          * instances at once without manually having
3557                          * to specify -M each time. */
3558
3559                         if (asprintf(&b, "%s-%016" PRIx64, arg_machine, random_u64()) < 0)
3560                                 return log_oom();
3561
3562                         free(arg_machine);
3563                         arg_machine = b;
3564                 }
3565         }
3566
3567         return 0;
3568 }
3569
3570 int main(int argc, char *argv[]) {
3571
3572         _cleanup_free_ char *device_path = NULL, *root_device = NULL, *home_device = NULL, *srv_device = NULL, *console = NULL;
3573         bool root_device_rw = true, home_device_rw = true, srv_device_rw = true;
3574         _cleanup_close_ int master = -1, image_fd = -1;
3575         _cleanup_fdset_free_ FDSet *fds = NULL;
3576         int r, n_fd_passed, loop_nr = -1;
3577         char veth_name[IFNAMSIZ];
3578         bool secondary = false, remove_subvol = false;
3579         sigset_t mask, mask_chld;
3580         pid_t pid = 0;
3581         int ret = EXIT_SUCCESS;
3582         union in_addr_union exposed = {};
3583         _cleanup_release_lock_file_ LockFile tree_global_lock = LOCK_FILE_INIT, tree_local_lock = LOCK_FILE_INIT;
3584
3585         log_parse_environment();
3586         log_open();
3587
3588         r = parse_argv(argc, argv);
3589         if (r <= 0)
3590                 goto finish;
3591
3592         r = determine_names();
3593         if (r < 0)
3594                 goto finish;
3595
3596         if (geteuid() != 0) {
3597                 log_error("Need to be root.");
3598                 r = -EPERM;
3599                 goto finish;
3600         }
3601
3602         if (sd_booted() <= 0) {
3603                 log_error("Not running on a systemd system.");
3604                 r = -EINVAL;
3605                 goto finish;
3606         }
3607
3608         log_close();
3609         n_fd_passed = sd_listen_fds(false);
3610         if (n_fd_passed > 0) {
3611                 r = fdset_new_listen_fds(&fds, false);
3612                 if (r < 0) {
3613                         log_error_errno(r, "Failed to collect file descriptors: %m");
3614                         goto finish;
3615                 }
3616         }
3617         fdset_close_others(fds);
3618         log_open();
3619
3620         if (arg_directory) {
3621                 assert(!arg_image);
3622
3623                 if (path_equal(arg_directory, "/") && !arg_ephemeral) {
3624                         log_error("Spawning container on root directory is not supported. Consider using --ephemeral.");
3625                         r = -EINVAL;
3626                         goto finish;
3627                 }
3628
3629                 if (arg_ephemeral) {
3630                         char *np;
3631
3632                         /* If the specified path is a mount point we
3633                          * generate the new snapshot immediately
3634                          * inside it under a random name. However if
3635                          * the specified is not a mount point we
3636                          * create the new snapshot in the parent
3637                          * directory, just next to it. */
3638                         r = path_is_mount_point(arg_directory, false);
3639                         if (r < 0) {
3640                                 log_error_errno(r, "Failed to determine whether directory %s is mount point: %m", arg_directory);
3641                                 goto finish;
3642                         }
3643                         if (r > 0)
3644                                 r = tempfn_random_child(arg_directory, &np);
3645                         else
3646                                 r = tempfn_random(arg_directory, &np);
3647                         if (r < 0) {
3648                                 log_error_errno(r, "Failed to generate name for snapshot: %m");
3649                                 goto finish;
3650                         }
3651
3652                         r = image_path_lock(np, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3653                         if (r < 0) {
3654                                 log_error_errno(r, "Failed to lock %s: %m", np);
3655                                 goto finish;
3656                         }
3657
3658                         r = btrfs_subvol_snapshot(arg_directory, np, arg_read_only, true);
3659                         if (r < 0) {
3660                                 free(np);
3661                                 log_error_errno(r, "Failed to create snapshot %s from %s: %m", np, arg_directory);
3662                                 goto finish;
3663                         }
3664
3665                         free(arg_directory);
3666                         arg_directory = np;
3667
3668                         remove_subvol = true;
3669
3670                 } else {
3671                         r = image_path_lock(arg_directory, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3672                         if (r == -EBUSY) {
3673                                 log_error_errno(r, "Directory tree %s is currently busy.", arg_directory);
3674                                 goto finish;
3675                         }
3676                         if (r < 0) {
3677                                 log_error_errno(r, "Failed to lock %s: %m", arg_directory);
3678                                 return r;
3679                         }
3680
3681                         if (arg_template) {
3682                                 r = btrfs_subvol_snapshot(arg_template, arg_directory, arg_read_only, true);
3683                                 if (r == -EEXIST) {
3684                                         if (!arg_quiet)
3685                                                 log_info("Directory %s already exists, not populating from template %s.", arg_directory, arg_template);
3686                                 } else if (r < 0) {
3687                                         log_error_errno(r, "Couldn't create snapshot %s from %s: %m", arg_directory, arg_template);
3688                                         goto finish;
3689                                 } else {
3690                                         if (!arg_quiet)
3691                                                 log_info("Populated %s from template %s.", arg_directory, arg_template);
3692                                 }
3693                         }
3694                 }
3695
3696                 if (arg_boot) {
3697                         if (path_is_os_tree(arg_directory) <= 0) {
3698                                 log_error("Directory %s doesn't look like an OS root directory (os-release file is missing). Refusing.", arg_directory);
3699                                 r = -EINVAL;
3700                                 goto finish;
3701                         }
3702                 } else {
3703                         const char *p;
3704
3705                         p = strjoina(arg_directory,
3706                                        argc > optind && path_is_absolute(argv[optind]) ? argv[optind] : "/usr/bin/");
3707                         if (access(p, F_OK) < 0) {
3708                                 log_error("Directory %s lacks the binary to execute or doesn't look like a binary tree. Refusing.", arg_directory);
3709                                 r = -EINVAL;
3710                                 goto finish;
3711                         }
3712                 }
3713
3714         } else {
3715                 char template[] = "/tmp/nspawn-root-XXXXXX";
3716
3717                 assert(arg_image);
3718                 assert(!arg_template);
3719
3720                 r = image_path_lock(arg_image, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3721                 if (r == -EBUSY) {
3722                         r = log_error_errno(r, "Disk image %s is currently busy.", arg_image);
3723                         goto finish;
3724                 }
3725                 if (r < 0) {
3726                         r = log_error_errno(r, "Failed to create image lock: %m");
3727                         goto finish;
3728                 }
3729
3730                 if (!mkdtemp(template)) {
3731                         log_error_errno(errno, "Failed to create temporary directory: %m");
3732                         r = -errno;
3733                         goto finish;
3734                 }
3735
3736                 arg_directory = strdup(template);
3737                 if (!arg_directory) {
3738                         r = log_oom();
3739                         goto finish;
3740                 }
3741
3742                 image_fd = setup_image(&device_path, &loop_nr);
3743                 if (image_fd < 0) {
3744                         r = image_fd;
3745                         goto finish;
3746                 }
3747
3748                 r = dissect_image(image_fd,
3749                                   &root_device, &root_device_rw,
3750                                   &home_device, &home_device_rw,
3751                                   &srv_device, &srv_device_rw,
3752                                   &secondary);
3753                 if (r < 0)
3754                         goto finish;
3755         }
3756
3757         master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
3758         if (master < 0) {
3759                 r = log_error_errno(errno, "Failed to acquire pseudo tty: %m");
3760                 goto finish;
3761         }
3762
3763         r = ptsname_malloc(master, &console);
3764         if (r < 0) {
3765                 r = log_error_errno(r, "Failed to determine tty name: %m");
3766                 goto finish;
3767         }
3768
3769         if (!arg_quiet)
3770                 log_info("Spawning container %s on %s.\nPress ^] three times within 1s to kill container.",
3771                          arg_machine, arg_image ?: arg_directory);
3772
3773         if (unlockpt(master) < 0) {
3774                 r = log_error_errno(errno, "Failed to unlock tty: %m");
3775                 goto finish;
3776         }
3777
3778         assert_se(sigemptyset(&mask) == 0);
3779         sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
3780         assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
3781
3782         assert_se(sigemptyset(&mask_chld) == 0);
3783         assert_se(sigaddset(&mask_chld, SIGCHLD) == 0);
3784
3785         for (;;) {
3786                 _cleanup_close_pair_ int kmsg_socket_pair[2] = { -1, -1 }, rtnl_socket_pair[2] = { -1, -1 };
3787                 ContainerStatus container_status;
3788                 _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
3789                 struct sigaction sa = {
3790                         .sa_handler = nop_handler,
3791                         .sa_flags = SA_NOCLDSTOP,
3792                 };
3793
3794                 r = barrier_create(&barrier);
3795                 if (r < 0) {
3796                         log_error_errno(r, "Cannot initialize IPC barrier: %m");
3797                         goto finish;
3798                 }
3799
3800                 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
3801                         r = log_error_errno(errno, "Failed to create kmsg socket pair: %m");
3802                         goto finish;
3803                 }
3804
3805                 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, rtnl_socket_pair) < 0) {
3806                         r = log_error_errno(errno, "Failed to create rtnl socket pair: %m");
3807                         goto finish;
3808                 }
3809
3810                 /* Child can be killed before execv(), so handle SIGCHLD
3811                  * in order to interrupt parent's blocking calls and
3812                  * give it a chance to call wait() and terminate. */
3813                 r = sigprocmask(SIG_UNBLOCK, &mask_chld, NULL);
3814                 if (r < 0) {
3815                         r = log_error_errno(errno, "Failed to change the signal mask: %m");
3816                         goto finish;
3817                 }
3818
3819                 r = sigaction(SIGCHLD, &sa, NULL);
3820                 if (r < 0) {
3821                         r = log_error_errno(errno, "Failed to install SIGCHLD handler: %m");
3822                         goto finish;
3823                 }
3824
3825                 pid = raw_clone(SIGCHLD|CLONE_NEWNS|
3826                                 (arg_share_system ? 0 : CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS)|
3827                                 (arg_private_network ? CLONE_NEWNET : 0), NULL);
3828                 if (pid < 0) {
3829                         if (errno == EINVAL)
3830                                 r = log_error_errno(errno, "clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
3831                         else
3832                                 r = log_error_errno(errno, "clone() failed: %m");
3833
3834                         goto finish;
3835                 }
3836
3837                 if (pid == 0) {
3838                         /* child */
3839                         _cleanup_free_ char *home = NULL;
3840                         unsigned n_env = 2;
3841                         const char *envp[] = {
3842                                 "PATH=" DEFAULT_PATH_SPLIT_USR,
3843                                 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
3844                                 NULL, /* TERM */
3845                                 NULL, /* HOME */
3846                                 NULL, /* USER */
3847                                 NULL, /* LOGNAME */
3848                                 NULL, /* container_uuid */
3849                                 NULL, /* LISTEN_FDS */
3850                                 NULL, /* LISTEN_PID */
3851                                 NULL
3852                         };
3853                         char **env_use;
3854
3855                         barrier_set_role(&barrier, BARRIER_CHILD);
3856
3857                         envp[n_env] = strv_find_prefix(environ, "TERM=");
3858                         if (envp[n_env])
3859                                 n_env ++;
3860
3861                         master = safe_close(master);
3862
3863                         close_nointr(STDIN_FILENO);
3864                         close_nointr(STDOUT_FILENO);
3865                         close_nointr(STDERR_FILENO);
3866
3867                         kmsg_socket_pair[0] = safe_close(kmsg_socket_pair[0]);
3868                         rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
3869
3870                         reset_all_signal_handlers();
3871                         reset_signal_mask();
3872
3873                         r = open_terminal(console, O_RDWR);
3874                         if (r != STDIN_FILENO) {
3875                                 if (r >= 0) {
3876                                         safe_close(r);
3877                                         r = -EINVAL;
3878                                 }
3879
3880                                 log_error_errno(r, "Failed to open console: %m");
3881                                 _exit(EXIT_FAILURE);
3882                         }
3883
3884                         if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
3885                             dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO) {
3886                                 log_error_errno(errno, "Failed to duplicate console: %m");
3887                                 _exit(EXIT_FAILURE);
3888                         }
3889
3890                         if (setsid() < 0) {
3891                                 log_error_errno(errno, "setsid() failed: %m");
3892                                 _exit(EXIT_FAILURE);
3893                         }
3894
3895                         if (reset_audit_loginuid() < 0)
3896                                 _exit(EXIT_FAILURE);
3897
3898                         if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
3899                                 log_error_errno(errno, "PR_SET_PDEATHSIG failed: %m");
3900                                 _exit(EXIT_FAILURE);
3901                         }
3902
3903                         /* Mark everything as slave, so that we still
3904                          * receive mounts from the real root, but don't
3905                          * propagate mounts to the real root. */
3906                         if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
3907                                 log_error_errno(errno, "MS_SLAVE|MS_REC failed: %m");
3908                                 _exit(EXIT_FAILURE);
3909                         }
3910
3911                         if (mount_devices(arg_directory,
3912                                           root_device, root_device_rw,
3913                                           home_device, home_device_rw,
3914                                           srv_device, srv_device_rw) < 0)
3915                                 _exit(EXIT_FAILURE);
3916
3917                         /* Turn directory into bind mount */
3918                         if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REC, NULL) < 0) {
3919                                 log_error_errno(errno, "Failed to make bind mount: %m");
3920                                 _exit(EXIT_FAILURE);
3921                         }
3922
3923                         r = setup_volatile(arg_directory);
3924                         if (r < 0)
3925                                 _exit(EXIT_FAILURE);
3926
3927                         if (setup_volatile_state(arg_directory) < 0)
3928                                 _exit(EXIT_FAILURE);
3929
3930                         r = base_filesystem_create(arg_directory);
3931                         if (r < 0)
3932                                 _exit(EXIT_FAILURE);
3933
3934                         if (arg_read_only) {
3935                                 r = bind_remount_recursive(arg_directory, true);
3936                                 if (r < 0) {
3937                                         log_error_errno(r, "Failed to make tree read-only: %m");
3938                                         _exit(EXIT_FAILURE);
3939                                 }
3940                         }
3941
3942                         if (mount_all(arg_directory) < 0)
3943                                 _exit(EXIT_FAILURE);
3944
3945                         if (copy_devnodes(arg_directory) < 0)
3946                                 _exit(EXIT_FAILURE);
3947
3948                         if (setup_ptmx(arg_directory) < 0)
3949                                 _exit(EXIT_FAILURE);
3950
3951                         dev_setup(arg_directory);
3952
3953                         if (setup_propagate(arg_directory) < 0)
3954                                 _exit(EXIT_FAILURE);
3955
3956                         if (setup_seccomp() < 0)
3957                                 _exit(EXIT_FAILURE);
3958
3959                         if (setup_dev_console(arg_directory, console) < 0)
3960                                 _exit(EXIT_FAILURE);
3961
3962                         if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
3963                                 _exit(EXIT_FAILURE);
3964                         kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
3965
3966                         if (send_rtnl(rtnl_socket_pair[1]) < 0)
3967                                 _exit(EXIT_FAILURE);
3968                         rtnl_socket_pair[1] = safe_close(rtnl_socket_pair[1]);
3969
3970                         /* Tell the parent that we are ready, and that
3971                          * it can cgroupify us so that we lack access
3972                          * to certain devices and resources. */
3973                         (void) barrier_place(&barrier);
3974
3975                         if (setup_boot_id(arg_directory) < 0)
3976                                 _exit(EXIT_FAILURE);
3977
3978                         if (setup_timezone(arg_directory) < 0)
3979                                 _exit(EXIT_FAILURE);
3980
3981                         if (setup_resolv_conf(arg_directory) < 0)
3982                                 _exit(EXIT_FAILURE);
3983
3984                         if (setup_journal(arg_directory) < 0)
3985                                 _exit(EXIT_FAILURE);
3986
3987                         if (mount_binds(arg_directory, arg_bind, false) < 0)
3988                                 _exit(EXIT_FAILURE);
3989
3990                         if (mount_binds(arg_directory, arg_bind_ro, true) < 0)
3991                                 _exit(EXIT_FAILURE);
3992
3993                         if (mount_tmpfs(arg_directory) < 0)
3994                                 _exit(EXIT_FAILURE);
3995
3996                         /* Wait until we are cgroup-ified, so that we
3997                          * can mount the right cgroup path writable */
3998                         (void) barrier_sync_next(&barrier);
3999
4000                         if (mount_cgroup(arg_directory) < 0)
4001                                 _exit(EXIT_FAILURE);
4002
4003                         if (chdir(arg_directory) < 0) {
4004                                 log_error_errno(errno, "chdir(%s) failed: %m", arg_directory);
4005                                 _exit(EXIT_FAILURE);
4006                         }
4007
4008                         if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) {
4009                                 log_error_errno(errno, "mount(MS_MOVE) failed: %m");
4010                                 _exit(EXIT_FAILURE);
4011                         }
4012
4013                         if (chroot(".") < 0) {
4014                                 log_error_errno(errno, "chroot() failed: %m");
4015                                 _exit(EXIT_FAILURE);
4016                         }
4017
4018                         if (chdir("/") < 0) {
4019                                 log_error_errno(errno, "chdir() failed: %m");
4020                                 _exit(EXIT_FAILURE);
4021                         }
4022
4023                         umask(0022);
4024
4025                         if (arg_private_network)
4026                                 loopback_setup();
4027
4028                         if (drop_capabilities() < 0) {
4029                                 log_error_errno(errno, "drop_capabilities() failed: %m");
4030                                 _exit(EXIT_FAILURE);
4031                         }
4032
4033                         r = change_uid_gid(&home);
4034                         if (r < 0)
4035                                 _exit(EXIT_FAILURE);
4036
4037                         if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
4038                             (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
4039                             (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
4040                                 log_oom();
4041                                 _exit(EXIT_FAILURE);
4042                         }
4043
4044                         if (!sd_id128_equal(arg_uuid, SD_ID128_NULL)) {
4045                                 char as_uuid[37];
4046
4047                                 if (asprintf((char**)(envp + n_env++), "container_uuid=%s", id128_format_as_uuid(arg_uuid, as_uuid)) < 0) {
4048                                         log_oom();
4049                                         _exit(EXIT_FAILURE);
4050                                 }
4051                         }
4052
4053                         if (fdset_size(fds) > 0) {
4054                                 r = fdset_cloexec(fds, false);
4055                                 if (r < 0) {
4056                                         log_error_errno(r, "Failed to unset O_CLOEXEC for file descriptors.");
4057                                         _exit(EXIT_FAILURE);
4058                                 }
4059
4060                                 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", n_fd_passed) < 0) ||
4061                                     (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0)) {
4062                                         log_oom();
4063                                         _exit(EXIT_FAILURE);
4064                                 }
4065                         }
4066
4067                         setup_hostname();
4068
4069                         if (arg_personality != 0xffffffffLU) {
4070                                 if (personality(arg_personality) < 0) {
4071                                         log_error_errno(errno, "personality() failed: %m");
4072                                         _exit(EXIT_FAILURE);
4073                                 }
4074                         } else if (secondary) {
4075                                 if (personality(PER_LINUX32) < 0) {
4076                                         log_error_errno(errno, "personality() failed: %m");
4077                                         _exit(EXIT_FAILURE);
4078                                 }
4079                         }
4080
4081 #ifdef HAVE_SELINUX
4082                         if (arg_selinux_context)
4083                                 if (setexeccon((security_context_t) arg_selinux_context) < 0) {
4084                                         log_error_errno(errno, "setexeccon(\"%s\") failed: %m", arg_selinux_context);
4085                                         _exit(EXIT_FAILURE);
4086                                 }
4087 #endif
4088
4089                         if (!strv_isempty(arg_setenv)) {
4090                                 char **n;
4091
4092                                 n = strv_env_merge(2, envp, arg_setenv);
4093                                 if (!n) {
4094                                         log_oom();
4095                                         _exit(EXIT_FAILURE);
4096                                 }
4097
4098                                 env_use = n;
4099                         } else
4100                                 env_use = (char**) envp;
4101
4102                         /* Wait until the parent is ready with the setup, too... */
4103                         if (!barrier_place_and_sync(&barrier))
4104                                 _exit(EXIT_FAILURE);
4105
4106                         if (arg_boot) {
4107                                 char **a;
4108                                 size_t l;
4109
4110                                 /* Automatically search for the init system */
4111
4112                                 l = 1 + argc - optind;
4113                                 a = newa(char*, l + 1);
4114                                 memcpy(a + 1, argv + optind, l * sizeof(char*));
4115
4116                                 a[0] = (char*) "/usr/lib/systemd/systemd";
4117                                 execve(a[0], a, env_use);
4118
4119                                 a[0] = (char*) "/lib/systemd/systemd";
4120                                 execve(a[0], a, env_use);
4121
4122                                 a[0] = (char*) "/sbin/init";
4123                                 execve(a[0], a, env_use);
4124                         } else if (argc > optind)
4125                                 execvpe(argv[optind], argv + optind, env_use);
4126                         else {
4127                                 chdir(home ? home : "/root");
4128                                 execle("/bin/bash", "-bash", NULL, env_use);
4129                                 execle("/bin/sh", "-sh", NULL, env_use);
4130                         }
4131
4132                         log_error_errno(errno, "execv() failed: %m");
4133                         _exit(EXIT_FAILURE);
4134                 }
4135
4136                 barrier_set_role(&barrier, BARRIER_PARENT);
4137                 fdset_free(fds);
4138                 fds = NULL;
4139
4140                 kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
4141                 rtnl_socket_pair[1] = safe_close(rtnl_socket_pair[1]);
4142
4143                 /* Wait for the most basic Child-setup to be done,
4144                  * before we add hardware to it, and place it in a
4145                  * cgroup. */
4146                 if (barrier_sync_next(&barrier)) {
4147                         int ifi = 0;
4148
4149                         r = move_network_interfaces(pid);
4150                         if (r < 0)
4151                                 goto finish;
4152
4153                         r = setup_veth(pid, veth_name, &ifi);
4154                         if (r < 0)
4155                                 goto finish;
4156
4157                         r = setup_bridge(veth_name, &ifi);
4158                         if (r < 0)
4159                                 goto finish;
4160
4161                         r = setup_macvlan(pid);
4162                         if (r < 0)
4163                                 goto finish;
4164
4165                         r = setup_ipvlan(pid);
4166                         if (r < 0)
4167                                 goto finish;
4168
4169                         r = register_machine(pid, ifi);
4170                         if (r < 0)
4171                                 goto finish;
4172
4173                         /* Block SIGCHLD here, before notifying child.
4174                          * process_pty() will handle it with the other signals. */
4175                         r = sigprocmask(SIG_BLOCK, &mask_chld, NULL);
4176                         if (r < 0)
4177                                 goto finish;
4178
4179                         /* Reset signal to default */
4180                         r = default_signals(SIGCHLD, -1);
4181                         if (r < 0)
4182                                 goto finish;
4183
4184                         /* Notify the child that the parent is ready with all
4185                          * its setup, and that the child can now hand over
4186                          * control to the code to run inside the container. */
4187                         (void) barrier_place(&barrier);
4188
4189                         /* And wait until the child is completely ready now. */
4190                         if (barrier_place_and_sync(&barrier)) {
4191                                 _cleanup_event_unref_ sd_event *event = NULL;
4192                                 _cleanup_(pty_forward_freep) PTYForward *forward = NULL;
4193                                 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
4194                                 char last_char = 0;
4195
4196                                 sd_notifyf(false,
4197                                            "READY=1\n"
4198                                            "STATUS=Container running.\n"
4199                                            "X_NSPAWN_LEADER_PID=" PID_FMT, pid);
4200
4201                                 r = sd_event_new(&event);
4202                                 if (r < 0) {
4203                                         log_error_errno(r, "Failed to get default event source: %m");
4204                                         goto finish;
4205                                 }
4206
4207                                 if (arg_boot) {
4208                                         /* Try to kill the init system on SIGINT or SIGTERM */
4209                                         sd_event_add_signal(event, NULL, SIGINT, on_orderly_shutdown, UINT32_TO_PTR(pid));
4210                                         sd_event_add_signal(event, NULL, SIGTERM, on_orderly_shutdown, UINT32_TO_PTR(pid));
4211                                 } else {
4212                                         /* Immediately exit */
4213                                         sd_event_add_signal(event, NULL, SIGINT, NULL, NULL);
4214                                         sd_event_add_signal(event, NULL, SIGTERM, NULL, NULL);
4215                                 }
4216
4217                                 /* simply exit on sigchld */
4218                                 sd_event_add_signal(event, NULL, SIGCHLD, NULL, NULL);
4219
4220                                 if (arg_expose_ports) {
4221                                         r = watch_rtnl(event, rtnl_socket_pair[0], &exposed, &rtnl);
4222                                         if (r < 0)
4223                                                 goto finish;
4224
4225                                         (void) expose_ports(rtnl, &exposed);
4226                                 }
4227
4228                                 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
4229
4230                                 r = pty_forward_new(event, master, true, &forward);
4231                                 if (r < 0) {
4232                                         log_error_errno(r, "Failed to create PTY forwarder: %m");
4233                                         goto finish;
4234                                 }
4235
4236                                 r = sd_event_loop(event);
4237                                 if (r < 0) {
4238                                         log_error_errno(r, "Failed to run event loop: %m");
4239                                         goto finish;
4240                                 }
4241
4242                                 pty_forward_get_last_char(forward, &last_char);
4243
4244                                 forward = pty_forward_free(forward);
4245
4246                                 if (!arg_quiet && last_char != '\n')
4247                                         putc('\n', stdout);
4248
4249                                 /* Kill if it is not dead yet anyway */
4250                                 terminate_machine(pid);
4251                         }
4252                 }
4253
4254                 /* Normally redundant, but better safe than sorry */
4255                 kill(pid, SIGKILL);
4256
4257                 r = wait_for_container(pid, &container_status);
4258                 pid = 0;
4259
4260                 if (r < 0)
4261                         /* We failed to wait for the container, or the
4262                          * container exited abnormally */
4263                         goto finish;
4264                 else if (r > 0 || container_status == CONTAINER_TERMINATED){
4265                         /* The container exited with a non-zero
4266                          * status, or with zero status and no reboot
4267                          * was requested. */
4268                         ret = r;
4269                         break;
4270                 }
4271
4272                 /* CONTAINER_REBOOTED, loop again */
4273
4274                 if (arg_keep_unit) {
4275                         /* Special handling if we are running as a
4276                          * service: instead of simply restarting the
4277                          * machine we want to restart the entire
4278                          * service, so let's inform systemd about this
4279                          * with the special exit code 133. The service
4280                          * file uses RestartForceExitStatus=133 so
4281                          * that this results in a full nspawn
4282                          * restart. This is necessary since we might
4283                          * have cgroup parameters set we want to have
4284                          * flushed out. */
4285                         ret = 133;
4286                         r = 0;
4287                         break;
4288                 }
4289
4290                 flush_ports(&exposed);
4291         }
4292
4293 finish:
4294         sd_notify(false,
4295                   "STOPPING=1\n"
4296                   "STATUS=Terminating...");
4297
4298         loop_remove(loop_nr, &image_fd);
4299
4300         if (pid > 0)
4301                 kill(pid, SIGKILL);
4302
4303         if (remove_subvol && arg_directory) {
4304                 int k;
4305
4306                 k = btrfs_subvol_remove(arg_directory);
4307                 if (k < 0)
4308                         log_warning_errno(k, "Cannot remove subvolume '%s', ignoring: %m", arg_directory);
4309         }
4310
4311         if (arg_machine) {
4312                 const char *p;
4313
4314                 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
4315                 (void) rm_rf(p, false, true, false);
4316         }
4317
4318         free(arg_directory);
4319         free(arg_template);
4320         free(arg_image);
4321         free(arg_machine);
4322         free(arg_user);
4323         strv_free(arg_setenv);
4324         strv_free(arg_network_interfaces);
4325         strv_free(arg_network_macvlan);
4326         strv_free(arg_network_ipvlan);
4327         strv_free(arg_bind);
4328         strv_free(arg_bind_ro);
4329         strv_free(arg_tmpfs);
4330
4331         flush_ports(&exposed);
4332
4333         while (arg_expose_ports) {
4334                 ExposePort *p = arg_expose_ports;
4335                 LIST_REMOVE(ports, arg_expose_ports, p);
4336                 free(p);
4337         }
4338
4339         return r < 0 ? EXIT_FAILURE : ret;
4340 }