chiark / gitweb /
588a8ae8ac8c4ff230d704eb25a7ef0342a35184
[elogind.git] / src / nspawn / nspawn.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4   This file is part of systemd.
5
6   Copyright 2010 Lennart Poettering
7
8   systemd is free software; you can redistribute it and/or modify it
9   under the terms of the GNU Lesser General Public License as published by
10   the Free Software Foundation; either version 2.1 of the License, or
11   (at your option) any later version.
12
13   systemd is distributed in the hope that it will be useful, but
14   WITHOUT ANY WARRANTY; without even the implied warranty of
15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16   Lesser General Public License for more details.
17
18   You should have received a copy of the GNU Lesser General Public License
19   along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <signal.h>
23 #include <sched.h>
24 #include <unistd.h>
25 #include <sys/types.h>
26 #include <sys/syscall.h>
27 #include <sys/mount.h>
28 #include <sys/wait.h>
29 #include <stdlib.h>
30 #include <string.h>
31 #include <stdio.h>
32 #include <errno.h>
33 #include <sys/prctl.h>
34 #include <getopt.h>
35 #include <termios.h>
36 #include <sys/signalfd.h>
37 #include <grp.h>
38 #include <linux/fs.h>
39 #include <sys/un.h>
40 #include <sys/socket.h>
41 #include <linux/netlink.h>
42 #include <net/if.h>
43 #include <linux/veth.h>
44 #include <sys/personality.h>
45 #include <linux/loop.h>
46 #include <poll.h>
47 #include <sys/file.h>
48
49 #ifdef HAVE_SELINUX
50 #include <selinux/selinux.h>
51 #endif
52
53 #ifdef HAVE_SECCOMP
54 #include <seccomp.h>
55 #endif
56
57 #ifdef HAVE_BLKID
58 #include <blkid/blkid.h>
59 #endif
60
61 #include "sd-daemon.h"
62 #include "sd-bus.h"
63 #include "sd-id128.h"
64 #include "sd-rtnl.h"
65 #include "log.h"
66 #include "util.h"
67 #include "mkdir.h"
68 #include "macro.h"
69 #include "audit.h"
70 #include "missing.h"
71 #include "cgroup-util.h"
72 #include "strv.h"
73 #include "path-util.h"
74 #include "loopback-setup.h"
75 #include "dev-setup.h"
76 #include "fdset.h"
77 #include "build.h"
78 #include "fileio.h"
79 #include "bus-util.h"
80 #include "bus-error.h"
81 #include "ptyfwd.h"
82 #include "bus-kernel.h"
83 #include "env-util.h"
84 #include "def.h"
85 #include "rtnl-util.h"
86 #include "udev-util.h"
87 #include "blkid-util.h"
88 #include "gpt.h"
89 #include "siphash24.h"
90 #include "copy.h"
91 #include "base-filesystem.h"
92 #include "barrier.h"
93 #include "event-util.h"
94 #include "capability.h"
95 #include "cap-list.h"
96 #include "btrfs-util.h"
97 #include "machine-image.h"
98 #include "list.h"
99 #include "in-addr-util.h"
100 #include "fw-util.h"
101 #include "local-addresses.h"
102
103 #ifdef HAVE_SECCOMP
104 #include "seccomp-util.h"
105 #endif
106
107 typedef struct ExposePort {
108         int protocol;
109         uint16_t host_port;
110         uint16_t container_port;
111         LIST_FIELDS(struct ExposePort, ports);
112 } ExposePort;
113
/* Ways a container run can end. NOTE(review): the code that produces
 * and consumes these values lies outside this excerpt -- confirm the
 * exact meaning there. */
typedef enum ContainerStatus {
        CONTAINER_TERMINATED = 0,
        CONTAINER_REBOOTED = 1
} ContainerStatus;
118
/* Journal linking mode selected with --link-journal= (or -j). The
 * "try-" variants of the option map to the same values and additionally
 * set arg_link_journal_try. */
typedef enum LinkJournal {
        LINK_NO = 0,     /* "no" */
        LINK_AUTO = 1,   /* "auto"; also the built-in default */
        LINK_HOST = 2,   /* "host" and "try-host" */
        LINK_GUEST = 3   /* "guest", "try-guest" and -j */
} LinkJournal;
125
/* Mode selected with --volatile[=MODE]: a missing argument or a true
 * boolean yields VOLATILE_YES, a false boolean VOLATILE_NO, and the
 * literal "state" VOLATILE_STATE. */
typedef enum Volatile {
        VOLATILE_NO = 0,
        VOLATILE_YES = 1,
        VOLATILE_STATE = 2,
} Volatile;
131
132 static char *arg_directory = NULL;
133 static char *arg_template = NULL;
134 static char *arg_user = NULL;
135 static sd_id128_t arg_uuid = {};
136 static char *arg_machine = NULL;
137 static const char *arg_selinux_context = NULL;
138 static const char *arg_selinux_apifs_context = NULL;
139 static const char *arg_slice = NULL;
140 static bool arg_private_network = false;
141 static bool arg_read_only = false;
142 static bool arg_boot = false;
143 static bool arg_ephemeral = false;
144 static LinkJournal arg_link_journal = LINK_AUTO;
145 static bool arg_link_journal_try = false;
146 static uint64_t arg_retain =
147         (1ULL << CAP_CHOWN) |
148         (1ULL << CAP_DAC_OVERRIDE) |
149         (1ULL << CAP_DAC_READ_SEARCH) |
150         (1ULL << CAP_FOWNER) |
151         (1ULL << CAP_FSETID) |
152         (1ULL << CAP_IPC_OWNER) |
153         (1ULL << CAP_KILL) |
154         (1ULL << CAP_LEASE) |
155         (1ULL << CAP_LINUX_IMMUTABLE) |
156         (1ULL << CAP_NET_BIND_SERVICE) |
157         (1ULL << CAP_NET_BROADCAST) |
158         (1ULL << CAP_NET_RAW) |
159         (1ULL << CAP_SETGID) |
160         (1ULL << CAP_SETFCAP) |
161         (1ULL << CAP_SETPCAP) |
162         (1ULL << CAP_SETUID) |
163         (1ULL << CAP_SYS_ADMIN) |
164         (1ULL << CAP_SYS_CHROOT) |
165         (1ULL << CAP_SYS_NICE) |
166         (1ULL << CAP_SYS_PTRACE) |
167         (1ULL << CAP_SYS_TTY_CONFIG) |
168         (1ULL << CAP_SYS_RESOURCE) |
169         (1ULL << CAP_SYS_BOOT) |
170         (1ULL << CAP_AUDIT_WRITE) |
171         (1ULL << CAP_AUDIT_CONTROL) |
172         (1ULL << CAP_MKNOD);
173 static char **arg_bind = NULL;
174 static char **arg_bind_ro = NULL;
175 static char **arg_tmpfs = NULL;
176 static char **arg_setenv = NULL;
177 static bool arg_quiet = false;
178 static bool arg_share_system = false;
179 static bool arg_register = true;
180 static bool arg_keep_unit = false;
181 static char **arg_network_interfaces = NULL;
182 static char **arg_network_macvlan = NULL;
183 static char **arg_network_ipvlan = NULL;
184 static bool arg_network_veth = false;
185 static const char *arg_network_bridge = NULL;
186 static unsigned long arg_personality = 0xffffffffLU;
187 static char *arg_image = NULL;
188 static Volatile arg_volatile = VOLATILE_NO;
189 static ExposePort *arg_expose_ports = NULL;
190
191 static void help(void) {
192         printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
193                "Spawn a minimal namespace container for debugging, testing and building.\n\n"
194                "  -h --help                 Show this help\n"
195                "     --version              Print version string\n"
196                "  -q --quiet                Do not show status information\n"
197                "  -D --directory=PATH       Root directory for the container\n"
198                "     --template=PATH        Initialize root directory from template directory,\n"
199                "                            if missing\n"
200                "  -x --ephemeral            Run container with snapshot of root directory, and\n"
201                "                            remove it after exit\n"
202                "  -i --image=PATH           File system device or disk image for the container\n"
203                "  -b --boot                 Boot up full system (i.e. invoke init)\n"
204                "  -u --user=USER            Run the command under specified user or uid\n"
205                "  -M --machine=NAME         Set the machine name for the container\n"
206                "     --uuid=UUID            Set a specific machine UUID for the container\n"
207                "  -S --slice=SLICE          Place the container in the specified slice\n"
208                "     --private-network      Disable network in container\n"
209                "     --network-interface=INTERFACE\n"
210                "                            Assign an existing network interface to the\n"
211                "                            container\n"
212                "     --network-macvlan=INTERFACE\n"
213                "                            Create a macvlan network interface based on an\n"
214                "                            existing network interface to the container\n"
215                "     --network-ipvlan=INTERFACE\n"
216                "                            Create a ipvlan network interface based on an\n"
217                "                            existing network interface to the container\n"
218                "  -n --network-veth         Add a virtual ethernet connection between host\n"
219                "                            and container\n"
220                "     --network-bridge=INTERFACE\n"
221                "                            Add a virtual ethernet connection between host\n"
222                "                            and container and add it to an existing bridge on\n"
223                "                            the host\n"
224                "  -p --port=[PROTOCOL:]HOSTPORT[:CONTAINERPORT]\n"
225                "                            Expose a container IP port on the host\n"
226                "  -Z --selinux-context=SECLABEL\n"
227                "                            Set the SELinux security context to be used by\n"
228                "                            processes in the container\n"
229                "  -L --selinux-apifs-context=SECLABEL\n"
230                "                            Set the SELinux security context to be used by\n"
231                "                            API/tmpfs file systems in the container\n"
232                "     --capability=CAP       In addition to the default, retain specified\n"
233                "                            capability\n"
234                "     --drop-capability=CAP  Drop the specified capability from the default set\n"
235                "     --link-journal=MODE    Link up guest journal, one of no, auto, guest, host,\n"
236                "                            try-guest, try-host\n"
237                "  -j                        Equivalent to --link-journal=try-guest\n"
238                "     --read-only            Mount the root directory read-only\n"
239                "     --bind=PATH[:PATH]     Bind mount a file or directory from the host into\n"
240                "                            the container\n"
241                "     --bind-ro=PATH[:PATH]  Similar, but creates a read-only bind mount\n"
242                "     --tmpfs=PATH:[OPTIONS] Mount an empty tmpfs to the specified directory\n"
243                "     --setenv=NAME=VALUE    Pass an environment variable to PID 1\n"
244                "     --share-system         Share system namespaces with host\n"
245                "     --register=BOOLEAN     Register container as machine\n"
246                "     --keep-unit            Do not register a scope for the machine, reuse\n"
247                "                            the service unit nspawn is running in\n"
248                "     --volatile[=MODE]      Run the system in volatile mode\n"
249                , program_invocation_short_name);
250 }
251
/* Replaces the string held in *b with a cleaned-up form of 'path'.
 *
 * The path is first resolved with canonicalize_file_name(). If that
 * fails with ENOENT, the result of path_make_absolute_cwd() is used
 * instead. Whichever string was obtained is passed through
 * path_kill_slashes() and its return value is stored in *b; the string
 * previously stored there is freed.
 *
 * Returns 0 on success, -errno if resolving failed for a reason other
 * than ENOENT, and -ENOMEM if path_make_absolute_cwd() returned NULL.
 * On failure *b is left untouched. */
static int set_sanitized_path(char **b, const char *path) {
        char *resolved;

        assert(b);
        assert(path);

        resolved = canonicalize_file_name(path);

        /* Any failure other than ENOENT is reported to the caller */
        if (!resolved && errno != ENOENT)
                return -errno;

        if (!resolved) {
                /* Nothing to resolve against, settle for an absolute path */
                resolved = path_make_absolute_cwd(path);
                if (!resolved)
                        return -ENOMEM;
        }

        free(*b);
        *b = path_kill_slashes(resolved);

        return 0;
}
272
273 static int parse_argv(int argc, char *argv[]) {
274
275         enum {
276                 ARG_VERSION = 0x100,
277                 ARG_PRIVATE_NETWORK,
278                 ARG_UUID,
279                 ARG_READ_ONLY,
280                 ARG_CAPABILITY,
281                 ARG_DROP_CAPABILITY,
282                 ARG_LINK_JOURNAL,
283                 ARG_BIND,
284                 ARG_BIND_RO,
285                 ARG_TMPFS,
286                 ARG_SETENV,
287                 ARG_SHARE_SYSTEM,
288                 ARG_REGISTER,
289                 ARG_KEEP_UNIT,
290                 ARG_NETWORK_INTERFACE,
291                 ARG_NETWORK_MACVLAN,
292                 ARG_NETWORK_IPVLAN,
293                 ARG_NETWORK_BRIDGE,
294                 ARG_PERSONALITY,
295                 ARG_VOLATILE,
296                 ARG_TEMPLATE,
297         };
298
299         static const struct option options[] = {
300                 { "help",                  no_argument,       NULL, 'h'                   },
301                 { "version",               no_argument,       NULL, ARG_VERSION           },
302                 { "directory",             required_argument, NULL, 'D'                   },
303                 { "template",              required_argument, NULL, ARG_TEMPLATE          },
304                 { "ephemeral",             no_argument,       NULL, 'x'                   },
305                 { "user",                  required_argument, NULL, 'u'                   },
306                 { "private-network",       no_argument,       NULL, ARG_PRIVATE_NETWORK   },
307                 { "boot",                  no_argument,       NULL, 'b'                   },
308                 { "uuid",                  required_argument, NULL, ARG_UUID              },
309                 { "read-only",             no_argument,       NULL, ARG_READ_ONLY         },
310                 { "capability",            required_argument, NULL, ARG_CAPABILITY        },
311                 { "drop-capability",       required_argument, NULL, ARG_DROP_CAPABILITY   },
312                 { "link-journal",          required_argument, NULL, ARG_LINK_JOURNAL      },
313                 { "bind",                  required_argument, NULL, ARG_BIND              },
314                 { "bind-ro",               required_argument, NULL, ARG_BIND_RO           },
315                 { "tmpfs",                 required_argument, NULL, ARG_TMPFS             },
316                 { "machine",               required_argument, NULL, 'M'                   },
317                 { "slice",                 required_argument, NULL, 'S'                   },
318                 { "setenv",                required_argument, NULL, ARG_SETENV            },
319                 { "selinux-context",       required_argument, NULL, 'Z'                   },
320                 { "selinux-apifs-context", required_argument, NULL, 'L'                   },
321                 { "quiet",                 no_argument,       NULL, 'q'                   },
322                 { "share-system",          no_argument,       NULL, ARG_SHARE_SYSTEM      },
323                 { "register",              required_argument, NULL, ARG_REGISTER          },
324                 { "keep-unit",             no_argument,       NULL, ARG_KEEP_UNIT         },
325                 { "network-interface",     required_argument, NULL, ARG_NETWORK_INTERFACE },
326                 { "network-macvlan",       required_argument, NULL, ARG_NETWORK_MACVLAN   },
327                 { "network-ipvlan",        required_argument, NULL, ARG_NETWORK_IPVLAN    },
328                 { "network-veth",          no_argument,       NULL, 'n'                   },
329                 { "network-bridge",        required_argument, NULL, ARG_NETWORK_BRIDGE    },
330                 { "personality",           required_argument, NULL, ARG_PERSONALITY       },
331                 { "image",                 required_argument, NULL, 'i'                   },
332                 { "volatile",              optional_argument, NULL, ARG_VOLATILE          },
333                 { "port",                  required_argument, NULL, 'p'                   },
334                 {}
335         };
336
337         int c, r;
338         uint64_t plus = 0, minus = 0;
339
340         assert(argc >= 0);
341         assert(argv);
342
343         while ((c = getopt_long(argc, argv, "+hD:u:bL:M:jS:Z:qi:xp:n", options, NULL)) >= 0)
344
345                 switch (c) {
346
347                 case 'h':
348                         help();
349                         return 0;
350
351                 case ARG_VERSION:
352                         puts(PACKAGE_STRING);
353                         puts(SYSTEMD_FEATURES);
354                         return 0;
355
356                 case 'D':
357                         r = set_sanitized_path(&arg_directory, optarg);
358                         if (r < 0)
359                                 return log_error_errno(r, "Invalid root directory: %m");
360
361                         break;
362
363                 case ARG_TEMPLATE:
364                         r = set_sanitized_path(&arg_template, optarg);
365                         if (r < 0)
366                                 return log_error_errno(r, "Invalid template directory: %m");
367
368                         break;
369
370                 case 'i':
371                         r = set_sanitized_path(&arg_image, optarg);
372                         if (r < 0)
373                                 return log_error_errno(r, "Invalid image path: %m");
374
375                         break;
376
377                 case 'x':
378                         arg_ephemeral = true;
379                         break;
380
381                 case 'u':
382                         free(arg_user);
383                         arg_user = strdup(optarg);
384                         if (!arg_user)
385                                 return log_oom();
386
387                         break;
388
389                 case ARG_NETWORK_BRIDGE:
390                         arg_network_bridge = optarg;
391
392                         /* fall through */
393
394                 case 'n':
395                         arg_network_veth = true;
396                         arg_private_network = true;
397                         break;
398
399                 case ARG_NETWORK_INTERFACE:
400                         if (strv_extend(&arg_network_interfaces, optarg) < 0)
401                                 return log_oom();
402
403                         arg_private_network = true;
404                         break;
405
406                 case ARG_NETWORK_MACVLAN:
407                         if (strv_extend(&arg_network_macvlan, optarg) < 0)
408                                 return log_oom();
409
410                         arg_private_network = true;
411                         break;
412
413                 case ARG_NETWORK_IPVLAN:
414                         if (strv_extend(&arg_network_ipvlan, optarg) < 0)
415                                 return log_oom();
416
417                         /* fall through */
418
419                 case ARG_PRIVATE_NETWORK:
420                         arg_private_network = true;
421                         break;
422
423                 case 'b':
424                         arg_boot = true;
425                         break;
426
427                 case ARG_UUID:
428                         r = sd_id128_from_string(optarg, &arg_uuid);
429                         if (r < 0) {
430                                 log_error("Invalid UUID: %s", optarg);
431                                 return r;
432                         }
433                         break;
434
435                 case 'S':
436                         arg_slice = optarg;
437                         break;
438
439                 case 'M':
440                         if (isempty(optarg)) {
441                                 free(arg_machine);
442                                 arg_machine = NULL;
443                         } else {
444                                 if (!machine_name_is_valid(optarg)) {
445                                         log_error("Invalid machine name: %s", optarg);
446                                         return -EINVAL;
447                                 }
448
449                                 r = free_and_strdup(&arg_machine, optarg);
450                                 if (r < 0)
451                                         return log_oom();
452
453                                 break;
454                         }
455
456                 case 'Z':
457                         arg_selinux_context = optarg;
458                         break;
459
460                 case 'L':
461                         arg_selinux_apifs_context = optarg;
462                         break;
463
464                 case ARG_READ_ONLY:
465                         arg_read_only = true;
466                         break;
467
468                 case ARG_CAPABILITY:
469                 case ARG_DROP_CAPABILITY: {
470                         const char *state, *word;
471                         size_t length;
472
473                         FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
474                                 _cleanup_free_ char *t;
475
476                                 t = strndup(word, length);
477                                 if (!t)
478                                         return log_oom();
479
480                                 if (streq(t, "all")) {
481                                         if (c == ARG_CAPABILITY)
482                                                 plus = (uint64_t) -1;
483                                         else
484                                                 minus = (uint64_t) -1;
485                                 } else {
486                                         int cap;
487
488                                         cap = capability_from_name(t);
489                                         if (cap < 0) {
490                                                 log_error("Failed to parse capability %s.", t);
491                                                 return -EINVAL;
492                                         }
493
494                                         if (c == ARG_CAPABILITY)
495                                                 plus |= 1ULL << (uint64_t) cap;
496                                         else
497                                                 minus |= 1ULL << (uint64_t) cap;
498                                 }
499                         }
500
501                         break;
502                 }
503
504                 case 'j':
505                         arg_link_journal = LINK_GUEST;
506                         arg_link_journal_try = true;
507                         break;
508
509                 case ARG_LINK_JOURNAL:
510                         if (streq(optarg, "auto")) {
511                                 arg_link_journal = LINK_AUTO;
512                                 arg_link_journal_try = false;
513                         } else if (streq(optarg, "no")) {
514                                 arg_link_journal = LINK_NO;
515                                 arg_link_journal_try = false;
516                         } else if (streq(optarg, "guest")) {
517                                 arg_link_journal = LINK_GUEST;
518                                 arg_link_journal_try = false;
519                         } else if (streq(optarg, "host")) {
520                                 arg_link_journal = LINK_HOST;
521                                 arg_link_journal_try = false;
522                         } else if (streq(optarg, "try-guest")) {
523                                 arg_link_journal = LINK_GUEST;
524                                 arg_link_journal_try = true;
525                         } else if (streq(optarg, "try-host")) {
526                                 arg_link_journal = LINK_HOST;
527                                 arg_link_journal_try = true;
528                         } else {
529                                 log_error("Failed to parse link journal mode %s", optarg);
530                                 return -EINVAL;
531                         }
532
533                         break;
534
535                 case ARG_BIND:
536                 case ARG_BIND_RO: {
537                         _cleanup_free_ char *a = NULL, *b = NULL;
538                         char *e;
539                         char ***x;
540
541                         x = c == ARG_BIND ? &arg_bind : &arg_bind_ro;
542
543                         e = strchr(optarg, ':');
544                         if (e) {
545                                 a = strndup(optarg, e - optarg);
546                                 b = strdup(e + 1);
547                         } else {
548                                 a = strdup(optarg);
549                                 b = strdup(optarg);
550                         }
551
552                         if (!a || !b)
553                                 return log_oom();
554
555                         if (!path_is_absolute(a) || !path_is_absolute(b)) {
556                                 log_error("Invalid bind mount specification: %s", optarg);
557                                 return -EINVAL;
558                         }
559
560                         r = strv_extend(x, a);
561                         if (r < 0)
562                                 return log_oom();
563
564                         r = strv_extend(x, b);
565                         if (r < 0)
566                                 return log_oom();
567
568                         break;
569                 }
570
571                 case ARG_TMPFS: {
572                         _cleanup_free_ char *a = NULL, *b = NULL;
573                         char *e;
574
575                         e = strchr(optarg, ':');
576                         if (e) {
577                                 a = strndup(optarg, e - optarg);
578                                 b = strdup(e + 1);
579                         } else {
580                                 a = strdup(optarg);
581                                 b = strdup("mode=0755");
582                         }
583
584                         if (!a || !b)
585                                 return log_oom();
586
587                         if (!path_is_absolute(a)) {
588                                 log_error("Invalid tmpfs specification: %s", optarg);
589                                 return -EINVAL;
590                         }
591
592                         r = strv_push(&arg_tmpfs, a);
593                         if (r < 0)
594                                 return log_oom();
595
596                         a = NULL;
597
598                         r = strv_push(&arg_tmpfs, b);
599                         if (r < 0)
600                                 return log_oom();
601
602                         b = NULL;
603
604                         break;
605                 }
606
607                 case ARG_SETENV: {
608                         char **n;
609
610                         if (!env_assignment_is_valid(optarg)) {
611                                 log_error("Environment variable assignment '%s' is not valid.", optarg);
612                                 return -EINVAL;
613                         }
614
615                         n = strv_env_set(arg_setenv, optarg);
616                         if (!n)
617                                 return log_oom();
618
619                         strv_free(arg_setenv);
620                         arg_setenv = n;
621                         break;
622                 }
623
624                 case 'q':
625                         arg_quiet = true;
626                         break;
627
628                 case ARG_SHARE_SYSTEM:
629                         arg_share_system = true;
630                         break;
631
632                 case ARG_REGISTER:
633                         r = parse_boolean(optarg);
634                         if (r < 0) {
635                                 log_error("Failed to parse --register= argument: %s", optarg);
636                                 return r;
637                         }
638
639                         arg_register = r;
640                         break;
641
642                 case ARG_KEEP_UNIT:
643                         arg_keep_unit = true;
644                         break;
645
646                 case ARG_PERSONALITY:
647
648                         arg_personality = personality_from_string(optarg);
649                         if (arg_personality == 0xffffffffLU) {
650                                 log_error("Unknown or unsupported personality '%s'.", optarg);
651                                 return -EINVAL;
652                         }
653
654                         break;
655
656                 case ARG_VOLATILE:
657
658                         if (!optarg)
659                                 arg_volatile = VOLATILE_YES;
660                         else {
661                                 r = parse_boolean(optarg);
662                                 if (r < 0) {
663                                         if (streq(optarg, "state"))
664                                                 arg_volatile = VOLATILE_STATE;
665                                         else {
666                                                 log_error("Failed to parse --volatile= argument: %s", optarg);
667                                                 return r;
668                                         }
669                                 } else
670                                         arg_volatile = r ? VOLATILE_YES : VOLATILE_NO;
671                         }
672
673                         break;
674
675                 case 'p': {
676                         const char *split, *e;
677                         uint16_t container_port, host_port;
678                         int protocol;
679                         ExposePort *p;
680
681                         if ((e = startswith(optarg, "tcp:")))
682                                 protocol = IPPROTO_TCP;
683                         else if ((e = startswith(optarg, "udp:")))
684                                 protocol = IPPROTO_UDP;
685                         else {
686                                 e = optarg;
687                                 protocol = IPPROTO_TCP;
688                         }
689
690                         split = strchr(e, ':');
691                         if (split) {
692                                 char v[split - e + 1];
693
694                                 memcpy(v, e, split - e);
695                                 v[split - e] = 0;
696
697                                 r = safe_atou16(v, &host_port);
698                                 if (r < 0 || host_port <= 0) {
699                                         log_error("Failed to parse host port: %s", optarg);
700                                         return -EINVAL;
701                                 }
702
703                                 r = safe_atou16(split + 1, &container_port);
704                         } else {
705                                 r = safe_atou16(e, &container_port);
706                                 host_port = container_port;
707                         }
708
709                         if (r < 0 || container_port <= 0) {
710                                 log_error("Failed to parse host port: %s", optarg);
711                                 return -EINVAL;
712                         }
713
714                         LIST_FOREACH(ports, p, arg_expose_ports) {
715                                 if (p->protocol == protocol && p->host_port == host_port) {
716                                         log_error("Duplicate port specification: %s", optarg);
717                                         return -EINVAL;
718                                 }
719                         }
720
721                         p = new(ExposePort, 1);
722                         if (!p)
723                                 return log_oom();
724
725                         p->protocol = protocol;
726                         p->host_port = host_port;
727                         p->container_port = container_port;
728
729                         LIST_PREPEND(ports, arg_expose_ports, p);
730
731                         break;
732                 }
733
734                 case '?':
735                         return -EINVAL;
736
737                 default:
738                         assert_not_reached("Unhandled option");
739                 }
740
741         if (arg_share_system)
742                 arg_register = false;
743
744         if (arg_boot && arg_share_system) {
745                 log_error("--boot and --share-system may not be combined.");
746                 return -EINVAL;
747         }
748
749         if (arg_keep_unit && cg_pid_get_owner_uid(0, NULL) >= 0) {
750                 log_error("--keep-unit may not be used when invoked from a user session.");
751                 return -EINVAL;
752         }
753
754         if (arg_directory && arg_image) {
755                 log_error("--directory= and --image= may not be combined.");
756                 return -EINVAL;
757         }
758
759         if (arg_template && arg_image) {
760                 log_error("--template= and --image= may not be combined.");
761                 return -EINVAL;
762         }
763
764         if (arg_template && !(arg_directory || arg_machine)) {
765                 log_error("--template= needs --directory= or --machine=.");
766                 return -EINVAL;
767         }
768
769         if (arg_ephemeral && arg_template) {
770                 log_error("--ephemeral and --template= may not be combined.");
771                 return -EINVAL;
772         }
773
774         if (arg_ephemeral && arg_image) {
775                 log_error("--ephemeral and --image= may not be combined.");
776                 return -EINVAL;
777         }
778
779         if (arg_ephemeral && !IN_SET(arg_link_journal, LINK_NO, LINK_AUTO)) {
780                 log_error("--ephemeral and --link-journal= may not be combined.");
781                 return -EINVAL;
782         }
783
784         if (arg_volatile != VOLATILE_NO && arg_read_only) {
785                 log_error("Cannot combine --read-only with --volatile. Note that --volatile already implies a read-only base hierarchy.");
786                 return -EINVAL;
787         }
788
789         if (arg_expose_ports && !arg_private_network) {
790                 log_error("Cannot use --port= without private networking.");
791                 return -EINVAL;
792         }
793
794         arg_retain = (arg_retain | plus | (arg_private_network ? 1ULL << CAP_NET_ADMIN : 0)) & ~minus;
795
796         return 1;
797 }
798
799 static int mount_all(const char *dest) {
800
801         typedef struct MountPoint {
802                 const char *what;
803                 const char *where;
804                 const char *type;
805                 const char *options;
806                 unsigned long flags;
807                 bool fatal;
808         } MountPoint;
809
810         static const MountPoint mount_table[] = {
811                 { "proc",      "/proc",     "proc",  NULL,        MS_NOSUID|MS_NOEXEC|MS_NODEV,           true  },
812                 { "/proc/sys", "/proc/sys", NULL,    NULL,        MS_BIND,                                true  },   /* Bind mount first */
813                 { NULL,        "/proc/sys", NULL,    NULL,        MS_BIND|MS_RDONLY|MS_REMOUNT,           true  },   /* Then, make it r/o */
814                 { "sysfs",     "/sys",      "sysfs", NULL,        MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true  },
815                 { "tmpfs",     "/dev",      "tmpfs", "mode=755",  MS_NOSUID|MS_STRICTATIME,               true  },
816                 { "devpts",    "/dev/pts",  "devpts","newinstance,ptmxmode=0666,mode=620,gid=" STRINGIFY(TTY_GID), MS_NOSUID|MS_NOEXEC, true },
817                 { "tmpfs",     "/dev/shm",  "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME,      true  },
818                 { "tmpfs",     "/run",      "tmpfs", "mode=755",  MS_NOSUID|MS_NODEV|MS_STRICTATIME,      true  },
819 #ifdef HAVE_SELINUX
820                 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND,                              false },  /* Bind mount first */
821                 { NULL,              "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT,         false },  /* Then, make it r/o */
822 #endif
823         };
824
825         unsigned k;
826         int r = 0;
827
828         for (k = 0; k < ELEMENTSOF(mount_table); k++) {
829                 _cleanup_free_ char *where = NULL;
830 #ifdef HAVE_SELINUX
831                 _cleanup_free_ char *options = NULL;
832 #endif
833                 const char *o;
834                 int t;
835
836                 where = strjoin(dest, "/", mount_table[k].where, NULL);
837                 if (!where)
838                         return log_oom();
839
840                 t = path_is_mount_point(where, true);
841                 if (t < 0) {
842                         log_error_errno(t, "Failed to detect whether %s is a mount point: %m", where);
843
844                         if (r == 0)
845                                 r = t;
846
847                         continue;
848                 }
849
850                 /* Skip this entry if it is not a remount. */
851                 if (mount_table[k].what && t > 0)
852                         continue;
853
854                 t = mkdir_p(where, 0755);
855                 if (t < 0) {
856                         if (mount_table[k].fatal) {
857                                log_error_errno(t, "Failed to create directory %s: %m", where);
858
859                                 if (r == 0)
860                                         r = t;
861                         } else
862                                log_warning_errno(t, "Failed to create directory %s: %m", where);
863
864                         continue;
865                 }
866
867 #ifdef HAVE_SELINUX
868                 if (arg_selinux_apifs_context &&
869                     (streq_ptr(mount_table[k].what, "tmpfs") || streq_ptr(mount_table[k].what, "devpts"))) {
870                         options = strjoin(mount_table[k].options, ",context=\"", arg_selinux_apifs_context, "\"", NULL);
871                         if (!options)
872                                 return log_oom();
873
874                         o = options;
875                 } else
876 #endif
877                         o = mount_table[k].options;
878
879
880                 if (mount(mount_table[k].what,
881                           where,
882                           mount_table[k].type,
883                           mount_table[k].flags,
884                           o) < 0) {
885
886                         if (mount_table[k].fatal) {
887                                 log_error_errno(errno, "mount(%s) failed: %m", where);
888
889                                 if (r == 0)
890                                         r = -errno;
891                         } else
892                                 log_warning_errno(errno, "mount(%s) failed: %m", where);
893                 }
894         }
895
896         return r;
897 }
898
899 static int mount_binds(const char *dest, char **l, bool ro) {
900         char **x, **y;
901
902         STRV_FOREACH_PAIR(x, y, l) {
903                 _cleanup_free_ char *where = NULL;
904                 struct stat source_st, dest_st;
905                 int r;
906
907                 if (stat(*x, &source_st) < 0)
908                         return log_error_errno(errno, "Failed to stat %s: %m", *x);
909
910                 where = strappend(dest, *y);
911                 if (!where)
912                         return log_oom();
913
914                 r = stat(where, &dest_st);
915                 if (r == 0) {
916                         if ((source_st.st_mode & S_IFMT) != (dest_st.st_mode & S_IFMT)) {
917                                 log_error("The file types of %s and %s do not match. Refusing bind mount", *x, where);
918                                 return -EINVAL;
919                         }
920                 } else if (errno == ENOENT) {
921                         r = mkdir_parents_label(where, 0755);
922                         if (r < 0)
923                                 return log_error_errno(r, "Failed to bind mount %s: %m", *x);
924                 } else {
925                         log_error_errno(errno, "Failed to bind mount %s: %m", *x);
926                         return -errno;
927                 }
928
929                 /* Create the mount point, but be conservative -- refuse to create block
930                  * and char devices. */
931                 if (S_ISDIR(source_st.st_mode)) {
932                         r = mkdir_label(where, 0755);
933                         if (r < 0 && errno != EEXIST)
934                                 return log_error_errno(r, "Failed to create mount point %s: %m", where);
935                 } else if (S_ISFIFO(source_st.st_mode)) {
936                         r = mkfifo(where, 0644);
937                         if (r < 0 && errno != EEXIST)
938                                 return log_error_errno(errno, "Failed to create mount point %s: %m", where);
939                 } else if (S_ISSOCK(source_st.st_mode)) {
940                         r = mknod(where, 0644 | S_IFSOCK, 0);
941                         if (r < 0 && errno != EEXIST)
942                                 return log_error_errno(errno, "Failed to create mount point %s: %m", where);
943                 } else if (S_ISREG(source_st.st_mode)) {
944                         r = touch(where);
945                         if (r < 0)
946                                 return log_error_errno(r, "Failed to create mount point %s: %m", where);
947                 } else {
948                         log_error("Refusing to create mountpoint for file: %s", *x);
949                         return -ENOTSUP;
950                 }
951
952                 if (mount(*x, where, "bind", MS_BIND, NULL) < 0)
953                         return log_error_errno(errno, "mount(%s) failed: %m", where);
954
955                 if (ro) {
956                         r = bind_remount_recursive(where, true);
957                         if (r < 0)
958                                 return log_error_errno(r, "Read-Only bind mount failed: %m");
959                 }
960         }
961
962         return 0;
963 }
964
/* Mounts the cgroup hierarchy selected by the mount option string "controller" to
 * <dest>/sys/fs/cgroup/<hierarchy>, read-only if requested. Returns 1 if a mount was
 * established, 0 if the target is a mount point already, negative on failure. */
static int mount_cgroup_hierarchy(const char *dest, const char *controller, const char *hierarchy, bool read_only) {
        unsigned long flags = MS_NOSUID|MS_NOEXEC|MS_NODEV;
        char *target;
        int q;

        target = strappenda(dest, "/sys/fs/cgroup/", hierarchy);

        q = path_is_mount_point(target, false);
        if (q < 0)
                return log_error_errno(q, "Failed to determine if %s is mounted already: %m", target);
        if (q > 0)
                return 0;

        /* Result deliberately not checked; the mount() below fails if the directory is missing. */
        mkdir_p(target, 0755);

        if (read_only)
                flags |= MS_RDONLY;

        if (mount("cgroup", target, "cgroup", flags, controller) < 0)
                return log_error_errno(errno, "Failed to mount to %s: %m", target);

        return 1;
}
984
985 static int mount_cgroup(const char *dest) {
986         _cleanup_set_free_free_ Set *controllers = NULL;
987         _cleanup_free_ char *own_cgroup_path = NULL;
988         const char *cgroup_root, *systemd_root, *systemd_own;
989         int r;
990
991         controllers = set_new(&string_hash_ops);
992         if (!controllers)
993                 return log_oom();
994
995         r = cg_kernel_controllers(controllers);
996         if (r < 0)
997                 return log_error_errno(r, "Failed to determine cgroup controllers: %m");
998
999         r = cg_pid_get_path(NULL, 0, &own_cgroup_path);
1000         if (r < 0)
1001                 return log_error_errno(r, "Failed to determine our own cgroup path: %m");
1002
1003         cgroup_root = strappenda(dest, "/sys/fs/cgroup");
1004         if (mount("tmpfs", cgroup_root, "tmpfs", MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME, "mode=755") < 0)
1005                 return log_error_errno(errno, "Failed to mount tmpfs to /sys/fs/cgroup: %m");
1006
1007         for (;;) {
1008                 _cleanup_free_ char *controller = NULL, *origin = NULL, *combined = NULL;
1009
1010                 controller = set_steal_first(controllers);
1011                 if (!controller)
1012                         break;
1013
1014                 origin = strappend("/sys/fs/cgroup/", controller);
1015                 if (!origin)
1016                         return log_oom();
1017
1018                 r = readlink_malloc(origin, &combined);
1019                 if (r == -EINVAL) {
1020                         /* Not a symbolic link, but directly a single cgroup hierarchy */
1021
1022                         r = mount_cgroup_hierarchy(dest, controller, controller, true);
1023                         if (r < 0)
1024                                 return r;
1025
1026                 } else if (r < 0)
1027                         return log_error_errno(r, "Failed to read link %s: %m", origin);
1028                 else {
1029                         _cleanup_free_ char *target = NULL;
1030
1031                         target = strjoin(dest, "/sys/fs/cgroup/", controller, NULL);
1032                         if (!target)
1033                                 return log_oom();
1034
1035                         /* A symbolic link, a combination of controllers in one hierarchy */
1036
1037                         if (!filename_is_valid(combined)) {
1038                                 log_warning("Ignoring invalid combined hierarchy %s.", combined);
1039                                 continue;
1040                         }
1041
1042                         r = mount_cgroup_hierarchy(dest, combined, combined, true);
1043                         if (r < 0)
1044                                 return r;
1045
1046                         if (symlink(combined, target) < 0)
1047                                 return log_error_errno(errno, "Failed to create symlink for combined hierarchy: %m");
1048                 }
1049         }
1050
1051         r = mount_cgroup_hierarchy(dest, "name=systemd", "systemd", false);
1052         if (r < 0)
1053                 return r;
1054
1055         /* Make our own cgroup a (writable) bind mount */
1056         systemd_own = strappenda(dest, "/sys/fs/cgroup/systemd", own_cgroup_path);
1057         if (mount(systemd_own, systemd_own,  NULL, MS_BIND, NULL) < 0)
1058                 return log_error_errno(errno, "Failed to turn %s into a bind mount: %m", own_cgroup_path);
1059
1060         /* And then remount the systemd cgroup root read-only */
1061         systemd_root = strappenda(dest, "/sys/fs/cgroup/systemd");
1062         if (mount(NULL, systemd_root, NULL, MS_BIND|MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_RDONLY, NULL) < 0)
1063                 return log_error_errno(errno, "Failed to mount cgroup root read-only: %m");
1064
1065         if (mount(NULL, cgroup_root, NULL, MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME|MS_RDONLY, "mode=755") < 0)
1066                 return log_error_errno(errno, "Failed to remount %s read-only: %m", cgroup_root);
1067
1068         return 0;
1069 }
1070
1071 static int mount_tmpfs(const char *dest) {
1072         char **i, **o;
1073
1074         STRV_FOREACH_PAIR(i, o, arg_tmpfs) {
1075                 _cleanup_free_ char *where = NULL;
1076                 int r;
1077
1078                 where = strappend(dest, *i);
1079                 if (!where)
1080                         return log_oom();
1081
1082                 r = mkdir_label(where, 0755);
1083                 if (r < 0 && r != -EEXIST)
1084                         return log_error_errno(r, "Creating mount point for tmpfs %s failed: %m", where);
1085
1086                 if (mount("tmpfs", where, "tmpfs", MS_NODEV|MS_STRICTATIME, *o) < 0)
1087                         return log_error_errno(errno, "tmpfs mount to %s failed: %m", where);
1088         }
1089
1090         return 0;
1091 }
1092
1093 static int setup_timezone(const char *dest) {
1094         _cleanup_free_ char *where = NULL, *p = NULL, *q = NULL, *check = NULL, *what = NULL;
1095         char *z, *y;
1096         int r;
1097
1098         assert(dest);
1099
1100         /* Fix the timezone, if possible */
1101         r = readlink_malloc("/etc/localtime", &p);
1102         if (r < 0) {
1103                 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
1104                 return 0;
1105         }
1106
1107         z = path_startswith(p, "../usr/share/zoneinfo/");
1108         if (!z)
1109                 z = path_startswith(p, "/usr/share/zoneinfo/");
1110         if (!z) {
1111                 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
1112                 return 0;
1113         }
1114
1115         where = strappend(dest, "/etc/localtime");
1116         if (!where)
1117                 return log_oom();
1118
1119         r = readlink_malloc(where, &q);
1120         if (r >= 0) {
1121                 y = path_startswith(q, "../usr/share/zoneinfo/");
1122                 if (!y)
1123                         y = path_startswith(q, "/usr/share/zoneinfo/");
1124
1125                 /* Already pointing to the right place? Then do nothing .. */
1126                 if (y && streq(y, z))
1127                         return 0;
1128         }
1129
1130         check = strjoin(dest, "/usr/share/zoneinfo/", z, NULL);
1131         if (!check)
1132                 return log_oom();
1133
1134         if (access(check, F_OK) < 0) {
1135                 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
1136                 return 0;
1137         }
1138
1139         what = strappend("../usr/share/zoneinfo/", z);
1140         if (!what)
1141                 return log_oom();
1142
1143         r = mkdir_parents(where, 0755);
1144         if (r < 0) {
1145                 log_error_errno(r, "Failed to create directory for timezone info %s in container: %m", where);
1146
1147                 return 0;
1148         }
1149
1150         r = unlink(where);
1151         if (r < 0 && errno != ENOENT) {
1152                 log_error_errno(errno, "Failed to remove existing timezone info %s in container: %m", where);
1153
1154                 return 0;
1155         }
1156
1157         if (symlink(what, where) < 0) {
1158                 log_error_errno(errno, "Failed to correct timezone of container: %m");
1159                 return 0;
1160         }
1161
1162         return 0;
1163 }
1164
1165 static int setup_resolv_conf(const char *dest) {
1166         _cleanup_free_ char *where = NULL;
1167         int r;
1168
1169         assert(dest);
1170
1171         if (arg_private_network)
1172                 return 0;
1173
1174         /* Fix resolv.conf, if possible */
1175         where = strappend(dest, "/etc/resolv.conf");
1176         if (!where)
1177                 return log_oom();
1178
1179         /* We don't really care for the results of this really. If it
1180          * fails, it fails, but meh... */
1181         r = mkdir_parents(where, 0755);
1182         if (r < 0) {
1183                 log_warning_errno(r, "Failed to create parent directory for resolv.conf %s: %m", where);
1184
1185                 return 0;
1186         }
1187
1188         r = copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW, 0644, 0);
1189         if (r < 0) {
1190                 log_warning_errno(r, "Failed to copy /etc/resolv.conf to %s: %m", where);
1191
1192                 return 0;
1193         }
1194
1195         return 0;
1196 }
1197
1198 static int setup_volatile_state(const char *directory) {
1199         const char *p;
1200         int r;
1201
1202         assert(directory);
1203
1204         if (arg_volatile != VOLATILE_STATE)
1205                 return 0;
1206
1207         /* --volatile=state means we simply overmount /var
1208            with a tmpfs, and the rest read-only. */
1209
1210         r = bind_remount_recursive(directory, true);
1211         if (r < 0)
1212                 return log_error_errno(r, "Failed to remount %s read-only: %m", directory);
1213
1214         p = strappenda(directory, "/var");
1215         r = mkdir(p, 0755);
1216         if (r < 0 && errno != EEXIST)
1217                 return log_error_errno(errno, "Failed to create %s: %m", directory);
1218
1219         if (mount("tmpfs", p, "tmpfs", MS_STRICTATIME, "mode=755") < 0)
1220                 return log_error_errno(errno, "Failed to mount tmpfs to /var: %m");
1221
1222         return 0;
1223 }
1224
1225 static int setup_volatile(const char *directory) {
1226         bool tmpfs_mounted = false, bind_mounted = false;
1227         char template[] = "/tmp/nspawn-volatile-XXXXXX";
1228         const char *f, *t;
1229         int r;
1230
1231         assert(directory);
1232
1233         if (arg_volatile != VOLATILE_YES)
1234                 return 0;
1235
1236         /* --volatile=yes means we mount a tmpfs to the root dir, and
1237            the original /usr to use inside it, and that read-only. */
1238
1239         if (!mkdtemp(template))
1240                 return log_error_errno(errno, "Failed to create temporary directory: %m");
1241
1242         if (mount("tmpfs", template, "tmpfs", MS_STRICTATIME, "mode=755") < 0) {
1243                 log_error_errno(errno, "Failed to mount tmpfs for root directory: %m");
1244                 r = -errno;
1245                 goto fail;
1246         }
1247
1248         tmpfs_mounted = true;
1249
1250         f = strappenda(directory, "/usr");
1251         t = strappenda(template, "/usr");
1252
1253         r = mkdir(t, 0755);
1254         if (r < 0 && errno != EEXIST) {
1255                 log_error_errno(errno, "Failed to create %s: %m", t);
1256                 r = -errno;
1257                 goto fail;
1258         }
1259
1260         if (mount(f, t, "bind", MS_BIND|MS_REC, NULL) < 0) {
1261                 log_error_errno(errno, "Failed to create /usr bind mount: %m");
1262                 r = -errno;
1263                 goto fail;
1264         }
1265
1266         bind_mounted = true;
1267
1268         r = bind_remount_recursive(t, true);
1269         if (r < 0) {
1270                 log_error_errno(r, "Failed to remount %s read-only: %m", t);
1271                 goto fail;
1272         }
1273
1274         if (mount(template, directory, NULL, MS_MOVE, NULL) < 0) {
1275                 log_error_errno(errno, "Failed to move root mount: %m");
1276                 r = -errno;
1277                 goto fail;
1278         }
1279
1280         rmdir(template);
1281
1282         return 0;
1283
1284 fail:
1285         if (bind_mounted)
1286                 umount(t);
1287         if (tmpfs_mounted)
1288                 umount(template);
1289         rmdir(template);
1290         return r;
1291 }
1292
1293 static char* id128_format_as_uuid(sd_id128_t id, char s[37]) {
1294
1295         snprintf(s, 37,
1296                  "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
1297                  SD_ID128_FORMAT_VAL(id));
1298
1299         return s;
1300 }
1301
1302 static int setup_boot_id(const char *dest) {
1303         _cleanup_free_ char *from = NULL, *to = NULL;
1304         sd_id128_t rnd = {};
1305         char as_uuid[37];
1306         int r;
1307
1308         assert(dest);
1309
1310         if (arg_share_system)
1311                 return 0;
1312
1313         /* Generate a new randomized boot ID, so that each boot-up of
1314          * the container gets a new one */
1315
1316         from = strappend(dest, "/dev/proc-sys-kernel-random-boot-id");
1317         to = strappend(dest, "/proc/sys/kernel/random/boot_id");
1318         if (!from || !to)
1319                 return log_oom();
1320
1321         r = sd_id128_randomize(&rnd);
1322         if (r < 0)
1323                 return log_error_errno(r, "Failed to generate random boot id: %m");
1324
1325         id128_format_as_uuid(rnd, as_uuid);
1326
1327         r = write_string_file(from, as_uuid);
1328         if (r < 0)
1329                 return log_error_errno(r, "Failed to write boot id: %m");
1330
1331         if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
1332                 log_error_errno(errno, "Failed to bind mount boot id: %m");
1333                 r = -errno;
1334         } else if (mount(from, to, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL))
1335                 log_warning_errno(errno, "Failed to make boot id read-only: %m");
1336
1337         unlink(from);
1338         return r;
1339 }
1340
1341 static int copy_devnodes(const char *dest) {
1342
1343         static const char devnodes[] =
1344                 "null\0"
1345                 "zero\0"
1346                 "full\0"
1347                 "random\0"
1348                 "urandom\0"
1349                 "tty\0"
1350                 "net/tun\0";
1351
1352         const char *d;
1353         int r = 0;
1354         _cleanup_umask_ mode_t u;
1355
1356         assert(dest);
1357
1358         u = umask(0000);
1359
1360         NULSTR_FOREACH(d, devnodes) {
1361                 _cleanup_free_ char *from = NULL, *to = NULL;
1362                 struct stat st;
1363
1364                 from = strappend("/dev/", d);
1365                 to = strjoin(dest, "/dev/", d, NULL);
1366                 if (!from || !to)
1367                         return log_oom();
1368
1369                 if (stat(from, &st) < 0) {
1370
1371                         if (errno != ENOENT)
1372                                 return log_error_errno(errno, "Failed to stat %s: %m", from);
1373
1374                 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
1375
1376                         log_error("%s is not a char or block device, cannot copy", from);
1377                         return -EIO;
1378
1379                 } else {
1380                         r = mkdir_parents(to, 0775);
1381                         if (r < 0) {
1382                                 log_error_errno(r, "Failed to create parent directory of %s: %m", to);
1383                                 return -r;
1384                         }
1385
1386                         if (mknod(to, st.st_mode, st.st_rdev) < 0)
1387                                 return log_error_errno(errno, "mknod(%s) failed: %m", to);
1388                 }
1389         }
1390
1391         return r;
1392 }
1393
1394 static int setup_ptmx(const char *dest) {
1395         _cleanup_free_ char *p = NULL;
1396
1397         p = strappend(dest, "/dev/ptmx");
1398         if (!p)
1399                 return log_oom();
1400
1401         if (symlink("pts/ptmx", p) < 0)
1402                 return log_error_errno(errno, "Failed to create /dev/ptmx symlink: %m");
1403
1404         return 0;
1405 }
1406
1407 static int setup_dev_console(const char *dest, const char *console) {
1408         _cleanup_umask_ mode_t u;
1409         const char *to;
1410         struct stat st;
1411         int r;
1412
1413         assert(dest);
1414         assert(console);
1415
1416         u = umask(0000);
1417
1418         if (stat("/dev/null", &st) < 0)
1419                 return log_error_errno(errno, "Failed to stat /dev/null: %m");
1420
1421         r = chmod_and_chown(console, 0600, 0, 0);
1422         if (r < 0)
1423                 return log_error_errno(r, "Failed to correct access mode for TTY: %m");
1424
1425         /* We need to bind mount the right tty to /dev/console since
1426          * ptys can only exist on pts file systems. To have something
1427          * to bind mount things on we create a device node first, and
1428          * use /dev/null for that since we the cgroups device policy
1429          * allows us to create that freely, while we cannot create
1430          * /dev/console. (Note that the major minor doesn't actually
1431          * matter here, since we mount it over anyway). */
1432
1433         to = strappenda(dest, "/dev/console");
1434         if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0)
1435                 return log_error_errno(errno, "mknod() for /dev/console failed: %m");
1436
1437         if (mount(console, to, "bind", MS_BIND, NULL) < 0)
1438                 return log_error_errno(errno, "Bind mount for /dev/console failed: %m");
1439
1440         return 0;
1441 }
1442
1443 static int setup_kmsg(const char *dest, int kmsg_socket) {
1444         _cleanup_free_ char *from = NULL, *to = NULL;
1445         _cleanup_umask_ mode_t u;
1446         int r, fd, k;
1447         union {
1448                 struct cmsghdr cmsghdr;
1449                 uint8_t buf[CMSG_SPACE(sizeof(int))];
1450         } control = {};
1451         struct msghdr mh = {
1452                 .msg_control = &control,
1453                 .msg_controllen = sizeof(control),
1454         };
1455         struct cmsghdr *cmsg;
1456
1457         assert(dest);
1458         assert(kmsg_socket >= 0);
1459
1460         u = umask(0000);
1461
1462         /* We create the kmsg FIFO as /dev/kmsg, but immediately
1463          * delete it after bind mounting it to /proc/kmsg. While FIFOs
1464          * on the reading side behave very similar to /proc/kmsg,
1465          * their writing side behaves differently from /dev/kmsg in
1466          * that writing blocks when nothing is reading. In order to
1467          * avoid any problems with containers deadlocking due to this
1468          * we simply make /dev/kmsg unavailable to the container. */
1469         if (asprintf(&from, "%s/dev/kmsg", dest) < 0 ||
1470             asprintf(&to, "%s/proc/kmsg", dest) < 0)
1471                 return log_oom();
1472
1473         if (mkfifo(from, 0600) < 0)
1474                 return log_error_errno(errno, "mkfifo() for /dev/kmsg failed: %m");
1475
1476         r = chmod_and_chown(from, 0600, 0, 0);
1477         if (r < 0)
1478                 return log_error_errno(r, "Failed to correct access mode for /dev/kmsg: %m");
1479
1480         if (mount(from, to, "bind", MS_BIND, NULL) < 0)
1481                 return log_error_errno(errno, "Bind mount for /proc/kmsg failed: %m");
1482
1483         fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
1484         if (fd < 0)
1485                 return log_error_errno(errno, "Failed to open fifo: %m");
1486
1487         cmsg = CMSG_FIRSTHDR(&mh);
1488         cmsg->cmsg_level = SOL_SOCKET;
1489         cmsg->cmsg_type = SCM_RIGHTS;
1490         cmsg->cmsg_len = CMSG_LEN(sizeof(int));
1491         memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
1492
1493         mh.msg_controllen = cmsg->cmsg_len;
1494
1495         /* Store away the fd in the socket, so that it stays open as
1496          * long as we run the child */
1497         k = sendmsg(kmsg_socket, &mh, MSG_NOSIGNAL);
1498         safe_close(fd);
1499
1500         if (k < 0)
1501                 return log_error_errno(errno, "Failed to send FIFO fd: %m");
1502
1503         /* And now make the FIFO unavailable as /dev/kmsg... */
1504         unlink(from);
1505         return 0;
1506 }
1507
1508 static int send_rtnl(int send_fd) {
1509         union {
1510                 struct cmsghdr cmsghdr;
1511                 uint8_t buf[CMSG_SPACE(sizeof(int))];
1512         } control = {};
1513         struct msghdr mh = {
1514                 .msg_control = &control,
1515                 .msg_controllen = sizeof(control),
1516         };
1517         struct cmsghdr *cmsg;
1518         _cleanup_close_ int fd = -1;
1519         ssize_t k;
1520
1521         assert(send_fd >= 0);
1522
1523         if (!arg_expose_ports)
1524                 return 0;
1525
1526         fd = socket(PF_NETLINK, SOCK_RAW|SOCK_CLOEXEC|SOCK_NONBLOCK, NETLINK_ROUTE);
1527         if (fd < 0)
1528                 return log_error_errno(errno, "failed to allocate container netlink: %m");
1529
1530         cmsg = CMSG_FIRSTHDR(&mh);
1531         cmsg->cmsg_level = SOL_SOCKET;
1532         cmsg->cmsg_type = SCM_RIGHTS;
1533         cmsg->cmsg_len = CMSG_LEN(sizeof(int));
1534         memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
1535
1536         mh.msg_controllen = cmsg->cmsg_len;
1537
1538         /* Store away the fd in the socket, so that it stays open as
1539          * long as we run the child */
1540         k = sendmsg(send_fd, &mh, MSG_NOSIGNAL);
1541         if (k < 0)
1542                 return log_error_errno(errno, "Failed to send netlink fd: %m");
1543
1544         return 0;
1545 }
1546
1547 static int flush_ports(union in_addr_union *exposed) {
1548         ExposePort *p;
1549         int r, af = AF_INET;
1550
1551         assert(exposed);
1552
1553         if (!arg_expose_ports)
1554                 return 0;
1555
1556         if (in_addr_is_null(af, exposed))
1557                 return 0;
1558
1559         log_debug("Lost IP address.");
1560
1561         LIST_FOREACH(ports, p, arg_expose_ports) {
1562                 r = fw_add_local_dnat(false,
1563                                       af,
1564                                       p->protocol,
1565                                       NULL,
1566                                       NULL, 0,
1567                                       NULL, 0,
1568                                       p->host_port,
1569                                       exposed,
1570                                       p->container_port,
1571                                       NULL);
1572                 if (r < 0)
1573                         log_warning_errno(r, "Failed to modify firewall: %m");
1574         }
1575
1576         *exposed = IN_ADDR_NULL;
1577         return 0;
1578 }
1579
1580 static int expose_ports(sd_rtnl *rtnl, union in_addr_union *exposed) {
1581         _cleanup_free_ struct local_address *addresses = NULL;
1582         _cleanup_free_ char *pretty = NULL;
1583         union in_addr_union new_exposed;
1584         ExposePort *p;
1585         bool add;
1586         int af = AF_INET, r;
1587
1588         assert(exposed);
1589
1590         /* Invoked each time an address is added or removed inside the
1591          * container */
1592
1593         if (!arg_expose_ports)
1594                 return 0;
1595
1596         r = local_addresses(rtnl, 0, af, &addresses);
1597         if (r < 0)
1598                 return log_error_errno(r, "Failed to enumerate local addresses: %m");
1599
1600         add = r > 0 &&
1601                 addresses[0].family == af &&
1602                 addresses[0].scope < RT_SCOPE_LINK;
1603
1604         if (!add)
1605                 return flush_ports(exposed);
1606
1607         new_exposed = addresses[0].address;
1608         if (in_addr_equal(af, exposed, &new_exposed))
1609                 return 0;
1610
1611         in_addr_to_string(af, &new_exposed, &pretty);
1612         log_debug("New container IP is %s.", strna(pretty));
1613
1614         LIST_FOREACH(ports, p, arg_expose_ports) {
1615
1616                 r = fw_add_local_dnat(true,
1617                                       af,
1618                                       p->protocol,
1619                                       NULL,
1620                                       NULL, 0,
1621                                       NULL, 0,
1622                                       p->host_port,
1623                                       &new_exposed,
1624                                       p->container_port,
1625                                       in_addr_is_null(af, exposed) ? NULL : exposed);
1626                 if (r < 0)
1627                         log_warning_errno(r, "Failed to modify firewall: %m");
1628         }
1629
1630         *exposed = new_exposed;
1631         return 0;
1632 }
1633
1634 static int on_address_change(sd_rtnl *rtnl, sd_rtnl_message *m, void *userdata) {
1635         union in_addr_union *exposed = userdata;
1636
1637         assert(rtnl);
1638         assert(m);
1639         assert(exposed);
1640
1641         expose_ports(rtnl, exposed);
1642         return 0;
1643 }
1644
1645 static int watch_rtnl(sd_event *event, int recv_fd, union in_addr_union *exposed, sd_rtnl **ret) {
1646         union {
1647                 struct cmsghdr cmsghdr;
1648                 uint8_t buf[CMSG_SPACE(sizeof(int))];
1649         } control = {};
1650         struct msghdr mh = {
1651                 .msg_control = &control,
1652                 .msg_controllen = sizeof(control),
1653         };
1654         struct cmsghdr *cmsg;
1655         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1656         int fd, r;
1657         ssize_t k;
1658
1659         assert(event);
1660         assert(recv_fd >= 0);
1661         assert(ret);
1662
1663         if (!arg_expose_ports)
1664                 return 0;
1665
1666         k = recvmsg(recv_fd, &mh, MSG_NOSIGNAL);
1667         if (k < 0)
1668                 return log_error_errno(errno, "Failed to recv netlink fd: %m");
1669
1670         cmsg = CMSG_FIRSTHDR(&mh);
1671         assert(cmsg->cmsg_level == SOL_SOCKET);
1672         assert(cmsg->cmsg_type == SCM_RIGHTS);
1673         assert(cmsg->cmsg_len == CMSG_LEN(sizeof(int)));
1674         memcpy(&fd, CMSG_DATA(cmsg), sizeof(int));
1675
1676         r = sd_rtnl_open_fd(&rtnl, fd, 1, RTNLGRP_IPV4_IFADDR);
1677         if (r < 0) {
1678                 safe_close(fd);
1679                 return log_error_errno(r, "Failed to create rtnl object: %m");
1680         }
1681
1682         r = sd_rtnl_add_match(rtnl, RTM_NEWADDR, on_address_change, exposed);
1683         if (r < 0)
1684                 return log_error_errno(r, "Failed to subscribe to RTM_NEWADDR messages: %m");
1685
1686         r = sd_rtnl_add_match(rtnl, RTM_DELADDR, on_address_change, exposed);
1687         if (r < 0)
1688                 return log_error_errno(r, "Failed to subscribe to RTM_DELADDR messages: %m");
1689
1690         r = sd_rtnl_attach_event(rtnl, event, 0);
1691         if (r < 0)
1692                 return log_error_errno(r, "Failed to add to even loop: %m");
1693
1694         *ret = rtnl;
1695         rtnl = NULL;
1696
1697         return 0;
1698 }
1699
1700 static int setup_hostname(void) {
1701
1702         if (arg_share_system)
1703                 return 0;
1704
1705         if (sethostname_idempotent(arg_machine) < 0)
1706                 return -errno;
1707
1708         return 0;
1709 }
1710
1711 static int setup_journal(const char *directory) {
1712         sd_id128_t machine_id, this_id;
1713         _cleanup_free_ char *p = NULL, *b = NULL, *q = NULL, *d = NULL;
1714         char *id;
1715         int r;
1716
1717         /* Don't link journals in ephemeral mode */
1718         if (arg_ephemeral)
1719                 return 0;
1720
1721         p = strappend(directory, "/etc/machine-id");
1722         if (!p)
1723                 return log_oom();
1724
1725         r = read_one_line_file(p, &b);
1726         if (r == -ENOENT && arg_link_journal == LINK_AUTO)
1727                 return 0;
1728         else if (r < 0)
1729                 return log_error_errno(r, "Failed to read machine ID from %s: %m", p);
1730
1731         id = strstrip(b);
1732         if (isempty(id) && arg_link_journal == LINK_AUTO)
1733                 return 0;
1734
1735         /* Verify validity */
1736         r = sd_id128_from_string(id, &machine_id);
1737         if (r < 0)
1738                 return log_error_errno(r, "Failed to parse machine ID from %s: %m", p);
1739
1740         r = sd_id128_get_machine(&this_id);
1741         if (r < 0)
1742                 return log_error_errno(r, "Failed to retrieve machine ID: %m");
1743
1744         if (sd_id128_equal(machine_id, this_id)) {
1745                 log_full(arg_link_journal == LINK_AUTO ? LOG_WARNING : LOG_ERR,
1746                          "Host and machine ids are equal (%s): refusing to link journals", id);
1747                 if (arg_link_journal == LINK_AUTO)
1748                         return 0;
1749                 return -EEXIST;
1750         }
1751
1752         if (arg_link_journal == LINK_NO)
1753                 return 0;
1754
1755         free(p);
1756         p = strappend("/var/log/journal/", id);
1757         q = strjoin(directory, "/var/log/journal/", id, NULL);
1758         if (!p || !q)
1759                 return log_oom();
1760
1761         if (path_is_mount_point(p, false) > 0) {
1762                 if (arg_link_journal != LINK_AUTO) {
1763                         log_error("%s: already a mount point, refusing to use for journal", p);
1764                         return -EEXIST;
1765                 }
1766
1767                 return 0;
1768         }
1769
1770         if (path_is_mount_point(q, false) > 0) {
1771                 if (arg_link_journal != LINK_AUTO) {
1772                         log_error("%s: already a mount point, refusing to use for journal", q);
1773                         return -EEXIST;
1774                 }
1775
1776                 return 0;
1777         }
1778
1779         r = readlink_and_make_absolute(p, &d);
1780         if (r >= 0) {
1781                 if ((arg_link_journal == LINK_GUEST ||
1782                      arg_link_journal == LINK_AUTO) &&
1783                     path_equal(d, q)) {
1784
1785                         r = mkdir_p(q, 0755);
1786                         if (r < 0)
1787                                 log_warning_errno(errno, "Failed to create directory %s: %m", q);
1788                         return 0;
1789                 }
1790
1791                 if (unlink(p) < 0)
1792                         return log_error_errno(errno, "Failed to remove symlink %s: %m", p);
1793         } else if (r == -EINVAL) {
1794
1795                 if (arg_link_journal == LINK_GUEST &&
1796                     rmdir(p) < 0) {
1797
1798                         if (errno == ENOTDIR) {
1799                                 log_error("%s already exists and is neither a symlink nor a directory", p);
1800                                 return r;
1801                         } else {
1802                                 log_error_errno(errno, "Failed to remove %s: %m", p);
1803                                 return -errno;
1804                         }
1805                 }
1806         } else if (r != -ENOENT) {
1807                 log_error_errno(errno, "readlink(%s) failed: %m", p);
1808                 return r;
1809         }
1810
1811         if (arg_link_journal == LINK_GUEST) {
1812
1813                 if (symlink(q, p) < 0) {
1814                         if (arg_link_journal_try) {
1815                                 log_debug_errno(errno, "Failed to symlink %s to %s, skipping journal setup: %m", q, p);
1816                                 return 0;
1817                         } else {
1818                                 log_error_errno(errno, "Failed to symlink %s to %s: %m", q, p);
1819                                 return -errno;
1820                         }
1821                 }
1822
1823                 r = mkdir_p(q, 0755);
1824                 if (r < 0)
1825                         log_warning_errno(errno, "Failed to create directory %s: %m", q);
1826                 return 0;
1827         }
1828
1829         if (arg_link_journal == LINK_HOST) {
1830                 /* don't create parents here -- if the host doesn't have
1831                  * permanent journal set up, don't force it here */
1832                 r = mkdir(p, 0755);
1833                 if (r < 0) {
1834                         if (arg_link_journal_try) {
1835                                 log_debug_errno(errno, "Failed to create %s, skipping journal setup: %m", p);
1836                                 return 0;
1837                         } else {
1838                                 log_error_errno(errno, "Failed to create %s: %m", p);
1839                                 return r;
1840                         }
1841                 }
1842
1843         } else if (access(p, F_OK) < 0)
1844                 return 0;
1845
1846         if (dir_is_empty(q) == 0)
1847                 log_warning("%s is not empty, proceeding anyway.", q);
1848
1849         r = mkdir_p(q, 0755);
1850         if (r < 0) {
1851                 log_error_errno(errno, "Failed to create %s: %m", q);
1852                 return r;
1853         }
1854
1855         if (mount(p, q, "bind", MS_BIND, NULL) < 0)
1856                 return log_error_errno(errno, "Failed to bind mount journal from host into guest: %m");
1857
1858         return 0;
1859 }
1860
1861 static int drop_capabilities(void) {
1862         return capability_bounding_set_drop(~arg_retain, false);
1863 }
1864
1865 static int register_machine(pid_t pid, int local_ifindex) {
1866         _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1867         _cleanup_bus_close_unref_ sd_bus *bus = NULL;
1868         int r;
1869
1870         if (!arg_register)
1871                 return 0;
1872
1873         r = sd_bus_default_system(&bus);
1874         if (r < 0)
1875                 return log_error_errno(r, "Failed to open system bus: %m");
1876
1877         if (arg_keep_unit) {
1878                 r = sd_bus_call_method(
1879                                 bus,
1880                                 "org.freedesktop.machine1",
1881                                 "/org/freedesktop/machine1",
1882                                 "org.freedesktop.machine1.Manager",
1883                                 "RegisterMachineWithNetwork",
1884                                 &error,
1885                                 NULL,
1886                                 "sayssusai",
1887                                 arg_machine,
1888                                 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1889                                 "nspawn",
1890                                 "container",
1891                                 (uint32_t) pid,
1892                                 strempty(arg_directory),
1893                                 local_ifindex > 0 ? 1 : 0, local_ifindex);
1894         } else {
1895                 _cleanup_bus_message_unref_ sd_bus_message *m = NULL;
1896
1897                 r = sd_bus_message_new_method_call(
1898                                 bus,
1899                                 &m,
1900                                 "org.freedesktop.machine1",
1901                                 "/org/freedesktop/machine1",
1902                                 "org.freedesktop.machine1.Manager",
1903                                 "CreateMachineWithNetwork");
1904                 if (r < 0)
1905                         return log_error_errno(r, "Failed to create message: %m");
1906
1907                 r = sd_bus_message_append(
1908                                 m,
1909                                 "sayssusai",
1910                                 arg_machine,
1911                                 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1912                                 "nspawn",
1913                                 "container",
1914                                 (uint32_t) pid,
1915                                 strempty(arg_directory),
1916                                 local_ifindex > 0 ? 1 : 0, local_ifindex);
1917                 if (r < 0)
1918                         return log_error_errno(r, "Failed to append message arguments: %m");
1919
1920                 r = sd_bus_message_open_container(m, 'a', "(sv)");
1921                 if (r < 0)
1922                         return log_error_errno(r, "Failed to open container: %m");
1923
1924                 if (!isempty(arg_slice)) {
1925                         r = sd_bus_message_append(m, "(sv)", "Slice", "s", arg_slice);
1926                         if (r < 0)
1927                                 return log_error_errno(r, "Failed to append slice: %m");
1928                 }
1929
1930                 r = sd_bus_message_append(m, "(sv)", "DevicePolicy", "s", "strict");
1931                 if (r < 0)
1932                         return log_error_errno(r, "Failed to add device policy: %m");
1933
1934                 r = sd_bus_message_append(m, "(sv)", "DeviceAllow", "a(ss)", 9,
1935                                           /* Allow the container to
1936                                            * access and create the API
1937                                            * device nodes, so that
1938                                            * PrivateDevices= in the
1939                                            * container can work
1940                                            * fine */
1941                                           "/dev/null", "rwm",
1942                                           "/dev/zero", "rwm",
1943                                           "/dev/full", "rwm",
1944                                           "/dev/random", "rwm",
1945                                           "/dev/urandom", "rwm",
1946                                           "/dev/tty", "rwm",
1947                                           "/dev/net/tun", "rwm",
1948                                           /* Allow the container
1949                                            * access to ptys. However,
1950                                            * do not permit the
1951                                            * container to ever create
1952                                            * these device nodes. */
1953                                           "/dev/pts/ptmx", "rw",
1954                                           "char-pts", "rw");
1955                 if (r < 0)
1956                         return log_error_errno(r, "Failed to add device whitelist: %m");
1957
1958                 r = sd_bus_message_close_container(m);
1959                 if (r < 0)
1960                         return log_error_errno(r, "Failed to close container: %m");
1961
1962                 r = sd_bus_call(bus, m, 0, &error, NULL);
1963         }
1964
1965         if (r < 0) {
1966                 log_error("Failed to register machine: %s", bus_error_message(&error, r));
1967                 return r;
1968         }
1969
1970         return 0;
1971 }
1972
1973 static int terminate_machine(pid_t pid) {
1974         _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1975         _cleanup_bus_message_unref_ sd_bus_message *reply = NULL;
1976         _cleanup_bus_close_unref_ sd_bus *bus = NULL;
1977         const char *path;
1978         int r;
1979
1980         if (!arg_register)
1981                 return 0;
1982
1983         r = sd_bus_default_system(&bus);
1984         if (r < 0)
1985                 return log_error_errno(r, "Failed to open system bus: %m");
1986
1987         r = sd_bus_call_method(
1988                         bus,
1989                         "org.freedesktop.machine1",
1990                         "/org/freedesktop/machine1",
1991                         "org.freedesktop.machine1.Manager",
1992                         "GetMachineByPID",
1993                         &error,
1994                         &reply,
1995                         "u",
1996                         (uint32_t) pid);
1997         if (r < 0) {
1998                 /* Note that the machine might already have been
1999                  * cleaned up automatically, hence don't consider it a
2000                  * failure if we cannot get the machine object. */
2001                 log_debug("Failed to get machine: %s", bus_error_message(&error, r));
2002                 return 0;
2003         }
2004
2005         r = sd_bus_message_read(reply, "o", &path);
2006         if (r < 0)
2007                 return bus_log_parse_error(r);
2008
2009         r = sd_bus_call_method(
2010                         bus,
2011                         "org.freedesktop.machine1",
2012                         path,
2013                         "org.freedesktop.machine1.Machine",
2014                         "Terminate",
2015                         &error,
2016                         NULL,
2017                         NULL);
2018         if (r < 0) {
2019                 log_debug("Failed to terminate machine: %s", bus_error_message(&error, r));
2020                 return 0;
2021         }
2022
2023         return 0;
2024 }
2025
2026 static int reset_audit_loginuid(void) {
2027         _cleanup_free_ char *p = NULL;
2028         int r;
2029
2030         if (arg_share_system)
2031                 return 0;
2032
2033         r = read_one_line_file("/proc/self/loginuid", &p);
2034         if (r == -ENOENT)
2035                 return 0;
2036         if (r < 0)
2037                 return log_error_errno(r, "Failed to read /proc/self/loginuid: %m");
2038
2039         /* Already reset? */
2040         if (streq(p, "4294967295"))
2041                 return 0;
2042
2043         r = write_string_file("/proc/self/loginuid", "4294967295");
2044         if (r < 0) {
2045                 log_error("Failed to reset audit login UID. This probably means that your kernel is too\n"
2046                           "old and you have audit enabled. Note that the auditing subsystem is known to\n"
2047                           "be incompatible with containers on old kernels. Please make sure to upgrade\n"
2048                           "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
2049                           "using systemd-nspawn. Sleeping for 5s... (%s)\n", strerror(-r));
2050
2051                 sleep(5);
2052         }
2053
2054         return 0;
2055 }
2056
/* Fixed 128-bit keys passed as hash_key to generate_mac(), with a distinct
 * key for each kind of interface a MAC address is generated for: the host
 * side of a veth link, the container side of a veth link, and MACVLAN
 * interfaces. */
#define HOST_HASH_KEY SD_ID128_MAKE(1a,37,6f,c7,46,ec,45,0b,ad,a3,d5,31,06,60,5d,b1)
#define CONTAINER_HASH_KEY SD_ID128_MAKE(c3,c4,f9,19,b5,57,b2,1c,e6,cf,14,27,03,9c,ee,a2)
#define MACVLAN_HASH_KEY SD_ID128_MAKE(00,13,6d,bc,66,83,44,81,bb,0c,f9,51,1f,24,a6,6f)
2060
2061 static int generate_mac(struct ether_addr *mac, sd_id128_t hash_key, uint64_t idx) {
2062         uint8_t result[8];
2063         size_t l, sz;
2064         uint8_t *v, *i;
2065         int r;
2066
2067         l = strlen(arg_machine);
2068         sz = sizeof(sd_id128_t) + l;
2069         if (idx > 0)
2070                 sz += sizeof(idx);
2071
2072         v = alloca(sz);
2073
2074         /* fetch some persistent data unique to the host */
2075         r = sd_id128_get_machine((sd_id128_t*) v);
2076         if (r < 0)
2077                 return r;
2078
2079         /* combine with some data unique (on this host) to this
2080          * container instance */
2081         i = mempcpy(v + sizeof(sd_id128_t), arg_machine, l);
2082         if (idx > 0) {
2083                 idx = htole64(idx);
2084                 memcpy(i, &idx, sizeof(idx));
2085         }
2086
2087         /* Let's hash the host machine ID plus the container name. We
2088          * use a fixed, but originally randomly created hash key here. */
2089         siphash24(result, v, sz, hash_key.bytes);
2090
2091         assert_cc(ETH_ALEN <= sizeof(result));
2092         memcpy(mac->ether_addr_octet, result, ETH_ALEN);
2093
2094         /* see eth_random_addr in the kernel */
2095         mac->ether_addr_octet[0] &= 0xfe;        /* clear multicast bit */
2096         mac->ether_addr_octet[0] |= 0x02;        /* set local assignment bit (IEEE802) */
2097
2098         return 0;
2099 }
2100
2101 static int setup_veth(pid_t pid, char iface_name[IFNAMSIZ], int *ifi) {
2102         _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
2103         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
2104         struct ether_addr mac_host, mac_container;
2105         int r, i;
2106
2107         if (!arg_private_network)
2108                 return 0;
2109
2110         if (!arg_network_veth)
2111                 return 0;
2112
2113         /* Use two different interface name prefixes depending whether
2114          * we are in bridge mode or not. */
2115         snprintf(iface_name, IFNAMSIZ - 1, "%s-%s",
2116                  arg_network_bridge ? "vb" : "ve", arg_machine);
2117
2118         r = generate_mac(&mac_container, CONTAINER_HASH_KEY, 0);
2119         if (r < 0)
2120                 return log_error_errno(r, "Failed to generate predictable MAC address for container side: %m");
2121
2122         r = generate_mac(&mac_host, HOST_HASH_KEY, 0);
2123         if (r < 0)
2124                 return log_error_errno(r, "Failed to generate predictable MAC address for host side: %m");
2125
2126         r = sd_rtnl_open(&rtnl, 0);
2127         if (r < 0)
2128                 return log_error_errno(r, "Failed to connect to netlink: %m");
2129
2130         r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
2131         if (r < 0)
2132                 return log_error_errno(r, "Failed to allocate netlink message: %m");
2133
2134         r = sd_rtnl_message_append_string(m, IFLA_IFNAME, iface_name);
2135         if (r < 0)
2136                 return log_error_errno(r, "Failed to add netlink interface name: %m");
2137
2138         r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac_host);
2139         if (r < 0)
2140                 return log_error_errno(r, "Failed to add netlink MAC address: %m");
2141
2142         r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
2143         if (r < 0)
2144                 return log_error_errno(r, "Failed to open netlink container: %m");
2145
2146         r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "veth");
2147         if (r < 0)
2148                 return log_error_errno(r, "Failed to open netlink container: %m");
2149
2150         r = sd_rtnl_message_open_container(m, VETH_INFO_PEER);
2151         if (r < 0)
2152                 return log_error_errno(r, "Failed to open netlink container: %m");
2153
2154         r = sd_rtnl_message_append_string(m, IFLA_IFNAME, "host0");
2155         if (r < 0)
2156                 return log_error_errno(r, "Failed to add netlink interface name: %m");
2157
2158         r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac_container);
2159         if (r < 0)
2160                 return log_error_errno(r, "Failed to add netlink MAC address: %m");
2161
2162         r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
2163         if (r < 0)
2164                 return log_error_errno(r, "Failed to add netlink namespace field: %m");
2165
2166         r = sd_rtnl_message_close_container(m);
2167         if (r < 0)
2168                 return log_error_errno(r, "Failed to close netlink container: %m");
2169
2170         r = sd_rtnl_message_close_container(m);
2171         if (r < 0)
2172                 return log_error_errno(r, "Failed to close netlink container: %m");
2173
2174         r = sd_rtnl_message_close_container(m);
2175         if (r < 0)
2176                 return log_error_errno(r, "Failed to close netlink container: %m");
2177
2178         r = sd_rtnl_call(rtnl, m, 0, NULL);
2179         if (r < 0)
2180                 return log_error_errno(r, "Failed to add new veth interfaces: %m");
2181
2182         i = (int) if_nametoindex(iface_name);
2183         if (i <= 0)
2184                 return log_error_errno(errno, "Failed to resolve interface %s: %m", iface_name);
2185
2186         *ifi = i;
2187
2188         return 0;
2189 }
2190
2191 static int setup_bridge(const char veth_name[], int *ifi) {
2192         _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
2193         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
2194         int r, bridge;
2195
2196         if (!arg_private_network)
2197                 return 0;
2198
2199         if (!arg_network_veth)
2200                 return 0;
2201
2202         if (!arg_network_bridge)
2203                 return 0;
2204
2205         bridge = (int) if_nametoindex(arg_network_bridge);
2206         if (bridge <= 0)
2207                 return log_error_errno(errno, "Failed to resolve interface %s: %m", arg_network_bridge);
2208
2209         *ifi = bridge;
2210
2211         r = sd_rtnl_open(&rtnl, 0);
2212         if (r < 0)
2213                 return log_error_errno(r, "Failed to connect to netlink: %m");
2214
2215         r = sd_rtnl_message_new_link(rtnl, &m, RTM_SETLINK, 0);
2216         if (r < 0)
2217                 return log_error_errno(r, "Failed to allocate netlink message: %m");
2218
2219         r = sd_rtnl_message_link_set_flags(m, IFF_UP, IFF_UP);
2220         if (r < 0)
2221                 return log_error_errno(r, "Failed to set IFF_UP flag: %m");
2222
2223         r = sd_rtnl_message_append_string(m, IFLA_IFNAME, veth_name);
2224         if (r < 0)
2225                 return log_error_errno(r, "Failed to add netlink interface name field: %m");
2226
2227         r = sd_rtnl_message_append_u32(m, IFLA_MASTER, bridge);
2228         if (r < 0)
2229                 return log_error_errno(r, "Failed to add netlink master field: %m");
2230
2231         r = sd_rtnl_call(rtnl, m, 0, NULL);
2232         if (r < 0)
2233                 return log_error_errno(r, "Failed to add veth interface to bridge: %m");
2234
2235         return 0;
2236 }
2237
2238 static int parse_interface(struct udev *udev, const char *name) {
2239         _cleanup_udev_device_unref_ struct udev_device *d = NULL;
2240         char ifi_str[2 + DECIMAL_STR_MAX(int)];
2241         int ifi;
2242
2243         ifi = (int) if_nametoindex(name);
2244         if (ifi <= 0)
2245                 return log_error_errno(errno, "Failed to resolve interface %s: %m", name);
2246
2247         sprintf(ifi_str, "n%i", ifi);
2248         d = udev_device_new_from_device_id(udev, ifi_str);
2249         if (!d)
2250                 return log_error_errno(errno, "Failed to get udev device for interface %s: %m", name);
2251
2252         if (udev_device_get_is_initialized(d) <= 0) {
2253                 log_error("Network interface %s is not initialized yet.", name);
2254                 return -EBUSY;
2255         }
2256
2257         return ifi;
2258 }
2259
2260 static int move_network_interfaces(pid_t pid) {
2261         _cleanup_udev_unref_ struct udev *udev = NULL;
2262         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
2263         char **i;
2264         int r;
2265
2266         if (!arg_private_network)
2267                 return 0;
2268
2269         if (strv_isempty(arg_network_interfaces))
2270                 return 0;
2271
2272         r = sd_rtnl_open(&rtnl, 0);
2273         if (r < 0)
2274                 return log_error_errno(r, "Failed to connect to netlink: %m");
2275
2276         udev = udev_new();
2277         if (!udev) {
2278                 log_error("Failed to connect to udev.");
2279                 return -ENOMEM;
2280         }
2281
2282         STRV_FOREACH(i, arg_network_interfaces) {
2283                 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
2284                 int ifi;
2285
2286                 ifi = parse_interface(udev, *i);
2287                 if (ifi < 0)
2288                         return ifi;
2289
2290                 r = sd_rtnl_message_new_link(rtnl, &m, RTM_SETLINK, ifi);
2291                 if (r < 0)
2292                         return log_error_errno(r, "Failed to allocate netlink message: %m");
2293
2294                 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
2295                 if (r < 0)
2296                         return log_error_errno(r, "Failed to append namespace PID to netlink message: %m");
2297
2298                 r = sd_rtnl_call(rtnl, m, 0, NULL);
2299                 if (r < 0)
2300                         return log_error_errno(r, "Failed to move interface %s to namespace: %m", *i);
2301         }
2302
2303         return 0;
2304 }
2305
2306 static int setup_macvlan(pid_t pid) {
2307         _cleanup_udev_unref_ struct udev *udev = NULL;
2308         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
2309         unsigned idx = 0;
2310         char **i;
2311         int r;
2312
2313         if (!arg_private_network)
2314                 return 0;
2315
2316         if (strv_isempty(arg_network_macvlan))
2317                 return 0;
2318
2319         r = sd_rtnl_open(&rtnl, 0);
2320         if (r < 0)
2321                 return log_error_errno(r, "Failed to connect to netlink: %m");
2322
2323         udev = udev_new();
2324         if (!udev) {
2325                 log_error("Failed to connect to udev.");
2326                 return -ENOMEM;
2327         }
2328
2329         STRV_FOREACH(i, arg_network_macvlan) {
2330                 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
2331                 _cleanup_free_ char *n = NULL;
2332                 struct ether_addr mac;
2333                 int ifi;
2334
2335                 ifi = parse_interface(udev, *i);
2336                 if (ifi < 0)
2337                         return ifi;
2338
2339                 r = generate_mac(&mac, MACVLAN_HASH_KEY, idx++);
2340                 if (r < 0)
2341                         return log_error_errno(r, "Failed to create MACVLAN MAC address: %m");
2342
2343                 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
2344                 if (r < 0)
2345                         return log_error_errno(r, "Failed to allocate netlink message: %m");
2346
2347                 r = sd_rtnl_message_append_u32(m, IFLA_LINK, ifi);
2348                 if (r < 0)
2349                         return log_error_errno(r, "Failed to add netlink interface index: %m");
2350
2351                 n = strappend("mv-", *i);
2352                 if (!n)
2353                         return log_oom();
2354
2355                 strshorten(n, IFNAMSIZ-1);
2356
2357                 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, n);
2358                 if (r < 0)
2359                         return log_error_errno(r, "Failed to add netlink interface name: %m");
2360
2361                 r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac);
2362                 if (r < 0)
2363                         return log_error_errno(r, "Failed to add netlink MAC address: %m");
2364
2365                 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
2366                 if (r < 0)
2367                         return log_error_errno(r, "Failed to add netlink namespace field: %m");
2368
2369                 r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
2370                 if (r < 0)
2371                         return log_error_errno(r, "Failed to open netlink container: %m");
2372
2373                 r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "macvlan");
2374                 if (r < 0)
2375                         return log_error_errno(r, "Failed to open netlink container: %m");
2376
2377                 r = sd_rtnl_message_append_u32(m, IFLA_MACVLAN_MODE, MACVLAN_MODE_BRIDGE);
2378                 if (r < 0)
2379                         return log_error_errno(r, "Failed to append macvlan mode: %m");
2380
2381                 r = sd_rtnl_message_close_container(m);
2382                 if (r < 0)
2383                         return log_error_errno(r, "Failed to close netlink container: %m");
2384
2385                 r = sd_rtnl_message_close_container(m);
2386                 if (r < 0)
2387                         return log_error_errno(r, "Failed to close netlink container: %m");
2388
2389                 r = sd_rtnl_call(rtnl, m, 0, NULL);
2390                 if (r < 0)
2391                         return log_error_errno(r, "Failed to add new macvlan interfaces: %m");
2392         }
2393
2394         return 0;
2395 }
2396
2397 static int setup_ipvlan(pid_t pid) {
2398         _cleanup_udev_unref_ struct udev *udev = NULL;
2399         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
2400         char **i;
2401         int r;
2402
2403         if (!arg_private_network)
2404                 return 0;
2405
2406         if (strv_isempty(arg_network_ipvlan))
2407                 return 0;
2408
2409         r = sd_rtnl_open(&rtnl, 0);
2410         if (r < 0)
2411                 return log_error_errno(r, "Failed to connect to netlink: %m");
2412
2413         udev = udev_new();
2414         if (!udev) {
2415                 log_error("Failed to connect to udev.");
2416                 return -ENOMEM;
2417         }
2418
2419         STRV_FOREACH(i, arg_network_ipvlan) {
2420                 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
2421                 _cleanup_free_ char *n = NULL;
2422                 int ifi;
2423
2424                 ifi = parse_interface(udev, *i);
2425                 if (ifi < 0)
2426                         return ifi;
2427
2428                 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
2429                 if (r < 0)
2430                         return log_error_errno(r, "Failed to allocate netlink message: %m");
2431
2432                 r = sd_rtnl_message_append_u32(m, IFLA_LINK, ifi);
2433                 if (r < 0)
2434                         return log_error_errno(r, "Failed to add netlink interface index: %m");
2435
2436                 n = strappend("iv-", *i);
2437                 if (!n)
2438                         return log_oom();
2439
2440                 strshorten(n, IFNAMSIZ-1);
2441
2442                 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, n);
2443                 if (r < 0)
2444                         return log_error_errno(r, "Failed to add netlink interface name: %m");
2445
2446                 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
2447                 if (r < 0)
2448                         return log_error_errno(r, "Failed to add netlink namespace field: %m");
2449
2450                 r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
2451                 if (r < 0)
2452                         return log_error_errno(r, "Failed to open netlink container: %m");
2453
2454                 r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "ipvlan");
2455                 if (r < 0)
2456                         return log_error_errno(r, "Failed to open netlink container: %m");
2457
2458                 r = sd_rtnl_message_append_u16(m, IFLA_IPVLAN_MODE, IPVLAN_MODE_L2);
2459                 if (r < 0)
2460                         return log_error_errno(r, "Failed to add ipvlan mode: %m");
2461
2462                 r = sd_rtnl_message_close_container(m);
2463                 if (r < 0)
2464                         return log_error_errno(r, "Failed to close netlink container: %m");
2465
2466                 r = sd_rtnl_message_close_container(m);
2467                 if (r < 0)
2468                         return log_error_errno(r, "Failed to close netlink container: %m");
2469
2470                 r = sd_rtnl_call(rtnl, m, 0, NULL);
2471                 if (r < 0)
2472                         return log_error_errno(r, "Failed to add new ipvlan interfaces: %m");
2473         }
2474
2475         return 0;
2476 }
2477
/* Builds and loads the seccomp filter: the default action is "allow", the
 * system calls in the blacklist below fail with EPERM, and creation of
 * NETLINK_AUDIT sockets fails with EAFNOSUPPORT.
 *
 * Returns 0 on success (always, when built without HAVE_SECCOMP), a negative
 * errno-style value otherwise. */
static int setup_seccomp(void) {

#ifdef HAVE_SECCOMP
        static const int blacklist[] = {
                SCMP_SYS(kexec_load),
                SCMP_SYS(open_by_handle_at),
                SCMP_SYS(init_module),
                SCMP_SYS(finit_module),
                SCMP_SYS(delete_module),
                SCMP_SYS(iopl),
                SCMP_SYS(ioperm),
                SCMP_SYS(swapon),
                SCMP_SYS(swapoff),
        };

        scmp_filter_ctx ctx;
        unsigned k;
        int r;

        ctx = seccomp_init(SCMP_ACT_ALLOW);
        if (!ctx)
                return log_oom();

        r = seccomp_add_secondary_archs(ctx);
        if (r < 0) {
                log_error_errno(r, "Failed to add secondary archs to seccomp filter: %m");
                goto release;
        }

        for (k = 0; k < ELEMENTSOF(blacklist); k++) {
                r = seccomp_rule_add(ctx, SCMP_ACT_ERRNO(EPERM), blacklist[k], 0);

                /* -EFAULT denotes an unknown syscall and is tolerated; any
                 * other failure is fatal */
                if (r < 0 && r != -EFAULT) {
                        log_error_errno(r, "Failed to block syscall: %m");
                        goto release;
                }
        }

        /* Audit does not work in containers, and much of the userspace audit
         * hookup fails when run inside one. We do not care and simply turn
         * off creation of audit sockets.
         *
         * This makes socket(AF_NETLINK, *, NETLINK_AUDIT) fail with
         * EAFNOSUPPORT, which audit userspace takes as indication that audit
         * is disabled in the kernel. */
        r = seccomp_rule_add(
                        ctx,
                        SCMP_ACT_ERRNO(EAFNOSUPPORT),
                        SCMP_SYS(socket),
                        2,
                        SCMP_A0(SCMP_CMP_EQ, AF_NETLINK),
                        SCMP_A2(SCMP_CMP_EQ, NETLINK_AUDIT));
        if (r < 0) {
                log_error_errno(r, "Failed to add audit seccomp rule: %m");
                goto release;
        }

        /* Do not have the filter turn on NO_NEW_PRIVS when it is loaded */
        r = seccomp_attr_set(ctx, SCMP_FLTATR_CTL_NNP, 0);
        if (r < 0) {
                log_error_errno(r, "Failed to unset NO_NEW_PRIVS: %m");
                goto release;
        }

        r = seccomp_load(ctx);
        if (r < 0)
                log_error_errno(r, "Failed to install seccomp audit filter: %m");

release:
        seccomp_release(ctx);
        return r;
#else
        return 0;
#endif

}
2557
2558 static int setup_propagate(const char *root) {
2559         const char *p, *q;
2560
2561         (void) mkdir_p("/run/systemd/nspawn/", 0755);
2562         (void) mkdir_p("/run/systemd/nspawn/propagate", 0600);
2563         p = strappenda("/run/systemd/nspawn/propagate/", arg_machine);
2564         (void) mkdir_p(p, 0600);
2565
2566         q = strappenda(root, "/run/systemd/nspawn/incoming");
2567         mkdir_parents(q, 0755);
2568         mkdir_p(q, 0600);
2569
2570         if (mount(p, q, NULL, MS_BIND, NULL) < 0)
2571                 return log_error_errno(errno, "Failed to install propagation bind mount.");
2572
2573         if (mount(NULL, q, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY, NULL) < 0)
2574                 return log_error_errno(errno, "Failed to make propagation mount read-only");
2575
2576         return 0;
2577 }
2578
2579 static int setup_image(char **device_path, int *loop_nr) {
2580         struct loop_info64 info = {
2581                 .lo_flags = LO_FLAGS_AUTOCLEAR|LO_FLAGS_PARTSCAN
2582         };
2583         _cleanup_close_ int fd = -1, control = -1, loop = -1;
2584         _cleanup_free_ char* loopdev = NULL;
2585         struct stat st;
2586         int r, nr;
2587
2588         assert(device_path);
2589         assert(loop_nr);
2590         assert(arg_image);
2591
2592         fd = open(arg_image, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
2593         if (fd < 0)
2594                 return log_error_errno(errno, "Failed to open %s: %m", arg_image);
2595
2596         if (fstat(fd, &st) < 0)
2597                 return log_error_errno(errno, "Failed to stat %s: %m", arg_image);
2598
2599         if (S_ISBLK(st.st_mode)) {
2600                 char *p;
2601
2602                 p = strdup(arg_image);
2603                 if (!p)
2604                         return log_oom();
2605
2606                 *device_path = p;
2607
2608                 *loop_nr = -1;
2609
2610                 r = fd;
2611                 fd = -1;
2612
2613                 return r;
2614         }
2615
2616         if (!S_ISREG(st.st_mode)) {
2617                 log_error_errno(errno, "%s is not a regular file or block device: %m", arg_image);
2618                 return -EINVAL;
2619         }
2620
2621         control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
2622         if (control < 0)
2623                 return log_error_errno(errno, "Failed to open /dev/loop-control: %m");
2624
2625         nr = ioctl(control, LOOP_CTL_GET_FREE);
2626         if (nr < 0)
2627                 return log_error_errno(errno, "Failed to allocate loop device: %m");
2628
2629         if (asprintf(&loopdev, "/dev/loop%i", nr) < 0)
2630                 return log_oom();
2631
2632         loop = open(loopdev, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
2633         if (loop < 0)
2634                 return log_error_errno(errno, "Failed to open loop device %s: %m", loopdev);
2635
2636         if (ioctl(loop, LOOP_SET_FD, fd) < 0)
2637                 return log_error_errno(errno, "Failed to set loopback file descriptor on %s: %m", loopdev);
2638
2639         if (arg_read_only)
2640                 info.lo_flags |= LO_FLAGS_READ_ONLY;
2641
2642         if (ioctl(loop, LOOP_SET_STATUS64, &info) < 0)
2643                 return log_error_errno(errno, "Failed to set loopback settings on %s: %m", loopdev);
2644
2645         *device_path = loopdev;
2646         loopdev = NULL;
2647
2648         *loop_nr = nr;
2649
2650         r = loop;
2651         loop = -1;
2652
2653         return r;
2654 }
2655
2656 static int wait_for_block_device(struct udev *udev, dev_t devnum, struct udev_device **ret) {
2657         _cleanup_udev_monitor_unref_ struct udev_monitor *monitor = NULL;
2658         int r;
2659
2660         assert(udev);
2661         assert(ret);
2662
2663         for (;;) {
2664                 _cleanup_udev_device_unref_ struct udev_device *d = NULL;
2665                 struct pollfd pfd = {
2666                         .events = POLLIN
2667                 };
2668
2669                 d = udev_device_new_from_devnum(udev, 'b', devnum);
2670                 if (!d)
2671                         return log_oom();
2672
2673                 r = udev_device_get_is_initialized(d);
2674                 if (r < 0)
2675                         return log_error_errno(r, "Failed to check if device is initialized: %m");
2676                 if (r > 0) {
2677                         *ret = d;
2678                         d = NULL;
2679                         return 0;
2680                 }
2681                 d = udev_device_unref(d);
2682
2683                 if (!monitor) {
2684                         monitor = udev_monitor_new_from_netlink(udev, "udev");
2685                         if (!monitor)
2686                                 return log_oom();
2687
2688                         r = udev_monitor_filter_add_match_subsystem_devtype(monitor, "block", NULL);
2689                         if (r < 0)
2690                                 return log_error_errno(r, "Failed to add block match: %m");
2691
2692                         r = udev_monitor_enable_receiving(monitor);
2693                         if (r < 0)
2694                                 return log_error_errno(r, "Failed to turn on monitor: %m");
2695
2696                         continue;
2697                 }
2698
2699                 pfd.fd = udev_monitor_get_fd(monitor);
2700                 if (pfd.fd < 0)
2701                         return log_error_errno(r, "Failed to get udev monitor fd: %m");
2702
2703                 r = poll(&pfd, 1, -1);
2704                 if (r < 0)
2705                         return log_error_errno(errno, "Failed to wait for device initialization: %m");
2706
2707                 d = udev_monitor_receive_device(monitor);
2708         }
2709
2710         return 0;
2711 }
2712
/* Explanatory text appended to the error messages emitted when no usable
 * partition layout can be found in a disk image. */
#define PARTITION_TABLE_BLURB \
        "Note that the disk image needs to either contain only a single MBR partition of\n" \
        "type 0x83 that is marked bootable, or a single GPT partition of type " \
        "0FC63DAF-8483-4772-8E79-3D69D8477DE4 or follow\n" \
        "    http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/\n" \
        "to be bootable with systemd-nspawn."
2719
/* Probes the partition table of the block device open as "fd" and determines
 * which partitions to use for the container.
 *
 * On GPT disks, partitions flagged GPT_FLAG_NO_AUTO are ignored. Among the
 * remaining ones the home, srv, native root and secondary root partitions
 * are recognized by their type UUID; if several partitions share a type, the
 * one with the lowest partition number is used. Generic Linux partitions are
 * only considered when no root partition of either kind exists, and only if
 * there is exactly one of them. On MBR disks only partitions of type 0x83
 * whose flags equal 0x80 (bootable) are considered, and they are treated
 * like generic partitions: there must be exactly one.
 *
 * On success returns 0 and stores newly allocated device node paths (owned
 * by the caller):
 *   *root_device / *root_device_rw   root partition and whether it is writable
 *   *secondary                       true if the secondary root was chosen
 *   *home_device / *home_device_rw   only written if a home partition exists
 *   *srv_device / *srv_device_rw     only written if a srv partition exists
 *
 * Returns a negative errno-style value on failure: -EINVAL if no suitable
 * partition table or root partition is found, -ENOTSUP when compiled without
 * blkid support. */
static int dissect_image(
                int fd,
                char **root_device, bool *root_device_rw,
                char **home_device, bool *home_device_rw,
                char **srv_device, bool *srv_device_rw,
                bool *secondary) {

#ifdef HAVE_BLKID
        int home_nr = -1, srv_nr = -1;
#ifdef GPT_ROOT_NATIVE
        int root_nr = -1;
#endif
#ifdef GPT_ROOT_SECONDARY
        int secondary_root_nr = -1;
#endif
        _cleanup_free_ char *home = NULL, *root = NULL, *secondary_root = NULL, *srv = NULL, *generic = NULL;
        _cleanup_udev_enumerate_unref_ struct udev_enumerate *e = NULL;
        _cleanup_udev_device_unref_ struct udev_device *d = NULL;
        _cleanup_blkid_free_probe_ blkid_probe b = NULL;
        _cleanup_udev_unref_ struct udev *udev = NULL;
        struct udev_list_entry *first, *item;
        bool home_rw = true, root_rw = true, secondary_root_rw = true, srv_rw = true, generic_rw = true;
        const char *pttype = NULL;
        blkid_partlist pl;
        struct stat st;
        int r;
        bool is_gpt, is_mbr, multiple_generic = false;

        assert(fd >= 0);
        assert(root_device);
        assert(home_device);
        assert(srv_device);
        assert(secondary);
        assert(arg_image);

        b = blkid_new_probe();
        if (!b)
                return log_oom();

        errno = 0;
        r = blkid_probe_set_device(b, fd, 0, 0);
        if (r != 0) {
                if (errno == 0)
                        return log_oom();

                log_error_errno(errno, "Failed to set device on blkid probe: %m");
                return -errno;
        }

        blkid_probe_enable_partitions(b, 1);
        blkid_probe_set_partitions_flags(b, BLKID_PARTS_ENTRY_DETAILS);

        errno = 0;
        r = blkid_do_safeprobe(b);
        if (r == -2 || r == 1) {
                /* Ambivalent result, or nothing detected */
                log_error("Failed to identify any partition table on\n"
                          "    %s\n"
                          PARTITION_TABLE_BLURB, arg_image);
                return -EINVAL;
        } else if (r != 0) {
                if (errno == 0)
                        errno = EIO;
                log_error_errno(errno, "Failed to probe: %m");
                return -errno;
        }

        blkid_probe_lookup_value(b, "PTTYPE", &pttype, NULL);

        is_gpt = streq_ptr(pttype, "gpt");
        is_mbr = streq_ptr(pttype, "dos");

        if (!is_gpt && !is_mbr) {
                log_error("No GPT or MBR partition table discovered on\n"
                          "    %s\n"
                          PARTITION_TABLE_BLURB, arg_image);
                return -EINVAL;
        }

        errno = 0;
        pl = blkid_probe_get_partitions(b);
        if (!pl) {
                if (errno == 0)
                        return log_oom();

                log_error("Failed to list partitions of %s", arg_image);
                return -errno;
        }

        udev = udev_new();
        if (!udev)
                return log_oom();

        if (fstat(fd, &st) < 0)
                return log_error_errno(errno, "Failed to stat block device: %m");

        /* Wait until udev is done with the whole-disk device before
         * enumerating its partition devices */
        r = wait_for_block_device(udev, st.st_rdev, &d);
        if (r < 0)
                return r;

        e = udev_enumerate_new(udev);
        if (!e)
                return log_oom();

        r = udev_enumerate_add_match_parent(e, d);
        if (r < 0)
                return log_oom();

        r = udev_enumerate_scan_devices(e);
        if (r < 0)
                return log_error_errno(r, "Failed to scan for partition devices of %s: %m", arg_image);

        first = udev_enumerate_get_list_entry(e);
        udev_list_entry_foreach(item, first) {
                _cleanup_udev_device_unref_ struct udev_device *q = NULL;
                const char *node;
                unsigned long long flags;
                blkid_partition pp;
                dev_t qn;
                int nr;

                errno = 0;
                q = udev_device_new_from_syspath(udev, udev_list_entry_get_name(item));
                if (!q) {
                        if (!errno)
                                errno = ENOMEM;

                        log_error_errno(errno, "Failed to get partition device of %s: %m", arg_image);
                        return -errno;
                }

                qn = udev_device_get_devnum(q);
                if (major(qn) == 0)
                        continue;

                /* Skip the whole-disk device itself */
                if (st.st_rdev == qn)
                        continue;

                node = udev_device_get_devnode(q);
                if (!node)
                        continue;

                pp = blkid_partlist_devno_to_partition(pl, qn);
                if (!pp)
                        continue;

                flags = blkid_partition_get_flags(pp);

                nr = blkid_partition_get_partno(pp);
                if (nr < 0)
                        continue;

                if (is_gpt) {
                        sd_id128_t type_id;
                        const char *stype;

                        if (flags & GPT_FLAG_NO_AUTO)
                                continue;

                        stype = blkid_partition_get_type_string(pp);
                        if (!stype)
                                continue;

                        if (sd_id128_from_string(stype, &type_id) < 0)
                                continue;

                        if (sd_id128_equal(type_id, GPT_HOME)) {

                                /* Prefer the lowest-numbered partition of each type */
                                if (home && nr >= home_nr)
                                        continue;

                                home_nr = nr;
                                home_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&home, node);
                                if (r < 0)
                                        return log_oom();

                        } else if (sd_id128_equal(type_id, GPT_SRV)) {

                                if (srv && nr >= srv_nr)
                                        continue;

                                srv_nr = nr;
                                srv_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&srv, node);
                                if (r < 0)
                                        return log_oom();
                        }
#ifdef GPT_ROOT_NATIVE
                        else if (sd_id128_equal(type_id, GPT_ROOT_NATIVE)) {

                                if (root && nr >= root_nr)
                                        continue;

                                root_nr = nr;
                                root_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&root, node);
                                if (r < 0)
                                        return log_oom();
                        }
#endif
#ifdef GPT_ROOT_SECONDARY
                        else if (sd_id128_equal(type_id, GPT_ROOT_SECONDARY)) {

                                if (secondary_root && nr >= secondary_root_nr)
                                        continue;

                                secondary_root_nr = nr;
                                secondary_root_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&secondary_root, node);
                                if (r < 0)
                                        return log_oom();
                        }
#endif
                        else if (sd_id128_equal(type_id, GPT_LINUX_GENERIC)) {

                                if (generic)
                                        multiple_generic = true;
                                else {
                                        generic_rw = !(flags & GPT_FLAG_READ_ONLY);

                                        r = free_and_strdup(&generic, node);
                                        if (r < 0)
                                                return log_oom();
                                }
                        }

                } else if (is_mbr) {
                        int type;

                        if (flags != 0x80) /* Bootable flag */
                                continue;

                        type = blkid_partition_get_type(pp);
                        if (type != 0x83) /* Linux partition */
                                continue;

                        if (generic)
                                multiple_generic = true;
                        else {
                                generic_rw = true;

                                /* Record the partition as the generic
                                 * candidate, so that a second bootable Linux
                                 * partition is detected as ambiguous instead
                                 * of silently replacing the first one */
                                r = free_and_strdup(&generic, node);
                                if (r < 0)
                                        return log_oom();
                        }
                }
        }

        if (root) {
                *root_device = root;
                root = NULL;

                *root_device_rw = root_rw;
                *secondary = false;
        } else if (secondary_root) {
                *root_device = secondary_root;
                secondary_root = NULL;

                *root_device_rw = secondary_root_rw;
                *secondary = true;
        } else if (generic) {

                /* There were no partitions with precise meanings
                 * around, but we found generic partitions. In this
                 * case, if there's only one, we can go ahead and boot
                 * it, otherwise we bail out, because we really cannot
                 * make any sense of it. */

                if (multiple_generic) {
                        log_error("Identified multiple bootable Linux partitions on\n"
                                  "    %s\n"
                                  PARTITION_TABLE_BLURB, arg_image);
                        return -EINVAL;
                }

                *root_device = generic;
                generic = NULL;

                *root_device_rw = generic_rw;
                *secondary = false;
        } else {
                log_error("Failed to identify root partition in disk image\n"
                          "    %s\n"
                          PARTITION_TABLE_BLURB, arg_image);
                return -EINVAL;
        }

        if (home) {
                *home_device = home;
                home = NULL;

                *home_device_rw = home_rw;
        }

        if (srv) {
                *srv_device = srv;
                srv = NULL;

                *srv_device_rw = srv_rw;
        }

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -ENOTSUP;
#endif
}
3031
/* Mounts the block device "what" onto "where", or onto "where" with
 * "directory" appended if "directory" is non-NULL. The file system type is
 * probed with libblkid. The mount is made MS_NODEV, and read-only unless
 * "rw" is true and arg_read_only is not set.
 *
 * Returns 0 on success, a negative errno-style value on failure: -EINVAL if
 * the file system type cannot be determined, -ENOTSUP for LUKS volumes or
 * when compiled without blkid support. */
static int mount_device(const char *what, const char *where, const char *directory, bool rw) {
#ifdef HAVE_BLKID
        _cleanup_blkid_free_probe_ blkid_probe b = NULL;
        const char *fstype, *p;
        int r;

        assert(what);
        assert(where);

        if (arg_read_only)
                rw = false;

        if (directory)
                p = strappenda(where, directory);
        else
                p = where;

        errno = 0;
        b = blkid_new_probe_from_filename(what);
        if (!b) {
                if (errno == 0)
                        return log_oom();
                log_error_errno(errno, "Failed to allocate prober for %s: %m", what);
                return -errno;
        }

        blkid_probe_enable_superblocks(b, 1);
        blkid_probe_set_superblocks_flags(b, BLKID_SUBLKS_TYPE);

        errno = 0;
        r = blkid_do_safeprobe(b);
        if (r == -2 || r == 1) {
                /* -2: ambivalent result, 1: nothing detected. A plain error
                 * (-1) is handled by the branch below so that its errno is
                 * reported. */
                log_error("Cannot determine file system type of %s", what);
                return -EINVAL;
        } else if (r != 0) {
                if (errno == 0)
                        errno = EIO;
                log_error_errno(errno, "Failed to probe %s: %m", what);
                return -errno;
        }

        errno = 0;
        if (blkid_probe_lookup_value(b, "TYPE", &fstype, NULL) < 0) {
                if (errno == 0)
                        errno = EINVAL;
                log_error("Failed to determine file system type of %s", what);
                return -errno;
        }

        if (streq(fstype, "crypto_LUKS")) {
                log_error("nspawn currently does not support LUKS disk images.");
                return -ENOTSUP;
        }

        if (mount(what, p, fstype, MS_NODEV|(rw ? 0 : MS_RDONLY), NULL) < 0)
                return log_error_errno(errno, "Failed to mount %s: %m", what);

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -ENOTSUP;
#endif
}
3095
3096 static int mount_devices(
3097                 const char *where,
3098                 const char *root_device, bool root_device_rw,
3099                 const char *home_device, bool home_device_rw,
3100                 const char *srv_device, bool srv_device_rw) {
3101         int r;
3102
3103         assert(where);
3104
3105         if (root_device) {
3106                 r = mount_device(root_device, arg_directory, NULL, root_device_rw);
3107                 if (r < 0)
3108                         return log_error_errno(r, "Failed to mount root directory: %m");
3109         }
3110
3111         if (home_device) {
3112                 r = mount_device(home_device, arg_directory, "/home", home_device_rw);
3113                 if (r < 0)
3114                         return log_error_errno(r, "Failed to mount home directory: %m");
3115         }
3116
3117         if (srv_device) {
3118                 r = mount_device(srv_device, arg_directory, "/srv", srv_device_rw);
3119                 if (r < 0)
3120                         return log_error_errno(r, "Failed to mount server data directory: %m");
3121         }
3122
3123         return 0;
3124 }
3125
3126 static void loop_remove(int nr, int *image_fd) {
3127         _cleanup_close_ int control = -1;
3128         int r;
3129
3130         if (nr < 0)
3131                 return;
3132
3133         if (image_fd && *image_fd >= 0) {
3134                 r = ioctl(*image_fd, LOOP_CLR_FD);
3135                 if (r < 0)
3136                         log_debug_errno(errno, "Failed to close loop image: %m");
3137                 *image_fd = safe_close(*image_fd);
3138         }
3139
3140         control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
3141         if (control < 0) {
3142                 log_warning_errno(errno, "Failed to open /dev/loop-control: %m");
3143                 return;
3144         }
3145
3146         r = ioctl(control, LOOP_CTL_REMOVE, nr);
3147         if (r < 0)
3148                 log_debug_errno(errno, "Failed to remove loop %d: %m", nr);
3149 }
3150
3151 static int spawn_getent(const char *database, const char *key, pid_t *rpid) {
3152         int pipe_fds[2];
3153         pid_t pid;
3154
3155         assert(database);
3156         assert(key);
3157         assert(rpid);
3158
3159         if (pipe2(pipe_fds, O_CLOEXEC) < 0)
3160                 return log_error_errno(errno, "Failed to allocate pipe: %m");
3161
3162         pid = fork();
3163         if (pid < 0)
3164                 return log_error_errno(errno, "Failed to fork getent child: %m");
3165         else if (pid == 0) {
3166                 int nullfd;
3167                 char *empty_env = NULL;
3168
3169                 if (dup3(pipe_fds[1], STDOUT_FILENO, 0) < 0)
3170                         _exit(EXIT_FAILURE);
3171
3172                 if (pipe_fds[0] > 2)
3173                         safe_close(pipe_fds[0]);
3174                 if (pipe_fds[1] > 2)
3175                         safe_close(pipe_fds[1]);
3176
3177                 nullfd = open("/dev/null", O_RDWR);
3178                 if (nullfd < 0)
3179                         _exit(EXIT_FAILURE);
3180
3181                 if (dup3(nullfd, STDIN_FILENO, 0) < 0)
3182                         _exit(EXIT_FAILURE);
3183
3184                 if (dup3(nullfd, STDERR_FILENO, 0) < 0)
3185                         _exit(EXIT_FAILURE);
3186
3187                 if (nullfd > 2)
3188                         safe_close(nullfd);
3189
3190                 reset_all_signal_handlers();
3191                 close_all_fds(NULL, 0);
3192
3193                 execle("/usr/bin/getent", "getent", database, key, NULL, &empty_env);
3194                 execle("/bin/getent", "getent", database, key, NULL, &empty_env);
3195                 _exit(EXIT_FAILURE);
3196         }
3197
3198         pipe_fds[1] = safe_close(pipe_fds[1]);
3199
3200         *rpid = pid;
3201
3202         return pipe_fds[0];
3203 }
3204
3205 static int change_uid_gid(char **_home) {
3206         char line[LINE_MAX], *x, *u, *g, *h;
3207         const char *word, *state;
3208         _cleanup_free_ uid_t *uids = NULL;
3209         _cleanup_free_ char *home = NULL;
3210         _cleanup_fclose_ FILE *f = NULL;
3211         _cleanup_close_ int fd = -1;
3212         unsigned n_uids = 0;
3213         size_t sz = 0, l;
3214         uid_t uid;
3215         gid_t gid;
3216         pid_t pid;
3217         int r;
3218
3219         assert(_home);
3220
3221         if (!arg_user || streq(arg_user, "root") || streq(arg_user, "0")) {
3222                 /* Reset everything fully to 0, just in case */
3223
3224                 if (setgroups(0, NULL) < 0)
3225                         return log_error_errno(errno, "setgroups() failed: %m");
3226
3227                 if (setresgid(0, 0, 0) < 0)
3228                         return log_error_errno(errno, "setregid() failed: %m");
3229
3230                 if (setresuid(0, 0, 0) < 0)
3231                         return log_error_errno(errno, "setreuid() failed: %m");
3232
3233                 *_home = NULL;
3234                 return 0;
3235         }
3236
3237         /* First, get user credentials */
3238         fd = spawn_getent("passwd", arg_user, &pid);
3239         if (fd < 0)
3240                 return fd;
3241
3242         f = fdopen(fd, "r");
3243         if (!f)
3244                 return log_oom();
3245         fd = -1;
3246
3247         if (!fgets(line, sizeof(line), f)) {
3248
3249                 if (!ferror(f)) {
3250                         log_error("Failed to resolve user %s.", arg_user);
3251                         return -ESRCH;
3252                 }
3253
3254                 log_error_errno(errno, "Failed to read from getent: %m");
3255                 return -errno;
3256         }
3257
3258         truncate_nl(line);
3259
3260         wait_for_terminate_and_warn("getent passwd", pid, true);
3261
3262         x = strchr(line, ':');
3263         if (!x) {
3264                 log_error("/etc/passwd entry has invalid user field.");
3265                 return -EIO;
3266         }
3267
3268         u = strchr(x+1, ':');
3269         if (!u) {
3270                 log_error("/etc/passwd entry has invalid password field.");
3271                 return -EIO;
3272         }
3273
3274         u++;
3275         g = strchr(u, ':');
3276         if (!g) {
3277                 log_error("/etc/passwd entry has invalid UID field.");
3278                 return -EIO;
3279         }
3280
3281         *g = 0;
3282         g++;
3283         x = strchr(g, ':');
3284         if (!x) {
3285                 log_error("/etc/passwd entry has invalid GID field.");
3286                 return -EIO;
3287         }
3288
3289         *x = 0;
3290         h = strchr(x+1, ':');
3291         if (!h) {
3292                 log_error("/etc/passwd entry has invalid GECOS field.");
3293                 return -EIO;
3294         }
3295
3296         h++;
3297         x = strchr(h, ':');
3298         if (!x) {
3299                 log_error("/etc/passwd entry has invalid home directory field.");
3300                 return -EIO;
3301         }
3302
3303         *x = 0;
3304
3305         r = parse_uid(u, &uid);
3306         if (r < 0) {
3307                 log_error("Failed to parse UID of user.");
3308                 return -EIO;
3309         }
3310
3311         r = parse_gid(g, &gid);
3312         if (r < 0) {
3313                 log_error("Failed to parse GID of user.");
3314                 return -EIO;
3315         }
3316
3317         home = strdup(h);
3318         if (!home)
3319                 return log_oom();
3320
3321         /* Second, get group memberships */
3322         fd = spawn_getent("initgroups", arg_user, &pid);
3323         if (fd < 0)
3324                 return fd;
3325
3326         fclose(f);
3327         f = fdopen(fd, "r");
3328         if (!f)
3329                 return log_oom();
3330         fd = -1;
3331
3332         if (!fgets(line, sizeof(line), f)) {
3333                 if (!ferror(f)) {
3334                         log_error("Failed to resolve user %s.", arg_user);
3335                         return -ESRCH;
3336                 }
3337
3338                 log_error_errno(errno, "Failed to read from getent: %m");
3339                 return -errno;
3340         }
3341
3342         truncate_nl(line);
3343
3344         wait_for_terminate_and_warn("getent initgroups", pid, true);
3345
3346         /* Skip over the username and subsequent separator whitespace */
3347         x = line;
3348         x += strcspn(x, WHITESPACE);
3349         x += strspn(x, WHITESPACE);
3350
3351         FOREACH_WORD(word, l, x, state) {
3352                 char c[l+1];
3353
3354                 memcpy(c, word, l);
3355                 c[l] = 0;
3356
3357                 if (!GREEDY_REALLOC(uids, sz, n_uids+1))
3358                         return log_oom();
3359
3360                 r = parse_uid(c, &uids[n_uids++]);
3361                 if (r < 0) {
3362                         log_error("Failed to parse group data from getent.");
3363                         return -EIO;
3364                 }
3365         }
3366
3367         r = mkdir_parents(home, 0775);
3368         if (r < 0)
3369                 return log_error_errno(r, "Failed to make home root directory: %m");
3370
3371         r = mkdir_safe(home, 0755, uid, gid);
3372         if (r < 0 && r != -EEXIST)
3373                 return log_error_errno(r, "Failed to make home directory: %m");
3374
3375         fchown(STDIN_FILENO, uid, gid);
3376         fchown(STDOUT_FILENO, uid, gid);
3377         fchown(STDERR_FILENO, uid, gid);
3378
3379         if (setgroups(n_uids, uids) < 0)
3380                 return log_error_errno(errno, "Failed to set auxiliary groups: %m");
3381
3382         if (setresgid(gid, gid, gid) < 0)
3383                 return log_error_errno(errno, "setregid() failed: %m");
3384
3385         if (setresuid(uid, uid, uid) < 0)
3386                 return log_error_errno(errno, "setreuid() failed: %m");
3387
3388         if (_home) {
3389                 *_home = home;
3390                 home = NULL;
3391         }
3392
3393         return 0;
3394 }
3395
3396 /*
3397  * Return values:
3398  * < 0 : wait_for_terminate() failed to get the state of the
3399  *       container, the container was terminated by a signal, or
3400  *       failed for an unknown reason.  No change is made to the
3401  *       container argument.
3402  * > 0 : The program executed in the container terminated with an
3403  *       error.  The exit code of the program executed in the
3404  *       container is returned.  The container argument has been set
3405  *       to CONTAINER_TERMINATED.
3406  *   0 : The container is being rebooted, has been shut down or exited
3407  *       successfully.  The container argument has been set to either
3408  *       CONTAINER_TERMINATED or CONTAINER_REBOOTED.
3409  *
3410  * That is, success is indicated by a return value of zero, and an
3411  * error is indicated by a non-zero value.
3412  */
3413 static int wait_for_container(pid_t pid, ContainerStatus *container) {
3414         siginfo_t status;
3415         int r;
3416
3417         r = wait_for_terminate(pid, &status);
3418         if (r < 0)
3419                 return log_warning_errno(r, "Failed to wait for container: %m");
3420
3421         switch (status.si_code) {
3422
3423         case CLD_EXITED:
3424                 if (status.si_status == 0) {
3425                         log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s exited successfully.", arg_machine);
3426
3427                 } else
3428                         log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s failed with error code %i.", arg_machine, status.si_status);
3429
3430                 *container = CONTAINER_TERMINATED;
3431                 return status.si_status;
3432
3433         case CLD_KILLED:
3434                 if (status.si_status == SIGINT) {
3435
3436                         log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s has been shut down.", arg_machine);
3437                         *container = CONTAINER_TERMINATED;
3438                         return 0;
3439
3440                 } else if (status.si_status == SIGHUP) {
3441
3442                         log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s is being rebooted.", arg_machine);
3443                         *container = CONTAINER_REBOOTED;
3444                         return 0;
3445                 }
3446
3447                 /* CLD_KILLED fallthrough */
3448
3449         case CLD_DUMPED:
3450                 log_error("Container %s terminated by signal %s.", arg_machine, signal_to_string(status.si_status));
3451                 return -EIO;
3452
3453         default:
3454                 log_error("Container %s failed due to unknown reason.", arg_machine);
3455                 return -EIO;
3456         }
3457
3458         return r;
3459 }
3460
/* Signal handler that intentionally does nothing. main() installs it for
 * SIGCHLD so that the signal interrupts blocking calls instead of being
 * ignored. */
static void nop_handler(int sig) {
        /* Nothing to do here. */
}
3462
3463 static int on_orderly_shutdown(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) {
3464         pid_t pid;
3465
3466         pid = PTR_TO_UINT32(userdata);
3467         if (pid > 0) {
3468                 if (kill(pid, SIGRTMIN+3) >= 0) {
3469                         log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
3470                         sd_event_source_set_userdata(s, NULL);
3471                         return 0;
3472                 }
3473         }
3474
3475         sd_event_exit(sd_event_source_get_event(s), 0);
3476         return 0;
3477 }
3478
3479 static int determine_names(void) {
3480         int r;
3481
3482         if (!arg_image && !arg_directory) {
3483                 if (arg_machine) {
3484                         _cleanup_(image_unrefp) Image *i = NULL;
3485
3486                         r = image_find(arg_machine, &i);
3487                         if (r < 0)
3488                                 return log_error_errno(r, "Failed to find image for machine '%s': %m", arg_machine);
3489                         else if (r == 0) {
3490                                 log_error("No image for machine '%s': %m", arg_machine);
3491                                 return -ENOENT;
3492                         }
3493
3494                         if (i->type == IMAGE_RAW)
3495                                 r = set_sanitized_path(&arg_image, i->path);
3496                         else
3497                                 r = set_sanitized_path(&arg_directory, i->path);
3498                         if (r < 0)
3499                                 return log_error_errno(r, "Invalid image directory: %m");
3500
3501                         arg_read_only = arg_read_only || i->read_only;
3502                 } else
3503                         arg_directory = get_current_dir_name();
3504
3505                 if (!arg_directory && !arg_machine) {
3506                         log_error("Failed to determine path, please use -D or -i.");
3507                         return -EINVAL;
3508                 }
3509         }
3510
3511         if (!arg_machine) {
3512                 if (arg_directory && path_equal(arg_directory, "/"))
3513                         arg_machine = gethostname_malloc();
3514                 else
3515                         arg_machine = strdup(basename(arg_image ?: arg_directory));
3516
3517                 if (!arg_machine)
3518                         return log_oom();
3519
3520                 hostname_cleanup(arg_machine, false);
3521                 if (!machine_name_is_valid(arg_machine)) {
3522                         log_error("Failed to determine machine name automatically, please use -M.");
3523                         return -EINVAL;
3524                 }
3525
3526                 if (arg_ephemeral) {
3527                         char *b;
3528
3529                         /* Add a random suffix when this is an
3530                          * ephemeral machine, so that we can run many
3531                          * instances at once without manually having
3532                          * to specify -M each time. */
3533
3534                         if (asprintf(&b, "%s-%016" PRIx64, arg_machine, random_u64()) < 0)
3535                                 return log_oom();
3536
3537                         free(arg_machine);
3538                         arg_machine = b;
3539                 }
3540         }
3541
3542         return 0;
3543 }
3544
3545 int main(int argc, char *argv[]) {
3546
3547         _cleanup_free_ char *device_path = NULL, *root_device = NULL, *home_device = NULL, *srv_device = NULL, *console = NULL;
3548         bool root_device_rw = true, home_device_rw = true, srv_device_rw = true;
3549         _cleanup_close_ int master = -1, image_fd = -1;
3550         _cleanup_fdset_free_ FDSet *fds = NULL;
3551         int r, n_fd_passed, loop_nr = -1;
3552         char veth_name[IFNAMSIZ];
3553         bool secondary = false, remove_subvol = false;
3554         sigset_t mask, mask_chld;
3555         pid_t pid = 0;
3556         int ret = EXIT_SUCCESS;
3557         union in_addr_union exposed = {};
3558         _cleanup_release_lock_file_ LockFile tree_global_lock = LOCK_FILE_INIT, tree_local_lock = LOCK_FILE_INIT;
3559
3560         log_parse_environment();
3561         log_open();
3562
3563         r = parse_argv(argc, argv);
3564         if (r <= 0)
3565                 goto finish;
3566
3567         r = determine_names();
3568         if (r < 0)
3569                 goto finish;
3570
3571         if (geteuid() != 0) {
3572                 log_error("Need to be root.");
3573                 r = -EPERM;
3574                 goto finish;
3575         }
3576
3577         if (sd_booted() <= 0) {
3578                 log_error("Not running on a systemd system.");
3579                 r = -EINVAL;
3580                 goto finish;
3581         }
3582
3583         log_close();
3584         n_fd_passed = sd_listen_fds(false);
3585         if (n_fd_passed > 0) {
3586                 r = fdset_new_listen_fds(&fds, false);
3587                 if (r < 0) {
3588                         log_error_errno(r, "Failed to collect file descriptors: %m");
3589                         goto finish;
3590                 }
3591         }
3592         fdset_close_others(fds);
3593         log_open();
3594
3595         if (arg_directory) {
3596                 assert(!arg_image);
3597
3598                 if (path_equal(arg_directory, "/") && !arg_ephemeral) {
3599                         log_error("Spawning container on root directory is not supported. Consider using --ephemeral.");
3600                         r = -EINVAL;
3601                         goto finish;
3602                 }
3603
3604                 if (arg_ephemeral) {
3605                         _cleanup_release_lock_file_ LockFile original_lock = LOCK_FILE_INIT;
3606                         char *np;
3607
3608                         /* If the specified path is a mount point we
3609                          * generate the new snapshot immediately
3610                          * inside it under a random name. However if
3611                          * the specified is not a mount point we
3612                          * create the new snapshot in the parent
3613                          * directory, just next to it. */
3614                         r = path_is_mount_point(arg_directory, false);
3615                         if (r < 0) {
3616                                 log_error_errno(r, "Failed to determine whether directory %s is mount point: %m", arg_directory);
3617                                 goto finish;
3618                         }
3619                         if (r > 0)
3620                                 r = tempfn_random_child(arg_directory, &np);
3621                         else
3622                                 r = tempfn_random(arg_directory, &np);
3623                         if (r < 0) {
3624                                 log_error_errno(r, "Failed to generate name for snapshot: %m");
3625                                 goto finish;
3626                         }
3627
3628                         r = image_path_lock(np, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3629                         if (r < 0) {
3630                                 log_error_errno(r, "Failed to lock %s: %m", np);
3631                                 goto finish;
3632                         }
3633
3634                         r = btrfs_subvol_snapshot(arg_directory, np, arg_read_only, true);
3635                         if (r < 0) {
3636                                 free(np);
3637                                 log_error_errno(r, "Failed to create snapshot %s from %s: %m", np, arg_directory);
3638                                 goto finish;
3639                         }
3640
3641                         free(arg_directory);
3642                         arg_directory = np;
3643
3644                         remove_subvol = true;
3645
3646                 } else {
3647                         r = image_path_lock(arg_directory, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3648                         if (r == -EBUSY) {
3649                                 log_error_errno(r, "Directory tree %s is currently busy.", arg_directory);
3650                                 goto finish;
3651                         }
3652                         if (r < 0) {
3653                                 log_error_errno(r, "Failed to lock %s: %m", arg_directory);
3654                                 return r;
3655                         }
3656
3657                         if (arg_template) {
3658                                 r = btrfs_subvol_snapshot(arg_template, arg_directory, arg_read_only, true);
3659                                 if (r == -EEXIST) {
3660                                         if (!arg_quiet)
3661                                                 log_info("Directory %s already exists, not populating from template %s.", arg_directory, arg_template);
3662                                 } else if (r < 0) {
3663                                         log_error_errno(r, "Couldn't create snapshot %s from %s: %m", arg_directory, arg_template);
3664                                         goto finish;
3665                                 } else {
3666                                         if (!arg_quiet)
3667                                                 log_info("Populated %s from template %s.", arg_directory, arg_template);
3668                                 }
3669                         }
3670                 }
3671
3672                 if (arg_boot) {
3673                         if (path_is_os_tree(arg_directory) <= 0) {
3674                                 log_error("Directory %s doesn't look like an OS root directory (os-release file is missing). Refusing.", arg_directory);
3675                                 r = -EINVAL;
3676                                 goto finish;
3677                         }
3678                 } else {
3679                         const char *p;
3680
3681                         p = strappenda(arg_directory,
3682                                        argc > optind && path_is_absolute(argv[optind]) ? argv[optind] : "/usr/bin/");
3683                         if (access(p, F_OK) < 0) {
3684                                 log_error("Directory %s lacks the binary to execute or doesn't look like a binary tree. Refusing.", arg_directory);
3685                                 r = -EINVAL;
3686                                 goto finish;
3687                         }
3688                 }
3689
3690         } else {
3691                 char template[] = "/tmp/nspawn-root-XXXXXX";
3692
3693                 assert(arg_image);
3694                 assert(!arg_template);
3695
3696                 r = image_path_lock(arg_image, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3697                 if (r == -EBUSY) {
3698                         r = log_error_errno(r, "Disk image %s is currently busy.", arg_image);
3699                         goto finish;
3700                 }
3701                 if (r < 0) {
3702                         r = log_error_errno(r, "Failed to create image lock: %m");
3703                         goto finish;
3704                 }
3705
3706                 if (!mkdtemp(template)) {
3707                         log_error_errno(errno, "Failed to create temporary directory: %m");
3708                         r = -errno;
3709                         goto finish;
3710                 }
3711
3712                 arg_directory = strdup(template);
3713                 if (!arg_directory) {
3714                         r = log_oom();
3715                         goto finish;
3716                 }
3717
3718                 image_fd = setup_image(&device_path, &loop_nr);
3719                 if (image_fd < 0) {
3720                         r = image_fd;
3721                         goto finish;
3722                 }
3723
3724                 r = dissect_image(image_fd,
3725                                   &root_device, &root_device_rw,
3726                                   &home_device, &home_device_rw,
3727                                   &srv_device, &srv_device_rw,
3728                                   &secondary);
3729                 if (r < 0)
3730                         goto finish;
3731         }
3732
3733         master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
3734         if (master < 0) {
3735                 r = log_error_errno(errno, "Failed to acquire pseudo tty: %m");
3736                 goto finish;
3737         }
3738
3739         r = ptsname_malloc(master, &console);
3740         if (r < 0) {
3741                 r = log_error_errno(r, "Failed to determine tty name: %m");
3742                 goto finish;
3743         }
3744
3745         if (!arg_quiet)
3746                 log_info("Spawning container %s on %s.\nPress ^] three times within 1s to kill container.",
3747                          arg_machine, arg_image ?: arg_directory);
3748
3749         if (unlockpt(master) < 0) {
3750                 r = log_error_errno(errno, "Failed to unlock tty: %m");
3751                 goto finish;
3752         }
3753
3754         assert_se(sigemptyset(&mask) == 0);
3755         sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
3756         assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
3757
3758         assert_se(sigemptyset(&mask_chld) == 0);
3759         assert_se(sigaddset(&mask_chld, SIGCHLD) == 0);
3760
3761         for (;;) {
3762                 _cleanup_close_pair_ int kmsg_socket_pair[2] = { -1, -1 }, rtnl_socket_pair[2] = { -1, -1 };
3763                 ContainerStatus container_status;
3764                 _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
3765                 struct sigaction sa = {
3766                         .sa_handler = nop_handler,
3767                         .sa_flags = SA_NOCLDSTOP,
3768                 };
3769
3770                 r = barrier_create(&barrier);
3771                 if (r < 0) {
3772                         log_error_errno(r, "Cannot initialize IPC barrier: %m");
3773                         goto finish;
3774                 }
3775
3776                 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
3777                         r = log_error_errno(errno, "Failed to create kmsg socket pair: %m");
3778                         goto finish;
3779                 }
3780
3781                 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, rtnl_socket_pair) < 0) {
3782                         r = log_error_errno(errno, "Failed to create rtnl socket pair: %m");
3783                         goto finish;
3784                 }
3785
3786                 /* Child can be killed before execv(), so handle SIGCHLD
3787                  * in order to interrupt parent's blocking calls and
3788                  * give it a chance to call wait() and terminate. */
3789                 r = sigprocmask(SIG_UNBLOCK, &mask_chld, NULL);
3790                 if (r < 0) {
3791                         r = log_error_errno(errno, "Failed to change the signal mask: %m");
3792                         goto finish;
3793                 }
3794
3795                 r = sigaction(SIGCHLD, &sa, NULL);
3796                 if (r < 0) {
3797                         r = log_error_errno(errno, "Failed to install SIGCHLD handler: %m");
3798                         goto finish;
3799                 }
3800
3801                 pid = raw_clone(SIGCHLD|CLONE_NEWNS|
3802                                 (arg_share_system ? 0 : CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS)|
3803                                 (arg_private_network ? CLONE_NEWNET : 0), NULL);
3804                 if (pid < 0) {
3805                         if (errno == EINVAL)
3806                                 r = log_error_errno(errno, "clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
3807                         else
3808                                 r = log_error_errno(errno, "clone() failed: %m");
3809
3810                         goto finish;
3811                 }
3812
3813                 if (pid == 0) {
3814                         /* child */
3815                         _cleanup_free_ char *home = NULL;
3816                         unsigned n_env = 2;
3817                         const char *envp[] = {
3818                                 "PATH=" DEFAULT_PATH_SPLIT_USR,
3819                                 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
3820                                 NULL, /* TERM */
3821                                 NULL, /* HOME */
3822                                 NULL, /* USER */
3823                                 NULL, /* LOGNAME */
3824                                 NULL, /* container_uuid */
3825                                 NULL, /* LISTEN_FDS */
3826                                 NULL, /* LISTEN_PID */
3827                                 NULL
3828                         };
3829                         char **env_use;
3830
3831                         barrier_set_role(&barrier, BARRIER_CHILD);
3832
3833                         envp[n_env] = strv_find_prefix(environ, "TERM=");
3834                         if (envp[n_env])
3835                                 n_env ++;
3836
3837                         master = safe_close(master);
3838
3839                         close_nointr(STDIN_FILENO);
3840                         close_nointr(STDOUT_FILENO);
3841                         close_nointr(STDERR_FILENO);
3842
3843                         kmsg_socket_pair[0] = safe_close(kmsg_socket_pair[0]);
3844                         rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
3845
3846                         reset_all_signal_handlers();
3847                         reset_signal_mask();
3848
3849                         r = open_terminal(console, O_RDWR);
3850                         if (r != STDIN_FILENO) {
3851                                 if (r >= 0) {
3852                                         safe_close(r);
3853                                         r = -EINVAL;
3854                                 }
3855
3856                                 log_error_errno(r, "Failed to open console: %m");
3857                                 _exit(EXIT_FAILURE);
3858                         }
3859
3860                         if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
3861                             dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO) {
3862                                 log_error_errno(errno, "Failed to duplicate console: %m");
3863                                 _exit(EXIT_FAILURE);
3864                         }
3865
3866                         if (setsid() < 0) {
3867                                 log_error_errno(errno, "setsid() failed: %m");
3868                                 _exit(EXIT_FAILURE);
3869                         }
3870
3871                         if (reset_audit_loginuid() < 0)
3872                                 _exit(EXIT_FAILURE);
3873
3874                         if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
3875                                 log_error_errno(errno, "PR_SET_PDEATHSIG failed: %m");
3876                                 _exit(EXIT_FAILURE);
3877                         }
3878
3879                         /* Mark everything as slave, so that we still
3880                          * receive mounts from the real root, but don't
3881                          * propagate mounts to the real root. */
3882                         if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
3883                                 log_error_errno(errno, "MS_SLAVE|MS_REC failed: %m");
3884                                 _exit(EXIT_FAILURE);
3885                         }
3886
3887                         if (mount_devices(arg_directory,
3888                                           root_device, root_device_rw,
3889                                           home_device, home_device_rw,
3890                                           srv_device, srv_device_rw) < 0)
3891                                 _exit(EXIT_FAILURE);
3892
3893                         /* Turn directory into bind mount */
3894                         if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REC, NULL) < 0) {
3895                                 log_error_errno(errno, "Failed to make bind mount: %m");
3896                                 _exit(EXIT_FAILURE);
3897                         }
3898
3899                         r = setup_volatile(arg_directory);
3900                         if (r < 0)
3901                                 _exit(EXIT_FAILURE);
3902
3903                         if (setup_volatile_state(arg_directory) < 0)
3904                                 _exit(EXIT_FAILURE);
3905
3906                         r = base_filesystem_create(arg_directory);
3907                         if (r < 0)
3908                                 _exit(EXIT_FAILURE);
3909
3910                         if (arg_read_only) {
3911                                 r = bind_remount_recursive(arg_directory, true);
3912                                 if (r < 0) {
3913                                         log_error_errno(r, "Failed to make tree read-only: %m");
3914                                         _exit(EXIT_FAILURE);
3915                                 }
3916                         }
3917
3918                         if (mount_all(arg_directory) < 0)
3919                                 _exit(EXIT_FAILURE);
3920
3921                         if (copy_devnodes(arg_directory) < 0)
3922                                 _exit(EXIT_FAILURE);
3923
3924                         if (setup_ptmx(arg_directory) < 0)
3925                                 _exit(EXIT_FAILURE);
3926
3927                         dev_setup(arg_directory);
3928
3929                         if (setup_propagate(arg_directory) < 0)
3930                                 _exit(EXIT_FAILURE);
3931
3932                         if (setup_seccomp() < 0)
3933                                 _exit(EXIT_FAILURE);
3934
3935                         if (setup_dev_console(arg_directory, console) < 0)
3936                                 _exit(EXIT_FAILURE);
3937
3938                         if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
3939                                 _exit(EXIT_FAILURE);
3940                         kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
3941
3942                         if (send_rtnl(rtnl_socket_pair[1]) < 0)
3943                                 _exit(EXIT_FAILURE);
3944                         rtnl_socket_pair[1] = safe_close(rtnl_socket_pair[1]);
3945
3946                         /* Tell the parent that we are ready, and that
3947                          * it can cgroupify us so that we lack access
3948                          * to certain devices and resources. */
3949                         (void) barrier_place(&barrier);
3950
3951                         if (setup_boot_id(arg_directory) < 0)
3952                                 _exit(EXIT_FAILURE);
3953
3954                         if (setup_timezone(arg_directory) < 0)
3955                                 _exit(EXIT_FAILURE);
3956
3957                         if (setup_resolv_conf(arg_directory) < 0)
3958                                 _exit(EXIT_FAILURE);
3959
3960                         if (setup_journal(arg_directory) < 0)
3961                                 _exit(EXIT_FAILURE);
3962
3963                         if (mount_binds(arg_directory, arg_bind, false) < 0)
3964                                 _exit(EXIT_FAILURE);
3965
3966                         if (mount_binds(arg_directory, arg_bind_ro, true) < 0)
3967                                 _exit(EXIT_FAILURE);
3968
3969                         if (mount_tmpfs(arg_directory) < 0)
3970                                 _exit(EXIT_FAILURE);
3971
3972                         /* Wait until we are cgroup-ified, so that we
3973                          * can mount the right cgroup path writable */
3974                         (void) barrier_sync_next(&barrier);
3975
3976                         if (mount_cgroup(arg_directory) < 0)
3977                                 _exit(EXIT_FAILURE);
3978
3979                         if (chdir(arg_directory) < 0) {
3980                                 log_error_errno(errno, "chdir(%s) failed: %m", arg_directory);
3981                                 _exit(EXIT_FAILURE);
3982                         }
3983
3984                         if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) {
3985                                 log_error_errno(errno, "mount(MS_MOVE) failed: %m");
3986                                 _exit(EXIT_FAILURE);
3987                         }
3988
3989                         if (chroot(".") < 0) {
3990                                 log_error_errno(errno, "chroot() failed: %m");
3991                                 _exit(EXIT_FAILURE);
3992                         }
3993
3994                         if (chdir("/") < 0) {
3995                                 log_error_errno(errno, "chdir() failed: %m");
3996                                 _exit(EXIT_FAILURE);
3997                         }
3998
3999                         umask(0022);
4000
4001                         if (arg_private_network)
4002                                 loopback_setup();
4003
4004                         if (drop_capabilities() < 0) {
4005                                 log_error_errno(errno, "drop_capabilities() failed: %m");
4006                                 _exit(EXIT_FAILURE);
4007                         }
4008
4009                         r = change_uid_gid(&home);
4010                         if (r < 0)
4011                                 _exit(EXIT_FAILURE);
4012
4013                         if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
4014                             (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
4015                             (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
4016                                 log_oom();
4017                                 _exit(EXIT_FAILURE);
4018                         }
4019
4020                         if (!sd_id128_equal(arg_uuid, SD_ID128_NULL)) {
4021                                 char as_uuid[37];
4022
4023                                 if (asprintf((char**)(envp + n_env++), "container_uuid=%s", id128_format_as_uuid(arg_uuid, as_uuid)) < 0) {
4024                                         log_oom();
4025                                         _exit(EXIT_FAILURE);
4026                                 }
4027                         }
4028
4029                         if (fdset_size(fds) > 0) {
4030                                 r = fdset_cloexec(fds, false);
4031                                 if (r < 0) {
4032                                         log_error_errno(r, "Failed to unset O_CLOEXEC for file descriptors.");
4033                                         _exit(EXIT_FAILURE);
4034                                 }
4035
4036                                 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", n_fd_passed) < 0) ||
4037                                     (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0)) {
4038                                         log_oom();
4039                                         _exit(EXIT_FAILURE);
4040                                 }
4041                         }
4042
4043                         setup_hostname();
4044
4045                         if (arg_personality != 0xffffffffLU) {
4046                                 if (personality(arg_personality) < 0) {
4047                                         log_error_errno(errno, "personality() failed: %m");
4048                                         _exit(EXIT_FAILURE);
4049                                 }
4050                         } else if (secondary) {
4051                                 if (personality(PER_LINUX32) < 0) {
4052                                         log_error_errno(errno, "personality() failed: %m");
4053                                         _exit(EXIT_FAILURE);
4054                                 }
4055                         }
4056
4057 #ifdef HAVE_SELINUX
4058                         if (arg_selinux_context)
4059                                 if (setexeccon((security_context_t) arg_selinux_context) < 0) {
4060                                         log_error_errno(errno, "setexeccon(\"%s\") failed: %m", arg_selinux_context);
4061                                         _exit(EXIT_FAILURE);
4062                                 }
4063 #endif
4064
4065                         if (!strv_isempty(arg_setenv)) {
4066                                 char **n;
4067
4068                                 n = strv_env_merge(2, envp, arg_setenv);
4069                                 if (!n) {
4070                                         log_oom();
4071                                         _exit(EXIT_FAILURE);
4072                                 }
4073
4074                                 env_use = n;
4075                         } else
4076                                 env_use = (char**) envp;
4077
4078                         /* Wait until the parent is ready with the setup, too... */
4079                         if (!barrier_place_and_sync(&barrier))
4080                                 _exit(EXIT_FAILURE);
4081
4082                         if (arg_boot) {
4083                                 char **a;
4084                                 size_t l;
4085
4086                                 /* Automatically search for the init system */
4087
4088                                 l = 1 + argc - optind;
4089                                 a = newa(char*, l + 1);
4090                                 memcpy(a + 1, argv + optind, l * sizeof(char*));
4091
4092                                 a[0] = (char*) "/usr/lib/systemd/systemd";
4093                                 execve(a[0], a, env_use);
4094
4095                                 a[0] = (char*) "/lib/systemd/systemd";
4096                                 execve(a[0], a, env_use);
4097
4098                                 a[0] = (char*) "/sbin/init";
4099                                 execve(a[0], a, env_use);
4100                         } else if (argc > optind)
4101                                 execvpe(argv[optind], argv + optind, env_use);
4102                         else {
4103                                 chdir(home ? home : "/root");
4104                                 execle("/bin/bash", "-bash", NULL, env_use);
4105                                 execle("/bin/sh", "-sh", NULL, env_use);
4106                         }
4107
4108                         log_error_errno(errno, "execv() failed: %m");
4109                         _exit(EXIT_FAILURE);
4110                 }
4111
4112                 barrier_set_role(&barrier, BARRIER_PARENT);
4113                 fdset_free(fds);
4114                 fds = NULL;
4115
4116                 kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
4117                 rtnl_socket_pair[1] = safe_close(rtnl_socket_pair[1]);
4118
4119                 /* Wait for the most basic Child-setup to be done,
4120                  * before we add hardware to it, and place it in a
4121                  * cgroup. */
4122                 if (barrier_sync_next(&barrier)) {
4123                         int ifi = 0;
4124
4125                         r = move_network_interfaces(pid);
4126                         if (r < 0)
4127                                 goto finish;
4128
4129                         r = setup_veth(pid, veth_name, &ifi);
4130                         if (r < 0)
4131                                 goto finish;
4132
4133                         r = setup_bridge(veth_name, &ifi);
4134                         if (r < 0)
4135                                 goto finish;
4136
4137                         r = setup_macvlan(pid);
4138                         if (r < 0)
4139                                 goto finish;
4140
4141                         r = setup_ipvlan(pid);
4142                         if (r < 0)
4143                                 goto finish;
4144
4145                         r = register_machine(pid, ifi);
4146                         if (r < 0)
4147                                 goto finish;
4148
4149                         /* Block SIGCHLD here, before notifying child.
4150                          * process_pty() will handle it with the other signals. */
4151                         r = sigprocmask(SIG_BLOCK, &mask_chld, NULL);
4152                         if (r < 0)
4153                                 goto finish;
4154
4155                         /* Reset signal to default */
4156                         r = default_signals(SIGCHLD, -1);
4157                         if (r < 0)
4158                                 goto finish;
4159
4160                         /* Notify the child that the parent is ready with all
4161                          * its setup, and that the child can now hand over
4162                          * control to the code to run inside the container. */
4163                         (void) barrier_place(&barrier);
4164
4165                         /* And wait until the child is completely ready, too. */
4166                         if (barrier_place_and_sync(&barrier)) {
4167                                 _cleanup_event_unref_ sd_event *event = NULL;
4168                                 _cleanup_(pty_forward_freep) PTYForward *forward = NULL;
4169                                 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
4170                                 char last_char = 0;
4171
4172                                 sd_notifyf(false,
4173                                            "READY=1\n"
4174                                            "STATUS=Container running.\n"
4175                                            "X_NSPAWN_LEADER_PID=" PID_FMT, pid);
4176
4177                                 r = sd_event_new(&event);
4178                                 if (r < 0) {
4179                                         log_error_errno(r, "Failed to get default event source: %m");
4180                                         goto finish;
4181                                 }
4182
4183                                 if (arg_boot) {
4184                                         /* Try to kill the init system on SIGINT or SIGTERM */
4185                                         sd_event_add_signal(event, NULL, SIGINT, on_orderly_shutdown, UINT32_TO_PTR(pid));
4186                                         sd_event_add_signal(event, NULL, SIGTERM, on_orderly_shutdown, UINT32_TO_PTR(pid));
4187                                 } else {
4188                                         /* Immediately exit */
4189                                         sd_event_add_signal(event, NULL, SIGINT, NULL, NULL);
4190                                         sd_event_add_signal(event, NULL, SIGTERM, NULL, NULL);
4191                                 }
4192
4193                                 /* simply exit on sigchld */
4194                                 sd_event_add_signal(event, NULL, SIGCHLD, NULL, NULL);
4195
4196                                 if (arg_expose_ports) {
4197                                         r = watch_rtnl(event, rtnl_socket_pair[0], &exposed, &rtnl);
4198                                         if (r < 0)
4199                                                 goto finish;
4200
4201                                         (void) expose_ports(rtnl, &exposed);
4202                                 }
4203
4204                                 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
4205
4206                                 r = pty_forward_new(event, master, true, &forward);
4207                                 if (r < 0) {
4208                                         log_error_errno(r, "Failed to create PTY forwarder: %m");
4209                                         goto finish;
4210                                 }
4211
4212                                 r = sd_event_loop(event);
4213                                 if (r < 0) {
4214                                         log_error_errno(r, "Failed to run event loop: %m");
4215                                         goto finish;
4216                                 }
4217
4218                                 pty_forward_get_last_char(forward, &last_char);
4219
4220                                 forward = pty_forward_free(forward);
4221
4222                                 if (!arg_quiet && last_char != '\n')
4223                                         putc('\n', stdout);
4224
4225                                 /* Kill if it is not dead yet anyway */
4226                                 terminate_machine(pid);
4227                         }
4228                 }
4229
4230                 /* Normally redundant, but better safe than sorry */
4231                 kill(pid, SIGKILL);
4232
4233                 r = wait_for_container(pid, &container_status);
4234                 pid = 0;
4235
4236                 if (r < 0)
4237                         /* We failed to wait for the container, or the
4238                          * container exited abnormally */
4239                         goto finish;
4240                 else if (r > 0 || container_status == CONTAINER_TERMINATED){
4241                         /* The container exited with a non-zero
4242                          * status, or with zero status and no reboot
4243                          * was requested. */
4244                         ret = r;
4245                         break;
4246                 }
4247
4248                 /* CONTAINER_REBOOTED, loop again */
4249
4250                 if (arg_keep_unit) {
4251                         /* Special handling if we are running as a
4252                          * service: instead of simply restarting the
4253                          * machine we want to restart the entire
4254                          * service, so let's inform systemd about this
4255                          * with the special exit code 133. The service
4256                          * file uses RestartForceExitStatus=133 so
4257                          * that this results in a full nspawn
4258                          * restart. This is necessary since we might
4259                          * have cgroup parameters set we want to have
4260                          * flushed out. */
4261                         ret = 133;
4262                         r = 0;
4263                         break;
4264                 }
4265
4266                 flush_ports(&exposed);
4267         }
4268
4269 finish:
4270         sd_notify(false,
4271                   "STOPPING=1\n"
4272                   "STATUS=Terminating...");
4273
4274         loop_remove(loop_nr, &image_fd);
4275
4276         if (pid > 0)
4277                 kill(pid, SIGKILL);
4278
4279         if (remove_subvol && arg_directory) {
4280                 int k;
4281
4282                 k = btrfs_subvol_remove(arg_directory);
4283                 if (k < 0)
4284                         log_warning_errno(k, "Cannot remove subvolume '%s', ignoring: %m", arg_directory);
4285         }
4286
4287         if (arg_machine) {
4288                 const char *p;
4289
4290                 p = strappenda("/run/systemd/nspawn/propagate/", arg_machine);
4291                 (void) rm_rf(p, false, true, false);
4292         }
4293
4294         free(arg_directory);
4295         free(arg_template);
4296         free(arg_image);
4297         free(arg_machine);
4298         free(arg_user);
4299         strv_free(arg_setenv);
4300         strv_free(arg_network_interfaces);
4301         strv_free(arg_network_macvlan);
4302         strv_free(arg_network_ipvlan);
4303         strv_free(arg_bind);
4304         strv_free(arg_bind_ro);
4305         strv_free(arg_tmpfs);
4306
4307         flush_ports(&exposed);
4308
4309         while (arg_expose_ports) {
4310                 ExposePort *p = arg_expose_ports;
4311                 LIST_REMOVE(ports, arg_expose_ports, p);
4312                 free(p);
4313         }
4314
4315         return r < 0 ? EXIT_FAILURE : ret;
4316 }