chiark / gitweb /
nspawn: make sure we don't try to mount the container block device in the child after...
[elogind.git] / src / nspawn / nspawn.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4   This file is part of systemd.
5
6   Copyright 2010 Lennart Poettering
7
8   systemd is free software; you can redistribute it and/or modify it
9   under the terms of the GNU Lesser General Public License as published by
10   the Free Software Foundation; either version 2.1 of the License, or
11   (at your option) any later version.
12
13   systemd is distributed in the hope that it will be useful, but
14   WITHOUT ANY WARRANTY; without even the implied warranty of
15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16   Lesser General Public License for more details.
17
18   You should have received a copy of the GNU Lesser General Public License
19   along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <signal.h>
23 #include <sched.h>
24 #include <unistd.h>
25 #include <sys/types.h>
26 #include <sys/syscall.h>
27 #include <sys/mount.h>
28 #include <sys/wait.h>
29 #include <stdlib.h>
30 #include <string.h>
31 #include <stdio.h>
32 #include <errno.h>
33 #include <sys/prctl.h>
34 #include <sys/capability.h>
35 #include <getopt.h>
36 #include <termios.h>
37 #include <sys/signalfd.h>
38 #include <grp.h>
39 #include <linux/fs.h>
40 #include <sys/un.h>
41 #include <sys/socket.h>
42 #include <linux/netlink.h>
43 #include <sys/eventfd.h>
44 #include <net/if.h>
45 #include <linux/veth.h>
46 #include <sys/personality.h>
47 #include <linux/loop.h>
48
49 #ifdef HAVE_SELINUX
50 #include <selinux/selinux.h>
51 #endif
52
53 #ifdef HAVE_SECCOMP
54 #include <seccomp.h>
55 #endif
56
57 #ifdef HAVE_BLKID
58 #include <blkid/blkid.h>
59 #endif
60
61 #include "sd-daemon.h"
62 #include "sd-bus.h"
63 #include "sd-id128.h"
64 #include "sd-rtnl.h"
65 #include "log.h"
66 #include "util.h"
67 #include "mkdir.h"
68 #include "macro.h"
69 #include "audit.h"
70 #include "missing.h"
71 #include "cgroup-util.h"
72 #include "strv.h"
73 #include "path-util.h"
74 #include "loopback-setup.h"
75 #include "dev-setup.h"
76 #include "fdset.h"
77 #include "build.h"
78 #include "fileio.h"
79 #include "bus-util.h"
80 #include "bus-error.h"
81 #include "ptyfwd.h"
82 #include "bus-kernel.h"
83 #include "env-util.h"
84 #include "def.h"
85 #include "rtnl-util.h"
86 #include "udev-util.h"
87 #include "blkid-util.h"
88 #include "gpt.h"
89
90 #ifdef HAVE_SECCOMP
91 #include "seccomp-util.h"
92 #endif
93
/* Journal linking mode. Chosen with --link-journal=MODE in parse_argv();
 * the initial value of arg_link_journal is LINK_AUTO, and -j selects
 * LINK_GUEST. */
94 typedef enum LinkJournal {
95         LINK_NO,        /* --link-journal=no */
96         LINK_AUTO,      /* --link-journal=auto */
97         LINK_HOST,      /* --link-journal=host */
98         LINK_GUEST      /* --link-journal=guest */
99 } LinkJournal;
100
/* Command-line configuration. Everything below is filled in by parse_argv().
 * The "char *" strings are heap copies that parse_argv() frees and replaces;
 * the "const char *" strings are assigned straight from optarg. */
101 static char *arg_directory = NULL;                     /* -D/--directory=, canonicalized */
102 static char *arg_user = NULL;                          /* -u/--user= */
103 static sd_id128_t arg_uuid = {};                       /* --uuid= */
104 static char *arg_machine = NULL;                       /* -M/--machine= */
105 static const char *arg_selinux_context = NULL;         /* -Z/--selinux-context= */
106 static const char *arg_selinux_apifs_context = NULL;   /* -L/--selinux-apifs-context= */
107 static const char *arg_slice = NULL;                   /* -S/--slice= */
108 static bool arg_private_network = false;               /* --private-network, also implied by the --network-* options */
109 static bool arg_read_only = false;                     /* --read-only */
110 static bool arg_boot = false;                          /* -b/--boot */
111 static LinkJournal arg_link_journal = LINK_AUTO;       /* --link-journal= / -j */
/* Bit mask of capabilities (1ULL << CAP_xxx). This is the default set;
 * parse_argv() ORs in --capability= (and CAP_NET_ADMIN when a private
 * network is requested) and then clears everything named by
 * --drop-capability=. */
112 static uint64_t arg_retain =
113         (1ULL << CAP_CHOWN) |
114         (1ULL << CAP_DAC_OVERRIDE) |
115         (1ULL << CAP_DAC_READ_SEARCH) |
116         (1ULL << CAP_FOWNER) |
117         (1ULL << CAP_FSETID) |
118         (1ULL << CAP_IPC_OWNER) |
119         (1ULL << CAP_KILL) |
120         (1ULL << CAP_LEASE) |
121         (1ULL << CAP_LINUX_IMMUTABLE) |
122         (1ULL << CAP_NET_BIND_SERVICE) |
123         (1ULL << CAP_NET_BROADCAST) |
124         (1ULL << CAP_NET_RAW) |
125         (1ULL << CAP_SETGID) |
126         (1ULL << CAP_SETFCAP) |
127         (1ULL << CAP_SETPCAP) |
128         (1ULL << CAP_SETUID) |
129         (1ULL << CAP_SYS_ADMIN) |
130         (1ULL << CAP_SYS_CHROOT) |
131         (1ULL << CAP_SYS_NICE) |
132         (1ULL << CAP_SYS_PTRACE) |
133         (1ULL << CAP_SYS_TTY_CONFIG) |
134         (1ULL << CAP_SYS_RESOURCE) |
135         (1ULL << CAP_SYS_BOOT) |
136         (1ULL << CAP_AUDIT_WRITE) |
137         (1ULL << CAP_AUDIT_CONTROL) |
138         (1ULL << CAP_MKNOD);
139 static char **arg_bind = NULL;                         /* --bind=, stored as consecutive (source, destination) entries */
140 static char **arg_bind_ro = NULL;                      /* --bind-ro=, same layout as arg_bind */
141 static char **arg_setenv = NULL;                       /* --setenv= assignments */
142 static bool arg_quiet = false;                         /* -q/--quiet */
143 static bool arg_share_system = false;                  /* --share-system */
144 static bool arg_register = true;                       /* --register=; forced to false by --share-system */
145 static bool arg_keep_unit = false;                     /* --keep-unit */
146 static char **arg_network_interfaces = NULL;           /* --network-interface= (may be repeated) */
147 static char **arg_network_macvlan = NULL;              /* --network-macvlan= (may be repeated) */
148 static bool arg_network_veth = false;                  /* --network-veth, also set by --network-bridge= */
149 static const char *arg_network_bridge = NULL;          /* --network-bridge= */
150 static unsigned long arg_personality = 0xffffffffLU;   /* --personality=; parse_argv() rejects this value as "unknown" */
151 static const char *arg_image = NULL;                   /* -i/--image= */
152
/* Print the command-line usage summary to stdout.
 *
 * The text is kept in sync with the option table in parse_argv(): -j is
 * described as selecting the "guest" journal link mode (which is what the
 * parser does), and --personality= is listed.
 *
 * Always returns 0. */
static int help(void) {

        printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
               "Spawn a minimal namespace container for debugging, testing and building.\n\n"
               "  -h --help                 Show this help\n"
               "     --version              Print version string\n"
               "  -q --quiet                Do not show status information\n"
               "  -D --directory=PATH       Root directory for the container\n"
               "  -i --image=PATH           File system device or image for the container\n"
               "  -b --boot                 Boot up full system (i.e. invoke init)\n"
               "  -u --user=USER            Run the command under specified user or uid\n"
               "  -M --machine=NAME         Set the machine name for the container\n"
               "     --uuid=UUID            Set a specific machine UUID for the container\n"
               "  -S --slice=SLICE          Place the container in the specified slice\n"
               "     --private-network      Disable network in container\n"
               "     --network-interface=INTERFACE\n"
               "                            Assign an existing network interface to the\n"
               "                            container\n"
               "     --network-macvlan=INTERFACE\n"
               "                            Create a macvlan network interface based on an\n"
               "                            existing network interface to the container\n"
               "     --network-veth         Add a virtual ethernet connection between host\n"
               "                            and container\n"
               "     --network-bridge=INTERFACE\n"
               "                            Add a virtual ethernet connection between host\n"
               "                            and container and add it to an existing bridge on\n"
               "                            the host\n"
               "  -Z --selinux-context=SECLABEL\n"
               "                            Set the SELinux security context to be used by\n"
               "                            processes in the container\n"
               "  -L --selinux-apifs-context=SECLABEL\n"
               "                            Set the SELinux security context to be used by\n"
               "                            API/tmpfs file systems in the container\n"
               "     --capability=CAP       In addition to the default, retain specified\n"
               "                            capability\n"
               "     --drop-capability=CAP  Drop the specified capability from the default set\n"
               "     --link-journal=MODE    Link up guest journal, one of no, auto, guest, host\n"
               "  -j                        Equivalent to --link-journal=guest\n"
               "     --read-only            Mount the root directory read-only\n"
               "     --bind=PATH[:PATH]     Bind mount a file or directory from the host into\n"
               "                            the container\n"
               "     --bind-ro=PATH[:PATH]  Similar, but creates a read-only bind mount\n"
               "     --setenv=NAME=VALUE    Pass an environment variable to PID 1\n"
               "     --share-system         Share system namespaces with host\n"
               "     --register=BOOLEAN     Register container as machine\n"
               "     --keep-unit            Do not register a scope for the machine, reuse\n"
               "                            the service unit nspawn is running in\n"
               "     --personality=ARCH     Pick personality for this container\n",
               program_invocation_short_name);

        return 0;
}
204
205 static int parse_argv(int argc, char *argv[]) {
206
207         enum {
208                 ARG_VERSION = 0x100,
209                 ARG_PRIVATE_NETWORK,
210                 ARG_UUID,
211                 ARG_READ_ONLY,
212                 ARG_CAPABILITY,
213                 ARG_DROP_CAPABILITY,
214                 ARG_LINK_JOURNAL,
215                 ARG_BIND,
216                 ARG_BIND_RO,
217                 ARG_SETENV,
218                 ARG_SHARE_SYSTEM,
219                 ARG_REGISTER,
220                 ARG_KEEP_UNIT,
221                 ARG_NETWORK_INTERFACE,
222                 ARG_NETWORK_MACVLAN,
223                 ARG_NETWORK_VETH,
224                 ARG_NETWORK_BRIDGE,
225                 ARG_PERSONALITY,
226         };
227
228         static const struct option options[] = {
229                 { "help",                  no_argument,       NULL, 'h'                   },
230                 { "version",               no_argument,       NULL, ARG_VERSION           },
231                 { "directory",             required_argument, NULL, 'D'                   },
232                 { "user",                  required_argument, NULL, 'u'                   },
233                 { "private-network",       no_argument,       NULL, ARG_PRIVATE_NETWORK   },
234                 { "boot",                  no_argument,       NULL, 'b'                   },
235                 { "uuid",                  required_argument, NULL, ARG_UUID              },
236                 { "read-only",             no_argument,       NULL, ARG_READ_ONLY         },
237                 { "capability",            required_argument, NULL, ARG_CAPABILITY        },
238                 { "drop-capability",       required_argument, NULL, ARG_DROP_CAPABILITY   },
239                 { "link-journal",          required_argument, NULL, ARG_LINK_JOURNAL      },
240                 { "bind",                  required_argument, NULL, ARG_BIND              },
241                 { "bind-ro",               required_argument, NULL, ARG_BIND_RO           },
242                 { "machine",               required_argument, NULL, 'M'                   },
243                 { "slice",                 required_argument, NULL, 'S'                   },
244                 { "setenv",                required_argument, NULL, ARG_SETENV            },
245                 { "selinux-context",       required_argument, NULL, 'Z'                   },
246                 { "selinux-apifs-context", required_argument, NULL, 'L'                   },
247                 { "quiet",                 no_argument,       NULL, 'q'                   },
248                 { "share-system",          no_argument,       NULL, ARG_SHARE_SYSTEM      },
249                 { "register",              required_argument, NULL, ARG_REGISTER          },
250                 { "keep-unit",             no_argument,       NULL, ARG_KEEP_UNIT         },
251                 { "network-interface",     required_argument, NULL, ARG_NETWORK_INTERFACE },
252                 { "network-macvlan",       required_argument, NULL, ARG_NETWORK_MACVLAN   },
253                 { "network-veth",          no_argument,       NULL, ARG_NETWORK_VETH      },
254                 { "network-bridge",        required_argument, NULL, ARG_NETWORK_BRIDGE    },
255                 { "personality",           required_argument, NULL, ARG_PERSONALITY       },
256                 { "image",                 required_argument, NULL, 'i'                   },
257                 {}
258         };
259
260         int c, r;
261         uint64_t plus = 0, minus = 0;
262
263         assert(argc >= 0);
264         assert(argv);
265
266         while ((c = getopt_long(argc, argv, "+hD:u:bL:M:jS:Z:qi:", options, NULL)) >= 0) {
267
268                 switch (c) {
269
270                 case 'h':
271                         return help();
272
273                 case ARG_VERSION:
274                         puts(PACKAGE_STRING);
275                         puts(SYSTEMD_FEATURES);
276                         return 0;
277
278                 case 'D':
279                         free(arg_directory);
280                         arg_directory = canonicalize_file_name(optarg);
281                         if (!arg_directory) {
282                                 log_error("Invalid root directory: %m");
283                                 return -ENOMEM;
284                         }
285
286                         break;
287
288                 case 'i':
289                         arg_image = optarg;
290                         break;
291
292                 case 'u':
293                         free(arg_user);
294                         arg_user = strdup(optarg);
295                         if (!arg_user)
296                                 return log_oom();
297
298                         break;
299
300                 case ARG_NETWORK_BRIDGE:
301                         arg_network_bridge = optarg;
302
303                         /* fall through */
304
305                 case ARG_NETWORK_VETH:
306                         arg_network_veth = true;
307                         arg_private_network = true;
308                         break;
309
310                 case ARG_NETWORK_INTERFACE:
311                         if (strv_extend(&arg_network_interfaces, optarg) < 0)
312                                 return log_oom();
313
314                         arg_private_network = true;
315                         break;
316
317                 case ARG_NETWORK_MACVLAN:
318                         if (strv_extend(&arg_network_macvlan, optarg) < 0)
319                                 return log_oom();
320
321                         /* fall through */
322
323                 case ARG_PRIVATE_NETWORK:
324                         arg_private_network = true;
325                         break;
326
327                 case 'b':
328                         arg_boot = true;
329                         break;
330
331                 case ARG_UUID:
332                         r = sd_id128_from_string(optarg, &arg_uuid);
333                         if (r < 0) {
334                                 log_error("Invalid UUID: %s", optarg);
335                                 return r;
336                         }
337                         break;
338
339                 case 'S':
340                         arg_slice = optarg;
341                         break;
342
343                 case 'M':
344                         if (isempty(optarg)) {
345                                 free(arg_machine);
346                                 arg_machine = NULL;
347                         } else {
348
349                                 if (!hostname_is_valid(optarg)) {
350                                         log_error("Invalid machine name: %s", optarg);
351                                         return -EINVAL;
352                                 }
353
354                                 free(arg_machine);
355                                 arg_machine = strdup(optarg);
356                                 if (!arg_machine)
357                                         return log_oom();
358
359                                 break;
360                         }
361
362                 case 'Z':
363                         arg_selinux_context = optarg;
364                         break;
365
366                 case 'L':
367                         arg_selinux_apifs_context = optarg;
368                         break;
369
370                 case ARG_READ_ONLY:
371                         arg_read_only = true;
372                         break;
373
374                 case ARG_CAPABILITY:
375                 case ARG_DROP_CAPABILITY: {
376                         char *state, *word;
377                         size_t length;
378
379                         FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
380                                 _cleanup_free_ char *t;
381                                 cap_value_t cap;
382
383                                 t = strndup(word, length);
384                                 if (!t)
385                                         return log_oom();
386
387                                 if (streq(t, "all")) {
388                                         if (c == ARG_CAPABILITY)
389                                                 plus = (uint64_t) -1;
390                                         else
391                                                 minus = (uint64_t) -1;
392                                 } else {
393                                         if (cap_from_name(t, &cap) < 0) {
394                                                 log_error("Failed to parse capability %s.", t);
395                                                 return -EINVAL;
396                                         }
397
398                                         if (c == ARG_CAPABILITY)
399                                                 plus |= 1ULL << (uint64_t) cap;
400                                         else
401                                                 minus |= 1ULL << (uint64_t) cap;
402                                 }
403                         }
404
405                         break;
406                 }
407
408                 case 'j':
409                         arg_link_journal = LINK_GUEST;
410                         break;
411
412                 case ARG_LINK_JOURNAL:
413                         if (streq(optarg, "auto"))
414                                 arg_link_journal = LINK_AUTO;
415                         else if (streq(optarg, "no"))
416                                 arg_link_journal = LINK_NO;
417                         else if (streq(optarg, "guest"))
418                                 arg_link_journal = LINK_GUEST;
419                         else if (streq(optarg, "host"))
420                                 arg_link_journal = LINK_HOST;
421                         else {
422                                 log_error("Failed to parse link journal mode %s", optarg);
423                                 return -EINVAL;
424                         }
425
426                         break;
427
428                 case ARG_BIND:
429                 case ARG_BIND_RO: {
430                         _cleanup_free_ char *a = NULL, *b = NULL;
431                         char *e;
432                         char ***x;
433
434                         x = c == ARG_BIND ? &arg_bind : &arg_bind_ro;
435
436                         e = strchr(optarg, ':');
437                         if (e) {
438                                 a = strndup(optarg, e - optarg);
439                                 b = strdup(e + 1);
440                         } else {
441                                 a = strdup(optarg);
442                                 b = strdup(optarg);
443                         }
444
445                         if (!a || !b)
446                                 return log_oom();
447
448                         if (!path_is_absolute(a) || !path_is_absolute(b)) {
449                                 log_error("Invalid bind mount specification: %s", optarg);
450                                 return -EINVAL;
451                         }
452
453                         r = strv_extend(x, a);
454                         if (r < 0)
455                                 return log_oom();
456
457                         r = strv_extend(x, b);
458                         if (r < 0)
459                                 return log_oom();
460
461                         break;
462                 }
463
464                 case ARG_SETENV: {
465                         char **n;
466
467                         if (!env_assignment_is_valid(optarg)) {
468                                 log_error("Environment variable assignment '%s' is not valid.", optarg);
469                                 return -EINVAL;
470                         }
471
472                         n = strv_env_set(arg_setenv, optarg);
473                         if (!n)
474                                 return log_oom();
475
476                         strv_free(arg_setenv);
477                         arg_setenv = n;
478                         break;
479                 }
480
481                 case 'q':
482                         arg_quiet = true;
483                         break;
484
485                 case ARG_SHARE_SYSTEM:
486                         arg_share_system = true;
487                         break;
488
489                 case ARG_REGISTER:
490                         r = parse_boolean(optarg);
491                         if (r < 0) {
492                                 log_error("Failed to parse --register= argument: %s", optarg);
493                                 return r;
494                         }
495
496                         arg_register = r;
497                         break;
498
499                 case ARG_KEEP_UNIT:
500                         arg_keep_unit = true;
501                         break;
502
503                 case ARG_PERSONALITY:
504
505                         arg_personality = personality_from_string(optarg);
506                         if (arg_personality == 0xffffffffLU) {
507                                 log_error("Unknown or unsupported personality '%s'.", optarg);
508                                 return -EINVAL;
509                         }
510
511                         break;
512
513                 case '?':
514                         return -EINVAL;
515
516                 default:
517                         assert_not_reached("Unhandled option");
518                 }
519         }
520
521         if (arg_share_system)
522                 arg_register = false;
523
524         if (arg_boot && arg_share_system) {
525                 log_error("--boot and --share-system may not be combined.");
526                 return -EINVAL;
527         }
528
529         if (arg_keep_unit && cg_pid_get_owner_uid(0, NULL) >= 0) {
530                 log_error("--keep-unit may not be used when invoked from a user session.");
531                 return -EINVAL;
532         }
533
534         if (arg_directory && arg_image) {
535                 log_error("--directory= and --image= may not be combined.");
536                 return -EINVAL;
537         }
538
539         arg_retain = (arg_retain | plus | (arg_private_network ? 1ULL << CAP_NET_ADMIN : 0)) & ~minus;
540
541         return 1;
542 }
543
544 static int mount_all(const char *dest) {
545
546         typedef struct MountPoint {
547                 const char *what;
548                 const char *where;
549                 const char *type;
550                 const char *options;
551                 unsigned long flags;
552                 bool fatal;
553         } MountPoint;
554
555         static const MountPoint mount_table[] = {
556                 { "proc",      "/proc",     "proc",  NULL,       MS_NOSUID|MS_NOEXEC|MS_NODEV, true  },
557                 { "/proc/sys", "/proc/sys", NULL,    NULL,       MS_BIND, true                       },   /* Bind mount first */
558                 { NULL,        "/proc/sys", NULL,    NULL,       MS_BIND|MS_RDONLY|MS_REMOUNT, true  },   /* Then, make it r/o */
559                 { "sysfs",     "/sys",      "sysfs", NULL,       MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true  },
560                 { "tmpfs",     "/dev",      "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME,     true  },
561                 { "devpts",    "/dev/pts",  "devpts","newinstance,ptmxmode=0666,mode=620,gid=" STRINGIFY(TTY_GID), MS_NOSUID|MS_NOEXEC, true },
562                 { "tmpfs",     "/dev/shm",  "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true  },
563                 { "tmpfs",     "/run",      "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true  },
564 #ifdef HAVE_SELINUX
565                 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND,                      false },  /* Bind mount first */
566                 { NULL,              "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, false },  /* Then, make it r/o */
567 #endif
568         };
569
570         unsigned k;
571         int r = 0;
572
573         for (k = 0; k < ELEMENTSOF(mount_table); k++) {
574                 _cleanup_free_ char *where = NULL;
575 #ifdef HAVE_SELINUX
576                 _cleanup_free_ char *options = NULL;
577 #endif
578                 const char *o;
579                 int t;
580
581                 where = strjoin(dest, "/", mount_table[k].where, NULL);
582                 if (!where)
583                         return log_oom();
584
585                 t = path_is_mount_point(where, true);
586                 if (t < 0) {
587                         log_error("Failed to detect whether %s is a mount point: %s", where, strerror(-t));
588
589                         if (r == 0)
590                                 r = t;
591
592                         continue;
593                 }
594
595                 /* Skip this entry if it is not a remount. */
596                 if (mount_table[k].what && t > 0)
597                         continue;
598
599                 mkdir_p(where, 0755);
600
601 #ifdef HAVE_SELINUX
602                 if (arg_selinux_apifs_context &&
603                     (streq_ptr(mount_table[k].what, "tmpfs") || streq_ptr(mount_table[k].what, "devpts"))) {
604                         options = strjoin(mount_table[k].options, ",context=\"", arg_selinux_apifs_context, "\"", NULL);
605                         if (!options)
606                                 return log_oom();
607
608                         o = options;
609                 } else
610 #endif
611                         o = mount_table[k].options;
612
613
614                 if (mount(mount_table[k].what,
615                           where,
616                           mount_table[k].type,
617                           mount_table[k].flags,
618                           o) < 0 &&
619                     mount_table[k].fatal) {
620
621                         log_error("mount(%s) failed: %m", where);
622
623                         if (r == 0)
624                                 r = -errno;
625                 }
626         }
627
628         return r;
629 }
630
631 static int mount_binds(const char *dest, char **l, unsigned long flags) {
632         char **x, **y;
633
634         STRV_FOREACH_PAIR(x, y, l) {
635                 char *where;
636                 struct stat source_st, dest_st;
637                 int r;
638
639                 if (stat(*x, &source_st) < 0) {
640                         log_error("Failed to stat %s: %m", *x);
641                         return -errno;
642                 }
643
644                 where = strappenda(dest, *y);
645                 r = stat(where, &dest_st);
646                 if (r == 0) {
647                         if ((source_st.st_mode & S_IFMT) != (dest_st.st_mode & S_IFMT)) {
648                                 log_error("The file types of %s and %s do not match. Refusing bind mount",
649                                                 *x, where);
650                                 return -EINVAL;
651                         }
652                 } else if (errno == ENOENT) {
653                         r = mkdir_parents_label(where, 0755);
654                         if (r < 0) {
655                                 log_error("Failed to bind mount %s: %s", *x, strerror(-r));
656                                 return r;
657                         }
658                 } else {
659                         log_error("Failed to bind mount %s: %s", *x, strerror(errno));
660                         return -errno;
661                 }
662                 /* Create the mount point, but be conservative -- refuse to create block
663                 * and char devices. */
664                 if (S_ISDIR(source_st.st_mode))
665                         mkdir_label(where, 0755);
666                 else if (S_ISFIFO(source_st.st_mode))
667                         mkfifo(where, 0644);
668                 else if (S_ISSOCK(source_st.st_mode))
669                         mknod(where, 0644 | S_IFSOCK, 0);
670                 else if (S_ISREG(source_st.st_mode))
671                         touch(where);
672                 else {
673                         log_error("Refusing to create mountpoint for file: %s", *x);
674                         return -ENOTSUP;
675                 }
676
677                 if (mount(*x, where, "bind", MS_BIND, NULL) < 0) {
678                         log_error("mount(%s) failed: %m", where);
679                         return -errno;
680                 }
681
682                 if (flags && mount(NULL, where, NULL, MS_REMOUNT|MS_BIND|flags, NULL) < 0) {
683                         log_error("mount(%s) failed: %m", where);
684                         return -errno;
685                 }
686         }
687
688         return 0;
689 }
690
691 static int setup_timezone(const char *dest) {
692         _cleanup_free_ char *where = NULL, *p = NULL, *q = NULL, *check = NULL, *what = NULL;
693         char *z, *y;
694         int r;
695
696         assert(dest);
697
698         /* Fix the timezone, if possible */
699         r = readlink_malloc("/etc/localtime", &p);
700         if (r < 0) {
701                 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
702                 return 0;
703         }
704
705         z = path_startswith(p, "../usr/share/zoneinfo/");
706         if (!z)
707                 z = path_startswith(p, "/usr/share/zoneinfo/");
708         if (!z) {
709                 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
710                 return 0;
711         }
712
713         where = strappend(dest, "/etc/localtime");
714         if (!where)
715                 return log_oom();
716
717         r = readlink_malloc(where, &q);
718         if (r >= 0) {
719                 y = path_startswith(q, "../usr/share/zoneinfo/");
720                 if (!y)
721                         y = path_startswith(q, "/usr/share/zoneinfo/");
722
723
724                 /* Already pointing to the right place? Then do nothing .. */
725                 if (y && streq(y, z))
726                         return 0;
727         }
728
729         check = strjoin(dest, "/usr/share/zoneinfo/", z, NULL);
730         if (!check)
731                 return log_oom();
732
733         if (access(check, F_OK) < 0) {
734                 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
735                 return 0;
736         }
737
738         what = strappend("../usr/share/zoneinfo/", z);
739         if (!what)
740                 return log_oom();
741
742         unlink(where);
743         if (symlink(what, where) < 0) {
744                 log_error("Failed to correct timezone of container: %m");
745                 return 0;
746         }
747
748         return 0;
749 }
750
751 static int setup_resolv_conf(const char *dest) {
752         char _cleanup_free_ *where = NULL;
753
754         assert(dest);
755
756         if (arg_private_network)
757                 return 0;
758
759         /* Fix resolv.conf, if possible */
760         where = strappend(dest, "/etc/resolv.conf");
761         if (!where)
762                 return log_oom();
763
764         /* We don't really care for the results of this really. If it
765          * fails, it fails, but meh... */
766         copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW);
767
768         return 0;
769 }
770
771 static int setup_boot_id(const char *dest) {
772         _cleanup_free_ char *from = NULL, *to = NULL;
773         sd_id128_t rnd = {};
774         char as_uuid[37];
775         int r;
776
777         assert(dest);
778
779         if (arg_share_system)
780                 return 0;
781
782         /* Generate a new randomized boot ID, so that each boot-up of
783          * the container gets a new one */
784
785         from = strappend(dest, "/dev/proc-sys-kernel-random-boot-id");
786         to = strappend(dest, "/proc/sys/kernel/random/boot_id");
787         if (!from || !to)
788                 return log_oom();
789
790         r = sd_id128_randomize(&rnd);
791         if (r < 0) {
792                 log_error("Failed to generate random boot id: %s", strerror(-r));
793                 return r;
794         }
795
796         snprintf(as_uuid, sizeof(as_uuid),
797                  "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
798                  SD_ID128_FORMAT_VAL(rnd));
799         char_array_0(as_uuid);
800
801         r = write_string_file(from, as_uuid);
802         if (r < 0) {
803                 log_error("Failed to write boot id: %s", strerror(-r));
804                 return r;
805         }
806
807         if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
808                 log_error("Failed to bind mount boot id: %m");
809                 r = -errno;
810         } else if (mount(from, to, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL))
811                 log_warning("Failed to make boot id read-only: %m");
812
813         unlink(from);
814         return r;
815 }
816
817 static int copy_devnodes(const char *dest) {
818
819         static const char devnodes[] =
820                 "null\0"
821                 "zero\0"
822                 "full\0"
823                 "random\0"
824                 "urandom\0"
825                 "tty\0";
826
827         const char *d;
828         int r = 0;
829         _cleanup_umask_ mode_t u;
830
831         assert(dest);
832
833         u = umask(0000);
834
835         NULSTR_FOREACH(d, devnodes) {
836                 _cleanup_free_ char *from = NULL, *to = NULL;
837                 struct stat st;
838
839                 from = strappend("/dev/", d);
840                 to = strjoin(dest, "/dev/", d, NULL);
841                 if (!from || !to)
842                         return log_oom();
843
844                 if (stat(from, &st) < 0) {
845
846                         if (errno != ENOENT) {
847                                 log_error("Failed to stat %s: %m", from);
848                                 return -errno;
849                         }
850
851                 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
852
853                         log_error("%s is not a char or block device, cannot copy", from);
854                         return -EIO;
855
856                 } else if (mknod(to, st.st_mode, st.st_rdev) < 0) {
857
858                         log_error("mknod(%s) failed: %m", dest);
859                         return  -errno;
860                 }
861         }
862
863         return r;
864 }
865
866 static int setup_ptmx(const char *dest) {
867         _cleanup_free_ char *p = NULL;
868
869         p = strappend(dest, "/dev/ptmx");
870         if (!p)
871                 return log_oom();
872
873         if (symlink("pts/ptmx", p) < 0) {
874                 log_error("Failed to create /dev/ptmx symlink: %m");
875                 return -errno;
876         }
877
878         return 0;
879 }
880
881 static int setup_dev_console(const char *dest, const char *console) {
882         _cleanup_umask_ mode_t u;
883         const char *to;
884         struct stat st;
885         int r;
886
887         assert(dest);
888         assert(console);
889
890         u = umask(0000);
891
892         if (stat("/dev/null", &st) < 0) {
893                 log_error("Failed to stat /dev/null: %m");
894                 return -errno;
895         }
896
897         r = chmod_and_chown(console, 0600, 0, 0);
898         if (r < 0) {
899                 log_error("Failed to correct access mode for TTY: %s", strerror(-r));
900                 return r;
901         }
902
903         /* We need to bind mount the right tty to /dev/console since
904          * ptys can only exist on pts file systems. To have something
905          * to bind mount things on we create a device node first, and
906          * use /dev/null for that since we the cgroups device policy
907          * allows us to create that freely, while we cannot create
908          * /dev/console. (Note that the major minor doesn't actually
909          * matter here, since we mount it over anyway). */
910
911         to = strappenda(dest, "/dev/console");
912         if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0) {
913                 log_error("mknod() for /dev/console failed: %m");
914                 return -errno;
915         }
916
917         if (mount(console, to, "bind", MS_BIND, NULL) < 0) {
918                 log_error("Bind mount for /dev/console failed: %m");
919                 return -errno;
920         }
921
922         return 0;
923 }
924
925 static int setup_kmsg(const char *dest, int kmsg_socket) {
926         _cleanup_free_ char *from = NULL, *to = NULL;
927         int r, fd, k;
928         _cleanup_umask_ mode_t u;
929         union {
930                 struct cmsghdr cmsghdr;
931                 uint8_t buf[CMSG_SPACE(sizeof(int))];
932         } control = {};
933         struct msghdr mh = {
934                 .msg_control = &control,
935                 .msg_controllen = sizeof(control),
936         };
937         struct cmsghdr *cmsg;
938
939         assert(dest);
940         assert(kmsg_socket >= 0);
941
942         u = umask(0000);
943
944         /* We create the kmsg FIFO as /dev/kmsg, but immediately
945          * delete it after bind mounting it to /proc/kmsg. While FIFOs
946          * on the reading side behave very similar to /proc/kmsg,
947          * their writing side behaves differently from /dev/kmsg in
948          * that writing blocks when nothing is reading. In order to
949          * avoid any problems with containers deadlocking due to this
950          * we simply make /dev/kmsg unavailable to the container. */
951         if (asprintf(&from, "%s/dev/kmsg", dest) < 0 ||
952             asprintf(&to, "%s/proc/kmsg", dest) < 0)
953                 return log_oom();
954
955         if (mkfifo(from, 0600) < 0) {
956                 log_error("mkfifo() for /dev/kmsg failed: %m");
957                 return -errno;
958         }
959
960         r = chmod_and_chown(from, 0600, 0, 0);
961         if (r < 0) {
962                 log_error("Failed to correct access mode for /dev/kmsg: %s", strerror(-r));
963                 return r;
964         }
965
966         if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
967                 log_error("Bind mount for /proc/kmsg failed: %m");
968                 return -errno;
969         }
970
971         fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
972         if (fd < 0) {
973                 log_error("Failed to open fifo: %m");
974                 return -errno;
975         }
976
977         cmsg = CMSG_FIRSTHDR(&mh);
978         cmsg->cmsg_level = SOL_SOCKET;
979         cmsg->cmsg_type = SCM_RIGHTS;
980         cmsg->cmsg_len = CMSG_LEN(sizeof(int));
981         memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
982
983         mh.msg_controllen = cmsg->cmsg_len;
984
985         /* Store away the fd in the socket, so that it stays open as
986          * long as we run the child */
987         k = sendmsg(kmsg_socket, &mh, MSG_DONTWAIT|MSG_NOSIGNAL);
988         close_nointr_nofail(fd);
989
990         if (k < 0) {
991                 log_error("Failed to send FIFO fd: %m");
992                 return -errno;
993         }
994
995         /* And now make the FIFO unavailable as /dev/kmsg... */
996         unlink(from);
997         return 0;
998 }
999
1000 static int setup_hostname(void) {
1001
1002         if (arg_share_system)
1003                 return 0;
1004
1005         if (sethostname(arg_machine, strlen(arg_machine)) < 0)
1006                 return -errno;
1007
1008         return 0;
1009 }
1010
1011 static int setup_journal(const char *directory) {
1012         sd_id128_t machine_id, this_id;
1013         _cleanup_free_ char *p = NULL, *b = NULL, *q = NULL, *d = NULL;
1014         char *id;
1015         int r;
1016
1017         p = strappend(directory, "/etc/machine-id");
1018         if (!p)
1019                 return log_oom();
1020
1021         r = read_one_line_file(p, &b);
1022         if (r == -ENOENT && arg_link_journal == LINK_AUTO)
1023                 return 0;
1024         else if (r < 0) {
1025                 log_error("Failed to read machine ID from %s: %s", p, strerror(-r));
1026                 return r;
1027         }
1028
1029         id = strstrip(b);
1030         if (isempty(id) && arg_link_journal == LINK_AUTO)
1031                 return 0;
1032
1033         /* Verify validity */
1034         r = sd_id128_from_string(id, &machine_id);
1035         if (r < 0) {
1036                 log_error("Failed to parse machine ID from %s: %s", p, strerror(-r));
1037                 return r;
1038         }
1039
1040         r = sd_id128_get_machine(&this_id);
1041         if (r < 0) {
1042                 log_error("Failed to retrieve machine ID: %s", strerror(-r));
1043                 return r;
1044         }
1045
1046         if (sd_id128_equal(machine_id, this_id)) {
1047                 log_full(arg_link_journal == LINK_AUTO ? LOG_WARNING : LOG_ERR,
1048                          "Host and machine ids are equal (%s): refusing to link journals", id);
1049                 if (arg_link_journal == LINK_AUTO)
1050                         return 0;
1051                 return
1052                         -EEXIST;
1053         }
1054
1055         if (arg_link_journal == LINK_NO)
1056                 return 0;
1057
1058         free(p);
1059         p = strappend("/var/log/journal/", id);
1060         q = strjoin(directory, "/var/log/journal/", id, NULL);
1061         if (!p || !q)
1062                 return log_oom();
1063
1064         if (path_is_mount_point(p, false) > 0) {
1065                 if (arg_link_journal != LINK_AUTO) {
1066                         log_error("%s: already a mount point, refusing to use for journal", p);
1067                         return -EEXIST;
1068                 }
1069
1070                 return 0;
1071         }
1072
1073         if (path_is_mount_point(q, false) > 0) {
1074                 if (arg_link_journal != LINK_AUTO) {
1075                         log_error("%s: already a mount point, refusing to use for journal", q);
1076                         return -EEXIST;
1077                 }
1078
1079                 return 0;
1080         }
1081
1082         r = readlink_and_make_absolute(p, &d);
1083         if (r >= 0) {
1084                 if ((arg_link_journal == LINK_GUEST ||
1085                      arg_link_journal == LINK_AUTO) &&
1086                     path_equal(d, q)) {
1087
1088                         r = mkdir_p(q, 0755);
1089                         if (r < 0)
1090                                 log_warning("failed to create directory %s: %m", q);
1091                         return 0;
1092                 }
1093
1094                 if (unlink(p) < 0) {
1095                         log_error("Failed to remove symlink %s: %m", p);
1096                         return -errno;
1097                 }
1098         } else if (r == -EINVAL) {
1099
1100                 if (arg_link_journal == LINK_GUEST &&
1101                     rmdir(p) < 0) {
1102
1103                         if (errno == ENOTDIR) {
1104                                 log_error("%s already exists and is neither a symlink nor a directory", p);
1105                                 return r;
1106                         } else {
1107                                 log_error("Failed to remove %s: %m", p);
1108                                 return -errno;
1109                         }
1110                 }
1111         } else if (r != -ENOENT) {
1112                 log_error("readlink(%s) failed: %m", p);
1113                 return r;
1114         }
1115
1116         if (arg_link_journal == LINK_GUEST) {
1117
1118                 if (symlink(q, p) < 0) {
1119                         log_error("Failed to symlink %s to %s: %m", q, p);
1120                         return -errno;
1121                 }
1122
1123                 r = mkdir_p(q, 0755);
1124                 if (r < 0)
1125                         log_warning("failed to create directory %s: %m", q);
1126                 return 0;
1127         }
1128
1129         if (arg_link_journal == LINK_HOST) {
1130                 r = mkdir_p(p, 0755);
1131                 if (r < 0) {
1132                         log_error("Failed to create %s: %m", p);
1133                         return r;
1134                 }
1135
1136         } else if (access(p, F_OK) < 0)
1137                 return 0;
1138
1139         if (dir_is_empty(q) == 0) {
1140                 log_error("%s not empty.", q);
1141                 return -ENOTEMPTY;
1142         }
1143
1144         r = mkdir_p(q, 0755);
1145         if (r < 0) {
1146                 log_error("Failed to create %s: %m", q);
1147                 return r;
1148         }
1149
1150         if (mount(p, q, "bind", MS_BIND, NULL) < 0) {
1151                 log_error("Failed to bind mount journal from host into guest: %m");
1152                 return -errno;
1153         }
1154
1155         return 0;
1156 }
1157
/* Bind mounts the kdbus domain at 'path' to <dest>/dev/kdbus, creating
 * that directory first. Does nothing if 'path' is NULL. Returns 0 on
 * success or a negative errno on failure. */
static int setup_kdbus(const char *dest, const char *path) {
        const char *mount_point;

        if (path == NULL)
                return 0;

        mount_point = strappenda(dest, "/dev/kdbus");

        if (mkdir(mount_point, 0755) < 0) {
                log_error("Failed to create kdbus path: %m");
                return -errno;
        }

        if (mount(path, mount_point, "bind", MS_BIND, NULL) < 0) {
                log_error("Failed to mount kdbus domain path: %m");
                return -errno;
        }

        return 0;
}
1177
1178 static int drop_capabilities(void) {
1179         return capability_bounding_set_drop(~arg_retain, false);
1180 }
1181
1182 static int register_machine(pid_t pid) {
1183         _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1184         _cleanup_bus_unref_ sd_bus *bus = NULL;
1185         int r;
1186
1187         if (!arg_register)
1188                 return 0;
1189
1190         r = sd_bus_default_system(&bus);
1191         if (r < 0) {
1192                 log_error("Failed to open system bus: %s", strerror(-r));
1193                 return r;
1194         }
1195
1196         if (arg_keep_unit) {
1197                 r = sd_bus_call_method(
1198                                 bus,
1199                                 "org.freedesktop.machine1",
1200                                 "/org/freedesktop/machine1",
1201                                 "org.freedesktop.machine1.Manager",
1202                                 "RegisterMachine",
1203                                 &error,
1204                                 NULL,
1205                                 "sayssus",
1206                                 arg_machine,
1207                                 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1208                                 "nspawn",
1209                                 "container",
1210                                 (uint32_t) pid,
1211                                 strempty(arg_directory));
1212         } else {
1213                 _cleanup_bus_message_unref_ sd_bus_message *m = NULL;
1214
1215                 r = sd_bus_message_new_method_call(
1216                                 bus,
1217                                 &m,
1218                                 "org.freedesktop.machine1",
1219                                 "/org/freedesktop/machine1",
1220                                 "org.freedesktop.machine1.Manager",
1221                                 "CreateMachine");
1222                 if (r < 0) {
1223                         log_error("Failed to create message: %s", strerror(-r));
1224                         return r;
1225                 }
1226
1227                 r = sd_bus_message_append(
1228                                 m,
1229                                 "sayssus",
1230                                 arg_machine,
1231                                 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1232                                 "nspawn",
1233                                 "container",
1234                                 (uint32_t) pid,
1235                                 strempty(arg_directory));
1236                 if (r < 0) {
1237                         log_error("Failed to append message arguments: %s", strerror(-r));
1238                         return r;
1239                 }
1240
1241                 r = sd_bus_message_open_container(m, 'a', "(sv)");
1242                 if (r < 0) {
1243                         log_error("Failed to open container: %s", strerror(-r));
1244                         return r;
1245                 }
1246
1247                 if (!isempty(arg_slice)) {
1248                         r = sd_bus_message_append(m, "(sv)", "Slice", "s", arg_slice);
1249                         if (r < 0) {
1250                                 log_error("Failed to append slice: %s", strerror(-r));
1251                                 return r;
1252                         }
1253                 }
1254
1255                 r = sd_bus_message_append(m, "(sv)", "DevicePolicy", "s", "strict");
1256                 if (r < 0) {
1257                         log_error("Failed to add device policy: %s", strerror(-r));
1258                         return r;
1259                 }
1260
1261                 r = sd_bus_message_append(m, "(sv)", "DeviceAllow", "a(ss)", 8,
1262                                           /* Allow the container to
1263                                            * access and create the API
1264                                            * device nodes, so that
1265                                            * PrivateDevices= in the
1266                                            * container can work
1267                                            * fine */
1268                                           "/dev/null", "rwm",
1269                                           "/dev/zero", "rwm",
1270                                           "/dev/full", "rwm",
1271                                           "/dev/random", "rwm",
1272                                           "/dev/urandom", "rwm",
1273                                           "/dev/tty", "rwm",
1274                                           /* Allow the container
1275                                            * access to ptys. However,
1276                                            * do not permit the
1277                                            * container to ever create
1278                                            * these device nodes. */
1279                                           "/dev/pts/ptmx", "rw",
1280                                           "char-pts", "rw");
1281                 if (r < 0) {
1282                         log_error("Failed to add device whitelist: %s", strerror(-r));
1283                         return r;
1284                 }
1285
1286                 r = sd_bus_message_close_container(m);
1287                 if (r < 0) {
1288                         log_error("Failed to close container: %s", strerror(-r));
1289                         return r;
1290                 }
1291
1292                 r = sd_bus_call(bus, m, 0, &error, NULL);
1293         }
1294
1295         if (r < 0) {
1296                 log_error("Failed to register machine: %s", bus_error_message(&error, r));
1297                 return r;
1298         }
1299
1300         return 0;
1301 }
1302
1303 static int terminate_machine(pid_t pid) {
1304         _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1305         _cleanup_bus_message_unref_ sd_bus_message *reply = NULL;
1306         _cleanup_bus_unref_ sd_bus *bus = NULL;
1307         const char *path;
1308         int r;
1309
1310         if (!arg_register)
1311                 return 0;
1312
1313         r = sd_bus_default_system(&bus);
1314         if (r < 0) {
1315                 log_error("Failed to open system bus: %s", strerror(-r));
1316                 return r;
1317         }
1318
1319         r = sd_bus_call_method(
1320                         bus,
1321                         "org.freedesktop.machine1",
1322                         "/org/freedesktop/machine1",
1323                         "org.freedesktop.machine1.Manager",
1324                         "GetMachineByPID",
1325                         &error,
1326                         &reply,
1327                         "u",
1328                         (uint32_t) pid);
1329         if (r < 0) {
1330                 /* Note that the machine might already have been
1331                  * cleaned up automatically, hence don't consider it a
1332                  * failure if we cannot get the machine object. */
1333                 log_debug("Failed to get machine: %s", bus_error_message(&error, r));
1334                 return 0;
1335         }
1336
1337         r = sd_bus_message_read(reply, "o", &path);
1338         if (r < 0)
1339                 return bus_log_parse_error(r);
1340
1341         r = sd_bus_call_method(
1342                         bus,
1343                         "org.freedesktop.machine1",
1344                         path,
1345                         "org.freedesktop.machine1.Machine",
1346                         "Terminate",
1347                         &error,
1348                         NULL,
1349                         NULL);
1350         if (r < 0) {
1351                 log_debug("Failed to terminate machine: %s", bus_error_message(&error, r));
1352                 return 0;
1353         }
1354
1355         return 0;
1356 }
1357
1358 static int reset_audit_loginuid(void) {
1359         _cleanup_free_ char *p = NULL;
1360         int r;
1361
1362         if (arg_share_system)
1363                 return 0;
1364
1365         r = read_one_line_file("/proc/self/loginuid", &p);
1366         if (r == -ENOENT)
1367                 return 0;
1368         if (r < 0) {
1369                 log_error("Failed to read /proc/self/loginuid: %s", strerror(-r));
1370                 return r;
1371         }
1372
1373         /* Already reset? */
1374         if (streq(p, "4294967295"))
1375                 return 0;
1376
1377         r = write_string_file("/proc/self/loginuid", "4294967295");
1378         if (r < 0) {
1379                 log_error("Failed to reset audit login UID. This probably means that your kernel is too\n"
1380                           "old and you have audit enabled. Note that the auditing subsystem is known to\n"
1381                           "be incompatible with containers on old kernels. Please make sure to upgrade\n"
1382                           "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
1383                           "using systemd-nspawn. Sleeping for 5s... (%s)\n", strerror(-r));
1384
1385                 sleep(5);
1386         }
1387
1388         return 0;
1389 }
1390
1391 static int setup_veth(pid_t pid, char iface_name[IFNAMSIZ]) {
1392         _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1393         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1394         int r;
1395
1396         if (!arg_private_network)
1397                 return 0;
1398
1399         if (!arg_network_veth)
1400                 return 0;
1401
1402         /* Use two different interface name prefixes depending whether
1403          * we are in bridge mode or not. */
1404         if (arg_network_bridge)
1405                 memcpy(iface_name, "vb-", 3);
1406         else
1407                 memcpy(iface_name, "ve-", 3);
1408
1409         strncpy(iface_name+3, arg_machine, IFNAMSIZ - 3);
1410
1411         r = sd_rtnl_open(&rtnl, 0);
1412         if (r < 0) {
1413                 log_error("Failed to connect to netlink: %s", strerror(-r));
1414                 return r;
1415         }
1416
1417         r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
1418         if (r < 0) {
1419                 log_error("Failed to allocate netlink message: %s", strerror(-r));
1420                 return r;
1421         }
1422
1423         r = sd_rtnl_message_append_string(m, IFLA_IFNAME, iface_name);
1424         if (r < 0) {
1425                 log_error("Failed to add netlink interface name: %s", strerror(-r));
1426                 return r;
1427         }
1428
1429         r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
1430         if (r < 0) {
1431                 log_error("Failed to open netlink container: %s", strerror(-r));
1432                 return r;
1433         }
1434
1435         r = sd_rtnl_message_append_string(m, IFLA_INFO_KIND, "veth");
1436         if (r < 0) {
1437                 log_error("Failed to append netlink kind: %s", strerror(-r));
1438                 return r;
1439         }
1440
1441         r = sd_rtnl_message_open_container(m, IFLA_INFO_DATA);
1442         if (r < 0) {
1443                 log_error("Failed to open netlink container: %s", strerror(-r));
1444                 return r;
1445         }
1446
1447         r = sd_rtnl_message_open_container(m, VETH_INFO_PEER);
1448         if (r < 0) {
1449                 log_error("Failed to open netlink container: %s", strerror(-r));
1450                 return r;
1451         }
1452
1453         r = sd_rtnl_message_append_string(m, IFLA_IFNAME, "host0");
1454         if (r < 0) {
1455                 log_error("Failed to add netlink interface name: %s", strerror(-r));
1456                 return r;
1457         }
1458
1459         r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
1460         if (r < 0) {
1461                 log_error("Failed to add netlink namespace field: %s", strerror(-r));
1462                 return r;
1463         }
1464
1465         r = sd_rtnl_message_close_container(m);
1466         if (r < 0) {
1467                 log_error("Failed to close netlink container: %s", strerror(-r));
1468                 return r;
1469         }
1470
1471         r = sd_rtnl_message_close_container(m);
1472         if (r < 0) {
1473                 log_error("Failed to close netlink container: %s", strerror(-r));
1474                 return r;
1475         }
1476
1477         r = sd_rtnl_message_close_container(m);
1478         if (r < 0) {
1479                 log_error("Failed to close netlink container: %s", strerror(-r));
1480                 return r;
1481         }
1482
1483         r = sd_rtnl_call(rtnl, m, 0, NULL);
1484         if (r < 0) {
1485                 log_error("Failed to add new veth interfaces: %s", strerror(-r));
1486                 return r;
1487         }
1488
1489         return 0;
1490 }
1491
1492 static int setup_bridge(const char veth_name[]) {
1493         _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1494         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1495         int r, bridge;
1496
1497         if (!arg_private_network)
1498                 return 0;
1499
1500         if (!arg_network_veth)
1501                 return 0;
1502
1503         if (!arg_network_bridge)
1504                 return 0;
1505
1506         bridge = (int) if_nametoindex(arg_network_bridge);
1507         if (bridge <= 0) {
1508                 log_error("Failed to resolve interface %s: %m", arg_network_bridge);
1509                 return -errno;
1510         }
1511
1512         r = sd_rtnl_open(&rtnl, 0);
1513         if (r < 0) {
1514                 log_error("Failed to connect to netlink: %s", strerror(-r));
1515                 return r;
1516         }
1517
1518         r = sd_rtnl_message_new_link(rtnl, &m, RTM_SETLINK, 0);
1519         if (r < 0) {
1520                 log_error("Failed to allocate netlink message: %s", strerror(-r));
1521                 return r;
1522         }
1523
1524         r = sd_rtnl_message_append_string(m, IFLA_IFNAME, veth_name);
1525         if (r < 0) {
1526                 log_error("Failed to add netlink interface name field: %s", strerror(-r));
1527                 return r;
1528         }
1529
1530         r = sd_rtnl_message_append_u32(m, IFLA_MASTER, bridge);
1531         if (r < 0) {
1532                 log_error("Failed to add netlink master field: %s", strerror(-r));
1533                 return r;
1534         }
1535
1536         r = sd_rtnl_call(rtnl, m, 0, NULL);
1537         if (r < 0) {
1538                 log_error("Failed to add veth interface to bridge: %s", strerror(-r));
1539                 return r;
1540         }
1541
1542         return 0;
1543 }
1544
1545 static int parse_interface(struct udev *udev, const char *name) {
1546         _cleanup_udev_device_unref_ struct udev_device *d = NULL;
1547         char ifi_str[2 + DECIMAL_STR_MAX(int)];
1548         int ifi;
1549
1550         ifi = (int) if_nametoindex(name);
1551         if (ifi <= 0) {
1552                 log_error("Failed to resolve interface %s: %m", name);
1553                 return -errno;
1554         }
1555
1556         sprintf(ifi_str, "n%i", ifi);
1557         d = udev_device_new_from_device_id(udev, ifi_str);
1558         if (!d) {
1559                 log_error("Failed to get udev device for interface %s: %m", name);
1560                 return -errno;
1561         }
1562
1563         if (udev_device_get_is_initialized(d) <= 0) {
1564                 log_error("Network interface %s is not initialized yet.", name);
1565                 return -EBUSY;
1566         }
1567
1568         return ifi;
1569 }
1570
1571 static int move_network_interfaces(pid_t pid) {
1572         _cleanup_udev_unref_ struct udev *udev = NULL;
1573         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1574         char **i;
1575         int r;
1576
1577         if (!arg_private_network)
1578                 return 0;
1579
1580         if (strv_isempty(arg_network_interfaces))
1581                 return 0;
1582
1583         r = sd_rtnl_open(&rtnl, 0);
1584         if (r < 0) {
1585                 log_error("Failed to connect to netlink: %s", strerror(-r));
1586                 return r;
1587         }
1588
1589         udev = udev_new();
1590         if (!udev) {
1591                 log_error("Failed to connect to udev.");
1592                 return -ENOMEM;
1593         }
1594
1595         STRV_FOREACH(i, arg_network_interfaces) {
1596                 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1597                 int ifi;
1598
1599                 ifi = parse_interface(udev, *i);
1600                 if (ifi < 0)
1601                         return ifi;
1602
1603                 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, ifi);
1604                 if (r < 0) {
1605                         log_error("Failed to allocate netlink message: %s", strerror(-r));
1606                         return r;
1607                 }
1608
1609                 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
1610                 if (r < 0) {
1611                         log_error("Failed to append namespace PID to netlink message: %s", strerror(-r));
1612                         return r;
1613                 }
1614
1615                 r = sd_rtnl_call(rtnl, m, 0, NULL);
1616                 if (r < 0) {
1617                         log_error("Failed to move interface %s to namespace: %s", *i, strerror(-r));
1618                         return r;
1619                 }
1620         }
1621
1622         return 0;
1623 }
1624
1625 static int setup_macvlan(pid_t pid) {
1626         _cleanup_udev_unref_ struct udev *udev = NULL;
1627         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1628         char **i;
1629         int r;
1630
1631         if (!arg_private_network)
1632                 return 0;
1633
1634         if (strv_isempty(arg_network_macvlan))
1635                 return 0;
1636
1637         r = sd_rtnl_open(&rtnl, 0);
1638         if (r < 0) {
1639                 log_error("Failed to connect to netlink: %s", strerror(-r));
1640                 return r;
1641         }
1642
1643         udev = udev_new();
1644         if (!udev) {
1645                 log_error("Failed to connect to udev.");
1646                 return -ENOMEM;
1647         }
1648
1649         STRV_FOREACH(i, arg_network_macvlan) {
1650                 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1651                 _cleanup_free_ char *n = NULL;
1652                 int ifi;
1653
1654                 ifi = parse_interface(udev, *i);
1655                 if (ifi < 0)
1656                         return ifi;
1657
1658                 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
1659                 if (r < 0) {
1660                         log_error("Failed to allocate netlink message: %s", strerror(-r));
1661                         return r;
1662                 }
1663
1664                 r = sd_rtnl_message_append_u32(m, IFLA_LINK, ifi);
1665                 if (r < 0) {
1666                         log_error("Failed to add netlink interface index: %s", strerror(-r));
1667                         return r;
1668                 }
1669
1670                 n = strappend("mv-", *i);
1671                 if (!n)
1672                         return log_oom();
1673
1674                 strshorten(n, IFNAMSIZ-1);
1675
1676                 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, n);
1677                 if (r < 0) {
1678                         log_error("Failed to add netlink interface name: %s", strerror(-r));
1679                         return r;
1680                 }
1681
1682                 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
1683                 if (r < 0) {
1684                         log_error("Failed to add netlink namespace field: %s", strerror(-r));
1685                         return r;
1686                 }
1687
1688                 r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
1689                 if (r < 0) {
1690                         log_error("Failed to open netlink container: %s", strerror(-r));
1691                         return r;
1692                 }
1693
1694                 r = sd_rtnl_message_append_string(m, IFLA_INFO_KIND, "macvlan");
1695                 if (r < 0) {
1696                         log_error("Failed to append netlink kind: %s", strerror(-r));
1697                         return r;
1698                 }
1699
1700                 r = sd_rtnl_message_open_container(m, IFLA_INFO_DATA);
1701                 if (r < 0) {
1702                         log_error("Failed to open netlink container: %s", strerror(-r));
1703                         return r;
1704                 }
1705
1706                 r = sd_rtnl_message_append_u32(m, IFLA_MACVLAN_MODE, MACVLAN_MODE_BRIDGE);
1707                 if (r < 0) {
1708                         log_error("Failed to append macvlan mode: %s", strerror(-r));
1709                         return r;
1710                 }
1711
1712                 r = sd_rtnl_message_close_container(m);
1713                 if (r < 0) {
1714                         log_error("Failed to close netlink container: %s", strerror(-r));
1715                         return r;
1716                 }
1717
1718                 r = sd_rtnl_message_close_container(m);
1719                 if (r < 0) {
1720                         log_error("Failed to close netlink container: %s", strerror(-r));
1721                         return r;
1722                 }
1723
1724                 r = sd_rtnl_call(rtnl, m, 0, NULL);
1725                 if (r < 0) {
1726                         log_error("Failed to add new macvlan interfaces: %s", strerror(-r));
1727                         return r;
1728                 }
1729         }
1730
1731         return 0;
1732 }
1733
/* Installs a seccomp filter that makes creation of audit netlink sockets
 * fail. Compiled to a no-op returning 0 when built without seccomp support.
 *
 * Returns the result of the last seccomp call (negative on failure), or
 * the value of log_oom() if the filter context cannot be allocated. */
static int audit_still_doesnt_work_in_containers(void) {

#ifdef HAVE_SECCOMP
        scmp_filter_ctx filter;
        int ret;

        /*
           Audit does not work inside containers, and a good part of the
           userspace audit hookup fails there. Rather than care about
           that, we simply forbid creating audit sockets.

           With this filter socket(AF_NETLINK, *, NETLINK_AUDIT) fails
           with EAFNOSUPPORT, which audit userspace takes as the sign
           that audit is disabled in the kernel.
         */

        filter = seccomp_init(SCMP_ACT_ALLOW);
        if (!filter)
                return log_oom();

        ret = seccomp_add_secondary_archs(filter);
        if (ret < 0) {
                log_error("Failed to add secondary archs to seccomp filter: %s", strerror(-ret));
                goto release;
        }

        /* Match on argument 0 (address family) and argument 2 (protocol). */
        ret = seccomp_rule_add(
                        filter,
                        SCMP_ACT_ERRNO(EAFNOSUPPORT),
                        SCMP_SYS(socket),
                        2,
                        SCMP_A0(SCMP_CMP_EQ, AF_NETLINK),
                        SCMP_A2(SCMP_CMP_EQ, NETLINK_AUDIT));
        if (ret < 0) {
                log_error("Failed to add audit seccomp rule: %s", strerror(-ret));
                goto release;
        }

        /* Do not have libseccomp set NO_NEW_PRIVS when loading the filter. */
        ret = seccomp_attr_set(filter, SCMP_FLTATR_CTL_NNP, 0);
        if (ret < 0) {
                log_error("Failed to unset NO_NEW_PRIVS: %s", strerror(-ret));
                goto release;
        }

        ret = seccomp_load(filter);
        if (ret < 0)
                log_error("Failed to install seccomp audit filter: %s", strerror(-ret));

release:
        seccomp_release(filter);
        return ret;
#else
        return 0;
#endif

}
1790
1791 static int setup_image(char **device_path, int *loop_nr) {
1792         struct loop_info64 info = {
1793                 .lo_flags = LO_FLAGS_AUTOCLEAR|LO_FLAGS_PARTSCAN
1794         };
1795         _cleanup_close_ int fd = -1, control = -1, loop = -1;
1796         _cleanup_free_ char* loopdev = NULL;
1797         struct stat st;
1798         int r, nr;
1799
1800         assert(device_path);
1801         assert(loop_nr);
1802
1803         fd = open(arg_image, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
1804         if (fd < 0) {
1805                 log_error("Failed to open %s: %m", arg_image);
1806                 return -errno;
1807         }
1808
1809         if (fstat(fd, &st) < 0) {
1810                 log_error("Failed to stat %s: %m", arg_image);
1811                 return -errno;
1812         }
1813
1814         if (S_ISBLK(st.st_mode)) {
1815                 char *p;
1816
1817                 p = strdup(arg_image);
1818                 if (!p)
1819                         return log_oom();
1820
1821                 *device_path = p;
1822
1823                 *loop_nr = -1;
1824
1825                 r = fd;
1826                 fd = -1;
1827
1828                 return r;
1829         }
1830
1831         if (!S_ISREG(st.st_mode)) {
1832                 log_error("%s is not a regular file or block device: %m", arg_image);
1833                 return -EINVAL;
1834         }
1835
1836         control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
1837         if (control < 0) {
1838                 log_error("Failed to open /dev/loop-control: %m");
1839                 return -errno;
1840         }
1841
1842         nr = ioctl(control, LOOP_CTL_GET_FREE);
1843         if (nr < 0) {
1844                 log_error("Failed to allocate loop device: %m");
1845                 return -errno;
1846         }
1847
1848         if (asprintf(&loopdev, "/dev/loop%i", nr) < 0)
1849                 return log_oom();
1850
1851         loop = open(loopdev, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
1852         if (loop < 0) {
1853                 log_error("Failed to open loop device %s: %m", loopdev);
1854                 return -errno;
1855         }
1856
1857         if (ioctl(loop, LOOP_SET_FD, fd) < 0) {
1858                 log_error("Failed to set loopback file descriptor on %s: %m", loopdev);
1859                 return -errno;
1860         }
1861
1862         if (arg_read_only)
1863                 info.lo_flags |= LO_FLAGS_READ_ONLY;
1864
1865         if (ioctl(loop, LOOP_SET_STATUS64, &info) < 0) {
1866                 log_error("Failed to set loopback settings on %s: %m", loopdev);
1867                 return -errno;
1868         }
1869
1870         *device_path = loopdev;
1871         loopdev = NULL;
1872
1873         *loop_nr = nr;
1874
1875         r = loop;
1876         loop = -1;
1877
1878         return r;
1879 }
1880
/* Inspects the partition table of the block device open as fd and picks the
 * partitions to use for the container.
 *
 * The device must carry a GPT partition table. The partition devices are
 * enumerated via udev (children of the block device) and matched against
 * the libblkid partition list by device number. Partitions are classified
 * by their GPT type UUID as home (GPT_HOME), server data (GPT_SRV), native
 * root (GPT_ROOT_NATIVE, if defined) or secondary root (GPT_ROOT_SECONDARY,
 * if defined). If several partitions share a type, the one with the lowest
 * partition number wins.
 *
 * On success returns 0 and stores newly allocated device node paths (to be
 * freed by the caller):
 *   *root_device  the native root if one was found, otherwise the secondary root
 *   *secondary    false if the native root was picked, true for the secondary one
 *   *home_device  written only if a home partition was found
 *   *srv_device   written only if a server data partition was found
 *
 * Returns a negative errno-style value on failure, in particular -EINVAL if
 * no GPT or no root partition is found, and -ENOTSUP when compiled without
 * blkid support. */
static int dissect_image(
                int fd,
                char **root_device,
                char **home_device,
                char **srv_device,
                bool *secondary) {

#ifdef HAVE_BLKID
        int home_nr = -1, root_nr = -1, secondary_root_nr = -1, srv_nr = -1;
        _cleanup_free_ char *home = NULL, *root = NULL, *secondary_root = NULL, *srv = NULL;
        _cleanup_udev_enumerate_unref_ struct udev_enumerate *e = NULL;
        _cleanup_udev_device_unref_ struct udev_device *d = NULL;
        _cleanup_blkid_free_probe_ blkid_probe b = NULL;
        _cleanup_udev_unref_ struct udev *udev = NULL;
        struct udev_list_entry *first, *item;
        const char *pttype = NULL;
        blkid_partlist pl;
        struct stat st;
        int r;

        assert(fd >= 0);
        assert(root_device);
        assert(home_device);
        assert(srv_device);
        assert(secondary);

        b = blkid_new_probe();
        if (!b)
                return log_oom();

        /* errno is cleared before the libblkid calls below, so that an
         * unset errno after a failure can be told apart from a real one. */
        errno = 0;
        r = blkid_probe_set_device(b, fd, 0, 0);
        if (r != 0) {
                if (errno == 0)
                        return log_oom();

                log_error("Failed to set device on blkid probe: %m");
                return -errno;
        }

        blkid_probe_enable_partitions(b, 1);
        blkid_probe_set_partitions_flags(b, BLKID_PARTS_ENTRY_DETAILS);

        errno = 0;
        r = blkid_do_safeprobe(b);
        if (r == -2 || r == 1) {
                /* -2: ambiguous result, 1: nothing detected */
                log_error("Failed to identify any partition table on %s.\n"
                          "Note that the disk image needs to follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/ to be supported by systemd-nspawn.", arg_image);
                return -EINVAL;
        } else if (r != 0) {
                if (errno == 0)
                        errno = EIO;
                log_error("Failed to probe: %m");
                return -errno;
        }

        /* pttype stays NULL if the lookup fails; streq_ptr() copes with that. */
        blkid_probe_lookup_value(b, "PTTYPE", &pttype, NULL);
        if (!streq_ptr(pttype, "gpt")) {
                log_error("Image %s does not carry a GUID Partition Table.\n"
                          "Note that the disk image needs to follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/ to be supported by systemd-nspawn.", arg_image);
                return -EINVAL;
        }

        errno = 0;
        pl = blkid_probe_get_partitions(b);
        if (!pl) {
                if (errno == 0)
                        return log_oom();

                log_error("Failed to list partitions of %s: %m", arg_image);
                return -errno;
        }

        udev = udev_new();
        if (!udev)
                return log_oom();

        if (fstat(fd, &st) < 0) {
                log_error("Failed to stat block device: %m");
                return -errno;
        }

        d = udev_device_new_from_devnum(udev, 'b', st.st_rdev);
        if (!d)
                return log_oom();

        /* Enumerate all udev devices below the whole-disk device. */
        e = udev_enumerate_new(udev);
        if (!e)
                return log_oom();

        r = udev_enumerate_add_match_parent(e, d);
        if (r < 0)
                return log_oom();

        r = udev_enumerate_scan_devices(e);
        if (r < 0) {
                log_error("Failed to scan for partition devices of %s: %s", arg_image, strerror(-r));
                return r;
        }

        first = udev_enumerate_get_list_entry(e);
        udev_list_entry_foreach(item, first) {
                /* Initialized to NULL so that the cleanup handler never sees
                 * an indeterminate pointer. */
                _cleanup_udev_device_unref_ struct udev_device *q = NULL;
                const char *stype, *node;
                sd_id128_t type_id;
                blkid_partition pp;
                dev_t qn;
                int nr;

                errno = 0;
                q = udev_device_new_from_syspath(udev, udev_list_entry_get_name(item));
                if (!q) {
                        if (!errno)
                                errno = ENOMEM;

                        log_error("Failed to get partition device of %s: %m", arg_image);
                        return -errno;
                }

                /* Skip entries without a device number ... */
                qn = udev_device_get_devnum(q);
                if (major(qn) == 0)
                        continue;

                /* ... the whole-disk device itself ... */
                if (st.st_rdev == qn)
                        continue;

                /* ... and anything that has no device node. */
                node = udev_device_get_devnode(q);
                if (!node)
                        continue;

                /* Find the matching entry in the libblkid partition list. */
                pp = blkid_partlist_devno_to_partition(pl, qn);
                if (!pp)
                        continue;

                nr = blkid_partition_get_partno(pp);
                if (nr < 0)
                        continue;

                stype = blkid_partition_get_type_string(pp);
                if (!stype)
                        continue;

                if (sd_id128_from_string(stype, &type_id) < 0)
                        continue;

                /* For every type keep the candidate with the lowest
                 * partition number. */
                if (sd_id128_equal(type_id, GPT_HOME)) {

                        if (home && nr >= home_nr)
                                continue;

                        home_nr = nr;
                        free(home);
                        home = strdup(node);
                        if (!home)
                                return log_oom();
                } else if (sd_id128_equal(type_id, GPT_SRV)) {

                        if (srv && nr >= srv_nr)
                                continue;

                        srv_nr = nr;
                        free(srv);
                        srv = strdup(node);
                        if (!srv)
                                return log_oom();
                }
#ifdef GPT_ROOT_NATIVE
                else if (sd_id128_equal(type_id, GPT_ROOT_NATIVE)) {

                        if (root && nr >= root_nr)
                                continue;

                        root_nr = nr;
                        free(root);
                        root = strdup(node);
                        if (!root)
                                return log_oom();
                }
#endif
#ifdef GPT_ROOT_SECONDARY
                else if (sd_id128_equal(type_id, GPT_ROOT_SECONDARY)) {

                        if (secondary_root && nr >= secondary_root_nr)
                                continue;

                        secondary_root_nr = nr;
                        free(secondary_root);
                        secondary_root = strdup(node);
                        if (!secondary_root)
                                return log_oom();
                }
#endif
        }

        if (!root && !secondary_root) {
                log_error("Failed to identify root partition in disk image %s.\n"
                          "Note that the disk image needs to follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/ to be supported by systemd-nspawn.", arg_image);
                return -EINVAL;
        }

        /* Hand the strings over to the caller; resetting the locals to NULL
         * keeps the cleanup handlers from freeing them. The native root is
         * preferred over the secondary one. */
        if (root) {
                *root_device = root;
                root = NULL;
                *secondary = false;
        } else if (secondary_root) {
                *root_device = secondary_root;
                secondary_root = NULL;
                *secondary = true;
        }

        if (home) {
                *home_device = home;
                home = NULL;
        }

        if (srv) {
                *srv_device = srv;
                srv = NULL;
        }

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -ENOTSUP;
#endif
}
2107
/* Probes the file system type of the block device `what` with libblkid and
 * mounts it on `where`, or on `where` with `directory` appended if
 * `directory` is non-NULL.
 *
 * The file system is mounted with MS_NODEV, and additionally MS_RDONLY if
 * arg_read_only is set. LUKS volumes are refused.
 *
 * Returns 0 on success or a negative errno-style value on failure:
 * -EINVAL if no (or no unambiguous) file system type can be determined,
 * -ENOTSUP for LUKS volumes or when compiled without blkid support. */
static int mount_device(const char *what, const char *where, const char *directory) {
#ifdef HAVE_BLKID
        _cleanup_blkid_free_probe_ blkid_probe b = NULL;
        const char *fstype, *p;
        int r;

        assert(what);
        assert(where);

        if (directory)
                p = strappenda(where, directory);
        else
                p = where;

        /* errno is cleared before the libblkid calls below, so that an
         * unset errno after a failure can be told apart from a real one. */
        errno = 0;
        b = blkid_new_probe_from_filename(what);
        if (!b) {
                if (errno == 0)
                        return log_oom();
                log_error("Failed to allocate prober for %s: %m", what);
                return -errno;
        }

        blkid_probe_enable_superblocks(b, 1);
        blkid_probe_set_superblocks_flags(b, BLKID_SUBLKS_TYPE);

        errno = 0;
        r = blkid_do_safeprobe(b);
        if (r == -2 || r == 1) {
                /* -2: ambiguous result, 1: nothing detected. A real probing
                 * error (-1) is reported by the branch below instead. */
                log_error("Cannot determine file system type of %s", what);
                return -EINVAL;
        } else if (r != 0) {
                if (errno == 0)
                        errno = EIO;
                log_error("Failed to probe %s: %m", what);
                return -errno;
        }

        errno = 0;
        if (blkid_probe_lookup_value(b, "TYPE", &fstype, NULL) < 0) {
                if (errno == 0)
                        errno = EINVAL;
                log_error("Failed to determine file system type of %s", what);
                return -errno;
        }

        if (streq(fstype, "crypto_LUKS")) {
                log_error("nspawn currently does not support LUKS disk images.");
                return -ENOTSUP;
        }

        /* Device nodes stored in the image are never honoured, regardless
         * of whether the image is mounted read-only or writable. */
        if (mount(what, p, fstype, MS_NODEV|(arg_read_only ? MS_RDONLY : 0), NULL) < 0) {
                log_error("Failed to mount %s: %m", what);
                return -errno;
        }

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -ENOTSUP;
#endif
}
2170
/* Mounts the dissected partitions below `where`: the root device on `where`
 * itself, the home device on `where`/home and the server data device on
 * `where`/srv. Any of the three device arguments may be NULL, in which case
 * that mount is skipped.
 *
 * Returns 0 on success, or the negative errno-style value of the first
 * mount_device() call that failed. */
static int mount_devices(const char *where, const char *root_device, const char *home_device, const char *srv_device) {
        int r;

        assert(where);

        /* Mount relative to the directory we were passed, rather than
         * silently falling back to the global arg_directory. */
        if (root_device) {
                r = mount_device(root_device, where, NULL);
                if (r < 0) {
                        log_error("Failed to mount root directory: %s", strerror(-r));
                        return r;
                }
        }

        if (home_device) {
                r = mount_device(home_device, where, "/home");
                if (r < 0) {
                        log_error("Failed to mount home directory: %s", strerror(-r));
                        return r;
                }
        }

        if (srv_device) {
                r = mount_device(srv_device, where, "/srv");
                if (r < 0) {
                        log_error("Failed to mount server data directory: %s", strerror(-r));
                        return r;
                }
        }

        return 0;
}
2202
2203 static void loop_remove(int nr, int *image_fd) {
2204         _cleanup_close_ int control = -1;
2205
2206         if (nr < 0)
2207                 return;
2208
2209         if (image_fd && *image_fd >= 0) {
2210                 ioctl(*image_fd, LOOP_CLR_FD);
2211                 close_nointr_nofail(*image_fd);
2212                 *image_fd = -1;
2213         }
2214
2215         control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
2216         if (control < 0)
2217                 return;
2218
2219         ioctl(control, LOOP_CTL_REMOVE, nr);
2220 }
2221
2222 int main(int argc, char *argv[]) {
2223
2224         _cleanup_free_ char *kdbus_domain = NULL, *device_path = NULL, *root_device = NULL, *home_device = NULL, *srv_device = NULL;
2225         _cleanup_close_ int master = -1, kdbus_fd = -1, image_fd = -1;
2226         _cleanup_close_pipe_ int kmsg_socket_pair[2] = { -1, -1 };
2227         _cleanup_fdset_free_ FDSet *fds = NULL;
2228         int r = EXIT_FAILURE, k, n_fd_passed, loop_nr = -1;
2229         const char *console = NULL;
2230         char veth_name[IFNAMSIZ];
2231         bool secondary = false;
2232         pid_t pid = 0;
2233         sigset_t mask;
2234
2235         log_parse_environment();
2236         log_open();
2237
2238         k = parse_argv(argc, argv);
2239         if (k < 0)
2240                 goto finish;
2241         else if (k == 0) {
2242                 r = EXIT_SUCCESS;
2243                 goto finish;
2244         }
2245
2246         if (!arg_image) {
2247                 if (arg_directory) {
2248                         char *p;
2249
2250                         p = path_make_absolute_cwd(arg_directory);
2251                         free(arg_directory);
2252                         arg_directory = p;
2253                 } else
2254                         arg_directory = get_current_dir_name();
2255
2256                 if (!arg_directory) {
2257                         log_error("Failed to determine path, please use -D.");
2258                         goto finish;
2259                 }
2260                 path_kill_slashes(arg_directory);
2261         }
2262
2263         if (!arg_machine) {
2264                 arg_machine = strdup(basename(arg_image ? arg_image : arg_directory));
2265                 if (!arg_machine) {
2266                         log_oom();
2267                         goto finish;
2268                 }
2269
2270                 hostname_cleanup(arg_machine, false);
2271                 if (isempty(arg_machine)) {
2272                         log_error("Failed to determine machine name automatically, please use -M.");
2273                         goto finish;
2274                 }
2275         }
2276
2277         if (geteuid() != 0) {
2278                 log_error("Need to be root.");
2279                 goto finish;
2280         }
2281
2282         if (sd_booted() <= 0) {
2283                 log_error("Not running on a systemd system.");
2284                 goto finish;
2285         }
2286
2287         log_close();
2288         n_fd_passed = sd_listen_fds(false);
2289         if (n_fd_passed > 0) {
2290                 k = fdset_new_listen_fds(&fds, false);
2291                 if (k < 0) {
2292                         log_error("Failed to collect file descriptors: %s", strerror(-k));
2293                         goto finish;
2294                 }
2295         }
2296         fdset_close_others(fds);
2297         log_open();
2298
2299         if (arg_directory) {
2300                 if (path_equal(arg_directory, "/")) {
2301                         log_error("Spawning container on root directory not supported.");
2302                         goto finish;
2303                 }
2304
2305                 if (arg_boot) {
2306                         if (path_is_os_tree(arg_directory) <= 0) {
2307                                 log_error("Directory %s doesn't look like an OS root directory (/etc/os-release is missing). Refusing.", arg_directory);
2308                                 goto finish;
2309                         }
2310                 } else {
2311                         const char *p;
2312
2313                         p = strappenda(arg_directory,
2314                                        argc > optind && path_is_absolute(argv[optind]) ? argv[optind] : "/usr/bin/");
2315                         if (access(p, F_OK) < 0) {
2316                                 log_error("Directory %s lacks the binary to execute or doesn't look like a binary tree. Refusing.", arg_directory);
2317                                 goto finish;
2318
2319                         }
2320                 }
2321         } else {
2322                 char template[] = "/tmp/nspawn-root-XXXXXX";
2323
2324                 if (!mkdtemp(template)) {
2325                         log_error("Failed to create temporary directory: %m");
2326                         r = -errno;
2327                         goto finish;
2328                 }
2329
2330                 arg_directory = strdup(template);
2331                 if (!arg_directory) {
2332                         r = log_oom();
2333                         goto finish;
2334                 }
2335
2336                 image_fd = setup_image(&device_path, &loop_nr);
2337                 if (image_fd < 0) {
2338                         r = image_fd;
2339                         goto finish;
2340                 }
2341
2342                 r = dissect_image(image_fd, &root_device, &home_device, &srv_device, &secondary);
2343                 if (r < 0)
2344                         goto finish;
2345         }
2346
2347         master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
2348         if (master < 0) {
2349                 log_error("Failed to acquire pseudo tty: %m");
2350                 goto finish;
2351         }
2352
2353         console = ptsname(master);
2354         if (!console) {
2355                 log_error("Failed to determine tty name: %m");
2356                 goto finish;
2357         }
2358
2359         if (!arg_quiet)
2360                 log_info("Spawning container %s on %s. Press ^] three times within 1s to abort execution.", arg_machine, arg_image ? arg_image : arg_directory);
2361
2362         if (unlockpt(master) < 0) {
2363                 log_error("Failed to unlock tty: %m");
2364                 goto finish;
2365         }
2366
2367         if (access("/dev/kdbus/control", F_OK) >= 0) {
2368
2369                 if (arg_share_system) {
2370                         kdbus_domain = strdup("/dev/kdbus");
2371                         if (!kdbus_domain) {
2372                                 log_oom();
2373                                 goto finish;
2374                         }
2375                 } else {
2376                         const char *ns;
2377
2378                         ns = strappenda("machine-", arg_machine);
2379                         kdbus_fd = bus_kernel_create_domain(ns, &kdbus_domain);
2380                         if (r < 0)
2381                                 log_debug("Failed to create kdbus domain: %s", strerror(-r));
2382                         else
2383                                 log_debug("Successfully created kdbus domain as %s", kdbus_domain);
2384                 }
2385         }
2386
2387         if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_NONBLOCK|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
2388                 log_error("Failed to create kmsg socket pair: %m");
2389                 goto finish;
2390         }
2391
2392         sd_notify(0, "READY=1");
2393
2394         assert_se(sigemptyset(&mask) == 0);
2395         sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
2396         assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
2397
2398         for (;;) {
2399                 int parent_ready_fd = -1, child_ready_fd = -1;
2400                 siginfo_t status;
2401                 eventfd_t x;
2402
2403                 parent_ready_fd = eventfd(0, EFD_CLOEXEC);
2404                 if (parent_ready_fd < 0) {
2405                         log_error("Failed to create event fd: %m");
2406                         goto finish;
2407                 }
2408
2409                 child_ready_fd = eventfd(0, EFD_CLOEXEC);
2410                 if (child_ready_fd < 0) {
2411                         log_error("Failed to create event fd: %m");
2412                         goto finish;
2413                 }
2414
2415                 pid = syscall(__NR_clone,
2416                               SIGCHLD|CLONE_NEWNS|
2417                               (arg_share_system ? 0 : CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS)|
2418                               (arg_private_network ? CLONE_NEWNET : 0), NULL);
2419                 if (pid < 0) {
2420                         if (errno == EINVAL)
2421                                 log_error("clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
2422                         else
2423                                 log_error("clone() failed: %m");
2424
2425                         goto finish;
2426                 }
2427
2428                 if (pid == 0) {
2429                         /* child */
2430                         const char *home = NULL;
2431                         uid_t uid = (uid_t) -1;
2432                         gid_t gid = (gid_t) -1;
2433                         unsigned n_env = 2;
2434                         const char *envp[] = {
2435                                 "PATH=" DEFAULT_PATH_SPLIT_USR,
2436                                 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
2437                                 NULL, /* TERM */
2438                                 NULL, /* HOME */
2439                                 NULL, /* USER */
2440                                 NULL, /* LOGNAME */
2441                                 NULL, /* container_uuid */
2442                                 NULL, /* LISTEN_FDS */
2443                                 NULL, /* LISTEN_PID */
2444                                 NULL
2445                         };
2446                         char **env_use;
2447
2448                         envp[n_env] = strv_find_prefix(environ, "TERM=");
2449                         if (envp[n_env])
2450                                 n_env ++;
2451
2452                         close_nointr_nofail(master);
2453                         master = -1;
2454
2455                         close_nointr(STDIN_FILENO);
2456                         close_nointr(STDOUT_FILENO);
2457                         close_nointr(STDERR_FILENO);
2458
2459                         close_nointr_nofail(kmsg_socket_pair[0]);
2460                         kmsg_socket_pair[0] = -1;
2461
2462                         reset_all_signal_handlers();
2463
2464                         assert_se(sigemptyset(&mask) == 0);
2465                         assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
2466
2467                         k = open_terminal(console, O_RDWR);
2468                         if (k != STDIN_FILENO) {
2469                                 if (k >= 0) {
2470                                         close_nointr_nofail(k);
2471                                         k = -EINVAL;
2472                                 }
2473
2474                                 log_error("Failed to open console: %s", strerror(-k));
2475                                 goto child_fail;
2476                         }
2477
2478                         if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
2479                             dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO) {
2480                                 log_error("Failed to duplicate console: %m");
2481                                 goto child_fail;
2482                         }
2483
2484                         if (setsid() < 0) {
2485                                 log_error("setsid() failed: %m");
2486                                 goto child_fail;
2487                         }
2488
2489                         if (reset_audit_loginuid() < 0)
2490                                 goto child_fail;
2491
2492                         if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
2493                                 log_error("PR_SET_PDEATHSIG failed: %m");
2494                                 goto child_fail;
2495                         }
2496
2497                         /* Mark everything as slave, so that we still
2498                          * receive mounts from the real root, but don't
2499                          * propagate mounts to the real root. */
2500                         if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
2501                                 log_error("MS_SLAVE|MS_REC failed: %m");
2502                                 goto child_fail;
2503                         }
2504
2505                         if (mount_devices(arg_directory, root_device, home_device, srv_device) < 0)
2506                                 goto child_fail;
2507
2508                         /* Turn directory into bind mount */
2509                         if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REC, NULL) < 0) {
2510                                 log_error("Failed to make bind mount.");
2511                                 goto child_fail;
2512                         }
2513
2514                         if (arg_read_only)
2515                                 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY|MS_REC, NULL) < 0) {
2516                                         log_error("Failed to make read-only.");
2517                                         goto child_fail;
2518                                 }
2519
2520                         if (mount_all(arg_directory) < 0)
2521                                 goto child_fail;
2522
2523                         if (copy_devnodes(arg_directory) < 0)
2524                                 goto child_fail;
2525
2526                         if (setup_ptmx(arg_directory) < 0)
2527                                 goto child_fail;
2528
2529                         dev_setup(arg_directory);
2530
2531                         if (audit_still_doesnt_work_in_containers() < 0)
2532                                 goto child_fail;
2533
2534                         if (setup_dev_console(arg_directory, console) < 0)
2535                                 goto child_fail;
2536
2537                         if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
2538                                 goto child_fail;
2539
2540                         close_nointr_nofail(kmsg_socket_pair[1]);
2541                         kmsg_socket_pair[1] = -1;
2542
2543                         if (setup_boot_id(arg_directory) < 0)
2544                                 goto child_fail;
2545
2546                         if (setup_timezone(arg_directory) < 0)
2547                                 goto child_fail;
2548
2549                         if (setup_resolv_conf(arg_directory) < 0)
2550                                 goto child_fail;
2551
2552                         if (setup_journal(arg_directory) < 0)
2553                                 goto child_fail;
2554
2555                         if (mount_binds(arg_directory, arg_bind, 0) < 0)
2556                                 goto child_fail;
2557
2558                         if (mount_binds(arg_directory, arg_bind_ro, MS_RDONLY) < 0)
2559                                 goto child_fail;
2560
2561                         if (setup_kdbus(arg_directory, kdbus_domain) < 0)
2562                                 goto child_fail;
2563
2564                         /* Tell the parent that we are ready, and that
2565                          * it can cgroupify us so that we lack access
2566                          * to certain devices and resources. */
2567                         eventfd_write(child_ready_fd, 1);
2568                         close_nointr_nofail(child_ready_fd);
2569                         child_ready_fd = -1;
2570
2571                         if (chdir(arg_directory) < 0) {
2572                                 log_error("chdir(%s) failed: %m", arg_directory);
2573                                 goto child_fail;
2574                         }
2575
2576                         if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) {
2577                                 log_error("mount(MS_MOVE) failed: %m");
2578                                 goto child_fail;
2579                         }
2580
2581                         if (chroot(".") < 0) {
2582                                 log_error("chroot() failed: %m");
2583                                 goto child_fail;
2584                         }
2585
2586                         if (chdir("/") < 0) {
2587                                 log_error("chdir() failed: %m");
2588                                 goto child_fail;
2589                         }
2590
2591                         umask(0022);
2592
2593                         if (arg_private_network)
2594                                 loopback_setup();
2595
2596                         if (drop_capabilities() < 0) {
2597                                 log_error("drop_capabilities() failed: %m");
2598                                 goto child_fail;
2599                         }
2600
2601                         if (arg_user) {
2602
2603                                 /* Note that this resolves user names
2604                                  * inside the container, and hence
2605                                  * accesses the NSS modules from the
2606                                  * container and not the host. This is
2607                                  * a bit weird... */
2608
2609                                 if (get_user_creds((const char**)&arg_user, &uid, &gid, &home, NULL) < 0) {
2610                                         log_error("get_user_creds() failed: %m");
2611                                         goto child_fail;
2612                                 }
2613
2614                                 if (mkdir_parents_label(home, 0775) < 0) {
2615                                         log_error("mkdir_parents_label() failed: %m");
2616                                         goto child_fail;
2617                                 }
2618
2619                                 if (mkdir_safe_label(home, 0775, uid, gid) < 0) {
2620                                         log_error("mkdir_safe_label() failed: %m");
2621                                         goto child_fail;
2622                                 }
2623
2624                                 if (initgroups((const char*)arg_user, gid) < 0) {
2625                                         log_error("initgroups() failed: %m");
2626                                         goto child_fail;
2627                                 }
2628
2629                                 if (setresgid(gid, gid, gid) < 0) {
2630                                         log_error("setregid() failed: %m");
2631                                         goto child_fail;
2632                                 }
2633
2634                                 if (setresuid(uid, uid, uid) < 0) {
2635                                         log_error("setreuid() failed: %m");
2636                                         goto child_fail;
2637                                 }
2638                         } else {
2639                                 /* Reset everything fully to 0, just in case */
2640
2641                                 if (setgroups(0, NULL) < 0) {
2642                                         log_error("setgroups() failed: %m");
2643                                         goto child_fail;
2644                                 }
2645
2646                                 if (setresgid(0, 0, 0) < 0) {
2647                                         log_error("setregid() failed: %m");
2648                                         goto child_fail;
2649                                 }
2650
2651                                 if (setresuid(0, 0, 0) < 0) {
2652                                         log_error("setreuid() failed: %m");
2653                                         goto child_fail;
2654                                 }
2655                         }
2656
2657                         if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
2658                             (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
2659                             (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
2660                                 log_oom();
2661                                 goto child_fail;
2662                         }
2663
2664                         if (!sd_id128_equal(arg_uuid, SD_ID128_NULL)) {
2665                                 if (asprintf((char**)(envp + n_env++), "container_uuid=" SD_ID128_FORMAT_STR, SD_ID128_FORMAT_VAL(arg_uuid)) < 0) {
2666                                         log_oom();
2667                                         goto child_fail;
2668                                 }
2669                         }
2670
2671                         if (fdset_size(fds) > 0) {
2672                                 k = fdset_cloexec(fds, false);
2673                                 if (k < 0) {
2674                                         log_error("Failed to unset O_CLOEXEC for file descriptors.");
2675                                         goto child_fail;
2676                                 }
2677
2678                                 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", n_fd_passed) < 0) ||
2679                                     (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0)) {
2680                                         log_oom();
2681                                         goto child_fail;
2682                                 }
2683                         }
2684
2685                         setup_hostname();
2686
2687                         if (arg_personality != 0xffffffffLU) {
2688                                 if (personality(arg_personality) < 0) {
2689                                         log_error("personality() failed: %m");
2690                                         goto child_fail;
2691                                 }
2692                         } else if (secondary) {
2693                                 if (personality(PER_LINUX32) < 0) {
2694                                         log_error("personality() failed: %m");
2695                                         goto child_fail;
2696                                 }
2697                         }
2698
2699 #ifdef HAVE_SELINUX
2700                         if (arg_selinux_context)
2701                                 if (setexeccon((security_context_t) arg_selinux_context) < 0)
2702                                         log_error("setexeccon(\"%s\") failed: %m", arg_selinux_context);
2703 #endif
2704
2705                         if (!strv_isempty(arg_setenv)) {
2706                                 char **n;
2707
2708                                 n = strv_env_merge(2, envp, arg_setenv);
2709                                 if (!n) {
2710                                         log_oom();
2711                                         goto child_fail;
2712                                 }
2713
2714                                 env_use = n;
2715                         } else
2716                                 env_use = (char**) envp;
2717
2718                         /* Wait until the parent is ready with the setup, too... */
2719                         eventfd_read(parent_ready_fd, &x);
2720                         close_nointr_nofail(parent_ready_fd);
2721                         parent_ready_fd = -1;
2722
2723                         if (arg_boot) {
2724                                 char **a;
2725                                 size_t l;
2726
2727                                 /* Automatically search for the init system */
2728
2729                                 l = 1 + argc - optind;
2730                                 a = newa(char*, l + 1);
2731                                 memcpy(a + 1, argv + optind, l * sizeof(char*));
2732
2733                                 a[0] = (char*) "/usr/lib/systemd/systemd";
2734                                 execve(a[0], a, env_use);
2735
2736                                 a[0] = (char*) "/lib/systemd/systemd";
2737                                 execve(a[0], a, env_use);
2738
2739                                 a[0] = (char*) "/sbin/init";
2740                                 execve(a[0], a, env_use);
2741                         } else if (argc > optind)
2742                                 execvpe(argv[optind], argv + optind, env_use);
2743                         else {
2744                                 chdir(home ? home : "/root");
2745                                 execle("/bin/bash", "-bash", NULL, env_use);
2746                                 execle("/bin/sh", "-sh", NULL, env_use);
2747                         }
2748
2749                         log_error("execv() failed: %m");
2750
2751                 child_fail:
2752                         _exit(EXIT_FAILURE);
2753                 }
2754
2755                 fdset_free(fds);
2756                 fds = NULL;
2757
2758                 /* Wait until the child reported that it is ready with
2759                  * all it needs to do with privileges. After we got
2760                  * the notification we can make the process join its
2761                  * cgroup which might limit what it can do */
2762                 eventfd_read(child_ready_fd, &x);
2763
2764                 r = register_machine(pid);
2765                 if (r < 0)
2766                         goto finish;
2767
2768                 r = move_network_interfaces(pid);
2769                 if (r < 0)
2770                         goto finish;
2771
2772                 r = setup_veth(pid, veth_name);
2773                 if (r < 0)
2774                         goto finish;
2775
2776                 r = setup_bridge(veth_name);
2777                 if (r < 0)
2778                         goto finish;
2779
2780                 r = setup_macvlan(pid);
2781                 if (r < 0)
2782                         goto finish;
2783
2784                 /* Notify the child that the parent is ready with all
2785                  * its setup, and that the child can now hand over
2786                  * control to the code to run inside the container. */
2787                 eventfd_write(parent_ready_fd, 1);
2788
2789                 k = process_pty(master, &mask, arg_boot ? pid : 0, SIGRTMIN+3);
2790                 if (k < 0) {
2791                         r = EXIT_FAILURE;
2792                         break;
2793                 }
2794
2795                 if (!arg_quiet)
2796                         putc('\n', stdout);
2797
2798                 /* Kill if it is not dead yet anyway */
2799                 terminate_machine(pid);
2800
2801                 /* Redundant, but better safe than sorry */
2802                 kill(pid, SIGKILL);
2803
2804                 k = wait_for_terminate(pid, &status);
2805                 pid = 0;
2806
2807                 if (k < 0) {
2808                         r = EXIT_FAILURE;
2809                         break;
2810                 }
2811
2812                 if (status.si_code == CLD_EXITED) {
2813                         r = status.si_status;
2814                         if (status.si_status != 0) {
2815                                 log_error("Container %s failed with error code %i.", arg_machine, status.si_status);
2816                                 break;
2817                         }
2818
2819                         if (!arg_quiet)
2820                                 log_debug("Container %s exited successfully.", arg_machine);
2821                         break;
2822                 } else if (status.si_code == CLD_KILLED &&
2823                            status.si_status == SIGINT) {
2824
2825                         if (!arg_quiet)
2826                                 log_info("Container %s has been shut down.", arg_machine);
2827                         r = 0;
2828                         break;
2829                 } else if (status.si_code == CLD_KILLED &&
2830                            status.si_status == SIGHUP) {
2831
2832                         if (!arg_quiet)
2833                                 log_info("Container %s is being rebooted.", arg_machine);
2834                         continue;
2835                 } else if (status.si_code == CLD_KILLED ||
2836                            status.si_code == CLD_DUMPED) {
2837
2838                         log_error("Container %s terminated by signal %s.", arg_machine, signal_to_string(status.si_status));
2839                         r = EXIT_FAILURE;
2840                         break;
2841                 } else {
2842                         log_error("Container %s failed due to unknown reason.", arg_machine);
2843                         r = EXIT_FAILURE;
2844                         break;
2845                 }
2846         }
2847
2848 finish:
2849         loop_remove(loop_nr, &image_fd);
2850
2851         if (pid > 0)
2852                 kill(pid, SIGKILL);
2853
2854         free(arg_directory);
2855         free(arg_machine);
2856         free(arg_user);
2857         strv_free(arg_setenv);
2858         strv_free(arg_network_interfaces);
2859         strv_free(arg_network_macvlan);
2860         strv_free(arg_bind);
2861         strv_free(arg_bind_ro);
2862
2863         return r;
2864 }