chiark / gitweb /
units: make networkd pull in its own .busname unit
[elogind.git] / src / shared / machine-image.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4   This file is part of systemd.
5
6   Copyright 2013 Lennart Poettering
7
8   systemd is free software; you can redistribute it and/or modify it
9   under the terms of the GNU Lesser General Public License as published by
10   the Free Software Foundation; either version 2.1 of the License, or
11   (at your option) any later version.
12
13   systemd is distributed in the hope that it will be useful, but
14   WITHOUT ANY WARRANTY; without even the implied warranty of
15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16   Lesser General Public License for more details.
17
18   You should have received a copy of the GNU Lesser General Public License
19   along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <sys/statfs.h>
23 #include <linux/fs.h>
24 #include <fcntl.h>
25
26 #include "utf8.h"
27 #include "btrfs-util.h"
28 #include "path-util.h"
29 #include "copy.h"
30 #include "mkdir.h"
31 #include "rm-rf.h"
32 #include "machine-image.h"
33
/* Directories that are searched for images, in lookup order. The entries
 * are packed into one NUL-separated string so that the list can be walked
 * with NULSTR_FOREACH(). */
static const char image_search_path[] =
        "/var/lib/machines\0"
        "/var/lib/container\0"
        "/usr/local/lib/machines\0"
        "/usr/lib/machines\0";
39
40 Image *image_unref(Image *i) {
41         if (!i)
42                 return NULL;
43
44         free(i->name);
45         free(i->path);
46         free(i);
47         return NULL;
48 }
49
50 static int image_new(
51                 ImageType t,
52                 const char *pretty,
53                 const char *path,
54                 const char *filename,
55                 bool read_only,
56                 usec_t crtime,
57                 usec_t mtime,
58                 Image **ret) {
59
60         _cleanup_(image_unrefp) Image *i = NULL;
61
62         assert(t >= 0);
63         assert(t < _IMAGE_TYPE_MAX);
64         assert(pretty);
65         assert(filename);
66         assert(ret);
67
68         i = new0(Image, 1);
69         if (!i)
70                 return -ENOMEM;
71
72         i->type = t;
73         i->read_only = read_only;
74         i->crtime = crtime;
75         i->mtime = mtime;
76         i->usage = i->usage_exclusive = (uint64_t) -1;
77         i->limit = i->limit_exclusive = (uint64_t) -1;
78
79         i->name = strdup(pretty);
80         if (!i->name)
81                 return -ENOMEM;
82
83         if (path)
84                 i->path = strjoin(path, "/", filename, NULL);
85         else
86                 i->path = strdup(filename);
87
88         if (!i->path)
89                 return -ENOMEM;
90
91         path_kill_slashes(i->path);
92
93         *ret = i;
94         i = NULL;
95
96         return 0;
97 }
98
99 static int image_make(
100                 const char *pretty,
101                 int dfd,
102                 const char *path,
103                 const char *filename,
104                 Image **ret) {
105
106         struct stat st;
107         bool read_only;
108         int r;
109
110         assert(filename);
111
112         /* We explicitly *do* follow symlinks here, since we want to
113          * allow symlinking trees into /var/lib/machines/, and treat
114          * them normally. */
115
116         if (fstatat(dfd, filename, &st, 0) < 0)
117                 return -errno;
118
119         read_only =
120                 (path && path_startswith(path, "/usr")) ||
121                 (faccessat(dfd, filename, W_OK, AT_EACCESS) < 0 && errno == EROFS);
122
123         if (S_ISDIR(st.st_mode)) {
124                 _cleanup_close_ int fd = -1;
125                 unsigned file_attr = 0;
126
127                 if (!ret)
128                         return 1;
129
130                 if (!pretty)
131                         pretty = filename;
132
133                 fd = openat(dfd, filename, O_CLOEXEC|O_NOCTTY|O_DIRECTORY);
134                 if (fd < 0)
135                         return -errno;
136
137                 /* btrfs subvolumes have inode 256 */
138                 if (st.st_ino == 256) {
139
140                         r = btrfs_is_filesystem(fd);
141                         if (r < 0)
142                                 return r;
143                         if (r) {
144                                 BtrfsSubvolInfo info;
145                                 BtrfsQuotaInfo quota;
146
147                                 /* It's a btrfs subvolume */
148
149                                 r = btrfs_subvol_get_info_fd(fd, &info);
150                                 if (r < 0)
151                                         return r;
152
153                                 r = image_new(IMAGE_SUBVOLUME,
154                                               pretty,
155                                               path,
156                                               filename,
157                                               info.read_only || read_only,
158                                               info.otime,
159                                               0,
160                                               ret);
161                                 if (r < 0)
162                                         return r;
163
164                                 r = btrfs_subvol_get_quota_fd(fd, &quota);
165                                 if (r >= 0) {
166                                         (*ret)->usage = quota.referenced;
167                                         (*ret)->usage_exclusive = quota.exclusive;
168
169                                         (*ret)->limit = quota.referenced_max;
170                                         (*ret)->limit_exclusive = quota.exclusive_max;
171                                 }
172
173                                 return 1;
174                         }
175                 }
176
177                 /* If the IMMUTABLE bit is set, we consider the
178                  * directory read-only. Since the ioctl is not
179                  * supported everywhere we ignore failures. */
180                 (void) read_attr_fd(fd, &file_attr);
181
182                 /* It's just a normal directory. */
183                 r = image_new(IMAGE_DIRECTORY,
184                               pretty,
185                               path,
186                               filename,
187                               read_only || (file_attr & FS_IMMUTABLE_FL),
188                               0,
189                               0,
190                               ret);
191                 if (r < 0)
192                         return r;
193
194                 return 1;
195
196         } else if (S_ISREG(st.st_mode) && endswith(filename, ".raw")) {
197                 usec_t crtime = 0;
198
199                 /* It's a RAW disk image */
200
201                 if (!ret)
202                         return 1;
203
204                 fd_getcrtime_at(dfd, filename, &crtime, 0);
205
206                 if (!pretty)
207                         pretty = strndupa(filename, strlen(filename) - 4);
208
209                 r = image_new(IMAGE_RAW,
210                               pretty,
211                               path,
212                               filename,
213                               !(st.st_mode & 0222) || read_only,
214                               crtime,
215                               timespec_load(&st.st_mtim),
216                               ret);
217                 if (r < 0)
218                         return r;
219
220                 (*ret)->usage = (*ret)->usage_exclusive = st.st_blocks * 512;
221                 (*ret)->limit = (*ret)->limit_exclusive = st.st_size;
222
223                 return 1;
224         }
225
226         return 0;
227 }
228
229 int image_find(const char *name, Image **ret) {
230         const char *path;
231         int r;
232
233         assert(name);
234
235         /* There are no images with invalid names */
236         if (!image_name_is_valid(name))
237                 return 0;
238
239         NULSTR_FOREACH(path, image_search_path) {
240                 _cleanup_closedir_ DIR *d = NULL;
241
242                 d = opendir(path);
243                 if (!d) {
244                         if (errno == ENOENT)
245                                 continue;
246
247                         return -errno;
248                 }
249
250                 r = image_make(NULL, dirfd(d), path, name, ret);
251                 if (r == 0 || r == -ENOENT) {
252                         _cleanup_free_ char *raw = NULL;
253
254                         raw = strappend(name, ".raw");
255                         if (!raw)
256                                 return -ENOMEM;
257
258                         r = image_make(NULL, dirfd(d), path, raw, ret);
259                         if (r == 0 || r == -ENOENT)
260                                 continue;
261                 }
262                 if (r < 0)
263                         return r;
264
265                 return 1;
266         }
267
268         if (streq(name, ".host"))
269                 return image_make(".host", AT_FDCWD, NULL, "/", ret);
270
271         return 0;
272 };
273
274 int image_discover(Hashmap *h) {
275         const char *path;
276         int r;
277
278         assert(h);
279
280         NULSTR_FOREACH(path, image_search_path) {
281                 _cleanup_closedir_ DIR *d = NULL;
282                 struct dirent *de;
283
284                 d = opendir(path);
285                 if (!d) {
286                         if (errno == ENOENT)
287                                 continue;
288
289                         return -errno;
290                 }
291
292                 FOREACH_DIRENT_ALL(de, d, return -errno) {
293                         _cleanup_(image_unrefp) Image *image = NULL;
294
295                         if (!image_name_is_valid(de->d_name))
296                                 continue;
297
298                         if (hashmap_contains(h, de->d_name))
299                                 continue;
300
301                         r = image_make(NULL, dirfd(d), path, de->d_name, &image);
302                         if (r == 0 || r == -ENOENT)
303                                 continue;
304                         if (r < 0)
305                                 return r;
306
307                         r = hashmap_put(h, image->name, image);
308                         if (r < 0)
309                                 return r;
310
311                         image = NULL;
312                 }
313         }
314
315         if (!hashmap_contains(h, ".host")) {
316                 _cleanup_(image_unrefp) Image *image = NULL;
317
318                 r = image_make(".host", AT_FDCWD, NULL, "/", &image);
319                 if (r < 0)
320                         return r;
321
322                 r = hashmap_put(h, image->name, image);
323                 if (r < 0)
324                         return r;
325
326                 image = NULL;
327
328         }
329
330         return 0;
331 }
332
333 void image_hashmap_free(Hashmap *map) {
334         Image *i;
335
336         while ((i = hashmap_steal_first(map)))
337                 image_unref(i);
338
339         hashmap_free(map);
340 }
341
342 int image_remove(Image *i) {
343         _cleanup_release_lock_file_ LockFile global_lock = LOCK_FILE_INIT, local_lock = LOCK_FILE_INIT;
344         int r;
345
346         assert(i);
347
348         if (path_equal(i->path, "/") ||
349             path_startswith(i->path, "/usr"))
350                 return -EROFS;
351
352         /* Make sure we don't interfere with a running nspawn */
353         r = image_path_lock(i->path, LOCK_EX|LOCK_NB, &global_lock, &local_lock);
354         if (r < 0)
355                 return r;
356
357         switch (i->type) {
358
359         case IMAGE_SUBVOLUME:
360                 return btrfs_subvol_remove(i->path, true);
361
362         case IMAGE_DIRECTORY:
363                 /* Allow deletion of read-only directories */
364                 (void) chattr_path(i->path, false, FS_IMMUTABLE_FL);
365                 return rm_rf(i->path, REMOVE_ROOT|REMOVE_PHYSICAL|REMOVE_SUBVOLUME);
366
367         case IMAGE_RAW:
368                 if (unlink(i->path) < 0)
369                         return -errno;
370
371                 return 0;
372
373         default:
374                 return -EOPNOTSUPP;
375         }
376 }
377
378 int image_rename(Image *i, const char *new_name) {
379         _cleanup_release_lock_file_ LockFile global_lock = LOCK_FILE_INIT, local_lock = LOCK_FILE_INIT, name_lock = LOCK_FILE_INIT;
380         _cleanup_free_ char *new_path = NULL, *nn = NULL;
381         unsigned file_attr = 0;
382         int r;
383
384         assert(i);
385
386         if (!image_name_is_valid(new_name))
387                 return -EINVAL;
388
389         if (path_equal(i->path, "/") ||
390             path_startswith(i->path, "/usr"))
391                 return -EROFS;
392
393         /* Make sure we don't interfere with a running nspawn */
394         r = image_path_lock(i->path, LOCK_EX|LOCK_NB, &global_lock, &local_lock);
395         if (r < 0)
396                 return r;
397
398         /* Make sure nobody takes the new name, between the time we
399          * checked it is currently unused in all search paths, and the
400          * time we take possesion of it */
401         r = image_name_lock(new_name, LOCK_EX|LOCK_NB, &name_lock);
402         if (r < 0)
403                 return r;
404
405         r = image_find(new_name, NULL);
406         if (r < 0)
407                 return r;
408         if (r > 0)
409                 return -EEXIST;
410
411         switch (i->type) {
412
413         case IMAGE_DIRECTORY:
414                 /* Turn of the immutable bit while we rename the image, so that we can rename it */
415                 (void) read_attr_path(i->path, &file_attr);
416
417                 if (file_attr & FS_IMMUTABLE_FL)
418                         (void) chattr_path(i->path, false, FS_IMMUTABLE_FL);
419
420                 /* fall through */
421
422         case IMAGE_SUBVOLUME:
423                 new_path = file_in_same_dir(i->path, new_name);
424                 break;
425
426         case IMAGE_RAW: {
427                 const char *fn;
428
429                 fn = strjoina(new_name, ".raw");
430                 new_path = file_in_same_dir(i->path, fn);
431                 break;
432         }
433
434         default:
435                 return -EOPNOTSUPP;
436         }
437
438         if (!new_path)
439                 return -ENOMEM;
440
441         nn = strdup(new_name);
442         if (!nn)
443                 return -ENOMEM;
444
445         r = rename_noreplace(AT_FDCWD, i->path, AT_FDCWD, new_path);
446         if (r < 0)
447                 return r;
448
449         /* Restore the immutable bit, if it was set before */
450         if (file_attr & FS_IMMUTABLE_FL)
451                 (void) chattr_path(new_path, true, FS_IMMUTABLE_FL);
452
453         free(i->path);
454         i->path = new_path;
455         new_path = NULL;
456
457         free(i->name);
458         i->name = nn;
459         nn = NULL;
460
461         return 0;
462 }
463
464 int image_clone(Image *i, const char *new_name, bool read_only) {
465         _cleanup_release_lock_file_ LockFile name_lock = LOCK_FILE_INIT;
466         const char *new_path;
467         int r;
468
469         assert(i);
470
471         if (!image_name_is_valid(new_name))
472                 return -EINVAL;
473
474         /* Make sure nobody takes the new name, between the time we
475          * checked it is currently unused in all search paths, and the
476          * time we take possesion of it */
477         r = image_name_lock(new_name, LOCK_EX|LOCK_NB, &name_lock);
478         if (r < 0)
479                 return r;
480
481         r = image_find(new_name, NULL);
482         if (r < 0)
483                 return r;
484         if (r > 0)
485                 return -EEXIST;
486
487         switch (i->type) {
488
489         case IMAGE_SUBVOLUME:
490         case IMAGE_DIRECTORY:
491                 new_path = strjoina("/var/lib/machines/", new_name);
492
493                 r = btrfs_subvol_snapshot(i->path, new_path, (read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) | BTRFS_SNAPSHOT_FALLBACK_COPY | BTRFS_SNAPSHOT_RECURSIVE);
494                 break;
495
496         case IMAGE_RAW:
497                 new_path = strjoina("/var/lib/machines/", new_name, ".raw");
498
499                 r = copy_file_atomic(i->path, new_path, read_only ? 0444 : 0644, false, FS_NOCOW_FL);
500                 break;
501
502         default:
503                 return -EOPNOTSUPP;
504         }
505
506         if (r < 0)
507                 return r;
508
509         return 0;
510 }
511
512 int image_read_only(Image *i, bool b) {
513         _cleanup_release_lock_file_ LockFile global_lock = LOCK_FILE_INIT, local_lock = LOCK_FILE_INIT;
514         int r;
515         assert(i);
516
517         if (path_equal(i->path, "/") ||
518             path_startswith(i->path, "/usr"))
519                 return -EROFS;
520
521         /* Make sure we don't interfere with a running nspawn */
522         r = image_path_lock(i->path, LOCK_EX|LOCK_NB, &global_lock, &local_lock);
523         if (r < 0)
524                 return r;
525
526         switch (i->type) {
527
528         case IMAGE_SUBVOLUME:
529                 r = btrfs_subvol_set_read_only(i->path, b);
530                 if (r < 0)
531                         return r;
532
533                 break;
534
535         case IMAGE_DIRECTORY:
536                 /* For simple directory trees we cannot use the access
537                    mode of the top-level directory, since it has an
538                    effect on the container itself.  However, we can
539                    use the "immutable" flag, to at least make the
540                    top-level directory read-only. It's not as good as
541                    a read-only subvolume, but at least something, and
542                    we can read the value back.*/
543
544                 r = chattr_path(i->path, b, FS_IMMUTABLE_FL);
545                 if (r < 0)
546                         return r;
547
548                 break;
549
550         case IMAGE_RAW: {
551                 struct stat st;
552
553                 if (stat(i->path, &st) < 0)
554                         return -errno;
555
556                 if (chmod(i->path, (st.st_mode & 0444) | (b ? 0000 : 0200)) < 0)
557                         return -errno;
558
559                 /* If the images is now read-only, it's a good time to
560                  * defrag it, given that no write patterns will
561                  * fragment it again. */
562                 if (b)
563                         (void) btrfs_defrag(i->path);
564                 break;
565         }
566
567         default:
568                 return -EOPNOTSUPP;
569         }
570
571         return 0;
572 }
573
574 int image_path_lock(const char *path, int operation, LockFile *global, LockFile *local) {
575         _cleanup_free_ char *p = NULL;
576         LockFile t = LOCK_FILE_INIT;
577         struct stat st;
578         int r;
579
580         assert(path);
581         assert(global);
582         assert(local);
583
584         /* Locks an image path. This actually creates two locks: one
585          * "local" one, next to the image path itself, which might be
586          * shared via NFS. And another "global" one, in /run, that
587          * uses the device/inode number. This has the benefit that we
588          * can even lock a tree that is a mount point, correctly. */
589
590         if (path_equal(path, "/"))
591                 return -EBUSY;
592
593         if (!path_is_absolute(path))
594                 return -EINVAL;
595
596         if (stat(path, &st) >= 0) {
597                 if (asprintf(&p, "/run/systemd/nspawn/locks/inode-%lu:%lu", (unsigned long) st.st_dev, (unsigned long) st.st_ino) < 0)
598                         return -ENOMEM;
599         }
600
601         r = make_lock_file_for(path, operation, &t);
602         if (r < 0)
603                 return r;
604
605         if (p) {
606                 mkdir_p("/run/systemd/nspawn/locks", 0700);
607
608                 r = make_lock_file(p, operation, global);
609                 if (r < 0) {
610                         release_lock_file(&t);
611                         return r;
612                 }
613         }
614
615         *local = t;
616         return 0;
617 }
618
619 int image_set_limit(Image *i, uint64_t referenced_max) {
620         assert(i);
621
622         if (path_equal(i->path, "/") ||
623             path_startswith(i->path, "/usr"))
624                 return -EROFS;
625
626         if (i->type != IMAGE_SUBVOLUME)
627                 return -EOPNOTSUPP;
628
629         return btrfs_quota_limit(i->path, referenced_max);
630 }
631
632 int image_name_lock(const char *name, int operation, LockFile *ret) {
633         const char *p;
634
635         assert(name);
636         assert(ret);
637
638         /* Locks an image name, regardless of the precise path used. */
639
640         if (!image_name_is_valid(name))
641                 return -EINVAL;
642
643         if (streq(name, ".host"))
644                 return -EBUSY;
645
646         mkdir_p("/run/systemd/nspawn/locks", 0700);
647         p = strjoina("/run/systemd/nspawn/locks/name-", name);
648
649         return make_lock_file(p, operation, ret);
650 }
651
/* Checks whether a string may be used as an image name. A valid name
 * passes filename_is_valid(), is not flagged by string_has_cc(), is valid
 * UTF-8, and does not start with ".#" (that prefix is used by temporary
 * files for atomically creating new files). */
bool image_name_is_valid(const char *s) {
        return filename_is_valid(s) &&
                !string_has_cc(s, NULL) &&
                utf8_is_valid(s) &&
                !startswith(s, ".#");
}
668
669 static const char* const image_type_table[_IMAGE_TYPE_MAX] = {
670         [IMAGE_DIRECTORY] = "directory",
671         [IMAGE_SUBVOLUME] = "subvolume",
672         [IMAGE_RAW] = "raw",
673 };
674
675 DEFINE_STRING_TABLE_LOOKUP(image_type, ImageType);