chiark / gitweb /
fb72123f1a80e43b2ee2b38dc66c87c63a089089
[elogind.git] / src / shared / machine-image.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4   This file is part of systemd.
5
6   Copyright 2013 Lennart Poettering
7
8   systemd is free software; you can redistribute it and/or modify it
9   under the terms of the GNU Lesser General Public License as published by
10   the Free Software Foundation; either version 2.1 of the License, or
11   (at your option) any later version.
12
13   systemd is distributed in the hope that it will be useful, but
14   WITHOUT ANY WARRANTY; without even the implied warranty of
15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16   Lesser General Public License for more details.
17
18   You should have received a copy of the GNU Lesser General Public License
19   along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <sys/statfs.h>
23 #include <linux/fs.h>
24 #include <fcntl.h>
25
26 #include "utf8.h"
27 #include "btrfs-util.h"
28 #include "path-util.h"
29 #include "copy.h"
30 #include "mkdir.h"
31 #include "rm-rf.h"
32 #include "machine-image.h"
33
/* NUL-separated list of directories that are searched for images, in
 * order of precedence. The explicit "\0" after every entry plus the
 * implicit terminator of the literal yields the double-NUL end marker
 * that NULSTR_FOREACH() iterates up to. */
static const char image_search_path[] =
        "/var/lib/machines"       "\0"
        "/var/lib/container"      "\0"
        "/usr/local/lib/machines" "\0"
        "/usr/lib/machines"       "\0";
39
40 Image *image_unref(Image *i) {
41         if (!i)
42                 return NULL;
43
44         free(i->name);
45         free(i->path);
46         free(i);
47         return NULL;
48 }
49
50 static int image_new(
51                 ImageType t,
52                 const char *pretty,
53                 const char *path,
54                 const char *filename,
55                 bool read_only,
56                 usec_t crtime,
57                 usec_t mtime,
58                 Image **ret) {
59
60         _cleanup_(image_unrefp) Image *i = NULL;
61
62         assert(t >= 0);
63         assert(t < _IMAGE_TYPE_MAX);
64         assert(pretty);
65         assert(filename);
66         assert(ret);
67
68         i = new0(Image, 1);
69         if (!i)
70                 return -ENOMEM;
71
72         i->type = t;
73         i->read_only = read_only;
74         i->crtime = crtime;
75         i->mtime = mtime;
76         i->usage = i->usage_exclusive = (uint64_t) -1;
77         i->limit = i->limit_exclusive = (uint64_t) -1;
78
79         i->name = strdup(pretty);
80         if (!i->name)
81                 return -ENOMEM;
82
83         if (path)
84                 i->path = strjoin(path, "/", filename, NULL);
85         else
86                 i->path = strdup(filename);
87
88         if (!i->path)
89                 return -ENOMEM;
90
91         path_kill_slashes(i->path);
92
93         *ret = i;
94         i = NULL;
95
96         return 0;
97 }
98
99 static int image_make(
100                 const char *pretty,
101                 int dfd,
102                 const char *path,
103                 const char *filename,
104                 Image **ret) {
105
106         struct stat st;
107         bool read_only;
108         int r;
109
110         assert(filename);
111
112         /* We explicitly *do* follow symlinks here, since we want to
113          * allow symlinking trees into /var/lib/machines/, and treat
114          * them normally. */
115
116         if (fstatat(dfd, filename, &st, 0) < 0)
117                 return -errno;
118
119         read_only =
120                 (path && path_startswith(path, "/usr")) ||
121                 (faccessat(dfd, filename, W_OK, AT_EACCESS) < 0 && errno == EROFS);
122
123         if (S_ISDIR(st.st_mode)) {
124                 _cleanup_close_ int fd = -1;
125                 unsigned file_attr = 0;
126
127                 if (!ret)
128                         return 1;
129
130                 if (!pretty)
131                         pretty = filename;
132
133                 fd = openat(dfd, filename, O_CLOEXEC|O_NOCTTY|O_DIRECTORY);
134                 if (fd < 0)
135                         return -errno;
136
137                 /* btrfs subvolumes have inode 256 */
138                 if (st.st_ino == 256) {
139                         struct statfs sfs;
140
141                         if (fstatfs(fd, &sfs) < 0)
142                                 return -errno;
143
144                         if (F_TYPE_EQUAL(sfs.f_type, BTRFS_SUPER_MAGIC)) {
145                                 BtrfsSubvolInfo info;
146                                 BtrfsQuotaInfo quota;
147
148                                 /* It's a btrfs subvolume */
149
150                                 r = btrfs_subvol_get_info_fd(fd, &info);
151                                 if (r < 0)
152                                         return r;
153
154                                 r = image_new(IMAGE_SUBVOLUME,
155                                               pretty,
156                                               path,
157                                               filename,
158                                               info.read_only || read_only,
159                                               info.otime,
160                                               0,
161                                               ret);
162                                 if (r < 0)
163                                         return r;
164
165                                 r = btrfs_subvol_get_quota_fd(fd, &quota);
166                                 if (r >= 0) {
167                                         (*ret)->usage = quota.referenced;
168                                         (*ret)->usage_exclusive = quota.exclusive;
169
170                                         (*ret)->limit = quota.referenced_max;
171                                         (*ret)->limit_exclusive = quota.exclusive_max;
172                                 }
173
174                                 return 1;
175                         }
176                 }
177
178                 /* If the IMMUTABLE bit is set, we consider the
179                  * directory read-only. Since the ioctl is not
180                  * supported everywhere we ignore failures. */
181                 (void) read_attr_fd(fd, &file_attr);
182
183                 /* It's just a normal directory. */
184                 r = image_new(IMAGE_DIRECTORY,
185                               pretty,
186                               path,
187                               filename,
188                               read_only || (file_attr & FS_IMMUTABLE_FL),
189                               0,
190                               0,
191                               ret);
192                 if (r < 0)
193                         return r;
194
195                 return 1;
196
197         } else if (S_ISREG(st.st_mode) && endswith(filename, ".raw")) {
198                 usec_t crtime = 0;
199
200                 /* It's a RAW disk image */
201
202                 if (!ret)
203                         return 1;
204
205                 fd_getcrtime_at(dfd, filename, &crtime, 0);
206
207                 if (!pretty)
208                         pretty = strndupa(filename, strlen(filename) - 4);
209
210                 r = image_new(IMAGE_RAW,
211                               pretty,
212                               path,
213                               filename,
214                               !(st.st_mode & 0222) || read_only,
215                               crtime,
216                               timespec_load(&st.st_mtim),
217                               ret);
218                 if (r < 0)
219                         return r;
220
221                 (*ret)->usage = (*ret)->usage_exclusive = st.st_blocks * 512;
222                 (*ret)->limit = (*ret)->limit_exclusive = st.st_size;
223
224                 return 1;
225         }
226
227         return 0;
228 }
229
230 int image_find(const char *name, Image **ret) {
231         const char *path;
232         int r;
233
234         assert(name);
235
236         /* There are no images with invalid names */
237         if (!image_name_is_valid(name))
238                 return 0;
239
240         NULSTR_FOREACH(path, image_search_path) {
241                 _cleanup_closedir_ DIR *d = NULL;
242
243                 d = opendir(path);
244                 if (!d) {
245                         if (errno == ENOENT)
246                                 continue;
247
248                         return -errno;
249                 }
250
251                 r = image_make(NULL, dirfd(d), path, name, ret);
252                 if (r == 0 || r == -ENOENT) {
253                         _cleanup_free_ char *raw = NULL;
254
255                         raw = strappend(name, ".raw");
256                         if (!raw)
257                                 return -ENOMEM;
258
259                         r = image_make(NULL, dirfd(d), path, raw, ret);
260                         if (r == 0 || r == -ENOENT)
261                                 continue;
262                 }
263                 if (r < 0)
264                         return r;
265
266                 return 1;
267         }
268
269         if (streq(name, ".host"))
270                 return image_make(".host", AT_FDCWD, NULL, "/", ret);
271
272         return 0;
273 };
274
275 int image_discover(Hashmap *h) {
276         const char *path;
277         int r;
278
279         assert(h);
280
281         NULSTR_FOREACH(path, image_search_path) {
282                 _cleanup_closedir_ DIR *d = NULL;
283                 struct dirent *de;
284
285                 d = opendir(path);
286                 if (!d) {
287                         if (errno == ENOENT)
288                                 continue;
289
290                         return -errno;
291                 }
292
293                 FOREACH_DIRENT_ALL(de, d, return -errno) {
294                         _cleanup_(image_unrefp) Image *image = NULL;
295
296                         if (!image_name_is_valid(de->d_name))
297                                 continue;
298
299                         if (hashmap_contains(h, de->d_name))
300                                 continue;
301
302                         r = image_make(NULL, dirfd(d), path, de->d_name, &image);
303                         if (r == 0 || r == -ENOENT)
304                                 continue;
305                         if (r < 0)
306                                 return r;
307
308                         r = hashmap_put(h, image->name, image);
309                         if (r < 0)
310                                 return r;
311
312                         image = NULL;
313                 }
314         }
315
316         if (!hashmap_contains(h, ".host")) {
317                 _cleanup_(image_unrefp) Image *image = NULL;
318
319                 r = image_make(".host", AT_FDCWD, NULL, "/", &image);
320                 if (r < 0)
321                         return r;
322
323                 r = hashmap_put(h, image->name, image);
324                 if (r < 0)
325                         return r;
326
327                 image = NULL;
328
329         }
330
331         return 0;
332 }
333
334 void image_hashmap_free(Hashmap *map) {
335         Image *i;
336
337         while ((i = hashmap_steal_first(map)))
338                 image_unref(i);
339
340         hashmap_free(map);
341 }
342
343 int image_remove(Image *i) {
344         _cleanup_release_lock_file_ LockFile global_lock = LOCK_FILE_INIT, local_lock = LOCK_FILE_INIT;
345         int r;
346
347         assert(i);
348
349         if (path_equal(i->path, "/") ||
350             path_startswith(i->path, "/usr"))
351                 return -EROFS;
352
353         /* Make sure we don't interfere with a running nspawn */
354         r = image_path_lock(i->path, LOCK_EX|LOCK_NB, &global_lock, &local_lock);
355         if (r < 0)
356                 return r;
357
358         switch (i->type) {
359
360         case IMAGE_SUBVOLUME:
361                 return btrfs_subvol_remove(i->path, true);
362
363         case IMAGE_DIRECTORY:
364                 /* Allow deletion of read-only directories */
365                 (void) chattr_path(i->path, false, FS_IMMUTABLE_FL);
366                 return rm_rf(i->path, REMOVE_ROOT|REMOVE_PHYSICAL|REMOVE_SUBVOLUME);
367
368         case IMAGE_RAW:
369                 if (unlink(i->path) < 0)
370                         return -errno;
371
372                 return 0;
373
374         default:
375                 return -EOPNOTSUPP;
376         }
377 }
378
379 int image_rename(Image *i, const char *new_name) {
380         _cleanup_release_lock_file_ LockFile global_lock = LOCK_FILE_INIT, local_lock = LOCK_FILE_INIT, name_lock = LOCK_FILE_INIT;
381         _cleanup_free_ char *new_path = NULL, *nn = NULL;
382         unsigned file_attr = 0;
383         int r;
384
385         assert(i);
386
387         if (!image_name_is_valid(new_name))
388                 return -EINVAL;
389
390         if (path_equal(i->path, "/") ||
391             path_startswith(i->path, "/usr"))
392                 return -EROFS;
393
394         /* Make sure we don't interfere with a running nspawn */
395         r = image_path_lock(i->path, LOCK_EX|LOCK_NB, &global_lock, &local_lock);
396         if (r < 0)
397                 return r;
398
399         /* Make sure nobody takes the new name, between the time we
400          * checked it is currently unused in all search paths, and the
401          * time we take possesion of it */
402         r = image_name_lock(new_name, LOCK_EX|LOCK_NB, &name_lock);
403         if (r < 0)
404                 return r;
405
406         r = image_find(new_name, NULL);
407         if (r < 0)
408                 return r;
409         if (r > 0)
410                 return -EEXIST;
411
412         switch (i->type) {
413
414         case IMAGE_DIRECTORY:
415                 /* Turn of the immutable bit while we rename the image, so that we can rename it */
416                 (void) read_attr_path(i->path, &file_attr);
417
418                 if (file_attr & FS_IMMUTABLE_FL)
419                         (void) chattr_path(i->path, false, FS_IMMUTABLE_FL);
420
421                 /* fall through */
422
423         case IMAGE_SUBVOLUME:
424                 new_path = file_in_same_dir(i->path, new_name);
425                 break;
426
427         case IMAGE_RAW: {
428                 const char *fn;
429
430                 fn = strjoina(new_name, ".raw");
431                 new_path = file_in_same_dir(i->path, fn);
432                 break;
433         }
434
435         default:
436                 return -EOPNOTSUPP;
437         }
438
439         if (!new_path)
440                 return -ENOMEM;
441
442         nn = strdup(new_name);
443         if (!nn)
444                 return -ENOMEM;
445
446         r = rename_noreplace(AT_FDCWD, i->path, AT_FDCWD, new_path);
447         if (r < 0)
448                 return r;
449
450         /* Restore the immutable bit, if it was set before */
451         if (file_attr & FS_IMMUTABLE_FL)
452                 (void) chattr_path(new_path, true, FS_IMMUTABLE_FL);
453
454         free(i->path);
455         i->path = new_path;
456         new_path = NULL;
457
458         free(i->name);
459         i->name = nn;
460         nn = NULL;
461
462         return 0;
463 }
464
465 int image_clone(Image *i, const char *new_name, bool read_only) {
466         _cleanup_release_lock_file_ LockFile name_lock = LOCK_FILE_INIT;
467         const char *new_path;
468         int r;
469
470         assert(i);
471
472         if (!image_name_is_valid(new_name))
473                 return -EINVAL;
474
475         /* Make sure nobody takes the new name, between the time we
476          * checked it is currently unused in all search paths, and the
477          * time we take possesion of it */
478         r = image_name_lock(new_name, LOCK_EX|LOCK_NB, &name_lock);
479         if (r < 0)
480                 return r;
481
482         r = image_find(new_name, NULL);
483         if (r < 0)
484                 return r;
485         if (r > 0)
486                 return -EEXIST;
487
488         switch (i->type) {
489
490         case IMAGE_SUBVOLUME:
491         case IMAGE_DIRECTORY:
492                 new_path = strjoina("/var/lib/machines/", new_name);
493
494                 r = btrfs_subvol_snapshot(i->path, new_path, read_only, true);
495                 break;
496
497         case IMAGE_RAW:
498                 new_path = strjoina("/var/lib/machines/", new_name, ".raw");
499
500                 r = copy_file_atomic(i->path, new_path, read_only ? 0444 : 0644, false, FS_NOCOW_FL);
501                 break;
502
503         default:
504                 return -EOPNOTSUPP;
505         }
506
507         if (r < 0)
508                 return r;
509
510         return 0;
511 }
512
513 int image_read_only(Image *i, bool b) {
514         _cleanup_release_lock_file_ LockFile global_lock = LOCK_FILE_INIT, local_lock = LOCK_FILE_INIT;
515         int r;
516         assert(i);
517
518         if (path_equal(i->path, "/") ||
519             path_startswith(i->path, "/usr"))
520                 return -EROFS;
521
522         /* Make sure we don't interfere with a running nspawn */
523         r = image_path_lock(i->path, LOCK_EX|LOCK_NB, &global_lock, &local_lock);
524         if (r < 0)
525                 return r;
526
527         switch (i->type) {
528
529         case IMAGE_SUBVOLUME:
530                 r = btrfs_subvol_set_read_only(i->path, b);
531                 if (r < 0)
532                         return r;
533
534                 break;
535
536         case IMAGE_DIRECTORY:
537                 /* For simple directory trees we cannot use the access
538                    mode of the top-level directory, since it has an
539                    effect on the container itself.  However, we can
540                    use the "immutable" flag, to at least make the
541                    top-level directory read-only. It's not as good as
542                    a read-only subvolume, but at least something, and
543                    we can read the value back.*/
544
545                 r = chattr_path(i->path, b, FS_IMMUTABLE_FL);
546                 if (r < 0)
547                         return r;
548
549                 break;
550
551         case IMAGE_RAW: {
552                 struct stat st;
553
554                 if (stat(i->path, &st) < 0)
555                         return -errno;
556
557                 if (chmod(i->path, (st.st_mode & 0444) | (b ? 0000 : 0200)) < 0)
558                         return -errno;
559
560                 /* If the images is now read-only, it's a good time to
561                  * defrag it, given that no write patterns will
562                  * fragment it again. */
563                 if (b)
564                         (void) btrfs_defrag(i->path);
565                 break;
566         }
567
568         default:
569                 return -EOPNOTSUPP;
570         }
571
572         return 0;
573 }
574
575 int image_path_lock(const char *path, int operation, LockFile *global, LockFile *local) {
576         _cleanup_free_ char *p = NULL;
577         LockFile t = LOCK_FILE_INIT;
578         struct stat st;
579         int r;
580
581         assert(path);
582         assert(global);
583         assert(local);
584
585         /* Locks an image path. This actually creates two locks: one
586          * "local" one, next to the image path itself, which might be
587          * shared via NFS. And another "global" one, in /run, that
588          * uses the device/inode number. This has the benefit that we
589          * can even lock a tree that is a mount point, correctly. */
590
591         if (path_equal(path, "/"))
592                 return -EBUSY;
593
594         if (!path_is_absolute(path))
595                 return -EINVAL;
596
597         if (stat(path, &st) >= 0) {
598                 if (asprintf(&p, "/run/systemd/nspawn/locks/inode-%lu:%lu", (unsigned long) st.st_dev, (unsigned long) st.st_ino) < 0)
599                         return -ENOMEM;
600         }
601
602         r = make_lock_file_for(path, operation, &t);
603         if (r < 0)
604                 return r;
605
606         if (p) {
607                 mkdir_p("/run/systemd/nspawn/locks", 0600);
608
609                 r = make_lock_file(p, operation, global);
610                 if (r < 0) {
611                         release_lock_file(&t);
612                         return r;
613                 }
614         }
615
616         *local = t;
617         return 0;
618 }
619
620 int image_set_limit(Image *i, uint64_t referenced_max) {
621         assert(i);
622
623         if (path_equal(i->path, "/") ||
624             path_startswith(i->path, "/usr"))
625                 return -EROFS;
626
627         if (i->type != IMAGE_SUBVOLUME)
628                 return -EOPNOTSUPP;
629
630         return btrfs_quota_limit(i->path, referenced_max);
631 }
632
633 int image_name_lock(const char *name, int operation, LockFile *ret) {
634         const char *p;
635
636         assert(name);
637         assert(ret);
638
639         /* Locks an image name, regardless of the precise path used. */
640
641         if (!image_name_is_valid(name))
642                 return -EINVAL;
643
644         if (streq(name, ".host"))
645                 return -EBUSY;
646
647         mkdir_p("/run/systemd/nspawn/locks", 0600);
648         p = strjoina("/run/systemd/nspawn/locks/name-", name);
649
650         return make_lock_file(p, operation, ret);
651 }
652
/* Checks whether "s" is acceptable as image name: it has to pass
 * filename_is_valid(), must not contain control characters, has to be
 * valid UTF-8, and must not begin with ".#", since that prefix is used
 * for temporary files when creating new files atomically. */
bool image_name_is_valid(const char *s) {
        return filename_is_valid(s) &&
               !string_has_cc(s, NULL) &&
               utf8_is_valid(s) &&
               !startswith(s, ".#");
}
669
670 static const char* const image_type_table[_IMAGE_TYPE_MAX] = {
671         [IMAGE_DIRECTORY] = "directory",
672         [IMAGE_SUBVOLUME] = "subvolume",
673         [IMAGE_RAW] = "raw",
674 };
675
676 DEFINE_STRING_TABLE_LOOKUP(image_type, ImageType);