chiark / gitweb /
shared: the btrfs quota field is called "referenced" not "referred"
[elogind.git] / src / shared / machine-image.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4   This file is part of systemd.
5
6   Copyright 2013 Lennart Poettering
7
8   systemd is free software; you can redistribute it and/or modify it
9   under the terms of the GNU Lesser General Public License as published by
10   the Free Software Foundation; either version 2.1 of the License, or
11   (at your option) any later version.
12
13   systemd is distributed in the hope that it will be useful, but
14   WITHOUT ANY WARRANTY; without even the implied warranty of
15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16   Lesser General Public License for more details.
17
18   You should have received a copy of the GNU Lesser General Public License
19   along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <sys/statfs.h>
23 #include <linux/fs.h>
24 #include <fcntl.h>
25
26 #include "utf8.h"
27 #include "btrfs-util.h"
28 #include "path-util.h"
29 #include "copy.h"
30 #include "mkdir.h"
31 #include "machine-image.h"
32
/* Directories that are searched for images. The list is stored as a
 * sequence of NUL-terminated strings followed by an empty string, so
 * that it can be walked with NULSTR_FOREACH(). The entries are visited
 * in the order they are listed here. */
static const char image_search_path[] =
        "/var/lib/machines"       "\0"
        "/var/lib/container"      "\0"
        "/usr/local/lib/machines" "\0"
        "/usr/lib/machines"       "\0";
38
39 Image *image_unref(Image *i) {
40         if (!i)
41                 return NULL;
42
43         free(i->name);
44         free(i->path);
45         free(i);
46         return NULL;
47 }
48
49 static int image_new(
50                 ImageType t,
51                 const char *pretty,
52                 const char *path,
53                 const char *filename,
54                 bool read_only,
55                 usec_t crtime,
56                 usec_t mtime,
57                 Image **ret) {
58
59         _cleanup_(image_unrefp) Image *i = NULL;
60
61         assert(t >= 0);
62         assert(t < _IMAGE_TYPE_MAX);
63         assert(pretty);
64         assert(filename);
65         assert(ret);
66
67         i = new0(Image, 1);
68         if (!i)
69                 return -ENOMEM;
70
71         i->type = t;
72         i->read_only = read_only;
73         i->crtime = crtime;
74         i->mtime = mtime;
75         i->usage = i->usage_exclusive = (uint64_t) -1;
76         i->limit = i->limit_exclusive = (uint64_t) -1;
77
78         i->name = strdup(pretty);
79         if (!i->name)
80                 return -ENOMEM;
81
82         if (path)
83                 i->path = strjoin(path, "/", filename, NULL);
84         else
85                 i->path = strdup(filename);
86
87         if (!i->path)
88                 return -ENOMEM;
89
90         path_kill_slashes(i->path);
91
92         *ret = i;
93         i = NULL;
94
95         return 0;
96 }
97
98 static int image_make(
99                 const char *pretty,
100                 int dfd,
101                 const char *path,
102                 const char *filename,
103                 Image **ret) {
104
105         struct stat st;
106         bool read_only;
107         int r;
108
109         assert(filename);
110
111         /* We explicitly *do* follow symlinks here, since we want to
112          * allow symlinking trees into /var/lib/machines/, and treat
113          * them normally. */
114
115         if (fstatat(dfd, filename, &st, 0) < 0)
116                 return -errno;
117
118         read_only =
119                 (path && path_startswith(path, "/usr")) ||
120                 (faccessat(dfd, filename, W_OK, AT_EACCESS) < 0 && errno == EROFS);
121
122         if (S_ISDIR(st.st_mode)) {
123                 _cleanup_close_ int fd = -1;
124                 unsigned file_attr = 0;
125
126                 if (!ret)
127                         return 1;
128
129                 if (!pretty)
130                         pretty = filename;
131
132                 fd = openat(dfd, filename, O_CLOEXEC|O_NOCTTY|O_DIRECTORY);
133                 if (fd < 0)
134                         return -errno;
135
136                 /* btrfs subvolumes have inode 256 */
137                 if (st.st_ino == 256) {
138                         struct statfs sfs;
139
140                         if (fstatfs(fd, &sfs) < 0)
141                                 return -errno;
142
143                         if (F_TYPE_EQUAL(sfs.f_type, BTRFS_SUPER_MAGIC)) {
144                                 BtrfsSubvolInfo info;
145                                 BtrfsQuotaInfo quota;
146
147                                 /* It's a btrfs subvolume */
148
149                                 r = btrfs_subvol_get_info_fd(fd, &info);
150                                 if (r < 0)
151                                         return r;
152
153                                 r = image_new(IMAGE_SUBVOLUME,
154                                               pretty,
155                                               path,
156                                               filename,
157                                               info.read_only || read_only,
158                                               info.otime,
159                                               0,
160                                               ret);
161                                 if (r < 0)
162                                         return r;
163
164                                 r = btrfs_subvol_get_quota_fd(fd, &quota);
165                                 if (r >= 0) {
166                                         (*ret)->usage = quota.referenced;
167                                         (*ret)->usage_exclusive = quota.exclusive;
168
169                                         (*ret)->limit = quota.referenced_max;
170                                         (*ret)->limit_exclusive = quota.exclusive_max;
171                                 }
172
173                                 return 1;
174                         }
175                 }
176
177                 /* If the IMMUTABLE bit is set, we consider the
178                  * directory read-only. Since the ioctl is not
179                  * supported everywhere we ignore failures. */
180                 (void) read_attr_fd(fd, &file_attr);
181
182                 /* It's just a normal directory. */
183                 r = image_new(IMAGE_DIRECTORY,
184                               pretty,
185                               path,
186                               filename,
187                               read_only || (file_attr & FS_IMMUTABLE_FL),
188                               0,
189                               0,
190                               ret);
191                 if (r < 0)
192                         return r;
193
194                 return 1;
195
196         } else if (S_ISREG(st.st_mode) && endswith(filename, ".raw")) {
197                 usec_t crtime = 0;
198
199                 /* It's a RAW disk image */
200
201                 if (!ret)
202                         return 1;
203
204                 fd_getcrtime_at(dfd, filename, &crtime, 0);
205
206                 if (!pretty)
207                         pretty = strndupa(filename, strlen(filename) - 4);
208
209                 r = image_new(IMAGE_RAW,
210                               pretty,
211                               path,
212                               filename,
213                               !(st.st_mode & 0222) || read_only,
214                               crtime,
215                               timespec_load(&st.st_mtim),
216                               ret);
217                 if (r < 0)
218                         return r;
219
220                 (*ret)->usage = (*ret)->usage_exclusive = st.st_blocks * 512;
221                 (*ret)->limit = (*ret)->limit_exclusive = st.st_size;
222
223                 return 1;
224         }
225
226         return 0;
227 }
228
229 int image_find(const char *name, Image **ret) {
230         const char *path;
231         int r;
232
233         assert(name);
234
235         /* There are no images with invalid names */
236         if (!image_name_is_valid(name))
237                 return 0;
238
239         NULSTR_FOREACH(path, image_search_path) {
240                 _cleanup_closedir_ DIR *d = NULL;
241
242                 d = opendir(path);
243                 if (!d) {
244                         if (errno == ENOENT)
245                                 continue;
246
247                         return -errno;
248                 }
249
250                 r = image_make(NULL, dirfd(d), path, name, ret);
251                 if (r == 0 || r == -ENOENT) {
252                         _cleanup_free_ char *raw = NULL;
253
254                         raw = strappend(name, ".raw");
255                         if (!raw)
256                                 return -ENOMEM;
257
258                         r = image_make(NULL, dirfd(d), path, raw, ret);
259                         if (r == 0 || r == -ENOENT)
260                                 continue;
261                 }
262                 if (r < 0)
263                         return r;
264
265                 return 1;
266         }
267
268         if (streq(name, ".host"))
269                 return image_make(".host", AT_FDCWD, NULL, "/", ret);
270
271         return 0;
272 };
273
274 int image_discover(Hashmap *h) {
275         const char *path;
276         int r;
277
278         assert(h);
279
280         NULSTR_FOREACH(path, image_search_path) {
281                 _cleanup_closedir_ DIR *d = NULL;
282                 struct dirent *de;
283
284                 d = opendir(path);
285                 if (!d) {
286                         if (errno == ENOENT)
287                                 continue;
288
289                         return -errno;
290                 }
291
292                 FOREACH_DIRENT_ALL(de, d, return -errno) {
293                         _cleanup_(image_unrefp) Image *image = NULL;
294
295                         if (!image_name_is_valid(de->d_name))
296                                 continue;
297
298                         if (hashmap_contains(h, de->d_name))
299                                 continue;
300
301                         r = image_make(NULL, dirfd(d), path, de->d_name, &image);
302                         if (r == 0 || r == -ENOENT)
303                                 continue;
304                         if (r < 0)
305                                 return r;
306
307                         r = hashmap_put(h, image->name, image);
308                         if (r < 0)
309                                 return r;
310
311                         image = NULL;
312                 }
313         }
314
315         if (!hashmap_contains(h, ".host")) {
316                 _cleanup_(image_unrefp) Image *image = NULL;
317
318                 r = image_make(".host", AT_FDCWD, NULL, "/", &image);
319                 if (r < 0)
320                         return r;
321
322                 r = hashmap_put(h, image->name, image);
323                 if (r < 0)
324                         return r;
325
326                 image = NULL;
327
328         }
329
330         return 0;
331 }
332
333 void image_hashmap_free(Hashmap *map) {
334         Image *i;
335
336         while ((i = hashmap_steal_first(map)))
337                 image_unref(i);
338
339         hashmap_free(map);
340 }
341
342 int image_remove(Image *i) {
343         _cleanup_release_lock_file_ LockFile global_lock = LOCK_FILE_INIT, local_lock = LOCK_FILE_INIT;
344         int r;
345
346         assert(i);
347
348         if (path_equal(i->path, "/") ||
349             path_startswith(i->path, "/usr"))
350                 return -EROFS;
351
352         /* Make sure we don't interfere with a running nspawn */
353         r = image_path_lock(i->path, LOCK_EX|LOCK_NB, &global_lock, &local_lock);
354         if (r < 0)
355                 return r;
356
357         switch (i->type) {
358
359         case IMAGE_SUBVOLUME:
360                 return btrfs_subvol_remove(i->path);
361
362         case IMAGE_DIRECTORY:
363                 /* Allow deletion of read-only directories */
364                 (void) chattr_path(i->path, false, FS_IMMUTABLE_FL);
365
366                 /* fall through */
367
368         case IMAGE_RAW:
369                 return rm_rf_dangerous(i->path, false, true, false);
370
371         default:
372                 return -ENOTSUP;
373         }
374 }
375
376 int image_rename(Image *i, const char *new_name) {
377         _cleanup_release_lock_file_ LockFile global_lock = LOCK_FILE_INIT, local_lock = LOCK_FILE_INIT, name_lock = LOCK_FILE_INIT;
378         _cleanup_free_ char *new_path = NULL, *nn = NULL;
379         unsigned file_attr = 0;
380         int r;
381
382         assert(i);
383
384         if (!image_name_is_valid(new_name))
385                 return -EINVAL;
386
387         if (path_equal(i->path, "/") ||
388             path_startswith(i->path, "/usr"))
389                 return -EROFS;
390
391         /* Make sure we don't interfere with a running nspawn */
392         r = image_path_lock(i->path, LOCK_EX|LOCK_NB, &global_lock, &local_lock);
393         if (r < 0)
394                 return r;
395
396         /* Make sure nobody takes the new name, between the time we
397          * checked it is currently unused in all search paths, and the
398          * time we take possesion of it */
399         r = image_name_lock(new_name, LOCK_EX|LOCK_NB, &name_lock);
400         if (r < 0)
401                 return r;
402
403         r = image_find(new_name, NULL);
404         if (r < 0)
405                 return r;
406         if (r > 0)
407                 return -EEXIST;
408
409         switch (i->type) {
410
411         case IMAGE_DIRECTORY:
412                 /* Turn of the immutable bit while we rename the image, so that we can rename it */
413                 (void) read_attr_path(i->path, &file_attr);
414
415                 if (file_attr & FS_IMMUTABLE_FL)
416                         (void) chattr_path(i->path, false, FS_IMMUTABLE_FL);
417
418                 /* fall through */
419
420         case IMAGE_SUBVOLUME:
421                 new_path = file_in_same_dir(i->path, new_name);
422                 break;
423
424         case IMAGE_RAW: {
425                 const char *fn;
426
427                 fn = strjoina(new_name, ".raw");
428                 new_path = file_in_same_dir(i->path, fn);
429                 break;
430         }
431
432         default:
433                 return -ENOTSUP;
434         }
435
436         if (!new_path)
437                 return -ENOMEM;
438
439         nn = strdup(new_name);
440         if (!nn)
441                 return -ENOMEM;
442
443         if (renameat2(AT_FDCWD, i->path, AT_FDCWD, new_path, RENAME_NOREPLACE) < 0)
444                 return -errno;
445
446         /* Restore the immutable bit, if it was set before */
447         if (file_attr & FS_IMMUTABLE_FL)
448                 (void) chattr_path(new_path, true, FS_IMMUTABLE_FL);
449
450         free(i->path);
451         i->path = new_path;
452         new_path = NULL;
453
454         free(i->name);
455         i->name = nn;
456         nn = NULL;
457
458         return 0;
459 }
460
461 int image_clone(Image *i, const char *new_name, bool read_only) {
462         _cleanup_release_lock_file_ LockFile name_lock = LOCK_FILE_INIT;
463         const char *new_path;
464         int r;
465
466         assert(i);
467
468         if (!image_name_is_valid(new_name))
469                 return -EINVAL;
470
471         /* Make sure nobody takes the new name, between the time we
472          * checked it is currently unused in all search paths, and the
473          * time we take possesion of it */
474         r = image_name_lock(new_name, LOCK_EX|LOCK_NB, &name_lock);
475         if (r < 0)
476                 return r;
477
478         r = image_find(new_name, NULL);
479         if (r < 0)
480                 return r;
481         if (r > 0)
482                 return -EEXIST;
483
484         switch (i->type) {
485
486         case IMAGE_SUBVOLUME:
487         case IMAGE_DIRECTORY:
488                 new_path = strjoina("/var/lib/machines/", new_name);
489
490                 r = btrfs_subvol_snapshot(i->path, new_path, read_only, true);
491                 break;
492
493         case IMAGE_RAW:
494                 new_path = strjoina("/var/lib/machines/", new_name, ".raw");
495
496                 r = copy_file_atomic(i->path, new_path, read_only ? 0444 : 0644, false, FS_NOCOW_FL);
497                 break;
498
499         default:
500                 return -ENOTSUP;
501         }
502
503         if (r < 0)
504                 return r;
505
506         return 0;
507 }
508
509 int image_read_only(Image *i, bool b) {
510         _cleanup_release_lock_file_ LockFile global_lock = LOCK_FILE_INIT, local_lock = LOCK_FILE_INIT;
511         int r;
512         assert(i);
513
514         if (path_equal(i->path, "/") ||
515             path_startswith(i->path, "/usr"))
516                 return -EROFS;
517
518         /* Make sure we don't interfere with a running nspawn */
519         r = image_path_lock(i->path, LOCK_EX|LOCK_NB, &global_lock, &local_lock);
520         if (r < 0)
521                 return r;
522
523         switch (i->type) {
524
525         case IMAGE_SUBVOLUME:
526                 r = btrfs_subvol_set_read_only(i->path, b);
527                 if (r < 0)
528                         return r;
529
530                 break;
531
532         case IMAGE_DIRECTORY:
533                 /* For simple directory trees we cannot use the access
534                    mode of the top-level directory, since it has an
535                    effect on the container itself.  However, we can
536                    use the "immutable" flag, to at least make the
537                    top-level directory read-only. It's not as good as
538                    a read-only subvolume, but at least something, and
539                    we can read the value back.*/
540
541                 r = chattr_path(i->path, b, FS_IMMUTABLE_FL);
542                 if (r < 0)
543                         return r;
544
545                 break;
546
547         case IMAGE_RAW: {
548                 struct stat st;
549
550                 if (stat(i->path, &st) < 0)
551                         return -errno;
552
553                 if (chmod(i->path, (st.st_mode & 0444) | (b ? 0000 : 0200)) < 0)
554                         return -errno;
555
556                 /* If the images is now read-only, it's a good time to
557                  * defrag it, given that no write patterns will
558                  * fragment it again. */
559                 if (b)
560                         (void) btrfs_defrag(i->path);
561                 break;
562         }
563
564         default:
565                 return -ENOTSUP;
566         }
567
568         return 0;
569 }
570
571 int image_path_lock(const char *path, int operation, LockFile *global, LockFile *local) {
572         _cleanup_free_ char *p = NULL;
573         LockFile t = LOCK_FILE_INIT;
574         struct stat st;
575         int r;
576
577         assert(path);
578         assert(global);
579         assert(local);
580
581         /* Locks an image path. This actually creates two locks: one
582          * "local" one, next to the image path itself, which might be
583          * shared via NFS. And another "global" one, in /run, that
584          * uses the device/inode number. This has the benefit that we
585          * can even lock a tree that is a mount point, correctly. */
586
587         if (path_equal(path, "/"))
588                 return -EBUSY;
589
590         if (!path_is_absolute(path))
591                 return -EINVAL;
592
593         if (stat(path, &st) >= 0) {
594                 if (asprintf(&p, "/run/systemd/nspawn/locks/inode-%lu:%lu", (unsigned long) st.st_dev, (unsigned long) st.st_ino) < 0)
595                         return -ENOMEM;
596         }
597
598         r = make_lock_file_for(path, operation, &t);
599         if (r < 0)
600                 return r;
601
602         if (p) {
603                 mkdir_p("/run/systemd/nspawn/locks", 0600);
604
605                 r = make_lock_file(p, operation, global);
606                 if (r < 0) {
607                         release_lock_file(&t);
608                         return r;
609                 }
610         }
611
612         *local = t;
613         return 0;
614 }
615
616 int image_set_limit(Image *i, uint64_t referenced_max) {
617         assert(i);
618
619         if (path_equal(i->path, "/") ||
620             path_startswith(i->path, "/usr"))
621                 return -EROFS;
622
623         if (i->type != IMAGE_SUBVOLUME)
624                 return -ENOTSUP;
625
626         return btrfs_quota_limit(i->path, referenced_max);
627 }
628
629 int image_name_lock(const char *name, int operation, LockFile *ret) {
630         const char *p;
631
632         assert(name);
633         assert(ret);
634
635         /* Locks an image name, regardless of the precise path used. */
636
637         if (!image_name_is_valid(name))
638                 return -EINVAL;
639
640         if (streq(name, ".host"))
641                 return -EBUSY;
642
643         mkdir_p("/run/systemd/nspawn/locks", 0600);
644         p = strjoina("/run/systemd/nspawn/locks/name-", name);
645
646         return make_lock_file(p, operation, ret);
647 }
648
/* Checks whether 's' may be used as image name. A name is valid if all
 * of the following hold, checked in this order:
 *
 *   - filename_is_valid() accepts it
 *   - string_has_cc() finds no control characters in it
 *   - utf8_is_valid() accepts it
 *   - it does not begin with ".#", the prefix of the temporary files
 *     that are used for atomically creating new files
 */
bool image_name_is_valid(const char *s) {
        return filename_is_valid(s) &&
               !string_has_cc(s, NULL) &&
               utf8_is_valid(s) &&
               !startswith(s, ".#");
}
665
666 static const char* const image_type_table[_IMAGE_TYPE_MAX] = {
667         [IMAGE_DIRECTORY] = "directory",
668         [IMAGE_SUBVOLUME] = "subvolume",
669         [IMAGE_RAW] = "raw",
670 };
671
672 DEFINE_STRING_TABLE_LOOKUP(image_type, ImageType);