chiark / gitweb /
rm-rf: never cross mount points
[elogind.git] / src / shared / machine-image.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4   This file is part of systemd.
5
6   Copyright 2013 Lennart Poettering
7
8   systemd is free software; you can redistribute it and/or modify it
9   under the terms of the GNU Lesser General Public License as published by
10   the Free Software Foundation; either version 2.1 of the License, or
11   (at your option) any later version.
12
13   systemd is distributed in the hope that it will be useful, but
14   WITHOUT ANY WARRANTY; without even the implied warranty of
15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16   Lesser General Public License for more details.
17
18   You should have received a copy of the GNU Lesser General Public License
19   along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <sys/statfs.h>
23 #include <linux/fs.h>
24 #include <fcntl.h>
25
26 #include "utf8.h"
27 #include "btrfs-util.h"
28 #include "path-util.h"
29 #include "copy.h"
30 #include "mkdir.h"
31 #include "rm-rf.h"
32 #include "machine-image.h"
33
/* Directories scanned for images, stored as a NUL-separated string list
 * and iterated with NULSTR_FOREACH() by image_find() and
 * image_discover(). Entries below /usr are always reported as read-only
 * by image_make(). */
static const char image_search_path[] =
        "/var/lib/machines\0"           /* also the target directory of image_clone() */
        "/var/lib/container\0"
        "/usr/local/lib/machines\0"
        "/usr/lib/machines\0";
39
40 Image *image_unref(Image *i) {
41         if (!i)
42                 return NULL;
43
44         free(i->name);
45         free(i->path);
46         free(i);
47         return NULL;
48 }
49
50 static int image_new(
51                 ImageType t,
52                 const char *pretty,
53                 const char *path,
54                 const char *filename,
55                 bool read_only,
56                 usec_t crtime,
57                 usec_t mtime,
58                 Image **ret) {
59
60         _cleanup_(image_unrefp) Image *i = NULL;
61
62         assert(t >= 0);
63         assert(t < _IMAGE_TYPE_MAX);
64         assert(pretty);
65         assert(filename);
66         assert(ret);
67
68         i = new0(Image, 1);
69         if (!i)
70                 return -ENOMEM;
71
72         i->type = t;
73         i->read_only = read_only;
74         i->crtime = crtime;
75         i->mtime = mtime;
76         i->usage = i->usage_exclusive = (uint64_t) -1;
77         i->limit = i->limit_exclusive = (uint64_t) -1;
78
79         i->name = strdup(pretty);
80         if (!i->name)
81                 return -ENOMEM;
82
83         if (path)
84                 i->path = strjoin(path, "/", filename, NULL);
85         else
86                 i->path = strdup(filename);
87
88         if (!i->path)
89                 return -ENOMEM;
90
91         path_kill_slashes(i->path);
92
93         *ret = i;
94         i = NULL;
95
96         return 0;
97 }
98
99 static int image_make(
100                 const char *pretty,
101                 int dfd,
102                 const char *path,
103                 const char *filename,
104                 Image **ret) {
105
106         struct stat st;
107         bool read_only;
108         int r;
109
110         assert(filename);
111
112         /* We explicitly *do* follow symlinks here, since we want to
113          * allow symlinking trees into /var/lib/machines/, and treat
114          * them normally. */
115
116         if (fstatat(dfd, filename, &st, 0) < 0)
117                 return -errno;
118
119         read_only =
120                 (path && path_startswith(path, "/usr")) ||
121                 (faccessat(dfd, filename, W_OK, AT_EACCESS) < 0 && errno == EROFS);
122
123         if (S_ISDIR(st.st_mode)) {
124                 _cleanup_close_ int fd = -1;
125                 unsigned file_attr = 0;
126
127                 if (!ret)
128                         return 1;
129
130                 if (!pretty)
131                         pretty = filename;
132
133                 fd = openat(dfd, filename, O_CLOEXEC|O_NOCTTY|O_DIRECTORY);
134                 if (fd < 0)
135                         return -errno;
136
137                 /* btrfs subvolumes have inode 256 */
138                 if (st.st_ino == 256) {
139                         struct statfs sfs;
140
141                         if (fstatfs(fd, &sfs) < 0)
142                                 return -errno;
143
144                         if (F_TYPE_EQUAL(sfs.f_type, BTRFS_SUPER_MAGIC)) {
145                                 BtrfsSubvolInfo info;
146                                 BtrfsQuotaInfo quota;
147
148                                 /* It's a btrfs subvolume */
149
150                                 r = btrfs_subvol_get_info_fd(fd, &info);
151                                 if (r < 0)
152                                         return r;
153
154                                 r = image_new(IMAGE_SUBVOLUME,
155                                               pretty,
156                                               path,
157                                               filename,
158                                               info.read_only || read_only,
159                                               info.otime,
160                                               0,
161                                               ret);
162                                 if (r < 0)
163                                         return r;
164
165                                 r = btrfs_subvol_get_quota_fd(fd, &quota);
166                                 if (r >= 0) {
167                                         (*ret)->usage = quota.referenced;
168                                         (*ret)->usage_exclusive = quota.exclusive;
169
170                                         (*ret)->limit = quota.referenced_max;
171                                         (*ret)->limit_exclusive = quota.exclusive_max;
172                                 }
173
174                                 return 1;
175                         }
176                 }
177
178                 /* If the IMMUTABLE bit is set, we consider the
179                  * directory read-only. Since the ioctl is not
180                  * supported everywhere we ignore failures. */
181                 (void) read_attr_fd(fd, &file_attr);
182
183                 /* It's just a normal directory. */
184                 r = image_new(IMAGE_DIRECTORY,
185                               pretty,
186                               path,
187                               filename,
188                               read_only || (file_attr & FS_IMMUTABLE_FL),
189                               0,
190                               0,
191                               ret);
192                 if (r < 0)
193                         return r;
194
195                 return 1;
196
197         } else if (S_ISREG(st.st_mode) && endswith(filename, ".raw")) {
198                 usec_t crtime = 0;
199
200                 /* It's a RAW disk image */
201
202                 if (!ret)
203                         return 1;
204
205                 fd_getcrtime_at(dfd, filename, &crtime, 0);
206
207                 if (!pretty)
208                         pretty = strndupa(filename, strlen(filename) - 4);
209
210                 r = image_new(IMAGE_RAW,
211                               pretty,
212                               path,
213                               filename,
214                               !(st.st_mode & 0222) || read_only,
215                               crtime,
216                               timespec_load(&st.st_mtim),
217                               ret);
218                 if (r < 0)
219                         return r;
220
221                 (*ret)->usage = (*ret)->usage_exclusive = st.st_blocks * 512;
222                 (*ret)->limit = (*ret)->limit_exclusive = st.st_size;
223
224                 return 1;
225         }
226
227         return 0;
228 }
229
230 int image_find(const char *name, Image **ret) {
231         const char *path;
232         int r;
233
234         assert(name);
235
236         /* There are no images with invalid names */
237         if (!image_name_is_valid(name))
238                 return 0;
239
240         NULSTR_FOREACH(path, image_search_path) {
241                 _cleanup_closedir_ DIR *d = NULL;
242
243                 d = opendir(path);
244                 if (!d) {
245                         if (errno == ENOENT)
246                                 continue;
247
248                         return -errno;
249                 }
250
251                 r = image_make(NULL, dirfd(d), path, name, ret);
252                 if (r == 0 || r == -ENOENT) {
253                         _cleanup_free_ char *raw = NULL;
254
255                         raw = strappend(name, ".raw");
256                         if (!raw)
257                                 return -ENOMEM;
258
259                         r = image_make(NULL, dirfd(d), path, raw, ret);
260                         if (r == 0 || r == -ENOENT)
261                                 continue;
262                 }
263                 if (r < 0)
264                         return r;
265
266                 return 1;
267         }
268
269         if (streq(name, ".host"))
270                 return image_make(".host", AT_FDCWD, NULL, "/", ret);
271
272         return 0;
273 };
274
275 int image_discover(Hashmap *h) {
276         const char *path;
277         int r;
278
279         assert(h);
280
281         NULSTR_FOREACH(path, image_search_path) {
282                 _cleanup_closedir_ DIR *d = NULL;
283                 struct dirent *de;
284
285                 d = opendir(path);
286                 if (!d) {
287                         if (errno == ENOENT)
288                                 continue;
289
290                         return -errno;
291                 }
292
293                 FOREACH_DIRENT_ALL(de, d, return -errno) {
294                         _cleanup_(image_unrefp) Image *image = NULL;
295
296                         if (!image_name_is_valid(de->d_name))
297                                 continue;
298
299                         if (hashmap_contains(h, de->d_name))
300                                 continue;
301
302                         r = image_make(NULL, dirfd(d), path, de->d_name, &image);
303                         if (r == 0 || r == -ENOENT)
304                                 continue;
305                         if (r < 0)
306                                 return r;
307
308                         r = hashmap_put(h, image->name, image);
309                         if (r < 0)
310                                 return r;
311
312                         image = NULL;
313                 }
314         }
315
316         if (!hashmap_contains(h, ".host")) {
317                 _cleanup_(image_unrefp) Image *image = NULL;
318
319                 r = image_make(".host", AT_FDCWD, NULL, "/", &image);
320                 if (r < 0)
321                         return r;
322
323                 r = hashmap_put(h, image->name, image);
324                 if (r < 0)
325                         return r;
326
327                 image = NULL;
328
329         }
330
331         return 0;
332 }
333
334 void image_hashmap_free(Hashmap *map) {
335         Image *i;
336
337         while ((i = hashmap_steal_first(map)))
338                 image_unref(i);
339
340         hashmap_free(map);
341 }
342
343 int image_remove(Image *i) {
344         _cleanup_release_lock_file_ LockFile global_lock = LOCK_FILE_INIT, local_lock = LOCK_FILE_INIT;
345         int r;
346
347         assert(i);
348
349         if (path_equal(i->path, "/") ||
350             path_startswith(i->path, "/usr"))
351                 return -EROFS;
352
353         /* Make sure we don't interfere with a running nspawn */
354         r = image_path_lock(i->path, LOCK_EX|LOCK_NB, &global_lock, &local_lock);
355         if (r < 0)
356                 return r;
357
358         switch (i->type) {
359
360         case IMAGE_SUBVOLUME:
361                 return btrfs_subvol_remove(i->path);
362
363         case IMAGE_DIRECTORY:
364                 /* Allow deletion of read-only directories */
365                 (void) chattr_path(i->path, false, FS_IMMUTABLE_FL);
366
367                 /* fall through */
368
369         case IMAGE_RAW:
370                 return rm_rf(i->path, REMOVE_ROOT|REMOVE_PHYSICAL);
371
372         default:
373                 return -EOPNOTSUPP;
374         }
375 }
376
377 int image_rename(Image *i, const char *new_name) {
378         _cleanup_release_lock_file_ LockFile global_lock = LOCK_FILE_INIT, local_lock = LOCK_FILE_INIT, name_lock = LOCK_FILE_INIT;
379         _cleanup_free_ char *new_path = NULL, *nn = NULL;
380         unsigned file_attr = 0;
381         int r;
382
383         assert(i);
384
385         if (!image_name_is_valid(new_name))
386                 return -EINVAL;
387
388         if (path_equal(i->path, "/") ||
389             path_startswith(i->path, "/usr"))
390                 return -EROFS;
391
392         /* Make sure we don't interfere with a running nspawn */
393         r = image_path_lock(i->path, LOCK_EX|LOCK_NB, &global_lock, &local_lock);
394         if (r < 0)
395                 return r;
396
397         /* Make sure nobody takes the new name, between the time we
398          * checked it is currently unused in all search paths, and the
399          * time we take possesion of it */
400         r = image_name_lock(new_name, LOCK_EX|LOCK_NB, &name_lock);
401         if (r < 0)
402                 return r;
403
404         r = image_find(new_name, NULL);
405         if (r < 0)
406                 return r;
407         if (r > 0)
408                 return -EEXIST;
409
410         switch (i->type) {
411
412         case IMAGE_DIRECTORY:
413                 /* Turn of the immutable bit while we rename the image, so that we can rename it */
414                 (void) read_attr_path(i->path, &file_attr);
415
416                 if (file_attr & FS_IMMUTABLE_FL)
417                         (void) chattr_path(i->path, false, FS_IMMUTABLE_FL);
418
419                 /* fall through */
420
421         case IMAGE_SUBVOLUME:
422                 new_path = file_in_same_dir(i->path, new_name);
423                 break;
424
425         case IMAGE_RAW: {
426                 const char *fn;
427
428                 fn = strjoina(new_name, ".raw");
429                 new_path = file_in_same_dir(i->path, fn);
430                 break;
431         }
432
433         default:
434                 return -EOPNOTSUPP;
435         }
436
437         if (!new_path)
438                 return -ENOMEM;
439
440         nn = strdup(new_name);
441         if (!nn)
442                 return -ENOMEM;
443
444         r = rename_noreplace(AT_FDCWD, i->path, AT_FDCWD, new_path);
445         if (r < 0)
446                 return r;
447
448         /* Restore the immutable bit, if it was set before */
449         if (file_attr & FS_IMMUTABLE_FL)
450                 (void) chattr_path(new_path, true, FS_IMMUTABLE_FL);
451
452         free(i->path);
453         i->path = new_path;
454         new_path = NULL;
455
456         free(i->name);
457         i->name = nn;
458         nn = NULL;
459
460         return 0;
461 }
462
463 int image_clone(Image *i, const char *new_name, bool read_only) {
464         _cleanup_release_lock_file_ LockFile name_lock = LOCK_FILE_INIT;
465         const char *new_path;
466         int r;
467
468         assert(i);
469
470         if (!image_name_is_valid(new_name))
471                 return -EINVAL;
472
473         /* Make sure nobody takes the new name, between the time we
474          * checked it is currently unused in all search paths, and the
475          * time we take possesion of it */
476         r = image_name_lock(new_name, LOCK_EX|LOCK_NB, &name_lock);
477         if (r < 0)
478                 return r;
479
480         r = image_find(new_name, NULL);
481         if (r < 0)
482                 return r;
483         if (r > 0)
484                 return -EEXIST;
485
486         switch (i->type) {
487
488         case IMAGE_SUBVOLUME:
489         case IMAGE_DIRECTORY:
490                 new_path = strjoina("/var/lib/machines/", new_name);
491
492                 r = btrfs_subvol_snapshot(i->path, new_path, read_only, true);
493                 break;
494
495         case IMAGE_RAW:
496                 new_path = strjoina("/var/lib/machines/", new_name, ".raw");
497
498                 r = copy_file_atomic(i->path, new_path, read_only ? 0444 : 0644, false, FS_NOCOW_FL);
499                 break;
500
501         default:
502                 return -EOPNOTSUPP;
503         }
504
505         if (r < 0)
506                 return r;
507
508         return 0;
509 }
510
511 int image_read_only(Image *i, bool b) {
512         _cleanup_release_lock_file_ LockFile global_lock = LOCK_FILE_INIT, local_lock = LOCK_FILE_INIT;
513         int r;
514         assert(i);
515
516         if (path_equal(i->path, "/") ||
517             path_startswith(i->path, "/usr"))
518                 return -EROFS;
519
520         /* Make sure we don't interfere with a running nspawn */
521         r = image_path_lock(i->path, LOCK_EX|LOCK_NB, &global_lock, &local_lock);
522         if (r < 0)
523                 return r;
524
525         switch (i->type) {
526
527         case IMAGE_SUBVOLUME:
528                 r = btrfs_subvol_set_read_only(i->path, b);
529                 if (r < 0)
530                         return r;
531
532                 break;
533
534         case IMAGE_DIRECTORY:
535                 /* For simple directory trees we cannot use the access
536                    mode of the top-level directory, since it has an
537                    effect on the container itself.  However, we can
538                    use the "immutable" flag, to at least make the
539                    top-level directory read-only. It's not as good as
540                    a read-only subvolume, but at least something, and
541                    we can read the value back.*/
542
543                 r = chattr_path(i->path, b, FS_IMMUTABLE_FL);
544                 if (r < 0)
545                         return r;
546
547                 break;
548
549         case IMAGE_RAW: {
550                 struct stat st;
551
552                 if (stat(i->path, &st) < 0)
553                         return -errno;
554
555                 if (chmod(i->path, (st.st_mode & 0444) | (b ? 0000 : 0200)) < 0)
556                         return -errno;
557
558                 /* If the images is now read-only, it's a good time to
559                  * defrag it, given that no write patterns will
560                  * fragment it again. */
561                 if (b)
562                         (void) btrfs_defrag(i->path);
563                 break;
564         }
565
566         default:
567                 return -EOPNOTSUPP;
568         }
569
570         return 0;
571 }
572
573 int image_path_lock(const char *path, int operation, LockFile *global, LockFile *local) {
574         _cleanup_free_ char *p = NULL;
575         LockFile t = LOCK_FILE_INIT;
576         struct stat st;
577         int r;
578
579         assert(path);
580         assert(global);
581         assert(local);
582
583         /* Locks an image path. This actually creates two locks: one
584          * "local" one, next to the image path itself, which might be
585          * shared via NFS. And another "global" one, in /run, that
586          * uses the device/inode number. This has the benefit that we
587          * can even lock a tree that is a mount point, correctly. */
588
589         if (path_equal(path, "/"))
590                 return -EBUSY;
591
592         if (!path_is_absolute(path))
593                 return -EINVAL;
594
595         if (stat(path, &st) >= 0) {
596                 if (asprintf(&p, "/run/systemd/nspawn/locks/inode-%lu:%lu", (unsigned long) st.st_dev, (unsigned long) st.st_ino) < 0)
597                         return -ENOMEM;
598         }
599
600         r = make_lock_file_for(path, operation, &t);
601         if (r < 0)
602                 return r;
603
604         if (p) {
605                 mkdir_p("/run/systemd/nspawn/locks", 0600);
606
607                 r = make_lock_file(p, operation, global);
608                 if (r < 0) {
609                         release_lock_file(&t);
610                         return r;
611                 }
612         }
613
614         *local = t;
615         return 0;
616 }
617
618 int image_set_limit(Image *i, uint64_t referenced_max) {
619         assert(i);
620
621         if (path_equal(i->path, "/") ||
622             path_startswith(i->path, "/usr"))
623                 return -EROFS;
624
625         if (i->type != IMAGE_SUBVOLUME)
626                 return -EOPNOTSUPP;
627
628         return btrfs_quota_limit(i->path, referenced_max);
629 }
630
631 int image_name_lock(const char *name, int operation, LockFile *ret) {
632         const char *p;
633
634         assert(name);
635         assert(ret);
636
637         /* Locks an image name, regardless of the precise path used. */
638
639         if (!image_name_is_valid(name))
640                 return -EINVAL;
641
642         if (streq(name, ".host"))
643                 return -EBUSY;
644
645         mkdir_p("/run/systemd/nspawn/locks", 0600);
646         p = strjoina("/run/systemd/nspawn/locks/name-", name);
647
648         return make_lock_file(p, operation, ret);
649 }
650
/* Returns true if "s" is acceptable as an image name: it must pass
 * filename_is_valid(), must not be flagged by string_has_cc(), must
 * pass utf8_is_valid(), and must not start with ".#", the prefix used
 * for temporary files when creating new files atomically. */
bool image_name_is_valid(const char *s) {
        return filename_is_valid(s) &&
                !string_has_cc(s, NULL) &&
                utf8_is_valid(s) &&
                !startswith(s, ".#");
}
667
668 static const char* const image_type_table[_IMAGE_TYPE_MAX] = {
669         [IMAGE_DIRECTORY] = "directory",
670         [IMAGE_SUBVOLUME] = "subvolume",
671         [IMAGE_RAW] = "raw",
672 };
673
674 DEFINE_STRING_TABLE_LOOKUP(image_type, ImageType);