chiark / gitweb /
shared/capability: go frugal on space for caps
[elogind.git] / src / shared / machine-image.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4   This file is part of systemd.
5
6   Copyright 2013 Lennart Poettering
7
8   systemd is free software; you can redistribute it and/or modify it
9   under the terms of the GNU Lesser General Public License as published by
10   the Free Software Foundation; either version 2.1 of the License, or
11   (at your option) any later version.
12
13   systemd is distributed in the hope that it will be useful, but
14   WITHOUT ANY WARRANTY; without even the implied warranty of
15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16   Lesser General Public License for more details.
17
18   You should have received a copy of the GNU Lesser General Public License
19   along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <sys/statfs.h>
23 #include <linux/fs.h>
24 #include <fcntl.h>
25
26 #include "strv.h"
27 #include "utf8.h"
28 #include "btrfs-util.h"
29 #include "path-util.h"
30 #include "copy.h"
31 #include "mkdir.h"
32 #include "machine-image.h"
33
/* Directories searched for images, stored as a NUL-separated string
 * list that is walked with NULSTR_FOREACH(). Entries are tried in the
 * order given here, so earlier directories take precedence. */
static const char image_search_path[] =
        "/var/lib/machines\0"
        "/var/lib/container\0"
        "/usr/local/lib/machines\0"
        "/usr/lib/machines\0";
39
40 Image *image_unref(Image *i) {
41         if (!i)
42                 return NULL;
43
44         free(i->name);
45         free(i->path);
46         free(i);
47         return NULL;
48 }
49
50 static int image_new(
51                 ImageType t,
52                 const char *pretty,
53                 const char *path,
54                 const char *filename,
55                 bool read_only,
56                 usec_t crtime,
57                 usec_t mtime,
58                 Image **ret) {
59
60         _cleanup_(image_unrefp) Image *i = NULL;
61
62         assert(t >= 0);
63         assert(t < _IMAGE_TYPE_MAX);
64         assert(pretty);
65         assert(filename);
66         assert(ret);
67
68         i = new0(Image, 1);
69         if (!i)
70                 return -ENOMEM;
71
72         i->type = t;
73         i->read_only = read_only;
74         i->crtime = crtime;
75         i->mtime = mtime;
76         i->usage = i->usage_exclusive = (uint64_t) -1;
77         i->limit = i->limit_exclusive = (uint64_t) -1;
78
79         i->name = strdup(pretty);
80         if (!i->name)
81                 return -ENOMEM;
82
83         if (path)
84                 i->path = strjoin(path, "/", filename, NULL);
85         else
86                 i->path = strdup(filename);
87
88         if (!i->path)
89                 return -ENOMEM;
90
91         path_kill_slashes(i->path);
92
93         *ret = i;
94         i = NULL;
95
96         return 0;
97 }
98
99 static int image_make(
100                 const char *pretty,
101                 int dfd,
102                 const char *path,
103                 const char *filename,
104                 Image **ret) {
105
106         struct stat st;
107         bool read_only;
108         int r;
109
110         assert(filename);
111
112         /* We explicitly *do* follow symlinks here, since we want to
113          * allow symlinking trees into /var/lib/machines/, and treat
114          * them normally. */
115
116         if (fstatat(dfd, filename, &st, 0) < 0)
117                 return -errno;
118
119         read_only =
120                 (path && path_startswith(path, "/usr")) ||
121                 (faccessat(dfd, filename, W_OK, AT_EACCESS) < 0 && errno == EROFS);
122
123         if (S_ISDIR(st.st_mode)) {
124                 _cleanup_close_ int fd = -1;
125                 unsigned file_attr = 0;
126
127                 if (!ret)
128                         return 1;
129
130                 if (!pretty)
131                         pretty = filename;
132
133                 fd = openat(dfd, filename, O_CLOEXEC|O_NOCTTY|O_DIRECTORY);
134                 if (fd < 0)
135                         return -errno;
136
137                 /* btrfs subvolumes have inode 256 */
138                 if (st.st_ino == 256) {
139                         struct statfs sfs;
140
141                         if (fstatfs(fd, &sfs) < 0)
142                                 return -errno;
143
144                         if (F_TYPE_EQUAL(sfs.f_type, BTRFS_SUPER_MAGIC)) {
145                                 BtrfsSubvolInfo info;
146                                 BtrfsQuotaInfo quota;
147
148                                 /* It's a btrfs subvolume */
149
150                                 r = btrfs_subvol_get_info_fd(fd, &info);
151                                 if (r < 0)
152                                         return r;
153
154                                 r = image_new(IMAGE_SUBVOLUME,
155                                               pretty,
156                                               path,
157                                               filename,
158                                               info.read_only || read_only,
159                                               info.otime,
160                                               0,
161                                               ret);
162                                 if (r < 0)
163                                         return r;
164
165                                 r = btrfs_subvol_get_quota_fd(fd, &quota);
166                                 if (r >= 0) {
167                                         (*ret)->usage = quota.referred;
168                                         (*ret)->usage_exclusive = quota.exclusive;
169
170                                         (*ret)->limit = quota.referred_max;
171                                         (*ret)->limit_exclusive = quota.exclusive_max;
172                                 }
173
174                                 return 1;
175                         }
176                 }
177
178                 /* If the IMMUTABLE bit is set, we consider the
179                  * directory read-only. Since the ioctl is not
180                  * supported everywhere we ignore failures. */
181                 (void) read_attr_fd(fd, &file_attr);
182
183                 /* It's just a normal directory. */
184                 r = image_new(IMAGE_DIRECTORY,
185                               pretty,
186                               path,
187                               filename,
188                               read_only || (file_attr & FS_IMMUTABLE_FL),
189                               0,
190                               0,
191                               ret);
192                 if (r < 0)
193                         return r;
194
195                 return 1;
196
197         } else if (S_ISREG(st.st_mode) && endswith(filename, ".raw")) {
198                 usec_t crtime = 0;
199
200                 /* It's a RAW disk image */
201
202                 if (!ret)
203                         return 1;
204
205                 fd_getcrtime_at(dfd, filename, &crtime, 0);
206
207                 if (!pretty)
208                         pretty = strndupa(filename, strlen(filename) - 4);
209
210                 r = image_new(IMAGE_RAW,
211                               pretty,
212                               path,
213                               filename,
214                               !(st.st_mode & 0222) || read_only,
215                               crtime,
216                               timespec_load(&st.st_mtim),
217                               ret);
218                 if (r < 0)
219                         return r;
220
221                 (*ret)->usage = (*ret)->usage_exclusive = st.st_blocks * 512;
222                 (*ret)->limit = (*ret)->limit_exclusive = st.st_size;
223
224                 return 1;
225         }
226
227         return 0;
228 }
229
230 int image_find(const char *name, Image **ret) {
231         const char *path;
232         int r;
233
234         assert(name);
235
236         /* There are no images with invalid names */
237         if (!image_name_is_valid(name))
238                 return 0;
239
240         NULSTR_FOREACH(path, image_search_path) {
241                 _cleanup_closedir_ DIR *d = NULL;
242
243                 d = opendir(path);
244                 if (!d) {
245                         if (errno == ENOENT)
246                                 continue;
247
248                         return -errno;
249                 }
250
251                 r = image_make(NULL, dirfd(d), path, name, ret);
252                 if (r == 0 || r == -ENOENT) {
253                         _cleanup_free_ char *raw = NULL;
254
255                         raw = strappend(name, ".raw");
256                         if (!raw)
257                                 return -ENOMEM;
258
259                         r = image_make(NULL, dirfd(d), path, raw, ret);
260                         if (r == 0 || r == -ENOENT)
261                                 continue;
262                 }
263                 if (r < 0)
264                         return r;
265
266                 return 1;
267         }
268
269         if (streq(name, ".host"))
270                 return image_make(".host", AT_FDCWD, NULL, "/", ret);
271
272         return 0;
273 };
274
275 int image_discover(Hashmap *h) {
276         const char *path;
277         int r;
278
279         assert(h);
280
281         NULSTR_FOREACH(path, image_search_path) {
282                 _cleanup_closedir_ DIR *d = NULL;
283                 struct dirent *de;
284
285                 d = opendir(path);
286                 if (!d) {
287                         if (errno == ENOENT)
288                                 continue;
289
290                         return -errno;
291                 }
292
293                 FOREACH_DIRENT_ALL(de, d, return -errno) {
294                         _cleanup_(image_unrefp) Image *image = NULL;
295
296                         if (!image_name_is_valid(de->d_name))
297                                 continue;
298
299                         if (hashmap_contains(h, de->d_name))
300                                 continue;
301
302                         r = image_make(NULL, dirfd(d), path, de->d_name, &image);
303                         if (r == 0 || r == -ENOENT)
304                                 continue;
305                         if (r < 0)
306                                 return r;
307
308                         r = hashmap_put(h, image->name, image);
309                         if (r < 0)
310                                 return r;
311
312                         image = NULL;
313                 }
314         }
315
316         if (!hashmap_contains(h, ".host")) {
317                 _cleanup_(image_unrefp) Image *image = NULL;
318
319                 r = image_make(".host", AT_FDCWD, NULL, "/", &image);
320                 if (r < 0)
321                         return r;
322
323                 r = hashmap_put(h, image->name, image);
324                 if (r < 0)
325                         return r;
326
327                 image = NULL;
328
329         }
330
331         return 0;
332 }
333
334 void image_hashmap_free(Hashmap *map) {
335         Image *i;
336
337         while ((i = hashmap_steal_first(map)))
338                 image_unref(i);
339
340         hashmap_free(map);
341 }
342
343 int image_remove(Image *i) {
344         _cleanup_release_lock_file_ LockFile global_lock = LOCK_FILE_INIT, local_lock = LOCK_FILE_INIT;
345         int r;
346
347         assert(i);
348
349         if (path_equal(i->path, "/") ||
350             path_startswith(i->path, "/usr"))
351                 return -EROFS;
352
353         /* Make sure we don't interfere with a running nspawn */
354         r = image_path_lock(i->path, LOCK_EX|LOCK_NB, &global_lock, &local_lock);
355         if (r < 0)
356                 return r;
357
358         switch (i->type) {
359
360         case IMAGE_SUBVOLUME:
361                 return btrfs_subvol_remove(i->path);
362
363         case IMAGE_DIRECTORY:
364                 /* Allow deletion of read-only directories */
365                 (void) chattr_path(i->path, false, FS_IMMUTABLE_FL);
366
367                 /* fall through */
368
369         case IMAGE_RAW:
370                 return rm_rf_dangerous(i->path, false, true, false);
371
372         default:
373                 return -ENOTSUP;
374         }
375 }
376
377 int image_rename(Image *i, const char *new_name) {
378         _cleanup_release_lock_file_ LockFile global_lock = LOCK_FILE_INIT, local_lock = LOCK_FILE_INIT, name_lock = LOCK_FILE_INIT;
379         _cleanup_free_ char *new_path = NULL, *nn = NULL;
380         unsigned file_attr = 0;
381         int r;
382
383         assert(i);
384
385         if (!image_name_is_valid(new_name))
386                 return -EINVAL;
387
388         if (path_equal(i->path, "/") ||
389             path_startswith(i->path, "/usr"))
390                 return -EROFS;
391
392         /* Make sure we don't interfere with a running nspawn */
393         r = image_path_lock(i->path, LOCK_EX|LOCK_NB, &global_lock, &local_lock);
394         if (r < 0)
395                 return r;
396
397         /* Make sure nobody takes the new name, between the time we
398          * checked it is currently unused in all search paths, and the
399          * time we take possesion of it */
400         r = image_name_lock(new_name, LOCK_EX|LOCK_NB, &name_lock);
401         if (r < 0)
402                 return r;
403
404         r = image_find(new_name, NULL);
405         if (r < 0)
406                 return r;
407         if (r > 0)
408                 return -EEXIST;
409
410         switch (i->type) {
411
412         case IMAGE_DIRECTORY:
413                 /* Turn of the immutable bit while we rename the image, so that we can rename it */
414                 (void) read_attr_path(i->path, &file_attr);
415
416                 if (file_attr & FS_IMMUTABLE_FL)
417                         (void) chattr_path(i->path, false, FS_IMMUTABLE_FL);
418
419                 /* fall through */
420
421         case IMAGE_SUBVOLUME:
422                 new_path = file_in_same_dir(i->path, new_name);
423                 break;
424
425         case IMAGE_RAW: {
426                 const char *fn;
427
428                 fn = strappenda(new_name, ".raw");
429                 new_path = file_in_same_dir(i->path, fn);
430                 break;
431         }
432
433         default:
434                 return -ENOTSUP;
435         }
436
437         if (!new_path)
438                 return -ENOMEM;
439
440         nn = strdup(new_name);
441         if (!nn)
442                 return -ENOMEM;
443
444         if (renameat2(AT_FDCWD, i->path, AT_FDCWD, new_path, RENAME_NOREPLACE) < 0)
445                 return -errno;
446
447         /* Restore the immutable bit, if it was set before */
448         if (file_attr & FS_IMMUTABLE_FL)
449                 (void) chattr_path(new_path, true, FS_IMMUTABLE_FL);
450
451         free(i->path);
452         i->path = new_path;
453         new_path = NULL;
454
455         free(i->name);
456         i->name = nn;
457         nn = NULL;
458
459         return 0;
460 }
461
462 int image_clone(Image *i, const char *new_name, bool read_only) {
463         _cleanup_release_lock_file_ LockFile name_lock = LOCK_FILE_INIT;
464         const char *new_path;
465         int r;
466
467         assert(i);
468
469         if (!image_name_is_valid(new_name))
470                 return -EINVAL;
471
472         /* Make sure nobody takes the new name, between the time we
473          * checked it is currently unused in all search paths, and the
474          * time we take possesion of it */
475         r = image_name_lock(new_name, LOCK_EX|LOCK_NB, &name_lock);
476         if (r < 0)
477                 return r;
478
479         r = image_find(new_name, NULL);
480         if (r < 0)
481                 return r;
482         if (r > 0)
483                 return -EEXIST;
484
485         switch (i->type) {
486
487         case IMAGE_SUBVOLUME:
488         case IMAGE_DIRECTORY:
489                 new_path = strappenda("/var/lib/machines/", new_name);
490
491                 r = btrfs_subvol_snapshot(i->path, new_path, read_only, true);
492                 break;
493
494         case IMAGE_RAW:
495                 new_path = strappenda("/var/lib/machines/", new_name, ".raw");
496
497                 r = copy_file_atomic(i->path, new_path, read_only ? 0444 : 0644, false, FS_NOCOW_FL);
498                 break;
499
500         default:
501                 return -ENOTSUP;
502         }
503
504         if (r < 0)
505                 return r;
506
507         return 0;
508 }
509
510 int image_read_only(Image *i, bool b) {
511         _cleanup_release_lock_file_ LockFile global_lock = LOCK_FILE_INIT, local_lock = LOCK_FILE_INIT;
512         int r;
513         assert(i);
514
515         if (path_equal(i->path, "/") ||
516             path_startswith(i->path, "/usr"))
517                 return -EROFS;
518
519         /* Make sure we don't interfere with a running nspawn */
520         r = image_path_lock(i->path, LOCK_EX|LOCK_NB, &global_lock, &local_lock);
521         if (r < 0)
522                 return r;
523
524         switch (i->type) {
525
526         case IMAGE_SUBVOLUME:
527                 r = btrfs_subvol_set_read_only(i->path, b);
528                 if (r < 0)
529                         return r;
530
531                 break;
532
533         case IMAGE_DIRECTORY:
534                 /* For simple directory trees we cannot use the access
535                    mode of the top-level directory, since it has an
536                    effect on the container itself.  However, we can
537                    use the "immutable" flag, to at least make the
538                    top-level directory read-only. It's not as good as
539                    a read-only subvolume, but at least something, and
540                    we can read the value back.*/
541
542                 r = chattr_path(i->path, b, FS_IMMUTABLE_FL);
543                 if (r < 0)
544                         return r;
545
546                 break;
547
548         case IMAGE_RAW: {
549                 struct stat st;
550
551                 if (stat(i->path, &st) < 0)
552                         return -errno;
553
554                 if (chmod(i->path, (st.st_mode & 0444) | (b ? 0000 : 0200)) < 0)
555                         return -errno;
556
557                 /* If the images is now read-only, it's a good time to
558                  * defrag it, given that no write patterns will
559                  * fragment it again. */
560                 if (b)
561                         (void) btrfs_defrag(i->path);
562                 break;
563         }
564
565         default:
566                 return -ENOTSUP;
567         }
568
569         return 0;
570 }
571
572 int image_path_lock(const char *path, int operation, LockFile *global, LockFile *local) {
573         _cleanup_free_ char *p = NULL;
574         LockFile t = LOCK_FILE_INIT;
575         struct stat st;
576         int r;
577
578         assert(path);
579         assert(global);
580         assert(local);
581
582         /* Locks an image path. This actually creates two locks: one
583          * "local" one, next to the image path itself, which might be
584          * shared via NFS. And another "global" one, in /run, that
585          * uses the device/inode number. This has the benefit that we
586          * can even lock a tree that is a mount point, correctly. */
587
588         if (path_equal(path, "/"))
589                 return -EBUSY;
590
591         if (!path_is_absolute(path))
592                 return -EINVAL;
593
594         if (stat(path, &st) >= 0) {
595                 if (asprintf(&p, "/run/systemd/nspawn/locks/inode-%lu:%lu", (unsigned long) st.st_dev, (unsigned long) st.st_ino) < 0)
596                         return -ENOMEM;
597         }
598
599         r = make_lock_file_for(path, operation, &t);
600         if (r < 0)
601                 return r;
602
603         if (p) {
604                 mkdir_p("/run/systemd/nspawn/locks", 0600);
605
606                 r = make_lock_file(p, operation, global);
607                 if (r < 0) {
608                         release_lock_file(&t);
609                         return r;
610                 }
611         }
612
613         *local = t;
614         return 0;
615 }
616
617 int image_name_lock(const char *name, int operation, LockFile *ret) {
618         const char *p;
619
620         assert(name);
621         assert(ret);
622
623         /* Locks an image name, regardless of the precise path used. */
624
625         if (!image_name_is_valid(name))
626                 return -EINVAL;
627
628         if (streq(name, ".host"))
629                 return -EBUSY;
630
631         mkdir_p("/run/systemd/nspawn/locks", 0600);
632         p = strappenda("/run/systemd/nspawn/locks/name-", name);
633
634         return make_lock_file(p, operation, ret);
635 }
636
/* Checks whether "s" may be used as an image name: it has to be a valid
 * file name, contain no control characters, be valid UTF-8 and must not
 * start with ".#" (the prefix of temporary files used for atomically
 * creating new files). */
bool image_name_is_valid(const char *s) {
        return filename_is_valid(s) &&
               !string_has_cc(s, NULL) &&
               utf8_is_valid(s) &&
               !startswith(s, ".#");
}
653
654 static const char* const image_type_table[_IMAGE_TYPE_MAX] = {
655         [IMAGE_DIRECTORY] = "directory",
656         [IMAGE_SUBVOLUME] = "subvolume",
657         [IMAGE_RAW] = "raw",
658 };
659
660 DEFINE_STRING_TABLE_LOOKUP(image_type, ImageType);