chiark / gitweb /
Don't try to set up cgroups for new users
[elogind.git] / src / shared / machine-image.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4   This file is part of systemd.
5
6   Copyright 2013 Lennart Poettering
7
8   systemd is free software; you can redistribute it and/or modify it
9   under the terms of the GNU Lesser General Public License as published by
10   the Free Software Foundation; either version 2.1 of the License, or
11   (at your option) any later version.
12
13   systemd is distributed in the hope that it will be useful, but
14   WITHOUT ANY WARRANTY; without even the implied warranty of
15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16   Lesser General Public License for more details.
17
18   You should have received a copy of the GNU Lesser General Public License
19   along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <sys/statfs.h>
23 #include <linux/fs.h>
24 #include <fcntl.h>
25
26 #include "utf8.h"
27 #include "btrfs-util.h"
28 #include "path-util.h"
29 #include "copy.h"
30 #include "mkdir.h"
31 #include "machine-image.h"
32
/* Directories that are searched for machine images, listed in order of
 * precedence. The entries are stored back to back, each one NUL-terminated,
 * and the list ends with an empty string (the literal's implicit terminator).
 * image_find() and image_discover() walk it with NULSTR_FOREACH(). */
static const char image_search_path[] =
        "/var/lib/machines" "\0"
        "/var/lib/container" "\0"
        "/usr/local/lib/machines" "\0"
        "/usr/lib/machines" "\0";
38
39 Image *image_unref(Image *i) {
40         if (!i)
41                 return NULL;
42
43         free(i->name);
44         free(i->path);
45         free(i);
46         return NULL;
47 }
48
49 static int image_new(
50                 ImageType t,
51                 const char *pretty,
52                 const char *path,
53                 const char *filename,
54                 bool read_only,
55                 usec_t crtime,
56                 usec_t mtime,
57                 Image **ret) {
58
59         _cleanup_(image_unrefp) Image *i = NULL;
60
61         assert(t >= 0);
62         assert(t < _IMAGE_TYPE_MAX);
63         assert(pretty);
64         assert(filename);
65         assert(ret);
66
67         i = new0(Image, 1);
68         if (!i)
69                 return -ENOMEM;
70
71         i->type = t;
72         i->read_only = read_only;
73         i->crtime = crtime;
74         i->mtime = mtime;
75         i->usage = i->usage_exclusive = (uint64_t) -1;
76         i->limit = i->limit_exclusive = (uint64_t) -1;
77
78         i->name = strdup(pretty);
79         if (!i->name)
80                 return -ENOMEM;
81
82         if (path)
83                 i->path = strjoin(path, "/", filename, NULL);
84         else
85                 i->path = strdup(filename);
86
87         if (!i->path)
88                 return -ENOMEM;
89
90         path_kill_slashes(i->path);
91
92         *ret = i;
93         i = NULL;
94
95         return 0;
96 }
97
98 static int image_make(
99                 const char *pretty,
100                 int dfd,
101                 const char *path,
102                 const char *filename,
103                 Image **ret) {
104
105         struct stat st;
106         bool read_only;
107         int r;
108
109         assert(filename);
110
111         /* We explicitly *do* follow symlinks here, since we want to
112          * allow symlinking trees into /var/lib/machines/, and treat
113          * them normally. */
114
115         if (fstatat(dfd, filename, &st, 0) < 0)
116                 return -errno;
117
118         read_only =
119                 (path && path_startswith(path, "/usr")) ||
120                 (faccessat(dfd, filename, W_OK, AT_EACCESS) < 0 && errno == EROFS);
121
122         if (S_ISDIR(st.st_mode)) {
123                 _cleanup_close_ int fd = -1;
124                 unsigned file_attr = 0;
125
126                 if (!ret)
127                         return 1;
128
129                 if (!pretty)
130                         pretty = filename;
131
132                 fd = openat(dfd, filename, O_CLOEXEC|O_NOCTTY|O_DIRECTORY);
133                 if (fd < 0)
134                         return -errno;
135
136                 /* btrfs subvolumes have inode 256 */
137                 if (st.st_ino == 256) {
138                         struct statfs sfs;
139
140                         if (fstatfs(fd, &sfs) < 0)
141                                 return -errno;
142
143                         if (F_TYPE_EQUAL(sfs.f_type, BTRFS_SUPER_MAGIC)) {
144                                 BtrfsSubvolInfo info;
145                                 BtrfsQuotaInfo quota;
146
147                                 /* It's a btrfs subvolume */
148
149                                 r = btrfs_subvol_get_info_fd(fd, &info);
150                                 if (r < 0)
151                                         return r;
152
153                                 r = image_new(IMAGE_SUBVOLUME,
154                                               pretty,
155                                               path,
156                                               filename,
157                                               info.read_only || read_only,
158                                               info.otime,
159                                               0,
160                                               ret);
161                                 if (r < 0)
162                                         return r;
163
164                                 r = btrfs_subvol_get_quota_fd(fd, &quota);
165                                 if (r >= 0) {
166                                         (*ret)->usage = quota.referenced;
167                                         (*ret)->usage_exclusive = quota.exclusive;
168
169                                         (*ret)->limit = quota.referenced_max;
170                                         (*ret)->limit_exclusive = quota.exclusive_max;
171                                 }
172
173                                 return 1;
174                         }
175                 }
176
177                 /* If the IMMUTABLE bit is set, we consider the
178                  * directory read-only. Since the ioctl is not
179                  * supported everywhere we ignore failures. */
180                 (void) read_attr_fd(fd, &file_attr);
181
182                 /* It's just a normal directory. */
183                 r = image_new(IMAGE_DIRECTORY,
184                               pretty,
185                               path,
186                               filename,
187                               read_only || (file_attr & FS_IMMUTABLE_FL),
188                               0,
189                               0,
190                               ret);
191                 if (r < 0)
192                         return r;
193
194                 return 1;
195
196         } else if (S_ISREG(st.st_mode) && endswith(filename, ".raw")) {
197                 usec_t crtime = 0;
198
199                 /* It's a RAW disk image */
200
201                 if (!ret)
202                         return 1;
203
204                 fd_getcrtime_at(dfd, filename, &crtime, 0);
205
206                 if (!pretty)
207                         pretty = strndupa(filename, strlen(filename) - 4);
208
209                 r = image_new(IMAGE_RAW,
210                               pretty,
211                               path,
212                               filename,
213                               !(st.st_mode & 0222) || read_only,
214                               crtime,
215                               timespec_load(&st.st_mtim),
216                               ret);
217                 if (r < 0)
218                         return r;
219
220                 (*ret)->usage = (*ret)->usage_exclusive = st.st_blocks * 512;
221                 (*ret)->limit = (*ret)->limit_exclusive = st.st_size;
222
223                 return 1;
224         }
225
226         return 0;
227 }
228
229 int image_find(const char *name, Image **ret) {
230         const char *path;
231         int r;
232
233         assert(name);
234
235         /* There are no images with invalid names */
236         if (!image_name_is_valid(name))
237                 return 0;
238
239         NULSTR_FOREACH(path, image_search_path) {
240                 _cleanup_closedir_ DIR *d = NULL;
241
242                 d = opendir(path);
243                 if (!d) {
244                         if (errno == ENOENT)
245                                 continue;
246
247                         return -errno;
248                 }
249
250                 r = image_make(NULL, dirfd(d), path, name, ret);
251                 if (r == 0 || r == -ENOENT) {
252                         _cleanup_free_ char *raw = NULL;
253
254                         raw = strappend(name, ".raw");
255                         if (!raw)
256                                 return -ENOMEM;
257
258                         r = image_make(NULL, dirfd(d), path, raw, ret);
259                         if (r == 0 || r == -ENOENT)
260                                 continue;
261                 }
262                 if (r < 0)
263                         return r;
264
265                 return 1;
266         }
267
268         if (streq(name, ".host"))
269                 return image_make(".host", AT_FDCWD, NULL, "/", ret);
270
271         return 0;
272 };
273
274 int image_discover(Hashmap *h) {
275         const char *path;
276         int r;
277
278         assert(h);
279
280         NULSTR_FOREACH(path, image_search_path) {
281                 _cleanup_closedir_ DIR *d = NULL;
282                 struct dirent *de;
283
284                 d = opendir(path);
285                 if (!d) {
286                         if (errno == ENOENT)
287                                 continue;
288
289                         return -errno;
290                 }
291
292                 FOREACH_DIRENT_ALL(de, d, return -errno) {
293                         _cleanup_(image_unrefp) Image *image = NULL;
294
295                         if (!image_name_is_valid(de->d_name))
296                                 continue;
297
298                         if (hashmap_contains(h, de->d_name))
299                                 continue;
300
301                         r = image_make(NULL, dirfd(d), path, de->d_name, &image);
302                         if (r == 0 || r == -ENOENT)
303                                 continue;
304                         if (r < 0)
305                                 return r;
306
307                         r = hashmap_put(h, image->name, image);
308                         if (r < 0)
309                                 return r;
310
311                         image = NULL;
312                 }
313         }
314
315         if (!hashmap_contains(h, ".host")) {
316                 _cleanup_(image_unrefp) Image *image = NULL;
317
318                 r = image_make(".host", AT_FDCWD, NULL, "/", &image);
319                 if (r < 0)
320                         return r;
321
322                 r = hashmap_put(h, image->name, image);
323                 if (r < 0)
324                         return r;
325
326                 image = NULL;
327
328         }
329
330         return 0;
331 }
332
333 void image_hashmap_free(Hashmap *map) {
334         Image *i;
335
336         while ((i = hashmap_steal_first(map)))
337                 image_unref(i);
338
339         hashmap_free(map);
340 }
341
342 int image_remove(Image *i) {
343         _cleanup_release_lock_file_ LockFile global_lock = LOCK_FILE_INIT, local_lock = LOCK_FILE_INIT;
344         int r;
345
346         assert(i);
347
348         if (path_equal(i->path, "/") ||
349             path_startswith(i->path, "/usr"))
350                 return -EROFS;
351
352         /* Make sure we don't interfere with a running nspawn */
353         r = image_path_lock(i->path, LOCK_EX|LOCK_NB, &global_lock, &local_lock);
354         if (r < 0)
355                 return r;
356
357         switch (i->type) {
358
359         case IMAGE_SUBVOLUME:
360                 return btrfs_subvol_remove(i->path);
361
362         case IMAGE_DIRECTORY:
363                 /* Allow deletion of read-only directories */
364                 (void) chattr_path(i->path, false, FS_IMMUTABLE_FL);
365
366                 /* fall through */
367
368         case IMAGE_RAW:
369                 return rm_rf_dangerous(i->path, false, true, false);
370
371         default:
372                 return -EOPNOTSUPP;
373         }
374 }
375
376 int image_rename(Image *i, const char *new_name) {
377         _cleanup_release_lock_file_ LockFile global_lock = LOCK_FILE_INIT, local_lock = LOCK_FILE_INIT, name_lock = LOCK_FILE_INIT;
378         _cleanup_free_ char *new_path = NULL, *nn = NULL;
379         unsigned file_attr = 0;
380         int r;
381
382         assert(i);
383
384         if (!image_name_is_valid(new_name))
385                 return -EINVAL;
386
387         if (path_equal(i->path, "/") ||
388             path_startswith(i->path, "/usr"))
389                 return -EROFS;
390
391         /* Make sure we don't interfere with a running nspawn */
392         r = image_path_lock(i->path, LOCK_EX|LOCK_NB, &global_lock, &local_lock);
393         if (r < 0)
394                 return r;
395
396         /* Make sure nobody takes the new name, between the time we
397          * checked it is currently unused in all search paths, and the
398          * time we take possesion of it */
399         r = image_name_lock(new_name, LOCK_EX|LOCK_NB, &name_lock);
400         if (r < 0)
401                 return r;
402
403         r = image_find(new_name, NULL);
404         if (r < 0)
405                 return r;
406         if (r > 0)
407                 return -EEXIST;
408
409         switch (i->type) {
410
411         case IMAGE_DIRECTORY:
412                 /* Turn of the immutable bit while we rename the image, so that we can rename it */
413                 (void) read_attr_path(i->path, &file_attr);
414
415                 if (file_attr & FS_IMMUTABLE_FL)
416                         (void) chattr_path(i->path, false, FS_IMMUTABLE_FL);
417
418                 /* fall through */
419
420         case IMAGE_SUBVOLUME:
421                 new_path = file_in_same_dir(i->path, new_name);
422                 break;
423
424         case IMAGE_RAW: {
425                 const char *fn;
426
427                 fn = strjoina(new_name, ".raw");
428                 new_path = file_in_same_dir(i->path, fn);
429                 break;
430         }
431
432         default:
433                 return -EOPNOTSUPP;
434         }
435
436         if (!new_path)
437                 return -ENOMEM;
438
439         nn = strdup(new_name);
440         if (!nn)
441                 return -ENOMEM;
442
443         r = rename_noreplace(AT_FDCWD, i->path, AT_FDCWD, new_path);
444         if (r < 0)
445                 return r;
446
447         /* Restore the immutable bit, if it was set before */
448         if (file_attr & FS_IMMUTABLE_FL)
449                 (void) chattr_path(new_path, true, FS_IMMUTABLE_FL);
450
451         free(i->path);
452         i->path = new_path;
453         new_path = NULL;
454
455         free(i->name);
456         i->name = nn;
457         nn = NULL;
458
459         return 0;
460 }
461
462 int image_clone(Image *i, const char *new_name, bool read_only) {
463         _cleanup_release_lock_file_ LockFile name_lock = LOCK_FILE_INIT;
464         const char *new_path;
465         int r;
466
467         assert(i);
468
469         if (!image_name_is_valid(new_name))
470                 return -EINVAL;
471
472         /* Make sure nobody takes the new name, between the time we
473          * checked it is currently unused in all search paths, and the
474          * time we take possesion of it */
475         r = image_name_lock(new_name, LOCK_EX|LOCK_NB, &name_lock);
476         if (r < 0)
477                 return r;
478
479         r = image_find(new_name, NULL);
480         if (r < 0)
481                 return r;
482         if (r > 0)
483                 return -EEXIST;
484
485         switch (i->type) {
486
487         case IMAGE_SUBVOLUME:
488         case IMAGE_DIRECTORY:
489                 new_path = strjoina("/var/lib/machines/", new_name);
490
491                 r = btrfs_subvol_snapshot(i->path, new_path, read_only, true);
492                 break;
493
494         case IMAGE_RAW:
495                 new_path = strjoina("/var/lib/machines/", new_name, ".raw");
496
497                 r = copy_file_atomic(i->path, new_path, read_only ? 0444 : 0644, false, FS_NOCOW_FL);
498                 break;
499
500         default:
501                 return -EOPNOTSUPP;
502         }
503
504         if (r < 0)
505                 return r;
506
507         return 0;
508 }
509
510 int image_read_only(Image *i, bool b) {
511         _cleanup_release_lock_file_ LockFile global_lock = LOCK_FILE_INIT, local_lock = LOCK_FILE_INIT;
512         int r;
513         assert(i);
514
515         if (path_equal(i->path, "/") ||
516             path_startswith(i->path, "/usr"))
517                 return -EROFS;
518
519         /* Make sure we don't interfere with a running nspawn */
520         r = image_path_lock(i->path, LOCK_EX|LOCK_NB, &global_lock, &local_lock);
521         if (r < 0)
522                 return r;
523
524         switch (i->type) {
525
526         case IMAGE_SUBVOLUME:
527                 r = btrfs_subvol_set_read_only(i->path, b);
528                 if (r < 0)
529                         return r;
530
531                 break;
532
533         case IMAGE_DIRECTORY:
534                 /* For simple directory trees we cannot use the access
535                    mode of the top-level directory, since it has an
536                    effect on the container itself.  However, we can
537                    use the "immutable" flag, to at least make the
538                    top-level directory read-only. It's not as good as
539                    a read-only subvolume, but at least something, and
540                    we can read the value back.*/
541
542                 r = chattr_path(i->path, b, FS_IMMUTABLE_FL);
543                 if (r < 0)
544                         return r;
545
546                 break;
547
548         case IMAGE_RAW: {
549                 struct stat st;
550
551                 if (stat(i->path, &st) < 0)
552                         return -errno;
553
554                 if (chmod(i->path, (st.st_mode & 0444) | (b ? 0000 : 0200)) < 0)
555                         return -errno;
556
557                 /* If the images is now read-only, it's a good time to
558                  * defrag it, given that no write patterns will
559                  * fragment it again. */
560                 if (b)
561                         (void) btrfs_defrag(i->path);
562                 break;
563         }
564
565         default:
566                 return -EOPNOTSUPP;
567         }
568
569         return 0;
570 }
571
572 int image_path_lock(const char *path, int operation, LockFile *global, LockFile *local) {
573         _cleanup_free_ char *p = NULL;
574         LockFile t = LOCK_FILE_INIT;
575         struct stat st;
576         int r;
577
578         assert(path);
579         assert(global);
580         assert(local);
581
582         /* Locks an image path. This actually creates two locks: one
583          * "local" one, next to the image path itself, which might be
584          * shared via NFS. And another "global" one, in /run, that
585          * uses the device/inode number. This has the benefit that we
586          * can even lock a tree that is a mount point, correctly. */
587
588         if (path_equal(path, "/"))
589                 return -EBUSY;
590
591         if (!path_is_absolute(path))
592                 return -EINVAL;
593
594         if (stat(path, &st) >= 0) {
595                 if (asprintf(&p, "/run/systemd/nspawn/locks/inode-%lu:%lu", (unsigned long) st.st_dev, (unsigned long) st.st_ino) < 0)
596                         return -ENOMEM;
597         }
598
599         r = make_lock_file_for(path, operation, &t);
600         if (r < 0)
601                 return r;
602
603         if (p) {
604                 mkdir_p("/run/systemd/nspawn/locks", 0600);
605
606                 r = make_lock_file(p, operation, global);
607                 if (r < 0) {
608                         release_lock_file(&t);
609                         return r;
610                 }
611         }
612
613         *local = t;
614         return 0;
615 }
616
617 int image_set_limit(Image *i, uint64_t referenced_max) {
618         assert(i);
619
620         if (path_equal(i->path, "/") ||
621             path_startswith(i->path, "/usr"))
622                 return -EROFS;
623
624         if (i->type != IMAGE_SUBVOLUME)
625                 return -EOPNOTSUPP;
626
627         return btrfs_quota_limit(i->path, referenced_max);
628 }
629
630 int image_name_lock(const char *name, int operation, LockFile *ret) {
631         const char *p;
632
633         assert(name);
634         assert(ret);
635
636         /* Locks an image name, regardless of the precise path used. */
637
638         if (!image_name_is_valid(name))
639                 return -EINVAL;
640
641         if (streq(name, ".host"))
642                 return -EBUSY;
643
644         mkdir_p("/run/systemd/nspawn/locks", 0600);
645         p = strjoina("/run/systemd/nspawn/locks/name-", name);
646
647         return make_lock_file(p, operation, ret);
648 }
649
/* Tells whether "s" may be used as an image name: it has to be a valid file
 * name, must not contain control characters, has to be valid UTF-8, and must
 * not start with ".#", the prefix of the temporary files used when creating
 * new files atomically. */
bool image_name_is_valid(const char *s) {
        return filename_is_valid(s) &&
               !string_has_cc(s, NULL) &&
               utf8_is_valid(s) &&
               !startswith(s, ".#");
}
666
667 static const char* const image_type_table[_IMAGE_TYPE_MAX] = {
668         [IMAGE_DIRECTORY] = "directory",
669         [IMAGE_SUBVOLUME] = "subvolume",
670         [IMAGE_RAW] = "raw",
671 };
672
673 DEFINE_STRING_TABLE_LOOKUP(image_type, ImageType);