Commit | Line | Data |
---|---|---|
99248ed2 | 1 | #! @PERL@ |
99248ed2 MW |
2 | ### |
3 | ### Remove an LVM snapshot, without falling foul of LVM bugs | |
4 | ### | |
5 | ### (c) 2011 Mark Wooding | |
6 | ### | |
7 | ||
8 | ###----- Licensing notice --------------------------------------------------- | |
9 | ### | |
13678d88 MW |
10 | ### This file is part of the distorted.org.uk backup suite. |
11 | ### | |
12 | ### distorted-backup is free software; you can redistribute it and/or modify | |
99248ed2 MW |
13 | ### it under the terms of the GNU General Public License as published by |
14 | ### the Free Software Foundation; either version 2 of the License, or | |
15 | ### (at your option) any later version. | |
16 | ### | |
13678d88 | 17 | ### distorted-backup is distributed in the hope that it will be useful, |
99248ed2 MW |
18 | ### but WITHOUT ANY WARRANTY; without even the implied warranty of |
19 | ### MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
20 | ### GNU General Public License for more details. | |
21 | ### | |
13678d88 MW |
22 | ### You should have received a copy of the GNU General Public License along |
23 | ### with distorted-backup; if not, write to the Free Software Foundation, | |
99248ed2 MW |
24 | ### Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. |
25 | ||
26 | use Cwd qw(realpath); | |
27 | use Errno qw(:POSIX); | |
28 | use Fcntl qw(:mode); | |
29 | use File::stat; | |
30 | use Getopt::Long qw(:config gnu_compat bundling no_ignore_case); | |
31 | use IO::Handle; | |
32 | use Time::HiRes qw(time); | |
33 | ||
34 | our $VERSION = "@VERSION@"; | |
35 | ||
36 | ###-------------------------------------------------------------------------- | |
37 | ### Utilities. | |
38 | ||
39 | ## Error handling and reporting. | |
40 | (our $QUIS = $0) =~ s:^.*/::; | |
41 | our $DEBUG = 0; | |
42 | sub whine ($) { my ($msg) = @_; print STDERR "$QUIS: $msg\n"; } | |
43 | sub burble ($) { my ($msg) = @_; whine $msg if $DEBUG; } | |
44 | sub fail ($) { my ($msg) = @_; whine $msg; exit $! || ($? >> 8) || 255; } | |
45 | ||
46 | ## Cleanups. Call `cleanup BLOCK' to arrange to have BLOCK executed at the | |
47 | ## end of the program. | |
48 | our @CLEANUP = (); | |
49 | sub runcleanups { for my $f (@CLEANUP) { &$f } } | |
50 | END { runcleanups; } | |
51 | $SIG{INT} = $SIG{TERM} = sub { | |
52 | my $sig = shift; | |
53 | runcleanups; | |
54 | $SIG{$sig} = 'DEFAULT'; | |
55 | kill $sig => $$; | |
56 | }; | |
57 | sub cleanup (&) { unshift @CLEANUP, $_[0]; } | |
58 | ||
59 | sub fixint ($) { my ($x) = @_; return $x =~ /^0/ ? oct $x : $x + 0; } | |
60 | ||
61 | ###-------------------------------------------------------------------------- | |
62 | ### Device fiddling. | |
63 | ||
64 | sub devsys ($) { | |
65 | ## devsys DEV | |
66 | ## | |
67 | ## Return a sysfs path for a device DEV. | |
68 | ||
69 | my ($dev) = @_; | |
70 | my $st = stat $dev or fail "stat ($dev): $!"; | |
71 | my $kind; | |
72 | if (S_ISBLK($st->mode)) { $kind = "block"; } | |
73 | elsif (S_ISCHR($st->mode)) { $kind = "char"; } | |
74 | else { fail "$dev is not a device"; } | |
75 | my ($maj, $min) = (($st->rdev >> 8) & 0xff, $st->rdev & 0xff); | |
76 | (my $whole = realpath "/sys/dev/$kind/$maj:$min") =~ s:^/sys/:/:; | |
77 | return $whole; | |
78 | } | |
79 | ||
80 | our %DMTAB = (); | |
81 | ||
82 | sub dmtable_update () { | |
83 | ## dmtable_update | |
84 | ## | |
85 | ## Update the device-mapper table in %DMTAB. | |
86 | ||
87 | burble "re-read device-mapper table"; | |
88 | %DMTAB = (); | |
89 | open my $dt, "-|", "dmsetup", "table" or fail "open (dm table): $!"; | |
90 | while (my $line = $dt->getline) { | |
91 | my ($dev, $rest) = split /[:\s]+/, $line, 2; | |
92 | push @{$DMTAB{$dev}}, [split ' ', $rest]; | |
93 | } | |
94 | close $dt or fail "dmsetup table failed (rc = $?)"; | |
95 | } | |
96 | ||
97 | sub dmname ($) { | |
98 | ## dmname SYSPATH | |
99 | ## | |
100 | ## Return the device-mapper node name for the sysfs path SYSPATH. | |
101 | ||
102 | my ($sys) = @_; | |
103 | open my $f, "<", "/sys$sys/dm/name" or fail "open ($sys/dm/name): $!"; | |
104 | chomp (my $name = $f->getline); | |
105 | close $f; | |
106 | return $name; | |
107 | } | |
108 | ||
109 | ###-------------------------------------------------------------------------- | |
110 | ### I/O utilities. | |
111 | ||
112 | sub sel ($;$$$) { | |
113 | ## sel TIMEOUT, [READS, WRITES, EXCEPTIONS] | |
114 | ## | |
115 | ## Wait for at most TIMEOUT seconds (indefinitely if TIMEOUT is `undef'). | |
116 | ## Each of READS, WRITES and EXCEPTIONS is a listref containing FILE => SUB | |
117 | ## pairs: if the FILE is readable (writable, has an exceptional condition) | |
118 | ## then the SUB is invoked. | |
119 | ||
120 | my ($t, $r, $w, $x) = @_; | |
121 | my ($vr, $vw, $vx); | |
122 | my (%r, %w, %x); | |
123 | ||
124 | ## Read the arguments and build a data structure. | |
125 | for my $i ([$r, \$vr, \%r], [$w, \$vw, \%w], [$x, \$vx, \%x]) { | |
126 | my ($a, $v, $h) = @$i; | |
127 | next unless $a; | |
128 | my @a = @$a; | |
129 | while (@a) { | |
130 | my ($f, $g) = splice @a, 0, 2; | |
131 | my $fd = $f->fileno; | |
132 | $h->{$fd} = $g; | |
133 | vec($$v, $fd, 1) = 1; | |
134 | } | |
135 | } | |
136 | ||
137 | ## Do the wait and sift through the results. | |
138 | defined select $vr, $vw, $vx, $t or fail "select: $!"; | |
139 | for my $i ([$vr, \%r], [$vw, \%w], [$vx, \%x]) { | |
140 | my ($v, $h) = @$i; | |
141 | while (my ($f, $g) = each %$h) { | |
142 | if (vec $v, $f, 1) { &$g; } | |
143 | } | |
144 | } | |
145 | } | |
146 | ||
147 | sub doread ($;$) { | |
148 | ## doread FILE, [LEN] | |
149 | ## | |
150 | ## Read LEN bytes (or a default amount) from FILE. If the file ends, | |
151 | ## return undef. If reading would block then return an empty string. | |
152 | ## Otherwise return he stuff. | |
153 | ||
154 | my ($f, $n) = @_; | |
155 | $n = sysread $f, my $buf, $n // 4096; | |
156 | if (!defined $n) { return "" if $! == EAGAIN; fail "read: $!"; } | |
157 | elsif (!$n) { return undef; } | |
158 | else { return $buf; } | |
159 | } | |
160 | ||
161 | sub run ($$@) { | |
162 | ## run WHAT, PROG, ARGS... | |
163 | ## | |
164 | ## Run PROG, passing it ARGS. Fails if PROG exits nonzero. | |
165 | ||
166 | my ($what, $prog, @args) = @_; | |
167 | system($prog, @args) == 0 or fail "$prog ($what) failed (rc = $?)"; | |
168 | } | |
169 | ||
170 | sub capture ($@) { | |
171 | ## capture PROG, ARGS... | |
172 | ## | |
173 | ## Run PROG, passing it ARGS. Returns exit status, stdout, and stderr, as | |
174 | ## strings. | |
175 | ||
176 | my ($prog, @args) = @_; | |
177 | my ($out, $err) = ("", ""); | |
178 | my ($outpipe_in, $outpipe_out, $errpipe_in, $errpipe_out); | |
179 | pipe $outpipe_in, $outpipe_out or fail "pipe ($prog out): $!"; | |
180 | pipe $errpipe_in, $errpipe_out or fail "pipe ($prog err): $!"; | |
181 | defined (my $kid = fork) or fail "fork ($prog): $!"; | |
182 | if ($kid == 0) { | |
183 | close $outpipe_in | |
184 | and close $errpipe_in | |
185 | and open STDOUT, ">&", $outpipe_out | |
186 | and open STDERR, ">&", $errpipe_out | |
187 | and exec $prog, @args | |
188 | or fail "exec $prog: $!"; | |
189 | } | |
190 | close $outpipe_out; | |
191 | close $errpipe_out; | |
192 | for (;;) { | |
193 | my @r = (); | |
194 | for my $i ([\$outpipe_in, \$out, "out"], | |
195 | [\$errpipe_in, \$err, "err"]) { | |
196 | my ($p, $b, $w) = @$i; | |
197 | push @r, $$p => sub { | |
198 | my $buf = doread $$p; | |
199 | if (defined $buf) { $$b .= $buf; } | |
200 | else { close $$p; $$p = undef; } | |
201 | } if $$p; | |
202 | } | |
203 | last unless @r; | |
204 | sel undef, \@r; | |
205 | } | |
206 | waitpid $kid, 0 or fail "waitpid ($prog): $!"; | |
207 | return $?, $out, $err; | |
208 | } | |
209 | ||
210 | ###-------------------------------------------------------------------------- | |
211 | ### Monitoring udev events. | |
212 | ||
213 | sub umon_create (@) { | |
214 | ## umon_create ARGS... | |
215 | ## | |
216 | ## Create a udev monitor, with the given `udevadm monitor' arguments, and | |
217 | ## return an object. We always select only kernel events. We try to wait | |
218 | ## for the monitor to start up before returning. Don't trust this: use | |
219 | ## `umon_sync' anyway. | |
220 | ||
221 | my @args = @_; | |
222 | my $u = {}; | |
223 | ||
224 | ## Start the monitor process. | |
225 | $u->{KID} = open($u->{PIPE}, "-|", | |
226 | "stdbuf", "-o0", | |
227 | "udevadm", "monitor", "--kernel", "--property", @args) | |
228 | or fail "open (umon): $!"; | |
229 | cleanup { kill 9, $u->{KID} }; | |
230 | $u->{PIPE}->blocking(0) or fail "set non-blocking (umon): $!"; | |
231 | ||
232 | ## Wait for the end of the preamble, indicated by the first blank line. | |
233 | ## From observation with strace(1), this means that the monitor has | |
234 | ## successfully attached itself to its netlink socket and is ready to fetch | |
235 | ## events. | |
236 | my $ok = 0; | |
237 | my $buf = ""; | |
238 | my $now = time; | |
239 | my $end = $now + 5; | |
240 | while (!$ok) { | |
241 | sel | |
242 | $end - $now, | |
243 | [ $u->{PIPE} => sub { | |
244 | defined (my $b = doread $u->{PIPE}) or fail "read (umon): eof"; | |
245 | $buf .= $b; | |
246 | if ($buf =~ /\n\n(.*)$/) { $ok = 1; $buf = $1; } | |
247 | } | |
248 | ]; | |
249 | $now = time; | |
250 | if ($now >= $end) { fail "umon timeout"; } | |
251 | } | |
252 | $u->{BUF} = $buf; | |
253 | ||
254 | ## Done. | |
255 | return $u; | |
256 | } | |
257 | ||
258 | sub umon_read ($) { | |
259 | ## umon_read UMON | |
260 | ## | |
261 | ## Read events from UMON, as a list of hash references mapping properties | |
262 | ## to their values. | |
263 | ||
264 | my ($u) = @_; | |
265 | my @s = (); | |
266 | for (;;) { | |
267 | defined (my $buf = doread $u->{PIPE}) or fail "read (umon): end of file"; | |
268 | $buf eq "" and last; | |
269 | $buf = $u->{BUF} . $buf; | |
270 | my @r = split /\n\n/, $buf, -1; | |
271 | $u->{BUF} = pop @r; | |
272 | for my $r (@r) { | |
273 | push @s, { map { /^(\w+)=(.*)$/ } split /\n/, $r }; | |
274 | } | |
275 | } | |
276 | return @s; | |
277 | } | |
278 | ||
279 | sub umon_sync ($$) { | |
280 | ## umon_sync UMON, DEV | |
281 | ## | |
282 | ## Wait for UMON to report an event about the device DEV (without its | |
283 | ## `/dev/' prefix), triggering periodically just in case it missed one. | |
284 | ## This is useful for synchronizing. Returns the list of events which | |
285 | ## weren't interesting. | |
286 | ||
287 | my ($u, $dev) = @_; | |
288 | my $now = time; | |
289 | my $retry = 0; | |
290 | my $done = 0; | |
291 | my @ev = (); | |
292 | burble "sync with udev"; | |
293 | ||
294 | until ($done) { | |
295 | ||
296 | ## Too late. Trigger a change event and try again. | |
297 | if ($now >= $retry) { | |
298 | $retry = $now + 2; | |
299 | run "trigger $dev", "udevadm", "trigger", "--sysname-match=$dev"; | |
300 | } | |
301 | ||
302 | ## Now read events and see what happens. | |
303 | sel | |
304 | $retry - $now, | |
305 | [ $u->{PIPE} => sub { | |
306 | my @e = umon_read $u; | |
307 | while (@e) { | |
308 | my $e = shift @e; | |
309 | if ($e->{DEVNAME} eq $dev) { $done = 1; push @ev, @e; last; } | |
310 | else { push @ev, $e; } | |
311 | } | |
312 | } | |
313 | ]; | |
314 | $now = time; | |
315 | } | |
316 | ||
317 | return @ev; | |
318 | } | |
319 | ||
320 | ###-------------------------------------------------------------------------- | |
321 | ### Main code. | |
322 | ||
323 | ## Parse the command line. | |
324 | our $USAGE = "usage: $QUIS VGNAME/LVNAME"; | |
325 | sub version { print "$QUIS, version $VERSION\n"; } | |
326 | sub help { | |
327 | print <<EOF; | |
328 | $USAGE | |
329 | ||
330 | Options: | |
331 | -h, --help Show this help text. | |
332 | -v, --version Show the program version number. | |
333 | -d, --debug Show debugging information. | |
334 | -n, --no-act Don't take corrective actions. | |
335 | EOF | |
336 | } | |
337 | ||
338 | our $NOACT = 0; | |
339 | GetOptions('help|h|?' => sub { version; help; exit; }, | |
340 | 'version|v' => sub { version; exit; }, | |
341 | 'debug|d' => \$DEBUG, | |
342 | 'noact|n' => \$NOACT) | |
343 | and @ARGV == 1 | |
344 | and @ARGV[0] =~ m:(.+)/(.+): | |
345 | or do { print STDERR $USAGE, "\n"; exit 1; }; | |
346 | our ($VG, $LV) = ($1, $2); | |
347 | ||
348 | ## Check that the volume in question actually exists, and is a device-mapper | |
349 | ## device, before we wheel out the big guns. | |
350 | dmtable_update; | |
351 | our $SYS = devsys "/dev/$VG/$LV"; | |
352 | burble "sysfs name is $SYS"; | |
353 | my $t = $DMTAB{dmname $SYS} | |
354 | or fail "/dev/$VG/$LV isn't a device-mapper device"; | |
355 | if ($DEBUG) { | |
356 | burble "found table..."; | |
357 | burble "\t" . join " ", @$_ foreach @$t; | |
358 | } | |
359 | $t->[0][2] eq "snapshot" or fail "/dev/$VG/$LV isn't a snapshot"; | |
360 | ||
361 | ## Create a udev monitor. We're only interested in disk-shaped block | |
362 | ## devices. (If we use some other device kind for synchronization then this | |
363 | ## filter will have to be broadened.) | |
364 | my $u = umon_create "--subsystem-match=block/disk"; | |
365 | ||
366 | ## Prepare for the awful synchronization hack. We need to make sure, below, | |
367 | ## that we've read all of the interesting events resulting from an `lvremove' | |
368 | ## call. To do this, we wait for an event on a different device -- but we | |
369 | ## must avoid being fooled by spurious events on this device. As an attempt | |
370 | ## to minimize the probability of this going wrong, acquire a pet device | |
371 | ## which nobody else is using. The best idea seems to be a loopback device. | |
372 | open my $lopipe, "-|", "losetup", "--show", "--find", "/etc/motd" | |
373 | or fail "open (losetup attach)"; | |
374 | chomp (my $lo = $lopipe->getline); | |
375 | { local $/ = undef; <$lopipe>; } | |
376 | $lo =~ s:^/dev/::; | |
377 | $lopipe->close or fail "wait (losetup attach): $!"; | |
378 | cleanup { system "losetup", "--detach", "/dev/$lo" }; | |
379 | ||
380 | ## Initial synchronization, to make sure stuff works. | |
381 | umon_sync $u, $lo; | |
382 | ||
383 | ## Try to remove the snapshot. Capture stdout and stderr, and relay them if | |
384 | ## nothing serious went wrong. | |
385 | burble "initial attempt to remove snapshot"; | |
386 | my ($rc, $out, $err) = capture "lvremove", "--force", "$VG/$LV"; | |
387 | if ($rc != 0x500) { | |
388 | print STDOUT $out; | |
389 | print STDERR $err; | |
390 | burble "lvremove didn't explode (rc = $rc): we're done here"; | |
391 | if ($rc >> 8) { $rc >>= 8 } | |
392 | elsif ($rc & 255) { $rc += 128 } | |
393 | exit $rc; | |
394 | } | |
395 | burble "initial lvremove failed"; | |
396 | ||
397 | ## OK, stuff went wrong. First see if there was a udev cookie left over, and | |
398 | ## if so try to release it. It's important to know that we've read all of | |
399 | ## the relevant uevents, so synchronize again. | |
400 | my @e = umon_sync $u, $lo; | |
401 | my %c = (); | |
402 | for my $e (@e) { | |
403 | $c{($e->{DM_COOKIE} & 0xffff) | 0xd4d0000} = 1 | |
404 | if $e->{DEVPATH} eq $SYS && exists $e->{DM_COOKIE}; | |
405 | } | |
406 | burble "cookies used: " . join ", ", map { sprintf "0x%x", $_ } keys %c; | |
407 | ||
408 | ## Find the used cookies which are still extant, and release them. | |
409 | open $uc, "-|", "dmsetup", "udevcookies" or fail "open (cookies): $!"; | |
410 | $uc->getline; | |
411 | my @leak = (); | |
412 | while (my $l = $uc->getline) { | |
413 | my @f = split ' ', $l; | |
414 | push @leak, $f[0] if $c{fixint $f[0]}; | |
415 | } | |
416 | close $uc or fail "udevcookies failed (rc = $?)"; | |
417 | for my $c (@leak) { | |
418 | burble "release leaked cookie $c"; | |
419 | run "release cookie", "dmsetup", "udevreleasecookie", $c unless $NOACT; | |
420 | } | |
421 | ||
422 | ## If we're very unlucky, the origin volume may still be suspended. Resume | |
423 | ## it now, or the next attempt will get stuck. (Resuming is idempotent, so | |
424 | ## we don't need to check whether it's already running.) Finding the origin | |
425 | ## is annoying: search the device-mapper table for a device with a | |
426 | ## `snapshot-origin' table referencing the same backing store as the | |
427 | ## snapshot. | |
428 | my $back = $DMTAB{dmname $SYS}[0][3]; | |
429 | my $orig = undef; | |
430 | burble "backend device $back"; | |
431 | for my $dm (keys %DMTAB) { | |
432 | my $t = $DMTAB{$dm}; | |
433 | next unless @$t == 1 && | |
434 | $t->[0][2] eq "snapshot-origin" && | |
435 | $t->[0][3] eq $back; | |
436 | defined $orig and fail "snapshot appears to have multiple origins"; | |
437 | $orig = $dm; | |
438 | } | |
439 | defined $orig or fail "couldn't find snapshot origin device"; | |
440 | burble "found origin volume $orig; resuming..."; | |
441 | run "resume origin $orig", "dmsetup", "resume", $orig unless $NOACT; | |
442 | ||
443 | ## See whether removing the snapshot again helps any. | |
444 | burble "retry snapshot removal"; | |
445 | run "retry", "lvremove", "--force", "$VG/$LV" unless $NOACT; | |
446 | ||
447 | ## OK, we're on the way to recovery. The origin device may now be not a | |
448 | ## snapshot-origin any more. Refresh the device-mapper table and inspect it. | |
449 | dmtable_update; | |
450 | if (-d "/sys/dev/block/$back") { | |
451 | my $backdm = dmname "/dev/block/$back"; | |
452 | if ($DMTAB{$orig}[0][2] ne "snapshot-origin") { | |
453 | burble "origin released but backend $backdm still exists: remove"; | |
454 | run "remove backend $backdm", "dmsetup", "remove", $backdm | |
455 | unless $NOACT; | |
456 | } | |
457 | } | |
458 | ||
459 | ## All done. There, that wasn't so bad, was it? | |
460 | burble "completed successfully"; | |
461 | exit 0; | |
462 | ||
463 | ###----- That's all, folks -------------------------------------------------- |