#! @PERL@
###
### Remove an LVM snapshot, without falling foul of LVM bugs
###
### (c) 2011 Mark Wooding
###

###----- Licensing notice ---------------------------------------------------
###
### This file is part of the distorted.org.uk backup suite.
###
### distorted-backup is free software; you can redistribute it and/or modify
### it under the terms of the GNU General Public License as published by
### the Free Software Foundation; either version 2 of the License, or
### (at your option) any later version.
###
### distorted-backup is distributed in the hope that it will be useful,
### but WITHOUT ANY WARRANTY; without even the implied warranty of
### MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
### GNU General Public License for more details.
###
### You should have received a copy of the GNU General Public License along
### with distorted-backup; if not, write to the Free Software Foundation,
### Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.

use strict;
use warnings;

use Cwd qw(realpath);
use Errno qw(:POSIX);
use Fcntl qw(:mode);
use File::stat;
use Getopt::Long qw(:config gnu_compat bundling no_ignore_case);
use IO::Handle;
use POSIX ();
use Time::HiRes qw(time);

## Single quotes: before substitution, `@VERSION@' would otherwise be parsed
## as an array interpolation.
our $VERSION = '@VERSION@';

###--------------------------------------------------------------------------
### Utilities.

## Error handling and reporting.

(our $QUIS = $0) =~ s:^.*/::;
our $DEBUG = 0;

sub whine ($) {
  ## whine MSG
  ##
  ## Print MSG to stderr, prefixed with the program name.

  my ($msg) = @_;
  print STDERR "$QUIS: $msg\n";
}

sub burble ($) {
  ## burble MSG
  ##
  ## Print MSG to stderr, but only if debugging output is enabled.

  my ($msg) = @_;
  whine $msg if $DEBUG;
}

sub fail ($) {
  ## fail MSG
  ##
  ## Report MSG and exit.  The exit status is the current `$!', or failing
  ## that the exit code of the most recent child, or failing that 255.

  my ($msg) = @_;
  whine $msg;
  exit $! || ($? >> 8) || 255;
}

## Cleanups.  Call `cleanup BLOCK' to arrange to have BLOCK executed at the
## end of the program.  The most recently registered cleanup runs first.

our @CLEANUP = ();

sub runcleanups {
  ## Drain the list as we go, so that each cleanup runs at most once even if
  ## we're called again.
  while (my $f = shift @CLEANUP) { $f->(); }
}

END {
  ## Inside an END block, `$?' is the status we're about to exit with, and
  ## cleanups which run subprocesses (e.g., `losetup --detach') overwrite
  ## it.  Save and restore it so that the intended exit status survives.
  my $rc = $?;
  runcleanups;
  $? = $rc;
}

## On a fatal signal: clean up, then die of the same signal so that our
## parent sees the true cause of death.
$SIG{INT} = $SIG{TERM} = sub {
  my $sig = shift;
  runcleanups;
  $SIG{$sig} = 'DEFAULT';
  kill $sig => $$;
};

sub cleanup (&) {
  ## cleanup BLOCK
  ##
  ## Register BLOCK to be run when the program ends.

  unshift @CLEANUP, $_[0];
}

sub fixint ($) {
  ## fixint STRING
  ##
  ## Convert STRING to a number.  Strings with a leading zero are handed to
  ## `oct', which understands `0x...', `0b...' and plain octal; anything
  ## else is taken as decimal.

  my ($x) = @_;
  return $x =~ /^0/ ? oct $x : $x + 0;
}

###--------------------------------------------------------------------------
### Device fiddling.

sub devsys ($) {
  ## devsys DEV
  ##
  ## Return a sysfs path for a device DEV.  The leading `/sys' is stripped
  ## from the result.

  my ($dev) = @_;
  my $st = stat $dev or fail "stat ($dev): $!";
  my $kind;
  if (S_ISBLK($st->mode)) { $kind = "block"; }
  elsif (S_ISCHR($st->mode)) { $kind = "char"; }
  else { fail "$dev is not a device"; }

  ## Linux device numbers have a 12-bit major and a 20-bit minor: the low
  ## eight bits of the minor live in bits 0--7, the major in bits 8--19, and
  ## the rest of the minor in bits 20--31.  Masking the minor to eight bits
  ## gets the wrong answer for minors above 255.
  my $rdev = $st->rdev;
  my $maj = ($rdev >> 8) & 0xfff;
  my $min = ($rdev & 0xff) | (($rdev >> 12) & 0xfff00);

  defined(my $whole = realpath "/sys/dev/$kind/$maj:$min")
    or fail "realpath (/sys/dev/$kind/$maj:$min): $!";
  $whole =~ s:^/sys/:/:;
  return $whole;
}

our %DMTAB = ();

sub dmtable_update () {
  ## dmtable_update
  ##
  ## Update the device-mapper table in %DMTAB.  Each entry maps a
  ## device-mapper name to a list of table lines, each of which is a listref
  ## of the whitespace-separated fields following the name.

  burble "re-read device-mapper table";
  %DMTAB = ();
  open my $dt, "-|", "dmsetup", "table" or fail "open (dm table): $!";
  while (my $line = $dt->getline) {
    my ($dev, $rest) = split /[:\s]+/, $line, 2;
    push @{$DMTAB{$dev}}, [split ' ', $rest // ""];
  }
  close $dt or fail "dmsetup table failed (rc = $?)";
}

sub dmname ($) {
  ## dmname SYSPATH
  ##
  ## Return the device-mapper node name for the sysfs path SYSPATH.

  my ($sys) = @_;
  open my $f, "<", "/sys$sys/dm/name" or fail "open ($sys/dm/name): $!";
  defined(my $name = $f->getline) or fail "read ($sys/dm/name): empty file";
  chomp $name;
  close $f;
  return $name;
}

###--------------------------------------------------------------------------
### I/O utilities.

sub sel ($;$$$) {
  ## sel TIMEOUT, [READS, WRITES, EXCEPTIONS]
  ##
  ## Wait for at most TIMEOUT seconds (indefinitely if TIMEOUT is `undef').
  ## Each of READS, WRITES and EXCEPTIONS is a listref containing FILE => SUB
  ## pairs: if the FILE is readable (writable, has an exceptional condition)
  ## then the SUB is invoked.  If the wait is interrupted by a signal then
  ## return without invoking anything: all of our callers loop.

  my ($t, $r, $w, $x) = @_;
  my ($vr, $vw, $vx);
  my (%r, %w, %x);

  ## Read the arguments and build a data structure.
  for my $i ([$r, \$vr, \%r], [$w, \$vw, \%w], [$x, \$vx, \%x]) {
    my ($list, $vec, $map) = @$i;
    next unless $list;
    my @pairs = @$list;
    while (@pairs) {
      my ($f, $g) = splice @pairs, 0, 2;
      my $fd = $f->fileno;
      $map->{$fd} = $g;
      $$vec //= "";
      vec($$vec, $fd, 1) = 1;
    }
  }

  ## Do the wait.  Four-argument `select' reports failure by returning -1,
  ## not `undef'.
  my $n = select($vr, $vw, $vx, $t);
  if ($n < 0) {
    return if $! == EINTR;
    fail "select: $!";
  }

  ## Sift through the results.
  for my $i ([$vr, \%r], [$vw, \%w], [$vx, \%x]) {
    my ($vec, $map) = @$i;
    next unless defined $vec;
    for my $fd (keys %$map) {
      $map->{$fd}->() if vec $vec, $fd, 1;
    }
  }
}

sub doread ($;$) {
  ## doread FILE, [LEN]
  ##
  ## Read LEN bytes (or a default amount) from FILE.  If the file ends,
  ## return undef.  If reading would block (or was interrupted) then return
  ## an empty string.  Otherwise return the stuff.

  my ($f, $n) = @_;
  my $buf;
  my $got = sysread $f, $buf, $n // 4096;
  if (!defined $got) {
    return "" if $! == EAGAIN || $! == EINTR;
    fail "read: $!";
  }
  return $got ? $buf : undef;
}

sub run ($$@) {
  ## run WHAT, PROG, ARGS...
  ##
  ## Run PROG, passing it ARGS.  Fails if PROG exits nonzero.  WHAT is only
  ## used to describe the job in the error message.

  my ($what, $prog, @args) = @_;
  system($prog, @args) == 0 or fail "$prog ($what) failed (rc = $?)";
}

sub capture ($@) {
  ## capture PROG, ARGS...
  ##
  ## Run PROG, passing it ARGS.  Returns exit status (as a raw wait status,
  ## like `$?'), stdout, and stderr, as strings.

  my ($prog, @args) = @_;
  my ($out, $err) = ("", "");
  my ($outpipe_in, $outpipe_out, $errpipe_in, $errpipe_out);
  pipe $outpipe_in, $outpipe_out or fail "pipe ($prog out): $!";
  pipe $errpipe_in, $errpipe_out or fail "pipe ($prog err): $!";
  defined (my $kid = fork) or fail "fork ($prog): $!";

  if ($kid == 0) {
    ## Child.  If anything goes wrong here we must leave via `_exit': an
    ## ordinary `exit' would run the parent's cleanups from the child.
    close $outpipe_in and close $errpipe_in and
      open STDOUT, ">&", $outpipe_out and
      open STDERR, ">&", $errpipe_out and
      exec $prog, @args
      or whine "exec $prog: $!";
    POSIX::_exit(127);
  }
  close $outpipe_out;
  close $errpipe_out;

  ## Collect output from both pipes until both have reported end-of-file.
  for (;;) {
    my @r = ();
    for my $i ([\$outpipe_in, \$out], [\$errpipe_in, \$err]) {
      my ($pipe, $acc) = @$i;
      next unless $$pipe;
      push @r, $$pipe => sub {
        my $buf = doread $$pipe;
        if (defined $buf) { $$acc .= $buf; }
        else { close $$pipe; $$pipe = undef; }
      };
    }
    last unless @r;
    sel undef, \@r;
  }

  ## `waitpid' returns -1 on failure, which is true: compare explicitly.
  waitpid($kid, 0) == $kid or fail "waitpid ($prog): $!";
  return $?, $out, $err;
}

###--------------------------------------------------------------------------
### Monitoring udev events.

sub umon_create (@) {
  ## umon_create ARGS...
  ##
  ## Create a udev monitor, with the given `udevadm monitor' arguments, and
  ## return an object.  We always select only kernel events.  We try to wait
  ## for the monitor to start up before returning.  Don't trust this: use
  ## `umon_sync' anyway.

  my @args = @_;
  my $u = {};

  ## Start the monitor process.
  $u->{KID} = open($u->{PIPE}, "-|",
                   "stdbuf", "-o0",
                   "udevadm", "monitor", "--kernel", "--property", @args)
    or fail "open (umon): $!";
  cleanup { kill 9, $u->{KID} };
  defined $u->{PIPE}->blocking(0) or fail "set non-blocking (umon): $!";

  ## Wait for the end of the preamble, indicated by the first blank line.
  ## From observation with strace(1), this means that the monitor has
  ## successfully attached itself to its netlink socket and is ready to fetch
  ## events.  Anything following the blank line is kept: it's the start of
  ## the event stream.  (The `/s' matters: without it, a multi-line tail
  ## wouldn't match.)
  my $ok = 0;
  my $buf = "";
  my $now = time;
  my $end = $now + 5;
  until ($ok) {
    sel $end - $now, [
      $u->{PIPE} => sub {
        defined (my $chunk = doread $u->{PIPE})
          or fail "read (umon): eof";
        $buf .= $chunk;
        if ($buf =~ /\n\n(.*)$/s) { $ok = 1; $buf = $1; }
      }
    ];
    last if $ok;
    $now = time;
    $now < $end or fail "umon timeout";
  }
  $u->{BUF} = $buf;

  ## Done.
  return $u;
}

sub umon_read ($) {
  ## umon_read UMON
  ##
  ## Read events from UMON, as a list of hash references mapping properties
  ## to their values.  Reads until the pipe would block; a trailing partial
  ## event is kept in the monitor's buffer for next time.

  my ($u) = @_;
  my @s = ();
  for (;;) {
    defined (my $buf = doread $u->{PIPE})
      or fail "read (umon): end of file";
    $buf eq "" and last;
    $buf = $u->{BUF} . $buf;
    my @r = split /\n\n/, $buf, -1;
    $u->{BUF} = pop @r;
    for my $r (@r) {
      ## Lines which don't look like `KEY=VALUE' (e.g., the event header)
      ## contribute nothing.
      push @s, +{ map { /^(\w+)=(.*)$/ } split /\n/, $r };
    }
  }
  return @s;
}

sub umon_sync ($$) {
  ## umon_sync UMON, DEV
  ##
  ## Wait for UMON to report an event about the device DEV (without its
  ## `/dev/' prefix), triggering periodically just in case it missed one.
  ## This is useful for synchronizing.  Returns the list of events which
  ## weren't interesting.

  my ($u, $dev) = @_;
  my $now = time;
  my $retry = 0;
  my $done = 0;
  my @ev = ();

  burble "sync with udev";
  until ($done) {

    ## Too late.  Trigger a change event and try again.
    if ($now >= $retry) {
      $retry = $now + 2;
      run "trigger $dev", "udevadm", "trigger", "--sysname-match=$dev";
    }

    ## Now read events and see what happens.
    sel $retry - $now, [
      $u->{PIPE} => sub {
        my @e = umon_read $u;
        while (@e) {
          my $e = shift @e;
          if (defined $e->{DEVNAME} && $e->{DEVNAME} eq $dev) {
            $done = 1;
            push @ev, @e;
            last;
          } else {
            push @ev, $e;
          }
        }
      }
    ];
    $now = time;
  }
  return @ev;
}

###--------------------------------------------------------------------------
### Main code.

## Parse the command line.
our $USAGE = "usage: $QUIS VGNAME/LVNAME";
sub version { print "$QUIS, version $VERSION\n"; }
sub help {
  ## NOTE(review): the original help text was lost; this wording is
  ## reconstructed from the option table below -- confirm against upstream.
  print <<EOF;
$USAGE

Options:
  -h, --help            Show this help text.
  -v, --version         Show the program version number.
  -d, --debug           Show debugging output.
  -n, --noact           Don't take corrective actions.
EOF
}

our $NOACT = 0;
GetOptions('help|h|?' => sub { version(); help(); exit; },
           'version|v' => sub { version(); exit; },
           'debug|d' => \$DEBUG,
           'noact|n' => \$NOACT)
  and @ARGV == 1
  and $ARGV[0] =~ m:(.+)/(.+):
  or do { print STDERR $USAGE, "\n"; exit 1; };
our ($VG, $LV) = ($1, $2);

## Check that the volume in question actually exists, and is a device-mapper
## device, before we wheel out the big guns.
dmtable_update;
our $SYS = devsys "/dev/$VG/$LV";
burble "sysfs name is $SYS";
my $t = $DMTAB{dmname($SYS)}
  or fail "/dev/$VG/$LV isn't a device-mapper device";
if ($DEBUG) {
  burble "found table...";
  burble("\t" . join " ", @$_) foreach @$t;
}
($t->[0][2] // "") eq "snapshot" or fail "/dev/$VG/$LV isn't a snapshot";

## Create a udev monitor.  We're only interested in disk-shaped block
## devices.  (If we use some other device kind for synchronization then this
## filter will have to be broadened.)
my $u = umon_create "--subsystem-match=block/disk";

## Prepare for the awful synchronization hack.  We need to make sure, below,
## that we've read all of the interesting events resulting from an `lvremove'
## call.  To do this, we wait for an event on a different device -- but we
## must avoid being fooled by spurious events on this device.  As an attempt
## to minimize the probability of this going wrong, acquire a pet device
## which nobody else is using.  The best idea seems to be a loopback device.
open my $lopipe, "-|", "losetup", "--show", "--find", "/etc/motd"
  or fail "open (losetup attach): $!";
my $lo = $lopipe->getline;
1 while defined $lopipe->getline;
$lopipe->close or fail "wait (losetup attach): $! (rc = $?)";
defined $lo or fail "losetup attach: no device name reported";
chomp $lo;
$lo =~ s:^/dev/::;
cleanup { system "losetup", "--detach", "/dev/$lo" };

## Initial synchronization, to make sure stuff works.
umon_sync $u, $lo;

## Try to remove the snapshot.  Capture stdout and stderr, and relay them if
## nothing serious went wrong.  An exit code of 5 is taken as the sign of
## the failure we know how to recover from; anything else -- including
## success -- is passed on to our caller.
burble "initial attempt to remove snapshot";
my ($rc, $out, $err) = capture "lvremove", "--force", "$VG/$LV";
if ($rc != 0x500) {
  print STDOUT $out;
  print STDERR $err;
  burble "lvremove didn't explode (rc = $rc): we're done here";

  ## Fold the wait status down into an exit code: the exit code itself if
  ## there was one, or 128 plus the signal number, as shells do.
  if ($rc >> 8) { $rc >>= 8 }
  elsif ($rc & 127) { $rc = ($rc & 127) + 128 }
  exit $rc;
}
burble "initial lvremove failed";

## OK, stuff went wrong.  First see if there was a udev cookie left over, and
## if so try to release it.  It's important to know that we've read all of
## the relevant uevents, so synchronize again.
my @e = umon_sync $u, $lo;
my %c = ();
for my $e (@e) {
  next unless defined $e->{DEVPATH} && $e->{DEVPATH} eq $SYS &&
    exists $e->{DM_COOKIE};
  $c{($e->{DM_COOKIE} & 0xffff) | 0xd4d0000} = 1;
}
burble "cookies used: " . join ", ", map { sprintf "0x%x", $_ } keys %c;

## Find the used cookies which are still extant, and release them.
open my $uc, "-|", "dmsetup", "udevcookies" or fail "open (cookies): $!";
$uc->getline;                           # skip the header line
my @leak = ();
while (my $line = $uc->getline) {
  my @f = split ' ', $line;
  next unless @f;
  push @leak, $f[0] if $c{fixint $f[0]};
}
close $uc or fail "udevcookies failed (rc = $?)";
for my $cookie (@leak) {
  burble "release leaked cookie $cookie";
  run "release cookie", "dmsetup", "udevreleasecookie", $cookie
    unless $NOACT;
}

## If we're very unlucky, the origin volume may still be suspended.  Resume
## it now, or the next attempt will get stuck.  (Resuming is idempotent, so
## we don't need to check whether it's already running.)  Finding the origin
## is annoying: search the device-mapper table for a device with a
## `snapshot-origin' table referencing the same backing store as the
## snapshot.
my $back = $DMTAB{dmname($SYS)}[0][3];
my $orig = undef;
burble "backend device $back";
for my $dm (keys %DMTAB) {
  my $tab = $DMTAB{$dm};
  next unless @$tab == 1 &&
    ($tab->[0][2] // "") eq "snapshot-origin" &&
    ($tab->[0][3] // "") eq $back;
  defined $orig and fail "snapshot appears to have multiple origins";
  $orig = $dm;
}
defined $orig or fail "couldn't find snapshot origin device";
burble "found origin volume $orig; resuming...";
run "resume origin $orig", "dmsetup", "resume", $orig unless $NOACT;

## See whether removing the snapshot again helps any.
burble "retry snapshot removal";
run "retry", "lvremove", "--force", "$VG/$LV" unless $NOACT;

## OK, we're on the way to recovery.  The origin device may now be not a
## snapshot-origin any more.  Refresh the device-mapper table and inspect it.
dmtable_update;
if (-d "/sys/dev/block/$back") {
  my $backdm = dmname "/dev/block/$back";
  my $origtab = $DMTAB{$orig};
  my $origkind = $origtab ? $origtab->[0][2] // "" : "";
  if ($origkind ne "snapshot-origin") {
    burble "origin released but backend $backdm still exists: remove";
    run "remove backend $backdm", "dmsetup", "remove", $backdm
      unless $NOACT;
  }
}

## All done.  There, that wasn't so bad, was it?
burble "completed successfully";
exit 0;

###----- That's all, folks --------------------------------------------------