chiark - git - mdw - rsync-backup/blob - fshash.in

   1 #! @PYTHON@
   2 ###
   3 ### Efficiently construct canonical digests of filesystems
   4 ###
   5 ### (c) 2012 Mark Wooding
   6 ###
   7
   8 ###----- Licensing notice ---------------------------------------------------
   9 ###
  10 ### This file is part of the `rsync-backup' program.
  11 ###
  12 ### rsync-backup is free software; you can redistribute it and/or modify
  13 ### it under the terms of the GNU General Public License as published by
  14 ### the Free Software Foundation; either version 2 of the License, or
  15 ### (at your option) any later version.
  16 ###
  17 ### rsync-backup is distributed in the hope that it will be useful,
  18 ### but WITHOUT ANY WARRANTY; without even the implied warranty of
  19 ### MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  20 ### GNU General Public License for more details.
  21 ###
  22 ### You should have received a copy of the GNU General Public License
  23 ### along with rsync-backup; if not, write to the Free Software Foundation,
  24 ### Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
  25
  26 from sys import argv, exc_info, exit, stdin, stdout, stderr
  27 import binascii as B
  28 import errno as E
  29 import hashlib as H
  30 import optparse as OP
  31 import os as OS
  32 import re as RX
  33 import sqlite3 as DB
  34 import stat as ST
  35 import time as T
  36 import zlib as Z
  37
  38 PACKAGE = '@PACKAGE@'
  39 VERSION = '@VERSION@'
  40
  41 ###--------------------------------------------------------------------------
  42 ### Utilities.
  43
  44 import sys as _SYS
  45 _PYVER = _SYS.version_info
  46 if _PYVER >= (3,):
  47   _FSENC = _SYS.getfilesystemencoding()
  48   if _PYVER >= (3, 1): _FSENCERR = "surrogateescape"
  49   else: _FSENCERR = "strict"
  50   from io import BytesIO, StringIO
  51   def bin(x): return x.encode(_FSENC, _FSENCERR)
  52   def text(x): return x.decode(_FSENC, _FSENCERR)
  53   def bytechr(x): return bytes([x])
  54   def byteord(x): return x
  55 else:
  56   from cStringIO import StringIO; BytesIO = StringIO
  57   def bin(x): return x
  58   def text(x): return x
  59   def bytechr(x): return chr(x)
  60   def byteord(x): return ord(x)
  61 def excval(): return exc_info()[1]
  62
  63 QUIS = OS.path.basename(argv[0])
  64
  65 def moan(msg):
  66   stderr.write('%s: %s\n' % (QUIS, msg))
  67
  68 def die(msg, rc = 1):
  69   moan(msg)
  70   exit(rc)
  71
  72 SYSERR = 0
  73 def syserr(msg):
  74   global SYSERR
  75   moan(msg)
  76   SYSERR += 1
  77
  78 def escapify(x):
  79   out = StringIO()
  80   for ch in bin(x):
  81     k = byteord(ch)
  82     if k == 9: out.write("\\t")
  83     elif k == 10: out.write("\\n")
  84     elif k == 13: out.write("\\r")
  85     elif k == 39: out.write("\\'")
  86     elif k == 92: out.write("\\\\")
  87     elif 20 <= k <= 126: out.write(chr(k))
  88     else: out.write("\\x%02x" % k)
  89   return out.getvalue()
  90
  91 R_STRESC = RX.compile(r"\\ (?: x ([0-9A-Fa-f]{2}) | (.))",
  92                       RX.VERBOSE)
  93 def unescapify(x):
  94   str = BytesIO()
  95   i, n = 0, len(x)
  96   while True:
  97     m = R_STRESC.search(x, i)
  98     if m is not None: j = m.start(0)
  99     else: j = n
 100     str.write(bin(str[i:j]))
 101     if m is None: break
 102     k, e = m.group(1), m.group(2)
 103     if k is not None: ch = int(k, 16)
 104     elif ch == "a": ch = 7
 105     elif ch == "b": ch = 8
 106     elif ch == "f": ch = 12
 107     elif ch == "n": ch = 10
 108     elif ch == "r": ch = 13
 109     elif ch == "t": ch = 9
 110     elif ch == "v": ch = 11
 111     else: ch = byteord(e)
 112     str.write(bytechr(ch))
 113     i = m.end(0)
 114   return text(out.getvalue())
 115
 116 ###--------------------------------------------------------------------------
 117 ### File system enumeration.
 118
 119 class FileInfo (object):
 120   def __init__(me, file, st = None):
 121     me.name = file
 122     if st:
 123       me.st = st
 124       me.err = None
 125     else:
 126       try:
 127         me.st = OS.lstat(file)
 128         me.err = None
 129       except OSError:
 130         me.st = None
 131         me.err = excval()
 132
 133 def enum_walk(file, func):
 134
 135   def dirents(name):
 136     try:
 137       return OS.listdir(name)
 138     except OSError:
 139       syserr("failed to read directory `%s': %s" % (name, excval().strerror))
 140       return []
 141
 142   def dir(ee, dev):
 143     ff = []
 144     dd = []
 145     for e in ee:
 146       fi = FileInfo(e)
 147       if fi.st and fi.st.st_dev != dev: pass
 148       if fi.st and ST.S_ISDIR(fi.st.st_mode): dd.append(fi)
 149       else: ff.append(fi)
 150     ff.sort(key = lambda fi: fi.name)
 151     dd.sort(key = lambda fi: fi.name + '/')
 152     for f in ff:
 153       func(f)
 154     for d in dd:
 155       if d.st.st_dev == dev:
 156         func(d)
 157         dir([OS.path.join(d.name, e) for e in dirents(d.name)], dev)
 158
 159   if file.endswith('/'):
 160     cwd = OS.open('.', OS.O_RDONLY)
 161     try:
 162       OS.chdir(file)
 163       fi = FileInfo('.')
 164       func(fi)
 165       dir(dirents('.'), fi.st.st_dev)
 166     finally:
 167       OS.fchdir(cwd)
 168       OS.close(cwd)
 169   else:
 170     fi = FileInfo(file)
 171     func(fi)
 172     if fi.st and ST.S_ISDIR(fi.st.st_mode):
 173       dir([OS.path.join(fi.name, e) for e in dirents(fi.name)],
 174           fi.st.st_dev)
 175
 176 def enum_find0(f, func):
 177   tail = ""
 178   while True:
 179     buf = f.read(8192)
 180     last = len(buf) == 0
 181     names = (tail + buf).split('\0')
 182     tail = names.pop()
 183     for n in names:
 184       func(FileInfo(n))
 185     if last:
 186       break
 187   if len(tail):
 188     moan("ignored trailing junk after last filename")
 189
 190 R_RSYNCESC = RX.compile(r'\\ \# ([0-7]{3})', RX.VERBOSE)
 191 def enum_rsync(f, func):
 192
 193   ## The format is a little fiddly.  Each line consists of PERMS SIZE DATE
 194   ## TIME NAME, separated by runs of whitespace, but the NAME starts exactly
 195   ## one space character after the TIME and may begin with a space.
 196   ## Sequences of the form `\#OOO', where OOO are three octal digits, stand
 197   ## for a byte with that value.  Newlines, and backslashes which would be
 198   ## ambiguous, are converted into this form; all other characters are
 199   ## literal.
 200   ##
 201   ## We ignore the stat information and retrieve it ourselves, because it's
 202   ## incomplete.  Hopefully the dcache is still warm.
 203
 204   for line in f:
 205     if line.endswith('\n'): line = line[:-1]
 206
 207     ## Extract the escaped name.
 208     ff = line.split(None, 3)
 209     if len(ff) != 4:
 210       syserr("ignoring invalid line from rsync: `%s'" % line)
 211       continue
 212     tail = ff[3]
 213     try:
 214       spc = tail.index(' ')
 215     except ValueError:
 216       syserr("ignoring invalid line from rsync: `%s'" % line)
 217       continue
 218     name = tail[spc + 1:]
 219
 220     ## Now translate escape sequences.
 221     name = R_RSYNCESC.sub(lambda m: chr(int(m.group(1), 8)), name)
 222
 223     ## Call the client.
 224     try:
 225       fi = FileInfo(name)
 226     except OSError:
 227       syserr("failed to stat `%s': %s" % (name, excval().strerror))
 228       continue
 229     func(fi)
 230
 231 ###--------------------------------------------------------------------------
 232 ### The hash cache.
 233
 234 class HashCache (object):
 235
 236   VERSION = 0
 237   BUFSZ = 128*1024
 238
 239   INIT = [
 240     """CREATE TABLE meta (
 241                version INTEGER NOT NULL,
 242                hash TEXT NOT NULL
 243        );""",
 244     """CREATE TABLE hash (
 245                ino INTEGER PRIMARY KEY,
 246                mtime INTEGER NOT NULL,
 247                ctime INTEGER NOT NULL,
 248                size INTEGER NOT NULL,
 249                hash TEXT NOT NULL,
 250                seen BOOLEAN NOT NULL DEFAULT TRUE
 251        );""",
 252     """PRAGMA journal_mode = WAL;"""
 253   ]
 254
 255   def __init__(me, file, hash = None):
 256
 257     if file is None:
 258
 259       ## We're going this alone, with no cache.
 260       db = None
 261       if hash is None:
 262         die("no hash specified and no database cache to read from")
 263     else:
 264
 265       ## Connect to the database.
 266       db = DB.connect(file)
 267       db.text_factory = str
 268
 269       ## See whether we can understand the cache database.
 270       c = db.cursor()
 271       v = h = None
 272       try:
 273         c.execute('SELECT version, hash FROM meta')
 274         v, h = c.fetchone()
 275         if c.fetchone() is not None:
 276           die("cache database corrupt: meta table has mutliple rows")
 277       except (DB.Error, TypeError):
 278         pass
 279
 280       ## If that didn't work, we'd better clear the thing and start again.
 281       ## But only if we know how to initialize it.
 282       if v != me.VERSION:
 283
 284         ## Explain the situation.
 285         moan("cache version %s not understood" % v)
 286         if hash is None:
 287           if h is None:
 288             die("can't initialize cache: no hash function set")
 289           else:
 290             hash = h
 291         try:
 292           H.new(hash)
 293         except Exception:
 294           die("unknown hash function `%s'" % hash)
 295
 296         ## Drop old things.
 297         c.execute('SELECT type, name FROM sqlite_master')
 298         for type, name in c.fetchall():
 299           c.execute('DROP %s IF EXISTS %s' % (type, name))
 300
 301         ## Now we're ready to go.
 302         for stmt in me.INIT:
 303           c.execute(stmt)
 304         c.execute('INSERT INTO meta VALUES (?, ?)', [me.VERSION, hash])
 305         db.commit()
 306
 307       ## Check the hash function if necessary.
 308       if hash is None:
 309         hash = h
 310       elif h is not None and  h != hash:
 311         die("hash mismatch: cache uses %s but %s requested" % (h, hash))
 312
 313     ## All done.
 314     me.hash = hash
 315     me._db = db
 316     me._pend = 0
 317
 318   def hashfile(me, fi):
 319
 320     ## If this isn't a proper file then don't try to hash it.
 321     if fi.err or not ST.S_ISREG(fi.st.st_mode):
 322       return None
 323
 324     ## See whether there's a valid entry in the cache.
 325     if me._db:
 326       c = me._db.cursor()
 327       c.execute(
 328         'SELECT mtime, size, hash, seen FROM hash WHERE ino = ?;',
 329         [fi.st.st_ino])
 330       r = c.fetchone()
 331       if r is not None:
 332         mt, sz, h, s = r
 333         if mt == fi.st.st_mtime and \
 334            sz == fi.st.st_size:
 335           if not s:
 336             c.execute('UPDATE hash SET seen = 1 WHERE ino = ?',
 337                       [fi.st.st_ino])
 338           me._update()
 339           return h
 340
 341     ## Hash the file.  Beware raciness: update the file information from the
 342     ## open descriptor, but set the size from what we actually read.
 343     h = H.new(me.hash)
 344     try:
 345       with open(fi.name, 'rb') as f:
 346         sz = 0
 347         while True:
 348           buf = f.read(me.BUFSZ)
 349           if len(buf) == 0:
 350             break
 351           sz += len(buf)
 352           h.update(buf)
 353         fi.st = OS.fstat(f.fileno())
 354         ##fi.st.st_size = sz
 355       hash = h.digest()
 356     except (OSError, IOError):
 357       fi.st = None
 358       fi.err = excval()
 359       return None
 360     hash = text(B.hexlify(hash))
 361
 362     ## Insert a record into the database.
 363     if me._db:
 364       c.execute("""
 365               INSERT OR REPLACE INTO hash
 366                       (ino, mtime, ctime, size, hash, seen)
 367               VALUES
 368                       (?, ?, ?, ?, ?, 1);
 369       """, [fi.st.st_ino,
 370             fi.st.st_mtime,
 371             fi.st.st_ctime,
 372             fi.st.st_size,
 373             hash])
 374       me._update()
 375
 376     ## Done.
 377     return hash
 378
 379   def _update(me):
 380     me._pend += 1
 381     if me._pend >= 1024:
 382       me.flush()
 383
 384   def flush(me):
 385     if me._db:
 386       me._db.commit()
 387     me._pend = 0
 388
 389   def need_db(me):
 390     if not me._db:
 391       die("no cache database")
 392
 393   def forget(me, ino):
 394     me.need_db()
 395     c = me._db.cursor()
 396     c.execute('DELETE FROM hash WHERE ino = ?', [ino])
 397
 398   def reset(me):
 399     me.need_db()
 400     c = me._db.cursor()
 401     c.execute('UPDATE hash SET seen = 0 WHERE seen')
 402     me.flush()
 403
 404   def prune(me):
 405     me.need_db()
 406     c = me._db.cursor()
 407     c.execute('DELETE FROM hash WHERE NOT seen')
 408     me.flush()
 409
 410 ###--------------------------------------------------------------------------
 411 ### Printing output.
 412
 413 class GenericFormatter (object):
 414   def __init__(me, fi):
 415     me.fi = fi
 416   def _fmt_time(me, t):
 417     tm = T.gmtime(t)
 418     return T.strftime('%Y-%m-%dT%H:%M:%SZ', tm)
 419   def _enc_name(me, n):
 420     return ' \\-> '.join(escapify(n).split(' -> '))
 421   def name(me):
 422     return me._enc_name(me.fi.name)
 423   def info(me):
 424     return me.TYPE
 425   def mode(me):
 426     return '%06o' % me.fi.st.st_mode
 427   def size(me):
 428     return me.fi.st.st_size
 429   def mtime(me):
 430     return me._fmt_time(me.fi.st.st_mtime)
 431   def owner(me):
 432     return '%5d:%d' % (me.fi.st.st_uid, me.fi.st.st_gid)
 433
 434 class ErrorFormatter (GenericFormatter):
 435   def info(me):
 436     return 'E%d %s' % (me.fi.err.errno, me.fi.err.strerror)
 437   def error(me): return 'error'
 438   mode = size = mtime = owner = error
 439
 440 class SocketFormatter (GenericFormatter):
 441   TYPE = 'socket'
 442 class PipeFormatter (GenericFormatter):
 443   TYPE = 'fifo'
 444
 445 class LinkFormatter (GenericFormatter):
 446   TYPE = 'symbolic-link'
 447   def name(me):
 448     n = GenericFormatter.name(me)
 449     try:
 450       d = OS.readlink(me.fi.name)
 451       return '%s -> %s' % (n, me._enc_name(d))
 452     except OSError:
 453       err = excval()
 454       return '%s -> <E%d %s>' % (n, err.errno, err.strerror)
 455
 456 class DirectoryFormatter (GenericFormatter):
 457   TYPE = 'directory'
 458   def name(me): return GenericFormatter.name(me) + '/'
 459   def size(me): return 'dir'
 460
 461 class DeviceFormatter (GenericFormatter):
 462   def info(me):
 463     return '%s %d:%d' % (me.TYPE,
 464                          OS.major(me.fi.st.st_rdev),
 465                          OS.minor(me.fi.st.st_rdev))
 466 class BlockDeviceFormatter (DeviceFormatter):
 467   TYPE = 'block-device'
 468 class CharDeviceFormatter (DeviceFormatter):
 469   TYPE = 'character-device'
 470
 471 class FileFormatter (GenericFormatter):
 472   TYPE = 'regular-file'
 473
 474 class Reporter (object):
 475
 476   TYMAP = {
 477     ST.S_IFSOCK: SocketFormatter,
 478     ST.S_IFDIR: DirectoryFormatter,
 479     ST.S_IFLNK: LinkFormatter,
 480     ST.S_IFREG: FileFormatter,
 481     ST.S_IFBLK: BlockDeviceFormatter,
 482     ST.S_IFCHR: CharDeviceFormatter,
 483     ST.S_IFIFO: PipeFormatter,
 484   }
 485
 486   def __init__(me, db):
 487     me._inomap = {}
 488     me._vinomap = {}
 489     me._db = db
 490     me._hsz = int(H.new(db.hash).digest_size)
 491
 492   def file(me, fi):
 493     h = me._db.hashfile(fi)
 494     if fi.err:
 495       fmt = ErrorFormatter(fi)
 496       vino = 'error'
 497     else:
 498       fmt = me.TYMAP[ST.S_IFMT(fi.st.st_mode)](fi)
 499       inoidx = fi.st.st_dev, fi.st.st_ino
 500       try:
 501         vino = me._inomap[inoidx]
 502       except KeyError:
 503         suffix = ''
 504         seq = 0
 505         while True:
 506           vino = '%08x' % (Z.crc32(bin(fi.name + suffix)) & 0xffffffff)
 507           if vino not in me._vinomap: break
 508           suffix = '\0%d' % seq
 509           seq += 1
 510         me._inomap[inoidx] = vino
 511         if OPTS.compat >= 2: me._vinomap[vino] = inoidx
 512     if h: info = h
 513     else: info = '[%-*s]' % (2*me._hsz - 2, fmt.info())
 514     print('%s %8s %6s %-12s %-20s %20s %s' %
 515           (info, vino, fmt.mode(), fmt.owner(),
 516            fmt.mtime(), fmt.size(), fmt.name()))
 517
 518 ###--------------------------------------------------------------------------
 519 ### Database clearing from diff files.
 520
 521 R_HUNK = RX.compile(r'^@@ -\d+,(\d+) \+\d+,(\d+) @@$')
 522
 523 def clear_entry(db, lno, line):
 524
 525   good = True
 526
 527   if line.startswith('['):
 528     pos = line.find(']')
 529     if pos < 0:
 530       moan("failed to parse file entry (type field; line %d)" % lno)
 531       return False
 532     ty = line[1:pos].strip()
 533     rest = line[pos + 1:]
 534     hash = None
 535   else:
 536     ff = line.split(None, 1)
 537     if len(ff) != 2:
 538       moan("failed to parse file entry (field split; line %d)" % lno)
 539       return False
 540     ty = 'regular-file'
 541     hash, rest = ff
 542
 543   ff = rest.split(None, 5)
 544   if len(ff) != 6:
 545     moan("failed to parse file entry (field split; line %d)" % lno)
 546     return False
 547   ino, mode, uidgid, mtime, sz, name = ff
 548
 549   if ty != 'symbolic-link':
 550     target = None
 551   else:
 552     nn = name.split(' -> ', 1)
 553     if len(nn) != 2:
 554       moan("failed to parse file entry (name split; line %d)" % lno)
 555       return False
 556     name, target = nn
 557     target = unescapify(target)
 558   name = unescapify(name)
 559
 560   try:
 561     st = OS.lstat(name)
 562   except OSError:
 563     e = excval()
 564     moan("failed to stat `%s': %s" % (name, e.strerror))
 565     if e.errno != E.ENOENT: good = False
 566   else:
 567     print("Clear cache entry for `%s'" % name)
 568     db.forget(st.st_ino)
 569
 570   return good
 571
 572 def clear_cache(db):
 573
 574   ## Work through the input diff file one line at a time.
 575   diffstate = 'gap'
 576   lno = 0
 577   good = True
 578   for line in stdin:
 579     if line.endswith('\n'): line = line[:-1]
 580     lno += 1
 581
 582     ## We're in a gap between hunks.  Find a hunk header and extract the line
 583     ## counts.
 584     if diffstate == 'gap':
 585       m = R_HUNK.match(line)
 586       if m:
 587         oldlines = int(m.group(1))
 588         newlines = int(m.group(2))
 589         diffstate = 'hunk'
 590         hdrlno = lno
 591
 592     ## We're in a hunk.  Keep track of whether we've reached the end, and
 593     ## discard entries from the cache for mismatching lines.
 594     elif diffstate == 'hunk':
 595       if len(line) == 0:
 596         moan("empty line in diff hunk (line %d)" % lno)
 597         good = False
 598       ty = line[0]
 599       if ty == ' ':
 600         oldlines -= 1; newlines -= 1
 601       elif ty == '+':
 602         newlines -= 1
 603         if not clear_entry(db, lno, line[1:]): good = False
 604       elif ty == '-':
 605         oldlines -= 1
 606         if not clear_entry(db, lno, line[1:]): good = False
 607       else:
 608         moan("incomprehensible line in diff hunk (line %d)" % lno)
 609         good = false
 610       if oldlines < 0 or newlines < 0:
 611         moan("inconsistent lengths in diff hunk header (line %d)" % hdrlno)
 612         good = False
 613       if oldlines == newlines == 0:
 614         diffstate = 'gap'
 615
 616   if diffstate == 'hunk':
 617     moan("truncated diff hunk (started at line %d)" % hdrlno)
 618     good = False
 619
 620   return good
 621
 622 ###--------------------------------------------------------------------------
 623 ### Main program.
 624
 625 FMTMAP = {
 626   'rsync': lambda f: enum_rsync(stdin, f),
 627   'find0': lambda f: enum_find0(stdin, f)
 628 }
 629 op = OP.OptionParser(
 630   usage = '%prog [-au] [-c CACHE] [-f FORMAT] [-H HASH] [FILE ...]',
 631   version = '%%prog, version %s' % VERSION,
 632   description = '''\
 633 Print a digest of a filesystem (or a collection of specified files) to
 634 standard output.  The idea is that the digest should be mostly /complete/
 635 (i.e., any `interesting\' change to the filesystem results in a different
 636 digest) and /canonical/ (i.e., identical filesystem contents result in
 637 identical output).
 638 ''')
 639
 640 for short, long, props in [
 641   ('-a', '--all', { 'action': 'store_true', 'dest': 'all',
 642                     'help': 'clear cache of all files not seen' }),
 643   ('-c', '--cache', { 'dest': 'cache', 'metavar': 'FILE',
 644                       'help': 'use FILE as a cache for file hashes' }),
 645   ('-f', '--files', { 'dest': 'files', 'metavar': 'FORMAT',
 646                       'type': 'choice', 'choices': list(FMTMAP.keys()),
 647                       'help': 'read files to report in the given FORMAT' }),
 648   ('-u', '--udiff', { 'action': 'store_true', 'dest': 'udiff',
 649                       'help': 'read diff from stdin, clear cache entries' }),
 650   ('-C', '--compat', { 'dest': 'compat', 'metavar': 'VERSION',
 651                        'type': 'int', 'default': 2,
 652                        'help': 'produce output with given compatibility VERSION' }),
 653   ('-H', '--hash', { 'dest': 'hash', 'metavar': 'HASH',
 654                      ##'type': 'choice', 'choices': H.algorithms,
 655                      'help': 'use HASH as the hash function' })]:
 656   op.add_option(short, long, **props)
 657 OPTS, args = op.parse_args(argv)
 658 if not 1 <= OPTS.compat <= 2:
 659   die("unknown compatibility version %d" % OPTS.compat)
 660 if OPTS.udiff:
 661   if OPTS.cache is None or OPTS.all or OPTS.files or len(args) > 2:
 662     die("incompatible options: `-u' requires `-c CACHE', forbids others")
 663   db = HashCache(OPTS.cache, OPTS.hash)
 664   if len(args) == 2: OS.chdir(args[1])
 665   good = True
 666   if not clear_cache(db): good = False
 667   if good: db.flush()
 668   else: exit(2)
 669 else:
 670   if not OPTS.files and len(args) <= 1:
 671     die("no filename sources: nothing to do")
 672   db = HashCache(OPTS.cache, OPTS.hash)
 673   if OPTS.all:
 674     db.reset()
 675   if OPTS.compat >= 2:
 676     print("## fshash report format version %d" % OPTS.compat)
 677   rep = Reporter(db)
 678   if OPTS.files:
 679     FMTMAP[OPTS.files](rep.file)
 680   for dir in args[1:]:
 681     enum_walk(dir, rep.file)
 682   if OPTS.all:
 683     db.prune()
 684   db.flush()
 685
 686 ###----- That's all, folks --------------------------------------------------