chiark - git - mdw - rsync-backup/blob - fshash.in

   1 #! @PYTHON@
   2 ###
   3 ### Efficiently construct canonical digests of filesystems
   4 ###
   5 ### (c) 2012 Mark Wooding
   6 ###
   7
   8 ###----- Licensing notice ---------------------------------------------------
   9 ###
  10 ### This file is part of the `rsync-backup' program.
  11 ###
  12 ### rsync-backup is free software; you can redistribute it and/or modify
  13 ### it under the terms of the GNU General Public License as published by
  14 ### the Free Software Foundation; either version 2 of the License, or
  15 ### (at your option) any later version.
  16 ###
  17 ### rsync-backup is distributed in the hope that it will be useful,
  18 ### but WITHOUT ANY WARRANTY; without even the implied warranty of
  19 ### MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  20 ### GNU General Public License for more details.
  21 ###
  22 ### You should have received a copy of the GNU General Public License
  23 ### along with rsync-backup; if not, write to the Free Software Foundation,
  24 ### Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
  25
  26 from sys import argv, exc_info, exit, stdin, stdout, stderr
  27 import binascii as B
  28 import errno as E
  29 import hashlib as H
  30 import optparse as OP
  31 import os as OS
  32 import re as RX
  33 import sqlite3 as DB
  34 import stat as ST
  35 import time as T
  36 import zlib as Z
  37
  38 PACKAGE = '@PACKAGE@'
  39 VERSION = '@VERSION@'
  40
  41 ###--------------------------------------------------------------------------
  42 ### Utilities.
  43
  44 from cStringIO import StringIO; BytesIO = StringIO
  45 def bin(x): return x
  46 def text(x): return x
  47 def bytechr(x): return chr(x)
  48 def byteord(x): return ord(x)
  49 def excval(): return exc_info()[1]
  50
  51 QUIS = OS.path.basename(argv[0])
  52
  53 def moan(msg):
  54   stderr.write('%s: %s\n' % (QUIS, msg))
  55
  56 def die(msg, rc = 1):
  57   moan(msg)
  58   exit(rc)
  59
  60 SYSERR = 0
  61 def syserr(msg):
  62   global SYSERR
  63   moan(msg)
  64   SYSERR += 1
  65
  66 def escapify(x):
  67   out = StringIO()
  68   for ch in bin(x):
  69     k = byteord(ch)
  70     if k == 9: out.write("\\t")
  71     elif k == 10: out.write("\\n")
  72     elif k == 13: out.write("\\r")
  73     elif k == 39: out.write("\\'")
  74     elif k == 92: out.write("\\\\")
  75     elif 20 <= k <= 126: out.write(chr(k))
  76     else: out.write("\\x%02x" % k)
  77   return out.getvalue()
  78
  79 R_STRESC = RX.compile(r"\\ (?: x ([0-9A-Fa-f]{2}) | (.))",
  80                       RX.VERBOSE)
  81 def unescapify(x):
  82   str = BytesIO()
  83   i, n = 0, len(x)
  84   while True:
  85     m = R_STRESC.search(x, i)
  86     if m is not None: j = m.start(0)
  87     else: j = n
  88     str.write(bin(str[i:j]))
  89     if m is None: break
  90     k, e = m.group(1), m.group(2)
  91     if k is not None: ch = int(k, 16)
  92     elif ch == "a": ch = 7
  93     elif ch == "b": ch = 8
  94     elif ch == "f": ch = 12
  95     elif ch == "n": ch = 10
  96     elif ch == "r": ch = 13
  97     elif ch == "t": ch = 9
  98     elif ch == "v": ch = 11
  99     else: ch = byteord(e)
 100     str.write(bytechr(ch))
 101     i = m.end(0)
 102   return text(out.getvalue())
 103
 104 ###--------------------------------------------------------------------------
 105 ### File system enumeration.
 106
 107 class FileInfo (object):
 108   def __init__(me, file, st = None):
 109     me.name = file
 110     if st:
 111       me.st = st
 112       me.err = None
 113     else:
 114       try:
 115         me.st = OS.lstat(file)
 116         me.err = None
 117       except OSError:
 118         me.st = None
 119         me.err = excval()
 120
 121 def enum_walk(file, func):
 122
 123   def dirents(name):
 124     try:
 125       return OS.listdir(name)
 126     except OSError:
 127       syserr("failed to read directory `%s': %s" % (name, excval().strerror))
 128       return []
 129
 130   def dir(ee, dev):
 131     ff = []
 132     dd = []
 133     for e in ee:
 134       fi = FileInfo(e)
 135       if fi.st and fi.st.st_dev != dev: pass
 136       if fi.st and ST.S_ISDIR(fi.st.st_mode): dd.append(fi)
 137       else: ff.append(fi)
 138     ff.sort(key = lambda fi: fi.name)
 139     dd.sort(key = lambda fi: fi.name + '/')
 140     for f in ff:
 141       func(f)
 142     for d in dd:
 143       if d.st.st_dev == dev:
 144         func(d)
 145         dir([OS.path.join(d.name, e) for e in dirents(d.name)], dev)
 146
 147   if file.endswith('/'):
 148     cwd = OS.open('.', OS.O_RDONLY)
 149     try:
 150       OS.chdir(file)
 151       fi = FileInfo('.')
 152       func(fi)
 153       dir(dirents('.'), fi.st.st_dev)
 154     finally:
 155       OS.fchdir(cwd)
 156       OS.close(cwd)
 157   else:
 158     fi = FileInfo(file)
 159     func(fi)
 160     if fi.st and ST.S_ISDIR(fi.st.st_mode):
 161       dir([OS.path.join(fi.name, e) for e in dirents(fi.name)],
 162           fi.st.st_dev)
 163
 164 def enum_find0(f, func):
 165   tail = ""
 166   while True:
 167     buf = f.read(8192)
 168     last = len(buf) == 0
 169     names = (tail + buf).split('\0')
 170     tail = names.pop()
 171     for n in names:
 172       func(FileInfo(n))
 173     if last:
 174       break
 175   if len(tail):
 176     moan("ignored trailing junk after last filename")
 177
 178 R_RSYNCESC = RX.compile(r'\\ \# ([0-7]{3})', RX.VERBOSE)
 179 def enum_rsync(f, func):
 180
 181   ## The format is a little fiddly.  Each line consists of PERMS SIZE DATE
 182   ## TIME NAME, separated by runs of whitespace, but the NAME starts exactly
 183   ## one space character after the TIME and may begin with a space.
 184   ## Sequences of the form `\#OOO', where OOO are three octal digits, stand
 185   ## for a byte with that value.  Newlines, and backslashes which would be
 186   ## ambiguous, are converted into this form; all other characters are
 187   ## literal.
 188   ##
 189   ## We ignore the stat information and retrieve it ourselves, because it's
 190   ## incomplete.  Hopefully the dcache is still warm.
 191
 192   for line in f:
 193     if line.endswith('\n'): line = line[:-1]
 194
 195     ## Extract the escaped name.
 196     ff = line.split(None, 3)
 197     if len(ff) != 4:
 198       syserr("ignoring invalid line from rsync: `%s'" % line)
 199       continue
 200     tail = ff[3]
 201     try:
 202       spc = tail.index(' ')
 203     except ValueError:
 204       syserr("ignoring invalid line from rsync: `%s'" % line)
 205       continue
 206     name = tail[spc + 1:]
 207
 208     ## Now translate escape sequences.
 209     name = R_RSYNCESC.sub(lambda m: chr(int(m.group(1), 8)), name)
 210
 211     ## Call the client.
 212     try:
 213       fi = FileInfo(name)
 214     except OSError:
 215       syserr("failed to stat `%s': %s" % (name, excval().strerror))
 216       continue
 217     func(fi)
 218
 219 ###--------------------------------------------------------------------------
 220 ### The hash cache.
 221
 222 class HashCache (object):
 223
 224   VERSION = 0
 225   BUFSZ = 128*1024
 226
 227   INIT = [
 228     """CREATE TABLE meta (
 229                version INTEGER NOT NULL,
 230                hash TEXT NOT NULL
 231        );""",
 232     """CREATE TABLE hash (
 233                ino INTEGER PRIMARY KEY,
 234                mtime INTEGER NOT NULL,
 235                ctime INTEGER NOT NULL,
 236                size INTEGER NOT NULL,
 237                hash TEXT NOT NULL,
 238                seen BOOLEAN NOT NULL DEFAULT TRUE
 239        );""",
 240     """PRAGMA journal_mode = WAL;"""
 241   ]
 242
 243   def __init__(me, file, hash = None):
 244
 245     if file is None:
 246
 247       ## We're going this alone, with no cache.
 248       db = None
 249       if hash is None:
 250         die("no hash specified and no database cache to read from")
 251     else:
 252
 253       ## Connect to the database.
 254       db = DB.connect(file)
 255       db.text_factory = str
 256
 257       ## See whether we can understand the cache database.
 258       c = db.cursor()
 259       v = h = None
 260       try:
 261         c.execute('SELECT version, hash FROM meta')
 262         v, h = c.fetchone()
 263         if c.fetchone() is not None:
 264           die("cache database corrupt: meta table has mutliple rows")
 265       except (DB.Error, TypeError):
 266         pass
 267
 268       ## If that didn't work, we'd better clear the thing and start again.
 269       ## But only if we know how to initialize it.
 270       if v != me.VERSION:
 271
 272         ## Explain the situation.
 273         moan("cache version %s not understood" % v)
 274         if hash is None:
 275           if h is None:
 276             die("can't initialize cache: no hash function set")
 277           else:
 278             hash = h
 279         try:
 280           H.new(hash)
 281         except Exception:
 282           die("unknown hash function `%s'" % hash)
 283
 284         ## Drop old things.
 285         c.execute('SELECT type, name FROM sqlite_master')
 286         for type, name in c.fetchall():
 287           c.execute('DROP %s IF EXISTS %s' % (type, name))
 288
 289         ## Now we're ready to go.
 290         for stmt in me.INIT:
 291           c.execute(stmt)
 292         c.execute('INSERT INTO meta VALUES (?, ?)', [me.VERSION, hash])
 293         db.commit()
 294
 295       ## Check the hash function if necessary.
 296       if hash is None:
 297         hash = h
 298       elif h is not None and  h != hash:
 299         die("hash mismatch: cache uses %s but %s requested" % (h, hash))
 300
 301     ## All done.
 302     me.hash = hash
 303     me._db = db
 304     me._pend = 0
 305
 306   def hashfile(me, fi):
 307
 308     ## If this isn't a proper file then don't try to hash it.
 309     if fi.err or not ST.S_ISREG(fi.st.st_mode):
 310       return None
 311
 312     ## See whether there's a valid entry in the cache.
 313     if me._db:
 314       c = me._db.cursor()
 315       c.execute(
 316         'SELECT mtime, size, hash, seen FROM hash WHERE ino = ?;',
 317         [fi.st.st_ino])
 318       r = c.fetchone()
 319       if r is not None:
 320         mt, sz, h, s = r
 321         if mt == fi.st.st_mtime and \
 322            sz == fi.st.st_size:
 323           if not s:
 324             c.execute('UPDATE hash SET seen = 1 WHERE ino = ?',
 325                       [fi.st.st_ino])
 326           me._update()
 327           return h
 328
 329     ## Hash the file.  Beware raciness: update the file information from the
 330     ## open descriptor, but set the size from what we actually read.
 331     h = H.new(me.hash)
 332     try:
 333       with open(fi.name, 'rb') as f:
 334         sz = 0
 335         while True:
 336           buf = f.read(me.BUFSZ)
 337           if len(buf) == 0:
 338             break
 339           sz += len(buf)
 340           h.update(buf)
 341         fi.st = OS.fstat(f.fileno())
 342         ##fi.st.st_size = sz
 343       hash = h.digest()
 344     except (OSError, IOError):
 345       fi.st = None
 346       fi.err = excval()
 347       return None
 348     hash = text(B.hexlify(hash))
 349
 350     ## Insert a record into the database.
 351     if me._db:
 352       c.execute("""
 353               INSERT OR REPLACE INTO hash
 354                       (ino, mtime, ctime, size, hash, seen)
 355               VALUES
 356                       (?, ?, ?, ?, ?, 1);
 357       """, [fi.st.st_ino,
 358             fi.st.st_mtime,
 359             fi.st.st_ctime,
 360             fi.st.st_size,
 361             hash])
 362       me._update()
 363
 364     ## Done.
 365     return hash
 366
 367   def _update(me):
 368     me._pend += 1
 369     if me._pend >= 1024:
 370       me.flush()
 371
 372   def flush(me):
 373     if me._db:
 374       me._db.commit()
 375     me._pend = 0
 376
 377   def need_db(me):
 378     if not me._db:
 379       die("no cache database")
 380
 381   def forget(me, ino):
 382     me.need_db()
 383     c = me._db.cursor()
 384     c.execute('DELETE FROM hash WHERE ino = ?', [ino])
 385
 386   def reset(me):
 387     me.need_db()
 388     c = me._db.cursor()
 389     c.execute('UPDATE hash SET seen = 0 WHERE seen')
 390     me.flush()
 391
 392   def prune(me):
 393     me.need_db()
 394     c = me._db.cursor()
 395     c.execute('DELETE FROM hash WHERE NOT seen')
 396     me.flush()
 397
 398 ###--------------------------------------------------------------------------
 399 ### Printing output.
 400
 401 class GenericFormatter (object):
 402   def __init__(me, fi):
 403     me.fi = fi
 404   def _fmt_time(me, t):
 405     tm = T.gmtime(t)
 406     return T.strftime('%Y-%m-%dT%H:%M:%SZ', tm)
 407   def _enc_name(me, n):
 408     return ' \\-> '.join(escapify(n).split(' -> '))
 409   def name(me):
 410     return me._enc_name(me.fi.name)
 411   def info(me):
 412     return me.TYPE
 413   def mode(me):
 414     return '%06o' % me.fi.st.st_mode
 415   def size(me):
 416     return me.fi.st.st_size
 417   def mtime(me):
 418     return me._fmt_time(me.fi.st.st_mtime)
 419   def owner(me):
 420     return '%5d:%d' % (me.fi.st.st_uid, me.fi.st.st_gid)
 421
 422 class ErrorFormatter (GenericFormatter):
 423   def info(me):
 424     return 'E%d %s' % (me.fi.err.errno, me.fi.err.strerror)
 425   def error(me): return 'error'
 426   mode = size = mtime = owner = error
 427
 428 class SocketFormatter (GenericFormatter):
 429   TYPE = 'socket'
 430 class PipeFormatter (GenericFormatter):
 431   TYPE = 'fifo'
 432
 433 class LinkFormatter (GenericFormatter):
 434   TYPE = 'symbolic-link'
 435   def name(me):
 436     n = GenericFormatter.name(me)
 437     try:
 438       d = OS.readlink(me.fi.name)
 439       return '%s -> %s' % (n, me._enc_name(d))
 440     except OSError:
 441       err = excval()
 442       return '%s -> <E%d %s>' % (n, err.errno, err.strerror)
 443
 444 class DirectoryFormatter (GenericFormatter):
 445   TYPE = 'directory'
 446   def name(me): return GenericFormatter.name(me) + '/'
 447   def size(me): return 'dir'
 448
 449 class DeviceFormatter (GenericFormatter):
 450   def info(me):
 451     return '%s %d:%d' % (me.TYPE,
 452                          OS.major(me.fi.st.st_rdev),
 453                          OS.minor(me.fi.st.st_rdev))
 454 class BlockDeviceFormatter (DeviceFormatter):
 455   TYPE = 'block-device'
 456 class CharDeviceFormatter (DeviceFormatter):
 457   TYPE = 'character-device'
 458
 459 class FileFormatter (GenericFormatter):
 460   TYPE = 'regular-file'
 461
 462 class Reporter (object):
 463
 464   TYMAP = {
 465     ST.S_IFSOCK: SocketFormatter,
 466     ST.S_IFDIR: DirectoryFormatter,
 467     ST.S_IFLNK: LinkFormatter,
 468     ST.S_IFREG: FileFormatter,
 469     ST.S_IFBLK: BlockDeviceFormatter,
 470     ST.S_IFCHR: CharDeviceFormatter,
 471     ST.S_IFIFO: PipeFormatter,
 472   }
 473
 474   def __init__(me, db):
 475     me._inomap = {}
 476     me._vinomap = {}
 477     me._db = db
 478     me._hsz = int(H.new(db.hash).digest_size)
 479
 480   def file(me, fi):
 481     h = me._db.hashfile(fi)
 482     if fi.err:
 483       fmt = ErrorFormatter(fi)
 484       vino = 'error'
 485     else:
 486       fmt = me.TYMAP[ST.S_IFMT(fi.st.st_mode)](fi)
 487       inoidx = fi.st.st_dev, fi.st.st_ino
 488       try:
 489         vino = me._inomap[inoidx]
 490       except KeyError:
 491         suffix = ''
 492         seq = 0
 493         while True:
 494           vino = '%08x' % (Z.crc32(bin(fi.name + suffix)) & 0xffffffff)
 495           if vino not in me._vinomap: break
 496           suffix = '\0%d' % seq
 497           seq += 1
 498         me._inomap[inoidx] = vino
 499         if OPTS.compat >= 2: me._vinomap[vino] = inoidx
 500     if h: info = h
 501     else: info = '[%-*s]' % (2*me._hsz - 2, fmt.info())
 502     print('%s %8s %6s %-12s %-20s %20s %s' %
 503           (info, vino, fmt.mode(), fmt.owner(),
 504            fmt.mtime(), fmt.size(), fmt.name()))
 505
 506 ###--------------------------------------------------------------------------
 507 ### Database clearing from diff files.
 508
 509 R_HUNK = RX.compile(r'^@@ -\d+,(\d+) \+\d+,(\d+) @@$')
 510
 511 def clear_entry(db, lno, line):
 512
 513   good = True
 514
 515   if line.startswith('['):
 516     pos = line.find(']')
 517     if pos < 0:
 518       moan("failed to parse file entry (type field; line %d)" % lno)
 519       return False
 520     ty = line[1:pos].strip()
 521     rest = line[pos + 1:]
 522     hash = None
 523   else:
 524     ff = line.split(None, 1)
 525     if len(ff) != 2:
 526       moan("failed to parse file entry (field split; line %d)" % lno)
 527       return False
 528     ty = 'regular-file'
 529     hash, rest = ff
 530
 531   ff = rest.split(None, 5)
 532   if len(ff) != 6:
 533     moan("failed to parse file entry (field split; line %d)" % lno)
 534     return False
 535   ino, mode, uidgid, mtime, sz, name = ff
 536
 537   if ty != 'symbolic-link':
 538     target = None
 539   else:
 540     nn = name.split(' -> ', 1)
 541     if len(nn) != 2:
 542       moan("failed to parse file entry (name split; line %d)" % lno)
 543       return False
 544     name, target = nn
 545     target = unescapify(target)
 546   name = unescapify(name)
 547
 548   try:
 549     st = OS.lstat(name)
 550   except OSError:
 551     e = excval()
 552     moan("failed to stat `%s': %s" % (name, e.strerror))
 553     if e.errno != E.ENOENT: good = False
 554   else:
 555     print("Clear cache entry for `%s'" % name)
 556     db.forget(st.st_ino)
 557
 558   return good
 559
 560 def clear_cache(db):
 561
 562   ## Work through the input diff file one line at a time.
 563   diffstate = 'gap'
 564   lno = 0
 565   good = True
 566   for line in stdin:
 567     if line.endswith('\n'): line = line[:-1]
 568     lno += 1
 569
 570     ## We're in a gap between hunks.  Find a hunk header and extract the line
 571     ## counts.
 572     if diffstate == 'gap':
 573       m = R_HUNK.match(line)
 574       if m:
 575         oldlines = int(m.group(1))
 576         newlines = int(m.group(2))
 577         diffstate = 'hunk'
 578         hdrlno = lno
 579
 580     ## We're in a hunk.  Keep track of whether we've reached the end, and
 581     ## discard entries from the cache for mismatching lines.
 582     elif diffstate == 'hunk':
 583       if len(line) == 0:
 584         moan("empty line in diff hunk (line %d)" % lno)
 585         good = False
 586       ty = line[0]
 587       if ty == ' ':
 588         oldlines -= 1; newlines -= 1
 589       elif ty == '+':
 590         newlines -= 1
 591         if not clear_entry(db, lno, line[1:]): good = False
 592       elif ty == '-':
 593         oldlines -= 1
 594         if not clear_entry(db, lno, line[1:]): good = False
 595       else:
 596         moan("incomprehensible line in diff hunk (line %d)" % lno)
 597         good = false
 598       if oldlines < 0 or newlines < 0:
 599         moan("inconsistent lengths in diff hunk header (line %d)" % hdrlno)
 600         good = False
 601       if oldlines == newlines == 0:
 602         diffstate = 'gap'
 603
 604   if diffstate == 'hunk':
 605     moan("truncated diff hunk (started at line %d)" % hdrlno)
 606     good = False
 607
 608   return good
 609
 610 ###--------------------------------------------------------------------------
 611 ### Main program.
 612
 613 FMTMAP = {
 614   'rsync': lambda f: enum_rsync(stdin, f),
 615   'find0': lambda f: enum_find0(stdin, f)
 616 }
 617 op = OP.OptionParser(
 618   usage = '%prog [-au] [-c CACHE] [-f FORMAT] [-H HASH] [FILE ...]',
 619   version = '%%prog, version %s' % VERSION,
 620   description = '''\
 621 Print a digest of a filesystem (or a collection of specified files) to
 622 standard output.  The idea is that the digest should be mostly /complete/
 623 (i.e., any `interesting\' change to the filesystem results in a different
 624 digest) and /canonical/ (i.e., identical filesystem contents result in
 625 identical output).
 626 ''')
 627
 628 for short, long, props in [
 629   ('-a', '--all', { 'action': 'store_true', 'dest': 'all',
 630                     'help': 'clear cache of all files not seen' }),
 631   ('-c', '--cache', { 'dest': 'cache', 'metavar': 'FILE',
 632                       'help': 'use FILE as a cache for file hashes' }),
 633   ('-f', '--files', { 'dest': 'files', 'metavar': 'FORMAT',
 634                       'type': 'choice', 'choices': list(FMTMAP.keys()),
 635                       'help': 'read files to report in the given FORMAT' }),
 636   ('-u', '--udiff', { 'action': 'store_true', 'dest': 'udiff',
 637                       'help': 'read diff from stdin, clear cache entries' }),
 638   ('-C', '--compat', { 'dest': 'compat', 'metavar': 'VERSION',
 639                        'type': 'int', 'default': 2,
 640                        'help': 'produce output with given compatibility VERSION' }),
 641   ('-H', '--hash', { 'dest': 'hash', 'metavar': 'HASH',
 642                      ##'type': 'choice', 'choices': H.algorithms,
 643                      'help': 'use HASH as the hash function' })]:
 644   op.add_option(short, long, **props)
 645 OPTS, args = op.parse_args(argv)
 646 if not 1 <= OPTS.compat <= 2:
 647   die("unknown compatibility version %d" % OPTS.compat)
 648 if OPTS.udiff:
 649   if OPTS.cache is None or OPTS.all or OPTS.files or len(args) > 2:
 650     die("incompatible options: `-u' requires `-c CACHE', forbids others")
 651   db = HashCache(OPTS.cache, OPTS.hash)
 652   if len(args) == 2: OS.chdir(args[1])
 653   good = True
 654   if not clear_cache(db): good = False
 655   if good: db.flush()
 656   else: exit(2)
 657 else:
 658   if not OPTS.files and len(args) <= 1:
 659     die("no filename sources: nothing to do")
 660   db = HashCache(OPTS.cache, OPTS.hash)
 661   if OPTS.all:
 662     db.reset()
 663   if OPTS.compat >= 2:
 664     print("## fshash report format version %d" % OPTS.compat)
 665   rep = Reporter(db)
 666   if OPTS.files:
 667     FMTMAP[OPTS.files](rep.file)
 668   for dir in args[1:]:
 669     enum_walk(dir, rep.file)
 670   if OPTS.all:
 671     db.prune()
 672   db.flush()
 673
 674 ###----- That's all, folks --------------------------------------------------