### Efficiently construct canonical digests of filesystems
###
### (c) 2012 Mark Wooding
###

###----- Licensing notice ---------------------------------------------------
###
### This file is part of the `rsync-backup' program.
###
### rsync-backup is free software; you can redistribute it and/or modify
### it under the terms of the GNU General Public License as published by
### the Free Software Foundation; either version 2 of the License, or
### (at your option) any later version.
###
### rsync-backup is distributed in the hope that it will be useful,
### but WITHOUT ANY WARRANTY; without even the implied warranty of
### MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
### GNU General Public License for more details.
###
### You should have received a copy of the GNU General Public License
### along with rsync-backup; if not, write to the Free Software Foundation,
### Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
from sys import argv, exc_info, exit, stdin, stdout, stderr

import binascii as B
import errno as E
import grp as GR
import hashlib as H
import optparse as OP
import os as OS
import pwd as PW
import re as RX
import sqlite3 as DB
import stat as ST
import sys as _SYS
import time as T
import zlib as Z
###--------------------------------------------------------------------------

_PYVER = _SYS.version_info
if _PYVER >= (3,):
  _FSENC = _SYS.getfilesystemencoding()
  if _PYVER >= (3, 1): _FSENCERR = "surrogateescape"
  else: _FSENCERR = "strict"
  from io import BytesIO, StringIO
  def bin(x): return x.encode(_FSENC, _FSENCERR)
  def text(x): return x.decode(_FSENC, _FSENCERR)
  def bytechr(x): return bytes([x])
  def byteord(x): return x
  def iterkeys(x): return x.keys()
else:
  from cStringIO import StringIO; BytesIO = StringIO
  def bin(x): return x
  def text(x): return x
  def bytechr(x): return chr(x)
  def byteord(x): return ord(x)
  def iterkeys(x): return x.iterkeys()
def excval(): return exc_info()[1]
QUIS = OS.path.basename(argv[0])

def moan(msg):
  stderr.write('%s: %s\n' % (QUIS, msg))
def die(msg): moan(msg); exit(1)

## Report a system error, but carry on.
def syserr(msg): moan(msg)

def escapify(x):
  out = StringIO()
  for ch in bin(x):
    k = byteord(ch)
    if k == 9: out.write("\\t")
    elif k == 10: out.write("\\n")
    elif k == 13: out.write("\\r")
    elif k == 39: out.write("\\'")
    elif k == 92: out.write("\\\\")
    elif 32 <= k <= 126: out.write(chr(k))
    else: out.write("\\x%02x" % k)
  return out.getvalue()
R_STRESC = RX.compile(r""" \\ (?: x ([0-9A-Fa-f]{2}) | (.)) """,
                      RX.VERBOSE)
def unescapify(x):
  out = BytesIO()
  i, n = 0, len(x)
  while i < n:
    m = R_STRESC.search(x, i)
    if m is not None: j = m.start(0)
    else: j = n
    out.write(bin(x[i:j]))
    if m is None: break
    k, e = m.group(1), m.group(2)
    if k is not None: ch = int(k, 16)
    elif e == "a": ch = 7
    elif e == "b": ch = 8
    elif e == "f": ch = 12
    elif e == "n": ch = 10
    elif e == "r": ch = 13
    elif e == "t": ch = 9
    elif e == "v": ch = 11
    else: ch = byteord(e)
    out.write(bytechr(ch))
    i = m.end(0)
  return text(out.getvalue())
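
## For example, given the helpers above: a tab becomes the two characters
## `\t', a byte outside printable ASCII becomes a `\xNN' hex escape, and
## unescapify() undoes the transformation again, e.g.
##
##      escapify("a\tb")    == "a\\tb"
##      unescapify("a\\tb") == "a\tb"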
def simple_memo(func):
  ## Remember the result of calling FUNC on each (hashable) argument, so
  ## that repeated user and group lookups don't hit the name service.
  memo = dict()
  def _wrapper(arg):
    try: return memo[arg]
    except KeyError:
      memo[arg] = value = func(arg)
      return value
  return _wrapper

@simple_memo
def name_uid(name):
  pw = PW.getpwnam(name)
  return pw.pw_uid

@simple_memo
def name_gid(name):
  gr = GR.getgrnam(name)
  return gr.gr_gid
###--------------------------------------------------------------------------
### Extended attributes.

def listxattr(f, follow_symlinks = True): return []

if hasattr(OS, "listxattr"):
  ## Python 3.3 and later have native support.
  getxattr, listxattr = OS.getxattr, OS.listxattr
else:
  try: import xattr as _XA
  except ImportError: pass
  else:
    if hasattr(_XA, "list"):
      def listxattr(f, follow_symlinks = True):
        return _XA.list(f, nofollow = not follow_symlinks)
      def getxattr(f, a, follow_symlinks = True):
        return _XA.get(f, a, nofollow = not follow_symlinks)
    else:
      def listxattr(f, follow_symlinks = True):
        return _XA.listxattr(f, nofollow = not follow_symlinks)
      def getxattr(f, a, follow_symlinks = True):
        return _XA.getxattr(f, a, nofollow = not follow_symlinks)
###--------------------------------------------------------------------------
### Access control lists.

ACL_ACC, ACL_DFLT = range(2)

## Stub, for systems without POSIX.1e ACL support.
HAVE_ACL_P = False
def getacl(f, which): return None

import posix1e as ACL
HAVE_ACL_P = True
## Match a line from the standard ACL text format.  This accepts blank
## lines and comments (which are ignored), and captures the tag type, the
## qualifier (which may be empty), and the permission string.
R_ACLENT = RX.compile(r"""^ \s*
  (?: (u | user | g | group | m | mask | o | other)
      \s* : \s*
      (| [^:\s] | [^:\s] [^:]* [^:\s])
      \s* : \s*
      ([-rwx]*)
      \s* )?
  (?: \# .* )? $""", RX.VERBOSE)
## Codes for the possible entry tag types.  These are ordered so that we
## can sort entries into a canonical order simply by comparing codes.
AT_OWNUID, AT_USER, AT_MASK, AT_OWNGID, AT_GROUP, AT_OTHER = range(1, 7)

## Output tags corresponding to the codes.
ACL_TAGMAP = [None, "u", "u", "m", "g", "g", "o"]
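
## For reference, the text form parsed here consists of lines such as
##
##      user::rw-
##      user:fred:rwx           #effective:r--
##      group::r-x
##      mask::r--
##      other::---
##
## where `fred' stands for any user name; blank lines, and trailing
## `#effective:' comments, are skipped.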
def getacl(f, which):

  ## Fetch the file ACL.
  if which == ACL_ACC: acl = ACL.ACL(file = f)
  elif which == ACL_DFLT: acl = ACL.ACL(filedef = f)
  else: raise ValueError("unexpected WHICH = %d" % which)

  ## For maximum portability, only use the text format, which is guaranteed
  ## to be supported if anything is.  We'll have to parse this ourselves.
  ## Honestly, an important part of what we're doing here is producing a
  ## /canonical/ presentation of the ACL, which doesn't seem to be
  ## something that even the less portable functions will do for us.
  s = str(acl)
  entries = []
  extp = False

  ## First pass: grind through the ACL entries and build a list of (TAG,
  ## QUAL, MODE) triples, where the TAG is an `AT_...' code, the QUAL is
  ## either `None' or a numeric ID, and the MODE is a bitmask of permission
  ## bits.
  for line in s.split("\n"):
    m = R_ACLENT.match(line)
    if m is None: raise ValueError("unexpected ACL line `%s'" % line)
    if not m.group(1): continue
    tag, qual, perm = m.group(1), m.group(2), m.group(3)
    if qual == "": qual = None

    ## Convert the tag and qualifier.
    if tag == "u" or tag == "user":
      if qual is None: pass
      elif qual.isdigit(): qual = int(qual, 10)
      else: qual = name_uid(qual)
      if qual is None: tag = AT_OWNUID
      else: tag = AT_USER; extp = True
    elif tag == "m" or tag == "mask":
      if qual is not None:
        raise ValueError("unexpected mask qualifier `%s'" % qual)
      tag = AT_MASK; extp = True
    elif tag == "g" or tag == "group":
      if qual is None: pass
      elif qual.isdigit(): qual = int(qual, 10)
      else: qual = name_gid(qual)
      if qual is None: tag = AT_OWNGID
      else: tag = AT_GROUP; extp = True
    elif tag == "o" or tag == "other":
      if qual is not None:
        raise ValueError("unexpected other qualifier `%s'" % qual)
      tag = AT_OTHER
    else:
      raise ValueError("unexpected tag type `%s'" % tag)

    ## Convert the permissions.
    mode = 0
    for ch in perm:
      if ch == "r": mode |= 4
      elif ch == "w": mode |= 2
      elif ch == "x": mode |= 1
      elif ch == "-": pass
      else: raise ValueError("unexpected permission character `%s'" % ch)

    ## Done: remember this entry.
    entries.append((tag, qual, mode))

  ## If the ACL is trivial then ignore it.  An access ACL is trivial if it
  ## contains only entries which are reflected in the traditional
  ## permission bits.  A default ACL is trivial if it's empty.
  if (which == ACL_ACC and not extp) or \
     (which == ACL_DFLT and not entries):
    return None

  ## Sort the entries.  The tag codes are arranged so that this is a useful
  ## ordering.
  entries.sort()

  ## Produce output.  This happens to be the standard short text format,
  ## with exclusively numeric IDs.
  out = StringIO()
  firstp = True
  for tag, qual, mode in entries:
    if firstp: firstp = False
    else: out.write(",")
    out.write(ACL_TAGMAP[tag])
    out.write(":")
    if qual is not None: out.write(str(qual))
    out.write(":")
    if mode&4: out.write("r")
    else: out.write("-")
    if mode&2: out.write("w")
    else: out.write("-")
    if mode&1: out.write("x")
    else: out.write("-")

  return out.getvalue()
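
## So, for instance, an access ACL granting the owner read/write, a named
## user (uid 1000) read, and nobody else anything comes out roughly as
##
##      u::rw-,u:1000:r--,m::r--,g::---,o::---
##
## and, crucially, identical ACLs always produce identical strings.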
###--------------------------------------------------------------------------
### File system enumeration.

class FileAttr (object):
  def __init__(me, file, attr):
    try: value = getxattr(file, attr, follow_symlinks = False)
    except (OSError, IOError): me.value, me.err = None, excval()
    else: me.value, me.err = value, None

class FileInfo (object):
  def __init__(me, file, st = None):
    me.name = file
    me.err = None
    if st is not None:
      me.st = st
    else:
      try: me.st = OS.lstat(file)
      except OSError: me.st, me.err = None, excval()

    ## Collect ACLs and extended attributes if we can.
    me.xa, me.xa_err = dict(), None
    me.acl_acc = me.aclerr_acc = None
    me.acl_dflt = me.aclerr_dflt = None

    if me.st is not None:

      def collect_acl(which):
        try:
          return getacl(file, which), None
        except (OSError, IOError):
          err = excval()
          if err.errno == E.ENOTSUP: return None, None
          else: return None, excval()

      if not ST.S_ISLNK(me.st.st_mode):
        me.acl_acc, me.aclerr_acc = collect_acl(ACL_ACC)
        if ST.S_ISDIR(me.st.st_mode):
          me.acl_dflt, me.aclerr_dflt = collect_acl(ACL_DFLT)

      try: names = listxattr(file, follow_symlinks = False)
      except (OSError, IOError): me.xa_err = excval()
      else:
        for name in names:
          ## ACLs also show up as system xattrs; skip them here, since
          ## they're reported separately.
          if HAVE_ACL_P and (name == "system.posix_acl_access" or
                             name == "system.posix_acl_default"):
            continue
          me.xa[name] = FileAttr(file, name)
def enum_walk(file, func):

  def dirents(name):
    try:
      return OS.listdir(name)
    except OSError:
      syserr("failed to read directory `%s': %s" % (name, excval().strerror))
      return []

  def dir(names, dev):
    ff, dd = [], []
    for name in names:
      fi = FileInfo(name)
      if fi.st and fi.st.st_dev != dev: pass
      if fi.st and ST.S_ISDIR(fi.st.st_mode): dd.append(fi)
      else: ff.append(fi)
    ff.sort(key = lambda fi: fi.name)
    dd.sort(key = lambda fi: fi.name + '/')
    for f in ff: func(f)
    for d in dd:
      func(d)
      if d.st.st_dev == dev:
        dir([OS.path.join(d.name, e) for e in dirents(d.name)], dev)

  if file.endswith('/'):
    cwd = OS.open('.', OS.O_RDONLY)
    try:
      OS.chdir(file)
      fi = FileInfo('.')
      func(fi)
      dir(dirents('.'), fi.st.st_dev)
    finally:
      OS.fchdir(cwd)
      OS.close(cwd)
  else:
    fi = FileInfo(file)
    func(fi)
    if fi.st and ST.S_ISDIR(fi.st.st_mode):
      dir([OS.path.join(fi.name, e) for e in dirents(fi.name)],
          fi.st.st_dev)
def enum_find0(f, func):
  tail = ""
  while True:
    buf = f.read(8192)
    names = (tail + buf).split('\0')
    tail = names.pop()
    for name in names: func(FileInfo(name))
    if not buf: break
  if len(tail):
    moan("ignored trailing junk after last filename")
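
## Input in this format is easily produced with find(1), e.g.
##
##      find /home -xdev -print0 | fshash -c cache.db -f find0
##
## each name being terminated by a NUL byte.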
R_RSYNCESC = RX.compile(r'\\ \# ([0-7]{3})', RX.VERBOSE)
def enum_rsync(f, func):

  ## The format is a little fiddly.  Each line consists of PERMS SIZE DATE
  ## TIME NAME, separated by runs of whitespace, but the NAME starts exactly
  ## one space character after the TIME and may begin with a space.
  ## Sequences of the form `\#OOO', where OOO are three octal digits, stand
  ## for a byte with that value.  Newlines, and backslashes which would be
  ## ambiguous, are converted into this form; all other characters are
  ## passed through unchanged.
  ##
  ## We ignore the stat information and retrieve it ourselves, because it's
  ## incomplete.  Hopefully the dcache is still warm.
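  ##
  ## So a typical input line looks something like
  ##
  ##      -rw-r--r--          1,234 2012/03/04 05:06:07 some file name
  ##
  ## where everything after the single space following the time is the name.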
  for line in f:
    if line.endswith('\n'): line = line[:-1]

    ## Extract the escaped name.
    ff = line.split(None, 3)
    if len(ff) != 4:
      syserr("ignoring invalid line from rsync: `%s'" % line)
      continue
    tail = ff[3]
    try:
      spc = tail.index(' ')
    except ValueError:
      syserr("ignoring invalid line from rsync: `%s'" % line)
      continue
    name = tail[spc + 1:]

    ## Now translate escape sequences.
    name = R_RSYNCESC.sub(lambda m: chr(int(m.group(1), 8)), name)

    ## Stat the file, and report it.
    try:
      st = OS.lstat(name)
    except OSError:
      syserr("failed to stat `%s': %s" % (name, excval().strerror))
      continue
    func(FileInfo(name, st))
###--------------------------------------------------------------------------

class HashCache (object):

  INIT = [
    """CREATE TABLE meta (
           version INTEGER NOT NULL,
           hash TEXT NOT NULL
       );""",
    """CREATE TABLE hash (
           ino INTEGER PRIMARY KEY,
           mtime INTEGER NOT NULL,
           ctime INTEGER NOT NULL,
           size INTEGER NOT NULL,
           hash TEXT NOT NULL,
           seen BOOLEAN NOT NULL DEFAULT TRUE
       );""",
    """PRAGMA journal_mode = WAL;""",
  ]
  def __init__(me, file, hash = None):

    if file is None:

      ## We're going it alone, with no cache.
      if hash is None:
        die("no hash specified and no database cache to read from")
      me.db = None
      me.hash = hash
      return

    ## Connect to the database.
    db = DB.connect(file)
    db.text_factory = str

    ## See whether we can understand the cache database.
    c = db.cursor()
    v = h = None
    try:
      c.execute('SELECT version, hash FROM meta')
      v, h = c.fetchone()
      if c.fetchone() is not None:
        die("cache database corrupt: meta table has multiple rows")
    except (DB.Error, TypeError):
      pass

    ## If that didn't work, we'd better clear the thing and start again.
    ## But only if we know how to initialize it.
    if v != me.VERSION:

      ## Explain the situation.
      moan("cache version %s not understood" % v)
      if hash is None:
        if h is None:
          die("can't initialize cache: no hash function set")
        hash = h
      try: H.new(hash)
      except ValueError:
        die("unknown hash function `%s'" % hash)

      ## Throw away the old contents and start from scratch.
      c.execute('SELECT type, name FROM sqlite_master')
      for type, name in c.fetchall():
        c.execute('DROP %s IF EXISTS %s' % (type, name))

      ## Now we're ready to go.
      for stmt in me.INIT: c.execute(stmt)
      c.execute('INSERT INTO meta VALUES (?, ?)', [me.VERSION, hash])
      db.commit()

    ## Check the hash function if necessary.
    elif hash is None:
      hash = h
    elif h is not None and h != hash:
      die("hash mismatch: cache uses %s but %s requested" % (h, hash))

    me.db = db
    me.hash = hash
  def hashblob(me, blob):
    h = H.new(me.hash)
    h.update(blob)
    return text(B.hexlify(h.digest()))
  def hashfile(me, fi):

    ## If this isn't a proper file then don't try to hash it.
    if fi.err or not ST.S_ISREG(fi.st.st_mode):
      return None

    ## See whether there's a valid entry in the cache.
    if me.db is not None:
      c = me.db.cursor()
      c.execute(
        'SELECT mtime, size, hash, seen FROM hash WHERE ino = ?;',
        [fi.st.st_ino])
      r = c.fetchone()
      if r is not None:
        mt, sz, h, seen = r
        if mt == fi.st.st_mtime and \
           sz == fi.st.st_size:
          if not seen:
            c.execute('UPDATE hash SET seen = 1 WHERE ino = ?',
                      [fi.st.st_ino])
            me.db.commit()
          return h

    ## Hash the file.  Beware raciness: update the file information from the
    ## open descriptor, but set the size from what we actually read.
    try:
      with open(fi.name, 'rb') as f:
        h = H.new(me.hash)
        sz = 0
        buf = f.read(me.BUFSZ)
        while len(buf):
          h.update(buf); sz += len(buf)
          buf = f.read(me.BUFSZ)
        fi.st = OS.fstat(f.fileno())
        hash = h.digest()
    except (OSError, IOError):
      fi.st, fi.err = None, excval()
      return None
    hash = text(B.hexlify(hash))

    ## Insert a record into the database.
    if me.db is not None:
      c.execute("""
        INSERT OR REPLACE INTO hash
                (ino, mtime, ctime, size, hash, seen)
        VALUES (?, ?, ?, ?, ?, 1);
      """, [fi.st.st_ino, fi.st.st_mtime, fi.st.st_ctime, sz, hash])
      me.db.commit()
    return hash
  ## Forget the cache entry for a single inode.
  def forget(me, ino):
    if me.db is None:
      die("no cache database")
    c = me.db.cursor()
    c.execute('DELETE FROM hash WHERE ino = ?', [ino])
    me.db.commit()

  ## Mark every entry as not seen.
  def reset(me):
    c = me.db.cursor()
    c.execute('UPDATE hash SET seen = 0 WHERE seen')
    me.db.commit()

  ## Drop the entries which weren't seen since the last reset.
  def prune(me):
    c = me.db.cursor()
    c.execute('DELETE FROM hash WHERE NOT seen')
    me.db.commit()
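
## The `seen' flag gives the cache a simple lifecycle (using the methods
## above): clear the flags, hash a whole tree -- hashfile() marks each entry
## it finds or creates -- and then prune whatever wasn't revisited, e.g.
##
##      db = HashCache('cache.db', 'sha256')
##      db.reset()
##      ## ... hash files via db.hashfile(...) ...
##      db.prune()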
###--------------------------------------------------------------------------

class GenericFormatter (object):
  def __init__(me, fi):
    me.fi = fi
  def _fmt_time(me, t):
    tm = T.gmtime(t)
    return T.strftime('%Y-%m-%dT%H:%M:%SZ', tm)
  def _enc_name(me, n):
    return ' \\-> '.join(escapify(n).split(' -> '))
  def name(me):
    return me._enc_name(me.fi.name)
  def info(me):
    return me.TYPE
  def mode(me):
    return '%06o' % me.fi.st.st_mode
  def size(me):
    return me.fi.st.st_size
  def mtime(me):
    return me._fmt_time(me.fi.st.st_mtime)
  def owner(me):
    return '%5d:%d' % (me.fi.st.st_uid, me.fi.st.st_gid)

class ErrorFormatter (GenericFormatter):
  def info(me):
    return 'E%d %s' % (me.fi.err.errno, me.fi.err.strerror)
  def error(me): return 'error'
  mode = size = mtime = owner = error

class SocketFormatter (GenericFormatter):
  TYPE = 'socket'
class PipeFormatter (GenericFormatter):
  TYPE = 'named-pipe'

class LinkFormatter (GenericFormatter):
  TYPE = 'symbolic-link'
  def name(me):
    n = GenericFormatter.name(me)
    try:
      d = OS.readlink(me.fi.name)
      return '%s -> %s' % (n, me._enc_name(d))
    except OSError:
      err = excval()
      return '%s -> <E%d %s>' % (n, err.errno, err.strerror)

class DirectoryFormatter (GenericFormatter):
  TYPE = 'directory'
  def name(me): return GenericFormatter.name(me) + '/'
  def size(me): return 'dir'

class DeviceFormatter (GenericFormatter):
  def info(me):
    return '%s %d:%d' % (me.TYPE,
                         OS.major(me.fi.st.st_rdev),
                         OS.minor(me.fi.st.st_rdev))
class BlockDeviceFormatter (DeviceFormatter):
  TYPE = 'block-device'
class CharDeviceFormatter (DeviceFormatter):
  TYPE = 'character-device'

class FileFormatter (GenericFormatter):
  TYPE = 'regular-file'
class Reporter (object):

  TYMAP = {
    ST.S_IFSOCK: SocketFormatter,
    ST.S_IFDIR: DirectoryFormatter,
    ST.S_IFLNK: LinkFormatter,
    ST.S_IFREG: FileFormatter,
    ST.S_IFBLK: BlockDeviceFormatter,
    ST.S_IFCHR: CharDeviceFormatter,
    ST.S_IFIFO: PipeFormatter,
  }

  def __init__(me, db):
    me._db = db
    me._inomap = {}
    me._vinomap = {}
    me._hsz = int(H.new(db.hash).digest_size)

  def file(me, fi):
    h = me._db.hashfile(fi)
    if fi.err:
      fmt = ErrorFormatter(fi)
      vino = 'error'
    else:
      fmt = me.TYMAP[ST.S_IFMT(fi.st.st_mode)](fi)
      inoidx = fi.st.st_dev, fi.st.st_ino
      try:
        vino = me._inomap[inoidx]
      except KeyError:
        suffix = ''
        seq = 0
        while True:
          vino = '%08x' % (Z.crc32(bin(fi.name + suffix)) & 0xffffffff)
          if vino not in me._vinomap: break
          suffix = '\0%d' % seq
          seq += 1
        me._inomap[inoidx] = vino
        if OPTS.compat >= 2: me._vinomap[vino] = inoidx
    if h: info = h
    else: info = '[%-*s]' % (2*me._hsz - 2, fmt.info())
    print('%s %8s %6s %-12s %-20s %20s %s' %
          (info, vino, fmt.mode(), fmt.owner(),
           fmt.mtime(), fmt.size(), fmt.name()))
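
    ## A report line therefore looks roughly like
    ##
    ##      <hash-or-[type]> <vino> <mode> <uid:gid> <mtime> <size> <name>
    ##
    ## i.e. the file hash (or, for things which can't be hashed, a bracketed
    ## tag such as `[symbolic-link]'), an eight-hex-digit virtual inode
    ## number, the mode in octal, the owner, the mtime in UTC, the size, and
    ## the escapified name.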
    ## Report the ACLs.
    for which, acl, err in \
        [("posix-access", fi.acl_acc, fi.aclerr_acc),
         ("posix-default", fi.acl_dflt, fi.aclerr_dflt)]:
      if acl is not None:
        print("\tacl %s %s" % (which, acl))
      elif err is not None:
        print("\tacl %s <E%d %s>" % (which, err.errno, err.strerror))

    ## Report the extended attributes.
    if fi.xa_err is not None:
      print("\txattr <E%d %s>" % (fi.xa_err.errno, fi.xa_err.strerror))
    else:
      for name in sorted(iterkeys(fi.xa)):
        attr = fi.xa[name]
        if attr.err is None:
          print("\txattr %s %s" %
                (escapify(name), me._db.hashblob(attr.value)))
        else:
          print("\txattr %s <E%d %s>" %
                (escapify(name), attr.err.errno, attr.err.strerror))
###--------------------------------------------------------------------------
### Database clearing from diff files.

R_HUNK = RX.compile(r'^@@ -\d+,(\d+) \+\d+,(\d+) @@$')
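
## For example, the hunk header `@@ -105,7 +105,9 @@' carries the old and
## new line counts 7 and 9, which is all we need in order to tell when a
## hunk ends.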
def clear_entry(db, lno, line):

  good = True

  ## Parse the type field, if there is one.
  if line.startswith('['):
    pos = line.find(']')
    if pos < 0:
      moan("failed to parse file entry (type field; line %d)" % lno)
      return False
    ty = line[1:pos].strip()
    rest = line[pos + 1:]
  else:
    ff = line.split(None, 1)
    if len(ff) != 2:
      moan("failed to parse file entry (field split; line %d)" % lno)
      return False
    ty = 'regular-file'
    rest = ff[1]

  ## Split out the remaining fields.
  ff = rest.split(None, 5)
  if len(ff) != 6:
    moan("failed to parse file entry (field split; line %d)" % lno)
    return False
  ino, mode, uidgid, mtime, sz, name = ff

  ## Symbolic links have the target appended to the name.
  if ty != 'symbolic-link':
    target = None
  else:
    nn = name.split(' -> ', 1)
    if len(nn) != 2:
      moan("failed to parse file entry (name split; line %d)" % lno)
      return False
    name, target = nn
    target = unescapify(target)
  name = unescapify(name)

  ## Drop the cache entry for this file, if there is one.
  try:
    st = OS.lstat(name)
  except OSError:
    e = excval()
    moan("failed to stat `%s': %s" % (name, e.strerror))
    if e.errno != E.ENOENT: good = False
  else:
    print("Clear cache entry for `%s'" % name)
    db.forget(st.st_ino)

  return good
def clear_cache(db):

  ## Work through the input diff file one line at a time.
  diffstate = 'gap'
  lno = 0
  good = True
  for line in stdin:
    lno += 1
    if line.endswith('\n'): line = line[:-1]

    ## We're in a gap between hunks.  Find a hunk header and extract the line
    ## counts.
    if diffstate == 'gap':
      m = R_HUNK.match(line)
      if m is not None:
        oldlines = int(m.group(1))
        newlines = int(m.group(2))
        diffstate = 'hunk'
        hdrlno = lno

    ## We're in a hunk.  Keep track of whether we've reached the end, and
    ## discard entries from the cache for mismatching lines.
    elif diffstate == 'hunk':
      if len(line) == 0:
        moan("empty line in diff hunk (line %d)" % lno)
        good = False
      else:
        ty = line[0]
        if ty == ' ':
          oldlines -= 1; newlines -= 1
        elif ty == '-':
          oldlines -= 1
          if not clear_entry(db, lno, line[1:]): good = False
        elif ty == '+':
          newlines -= 1
          if not clear_entry(db, lno, line[1:]): good = False
        else:
          moan("incomprehensible line in diff hunk (line %d)" % lno)
          good = False
        if oldlines < 0 or newlines < 0:
          moan("inconsistent lengths in diff hunk header (line %d)" % hdrlno)
          good = False
        if oldlines == newlines == 0:
          diffstate = 'gap'

  ## Check that we finished cleanly.
  if diffstate == 'hunk':
    moan("truncated diff hunk (started at line %d)" % hdrlno)
    good = False
  return good
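
## An illustrative way to drive this: take a unified diff between the report
## you expected and the one you observed, and feed it back in so that the
## entries for any mismatched files are dropped from the cache and rehashed
## on the next run, e.g.
##
##      diff -u expected.fshash observed.fshash | fshash -u -c cache.db /backup/mirror
##
## (the optional directory argument is chdir'd into first, so relative names
## in the report resolve correctly).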
###--------------------------------------------------------------------------

FMTMAP = {
  'rsync': lambda f: enum_rsync(stdin, f),
  'find0': lambda f: enum_find0(stdin, f)
}

op = OP.OptionParser(
  usage = '%prog [-au] [-c CACHE] [-f FORMAT] [-H HASH] [FILE ...]',
  version = '%%prog, version %s' % VERSION,
  description = '''\
Print a digest of a filesystem (or a collection of specified files) to
standard output.  The idea is that the digest should be mostly /complete/
(i.e., any `interesting\' change to the filesystem results in a different
digest) and /canonical/ (i.e., identical filesystem contents result in
the same digest).''')

for short, long, props in [
  ('-a', '--all', { 'action': 'store_true', 'dest': 'all',
                    'help': 'clear cache of all files not seen' }),
  ('-c', '--cache', { 'dest': 'cache', 'metavar': 'FILE',
                      'help': 'use FILE as a cache for file hashes' }),
  ('-f', '--files', { 'dest': 'files', 'metavar': 'FORMAT',
                      'type': 'choice', 'choices': list(FMTMAP.keys()),
                      'help': 'read files to report in the given FORMAT' }),
  ('-u', '--udiff', { 'action': 'store_true', 'dest': 'udiff',
                      'help': 'read diff from stdin, clear cache entries' }),
  ('-C', '--compat', { 'dest': 'compat', 'metavar': 'VERSION',
                       'type': 'int', 'default': 3,
                       'help': 'produce output with given compatibility VERSION' }),
  ('-H', '--hash', { 'dest': 'hash', 'metavar': 'HASH',
                     ##'type': 'choice', 'choices': H.algorithms,
                     'help': 'use HASH as the hash function' })]:
  op.add_option(short, long, **props)
OPTS, args = op.parse_args(argv)

if not 1 <= OPTS.compat <= 3:
  die("unknown compatibility version %d" % OPTS.compat)
if OPTS.udiff:

  ## Clear cache entries named in a diff read from stdin.
  if OPTS.cache is None or OPTS.all or OPTS.files or len(args) > 2:
    die("incompatible options: `-u' requires `-c CACHE', forbids others")
  db = HashCache(OPTS.cache, OPTS.hash)
  if len(args) == 2: OS.chdir(args[1])
  good = True
  if not clear_cache(db): good = False
  if not good: exit(1)

else:

  ## Produce a report of the filesystem.
  if not OPTS.files and len(args) <= 1:
    die("no filename sources: nothing to do")
  db = HashCache(OPTS.cache, OPTS.hash)
  if OPTS.all: db.reset()
  print("## fshash report format version %d" % OPTS.compat)
  rep = Reporter(db)
  if OPTS.files:
    FMTMAP[OPTS.files](rep.file)
  for dir in args[1:]:
    enum_walk(dir, rep.file)
  if OPTS.all: db.prune()
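
## An illustrative invocation: walk a directory tree, hashing into a cache,
## e.g.
##
##      fshash -c cache.db -H sha256 /backup/mirror/ >mirror.fshash
##
## (`-H' matters when the cache is first created; later runs can omit it,
## since the hash function's name is recorded in the cache's meta table.
## Adding `-a' additionally drops cache entries for files not seen during
## the run.)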
###----- That's all, folks --------------------------------------------------