#! @PYTHON@ ### ### Efficiently construct canonical digests of filesystems ### ### (c) 2012 Mark Wooding ### ###----- Licensing notice --------------------------------------------------- ### ### This file is part of the `rsync-backup' program. ### ### rsync-backup is free software; you can redistribute it and/or modify ### it under the terms of the GNU General Public License as published by ### the Free Software Foundation; either version 2 of the License, or ### (at your option) any later version. ### ### rsync-backup is distributed in the hope that it will be useful, ### but WITHOUT ANY WARRANTY; without even the implied warranty of ### MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ### GNU General Public License for more details. ### ### You should have received a copy of the GNU General Public License ### along with rsync-backup; if not, write to the Free Software Foundation, ### Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. from sys import argv, exc_info, exit, stdin, stdout, stderr import binascii as B import errno as E import grp as GR import hashlib as H import optparse as OP import os as OS import pwd as PW import re as RX import sqlite3 as DB import stat as ST import time as T import zlib as Z PACKAGE = '@PACKAGE@' VERSION = '@VERSION@' ###-------------------------------------------------------------------------- ### Utilities. 
import sys as _SYS
_PYVER = _SYS.version_info

## Python 2/3 compatibility shims.  `bin' converts text to the byte string
## we hash and escape; `text' undoes it.  On Python 3 we round-trip
## un-decodable filename bytes with `surrogateescape' where available.
if _PYVER >= (3,):
  _FSENC = _SYS.getfilesystemencoding()
  if _PYVER >= (3, 1): _FSENCERR = "surrogateescape"
  else: _FSENCERR = "strict"
  from io import BytesIO, StringIO
  def bin(x): return x.encode(_FSENC, _FSENCERR)
  def text(x): return x.decode(_FSENC, _FSENCERR)
  def bytechr(x): return bytes([x])
  def byteord(x): return x
  def iterkeys(x): return x.keys()
else:
  from cStringIO import StringIO; BytesIO = StringIO
  def bin(x): return x
  def text(x): return x
  def bytechr(x): return chr(x)
  def byteord(x): return ord(x)
  def iterkeys(x): return x.iterkeys()

def excval():
  """Return the value of the exception currently being handled."""
  return exc_info()[1]

QUIS = OS.path.basename(argv[0])

def moan(msg):
  """Print MSG to stderr as a warning, prefixed by the program name."""
  stderr.write('%s: %s\n' % (QUIS, msg))

def die(msg, rc = 1):
  """Print MSG to stderr and exit with status RC."""
  moan(msg)
  exit(rc)

SYSERR = 0
def syserr(msg):
  """Report a system error MSG and count it in SYSERR."""
  global SYSERR
  moan(msg)
  SYSERR += 1

def escapify(x):
  """
  Return X with awkward characters escaped.

  Tab, newline, carriage return, apostrophe and backslash get C-style
  escapes; other bytes outside printable ASCII become `\\xXX'.  The result
  is reversible through `unescapify'.
  """
  out = StringIO()
  for ch in bin(x):
    k = byteord(ch)
    if k == 9: out.write("\\t")
    elif k == 10: out.write("\\n")
    elif k == 13: out.write("\\r")
    elif k == 39: out.write("\\'")
    elif k == 92: out.write("\\\\")
    elif 0x20 <= k <= 0x7e: out.write(chr(k))
    else: out.write("\\x%02x" % k)
  return out.getvalue()

R_STRESC = RX.compile(r"\\ (?: x ([0-9A-Fa-f]{2}) | (.))",
                      RX.VERBOSE)

def unescapify(x):
  """Reverse the effect of `escapify': expand `\\xXX' and letter escapes."""
  out = BytesIO()
  i, n = 0, len(x)
  while True:
    m = R_STRESC.search(x, i)
    if m is not None: j = m.start(0)
    else: j = n
    out.write(bin(x[i:j]))
    if m is None: break
    k, e = m.group(1), m.group(2)
    if k is not None: ch = int(k, 16)
    elif e == "a": ch = 7
    elif e == "b": ch = 8
    elif e == "f": ch = 12
    elif e == "n": ch = 10
    elif e == "r": ch = 13
    elif e == "t": ch = 9
    elif e == "v": ch = 11
    else: ch = byteord(bin(e)[0])
    out.write(bytechr(ch))
    i = m.end(0)
  return text(out.getvalue())

def simple_memo(func):
  """Memoize FUNC, which must take hashable positional arguments."""
  memo = dict()
  def _(*args):
    try: r = memo[args]
    except KeyError: r = memo[args] = func(*args)
    return r
  return _

@simple_memo
def name_uid(name):
  """Return the uid for user NAME."""
  pw = PW.getpwnam(name)
  return pw[2]

@simple_memo
def name_gid(name):
  """Return the gid for group NAME."""
  gr = GR.getgrnam(name)
  return gr[2]
###--------------------------------------------------------------------------
### Extended attributes.

## Fallback: pretend there are no extended attributes.
def listxattr(f, follow_symlinks = True): return []

if _PYVER >= (3, 3):
  ## Modern Python has built-in support (where the OS provides it).
  if hasattr(OS, "listxattr"):
    getxattr, listxattr = OS.getxattr, OS.listxattr
else:
  ## Otherwise try the third-party `xattr' module, which exists in two
  ## incompatible flavours distinguished by which functions it exports.
  try: import xattr as _XA
  except ImportError: pass
  else:
    if hasattr(_XA, "list"):
      def listxattr(f, follow_symlinks = True):
        return _XA.list(f, nofollow = not follow_symlinks)
      def getxattr(f, a, follow_symlinks = True):
        return _XA.get(f, a, nofollow = not follow_symlinks)
    else:
      def listxattr(f, follow_symlinks = True):
        return _XA.listxattr(f, nofollow = not follow_symlinks)
      def getxattr(f, a, follow_symlinks = True):
        return _XA.getxattr(f, a, nofollow = not follow_symlinks)

###--------------------------------------------------------------------------
### Access control lists.

HAVE_ACL_P = False

## Which ACL to fetch from a file.
ACL_ACC = 1
ACL_DFLT = 2

## Fallback: pretend there are no ACLs.
def getacl(f, which): return None

try: import posix1e as ACL
except ImportError: pass
else:

  ## Match a line from the standard ACL text format.
  R_ACLENT = RX.compile(r"""^ \s*
        (?: (u | user | g | group | m | mask | o | other) \s* : \s*
            (| [^:\s] | [^:\s] [^:]* [^:\s]) \s* : \s*
            ([-rwx]*) \s*)?
        (?: \# .*)? $""", RX.VERBOSE)

  ## Codes for the possible entry tag types.  These are ordered so that we
  ## can sort.
  AT_OWNUID = 1
  AT_USER = 2
  AT_MASK = 3
  AT_OWNGID = 4
  AT_GROUP = 5
  AT_OTHER = 6

  ## Output tags corresponding to the codes.
  ACL_TAGMAP = [None, "u", "u", "m", "g", "g", "o"]

  HAVE_ACL_P = True

  def getacl(f, which):
    """
    Return a canonical text form of the ACL of file F, or None.

    WHICH is `ACL_ACC' or `ACL_DFLT'.  A trivial ACL (one entirely
    reflected in the permission bits, or an empty default ACL) is reported
    as None.  Raises `ValueError' on unparseable ACL text.
    """

    ## Fetch the file ACL.
    if which == ACL_ACC: acl = ACL.ACL(file = f)
    elif which == ACL_DFLT: acl = ACL.ACL(filedef = f)
    else: raise ValueError("unexpected WHICH = %d" % which)

    ## For maximum portability, only use the text format, which is
    ## guaranteed to be supported if anything is.  We'll have to parse this
    ## ourselves.  Honestly, an important part of what we're doing here is
    ## producing a /canonical/ presentation of the ACL, which doesn't seem
    ## to be something that even the less portable functions will do for
    ## us.
    s = str(acl)
    extp = False
    entries = []

    ## First pass: grind through the ACL entries and build a list of (TAG,
    ## QUAL, MODE) triples, where the TAG is an `AT_...' code, the QUAL is
    ## either `None' or a numeric ID, and the MODE is a bitmask of
    ## permissions.
    for line in s.split("\n"):
      m = R_ACLENT.match(line)
      if m is None: raise ValueError("unexpected ACL line `%s'" % line)
      if not m.group(1): continue
      tag, qual, perm = m.group(1), m.group(2), m.group(3)
      if qual == "": qual = None

      ## Convert the tag and qualifier.
      if tag == "u" or tag == "user":
        if qual is None: pass
        elif qual.isdigit(): qual = int(qual, 10)
        else: qual = name_uid(qual)
        if qual is None: tag = AT_OWNUID
        else: tag = AT_USER; extp = True
      elif tag == "m" or tag == "mask":
        if qual is not None:
          raise ValueError("unexpected mask qualifier `%s'" % qual)
        tag = AT_MASK; extp = True
      elif tag == "g" or tag == "group":
        if qual is None: pass
        elif qual.isdigit(): qual = int(qual, 10)
        else: qual = name_gid(qual)
        if qual is None: tag = AT_OWNGID
        else: tag = AT_GROUP; extp = True
      elif tag == "o" or tag == "other":
        if qual is not None:
          raise ValueError("unexpected other qualifier `%s'" % qual)
        tag = AT_OTHER
      else:
        raise ValueError("unexpected tag type `%s'" % tag)

      ## Convert the permissions.
      mode = 0
      for ch in perm:
        if ch == "r": mode |= 4
        elif ch == "w": mode |= 2
        elif ch == "x": mode |= 1
        elif ch == "-": pass
        else: raise ValueError("unexpected permission character `%s'" % ch)

      ## Done.
      entries.append((tag, qual, mode))

    ## If the ACL is trivial then ignore it.  An access ACL is trivial if
    ## it contains only entries which are reflected in the traditional
    ## permission bits.  A default ACL is trivial if it's empty.
    if (which == ACL_ACC and not extp) or \
       (which == ACL_DFLT and not entries):
      return None

    ## Sort the entries.  The tag codes are arranged so that this is a
    ## useful ordering.
    entries.sort()

    ## Produce output.  This happens to be the standard short text format,
    ## with exclusively numeric IDs.
    out = StringIO()
    firstp = True
    for tag, qual, mode in entries:
      if firstp: firstp = False
      else: out.write(",")
      out.write(ACL_TAGMAP[tag])
      out.write(":")
      if qual is not None: out.write(str(qual))
      out.write(":")
      if mode&4: out.write("r")
      else: out.write("-")
      if mode&2: out.write("w")
      else: out.write("-")
      if mode&1: out.write("x")
      else: out.write("-")
    return out.getvalue()

###--------------------------------------------------------------------------
### File system enumeration.

class FileAttr (object):
  """A single extended attribute of a file: VALUE, or the ERR fetching it."""
  def __init__(me, file, attr):
    try: value = getxattr(file, attr, follow_symlinks = False)
    except (OSError, IOError): me.value, me.err = None, excval()
    else: me.value, me.err = value, None

class FileInfo (object):
  """
  Collected metadata about a single file.

  Attributes: `name'; `st' and `err' (lstat result, or None and the error);
  `xa' (dict of name -> FileAttr) and `xa_err'; `acl_acc'/`aclerr_acc' and
  `acl_dflt'/`aclerr_dflt' (canonical ACL text or fetch error).
  """
  def __init__(me, file, st = None):
    me.name = file
    if st:
      me.st = st
      me.err = None
    else:
      try:
        me.st = OS.lstat(file)
        me.err = None
      except OSError:
        me.st = None
        me.err = excval()
    me.xa, me.xa_err = dict(), None
    me.acl_acc = me.aclerr_acc = None
    me.acl_dflt = me.aclerr_dflt = None
    if me.st is not None:

      def collect_acl(which):
        ## Fetch one of the file's ACLs, treating `not supported' as
        ## simply having no ACL.
        try: return getacl(file, which), None
        except (OSError, IOError):
          err = excval()
          if err.errno == E.ENOTSUP: return None, None
          else: return None, err

      if not ST.S_ISLNK(me.st.st_mode):
        me.acl_acc, me.aclerr_acc = collect_acl(ACL_ACC)
      if ST.S_ISDIR(me.st.st_mode):
        me.acl_dflt, me.aclerr_dflt = collect_acl(ACL_DFLT)

      try: names = listxattr(file, follow_symlinks = False)
      except (OSError, IOError): me.xa_err = excval()
      else:
        for name in names:
          ## ACLs are reported separately, not as raw xattrs.
          if HAVE_ACL_P and (name == "system.posix_acl_access" or
                             name == "system.posix_acl_default"):
            continue
          me.xa[name] = FileAttr(file, name)

def enum_walk(file, func):
  """
  Walk the filesystem under FILE, calling FUNC(FILEINFO) for each file.

  Files are reported before subdirectories; both groups in name order.  The
  walk does not cross device boundaries.  If FILE ends in `/' we chdir into
  it (restoring the old cwd afterwards) so that reported names are
  relative.
  """

  def dirents(name):
    try: return OS.listdir(name)
    except OSError:
      syserr("failed to read directory `%s': %s" %
             (name, excval().strerror))
      return []

  def dir(ee, dev):
    ff = []
    dd = []
    for e in ee:
      fi = FileInfo(e)
      ## Skip entries on other devices entirely; otherwise split into
      ## directories and non-directories.
      if fi.st and fi.st.st_dev != dev: pass
      elif fi.st and ST.S_ISDIR(fi.st.st_mode): dd.append(fi)
      else: ff.append(fi)
    ff.sort(key = lambda fi: fi.name)
    dd.sort(key = lambda fi: fi.name + '/')
    for f in ff: func(f)
    for d in dd:
      if d.st.st_dev == dev:
        func(d)
        dir([OS.path.join(d.name, e) for e in dirents(d.name)], dev)

  if file.endswith('/'):
    cwd = OS.open('.', OS.O_RDONLY)
    try:
      OS.chdir(file)
      fi = FileInfo('.')
      func(fi)
      dir(dirents('.'), fi.st.st_dev)
    finally:
      OS.fchdir(cwd)
      OS.close(cwd)
  else:
    fi = FileInfo(file)
    func(fi)
    if fi.st and ST.S_ISDIR(fi.st.st_mode):
      dir([OS.path.join(fi.name, e) for e in dirents(fi.name)],
          fi.st.st_dev)

def enum_find0(f, func):
  """Read NUL-terminated filenames from stream F; call FUNC(FILEINFO)."""
  tail = ""
  while True:
    buf = f.read(8192)
    last = len(buf) == 0
    names = (tail + buf).split('\0')
    tail = names.pop()
    for n in names: func(FileInfo(n))
    if last: break
  if len(tail):
    moan("ignored trailing junk after last filename")

R_RSYNCESC = RX.compile(r'\\ \# ([0-7]{3})', RX.VERBOSE)

def enum_rsync(f, func):
  """
  Read filenames in rsync listing format from stream F; call
  FUNC(FILEINFO) for each.

  The format is a little fiddly.  Each line consists of PERMS SIZE DATE
  TIME NAME, separated by runs of whitespace, but the NAME starts exactly
  one space character after the TIME and may begin with a space.
  Sequences of the form `\\#OOO', where OOO are three octal digits, stand
  for a byte with that value.  Newlines, and backslashes which would be
  ambiguous, are converted into this form; all other characters are
  literal.

  We ignore the stat information and retrieve it ourselves, because it's
  incomplete.  Hopefully the dcache is still warm.
  """
  for line in f:
    if line.endswith('\n'): line = line[:-1]

    ## Extract the escaped name.
    ff = line.split(None, 3)
    if len(ff) != 4:
      syserr("ignoring invalid line from rsync: `%s'" % line)
      continue
    tail = ff[3]
    try: spc = tail.index(' ')
    except ValueError:
      syserr("ignoring invalid line from rsync: `%s'" % line)
      continue
    name = tail[spc + 1:]

    ## Now translate escape sequences.
    name = R_RSYNCESC.sub(lambda m: chr(int(m.group(1), 8)), name)

    ## Call the client.
    try: fi = FileInfo(name)
    except OSError:
      syserr("failed to stat `%s': %s" % (name, excval().strerror))
      continue
    func(fi)

###--------------------------------------------------------------------------
### The hash cache.

class HashCache (object):
  """
  A cache of file hashes, keyed by inode number and kept in an SQLite
  database so that unchanged files need not be re-read on every run.
  """

  VERSION = 0
  BUFSZ = 128*1024

  INIT = [
    """CREATE TABLE meta (
               version INTEGER NOT NULL,
               hash TEXT NOT NULL
       );""",
    """CREATE TABLE hash (
               ino INTEGER PRIMARY KEY,
               mtime INTEGER NOT NULL,
               ctime INTEGER NOT NULL,
               size INTEGER NOT NULL,
               hash TEXT NOT NULL,
               seen BOOLEAN NOT NULL DEFAULT TRUE
       );""",
    """PRAGMA journal_mode = WAL;"""]

  def __init__(me, file, hash = None):
    """
    Open or create the cache database FILE, using hash function HASH.

    FILE may be None, in which case there is no cache and HASH must be
    given.  If HASH is None it is taken from the existing database; if
    both are set they must agree.
    """

    if file is None:

      ## We're going this alone, with no cache.
      db = None
      if hash is None:
        die("no hash specified and no database cache to read from")
    else:

      ## Connect to the database.
      db = DB.connect(file)
      db.text_factory = str

      ## See whether we can understand the cache database.
      c = db.cursor()
      v = h = None
      try:
        c.execute('SELECT version, hash FROM meta')
        v, h = c.fetchone()
        if c.fetchone() is not None:
          die("cache database corrupt: meta table has multiple rows")
      except (DB.Error, TypeError): pass

      ## If that didn't work, we'd better clear the thing and start again.
      ## But only if we know how to initialize it.
      if v != me.VERSION:

        ## Explain the situation.
        moan("cache version %s not understood" % v)
        if hash is None:
          if h is None:
            die("can't initialize cache: no hash function set")
          else:
            hash = h
        try: H.new(hash)
        except Exception: die("unknown hash function `%s'" % hash)

        ## Drop old things.
        c.execute('SELECT type, name FROM sqlite_master')
        for type, name in c.fetchall():
          c.execute('DROP %s IF EXISTS %s' % (type, name))

        ## Now we're ready to go.
        for stmt in me.INIT: c.execute(stmt)
        c.execute('INSERT INTO meta VALUES (?, ?)', [me.VERSION, hash])
        db.commit()

      ## Check the hash function if necessary.
      if hash is None: hash = h
      elif h is not None and h != hash:
        die("hash mismatch: cache uses %s but %s requested" % (h, hash))

    ## All done.
    me.hash = hash
    me._db = db
    me._pend = 0

  def hashblob(me, blob):
    """Return the hex digest of BLOB under our hash function."""
    h = H.new(me.hash)
    h.update(blob)
    return text(B.hexlify(h.digest()))

  def hashfile(me, fi):
    """
    Return the hex digest of the contents of the file described by FI,
    using and updating the cache; None if FI isn't a hashable regular
    file.  On read failure, clears FI.st and sets FI.err.
    """

    ## If this isn't a proper file then don't try to hash it.
    if fi.err or not ST.S_ISREG(fi.st.st_mode):
      return None

    ## See whether there's a valid entry in the cache.
    if me._db:
      c = me._db.cursor()
      c.execute('SELECT mtime, size, hash, seen FROM hash WHERE ino = ?;',
                [fi.st.st_ino])
      r = c.fetchone()
      if r is not None:
        mt, sz, h, s = r
        if mt == fi.st.st_mtime and \
           sz == fi.st.st_size:
          if not s:
            c.execute('UPDATE hash SET seen = 1 WHERE ino = ?',
                      [fi.st.st_ino])
            me._update()
          return h

    ## Hash the file.  Beware raciness: update the file information from
    ## the open descriptor, but set the size from what we actually read.
    h = H.new(me.hash)
    try:
      with open(fi.name, 'rb') as f:
        sz = 0
        while True:
          buf = f.read(me.BUFSZ)
          if len(buf) == 0: break
          sz += len(buf)
          h.update(buf)
        fi.st = OS.fstat(f.fileno())
        ##fi.st.st_size = sz
        hash = h.digest()
    except (OSError, IOError):
      fi.st = None
      fi.err = excval()
      return None
    hash = text(B.hexlify(hash))

    ## Insert a record into the database.
    if me._db:
      c.execute("""
              INSERT OR REPLACE INTO hash
                      (ino, mtime, ctime, size, hash, seen)
              VALUES
                      (?, ?, ?, ?, ?, 1);
      """, [fi.st.st_ino,
            fi.st.st_mtime,
            fi.st.st_ctime,
            fi.st.st_size,
            hash])
      me._update()

    ## Done.
    return hash

  def _update(me):
    ## Batch up commits so we don't sync the database on every file.
    me._pend += 1
    if me._pend >= 1024: me.flush()

  def flush(me):
    """Commit any outstanding changes to the database."""
    if me._db: me._db.commit()
    me._pend = 0

  def need_db(me):
    """Insist that we have a cache database open."""
    if not me._db: die("no cache database")

  def forget(me, ino):
    """Drop the cache entry for inode number INO."""
    me.need_db()
    c = me._db.cursor()
    c.execute('DELETE FROM hash WHERE ino = ?', [ino])

  def reset(me):
    """Mark all cache entries as not seen during this run."""
    me.need_db()
    c = me._db.cursor()
    c.execute('UPDATE hash SET seen = 0 WHERE seen')
    me.flush()

  def prune(me):
    """Delete all cache entries not seen since the last `reset'."""
    me.need_db()
    c = me._db.cursor()
    c.execute('DELETE FROM hash WHERE NOT seen')
    me.flush()

###--------------------------------------------------------------------------
### Printing output.

class GenericFormatter (object):
  """Format the fields of a FileInfo for the report; base class."""
  def __init__(me, fi):
    me.fi = fi
  def _fmt_time(me, t):
    ## Render T as UTC so output is location-independent.
    tm = T.gmtime(t)
    return T.strftime('%Y-%m-%dT%H:%M:%SZ', tm)
  def _enc_name(me, n):
    ## Escape N, additionally protecting ` -> ' which separates a symlink
    ## from its target.
    return ' \\-> '.join(escapify(n).split(' -> '))
  def name(me): return me._enc_name(me.fi.name)
  def info(me): return me.TYPE
  def mode(me): return '%06o' % me.fi.st.st_mode
  def size(me): return me.fi.st.st_size
  def mtime(me): return me._fmt_time(me.fi.st.st_mtime)
  def owner(me): return '%5d:%d' % (me.fi.st.st_uid, me.fi.st.st_gid)

class ErrorFormatter (GenericFormatter):
  def info(me):
    return 'E%d %s' % (me.fi.err.errno, me.fi.err.strerror)
  def error(me): return 'error'
  mode = size = mtime = owner = error

class SocketFormatter (GenericFormatter):
  TYPE = 'socket'

class PipeFormatter (GenericFormatter):
  TYPE = 'fifo'

class LinkFormatter (GenericFormatter):
  TYPE = 'symbolic-link'
  def name(me):
    n = GenericFormatter.name(me)
    try:
      d = OS.readlink(me.fi.name)
      return '%s -> %s' % (n, me._enc_name(d))
    except OSError:
      err = excval()
      return '%s -> <E%d %s>' % (n, err.errno, err.strerror)

class DirectoryFormatter (GenericFormatter):
  TYPE = 'directory'
  def name(me): return GenericFormatter.name(me) + '/'
  def size(me): return 'dir'

class DeviceFormatter (GenericFormatter):
  def info(me):
    return '%s %d:%d' % (me.TYPE,
                         OS.major(me.fi.st.st_rdev),
                         OS.minor(me.fi.st.st_rdev))

class BlockDeviceFormatter (DeviceFormatter):
  TYPE = 'block-device'

class CharDeviceFormatter (DeviceFormatter):
  TYPE = 'character-device'

class FileFormatter (GenericFormatter):
  TYPE = 'regular-file'

class Reporter (object):
  """Print the report: one line per file, plus ACL and xattr sub-lines."""

  TYMAP = {
    ST.S_IFSOCK: SocketFormatter,
    ST.S_IFDIR: DirectoryFormatter,
    ST.S_IFLNK: LinkFormatter,
    ST.S_IFREG: FileFormatter,
    ST.S_IFBLK: BlockDeviceFormatter,
    ST.S_IFCHR: CharDeviceFormatter,
    ST.S_IFIFO: PipeFormatter }

  def __init__(me, db):
    me._inomap = {}
    me._vinomap = {}
    me._db = db
    me._hsz = int(H.new(db.hash).digest_size)

  def file(me, fi):
    h = me._db.hashfile(fi)
    if fi.err:
      fmt = ErrorFormatter(fi)
      vino = 'error'
    else:
      fmt = me.TYMAP[ST.S_IFMT(fi.st.st_mode)](fi)
      inoidx = fi.st.st_dev, fi.st.st_ino
      try:
        vino = me._inomap[inoidx]
      except KeyError:
        ## Synthesize a `virtual inode number' from the name's CRC,
        ## perturbing it until it's distinct from those seen already.
        suffix = ''
        seq = 0
        while True:
          vino = '%08x' % (Z.crc32(bin(fi.name + suffix)) & 0xffffffff)
          if vino not in me._vinomap: break
          suffix = '\0%d' % seq
          seq += 1
        me._inomap[inoidx] = vino
        if OPTS.compat >= 2: me._vinomap[vino] = inoidx
    if h: info = h
    else: info = '[%-*s]' % (2*me._hsz - 2, fmt.info())
    print('%s %8s %6s %-12s %-20s %20s %s' %
          (info, vino, fmt.mode(), fmt.owner(),
           fmt.mtime(), fmt.size(), fmt.name()))
    if OPTS.compat >= 3:
      for which, acl, err in \
          [("posix-access", fi.acl_acc, fi.aclerr_acc),
           ("posix-default", fi.acl_dflt, fi.aclerr_dflt)]:
        if acl is not None:
          print("\tacl %s %s" % (which, acl))
        elif err is not None:
          print("\tacl %s <E%d %s>" % (which, err.errno, err.strerror))
      if fi.xa_err is not None:
        print("\txattr <E%d %s>" % (fi.xa_err.errno, fi.xa_err.strerror))
      else:
        for name in sorted(iterkeys(fi.xa)):
          attr = fi.xa[name]
          if attr.err is None:
            print("\txattr %s %s" % (escapify(name),
                                     me._db.hashblob(attr.value)))
          else:
            print("\txattr %s <E%d %s>" % (escapify(name),
                                           attr.err.errno,
                                           attr.err.strerror))
###--------------------------------------------------------------------------
### Database clearing from diff files.

R_HUNK = RX.compile(r'^@@ -\d+,(\d+) \+\d+,(\d+) @@$')

def clear_entry(db, lno, line):
  """
  Parse report LINE (at line number LNO) and drop its inode from cache DB.

  Returns False if the line couldn't be parsed or the file couldn't be
  stat'ed (a vanished file is not an error), True otherwise.
  """

  good = True

  ## Split the entry into hash/type and the remaining fields.
  if line.startswith('['):
    pos = line.find(']')
    if pos < 0:
      moan("failed to parse file entry (type field; line %d)" % lno)
      return False
    ty = line[1:pos].strip()
    rest = line[pos + 1:]
    hash = None
  else:
    ff = line.split(None, 1)
    if len(ff) != 2:
      moan("failed to parse file entry (field split; line %d)" % lno)
      return False
    ty = 'regular-file'
    hash, rest = ff
  ff = rest.split(None, 5)
  if len(ff) != 6:
    moan("failed to parse file entry (field split; line %d)" % lno)
    return False
  ino, mode, uidgid, mtime, sz, name = ff

  ## Symbolic links carry a ` -> TARGET' suffix on the name.
  if ty != 'symbolic-link':
    target = None
  else:
    nn = name.split(' -> ', 1)
    if len(nn) != 2:
      moan("failed to parse file entry (name split; line %d)" % lno)
      return False
    name, target = nn
    target = unescapify(target)
  name = unescapify(name)

  ## Drop the entry for the file's current inode, if it still exists.
  try:
    st = OS.lstat(name)
  except OSError:
    e = excval()
    moan("failed to stat `%s': %s" % (name, e.strerror))
    if e.errno != E.ENOENT: good = False
  else:
    print("Clear cache entry for `%s'" % name)
    db.forget(st.st_ino)

  return good

def clear_cache(db):
  """
  Read a unified diff of fshash reports from stdin and clear the cache
  entries in DB for every file mentioned in a changed line.  Return False
  if anything went wrong.
  """

  ## Work through the input diff file one line at a time.
  diffstate = 'gap'
  lno = 0
  good = True
  for line in stdin:
    if line.endswith('\n'): line = line[:-1]
    lno += 1

    ## We're in a gap between hunks.  Find a hunk header and extract the
    ## line counts.
    if diffstate == 'gap':
      m = R_HUNK.match(line)
      if m:
        oldlines = int(m.group(1))
        newlines = int(m.group(2))
        diffstate = 'hunk'
        hdrlno = lno

    ## We're in a hunk.  Keep track of whether we've reached the end, and
    ## discard entries from the cache for mismatching lines.
    elif diffstate == 'hunk':
      if len(line) == 0:
        moan("empty line in diff hunk (line %d)" % lno)
        good = False
        continue
      ty = line[0]
      if ty == ' ':
        oldlines -= 1; newlines -= 1
      elif ty == '+':
        newlines -= 1
        if not clear_entry(db, lno, line[1:]): good = False
      elif ty == '-':
        oldlines -= 1
        if not clear_entry(db, lno, line[1:]): good = False
      else:
        moan("incomprehensible line in diff hunk (line %d)" % lno)
        good = False
      if oldlines < 0 or newlines < 0:
        moan("inconsistent lengths in diff hunk header (line %d)" % hdrlno)
        good = False
      if oldlines == newlines == 0: diffstate = 'gap'

  ## Make sure we reached a sensible end state.
  if diffstate == 'hunk':
    moan("truncated diff hunk (started at line %d)" % hdrlno)
    good = False

  return good

###--------------------------------------------------------------------------
### Main program.

FMTMAP = {
  'rsync': lambda f: enum_rsync(stdin, f),
  'find0': lambda f: enum_find0(stdin, f) }

op = OP.OptionParser(
  usage = '%prog [-au] [-c CACHE] [-f FORMAT] [-H HASH] [FILE ...]',
  version = '%%prog, version %s' % VERSION,
  description = '''\
Print a digest of a filesystem (or a collection of specified files) to
standard output.  The idea is that the digest should be mostly /complete/
(i.e., any `interesting\' change to the filesystem results in a different
digest) and /canonical/ (i.e., identical filesystem contents result in
identical output).
''')
for shortopt, longopt, props in [
  ('-a', '--all', { 'action': 'store_true', 'dest': 'all',
                    'help': 'clear cache of all files not seen' }),
  ('-c', '--cache', { 'dest': 'cache', 'metavar': 'FILE',
                      'help': 'use FILE as a cache for file hashes' }),
  ('-f', '--files', { 'dest': 'files', 'metavar': 'FORMAT',
                      'type': 'choice', 'choices': list(FMTMAP.keys()),
                      'help': 'read files to report in the given FORMAT' }),
  ('-u', '--udiff', { 'action': 'store_true', 'dest': 'udiff',
                      'help': 'read diff from stdin, clear cache entries' }),
  ('-C', '--compat', { 'dest': 'compat', 'metavar': 'VERSION',
                       'type': 'int', 'default': 3,
                       'help':
                         'produce output with given compatibility VERSION' }),
  ('-H', '--hash', { 'dest': 'hash', 'metavar': 'HASH',
                     ##'type': 'choice', 'choices': H.algorithms,
                     'help': 'use HASH as the hash function' })]:
  op.add_option(shortopt, longopt, **props)
OPTS, args = op.parse_args(argv)

if not 1 <= OPTS.compat <= 3:
  die("unknown compatibility version %d" % OPTS.compat)
if OPTS.udiff:

  ## Clear cache entries listed in a diff read from stdin.
  if OPTS.cache is None or OPTS.all or OPTS.files or len(args) > 2:
    die("incompatible options: `-u' requires `-c CACHE', forbids others")
  db = HashCache(OPTS.cache, OPTS.hash)
  if len(args) == 2: OS.chdir(args[1])
  good = True
  if not clear_cache(db): good = False
  if good: db.flush()
  else: exit(2)
else:

  ## Produce a digest of the named filesystems and/or listed files.
  if not OPTS.files and len(args) <= 1:
    die("no filename sources: nothing to do")
  db = HashCache(OPTS.cache, OPTS.hash)
  if OPTS.all: db.reset()
  if OPTS.compat >= 2:
    print("## fshash report format version %d" % OPTS.compat)
  rep = Reporter(db)
  if OPTS.files: FMTMAP[OPTS.files](rep.file)
  for path in args[1:]: enum_walk(path, rep.file)
  if OPTS.all: db.prune()
  db.flush()

###----- That's all, folks --------------------------------------------------