#! @PYTHON@
###
### Efficiently construct canonical digests of filesystems
###
### (c) 2012 Mark Wooding
###

###----- Licensing notice ---------------------------------------------------
###
### This file is part of the `rsync-backup' program.
###
### rsync-backup is free software; you can redistribute it and/or modify
### it under the terms of the GNU General Public License as published by
### the Free Software Foundation; either version 2 of the License, or
### (at your option) any later version.
###
### rsync-backup is distributed in the hope that it will be useful,
### but WITHOUT ANY WARRANTY; without even the implied warranty of
### MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
### GNU General Public License for more details.
###
### You should have received a copy of the GNU General Public License
### along with rsync-backup; if not, write to the Free Software Foundation,
### Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.

from sys import argv, exit, stdin, stdout, stderr
import os as OS
import re as RX
import time as T
import stat as ST
import optparse as OP
import hashlib as H
import sqlite3 as DB
import zlib as Z

## Substituted by the build system at install time.
PACKAGE = '@PACKAGE@'
VERSION = '@VERSION@'

###--------------------------------------------------------------------------
### Utilities.

## Program name, used as a prefix on diagnostic messages.
QUIS = OS.path.basename(argv[0])

def moan(msg):
  """Write the diagnostic MSG to stderr, prefixed with the program name."""
  stderr.write('%s: %s\n' % (QUIS, msg))

def die(msg, rc = 1):
  """Report MSG as a fatal error and exit with status RC (default 1)."""
  moan(msg)
  exit(rc)

## Count of non-fatal system errors reported so far.
SYSERR = 0
def syserr(msg):
  """Report the non-fatal system error MSG and bump the error counter."""
  global SYSERR
  moan(msg)
  SYSERR += 1

###--------------------------------------------------------------------------
### File system enumeration.
class FileInfo (object):
  """
  Capture the name and stat information of a filesystem object.

  Attributes: `name' is the file name; `st' is the lstat result (or None on
  error); `err' is the OSError raised by lstat (or None on success).
  """
  def __init__(me, file, st = None):
    me.name = file
    if st:
      me.st = st
      me.err = None
    else:
      try:
        me.st = OS.lstat(file)
        me.err = None
      except OSError as err:
        me.st = None
        me.err = err

def enum_walk(file, func):
  """
  Walk the filesystem rooted at FILE, calling FUNC(FILEINFO) for each object.

  If FILE ends in `/' then chdir into it and enumerate relative to it;
  otherwise report FILE itself (and recurse if it's a directory).  The walk
  does not cross device boundaries.
  """

  def dirents(name):
    ## List NAME's entries, reporting (and tolerating) read failures.
    try:
      return OS.listdir(name)
    except OSError as err:
      syserr("failed to read directory `%s': %s" % (name, err.strerror))
      return []

  def dir(ee, dev):
    ## Process the directory entries EE, staying on device DEV.  Files are
    ## reported first, then subdirectories (each followed by its contents),
    ## both in sorted order so the output is canonical.
    ff = []
    dd = []
    for e in ee:
      fi = FileInfo(e)
      ## NOTE(review): this first test is a no-op as written -- presumably
      ## cross-device entries were meant to be filtered here; as it stands
      ## they're dropped later by the `st_dev' check below.  Confirm intent
      ## before changing.
      if fi.st and fi.st.st_dev != dev: pass
      if fi.st and ST.S_ISDIR(fi.st.st_mode): dd.append(fi)
      else: ff.append(fi)
    ## Sort directories with a trailing `/' so ordering matches the names as
    ## they will be printed.
    ff.sort(key = lambda fi: fi.name)
    dd.sort(key = lambda fi: fi.name + '/')
    for f in ff:
      func(f)
    for d in dd:
      if d.st.st_dev == dev:
        func(d)
        dir([OS.path.join(d.name, e) for e in dirents(d.name)], dev)

  if file.endswith('/'):
    ## Work relative to FILE; restore the working directory afterwards.
    cwd = OS.open('.', OS.O_RDONLY)
    try:
      OS.chdir(file)
      fi = FileInfo('.')
      func(fi)
      dir(dirents('.'), fi.st.st_dev)
    finally:
      OS.fchdir(cwd)
      OS.close(cwd)
  else:
    fi = FileInfo(file)
    func(fi)
    if fi.st and ST.S_ISDIR(fi.st.st_mode):
      dir([OS.path.join(fi.name, e) for e in dirents(fi.name)],
          fi.st.st_dev)

def enum_find0(f, func):
  """
  Read NUL-terminated names from the stream F (e.g., `find ... -print0'
  output) and call FUNC(FILEINFO) for each.
  """
  tail = ""
  while True:
    buf = f.read(8192)
    last = len(buf) == 0
    names = (tail + buf).split('\0')
    ## The final fragment is an incomplete name: keep it for the next chunk.
    tail = names.pop()
    for n in names:
      func(FileInfo(n))
    if last:
      break
  if len(tail):
    moan("ignored trailing junk after last filename")

RX_RSYNCESC = RX.compile(r'\\ \# ([0-7]{3})', RX.VERBOSE)
def enum_rsync(f, func):
  """
  Read a file listing in rsync format from the stream F and call
  FUNC(FILEINFO) for each named object.
  """

  ## The format is a little fiddly.  Each line consists of PERMS SIZE DATE
  ## TIME NAME, separated by runs of whitespace, but the NAME starts exactly
  ## one space character after the TIME and may begin with a space.
  ## Sequences of the form `\#OOO' where OOO are three octal digits, stand
  ## for a byte with that value.  Newlines and backslashes which would be
  ## ambiguous are converted into this form; all other characters are
  ## literal.
  ##
  ## We ignore the stat information and retrieve it ourselves, because it's
  ## incomplete.  Hopefully the dcache is still warm.

  for line in f:
    if line.endswith('\n'): line = line[:-1]

    ## Extract the escaped name.
    ff = line.split(None, 3)
    if len(ff) != 4:
      syserr("ignoring invalid line from rsync: `%s'" % line)
      continue
    tail = ff[3]
    try:
      spc = tail.index(' ')
    except ValueError:
      syserr("ignoring invalid line from rsync: `%s'" % line)
      continue
    name = tail[spc + 1:]

    ## Now translate escape sequences.
    name = RX_RSYNCESC.sub(lambda m: chr(int(m.group(1), 8)), name)

    ## Call the client.
    try:
      fi = FileInfo(name)
    except OSError as err:
      syserr("failed to stat `%s': %s" % (name, err.strerror))
      continue
    func(fi)

###--------------------------------------------------------------------------
### The hash cache.

class HashCache (object):
  """
  Cache file hashes in an sqlite database, keyed by inode number.

  A cache entry is considered valid if the inode's mtime and size still
  match.  The `seen' flag supports pruning entries for files which have
  disappeared (see `reset' and `prune').
  """

  VERSION = 0
  BUFSZ = 128*1024

  ## Statements to initialize a fresh cache database.
  INIT = [
    """CREATE TABLE meta (
       version INTEGER NOT NULL,
       hash TEXT NOT NULL
    );""",
    """CREATE TABLE hash (
       ino INTEGER PRIMARY KEY,
       mtime INTEGER NOT NULL,
       ctime INTEGER NOT NULL,
       size INTEGER NOT NULL,
       hash TEXT NOT NULL,
       seen BOOLEAN NOT NULL DEFAULT TRUE
    );""",
    """PRAGMA journal_mode = WAL;"""
  ]

  def __init__(me, file, hash = None):
    """
    Open (or create) the cache database FILE, checking or recording the HASH
    function name.  FILE may be None to run without a cache, in which case
    HASH must be given.  Dies on version/hash mismatch or unknown hash.
    """

    if file is None:

      ## We're going this alone, with no cache.
      db = None
      if hash is None:
        die("no hash specified and no database cache to read from")
    else:

      ## Connect to the database.
      db = DB.connect(file)
      db.text_factory = str

      ## See whether we can understand the cache database.
      c = db.cursor()
      v = h = None
      try:
        c.execute('SELECT version, hash FROM meta')
        v, h = c.fetchone()
        if c.fetchone() is not None:
          die("cache database corrupt: meta table has multiple rows")
      except (DB.Error, TypeError):
        pass

      ## If that didn't work, we'd better clear the thing and start again.
      ## But only if we know how to initialize it.
      if v != me.VERSION:

        ## Explain the situation.
        moan("cache version %s not understood" % v)
        if hash is None:
          if h is None:
            die("can't initialize cache: no hash function set")
          else:
            hash = h
        try:
          H.new(hash)
        except Exception:
          die("unknown hash function `%s'" % hash)

        ## Drop old things.
        c.execute('SELECT type, name FROM sqlite_master')
        for type, name in c.fetchall():
          c.execute('DROP %s IF EXISTS %s' % (type, name))

        ## Now we're ready to go.
        for stmt in me.INIT:
          c.execute(stmt)
        c.execute('INSERT INTO meta VALUES (?, ?)', [me.VERSION, hash])
        db.commit()

      ## Check the hash function if necessary.
      if hash is None:
        hash = h
      elif h is not None and h != hash:
        die("hash mismatch: cache uses %s but %s requested" % (h, hash))

    ## All done.
    me.hash = hash
    me._db = db
    me._pend = 0

  def hashfile(me, fi):
    """
    Return the hex digest of the regular file described by FI, using the
    cache where possible; return None (and record the error in FI) for
    non-regular files or on read failure.
    """

    ## If this isn't a proper file then don't try to hash it.
    if fi.err or not ST.S_ISREG(fi.st.st_mode):
      return None

    ## See whether there's a valid entry in the cache.
    if me._db:
      c = me._db.cursor()
      c.execute(
        'SELECT mtime, size, hash, seen FROM hash WHERE ino = ?;',
        [fi.st.st_ino])
      r = c.fetchone()
      if r is not None:
        mt, sz, h, s = r
        if mt == fi.st.st_mtime and \
           sz == fi.st.st_size:
          if not s:
            c.execute('UPDATE hash SET seen = 1 WHERE ino = ?',
                      [fi.st.st_ino])
            me._update()
          return h

    ## Hash the file.  Beware raciness: update the file information from the
    ## open descriptor, but set the size from what we actually read.
    h = H.new(me.hash)
    try:
      with open(fi.name, 'rb') as f:
        sz = 0
        while True:
          buf = f.read(me.BUFSZ)
          if len(buf) == 0:
            break
          sz += len(buf)
          h.update(buf)
        fi.st = OS.fstat(f.fileno())
        ##fi.st.st_size = sz
      ## Use hexdigest() directly rather than digest().encode('hex'): same
      ## result, and not dependent on the Python-2-only `hex' codec.
      hash = h.hexdigest()
    except (OSError, IOError) as err:
      fi.st = None
      fi.err = err
      return None

    ## Insert a record into the database.
    if me._db:
      c.execute("""
              INSERT OR REPLACE INTO hash
                      (ino, mtime, ctime, size, hash, seen)
              VALUES
                      (?, ?, ?, ?, ?, 1);
      """, [fi.st.st_ino, fi.st.st_mtime, fi.st.st_ctime,
            fi.st.st_size, hash])
      me._update()

    ## Done.
    return hash

  def _update(me):
    ## Note a pending change; commit every so often to bound lost work.
    me._pend += 1
    if me._pend >= 1024:
      me.flush()

  def flush(me):
    """Commit any pending changes to the database."""
    if me._db:
      me._db.commit()
    me._pend = 0

  def need_db(me):
    """Die unless a cache database is attached."""
    if not me._db:
      die("no cache database")

  def reset(me):
    """Clear all `seen' flags, ready for a fresh enumeration."""
    me.need_db()
    c = me._db.cursor()
    c.execute('UPDATE hash SET seen = 0 WHERE seen')
    me.flush()

  def prune(me):
    """Delete cache entries not seen since the last `reset'."""
    me.need_db()
    c = me._db.cursor()
    c.execute('DELETE FROM hash WHERE NOT seen')
    me.flush()

###--------------------------------------------------------------------------
### Printing output.

class GenericFormatter (object):
  """Format the fields of a FileInfo for the report; subclasses set TYPE."""
  def __init__(me, fi):
    me.fi = fi
  def _fmt_time(me, t):
    ## Render timestamp T canonically, as UTC in ISO 8601 form.
    tm = T.gmtime(t)
    return T.strftime('%Y-%m-%dT%H:%M:%SZ', tm)
  def _enc_name(me, n):
    ## Escape N; also disguise any literal ` -> ' so symlink output stays
    ## unambiguous.
    return ' \\-> '.join(n.encode('string_escape').split(' -> '))
  def name(me):
    return me._enc_name(me.fi.name)
  def info(me):
    return me.TYPE
  def mode(me):
    return '%06o' % me.fi.st.st_mode
  def size(me):
    return me.fi.st.st_size
  def mtime(me):
    return me._fmt_time(me.fi.st.st_mtime)
  def owner(me):
    return '%5d:%d' % (me.fi.st.st_uid, me.fi.st.st_gid)

class ErrorFormatter (GenericFormatter):
  """Formatter for objects we failed to stat: all fields report the error."""
  def info(me):
    return 'E%d %s' % (me.fi.err.errno, me.fi.err.strerror)
  def error(me): return 'error'
  mode = size = mtime = owner = error

class SocketFormatter (GenericFormatter):
  TYPE = 'socket'
class PipeFormatter (GenericFormatter):
  TYPE = 'fifo'

class LinkFormatter (GenericFormatter):
  """Formatter for symbolic links: the name includes the link target."""
  TYPE = 'symbolic-link'
  def name(me):
    n = GenericFormatter.name(me)
    try:
      d = OS.readlink(me.fi.name)
      return '%s -> %s' % (n, me._enc_name(d))
    except OSError as err:
      ## Fixed: the original format string `%s -> ' took one argument but
      ## was given three, raising TypeError whenever readlink failed.
      return '%s -> <E%d %s>' % (n, err.errno, err.strerror)

class DirectoryFormatter (GenericFormatter):
  TYPE = 'directory'
  def name(me): return GenericFormatter.name(me) + '/'
  def size(me): return 'dir'

class DeviceFormatter (GenericFormatter):
  """Common formatter for device nodes: report major and minor numbers."""
  def info(me):
    return '%s %d:%d' % (me.TYPE,
                         OS.major(me.fi.st.st_rdev),
                         OS.minor(me.fi.st.st_rdev))
class BlockDeviceFormatter (DeviceFormatter):
  TYPE = 'block-device'
class CharDeviceFormatter (DeviceFormatter):
  TYPE = 'character-device'

class FileFormatter (GenericFormatter):
  TYPE = 'regular-file'

class Reporter (object):
  """
  Print one report line per file, with its hash (or type information if no
  hash is available) and a `virtual inode' number so that hard links can be
  recognized without exposing real inode numbers.
  """

  ## Map from stat file-type bits to the formatter class to use.
  TYMAP = {
    ST.S_IFSOCK: SocketFormatter,
    ST.S_IFDIR: DirectoryFormatter,
    ST.S_IFLNK: LinkFormatter,
    ST.S_IFREG: FileFormatter,
    ST.S_IFBLK: BlockDeviceFormatter,
    ST.S_IFCHR: CharDeviceFormatter,
    ST.S_IFIFO: PipeFormatter,
  }

  def __init__(me, db):
    me._inomap = {}                     # (dev, ino) -> virtual inode
    me._vinomap = {}                    # virtual inode -> (dev, ino)
    me._db = db
    me._hsz = int(H.new(db.hash).digest_size)

  def file(me, fi):
    """Hash and report the object described by FI on standard output."""
    h = me._db.hashfile(fi)
    if fi.err:
      fmt = ErrorFormatter(fi)
      vino = 'error'
    else:
      fmt = me.TYMAP[ST.S_IFMT(fi.st.st_mode)](fi)
      inoidx = fi.st.st_dev, fi.st.st_ino
      try:
        vino = me._inomap[inoidx]
      except KeyError:
        ## Derive a fresh virtual inode from the name's CRC32, perturbing it
        ## until it doesn't collide with one already issued.
        suffix = ''
        seq = 0
        while True:
          vino = '%08x' % (Z.crc32(fi.name + suffix) & 0xffffffff)
          if vino not in me._vinomap: break
          suffix = '\0%d' % seq
          seq += 1
        ## Fixed: record the issued vino in `_vinomap' -- it was never
        ## populated, so the collision check above could never fire.
        me._inomap[inoidx] = vino
        me._vinomap[vino] = inoidx
    if h: info = h
    else: info = '[%-*s]' % (2*me._hsz - 2, fmt.info())
    print('%s %8s %6s %-12s %-20s %20s %s' %
          (info, vino, fmt.mode(), fmt.owner(),
           fmt.mtime(), fmt.size(), fmt.name()))

###--------------------------------------------------------------------------
### Main program.

## Map from the `-f' FORMAT name to a function enumerating stdin in that
## format.
FMTMAP = {
  'rsync': lambda f: enum_rsync(stdin, f),
  'find0': lambda f: enum_find0(stdin, f)
}

op = OP.OptionParser(
  usage = '%prog [-a] [-c CACHE] [-f FORMAT] [-H HASH] [FILE ...]',
  version = '%%prog, version %s' % VERSION,
  description = '''\
Print a digest of a filesystem (or a collection of specified files) to
standard output.  The idea is that the digest should be mostly /complete/
(i.e., any `interesting\' change to the filesystem results in a different
digest) and /canonical/ (i.e., identical filesystem contents result in
identical output).
''')
for short, long, props in [
  ('-a', '--all', { 'action': 'store_true', 'dest': 'all',
                    'help': 'clear cache of all files not seen' }),
  ('-c', '--cache', { 'dest': 'cache', 'metavar': 'FILE',
                      'help': 'use FILE as a cache for file hashes' }),
  ('-f', '--files', { 'dest': 'files', 'metavar': 'FORMAT',
                      'type': 'choice', 'choices': FMTMAP.keys(),
                      'help': 'read files to report in the given FORMAT' }),
  ('-H', '--hash', { 'dest': 'hash', 'metavar': 'HASH',
                     ##'type': 'choice', 'choices': H.algorithms,
                     'help': 'use HASH as the hash function' })]:
  op.add_option(short, long, **props)
opts, args = op.parse_args(argv)

## Check that we have something to do: either a stdin format or explicit
## file arguments (args[0] is the program name).
if not opts.files and len(args) <= 1:
  die("no filename sources: nothing to do")
db = HashCache(opts.cache, opts.hash)
if opts.all:
  db.reset()
rep = Reporter(db)
if opts.files:
  FMTMAP[opts.files](rep.file)
for dir in args[1:]:
  enum_walk(dir, rep.file)
if opts.all:
  db.prune()
db.flush()

###----- That's all, folks --------------------------------------------------