chiark - git - mdw - rsync-backup/blob - fshash.in

   1 #! @PYTHON@
   2 ###
   3 ### Efficiently construct canonical digests of filesystems
   4 ###
   5 ### (c) 2012 Mark Wooding
   6 ###
   7
   8 ###----- Licensing notice ---------------------------------------------------
   9 ###
  10 ### This file is part of the `rsync-backup' program.
  11 ###
  12 ### rsync-backup is free software; you can redistribute it and/or modify
  13 ### it under the terms of the GNU General Public License as published by
  14 ### the Free Software Foundation; either version 2 of the License, or
  15 ### (at your option) any later version.
  16 ###
  17 ### rsync-backup is distributed in the hope that it will be useful,
  18 ### but WITHOUT ANY WARRANTY; without even the implied warranty of
  19 ### MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  20 ### GNU General Public License for more details.
  21 ###
  22 ### You should have received a copy of the GNU General Public License
  23 ### along with rsync-backup; if not, write to the Free Software Foundation,
  24 ### Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
  25
  26 from sys import argv, exit, stdin, stdout, stderr
  27 import os as OS
  28 import re as RX
  29 import time as T
  30 import stat as ST
  31 import optparse as OP
  32 import hashlib as H
  33 import sqlite3 as DB
  34 import zlib as Z
  35
  36 PACKAGE = '@PACKAGE@'
  37 VERSION = '@VERSION@'
  38
  39 ###--------------------------------------------------------------------------
  40 ### Utilities.
  41
  42 QUIS = OS.path.basename(argv[0])
  43
  44 def moan(msg):
  45   stderr.write('%s: %s\n' % (QUIS, msg))
  46
  47 def die(msg, rc = 1):
  48   moan(msg)
  49   exit(rc)
  50
  51 SYSERR = 0
  52 def syserr(msg):
  53   global SYSERR
  54   moan(msg)
  55   SYSERR += 1
  56
  57 ###--------------------------------------------------------------------------
  58 ### File system enumeration.
  59
  60 class FileInfo (object):
  61   def __init__(me, file, st = None):
  62     me.name = file
  63     if st:
  64       me.st = st
  65       me.err = None
  66     else:
  67       try:
  68         me.st = OS.lstat(file)
  69         me.err = None
  70       except OSError, err:
  71         me.st = None
  72         me.err = err
  73
  74 def enum_walk(file, func):
  75
  76   def dirents(name):
  77     try:
  78       return OS.listdir(name)
  79     except OSError, err:
  80       syserr("failed to read directory `%s': %s" % (name, err.strerror))
  81       return []
  82
  83   def dir(ee, dev):
  84     ff = []
  85     dd = []
  86     for e in ee:
  87       fi = FileInfo(e)
  88       if fi.st and fi.st.st_dev != dev: pass
  89       if fi.st and ST.S_ISDIR(fi.st.st_mode): dd.append(fi)
  90       else: ff.append(fi)
  91     ff.sort(key = lambda fi: fi.name)
  92     dd.sort(key = lambda fi: fi.name + '/')
  93     for f in ff:
  94       func(f)
  95     for d in dd:
  96       if d.st.st_dev == dev:
  97         func(d)
  98         dir([OS.path.join(d.name, e) for e in dirents(d.name)], dev)
  99
 100   if file.endswith('/'):
 101     OS.chdir(file)
 102     fi = FileInfo('.')
 103     func(fi)
 104     dir(dirents('.'), fi.st.st_dev)
 105   else:
 106     fi = FileInfo(file)
 107     func(fi)
 108     if fi.st and ST.S_ISDIR(fi.st.st_mode):
 109       dir([OS.path.join(fi.name, e) for e in dirents(fi.name)],
 110           fi.st.st_dev)
 111
 112 def enum_find0(f, func):
 113   tail = ""
 114   while True:
 115     buf = f.read(8192)
 116     last = len(buf) == 0
 117     names = (tail + buf).split('\0')
 118     tail = names.pop()
 119     for n in names:
 120       func(FileInfo(n))
 121     if last:
 122       break
 123   if len(tail):
 124     moan("ignored trailing junk after last filename")
 125
 126 RX_RSYNCESC = RX.compile(r'\\ \# ([0-7]{3})', RX.VERBOSE)
 127 def enum_rsync(f, func):
 128
 129   ## The format is a little fiddly.  Each line consists of PERMS SIZE DATE
 130   ## TIME NAME, separated by runs of whitespace, but the NAME starts exactly
 131   ## one space character after the TIME and may begin with a space.
 132   ## Sequences of the form `\#OOO' where OOO are three octal digits, stand
 133   ## for a byte with that value.  Newlines and backslashes which would be
 134   ## ambiguous are converted into this form; all other characters are
 135   ## literal.
 136   ##
 137   ## We ignore the stat information and retrieve it ourselves, because it's
 138   ## incomplete.  Hopefully the dcache is still warm.
 139
 140   for line in f:
 141     if line.endswith('\n'): line = line[:-1]
 142
 143     ## Extract the escaped name.
 144     ff = line.split(None, 3)
 145     if len(ff) != 4:
 146       syserr("ignoring invalid line from rsync: `%s'" % line)
 147       continue
 148     tail = ff[3]
 149     try:
 150       spc = tail.index(' ')
 151     except ValueError:
 152       syserr("ignoring invalid line from rsync: `%s'" % line)
 153       continue
 154     name = tail[spc + 1:]
 155
 156     ## Now translate escape sequences.
 157     name = RX_RSYNCESC.sub(lambda m: chr(int(m.group(1), 8)), name)
 158
 159     ## Call the client.
 160     try:
 161       fi = FileInfo(name)
 162     except OSError, err:
 163       syserr("failed to stat `%s': %s" % (name, err.strerror))
 164       continue
 165     func(fi)
 166
 167 ###--------------------------------------------------------------------------
 168 ### The hash cache.
 169
 170 class HashCache (object):
 171
 172   VERSION = 0
 173   BUFSZ = 128*1024
 174
 175   INIT = [
 176     """CREATE TABLE meta (
 177                version INTEGER NOT NULL,
 178                hash TEXT NOT NULL
 179        );""",
 180     """CREATE TABLE hash (
 181                ino INTEGER PRIMARY KEY,
 182                mtime INTEGER NOT NULL,
 183                ctime INTEGER NOT NULL,
 184                size INTEGER NOT NULL,
 185                hash TEXT NOT NULL,
 186                seen BOOLEAN NOT NULL DEFAULT TRUE
 187        );""",
 188     """PRAGMA journal_mode = WAL;"""
 189   ]
 190
 191   def __init__(me, file, hash = None):
 192
 193     if file is None:
 194
 195       ## We're going this alone, with no cache.
 196       db = None
 197       if hash is None:
 198         die("no hash specified and no database cache to read from")
 199     else:
 200
 201       ## Connect to the database.
 202       db = DB.connect(file)
 203       db.text_factory = str
 204
 205       ## See whether we can understand the cache database.
 206       c = db.cursor()
 207       v = h = None
 208       try:
 209         c.execute('SELECT version, hash FROM meta')
 210         v, h = c.fetchone()
 211         if c.fetchone() is not None:
 212           die("cache database corrupt: meta table has mutliple rows")
 213       except (DB.Error, TypeError):
 214         pass
 215
 216       ## If that didn't work, we'd better clear the thing and start again.
 217       ## But only if we know how to initialize it.
 218       if v != me.VERSION:
 219
 220         ## Explain the situation.
 221         moan("cache version %s not understood" % v)
 222         if hash is None:
 223           if h is None:
 224             die("can't initialize cache: no hash function set")
 225           else:
 226             hash = h
 227         try:
 228           H.new(hash)
 229         except Exception:
 230           die("unknown hash function `%s'" % hash)
 231
 232         ## Drop old things.
 233         c.execute('SELECT type, name FROM sqlite_master')
 234         for type, name in c.fetchall():
 235           c.execute('DROP %s IF EXISTS %s' % (type, name))
 236
 237         ## Now we're ready to go.
 238         for stmt in me.INIT:
 239           c.execute(stmt)
 240         c.execute('INSERT INTO meta VALUES (?, ?)', [me.VERSION, hash])
 241         db.commit()
 242
 243       ## Check the hash function if necessary.
 244       if hash is None:
 245         hash = h
 246       elif h is not None and  h != hash:
 247         die("hash mismatch: cache uses %s but %s requested" % (h, hash))
 248
 249     ## All done.
 250     me.hash = hash
 251     me._db = db
 252     me._pend = 0
 253
 254   def hashfile(me, fi):
 255
 256     ## If this isn't a proper file then don't try to hash it.
 257     if fi.err or not ST.S_ISREG(fi.st.st_mode):
 258       return None
 259
 260     ## See whether there's a valid entry in the cache.
 261     if me._db:
 262       c = me._db.cursor()
 263       c.execute(
 264         'SELECT mtime, size, hash, seen FROM hash WHERE ino = ?;',
 265         [fi.st.st_ino])
 266       r = c.fetchone()
 267       if r is not None:
 268         mt, sz, h, s = r
 269         if mt == fi.st.st_mtime and \
 270            sz == fi.st.st_size:
 271           if not s:
 272             c.execute('UPDATE hash SET seen = 1 WHERE ino = ?',
 273                       [fi.st.st_ino])
 274           me._update()
 275           return h
 276
 277     ## Hash the file.  Beware raciness: update the file information from the
 278     ## open descriptor, but set the size from what we actually read.
 279     h = H.new(me.hash)
 280     try:
 281       with open(fi.name, 'rb') as f:
 282         sz = 0
 283         while True:
 284           buf = f.read(me.BUFSZ)
 285           if len(buf) == 0:
 286             break
 287           sz += len(buf)
 288           h.update(buf)
 289         fi.st = OS.fstat(f.fileno())
 290         ##fi.st.st_size = sz
 291       hash = h.digest()
 292     except (OSError, IOError), err:
 293       fi.st = None
 294       fi.err = err
 295       return None
 296     hash = hash.encode('hex')
 297
 298     ## Insert a record into the database.
 299     if me._db:
 300       c.execute("""
 301               INSERT OR REPLACE INTO hash
 302                       (ino, mtime, ctime, size, hash, seen)
 303               VALUES
 304                       (?, ?, ?, ?, ?, 1);
 305       """, [fi.st.st_ino,
 306             fi.st.st_mtime,
 307             fi.st.st_ctime,
 308             fi.st.st_size,
 309             hash])
 310       me._update()
 311
 312     ## Done.
 313     return hash
 314
 315   def _update(me):
 316     me._pend += 1
 317     if me._pend >= 1024:
 318       me.flush()
 319
 320   def flush(me):
 321     if me._db:
 322       me._db.commit()
 323     me._pend = 0
 324
 325   def need_db(me):
 326     if not me._db:
 327       die("no cache database")
 328
 329   def reset(me):
 330     me.need_db()
 331     c = me._db.cursor()
 332     c.execute('UPDATE hash SET seen = 0 WHERE seen')
 333     me.flush()
 334
 335   def prune(me):
 336     me.need_db()
 337     c = me._db.cursor()
 338     c.execute('DELETE FROM hash WHERE NOT seen')
 339     me.flush()
 340
 341 ###--------------------------------------------------------------------------
 342 ### Printing output.
 343
 344 class GenericFormatter (object):
 345   def __init__(me, fi):
 346     me.fi = fi
 347   def _fmt_time(me, t):
 348     tm = T.gmtime(t)
 349     return T.strftime('%Y-%m-%dT%H:%M:%SZ', tm)
 350   def _enc_name(me, n):
 351     return n.encode('string_escape')
 352   def name(me):
 353     return me._enc_name(me.fi.name)
 354   def info(me):
 355     return me.TYPE
 356   def mode(me):
 357     return '%06o' % me.fi.st.st_mode
 358   def size(me):
 359     return me.fi.st.st_size
 360   def mtime(me):
 361     return me._fmt_time(me.fi.st.st_mtime)
 362   def owner(me):
 363     return '%5d:%d' % (me.fi.st.st_uid, me.fi.st.st_gid)
 364
 365 class ErrorFormatter (GenericFormatter):
 366   def info(me):
 367     return 'E%d %s' % (me.fi.err.errno, me.fi.err.strerror)
 368   def error(me): return 'error'
 369   mode = size = mtime = owner = error
 370
 371 class SocketFormatter (GenericFormatter):
 372   TYPE = 'socket'
 373 class PipeFormatter (GenericFormatter):
 374   TYPE = 'fifo'
 375
 376 class LinkFormatter (GenericFormatter):
 377   TYPE = 'symbolic-link'
 378   def name(me):
 379     n = GenericFormatter.name(me)
 380     try:
 381       d = OS.readlink(me.fi.name)
 382       return '%s -> %s' % (n, me._enc_name(d))
 383     except OSError, err:
 384       return '%s -> <E%d %s>' % (n, err.errno, err.strerror)
 385
 386 class DirectoryFormatter (GenericFormatter):
 387   TYPE = 'directory'
 388   def name(me): return GenericFormatter.name(me) + '/'
 389   def size(me): return 'dir'
 390
 391 class DeviceFormatter (GenericFormatter):
 392   def info(me):
 393     return '%s %d:%d' % (me.TYPE,
 394                          OS.major(me.fi.st.st_rdev),
 395                          OS.minor(me.fi.st.st_rdev))
 396 class BlockDeviceFormatter (DeviceFormatter):
 397   TYPE = 'block-device'
 398 class CharDeviceFormatter (DeviceFormatter):
 399   TYPE = 'character-device'
 400
 401 class FileFormatter (GenericFormatter):
 402   TYPE = 'regular-file'
 403
 404 class Reporter (object):
 405
 406   TYMAP = {
 407     ST.S_IFSOCK: SocketFormatter,
 408     ST.S_IFDIR: DirectoryFormatter,
 409     ST.S_IFLNK: LinkFormatter,
 410     ST.S_IFREG: FileFormatter,
 411     ST.S_IFBLK: BlockDeviceFormatter,
 412     ST.S_IFCHR: CharDeviceFormatter,
 413     ST.S_IFIFO: PipeFormatter,
 414   }
 415
 416   def __init__(me, db):
 417     me._inomap = {}
 418     me._vinomap = {}
 419     me._db = db
 420     me._hsz = int(H.new(db.hash).digest_size)
 421
 422   def file(me, fi):
 423     h = me._db.hashfile(fi)
 424     if fi.err:
 425       fmt = ErrorFormatter(fi)
 426       vino = 'error'
 427     else:
 428       fmt = me.TYMAP[ST.S_IFMT(fi.st.st_mode)](fi)
 429       inoidx = fi.st.st_dev, fi.st.st_ino
 430       try:
 431         vino = me._inomap[inoidx]
 432       except KeyError:
 433         suffix = ''
 434         seq = 0
 435         while True:
 436           vino = '%08x' % (Z.crc32(fi.name + suffix) & 0xffffffff)
 437           if vino not in me._vinomap: break
 438           suffix = '\0%d' % seq
 439           seq += 1
 440         me._inomap[inoidx] = vino
 441     if h: info = h
 442     else: info = '[%-*s]' % (2*me._hsz - 2, fmt.info())
 443     print '%s %8s %6s %-12s %-20s %20s %s' % (
 444       info, vino, fmt.mode(), fmt.owner(),
 445       fmt.mtime(), fmt.size(), fmt.name())
 446
 447 ###--------------------------------------------------------------------------
 448 ### Main program.
 449
 450 FMTMAP = {
 451   'rsync': lambda f: enum_rsync(stdin, f),
 452   'find0': lambda f: enum_find0(stdin, f)
 453 }
 454 op = OP.OptionParser(
 455   usage = '%prog [-a] [-c CACHE] [-f FORMAT] [-H HASH] [FILE ...]',
 456   version = '%%prog, version %s' % VERSION,
 457   description = '''\
 458 Print a digest of a filesystem (or a collection of specified files) to
 459 standard output.  The idea is that the digest should be mostly /complete/
 460 (i.e., any `interesting\' change to the filesystem results in a different
 461 digest) and /canonical/ (i.e., identical filesystem contents result in
 462 identical output).
 463 ''')
 464
 465 for short, long, props in [
 466   ('-a', '--all', { 'action': 'store_true', 'dest': 'all',
 467                     'help': 'clear cache of all files not seen' }),
 468   ('-c', '--cache', { 'dest': 'cache', 'metavar': 'FILE',
 469                       'help': 'use FILE as a cache for file hashes' }),
 470   ('-f', '--files', { 'dest': 'files', 'metavar': 'FORMAT',
 471                       'type': 'choice', 'choices': FMTMAP.keys(),
 472                       'help': 'read files to report in the given FORMAT' }),
 473   ('-H', '--hash', { 'dest': 'hash', 'metavar': 'HASH',
 474                      ##'type': 'choice', 'choices': H.algorithms,
 475                      'help': 'use HASH as the hash function' })]:
 476   op.add_option(short, long, **props)
 477 opts, args = op.parse_args(argv)
 478
 479 if not opts.files and len(args) <= 1:
 480   die("no filename sources: nothing to do")
 481 db = HashCache(opts.cache, opts.hash)
 482 if opts.all:
 483   db.reset()
 484 rep = Reporter(db)
 485 if opts.files:
 486   FMTMAP[opts.files](rep.file)
 487 for dir in args[1:]:
 488   enum_walk(dir, rep.file)
 489 if opts.all:
 490   db.prune()
 491 db.flush()
 492
 493 ###----- That's all, folks --------------------------------------------------