chiark - git - mdw - rsync-backup/blob - fshash.in

   1 #! @PYTHON@
   2 ###
   3 ### Efficiently construct canonical digests of filesystems
   4 ###
   5 ### (c) 2012 Mark Wooding
   6 ###
   7
   8 ###----- Licensing notice ---------------------------------------------------
   9 ###
  10 ### This file is part of the `rsync-backup' program.
  11 ###
  12 ### rsync-backup is free software; you can redistribute it and/or modify
  13 ### it under the terms of the GNU General Public License as published by
  14 ### the Free Software Foundation; either version 2 of the License, or
  15 ### (at your option) any later version.
  16 ###
  17 ### rsync-backup is distributed in the hope that it will be useful,
  18 ### but WITHOUT ANY WARRANTY; without even the implied warranty of
  19 ### MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  20 ### GNU General Public License for more details.
  21 ###
  22 ### You should have received a copy of the GNU General Public License
  23 ### along with rsync-backup; if not, write to the Free Software Foundation,
  24 ### Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
  25
  26 from sys import argv, exc_info, exit, stdin, stdout, stderr
  27 import errno as E
  28 import hashlib as H
  29 import optparse as OP
  30 import os as OS
  31 import re as RX
  32 import sqlite3 as DB
  33 import stat as ST
  34 import time as T
  35 import zlib as Z
  36
## Package name and version strings.  The `@...@' markers look like
## placeholders which are substituted when the script is built -- TODO
## confirm against the build system.
PACKAGE = '@PACKAGE@'
VERSION = '@VERSION@'
  39
  40 ###--------------------------------------------------------------------------
  41 ### Utilities.
  42
  43 def excval(): return exc_info()[1]
  44
  45 QUIS = OS.path.basename(argv[0])
  46
  47 def moan(msg):
  48   stderr.write('%s: %s\n' % (QUIS, msg))
  49
  50 def die(msg, rc = 1):
  51   moan(msg)
  52   exit(rc)
  53
  54 SYSERR = 0
  55 def syserr(msg):
  56   global SYSERR
  57   moan(msg)
  58   SYSERR += 1
  59
  60 ###--------------------------------------------------------------------------
  61 ### File system enumeration.
  62
  63 class FileInfo (object):
  64   def __init__(me, file, st = None):
  65     me.name = file
  66     if st:
  67       me.st = st
  68       me.err = None
  69     else:
  70       try:
  71         me.st = OS.lstat(file)
  72         me.err = None
  73       except OSError:
  74         me.st = None
  75         me.err = excval()
  76
  77 def enum_walk(file, func):
  78
  79   def dirents(name):
  80     try:
  81       return OS.listdir(name)
  82     except OSError:
  83       syserr("failed to read directory `%s': %s" % (name, excval().strerror))
  84       return []
  85
  86   def dir(ee, dev):
  87     ff = []
  88     dd = []
  89     for e in ee:
  90       fi = FileInfo(e)
  91       if fi.st and fi.st.st_dev != dev: pass
  92       if fi.st and ST.S_ISDIR(fi.st.st_mode): dd.append(fi)
  93       else: ff.append(fi)
  94     ff.sort(key = lambda fi: fi.name)
  95     dd.sort(key = lambda fi: fi.name + '/')
  96     for f in ff:
  97       func(f)
  98     for d in dd:
  99       if d.st.st_dev == dev:
 100         func(d)
 101         dir([OS.path.join(d.name, e) for e in dirents(d.name)], dev)
 102
 103   if file.endswith('/'):
 104     cwd = OS.open('.', OS.O_RDONLY)
 105     try:
 106       OS.chdir(file)
 107       fi = FileInfo('.')
 108       func(fi)
 109       dir(dirents('.'), fi.st.st_dev)
 110     finally:
 111       OS.fchdir(cwd)
 112       OS.close(cwd)
 113   else:
 114     fi = FileInfo(file)
 115     func(fi)
 116     if fi.st and ST.S_ISDIR(fi.st.st_mode):
 117       dir([OS.path.join(fi.name, e) for e in dirents(fi.name)],
 118           fi.st.st_dev)
 119
 120 def enum_find0(f, func):
 121   tail = ""
 122   while True:
 123     buf = f.read(8192)
 124     last = len(buf) == 0
 125     names = (tail + buf).split('\0')
 126     tail = names.pop()
 127     for n in names:
 128       func(FileInfo(n))
 129     if last:
 130       break
 131   if len(tail):
 132     moan("ignored trailing junk after last filename")
 133
 134 R_RSYNCESC = RX.compile(r'\\ \# ([0-7]{3})', RX.VERBOSE)
 135 def enum_rsync(f, func):
 136
 137   ## The format is a little fiddly.  Each line consists of PERMS SIZE DATE
 138   ## TIME NAME, separated by runs of whitespace, but the NAME starts exactly
 139   ## one space character after the TIME and may begin with a space.
 140   ## Sequences of the form `\#OOO', where OOO are three octal digits, stand
 141   ## for a byte with that value.  Newlines, and backslashes which would be
 142   ## ambiguous, are converted into this form; all other characters are
 143   ## literal.
 144   ##
 145   ## We ignore the stat information and retrieve it ourselves, because it's
 146   ## incomplete.  Hopefully the dcache is still warm.
 147
 148   for line in f:
 149     if line.endswith('\n'): line = line[:-1]
 150
 151     ## Extract the escaped name.
 152     ff = line.split(None, 3)
 153     if len(ff) != 4:
 154       syserr("ignoring invalid line from rsync: `%s'" % line)
 155       continue
 156     tail = ff[3]
 157     try:
 158       spc = tail.index(' ')
 159     except ValueError:
 160       syserr("ignoring invalid line from rsync: `%s'" % line)
 161       continue
 162     name = tail[spc + 1:]
 163
 164     ## Now translate escape sequences.
 165     name = R_RSYNCESC.sub(lambda m: chr(int(m.group(1), 8)), name)
 166
 167     ## Call the client.
 168     try:
 169       fi = FileInfo(name)
 170     except OSError:
 171       syserr("failed to stat `%s': %s" % (name, excval().strerror))
 172       continue
 173     func(fi)
 174
 175 ###--------------------------------------------------------------------------
 176 ### The hash cache.
 177
 178 class HashCache (object):
 179
 180   VERSION = 0
 181   BUFSZ = 128*1024
 182
 183   INIT = [
 184     """CREATE TABLE meta (
 185                version INTEGER NOT NULL,
 186                hash TEXT NOT NULL
 187        );""",
 188     """CREATE TABLE hash (
 189                ino INTEGER PRIMARY KEY,
 190                mtime INTEGER NOT NULL,
 191                ctime INTEGER NOT NULL,
 192                size INTEGER NOT NULL,
 193                hash TEXT NOT NULL,
 194                seen BOOLEAN NOT NULL DEFAULT TRUE
 195        );""",
 196     """PRAGMA journal_mode = WAL;"""
 197   ]
 198
 199   def __init__(me, file, hash = None):
 200
 201     if file is None:
 202
 203       ## We're going this alone, with no cache.
 204       db = None
 205       if hash is None:
 206         die("no hash specified and no database cache to read from")
 207     else:
 208
 209       ## Connect to the database.
 210       db = DB.connect(file)
 211       db.text_factory = str
 212
 213       ## See whether we can understand the cache database.
 214       c = db.cursor()
 215       v = h = None
 216       try:
 217         c.execute('SELECT version, hash FROM meta')
 218         v, h = c.fetchone()
 219         if c.fetchone() is not None:
 220           die("cache database corrupt: meta table has mutliple rows")
 221       except (DB.Error, TypeError):
 222         pass
 223
 224       ## If that didn't work, we'd better clear the thing and start again.
 225       ## But only if we know how to initialize it.
 226       if v != me.VERSION:
 227
 228         ## Explain the situation.
 229         moan("cache version %s not understood" % v)
 230         if hash is None:
 231           if h is None:
 232             die("can't initialize cache: no hash function set")
 233           else:
 234             hash = h
 235         try:
 236           H.new(hash)
 237         except Exception:
 238           die("unknown hash function `%s'" % hash)
 239
 240         ## Drop old things.
 241         c.execute('SELECT type, name FROM sqlite_master')
 242         for type, name in c.fetchall():
 243           c.execute('DROP %s IF EXISTS %s' % (type, name))
 244
 245         ## Now we're ready to go.
 246         for stmt in me.INIT:
 247           c.execute(stmt)
 248         c.execute('INSERT INTO meta VALUES (?, ?)', [me.VERSION, hash])
 249         db.commit()
 250
 251       ## Check the hash function if necessary.
 252       if hash is None:
 253         hash = h
 254       elif h is not None and  h != hash:
 255         die("hash mismatch: cache uses %s but %s requested" % (h, hash))
 256
 257     ## All done.
 258     me.hash = hash
 259     me._db = db
 260     me._pend = 0
 261
 262   def hashfile(me, fi):
 263
 264     ## If this isn't a proper file then don't try to hash it.
 265     if fi.err or not ST.S_ISREG(fi.st.st_mode):
 266       return None
 267
 268     ## See whether there's a valid entry in the cache.
 269     if me._db:
 270       c = me._db.cursor()
 271       c.execute(
 272         'SELECT mtime, size, hash, seen FROM hash WHERE ino = ?;',
 273         [fi.st.st_ino])
 274       r = c.fetchone()
 275       if r is not None:
 276         mt, sz, h, s = r
 277         if mt == fi.st.st_mtime and \
 278            sz == fi.st.st_size:
 279           if not s:
 280             c.execute('UPDATE hash SET seen = 1 WHERE ino = ?',
 281                       [fi.st.st_ino])
 282           me._update()
 283           return h
 284
 285     ## Hash the file.  Beware raciness: update the file information from the
 286     ## open descriptor, but set the size from what we actually read.
 287     h = H.new(me.hash)
 288     try:
 289       with open(fi.name, 'rb') as f:
 290         sz = 0
 291         while True:
 292           buf = f.read(me.BUFSZ)
 293           if len(buf) == 0:
 294             break
 295           sz += len(buf)
 296           h.update(buf)
 297         fi.st = OS.fstat(f.fileno())
 298         ##fi.st.st_size = sz
 299       hash = h.digest()
 300     except (OSError, IOError):
 301       fi.st = None
 302       fi.err = excval()
 303       return None
 304     hash = hash.encode('hex')
 305
 306     ## Insert a record into the database.
 307     if me._db:
 308       c.execute("""
 309               INSERT OR REPLACE INTO hash
 310                       (ino, mtime, ctime, size, hash, seen)
 311               VALUES
 312                       (?, ?, ?, ?, ?, 1);
 313       """, [fi.st.st_ino,
 314             fi.st.st_mtime,
 315             fi.st.st_ctime,
 316             fi.st.st_size,
 317             hash])
 318       me._update()
 319
 320     ## Done.
 321     return hash
 322
 323   def _update(me):
 324     me._pend += 1
 325     if me._pend >= 1024:
 326       me.flush()
 327
 328   def flush(me):
 329     if me._db:
 330       me._db.commit()
 331     me._pend = 0
 332
 333   def need_db(me):
 334     if not me._db:
 335       die("no cache database")
 336
 337   def forget(me, ino):
 338     me.need_db()
 339     c = me._db.cursor()
 340     c.execute('DELETE FROM hash WHERE ino = ?', [ino])
 341
 342   def reset(me):
 343     me.need_db()
 344     c = me._db.cursor()
 345     c.execute('UPDATE hash SET seen = 0 WHERE seen')
 346     me.flush()
 347
 348   def prune(me):
 349     me.need_db()
 350     c = me._db.cursor()
 351     c.execute('DELETE FROM hash WHERE NOT seen')
 352     me.flush()
 353
 354 ###--------------------------------------------------------------------------
 355 ### Printing output.
 356
 357 class GenericFormatter (object):
 358   def __init__(me, fi):
 359     me.fi = fi
 360   def _fmt_time(me, t):
 361     tm = T.gmtime(t)
 362     return T.strftime('%Y-%m-%dT%H:%M:%SZ', tm)
 363   def _enc_name(me, n):
 364     return ' \\-> '.join(n.encode('string_escape').split(' -> '))
 365   def name(me):
 366     return me._enc_name(me.fi.name)
 367   def info(me):
 368     return me.TYPE
 369   def mode(me):
 370     return '%06o' % me.fi.st.st_mode
 371   def size(me):
 372     return me.fi.st.st_size
 373   def mtime(me):
 374     return me._fmt_time(me.fi.st.st_mtime)
 375   def owner(me):
 376     return '%5d:%d' % (me.fi.st.st_uid, me.fi.st.st_gid)
 377
 378 class ErrorFormatter (GenericFormatter):
 379   def info(me):
 380     return 'E%d %s' % (me.fi.err.errno, me.fi.err.strerror)
 381   def error(me): return 'error'
 382   mode = size = mtime = owner = error
 383
class SocketFormatter (GenericFormatter):
  """Formatter for sockets."""
  TYPE = 'socket'
class PipeFormatter (GenericFormatter):
  """Formatter for named pipes."""
  TYPE = 'fifo'
 388
 389 class LinkFormatter (GenericFormatter):
 390   TYPE = 'symbolic-link'
 391   def name(me):
 392     n = GenericFormatter.name(me)
 393     try:
 394       d = OS.readlink(me.fi.name)
 395       return '%s -> %s' % (n, me._enc_name(d))
 396     except OSError:
 397       err = excval()
 398       return '%s -> <E%d %s>' % (n, err.errno, err.strerror)
 399
 400 class DirectoryFormatter (GenericFormatter):
 401   TYPE = 'directory'
 402   def name(me): return GenericFormatter.name(me) + '/'
 403   def size(me): return 'dir'
 404
 405 class DeviceFormatter (GenericFormatter):
 406   def info(me):
 407     return '%s %d:%d' % (me.TYPE,
 408                          OS.major(me.fi.st.st_rdev),
 409                          OS.minor(me.fi.st.st_rdev))
class BlockDeviceFormatter (DeviceFormatter):
  """Formatter for block device nodes."""
  TYPE = 'block-device'
class CharDeviceFormatter (DeviceFormatter):
  """Formatter for character device nodes."""
  TYPE = 'character-device'
 414
class FileFormatter (GenericFormatter):
  """Formatter for regular files."""
  TYPE = 'regular-file'
 417
 418 class Reporter (object):
 419
 420   TYMAP = {
 421     ST.S_IFSOCK: SocketFormatter,
 422     ST.S_IFDIR: DirectoryFormatter,
 423     ST.S_IFLNK: LinkFormatter,
 424     ST.S_IFREG: FileFormatter,
 425     ST.S_IFBLK: BlockDeviceFormatter,
 426     ST.S_IFCHR: CharDeviceFormatter,
 427     ST.S_IFIFO: PipeFormatter,
 428   }
 429
 430   def __init__(me, db):
 431     me._inomap = {}
 432     me._vinomap = {}
 433     me._db = db
 434     me._hsz = int(H.new(db.hash).digest_size)
 435
 436   def file(me, fi):
 437     h = me._db.hashfile(fi)
 438     if fi.err:
 439       fmt = ErrorFormatter(fi)
 440       vino = 'error'
 441     else:
 442       fmt = me.TYMAP[ST.S_IFMT(fi.st.st_mode)](fi)
 443       inoidx = fi.st.st_dev, fi.st.st_ino
 444       try:
 445         vino = me._inomap[inoidx]
 446       except KeyError:
 447         suffix = ''
 448         seq = 0
 449         while True:
 450           vino = '%08x' % (Z.crc32(fi.name + suffix) & 0xffffffff)
 451           if vino not in me._vinomap: break
 452           suffix = '\0%d' % seq
 453           seq += 1
 454         me._inomap[inoidx] = vino
 455         if OPTS.compat >= 2: me._vinomap[vino] = inoidx
 456     if h: info = h
 457     else: info = '[%-*s]' % (2*me._hsz - 2, fmt.info())
 458     print '%s %8s %6s %-12s %-20s %20s %s' % (
 459       info, vino, fmt.mode(), fmt.owner(),
 460       fmt.mtime(), fmt.size(), fmt.name())
 461
 462 ###--------------------------------------------------------------------------
 463 ### Database clearing from diff files.
 464
 465 R_HUNK = RX.compile(r'^@@ -\d+,(\d+) \+\d+,(\d+) @@$')
 466
 467 def clear_entry(db, lno, line):
 468
 469   good = True
 470
 471   if line.startswith('['):
 472     pos = line.find(']')
 473     if pos < 0:
 474       moan("failed to parse file entry (type field; line %d)" % lno)
 475       return False
 476     ty = line[1:pos].strip()
 477     rest = line[pos + 1:]
 478     hash = None
 479   else:
 480     ff = line.split(None, 1)
 481     if len(ff) != 2:
 482       moan("failed to parse file entry (field split; line %d)" % lno)
 483       return False
 484     ty = 'regular-file'
 485     hash, rest = ff
 486
 487   ff = rest.split(None, 5)
 488   if len(ff) != 6:
 489     moan("failed to parse file entry (field split; line %d)" % lno)
 490     return False
 491   ino, mode, uidgid, mtime, sz, name = ff
 492
 493   if ty != 'symbolic-link':
 494     target = None
 495   else:
 496     nn = name.split(' -> ', 1)
 497     if len(nn) != 2:
 498       moan("failed to parse file entry (name split; line %d)" % lno)
 499       return False
 500     name, target = nn
 501     target = target.decode('string_escape')
 502   name = name.decode('string_escape')
 503
 504   try:
 505     st = OS.lstat(name)
 506   except OSError:
 507     e = excval()
 508     moan("failed to stat `%s': %s" % (name, e.strerror))
 509     if e.errno != E.ENOENT: good = False
 510   else:
 511     print "Clear cache entry for `%s'" % name
 512     db.forget(st.st_ino)
 513
 514   return good
 515
 516 def clear_cache(db):
 517
 518   ## Work through the input diff file one line at a time.
 519   diffstate = 'gap'
 520   lno = 0
 521   good = True
 522   for line in stdin:
 523     if line.endswith('\n'): line = line[:-1]
 524     lno += 1
 525
 526     ## We're in a gap between hunks.  Find a hunk header and extract the line
 527     ## counts.
 528     if diffstate == 'gap':
 529       m = R_HUNK.match(line)
 530       if m:
 531         oldlines = int(m.group(1))
 532         newlines = int(m.group(2))
 533         diffstate = 'hunk'
 534         hdrlno = lno
 535
 536     ## We're in a hunk.  Keep track of whether we've reached the end, and
 537     ## discard entries from the cache for mismatching lines.
 538     elif diffstate == 'hunk':
 539       if len(line) == 0:
 540         moan("empty line in diff hunk (line %d)" % lno)
 541         good = False
 542       ty = line[0]
 543       if ty == ' ':
 544         oldlines -= 1; newlines -= 1
 545       elif ty == '+':
 546         newlines -= 1
 547         if not clear_entry(db, lno, line[1:]): good = False
 548       elif ty == '-':
 549         oldlines -= 1
 550         if not clear_entry(db, lno, line[1:]): good = False
 551       else:
 552         moan("incomprehensible line in diff hunk (line %d)" % lno)
 553         good = false
 554       if oldlines < 0 or newlines < 0:
 555         moan("inconsistent lengths in diff hunk header (line %d)" % hdrlno)
 556         good = False
 557       if oldlines == newlines == 0:
 558         diffstate = 'gap'
 559
 560   if diffstate == 'hunk':
 561     moan("truncated diff hunk (started at line %d)" % hdrlno)
 562     good = False
 563
 564   return good
 565
 566 ###--------------------------------------------------------------------------
 567 ### Main program.
 568
 569 FMTMAP = {
 570   'rsync': lambda f: enum_rsync(stdin, f),
 571   'find0': lambda f: enum_find0(stdin, f)
 572 }
 573 op = OP.OptionParser(
 574   usage = '%prog [-au] [-c CACHE] [-f FORMAT] [-H HASH] [FILE ...]',
 575   version = '%%prog, version %s' % VERSION,
 576   description = '''\
 577 Print a digest of a filesystem (or a collection of specified files) to
 578 standard output.  The idea is that the digest should be mostly /complete/
 579 (i.e., any `interesting\' change to the filesystem results in a different
 580 digest) and /canonical/ (i.e., identical filesystem contents result in
 581 identical output).
 582 ''')
 583
 584 for short, long, props in [
 585   ('-a', '--all', { 'action': 'store_true', 'dest': 'all',
 586                     'help': 'clear cache of all files not seen' }),
 587   ('-c', '--cache', { 'dest': 'cache', 'metavar': 'FILE',
 588                       'help': 'use FILE as a cache for file hashes' }),
 589   ('-f', '--files', { 'dest': 'files', 'metavar': 'FORMAT',
 590                       'type': 'choice', 'choices': FMTMAP.keys(),
 591                       'help': 'read files to report in the given FORMAT' }),
 592   ('-u', '--udiff', { 'action': 'store_true', 'dest': 'udiff',
 593                       'help': 'read diff from stdin, clear cache entries' }),
 594   ('-C', '--compat', { 'dest': 'compat', 'metavar': 'VERSION',
 595                        'type': 'int', 'default': 2,
 596                        'help': 'produce output with given compatibility VERSION' }),
 597   ('-H', '--hash', { 'dest': 'hash', 'metavar': 'HASH',
 598                      ##'type': 'choice', 'choices': H.algorithms,
 599                      'help': 'use HASH as the hash function' })]:
 600   op.add_option(short, long, **props)
 601 OPTS, args = op.parse_args(argv)
 602 if not 1 <= OPTS.compat <= 2:
 603   die("unknown compatibility version %d" % OPTS.compat)
 604 if OPTS.udiff:
 605   if OPTS.cache is None or OPTS.all or OPTS.files or len(args) > 2:
 606     die("incompatible options: `-u' requires `-c CACHE', forbids others")
 607   db = HashCache(OPTS.cache, OPTS.hash)
 608   if len(args) == 2: OS.chdir(args[1])
 609   good = True
 610   if not clear_cache(db): good = False
 611   if good: db.flush()
 612   else: exit(2)
 613 else:
 614   if not OPTS.files and len(args) <= 1:
 615     die("no filename sources: nothing to do")
 616   db = HashCache(OPTS.cache, OPTS.hash)
 617   if OPTS.all:
 618     db.reset()
 619   if OPTS.compat >= 2:
 620     print "## fshash report format version %d" % OPTS.compat
 621   rep = Reporter(db)
 622   if OPTS.files:
 623     FMTMAP[OPTS.files](rep.file)
 624   for dir in args[1:]:
 625     enum_walk(dir, rep.file)
 626   if OPTS.all:
 627     db.prune()
 628   db.flush()
 629
 630 ###----- That's all, folks --------------------------------------------------