chiark - git - mdw - rsync-backup/blob - fshash.in

   1 #! @PYTHON@
   2 ###
   3 ### Efficiently construct canonical digests of filesystems
   4 ###
   5 ### (c) 2012 Mark Wooding
   6 ###
   7
   8 ###----- Licensing notice ---------------------------------------------------
   9 ###
  10 ### This file is part of the `rsync-backup' program.
  11 ###
  12 ### rsync-backup is free software; you can redistribute it and/or modify
  13 ### it under the terms of the GNU General Public License as published by
  14 ### the Free Software Foundation; either version 2 of the License, or
  15 ### (at your option) any later version.
  16 ###
  17 ### rsync-backup is distributed in the hope that it will be useful,
  18 ### but WITHOUT ANY WARRANTY; without even the implied warranty of
  19 ### MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  20 ### GNU General Public License for more details.
  21 ###
  22 ### You should have received a copy of the GNU General Public License
  23 ### along with rsync-backup; if not, write to the Free Software Foundation,
  24 ### Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
  25
  26 from sys import argv, exc_info, exit, stdin, stdout, stderr
  27 import binascii as B
  28 import errno as E
  29 import hashlib as H
  30 import optparse as OP
  31 import os as OS
  32 import re as RX
  33 import sqlite3 as DB
  34 import stat as ST
  35 import time as T
  36 import zlib as Z
  37
  38 PACKAGE = '@PACKAGE@'
  39 VERSION = '@VERSION@'
  40
  41 ###--------------------------------------------------------------------------
  42 ### Utilities.
  43
  44 def bin(x): return x
  45 def text(x): return x
  46 def excval(): return exc_info()[1]
  47
  48 QUIS = OS.path.basename(argv[0])
  49
  50 def moan(msg):
  51   stderr.write('%s: %s\n' % (QUIS, msg))
  52
  53 def die(msg, rc = 1):
  54   moan(msg)
  55   exit(rc)
  56
  57 SYSERR = 0
  58 def syserr(msg):
  59   global SYSERR
  60   moan(msg)
  61   SYSERR += 1
  62
  63 ###--------------------------------------------------------------------------
  64 ### File system enumeration.
  65
  66 class FileInfo (object):
  67   def __init__(me, file, st = None):
  68     me.name = file
  69     if st:
  70       me.st = st
  71       me.err = None
  72     else:
  73       try:
  74         me.st = OS.lstat(file)
  75         me.err = None
  76       except OSError:
  77         me.st = None
  78         me.err = excval()
  79
  80 def enum_walk(file, func):
  81
  82   def dirents(name):
  83     try:
  84       return OS.listdir(name)
  85     except OSError:
  86       syserr("failed to read directory `%s': %s" % (name, excval().strerror))
  87       return []
  88
  89   def dir(ee, dev):
  90     ff = []
  91     dd = []
  92     for e in ee:
  93       fi = FileInfo(e)
  94       if fi.st and fi.st.st_dev != dev: pass
  95       if fi.st and ST.S_ISDIR(fi.st.st_mode): dd.append(fi)
  96       else: ff.append(fi)
  97     ff.sort(key = lambda fi: fi.name)
  98     dd.sort(key = lambda fi: fi.name + '/')
  99     for f in ff:
 100       func(f)
 101     for d in dd:
 102       if d.st.st_dev == dev:
 103         func(d)
 104         dir([OS.path.join(d.name, e) for e in dirents(d.name)], dev)
 105
 106   if file.endswith('/'):
 107     cwd = OS.open('.', OS.O_RDONLY)
 108     try:
 109       OS.chdir(file)
 110       fi = FileInfo('.')
 111       func(fi)
 112       dir(dirents('.'), fi.st.st_dev)
 113     finally:
 114       OS.fchdir(cwd)
 115       OS.close(cwd)
 116   else:
 117     fi = FileInfo(file)
 118     func(fi)
 119     if fi.st and ST.S_ISDIR(fi.st.st_mode):
 120       dir([OS.path.join(fi.name, e) for e in dirents(fi.name)],
 121           fi.st.st_dev)
 122
 123 def enum_find0(f, func):
 124   tail = ""
 125   while True:
 126     buf = f.read(8192)
 127     last = len(buf) == 0
 128     names = (tail + buf).split('\0')
 129     tail = names.pop()
 130     for n in names:
 131       func(FileInfo(n))
 132     if last:
 133       break
 134   if len(tail):
 135     moan("ignored trailing junk after last filename")
 136
 137 R_RSYNCESC = RX.compile(r'\\ \# ([0-7]{3})', RX.VERBOSE)
 138 def enum_rsync(f, func):
 139
 140   ## The format is a little fiddly.  Each line consists of PERMS SIZE DATE
 141   ## TIME NAME, separated by runs of whitespace, but the NAME starts exactly
 142   ## one space character after the TIME and may begin with a space.
 143   ## Sequences of the form `\#OOO', where OOO are three octal digits, stand
 144   ## for a byte with that value.  Newlines, and backslashes which would be
 145   ## ambiguous, are converted into this form; all other characters are
 146   ## literal.
 147   ##
 148   ## We ignore the stat information and retrieve it ourselves, because it's
 149   ## incomplete.  Hopefully the dcache is still warm.
 150
 151   for line in f:
 152     if line.endswith('\n'): line = line[:-1]
 153
 154     ## Extract the escaped name.
 155     ff = line.split(None, 3)
 156     if len(ff) != 4:
 157       syserr("ignoring invalid line from rsync: `%s'" % line)
 158       continue
 159     tail = ff[3]
 160     try:
 161       spc = tail.index(' ')
 162     except ValueError:
 163       syserr("ignoring invalid line from rsync: `%s'" % line)
 164       continue
 165     name = tail[spc + 1:]
 166
 167     ## Now translate escape sequences.
 168     name = R_RSYNCESC.sub(lambda m: chr(int(m.group(1), 8)), name)
 169
 170     ## Call the client.
 171     try:
 172       fi = FileInfo(name)
 173     except OSError:
 174       syserr("failed to stat `%s': %s" % (name, excval().strerror))
 175       continue
 176     func(fi)
 177
 178 ###--------------------------------------------------------------------------
 179 ### The hash cache.
 180
 181 class HashCache (object):
 182
 183   VERSION = 0
 184   BUFSZ = 128*1024
 185
 186   INIT = [
 187     """CREATE TABLE meta (
 188                version INTEGER NOT NULL,
 189                hash TEXT NOT NULL
 190        );""",
 191     """CREATE TABLE hash (
 192                ino INTEGER PRIMARY KEY,
 193                mtime INTEGER NOT NULL,
 194                ctime INTEGER NOT NULL,
 195                size INTEGER NOT NULL,
 196                hash TEXT NOT NULL,
 197                seen BOOLEAN NOT NULL DEFAULT TRUE
 198        );""",
 199     """PRAGMA journal_mode = WAL;"""
 200   ]
 201
 202   def __init__(me, file, hash = None):
 203
 204     if file is None:
 205
 206       ## We're going this alone, with no cache.
 207       db = None
 208       if hash is None:
 209         die("no hash specified and no database cache to read from")
 210     else:
 211
 212       ## Connect to the database.
 213       db = DB.connect(file)
 214       db.text_factory = str
 215
 216       ## See whether we can understand the cache database.
 217       c = db.cursor()
 218       v = h = None
 219       try:
 220         c.execute('SELECT version, hash FROM meta')
 221         v, h = c.fetchone()
 222         if c.fetchone() is not None:
 223           die("cache database corrupt: meta table has mutliple rows")
 224       except (DB.Error, TypeError):
 225         pass
 226
 227       ## If that didn't work, we'd better clear the thing and start again.
 228       ## But only if we know how to initialize it.
 229       if v != me.VERSION:
 230
 231         ## Explain the situation.
 232         moan("cache version %s not understood" % v)
 233         if hash is None:
 234           if h is None:
 235             die("can't initialize cache: no hash function set")
 236           else:
 237             hash = h
 238         try:
 239           H.new(hash)
 240         except Exception:
 241           die("unknown hash function `%s'" % hash)
 242
 243         ## Drop old things.
 244         c.execute('SELECT type, name FROM sqlite_master')
 245         for type, name in c.fetchall():
 246           c.execute('DROP %s IF EXISTS %s' % (type, name))
 247
 248         ## Now we're ready to go.
 249         for stmt in me.INIT:
 250           c.execute(stmt)
 251         c.execute('INSERT INTO meta VALUES (?, ?)', [me.VERSION, hash])
 252         db.commit()
 253
 254       ## Check the hash function if necessary.
 255       if hash is None:
 256         hash = h
 257       elif h is not None and  h != hash:
 258         die("hash mismatch: cache uses %s but %s requested" % (h, hash))
 259
 260     ## All done.
 261     me.hash = hash
 262     me._db = db
 263     me._pend = 0
 264
 265   def hashfile(me, fi):
 266
 267     ## If this isn't a proper file then don't try to hash it.
 268     if fi.err or not ST.S_ISREG(fi.st.st_mode):
 269       return None
 270
 271     ## See whether there's a valid entry in the cache.
 272     if me._db:
 273       c = me._db.cursor()
 274       c.execute(
 275         'SELECT mtime, size, hash, seen FROM hash WHERE ino = ?;',
 276         [fi.st.st_ino])
 277       r = c.fetchone()
 278       if r is not None:
 279         mt, sz, h, s = r
 280         if mt == fi.st.st_mtime and \
 281            sz == fi.st.st_size:
 282           if not s:
 283             c.execute('UPDATE hash SET seen = 1 WHERE ino = ?',
 284                       [fi.st.st_ino])
 285           me._update()
 286           return h
 287
 288     ## Hash the file.  Beware raciness: update the file information from the
 289     ## open descriptor, but set the size from what we actually read.
 290     h = H.new(me.hash)
 291     try:
 292       with open(fi.name, 'rb') as f:
 293         sz = 0
 294         while True:
 295           buf = f.read(me.BUFSZ)
 296           if len(buf) == 0:
 297             break
 298           sz += len(buf)
 299           h.update(buf)
 300         fi.st = OS.fstat(f.fileno())
 301         ##fi.st.st_size = sz
 302       hash = h.digest()
 303     except (OSError, IOError):
 304       fi.st = None
 305       fi.err = excval()
 306       return None
 307     hash = text(B.hexlify(hash))
 308
 309     ## Insert a record into the database.
 310     if me._db:
 311       c.execute("""
 312               INSERT OR REPLACE INTO hash
 313                       (ino, mtime, ctime, size, hash, seen)
 314               VALUES
 315                       (?, ?, ?, ?, ?, 1);
 316       """, [fi.st.st_ino,
 317             fi.st.st_mtime,
 318             fi.st.st_ctime,
 319             fi.st.st_size,
 320             hash])
 321       me._update()
 322
 323     ## Done.
 324     return hash
 325
 326   def _update(me):
 327     me._pend += 1
 328     if me._pend >= 1024:
 329       me.flush()
 330
 331   def flush(me):
 332     if me._db:
 333       me._db.commit()
 334     me._pend = 0
 335
 336   def need_db(me):
 337     if not me._db:
 338       die("no cache database")
 339
 340   def forget(me, ino):
 341     me.need_db()
 342     c = me._db.cursor()
 343     c.execute('DELETE FROM hash WHERE ino = ?', [ino])
 344
 345   def reset(me):
 346     me.need_db()
 347     c = me._db.cursor()
 348     c.execute('UPDATE hash SET seen = 0 WHERE seen')
 349     me.flush()
 350
 351   def prune(me):
 352     me.need_db()
 353     c = me._db.cursor()
 354     c.execute('DELETE FROM hash WHERE NOT seen')
 355     me.flush()
 356
 357 ###--------------------------------------------------------------------------
 358 ### Printing output.
 359
 360 class GenericFormatter (object):
 361   def __init__(me, fi):
 362     me.fi = fi
 363   def _fmt_time(me, t):
 364     tm = T.gmtime(t)
 365     return T.strftime('%Y-%m-%dT%H:%M:%SZ', tm)
 366   def _enc_name(me, n):
 367     return ' \\-> '.join(n.encode('string_escape').split(' -> '))
 368   def name(me):
 369     return me._enc_name(me.fi.name)
 370   def info(me):
 371     return me.TYPE
 372   def mode(me):
 373     return '%06o' % me.fi.st.st_mode
 374   def size(me):
 375     return me.fi.st.st_size
 376   def mtime(me):
 377     return me._fmt_time(me.fi.st.st_mtime)
 378   def owner(me):
 379     return '%5d:%d' % (me.fi.st.st_uid, me.fi.st.st_gid)
 380
 381 class ErrorFormatter (GenericFormatter):
 382   def info(me):
 383     return 'E%d %s' % (me.fi.err.errno, me.fi.err.strerror)
 384   def error(me): return 'error'
 385   mode = size = mtime = owner = error
 386
 387 class SocketFormatter (GenericFormatter):
 388   TYPE = 'socket'
 389 class PipeFormatter (GenericFormatter):
 390   TYPE = 'fifo'
 391
 392 class LinkFormatter (GenericFormatter):
 393   TYPE = 'symbolic-link'
 394   def name(me):
 395     n = GenericFormatter.name(me)
 396     try:
 397       d = OS.readlink(me.fi.name)
 398       return '%s -> %s' % (n, me._enc_name(d))
 399     except OSError:
 400       err = excval()
 401       return '%s -> <E%d %s>' % (n, err.errno, err.strerror)
 402
 403 class DirectoryFormatter (GenericFormatter):
 404   TYPE = 'directory'
 405   def name(me): return GenericFormatter.name(me) + '/'
 406   def size(me): return 'dir'
 407
 408 class DeviceFormatter (GenericFormatter):
 409   def info(me):
 410     return '%s %d:%d' % (me.TYPE,
 411                          OS.major(me.fi.st.st_rdev),
 412                          OS.minor(me.fi.st.st_rdev))
 413 class BlockDeviceFormatter (DeviceFormatter):
 414   TYPE = 'block-device'
 415 class CharDeviceFormatter (DeviceFormatter):
 416   TYPE = 'character-device'
 417
 418 class FileFormatter (GenericFormatter):
 419   TYPE = 'regular-file'
 420
 421 class Reporter (object):
 422
 423   TYMAP = {
 424     ST.S_IFSOCK: SocketFormatter,
 425     ST.S_IFDIR: DirectoryFormatter,
 426     ST.S_IFLNK: LinkFormatter,
 427     ST.S_IFREG: FileFormatter,
 428     ST.S_IFBLK: BlockDeviceFormatter,
 429     ST.S_IFCHR: CharDeviceFormatter,
 430     ST.S_IFIFO: PipeFormatter,
 431   }
 432
 433   def __init__(me, db):
 434     me._inomap = {}
 435     me._vinomap = {}
 436     me._db = db
 437     me._hsz = int(H.new(db.hash).digest_size)
 438
 439   def file(me, fi):
 440     h = me._db.hashfile(fi)
 441     if fi.err:
 442       fmt = ErrorFormatter(fi)
 443       vino = 'error'
 444     else:
 445       fmt = me.TYMAP[ST.S_IFMT(fi.st.st_mode)](fi)
 446       inoidx = fi.st.st_dev, fi.st.st_ino
 447       try:
 448         vino = me._inomap[inoidx]
 449       except KeyError:
 450         suffix = ''
 451         seq = 0
 452         while True:
 453           vino = '%08x' % (Z.crc32(bin(fi.name + suffix)) & 0xffffffff)
 454           if vino not in me._vinomap: break
 455           suffix = '\0%d' % seq
 456           seq += 1
 457         me._inomap[inoidx] = vino
 458         if OPTS.compat >= 2: me._vinomap[vino] = inoidx
 459     if h: info = h
 460     else: info = '[%-*s]' % (2*me._hsz - 2, fmt.info())
 461     print('%s %8s %6s %-12s %-20s %20s %s' %
 462           (info, vino, fmt.mode(), fmt.owner(),
 463            fmt.mtime(), fmt.size(), fmt.name()))
 464
 465 ###--------------------------------------------------------------------------
 466 ### Database clearing from diff files.
 467
 468 R_HUNK = RX.compile(r'^@@ -\d+,(\d+) \+\d+,(\d+) @@$')
 469
 470 def clear_entry(db, lno, line):
 471
 472   good = True
 473
 474   if line.startswith('['):
 475     pos = line.find(']')
 476     if pos < 0:
 477       moan("failed to parse file entry (type field; line %d)" % lno)
 478       return False
 479     ty = line[1:pos].strip()
 480     rest = line[pos + 1:]
 481     hash = None
 482   else:
 483     ff = line.split(None, 1)
 484     if len(ff) != 2:
 485       moan("failed to parse file entry (field split; line %d)" % lno)
 486       return False
 487     ty = 'regular-file'
 488     hash, rest = ff
 489
 490   ff = rest.split(None, 5)
 491   if len(ff) != 6:
 492     moan("failed to parse file entry (field split; line %d)" % lno)
 493     return False
 494   ino, mode, uidgid, mtime, sz, name = ff
 495
 496   if ty != 'symbolic-link':
 497     target = None
 498   else:
 499     nn = name.split(' -> ', 1)
 500     if len(nn) != 2:
 501       moan("failed to parse file entry (name split; line %d)" % lno)
 502       return False
 503     name, target = nn
 504     target = target.decode('string_escape')
 505   name = name.decode('string_escape')
 506
 507   try:
 508     st = OS.lstat(name)
 509   except OSError:
 510     e = excval()
 511     moan("failed to stat `%s': %s" % (name, e.strerror))
 512     if e.errno != E.ENOENT: good = False
 513   else:
 514     print("Clear cache entry for `%s'" % name)
 515     db.forget(st.st_ino)
 516
 517   return good
 518
 519 def clear_cache(db):
 520
 521   ## Work through the input diff file one line at a time.
 522   diffstate = 'gap'
 523   lno = 0
 524   good = True
 525   for line in stdin:
 526     if line.endswith('\n'): line = line[:-1]
 527     lno += 1
 528
 529     ## We're in a gap between hunks.  Find a hunk header and extract the line
 530     ## counts.
 531     if diffstate == 'gap':
 532       m = R_HUNK.match(line)
 533       if m:
 534         oldlines = int(m.group(1))
 535         newlines = int(m.group(2))
 536         diffstate = 'hunk'
 537         hdrlno = lno
 538
 539     ## We're in a hunk.  Keep track of whether we've reached the end, and
 540     ## discard entries from the cache for mismatching lines.
 541     elif diffstate == 'hunk':
 542       if len(line) == 0:
 543         moan("empty line in diff hunk (line %d)" % lno)
 544         good = False
 545       ty = line[0]
 546       if ty == ' ':
 547         oldlines -= 1; newlines -= 1
 548       elif ty == '+':
 549         newlines -= 1
 550         if not clear_entry(db, lno, line[1:]): good = False
 551       elif ty == '-':
 552         oldlines -= 1
 553         if not clear_entry(db, lno, line[1:]): good = False
 554       else:
 555         moan("incomprehensible line in diff hunk (line %d)" % lno)
 556         good = false
 557       if oldlines < 0 or newlines < 0:
 558         moan("inconsistent lengths in diff hunk header (line %d)" % hdrlno)
 559         good = False
 560       if oldlines == newlines == 0:
 561         diffstate = 'gap'
 562
 563   if diffstate == 'hunk':
 564     moan("truncated diff hunk (started at line %d)" % hdrlno)
 565     good = False
 566
 567   return good
 568
 569 ###--------------------------------------------------------------------------
 570 ### Main program.
 571
 572 FMTMAP = {
 573   'rsync': lambda f: enum_rsync(stdin, f),
 574   'find0': lambda f: enum_find0(stdin, f)
 575 }
 576 op = OP.OptionParser(
 577   usage = '%prog [-au] [-c CACHE] [-f FORMAT] [-H HASH] [FILE ...]',
 578   version = '%%prog, version %s' % VERSION,
 579   description = '''\
 580 Print a digest of a filesystem (or a collection of specified files) to
 581 standard output.  The idea is that the digest should be mostly /complete/
 582 (i.e., any `interesting\' change to the filesystem results in a different
 583 digest) and /canonical/ (i.e., identical filesystem contents result in
 584 identical output).
 585 ''')
 586
 587 for short, long, props in [
 588   ('-a', '--all', { 'action': 'store_true', 'dest': 'all',
 589                     'help': 'clear cache of all files not seen' }),
 590   ('-c', '--cache', { 'dest': 'cache', 'metavar': 'FILE',
 591                       'help': 'use FILE as a cache for file hashes' }),
 592   ('-f', '--files', { 'dest': 'files', 'metavar': 'FORMAT',
 593                       'type': 'choice', 'choices': list(FMTMAP.keys()),
 594                       'help': 'read files to report in the given FORMAT' }),
 595   ('-u', '--udiff', { 'action': 'store_true', 'dest': 'udiff',
 596                       'help': 'read diff from stdin, clear cache entries' }),
 597   ('-C', '--compat', { 'dest': 'compat', 'metavar': 'VERSION',
 598                        'type': 'int', 'default': 2,
 599                        'help': 'produce output with given compatibility VERSION' }),
 600   ('-H', '--hash', { 'dest': 'hash', 'metavar': 'HASH',
 601                      ##'type': 'choice', 'choices': H.algorithms,
 602                      'help': 'use HASH as the hash function' })]:
 603   op.add_option(short, long, **props)
 604 OPTS, args = op.parse_args(argv)
 605 if not 1 <= OPTS.compat <= 2:
 606   die("unknown compatibility version %d" % OPTS.compat)
 607 if OPTS.udiff:
 608   if OPTS.cache is None or OPTS.all or OPTS.files or len(args) > 2:
 609     die("incompatible options: `-u' requires `-c CACHE', forbids others")
 610   db = HashCache(OPTS.cache, OPTS.hash)
 611   if len(args) == 2: OS.chdir(args[1])
 612   good = True
 613   if not clear_cache(db): good = False
 614   if good: db.flush()
 615   else: exit(2)
 616 else:
 617   if not OPTS.files and len(args) <= 1:
 618     die("no filename sources: nothing to do")
 619   db = HashCache(OPTS.cache, OPTS.hash)
 620   if OPTS.all:
 621     db.reset()
 622   if OPTS.compat >= 2:
 623     print("## fshash report format version %d" % OPTS.compat)
 624   rep = Reporter(db)
 625   if OPTS.files:
 626     FMTMAP[OPTS.files](rep.file)
 627   for dir in args[1:]:
 628     enum_walk(dir, rep.file)
 629   if OPTS.all:
 630     db.prune()
 631   db.flush()
 632
 633 ###----- That's all, folks --------------------------------------------------