chiark - git - mdw - rsync-backup/blob - fshash.in

   1 #! @PYTHON@
   2 ###
   3 ### Efficiently construct canonical digests of filesystems
   4 ###
   5 ### (c) 2012 Mark Wooding
   6 ###
   7
   8 ###----- Licensing notice ---------------------------------------------------
   9 ###
  10 ### This file is part of the `rsync-backup' program.
  11 ###
  12 ### rsync-backup is free software; you can redistribute it and/or modify
  13 ### it under the terms of the GNU General Public License as published by
  14 ### the Free Software Foundation; either version 2 of the License, or
  15 ### (at your option) any later version.
  16 ###
  17 ### rsync-backup is distributed in the hope that it will be useful,
  18 ### but WITHOUT ANY WARRANTY; without even the implied warranty of
  19 ### MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  20 ### GNU General Public License for more details.
  21 ###
  22 ### You should have received a copy of the GNU General Public License
  23 ### along with rsync-backup; if not, write to the Free Software Foundation,
  24 ### Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
  25
  26 from sys import argv, exc_info, exit, stdin, stdout, stderr
  27 import binascii as B
  28 import errno as E
  29 import hashlib as H
  30 import optparse as OP
  31 import os as OS
  32 import re as RX
  33 import sqlite3 as DB
  34 import stat as ST
  35 import time as T
  36 import zlib as Z
  37
## Package name and version string.  The `@...@' markers are presumably
## replaced by the build system when this template is turned into the
## installed script -- TODO confirm against the build rules.
PACKAGE = '@PACKAGE@'
VERSION = '@VERSION@'
  40
  41 ###--------------------------------------------------------------------------
  42 ### Utilities.
  43
  44 def excval(): return exc_info()[1]
  45
## The name this program was invoked under (last component of argv[0]);
## used as the prefix on diagnostic messages.
QUIS = OS.path.basename(argv[0])
  47
  48 def moan(msg):
  49   stderr.write('%s: %s\n' % (QUIS, msg))
  50
  51 def die(msg, rc = 1):
  52   moan(msg)
  53   exit(rc)
  54
  55 SYSERR = 0
  56 def syserr(msg):
  57   global SYSERR
  58   moan(msg)
  59   SYSERR += 1
  60
  61 ###--------------------------------------------------------------------------
  62 ### File system enumeration.
  63
  64 class FileInfo (object):
  65   def __init__(me, file, st = None):
  66     me.name = file
  67     if st:
  68       me.st = st
  69       me.err = None
  70     else:
  71       try:
  72         me.st = OS.lstat(file)
  73         me.err = None
  74       except OSError:
  75         me.st = None
  76         me.err = excval()
  77
  78 def enum_walk(file, func):
  79
  80   def dirents(name):
  81     try:
  82       return OS.listdir(name)
  83     except OSError:
  84       syserr("failed to read directory `%s': %s" % (name, excval().strerror))
  85       return []
  86
  87   def dir(ee, dev):
  88     ff = []
  89     dd = []
  90     for e in ee:
  91       fi = FileInfo(e)
  92       if fi.st and fi.st.st_dev != dev: pass
  93       if fi.st and ST.S_ISDIR(fi.st.st_mode): dd.append(fi)
  94       else: ff.append(fi)
  95     ff.sort(key = lambda fi: fi.name)
  96     dd.sort(key = lambda fi: fi.name + '/')
  97     for f in ff:
  98       func(f)
  99     for d in dd:
 100       if d.st.st_dev == dev:
 101         func(d)
 102         dir([OS.path.join(d.name, e) for e in dirents(d.name)], dev)
 103
 104   if file.endswith('/'):
 105     cwd = OS.open('.', OS.O_RDONLY)
 106     try:
 107       OS.chdir(file)
 108       fi = FileInfo('.')
 109       func(fi)
 110       dir(dirents('.'), fi.st.st_dev)
 111     finally:
 112       OS.fchdir(cwd)
 113       OS.close(cwd)
 114   else:
 115     fi = FileInfo(file)
 116     func(fi)
 117     if fi.st and ST.S_ISDIR(fi.st.st_mode):
 118       dir([OS.path.join(fi.name, e) for e in dirents(fi.name)],
 119           fi.st.st_dev)
 120
 121 def enum_find0(f, func):
 122   tail = ""
 123   while True:
 124     buf = f.read(8192)
 125     last = len(buf) == 0
 126     names = (tail + buf).split('\0')
 127     tail = names.pop()
 128     for n in names:
 129       func(FileInfo(n))
 130     if last:
 131       break
 132   if len(tail):
 133     moan("ignored trailing junk after last filename")
 134
 135 R_RSYNCESC = RX.compile(r'\\ \# ([0-7]{3})', RX.VERBOSE)
 136 def enum_rsync(f, func):
 137
 138   ## The format is a little fiddly.  Each line consists of PERMS SIZE DATE
 139   ## TIME NAME, separated by runs of whitespace, but the NAME starts exactly
 140   ## one space character after the TIME and may begin with a space.
 141   ## Sequences of the form `\#OOO', where OOO are three octal digits, stand
 142   ## for a byte with that value.  Newlines, and backslashes which would be
 143   ## ambiguous, are converted into this form; all other characters are
 144   ## literal.
 145   ##
 146   ## We ignore the stat information and retrieve it ourselves, because it's
 147   ## incomplete.  Hopefully the dcache is still warm.
 148
 149   for line in f:
 150     if line.endswith('\n'): line = line[:-1]
 151
 152     ## Extract the escaped name.
 153     ff = line.split(None, 3)
 154     if len(ff) != 4:
 155       syserr("ignoring invalid line from rsync: `%s'" % line)
 156       continue
 157     tail = ff[3]
 158     try:
 159       spc = tail.index(' ')
 160     except ValueError:
 161       syserr("ignoring invalid line from rsync: `%s'" % line)
 162       continue
 163     name = tail[spc + 1:]
 164
 165     ## Now translate escape sequences.
 166     name = R_RSYNCESC.sub(lambda m: chr(int(m.group(1), 8)), name)
 167
 168     ## Call the client.
 169     try:
 170       fi = FileInfo(name)
 171     except OSError:
 172       syserr("failed to stat `%s': %s" % (name, excval().strerror))
 173       continue
 174     func(fi)
 175
 176 ###--------------------------------------------------------------------------
 177 ### The hash cache.
 178
 179 class HashCache (object):
 180
 181   VERSION = 0
 182   BUFSZ = 128*1024
 183
 184   INIT = [
 185     """CREATE TABLE meta (
 186                version INTEGER NOT NULL,
 187                hash TEXT NOT NULL
 188        );""",
 189     """CREATE TABLE hash (
 190                ino INTEGER PRIMARY KEY,
 191                mtime INTEGER NOT NULL,
 192                ctime INTEGER NOT NULL,
 193                size INTEGER NOT NULL,
 194                hash TEXT NOT NULL,
 195                seen BOOLEAN NOT NULL DEFAULT TRUE
 196        );""",
 197     """PRAGMA journal_mode = WAL;"""
 198   ]
 199
 200   def __init__(me, file, hash = None):
 201
 202     if file is None:
 203
 204       ## We're going this alone, with no cache.
 205       db = None
 206       if hash is None:
 207         die("no hash specified and no database cache to read from")
 208     else:
 209
 210       ## Connect to the database.
 211       db = DB.connect(file)
 212       db.text_factory = str
 213
 214       ## See whether we can understand the cache database.
 215       c = db.cursor()
 216       v = h = None
 217       try:
 218         c.execute('SELECT version, hash FROM meta')
 219         v, h = c.fetchone()
 220         if c.fetchone() is not None:
 221           die("cache database corrupt: meta table has mutliple rows")
 222       except (DB.Error, TypeError):
 223         pass
 224
 225       ## If that didn't work, we'd better clear the thing and start again.
 226       ## But only if we know how to initialize it.
 227       if v != me.VERSION:
 228
 229         ## Explain the situation.
 230         moan("cache version %s not understood" % v)
 231         if hash is None:
 232           if h is None:
 233             die("can't initialize cache: no hash function set")
 234           else:
 235             hash = h
 236         try:
 237           H.new(hash)
 238         except Exception:
 239           die("unknown hash function `%s'" % hash)
 240
 241         ## Drop old things.
 242         c.execute('SELECT type, name FROM sqlite_master')
 243         for type, name in c.fetchall():
 244           c.execute('DROP %s IF EXISTS %s' % (type, name))
 245
 246         ## Now we're ready to go.
 247         for stmt in me.INIT:
 248           c.execute(stmt)
 249         c.execute('INSERT INTO meta VALUES (?, ?)', [me.VERSION, hash])
 250         db.commit()
 251
 252       ## Check the hash function if necessary.
 253       if hash is None:
 254         hash = h
 255       elif h is not None and  h != hash:
 256         die("hash mismatch: cache uses %s but %s requested" % (h, hash))
 257
 258     ## All done.
 259     me.hash = hash
 260     me._db = db
 261     me._pend = 0
 262
 263   def hashfile(me, fi):
 264
 265     ## If this isn't a proper file then don't try to hash it.
 266     if fi.err or not ST.S_ISREG(fi.st.st_mode):
 267       return None
 268
 269     ## See whether there's a valid entry in the cache.
 270     if me._db:
 271       c = me._db.cursor()
 272       c.execute(
 273         'SELECT mtime, size, hash, seen FROM hash WHERE ino = ?;',
 274         [fi.st.st_ino])
 275       r = c.fetchone()
 276       if r is not None:
 277         mt, sz, h, s = r
 278         if mt == fi.st.st_mtime and \
 279            sz == fi.st.st_size:
 280           if not s:
 281             c.execute('UPDATE hash SET seen = 1 WHERE ino = ?',
 282                       [fi.st.st_ino])
 283           me._update()
 284           return h
 285
 286     ## Hash the file.  Beware raciness: update the file information from the
 287     ## open descriptor, but set the size from what we actually read.
 288     h = H.new(me.hash)
 289     try:
 290       with open(fi.name, 'rb') as f:
 291         sz = 0
 292         while True:
 293           buf = f.read(me.BUFSZ)
 294           if len(buf) == 0:
 295             break
 296           sz += len(buf)
 297           h.update(buf)
 298         fi.st = OS.fstat(f.fileno())
 299         ##fi.st.st_size = sz
 300       hash = h.digest()
 301     except (OSError, IOError):
 302       fi.st = None
 303       fi.err = excval()
 304       return None
 305     hash = B.hexlify(hash)
 306
 307     ## Insert a record into the database.
 308     if me._db:
 309       c.execute("""
 310               INSERT OR REPLACE INTO hash
 311                       (ino, mtime, ctime, size, hash, seen)
 312               VALUES
 313                       (?, ?, ?, ?, ?, 1);
 314       """, [fi.st.st_ino,
 315             fi.st.st_mtime,
 316             fi.st.st_ctime,
 317             fi.st.st_size,
 318             hash])
 319       me._update()
 320
 321     ## Done.
 322     return hash
 323
 324   def _update(me):
 325     me._pend += 1
 326     if me._pend >= 1024:
 327       me.flush()
 328
 329   def flush(me):
 330     if me._db:
 331       me._db.commit()
 332     me._pend = 0
 333
 334   def need_db(me):
 335     if not me._db:
 336       die("no cache database")
 337
 338   def forget(me, ino):
 339     me.need_db()
 340     c = me._db.cursor()
 341     c.execute('DELETE FROM hash WHERE ino = ?', [ino])
 342
 343   def reset(me):
 344     me.need_db()
 345     c = me._db.cursor()
 346     c.execute('UPDATE hash SET seen = 0 WHERE seen')
 347     me.flush()
 348
 349   def prune(me):
 350     me.need_db()
 351     c = me._db.cursor()
 352     c.execute('DELETE FROM hash WHERE NOT seen')
 353     me.flush()
 354
 355 ###--------------------------------------------------------------------------
 356 ### Printing output.
 357
 358 class GenericFormatter (object):
 359   def __init__(me, fi):
 360     me.fi = fi
 361   def _fmt_time(me, t):
 362     tm = T.gmtime(t)
 363     return T.strftime('%Y-%m-%dT%H:%M:%SZ', tm)
 364   def _enc_name(me, n):
 365     return ' \\-> '.join(n.encode('string_escape').split(' -> '))
 366   def name(me):
 367     return me._enc_name(me.fi.name)
 368   def info(me):
 369     return me.TYPE
 370   def mode(me):
 371     return '%06o' % me.fi.st.st_mode
 372   def size(me):
 373     return me.fi.st.st_size
 374   def mtime(me):
 375     return me._fmt_time(me.fi.st.st_mtime)
 376   def owner(me):
 377     return '%5d:%d' % (me.fi.st.st_uid, me.fi.st.st_gid)
 378
 379 class ErrorFormatter (GenericFormatter):
 380   def info(me):
 381     return 'E%d %s' % (me.fi.err.errno, me.fi.err.strerror)
 382   def error(me): return 'error'
 383   mode = size = mtime = owner = error
 384
class SocketFormatter (GenericFormatter):
  """Formatter for sockets."""
  TYPE = 'socket'
class PipeFormatter (GenericFormatter):
  """Formatter for named pipes (FIFOs)."""
  TYPE = 'fifo'
 389
 390 class LinkFormatter (GenericFormatter):
 391   TYPE = 'symbolic-link'
 392   def name(me):
 393     n = GenericFormatter.name(me)
 394     try:
 395       d = OS.readlink(me.fi.name)
 396       return '%s -> %s' % (n, me._enc_name(d))
 397     except OSError:
 398       err = excval()
 399       return '%s -> <E%d %s>' % (n, err.errno, err.strerror)
 400
 401 class DirectoryFormatter (GenericFormatter):
 402   TYPE = 'directory'
 403   def name(me): return GenericFormatter.name(me) + '/'
 404   def size(me): return 'dir'
 405
 406 class DeviceFormatter (GenericFormatter):
 407   def info(me):
 408     return '%s %d:%d' % (me.TYPE,
 409                          OS.major(me.fi.st.st_rdev),
 410                          OS.minor(me.fi.st.st_rdev))
class BlockDeviceFormatter (DeviceFormatter):
  """Formatter for block device nodes."""
  TYPE = 'block-device'
class CharDeviceFormatter (DeviceFormatter):
  """Formatter for character device nodes."""
  TYPE = 'character-device'
 415
class FileFormatter (GenericFormatter):
  """Formatter for regular files."""
  TYPE = 'regular-file'
 418
 419 class Reporter (object):
 420
 421   TYMAP = {
 422     ST.S_IFSOCK: SocketFormatter,
 423     ST.S_IFDIR: DirectoryFormatter,
 424     ST.S_IFLNK: LinkFormatter,
 425     ST.S_IFREG: FileFormatter,
 426     ST.S_IFBLK: BlockDeviceFormatter,
 427     ST.S_IFCHR: CharDeviceFormatter,
 428     ST.S_IFIFO: PipeFormatter,
 429   }
 430
 431   def __init__(me, db):
 432     me._inomap = {}
 433     me._vinomap = {}
 434     me._db = db
 435     me._hsz = int(H.new(db.hash).digest_size)
 436
 437   def file(me, fi):
 438     h = me._db.hashfile(fi)
 439     if fi.err:
 440       fmt = ErrorFormatter(fi)
 441       vino = 'error'
 442     else:
 443       fmt = me.TYMAP[ST.S_IFMT(fi.st.st_mode)](fi)
 444       inoidx = fi.st.st_dev, fi.st.st_ino
 445       try:
 446         vino = me._inomap[inoidx]
 447       except KeyError:
 448         suffix = ''
 449         seq = 0
 450         while True:
 451           vino = '%08x' % (Z.crc32(fi.name + suffix) & 0xffffffff)
 452           if vino not in me._vinomap: break
 453           suffix = '\0%d' % seq
 454           seq += 1
 455         me._inomap[inoidx] = vino
 456         if OPTS.compat >= 2: me._vinomap[vino] = inoidx
 457     if h: info = h
 458     else: info = '[%-*s]' % (2*me._hsz - 2, fmt.info())
 459     print('%s %8s %6s %-12s %-20s %20s %s' %
 460           (info, vino, fmt.mode(), fmt.owner(),
 461            fmt.mtime(), fmt.size(), fmt.name()))
 462
 463 ###--------------------------------------------------------------------------
 464 ### Database clearing from diff files.
 465
## Match a unified-diff hunk header, capturing the old and new line counts.
## NOTE(review): headers which omit a count (diff writes `@@ -5 +5 @@' for
## a one-line range) or carry trailing text after the final `@@' don't
## match, so such hunks would be skipped -- confirm whether the diffs fed
## to this program can contain them.
R_HUNK = RX.compile(r'^@@ -\d+,(\d+) \+\d+,(\d+) @@$')
 467
 468 def clear_entry(db, lno, line):
 469
 470   good = True
 471
 472   if line.startswith('['):
 473     pos = line.find(']')
 474     if pos < 0:
 475       moan("failed to parse file entry (type field; line %d)" % lno)
 476       return False
 477     ty = line[1:pos].strip()
 478     rest = line[pos + 1:]
 479     hash = None
 480   else:
 481     ff = line.split(None, 1)
 482     if len(ff) != 2:
 483       moan("failed to parse file entry (field split; line %d)" % lno)
 484       return False
 485     ty = 'regular-file'
 486     hash, rest = ff
 487
 488   ff = rest.split(None, 5)
 489   if len(ff) != 6:
 490     moan("failed to parse file entry (field split; line %d)" % lno)
 491     return False
 492   ino, mode, uidgid, mtime, sz, name = ff
 493
 494   if ty != 'symbolic-link':
 495     target = None
 496   else:
 497     nn = name.split(' -> ', 1)
 498     if len(nn) != 2:
 499       moan("failed to parse file entry (name split; line %d)" % lno)
 500       return False
 501     name, target = nn
 502     target = target.decode('string_escape')
 503   name = name.decode('string_escape')
 504
 505   try:
 506     st = OS.lstat(name)
 507   except OSError:
 508     e = excval()
 509     moan("failed to stat `%s': %s" % (name, e.strerror))
 510     if e.errno != E.ENOENT: good = False
 511   else:
 512     print("Clear cache entry for `%s'" % name)
 513     db.forget(st.st_ino)
 514
 515   return good
 516
 517 def clear_cache(db):
 518
 519   ## Work through the input diff file one line at a time.
 520   diffstate = 'gap'
 521   lno = 0
 522   good = True
 523   for line in stdin:
 524     if line.endswith('\n'): line = line[:-1]
 525     lno += 1
 526
 527     ## We're in a gap between hunks.  Find a hunk header and extract the line
 528     ## counts.
 529     if diffstate == 'gap':
 530       m = R_HUNK.match(line)
 531       if m:
 532         oldlines = int(m.group(1))
 533         newlines = int(m.group(2))
 534         diffstate = 'hunk'
 535         hdrlno = lno
 536
 537     ## We're in a hunk.  Keep track of whether we've reached the end, and
 538     ## discard entries from the cache for mismatching lines.
 539     elif diffstate == 'hunk':
 540       if len(line) == 0:
 541         moan("empty line in diff hunk (line %d)" % lno)
 542         good = False
 543       ty = line[0]
 544       if ty == ' ':
 545         oldlines -= 1; newlines -= 1
 546       elif ty == '+':
 547         newlines -= 1
 548         if not clear_entry(db, lno, line[1:]): good = False
 549       elif ty == '-':
 550         oldlines -= 1
 551         if not clear_entry(db, lno, line[1:]): good = False
 552       else:
 553         moan("incomprehensible line in diff hunk (line %d)" % lno)
 554         good = false
 555       if oldlines < 0 or newlines < 0:
 556         moan("inconsistent lengths in diff hunk header (line %d)" % hdrlno)
 557         good = False
 558       if oldlines == newlines == 0:
 559         diffstate = 'gap'
 560
 561   if diffstate == 'hunk':
 562     moan("truncated diff hunk (started at line %d)" % hdrlno)
 563     good = False
 564
 565   return good
 566
 567 ###--------------------------------------------------------------------------
 568 ### Main program.
 569
## Map each `-f' format name to a function which reads file names in that
## format from standard input and passes each file to its argument.
FMTMAP = {
  'rsync': lambda f: enum_rsync(stdin, f),
  'find0': lambda f: enum_find0(stdin, f)
}
 574 op = OP.OptionParser(
 575   usage = '%prog [-au] [-c CACHE] [-f FORMAT] [-H HASH] [FILE ...]',
 576   version = '%%prog, version %s' % VERSION,
 577   description = '''\
 578 Print a digest of a filesystem (or a collection of specified files) to
 579 standard output.  The idea is that the digest should be mostly /complete/
 580 (i.e., any `interesting\' change to the filesystem results in a different
 581 digest) and /canonical/ (i.e., identical filesystem contents result in
 582 identical output).
 583 ''')
 584
 585 for short, long, props in [
 586   ('-a', '--all', { 'action': 'store_true', 'dest': 'all',
 587                     'help': 'clear cache of all files not seen' }),
 588   ('-c', '--cache', { 'dest': 'cache', 'metavar': 'FILE',
 589                       'help': 'use FILE as a cache for file hashes' }),
 590   ('-f', '--files', { 'dest': 'files', 'metavar': 'FORMAT',
 591                       'type': 'choice', 'choices': FMTMAP.keys(),
 592                       'help': 'read files to report in the given FORMAT' }),
 593   ('-u', '--udiff', { 'action': 'store_true', 'dest': 'udiff',
 594                       'help': 'read diff from stdin, clear cache entries' }),
 595   ('-C', '--compat', { 'dest': 'compat', 'metavar': 'VERSION',
 596                        'type': 'int', 'default': 2,
 597                        'help': 'produce output with given compatibility VERSION' }),
 598   ('-H', '--hash', { 'dest': 'hash', 'metavar': 'HASH',
 599                      ##'type': 'choice', 'choices': H.algorithms,
 600                      'help': 'use HASH as the hash function' })]:
 601   op.add_option(short, long, **props)
 602 OPTS, args = op.parse_args(argv)
 603 if not 1 <= OPTS.compat <= 2:
 604   die("unknown compatibility version %d" % OPTS.compat)
 605 if OPTS.udiff:
 606   if OPTS.cache is None or OPTS.all or OPTS.files or len(args) > 2:
 607     die("incompatible options: `-u' requires `-c CACHE', forbids others")
 608   db = HashCache(OPTS.cache, OPTS.hash)
 609   if len(args) == 2: OS.chdir(args[1])
 610   good = True
 611   if not clear_cache(db): good = False
 612   if good: db.flush()
 613   else: exit(2)
 614 else:
 615   if not OPTS.files and len(args) <= 1:
 616     die("no filename sources: nothing to do")
 617   db = HashCache(OPTS.cache, OPTS.hash)
 618   if OPTS.all:
 619     db.reset()
 620   if OPTS.compat >= 2:
 621     print("## fshash report format version %d" % OPTS.compat)
 622   rep = Reporter(db)
 623   if OPTS.files:
 624     FMTMAP[OPTS.files](rep.file)
 625   for dir in args[1:]:
 626     enum_walk(dir, rep.file)
 627   if OPTS.all:
 628     db.prune()
 629   db.flush()
 630
 631 ###----- That's all, folks --------------------------------------------------