chiark - git - mdw - rsync-backup/blob - fshash.in

   1 #! @PYTHON@
   2 ###
   3 ### Efficiently construct canonical digests of filesystems
   4 ###
   5 ### (c) 2012 Mark Wooding
   6 ###
   7
   8 ###----- Licensing notice ---------------------------------------------------
   9 ###
  10 ### This file is part of the `rsync-backup' program.
  11 ###
  12 ### rsync-backup is free software; you can redistribute it and/or modify
  13 ### it under the terms of the GNU General Public License as published by
  14 ### the Free Software Foundation; either version 2 of the License, or
  15 ### (at your option) any later version.
  16 ###
  17 ### rsync-backup is distributed in the hope that it will be useful,
  18 ### but WITHOUT ANY WARRANTY; without even the implied warranty of
  19 ### MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  20 ### GNU General Public License for more details.
  21 ###
  22 ### You should have received a copy of the GNU General Public License
  23 ### along with rsync-backup; if not, write to the Free Software Foundation,
  24 ### Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
  25
  26 from sys import argv, exit, stdin, stdout, stderr
  27 import os as OS
  28 import re as RX
  29 import time as T
  30 import errno as E
  31 import stat as ST
  32 import optparse as OP
  33 import hashlib as H
  34 import sqlite3 as DB
  35 import zlib as Z
  36
  37 PACKAGE = '@PACKAGE@'
  38 VERSION = '@VERSION@'
  39
  40 ###--------------------------------------------------------------------------
  41 ### Utilities.
  42
  43 QUIS = OS.path.basename(argv[0])
  44
  45 def moan(msg):
  46   stderr.write('%s: %s\n' % (QUIS, msg))
  47
  48 def die(msg, rc = 1):
  49   moan(msg)
  50   exit(rc)
  51
  52 SYSERR = 0
  53 def syserr(msg):
  54   global SYSERR
  55   moan(msg)
  56   SYSERR += 1
  57
  58 ###--------------------------------------------------------------------------
  59 ### File system enumeration.
  60
  61 class FileInfo (object):
  62   def __init__(me, file, st = None):
  63     me.name = file
  64     if st:
  65       me.st = st
  66       me.err = None
  67     else:
  68       try:
  69         me.st = OS.lstat(file)
  70         me.err = None
  71       except OSError, err:
  72         me.st = None
  73         me.err = err
  74
  75 def enum_walk(file, func):
  76
  77   def dirents(name):
  78     try:
  79       return OS.listdir(name)
  80     except OSError, err:
  81       syserr("failed to read directory `%s': %s" % (name, err.strerror))
  82       return []
  83
  84   def dir(ee, dev):
  85     ff = []
  86     dd = []
  87     for e in ee:
  88       fi = FileInfo(e)
  89       if fi.st and fi.st.st_dev != dev: pass
  90       if fi.st and ST.S_ISDIR(fi.st.st_mode): dd.append(fi)
  91       else: ff.append(fi)
  92     ff.sort(key = lambda fi: fi.name)
  93     dd.sort(key = lambda fi: fi.name + '/')
  94     for f in ff:
  95       func(f)
  96     for d in dd:
  97       if d.st.st_dev == dev:
  98         func(d)
  99         dir([OS.path.join(d.name, e) for e in dirents(d.name)], dev)
 100
 101   if file.endswith('/'):
 102     cwd = OS.open('.', OS.O_RDONLY)
 103     try:
 104       OS.chdir(file)
 105       fi = FileInfo('.')
 106       func(fi)
 107       dir(dirents('.'), fi.st.st_dev)
 108     finally:
 109       OS.fchdir(cwd)
 110       OS.close(cwd)
 111   else:
 112     fi = FileInfo(file)
 113     func(fi)
 114     if fi.st and ST.S_ISDIR(fi.st.st_mode):
 115       dir([OS.path.join(fi.name, e) for e in dirents(fi.name)],
 116           fi.st.st_dev)
 117
 118 def enum_find0(f, func):
 119   tail = ""
 120   while True:
 121     buf = f.read(8192)
 122     last = len(buf) == 0
 123     names = (tail + buf).split('\0')
 124     tail = names.pop()
 125     for n in names:
 126       func(FileInfo(n))
 127     if last:
 128       break
 129   if len(tail):
 130     moan("ignored trailing junk after last filename")
 131
 132 RX_RSYNCESC = RX.compile(r'\\ \# ([0-7]{3})', RX.VERBOSE)
 133 def enum_rsync(f, func):
 134
 135   ## The format is a little fiddly.  Each line consists of PERMS SIZE DATE
 136   ## TIME NAME, separated by runs of whitespace, but the NAME starts exactly
 137   ## one space character after the TIME and may begin with a space.
 138   ## Sequences of the form `\#OOO' where OOO are three octal digits, stand
 139   ## for a byte with that value.  Newlines and backslashes which would be
 140   ## ambiguous are converted into this form; all other characters are
 141   ## literal.
 142   ##
 143   ## We ignore the stat information and retrieve it ourselves, because it's
 144   ## incomplete.  Hopefully the dcache is still warm.
 145
 146   for line in f:
 147     if line.endswith('\n'): line = line[:-1]
 148
 149     ## Extract the escaped name.
 150     ff = line.split(None, 3)
 151     if len(ff) != 4:
 152       syserr("ignoring invalid line from rsync: `%s'" % line)
 153       continue
 154     tail = ff[3]
 155     try:
 156       spc = tail.index(' ')
 157     except ValueError:
 158       syserr("ignoring invalid line from rsync: `%s'" % line)
 159       continue
 160     name = tail[spc + 1:]
 161
 162     ## Now translate escape sequences.
 163     name = RX_RSYNCESC.sub(lambda m: chr(int(m.group(1), 8)), name)
 164
 165     ## Call the client.
 166     try:
 167       fi = FileInfo(name)
 168     except OSError, err:
 169       syserr("failed to stat `%s': %s" % (name, err.strerror))
 170       continue
 171     func(fi)
 172
 173 ###--------------------------------------------------------------------------
 174 ### The hash cache.
 175
 176 class HashCache (object):
 177
 178   VERSION = 0
 179   BUFSZ = 128*1024
 180
 181   INIT = [
 182     """CREATE TABLE meta (
 183                version INTEGER NOT NULL,
 184                hash TEXT NOT NULL
 185        );""",
 186     """CREATE TABLE hash (
 187                ino INTEGER PRIMARY KEY,
 188                mtime INTEGER NOT NULL,
 189                ctime INTEGER NOT NULL,
 190                size INTEGER NOT NULL,
 191                hash TEXT NOT NULL,
 192                seen BOOLEAN NOT NULL DEFAULT TRUE
 193        );""",
 194     """PRAGMA journal_mode = WAL;"""
 195   ]
 196
 197   def __init__(me, file, hash = None):
 198
 199     if file is None:
 200
 201       ## We're going this alone, with no cache.
 202       db = None
 203       if hash is None:
 204         die("no hash specified and no database cache to read from")
 205     else:
 206
 207       ## Connect to the database.
 208       db = DB.connect(file)
 209       db.text_factory = str
 210
 211       ## See whether we can understand the cache database.
 212       c = db.cursor()
 213       v = h = None
 214       try:
 215         c.execute('SELECT version, hash FROM meta')
 216         v, h = c.fetchone()
 217         if c.fetchone() is not None:
 218           die("cache database corrupt: meta table has mutliple rows")
 219       except (DB.Error, TypeError):
 220         pass
 221
 222       ## If that didn't work, we'd better clear the thing and start again.
 223       ## But only if we know how to initialize it.
 224       if v != me.VERSION:
 225
 226         ## Explain the situation.
 227         moan("cache version %s not understood" % v)
 228         if hash is None:
 229           if h is None:
 230             die("can't initialize cache: no hash function set")
 231           else:
 232             hash = h
 233         try:
 234           H.new(hash)
 235         except Exception:
 236           die("unknown hash function `%s'" % hash)
 237
 238         ## Drop old things.
 239         c.execute('SELECT type, name FROM sqlite_master')
 240         for type, name in c.fetchall():
 241           c.execute('DROP %s IF EXISTS %s' % (type, name))
 242
 243         ## Now we're ready to go.
 244         for stmt in me.INIT:
 245           c.execute(stmt)
 246         c.execute('INSERT INTO meta VALUES (?, ?)', [me.VERSION, hash])
 247         db.commit()
 248
 249       ## Check the hash function if necessary.
 250       if hash is None:
 251         hash = h
 252       elif h is not None and  h != hash:
 253         die("hash mismatch: cache uses %s but %s requested" % (h, hash))
 254
 255     ## All done.
 256     me.hash = hash
 257     me._db = db
 258     me._pend = 0
 259
 260   def hashfile(me, fi):
 261
 262     ## If this isn't a proper file then don't try to hash it.
 263     if fi.err or not ST.S_ISREG(fi.st.st_mode):
 264       return None
 265
 266     ## See whether there's a valid entry in the cache.
 267     if me._db:
 268       c = me._db.cursor()
 269       c.execute(
 270         'SELECT mtime, size, hash, seen FROM hash WHERE ino = ?;',
 271         [fi.st.st_ino])
 272       r = c.fetchone()
 273       if r is not None:
 274         mt, sz, h, s = r
 275         if mt == fi.st.st_mtime and \
 276            sz == fi.st.st_size:
 277           if not s:
 278             c.execute('UPDATE hash SET seen = 1 WHERE ino = ?',
 279                       [fi.st.st_ino])
 280           me._update()
 281           return h
 282
 283     ## Hash the file.  Beware raciness: update the file information from the
 284     ## open descriptor, but set the size from what we actually read.
 285     h = H.new(me.hash)
 286     try:
 287       with open(fi.name, 'rb') as f:
 288         sz = 0
 289         while True:
 290           buf = f.read(me.BUFSZ)
 291           if len(buf) == 0:
 292             break
 293           sz += len(buf)
 294           h.update(buf)
 295         fi.st = OS.fstat(f.fileno())
 296         ##fi.st.st_size = sz
 297       hash = h.digest()
 298     except (OSError, IOError), err:
 299       fi.st = None
 300       fi.err = err
 301       return None
 302     hash = hash.encode('hex')
 303
 304     ## Insert a record into the database.
 305     if me._db:
 306       c.execute("""
 307               INSERT OR REPLACE INTO hash
 308                       (ino, mtime, ctime, size, hash, seen)
 309               VALUES
 310                       (?, ?, ?, ?, ?, 1);
 311       """, [fi.st.st_ino,
 312             fi.st.st_mtime,
 313             fi.st.st_ctime,
 314             fi.st.st_size,
 315             hash])
 316       me._update()
 317
 318     ## Done.
 319     return hash
 320
 321   def _update(me):
 322     me._pend += 1
 323     if me._pend >= 1024:
 324       me.flush()
 325
 326   def flush(me):
 327     if me._db:
 328       me._db.commit()
 329     me._pend = 0
 330
 331   def need_db(me):
 332     if not me._db:
 333       die("no cache database")
 334
 335   def forget(me, ino):
 336     me.need_db()
 337     c = me._db.cursor()
 338     c.execute('DELETE FROM hash WHERE ino = ?', [ino])
 339
 340   def reset(me):
 341     me.need_db()
 342     c = me._db.cursor()
 343     c.execute('UPDATE hash SET seen = 0 WHERE seen')
 344     me.flush()
 345
 346   def prune(me):
 347     me.need_db()
 348     c = me._db.cursor()
 349     c.execute('DELETE FROM hash WHERE NOT seen')
 350     me.flush()
 351
 352 ###--------------------------------------------------------------------------
 353 ### Printing output.
 354
 355 class GenericFormatter (object):
 356   def __init__(me, fi):
 357     me.fi = fi
 358   def _fmt_time(me, t):
 359     tm = T.gmtime(t)
 360     return T.strftime('%Y-%m-%dT%H:%M:%SZ', tm)
 361   def _enc_name(me, n):
 362     return ' \\-> '.join(n.encode('string_escape').split(' -> '))
 363   def name(me):
 364     return me._enc_name(me.fi.name)
 365   def info(me):
 366     return me.TYPE
 367   def mode(me):
 368     return '%06o' % me.fi.st.st_mode
 369   def size(me):
 370     return me.fi.st.st_size
 371   def mtime(me):
 372     return me._fmt_time(me.fi.st.st_mtime)
 373   def owner(me):
 374     return '%5d:%d' % (me.fi.st.st_uid, me.fi.st.st_gid)
 375
 376 class ErrorFormatter (GenericFormatter):
 377   def info(me):
 378     return 'E%d %s' % (me.fi.err.errno, me.fi.err.strerror)
 379   def error(me): return 'error'
 380   mode = size = mtime = owner = error
 381
 382 class SocketFormatter (GenericFormatter):
 383   TYPE = 'socket'
 384 class PipeFormatter (GenericFormatter):
 385   TYPE = 'fifo'
 386
 387 class LinkFormatter (GenericFormatter):
 388   TYPE = 'symbolic-link'
 389   def name(me):
 390     n = GenericFormatter.name(me)
 391     try:
 392       d = OS.readlink(me.fi.name)
 393       return '%s -> %s' % (n, me._enc_name(d))
 394     except OSError, err:
 395       return '%s -> <E%d %s>' % (n, err.errno, err.strerror)
 396
 397 class DirectoryFormatter (GenericFormatter):
 398   TYPE = 'directory'
 399   def name(me): return GenericFormatter.name(me) + '/'
 400   def size(me): return 'dir'
 401
 402 class DeviceFormatter (GenericFormatter):
 403   def info(me):
 404     return '%s %d:%d' % (me.TYPE,
 405                          OS.major(me.fi.st.st_rdev),
 406                          OS.minor(me.fi.st.st_rdev))
 407 class BlockDeviceFormatter (DeviceFormatter):
 408   TYPE = 'block-device'
 409 class CharDeviceFormatter (DeviceFormatter):
 410   TYPE = 'character-device'
 411
 412 class FileFormatter (GenericFormatter):
 413   TYPE = 'regular-file'
 414
 415 class Reporter (object):
 416
 417   TYMAP = {
 418     ST.S_IFSOCK: SocketFormatter,
 419     ST.S_IFDIR: DirectoryFormatter,
 420     ST.S_IFLNK: LinkFormatter,
 421     ST.S_IFREG: FileFormatter,
 422     ST.S_IFBLK: BlockDeviceFormatter,
 423     ST.S_IFCHR: CharDeviceFormatter,
 424     ST.S_IFIFO: PipeFormatter,
 425   }
 426
 427   def __init__(me, db):
 428     me._inomap = {}
 429     me._vinomap = {}
 430     me._db = db
 431     me._hsz = int(H.new(db.hash).digest_size)
 432
 433   def file(me, fi):
 434     h = me._db.hashfile(fi)
 435     if fi.err:
 436       fmt = ErrorFormatter(fi)
 437       vino = 'error'
 438     else:
 439       fmt = me.TYMAP[ST.S_IFMT(fi.st.st_mode)](fi)
 440       inoidx = fi.st.st_dev, fi.st.st_ino
 441       try:
 442         vino = me._inomap[inoidx]
 443       except KeyError:
 444         suffix = ''
 445         seq = 0
 446         while True:
 447           vino = '%08x' % (Z.crc32(fi.name + suffix) & 0xffffffff)
 448           if vino not in me._vinomap: break
 449           suffix = '\0%d' % seq
 450           seq += 1
 451         me._inomap[inoidx] = vino
 452         if OPTS.compat >= 2: me._vinomap[vino] = inoidx
 453     if h: info = h
 454     else: info = '[%-*s]' % (2*me._hsz - 2, fmt.info())
 455     print '%s %8s %6s %-12s %-20s %20s %s' % (
 456       info, vino, fmt.mode(), fmt.owner(),
 457       fmt.mtime(), fmt.size(), fmt.name())
 458
 459 ###--------------------------------------------------------------------------
 460 ### Database clearing from diff files.
 461
 462 R_HUNK = RX.compile(r'^@@ -\d+,(\d+) \+\d+,(\d+) @@$')
 463
 464 def clear_entry(db, lno, line):
 465
 466   good = True
 467
 468   if line.startswith('['):
 469     pos = line.find(']')
 470     if pos < 0:
 471       moan("failed to parse file entry (type field; line %d)" % lno)
 472       return False
 473     ty = line[1:pos].strip()
 474     rest = line[pos + 1:]
 475     hash = None
 476   else:
 477     ff = line.split(None, 1)
 478     if len(ff) != 2:
 479       moan("failed to parse file entry (field split; line %d)" % lno)
 480       return False
 481     ty = 'regular-file'
 482     hash, rest = ff
 483
 484   ff = rest.split(None, 5)
 485   if len(ff) != 6:
 486     moan("failed to parse file entry (field split; line %d)" % lno)
 487     return False
 488   ino, mode, uidgid, mtime, sz, name = ff
 489
 490   if ty != 'symbolic-link':
 491     target = None
 492   else:
 493     nn = name.split(' -> ', 1)
 494     if len(nn) != 2:
 495       moan("failed to parse file entry (name split; line %d)" % lno)
 496       return False
 497     name, target = nn
 498     target = target.decode('string_escape')
 499   name = name.decode('string_escape')
 500
 501   try:
 502     st = OS.lstat(name)
 503   except OSError, e:
 504     moan("failed to stat `%s': %s" % (name, e.strerror))
 505     if e.errno != E.ENOENT: good = False
 506   else:
 507     print "Clear cache entry for `%s'" % name
 508     db.forget(st.st_ino)
 509
 510   return good
 511
 512 def clear_cache(db):
 513
 514   ## Work through the input diff file one line at a time.
 515   diffstate = 'gap'
 516   lno = 0
 517   good = True
 518   for line in stdin:
 519     if line.endswith('\n'): line = line[:-1]
 520     lno += 1
 521
 522     ## We're in a gap between hunks.  Find a hunk header and extract the line
 523     ## counts.
 524     if diffstate == 'gap':
 525       m = R_HUNK.match(line)
 526       if m:
 527         oldlines = int(m.group(1))
 528         newlines = int(m.group(2))
 529         diffstate = 'hunk'
 530         hdrlno = lno
 531
 532     ## We're in a hunk.  Keep track of whether we've reached the end, and
 533     ## discard entries from the cache for mismatching lines.
 534     elif diffstate == 'hunk':
 535       if len(line) == 0:
 536         moan("empty line in diff hunk (line %d)" % lno)
 537         good = False
 538       ty = line[0]
 539       if ty == ' ':
 540         oldlines -= 1; newlines -= 1
 541       elif ty == '+':
 542         newlines -= 1
 543         if not clear_entry(db, lno, line[1:]): good = False
 544       elif ty == '-':
 545         oldlines -= 1
 546         if not clear_entry(db, lno, line[1:]): good = False
 547       else:
 548         moan("incomprehensible line in diff hunk (line %d)" % lno)
 549         good = false
 550       if oldlines < 0 or newlines < 0:
 551         moan("inconsistent lengths in diff hunk header (line %d)" % hdrlno)
 552         good = False
 553       if oldlines == newlines == 0:
 554         diffstate = 'gap'
 555
 556   if diffstate == 'hunk':
 557     moan("truncated diff hunk (started at line %d)" % hdrlno)
 558     good = False
 559
 560   return good
 561
 562 ###--------------------------------------------------------------------------
 563 ### Main program.
 564
 565 FMTMAP = {
 566   'rsync': lambda f: enum_rsync(stdin, f),
 567   'find0': lambda f: enum_find0(stdin, f)
 568 }
 569 op = OP.OptionParser(
 570   usage = '%prog [-au] [-c CACHE] [-f FORMAT] [-H HASH] [FILE ...]',
 571   version = '%%prog, version %s' % VERSION,
 572   description = '''\
 573 Print a digest of a filesystem (or a collection of specified files) to
 574 standard output.  The idea is that the digest should be mostly /complete/
 575 (i.e., any `interesting\' change to the filesystem results in a different
 576 digest) and /canonical/ (i.e., identical filesystem contents result in
 577 identical output).
 578 ''')
 579
 580 for short, long, props in [
 581   ('-a', '--all', { 'action': 'store_true', 'dest': 'all',
 582                     'help': 'clear cache of all files not seen' }),
 583   ('-c', '--cache', { 'dest': 'cache', 'metavar': 'FILE',
 584                       'help': 'use FILE as a cache for file hashes' }),
 585   ('-f', '--files', { 'dest': 'files', 'metavar': 'FORMAT',
 586                       'type': 'choice', 'choices': FMTMAP.keys(),
 587                       'help': 'read files to report in the given FORMAT' }),
 588   ('-u', '--udiff', { 'action': 'store_true', 'dest': 'udiff',
 589                       'help': 'read diff from stdin, clear cache entries' }),
 590   ('-C', '--compat', { 'dest': 'compat', 'metavar': 'VERSION',
 591                        'type': 'int', 'default': 2,
 592                        'help': 'produce output with given compatibility VERSION' }),
 593   ('-H', '--hash', { 'dest': 'hash', 'metavar': 'HASH',
 594                      ##'type': 'choice', 'choices': H.algorithms,
 595                      'help': 'use HASH as the hash function' })]:
 596   op.add_option(short, long, **props)
 597 OPTS, args = op.parse_args(argv)
 598 if not 1 <= OPTS.compat <= 2:
 599   die("unknown compatibility version %d" % OPTS.compat)
 600 if OPTS.udiff:
 601   if OPTS.cache is None or OPTS.all or OPTS.files or len(args) > 2:
 602     die("incompatible options: `-u' requires `-c CACHE', forbids others")
 603   db = HashCache(OPTS.cache, OPTS.hash)
 604   if len(args) == 2: OS.chdir(args[1])
 605   good = True
 606   if not clear_cache(db): good = False
 607   if good: db.flush()
 608   else: exit(2)
 609 else:
 610   if not OPTS.files and len(args) <= 1:
 611     die("no filename sources: nothing to do")
 612   db = HashCache(OPTS.cache, OPTS.hash)
 613   if OPTS.all:
 614     db.reset()
 615   if OPTS.compat >= 2:
 616     print "## fshash report format version %d" % OPTS.compat
 617   rep = Reporter(db)
 618   if OPTS.files:
 619     FMTMAP[OPTS.files](rep.file)
 620   for dir in args[1:]:
 621     enum_walk(dir, rep.file)
 622   if OPTS.all:
 623     db.prune()
 624   db.flush()
 625
 626 ###----- That's all, folks --------------------------------------------------