chiark - git - mdw - rsync-backup/blob - fshash.in

   1 #! @PYTHON@
   2 ###
   3 ### Efficiently construct canonical digests of filesystems
   4 ###
   5 ### (c) 2012 Mark Wooding
   6 ###
   7
   8 ###----- Licensing notice ---------------------------------------------------
   9 ###
  10 ### This file is part of the `rsync-backup' program.
  11 ###
  12 ### rsync-backup is free software; you can redistribute it and/or modify
  13 ### it under the terms of the GNU General Public License as published by
  14 ### the Free Software Foundation; either version 2 of the License, or
  15 ### (at your option) any later version.
  16 ###
  17 ### rsync-backup is distributed in the hope that it will be useful,
  18 ### but WITHOUT ANY WARRANTY; without even the implied warranty of
  19 ### MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  20 ### GNU General Public License for more details.
  21 ###
  22 ### You should have received a copy of the GNU General Public License
  23 ### along with rsync-backup; if not, write to the Free Software Foundation,
  24 ### Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
  25
  26 from sys import argv, exit, stdin, stdout, stderr
  27 import os as OS
  28 import re as RX
  29 import time as T
  30 import errno as E
  31 import stat as ST
  32 import optparse as OP
  33 import hashlib as H
  34 import sqlite3 as DB
  35 import zlib as Z
  36
## Package name and version.  The `@...@' markers are placeholders; presumably
## they are substituted when the script is generated from `fshash.in' -- TODO
## confirm against the build system.  VERSION is shown by `--version'.
PACKAGE = '@PACKAGE@'
VERSION = '@VERSION@'
  39
  40 ###--------------------------------------------------------------------------
  41 ### Utilities.
  42
  43 QUIS = OS.path.basename(argv[0])
  44
  45 def moan(msg):
  46   stderr.write('%s: %s\n' % (QUIS, msg))
  47
  48 def die(msg, rc = 1):
  49   moan(msg)
  50   exit(rc)
  51
  52 SYSERR = 0
  53 def syserr(msg):
  54   global SYSERR
  55   moan(msg)
  56   SYSERR += 1
  57
  58 ###--------------------------------------------------------------------------
  59 ### File system enumeration.
  60
  61 class FileInfo (object):
  62   def __init__(me, file, st = None):
  63     me.name = file
  64     if st:
  65       me.st = st
  66       me.err = None
  67     else:
  68       try:
  69         me.st = OS.lstat(file)
  70         me.err = None
  71       except OSError, err:
  72         me.st = None
  73         me.err = err
  74
  75 def enum_walk(file, func):
  76
  77   def dirents(name):
  78     try:
  79       return OS.listdir(name)
  80     except OSError, err:
  81       syserr("failed to read directory `%s': %s" % (name, err.strerror))
  82       return []
  83
  84   def dir(ee, dev):
  85     ff = []
  86     dd = []
  87     for e in ee:
  88       fi = FileInfo(e)
  89       if fi.st and fi.st.st_dev != dev: pass
  90       if fi.st and ST.S_ISDIR(fi.st.st_mode): dd.append(fi)
  91       else: ff.append(fi)
  92     ff.sort(key = lambda fi: fi.name)
  93     dd.sort(key = lambda fi: fi.name + '/')
  94     for f in ff:
  95       func(f)
  96     for d in dd:
  97       if d.st.st_dev == dev:
  98         func(d)
  99         dir([OS.path.join(d.name, e) for e in dirents(d.name)], dev)
 100
 101   if file.endswith('/'):
 102     cwd = OS.open('.', OS.O_RDONLY)
 103     try:
 104       OS.chdir(file)
 105       fi = FileInfo('.')
 106       func(fi)
 107       dir(dirents('.'), fi.st.st_dev)
 108     finally:
 109       OS.fchdir(cwd)
 110       OS.close(cwd)
 111   else:
 112     fi = FileInfo(file)
 113     func(fi)
 114     if fi.st and ST.S_ISDIR(fi.st.st_mode):
 115       dir([OS.path.join(fi.name, e) for e in dirents(fi.name)],
 116           fi.st.st_dev)
 117
 118 def enum_find0(f, func):
 119   tail = ""
 120   while True:
 121     buf = f.read(8192)
 122     last = len(buf) == 0
 123     names = (tail + buf).split('\0')
 124     tail = names.pop()
 125     for n in names:
 126       func(FileInfo(n))
 127     if last:
 128       break
 129   if len(tail):
 130     moan("ignored trailing junk after last filename")
 131
 132 RX_RSYNCESC = RX.compile(r'\\ \# ([0-7]{3})', RX.VERBOSE)
 133 def enum_rsync(f, func):
 134
 135   ## The format is a little fiddly.  Each line consists of PERMS SIZE DATE
 136   ## TIME NAME, separated by runs of whitespace, but the NAME starts exactly
 137   ## one space character after the TIME and may begin with a space.
 138   ## Sequences of the form `\#OOO' where OOO are three octal digits, stand
 139   ## for a byte with that value.  Newlines and backslashes which would be
 140   ## ambiguous are converted into this form; all other characters are
 141   ## literal.
 142   ##
 143   ## We ignore the stat information and retrieve it ourselves, because it's
 144   ## incomplete.  Hopefully the dcache is still warm.
 145
 146   for line in f:
 147     if line.endswith('\n'): line = line[:-1]
 148
 149     ## Extract the escaped name.
 150     ff = line.split(None, 3)
 151     if len(ff) != 4:
 152       syserr("ignoring invalid line from rsync: `%s'" % line)
 153       continue
 154     tail = ff[3]
 155     try:
 156       spc = tail.index(' ')
 157     except ValueError:
 158       syserr("ignoring invalid line from rsync: `%s'" % line)
 159       continue
 160     name = tail[spc + 1:]
 161
 162     ## Now translate escape sequences.
 163     name = RX_RSYNCESC.sub(lambda m: chr(int(m.group(1), 8)), name)
 164
 165     ## Call the client.
 166     try:
 167       fi = FileInfo(name)
 168     except OSError, err:
 169       syserr("failed to stat `%s': %s" % (name, err.strerror))
 170       continue
 171     func(fi)
 172
 173 ###--------------------------------------------------------------------------
 174 ### The hash cache.
 175
 176 class HashCache (object):
 177
 178   VERSION = 0
 179   BUFSZ = 128*1024
 180
 181   INIT = [
 182     """CREATE TABLE meta (
 183                version INTEGER NOT NULL,
 184                hash TEXT NOT NULL
 185        );""",
 186     """CREATE TABLE hash (
 187                ino INTEGER PRIMARY KEY,
 188                mtime INTEGER NOT NULL,
 189                ctime INTEGER NOT NULL,
 190                size INTEGER NOT NULL,
 191                hash TEXT NOT NULL,
 192                seen BOOLEAN NOT NULL DEFAULT TRUE
 193        );""",
 194     """PRAGMA journal_mode = WAL;"""
 195   ]
 196
 197   def __init__(me, file, hash = None):
 198
 199     if file is None:
 200
 201       ## We're going this alone, with no cache.
 202       db = None
 203       if hash is None:
 204         die("no hash specified and no database cache to read from")
 205     else:
 206
 207       ## Connect to the database.
 208       db = DB.connect(file)
 209       db.text_factory = str
 210
 211       ## See whether we can understand the cache database.
 212       c = db.cursor()
 213       v = h = None
 214       try:
 215         c.execute('SELECT version, hash FROM meta')
 216         v, h = c.fetchone()
 217         if c.fetchone() is not None:
 218           die("cache database corrupt: meta table has mutliple rows")
 219       except (DB.Error, TypeError):
 220         pass
 221
 222       ## If that didn't work, we'd better clear the thing and start again.
 223       ## But only if we know how to initialize it.
 224       if v != me.VERSION:
 225
 226         ## Explain the situation.
 227         moan("cache version %s not understood" % v)
 228         if hash is None:
 229           if h is None:
 230             die("can't initialize cache: no hash function set")
 231           else:
 232             hash = h
 233         try:
 234           H.new(hash)
 235         except Exception:
 236           die("unknown hash function `%s'" % hash)
 237
 238         ## Drop old things.
 239         c.execute('SELECT type, name FROM sqlite_master')
 240         for type, name in c.fetchall():
 241           c.execute('DROP %s IF EXISTS %s' % (type, name))
 242
 243         ## Now we're ready to go.
 244         for stmt in me.INIT:
 245           c.execute(stmt)
 246         c.execute('INSERT INTO meta VALUES (?, ?)', [me.VERSION, hash])
 247         db.commit()
 248
 249       ## Check the hash function if necessary.
 250       if hash is None:
 251         hash = h
 252       elif h is not None and  h != hash:
 253         die("hash mismatch: cache uses %s but %s requested" % (h, hash))
 254
 255     ## All done.
 256     me.hash = hash
 257     me._db = db
 258     me._pend = 0
 259
 260   def hashfile(me, fi):
 261
 262     ## If this isn't a proper file then don't try to hash it.
 263     if fi.err or not ST.S_ISREG(fi.st.st_mode):
 264       return None
 265
 266     ## See whether there's a valid entry in the cache.
 267     if me._db:
 268       c = me._db.cursor()
 269       c.execute(
 270         'SELECT mtime, size, hash, seen FROM hash WHERE ino = ?;',
 271         [fi.st.st_ino])
 272       r = c.fetchone()
 273       if r is not None:
 274         mt, sz, h, s = r
 275         if mt == fi.st.st_mtime and \
 276            sz == fi.st.st_size:
 277           if not s:
 278             c.execute('UPDATE hash SET seen = 1 WHERE ino = ?',
 279                       [fi.st.st_ino])
 280           me._update()
 281           return h
 282
 283     ## Hash the file.  Beware raciness: update the file information from the
 284     ## open descriptor, but set the size from what we actually read.
 285     h = H.new(me.hash)
 286     try:
 287       with open(fi.name, 'rb') as f:
 288         sz = 0
 289         while True:
 290           buf = f.read(me.BUFSZ)
 291           if len(buf) == 0:
 292             break
 293           sz += len(buf)
 294           h.update(buf)
 295         fi.st = OS.fstat(f.fileno())
 296         ##fi.st.st_size = sz
 297       hash = h.digest()
 298     except (OSError, IOError), err:
 299       fi.st = None
 300       fi.err = err
 301       return None
 302     hash = hash.encode('hex')
 303
 304     ## Insert a record into the database.
 305     if me._db:
 306       c.execute("""
 307               INSERT OR REPLACE INTO hash
 308                       (ino, mtime, ctime, size, hash, seen)
 309               VALUES
 310                       (?, ?, ?, ?, ?, 1);
 311       """, [fi.st.st_ino,
 312             fi.st.st_mtime,
 313             fi.st.st_ctime,
 314             fi.st.st_size,
 315             hash])
 316       me._update()
 317
 318     ## Done.
 319     return hash
 320
 321   def _update(me):
 322     me._pend += 1
 323     if me._pend >= 1024:
 324       me.flush()
 325
 326   def flush(me):
 327     if me._db:
 328       me._db.commit()
 329     me._pend = 0
 330
 331   def need_db(me):
 332     if not me._db:
 333       die("no cache database")
 334
 335   def forget(me, ino):
 336     me.need_db()
 337     c = me._db.cursor()
 338     c.execute('DELETE FROM hash WHERE ino = ?', [ino])
 339
 340   def reset(me):
 341     me.need_db()
 342     c = me._db.cursor()
 343     c.execute('UPDATE hash SET seen = 0 WHERE seen')
 344     me.flush()
 345
 346   def prune(me):
 347     me.need_db()
 348     c = me._db.cursor()
 349     c.execute('DELETE FROM hash WHERE NOT seen')
 350     me.flush()
 351
 352 ###--------------------------------------------------------------------------
 353 ### Printing output.
 354
 355 class GenericFormatter (object):
 356   def __init__(me, fi):
 357     me.fi = fi
 358   def _fmt_time(me, t):
 359     tm = T.gmtime(t)
 360     return T.strftime('%Y-%m-%dT%H:%M:%SZ', tm)
 361   def _enc_name(me, n):
 362     return ' \\-> '.join(n.encode('string_escape').split(' -> '))
 363   def name(me):
 364     return me._enc_name(me.fi.name)
 365   def info(me):
 366     return me.TYPE
 367   def mode(me):
 368     return '%06o' % me.fi.st.st_mode
 369   def size(me):
 370     return me.fi.st.st_size
 371   def mtime(me):
 372     return me._fmt_time(me.fi.st.st_mtime)
 373   def owner(me):
 374     return '%5d:%d' % (me.fi.st.st_uid, me.fi.st.st_gid)
 375
 376 class ErrorFormatter (GenericFormatter):
 377   def info(me):
 378     return 'E%d %s' % (me.fi.err.errno, me.fi.err.strerror)
 379   def error(me): return 'error'
 380   mode = size = mtime = owner = error
 381
class SocketFormatter (GenericFormatter):
  ## Formatter for sockets: only the type label differs from the base class.
  TYPE = 'socket'
class PipeFormatter (GenericFormatter):
  ## Formatter for named pipes: only the type label differs from the base
  ## class.
  TYPE = 'fifo'
 386
 387 class LinkFormatter (GenericFormatter):
 388   TYPE = 'symbolic-link'
 389   def name(me):
 390     n = GenericFormatter.name(me)
 391     try:
 392       d = OS.readlink(me.fi.name)
 393       return '%s -> %s' % (n, me._enc_name(d))
 394     except OSError, err:
 395       return '%s -> <E%d %s>' % (n, err.errno, err.strerror)
 396
 397 class DirectoryFormatter (GenericFormatter):
 398   TYPE = 'directory'
 399   def name(me): return GenericFormatter.name(me) + '/'
 400   def size(me): return 'dir'
 401
 402 class DeviceFormatter (GenericFormatter):
 403   def info(me):
 404     return '%s %d:%d' % (me.TYPE,
 405                          OS.major(me.fi.st.st_rdev),
 406                          OS.minor(me.fi.st.st_rdev))
 407 class BlockDeviceFormatter (DeviceFormatter):
 408   TYPE = 'block-device'
 409 class CharDeviceFormatter (DeviceFormatter):
 410   TYPE = 'character-device'
 411
class FileFormatter (GenericFormatter):
  ## Formatter for regular files.  `Reporter.file' prints the content hash in
  ## place of `info' whenever one is available, so this label only shows up
  ## when hashing produced nothing.
  TYPE = 'regular-file'
 414
 415 class Reporter (object):
 416
 417   TYMAP = {
 418     ST.S_IFSOCK: SocketFormatter,
 419     ST.S_IFDIR: DirectoryFormatter,
 420     ST.S_IFLNK: LinkFormatter,
 421     ST.S_IFREG: FileFormatter,
 422     ST.S_IFBLK: BlockDeviceFormatter,
 423     ST.S_IFCHR: CharDeviceFormatter,
 424     ST.S_IFIFO: PipeFormatter,
 425   }
 426
 427   def __init__(me, db):
 428     me._inomap = {}
 429     me._vinomap = {}
 430     me._db = db
 431     me._hsz = int(H.new(db.hash).digest_size)
 432
 433   def file(me, fi):
 434     h = me._db.hashfile(fi)
 435     if fi.err:
 436       fmt = ErrorFormatter(fi)
 437       vino = 'error'
 438     else:
 439       fmt = me.TYMAP[ST.S_IFMT(fi.st.st_mode)](fi)
 440       inoidx = fi.st.st_dev, fi.st.st_ino
 441       try:
 442         vino = me._inomap[inoidx]
 443       except KeyError:
 444         suffix = ''
 445         seq = 0
 446         while True:
 447           vino = '%08x' % (Z.crc32(fi.name + suffix) & 0xffffffff)
 448           if vino not in me._vinomap: break
 449           suffix = '\0%d' % seq
 450           seq += 1
 451         me._inomap[inoidx] = vino
 452     if h: info = h
 453     else: info = '[%-*s]' % (2*me._hsz - 2, fmt.info())
 454     print '%s %8s %6s %-12s %-20s %20s %s' % (
 455       info, vino, fmt.mode(), fmt.owner(),
 456       fmt.mtime(), fmt.size(), fmt.name())
 457
 458 ###--------------------------------------------------------------------------
 459 ### Database clearing from diff files.
 460
 461 R_HUNK = RX.compile(r'^@@ -\d+,(\d+) \+\d+,(\d+) @@$')
 462
 463 def clear_entry(db, lno, line):
 464
 465   good = True
 466
 467   if line.startswith('['):
 468     pos = line.find(']')
 469     if pos < 0:
 470       moan("failed to parse file entry (type field; line %d)" % lno)
 471       return False
 472     ty = line[1:pos].strip()
 473     rest = line[pos + 1:]
 474     hash = None
 475   else:
 476     ff = line.split(None, 1)
 477     if len(ff) != 2:
 478       moan("failed to parse file entry (field split; line %d)" % lno)
 479       return False
 480     ty = 'regular-file'
 481     hash, rest = ff
 482
 483   ff = rest.split(None, 5)
 484   if len(ff) != 6:
 485     moan("failed to parse file entry (field split; line %d)" % lno)
 486     return False
 487   ino, mode, uidgid, mtime, sz, name = ff
 488
 489   if ty != 'symbolic-link':
 490     target = None
 491   else:
 492     nn = name.split(' -> ', 1)
 493     if len(nn) != 2:
 494       moan("failed to parse file entry (name split; line %d)" % lno)
 495       return False
 496     name, target = nn
 497     target = target.decode('string_escape')
 498   name = name.decode('string_escape')
 499
 500   try:
 501     st = OS.lstat(name)
 502   except OSError, e:
 503     moan("failed to stat `%s': %s" % (name, e.strerror))
 504     if e.errno != E.ENOENT: good = False
 505   else:
 506     print "Clear cache entry for `%s'" % name
 507     db.forget(st.st_ino)
 508
 509   return good
 510
 511 def clear_cache(db):
 512
 513   ## Work through the input diff file one line at a time.
 514   diffstate = 'gap'
 515   lno = 0
 516   good = True
 517   for line in stdin:
 518     if line.endswith('\n'): line = line[:-1]
 519     lno += 1
 520
 521     ## We're in a gap between hunks.  Find a hunk header and extract the line
 522     ## counts.
 523     if diffstate == 'gap':
 524       m = R_HUNK.match(line)
 525       if m:
 526         oldlines = int(m.group(1))
 527         newlines = int(m.group(2))
 528         diffstate = 'hunk'
 529         hdrlno = lno
 530
 531     ## We're in a hunk.  Keep track of whether we've reached the end, and
 532     ## discard entries from the cache for mismatching lines.
 533     elif diffstate == 'hunk':
 534       if len(line) == 0:
 535         moan("empty line in diff hunk (line %d)" % lno)
 536         good = False
 537       ty = line[0]
 538       if ty == ' ':
 539         oldlines -= 1; newlines -= 1
 540       elif ty == '+':
 541         newlines -= 1
 542         if not clear_entry(db, lno, line[1:]): good = False
 543       elif ty == '-':
 544         oldlines -= 1
 545         if not clear_entry(db, lno, line[1:]): good = False
 546       else:
 547         moan("incomprehensible line in diff hunk (line %d)" % lno)
 548         good = false
 549       if oldlines < 0 or newlines < 0:
 550         moan("inconsistent lengths in diff hunk header (line %d)" % hdrlno)
 551         good = False
 552       if oldlines == newlines == 0:
 553         diffstate = 'gap'
 554
 555   if diffstate == 'hunk':
 556     moan("truncated diff hunk (started at line %d)" % hdrlno)
 557     good = False
 558
 559   return good
 560
 561 ###--------------------------------------------------------------------------
 562 ### Main program.
 563
 564 FMTMAP = {
 565   'rsync': lambda f: enum_rsync(stdin, f),
 566   'find0': lambda f: enum_find0(stdin, f)
 567 }
 568 op = OP.OptionParser(
 569   usage = '%prog [-au] [-c CACHE] [-f FORMAT] [-H HASH] [FILE ...]',
 570   version = '%%prog, version %s' % VERSION,
 571   description = '''\
 572 Print a digest of a filesystem (or a collection of specified files) to
 573 standard output.  The idea is that the digest should be mostly /complete/
 574 (i.e., any `interesting\' change to the filesystem results in a different
 575 digest) and /canonical/ (i.e., identical filesystem contents result in
 576 identical output).
 577 ''')
 578
 579 for short, long, props in [
 580   ('-a', '--all', { 'action': 'store_true', 'dest': 'all',
 581                     'help': 'clear cache of all files not seen' }),
 582   ('-c', '--cache', { 'dest': 'cache', 'metavar': 'FILE',
 583                       'help': 'use FILE as a cache for file hashes' }),
 584   ('-f', '--files', { 'dest': 'files', 'metavar': 'FORMAT',
 585                       'type': 'choice', 'choices': FMTMAP.keys(),
 586                       'help': 'read files to report in the given FORMAT' }),
 587   ('-u', '--udiff', { 'action': 'store_true', 'dest': 'udiff',
 588                       'help': 'read diff from stdin, clear cache entries' }),
 589   ('-H', '--hash', { 'dest': 'hash', 'metavar': 'HASH',
 590                      ##'type': 'choice', 'choices': H.algorithms,
 591                      'help': 'use HASH as the hash function' })]:
 592   op.add_option(short, long, **props)
 593 opts, args = op.parse_args(argv)
 594
 595 if opts.udiff:
 596   if opts.cache is None or opts.all or opts.files or len(args) > 2:
 597     die("incompatible options: `-u' requires `-c CACHE', forbids others")
 598   db = HashCache(opts.cache, opts.hash)
 599   if len(args) == 2: OS.chdir(args[1])
 600   good = True
 601   if not clear_cache(db): good = False
 602   if good: db.flush()
 603   else: exit(2)
 604 else:
 605   if not opts.files and len(args) <= 1:
 606     die("no filename sources: nothing to do")
 607   db = HashCache(opts.cache, opts.hash)
 608   if opts.all:
 609     db.reset()
 610   rep = Reporter(db)
 611   if opts.files:
 612     FMTMAP[opts.files](rep.file)
 613   for dir in args[1:]:
 614     enum_walk(dir, rep.file)
 615   if opts.all:
 616     db.prune()
 617   db.flush()
 618
 619 ###----- That's all, folks --------------------------------------------------