3 ### Efficiently construct canonical digests of filesystems
5 ### (c) 2012 Mark Wooding
8 ###----- Licensing notice ---------------------------------------------------
10 ### This file is part of the `rsync-backup' program.
12 ### rsync-backup is free software; you can redistribute it and/or modify
13 ### it under the terms of the GNU General Public License as published by
14 ### the Free Software Foundation; either version 2 of the License, or
15 ### (at your option) any later version.
17 ### rsync-backup is distributed in the hope that it will be useful,
18 ### but WITHOUT ANY WARRANTY; without even the implied warranty of
19 ### MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 ### GNU General Public License for more details.
22 ### You should have received a copy of the GNU General Public License
23 ### along with rsync-backup; if not, write to the Free Software Foundation,
24 ### Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
26 from sys import argv, exit, stdin, stdout, stderr
## Package identity, reported by the `--version' option below.
36 PACKAGE = 'rsync-backup'
37 VERSION = '0.99.1-8-ga844'
39 ###--------------------------------------------------------------------------
## Error reporting.
## Program name (basename of argv[0]), used to prefix diagnostics.
42 QUIS = OS.path.basename(argv[0])
## NOTE(review): this `stderr' write sits inside an error-reporting helper
## (`moan'/`die'-style) whose `def' line is missing from this excerpt; it
## emits a `PROG: MESSAGE' diagnostic.
45 stderr.write('%s: %s\n' % (QUIS, msg))
57 ###--------------------------------------------------------------------------
58 ### File system enumeration.
## A per-file record.  Visible uses elsewhere in this excerpt read
## `fi.name', `fi.st' (an lstat result, or None) and `fi.err' (the error
## that prevented statting, if any) -- the full attribute set is not
## visible here.
60 class FileInfo (object):
61 def __init__(me, file, st = None):
## No stat supplied by the caller: look the file up ourselves.  `lstat'
## so that symbolic links are reported as themselves, not their targets.
68 me.st = OS.lstat(file)
74 def enum_walk(file, func):
## Walk the filesystem rooted at FILE, calling FUNC on each file found.
## (Interior lines of this function are missing from this excerpt; the
## comments below describe only what is visible.)

## Directory-listing helper: report a failure but don't abort the walk.
78 return OS.listdir(name)
80 syserr("failed to read directory `%s': %s" % (name, err.strerror))

## Partition entries: skip things on a different device (i.e. don't cross
## filesystem boundaries), and queue subdirectories for recursion.
88 if fi.st and fi.st.st_dev != dev: pass
89 if fi.st and ST.S_ISDIR(fi.st.st_mode): dd.append(fi)

## Sort for a canonical, reproducible output order.  Directories are
## keyed with a trailing `/' -- presumably so names sort the same way a
## recursive listing would; confirm against the full source.
91 ff.sort(key = lambda fi: fi.name)
92 dd.sort(key = lambda fi: fi.name + '/')

## Recurse into subdirectories, but only while still on the same device.
96 if d.st.st_dev == dev:
98 dir([OS.path.join(d.name, e) for e in dirents(d.name)], dev)

## A trailing `/' on FILE means enumerate the directory's contents:
## remember the current directory (so we can return to it -- the chdir
## itself is not visible here) and walk `.', pinned to its device.
100 if file.endswith('/'):
101 cwd = OS.open('.', OS.O_RDONLY)
106 dir(dirents('.'), fi.st.st_dev)

## Otherwise report FILE itself, recursing if it turns out to be a
## directory.
113 if fi.st and ST.S_ISDIR(fi.st.st_mode):
114 dir([OS.path.join(fi.name, e) for e in dirents(fi.name)],
117 def enum_find0(f, func):
## Read NUL-terminated filenames (as produced by `find ... -print0')
## from the stream F, calling FUNC on each.  (Interior lines are missing
## from this excerpt.)

## Split buffered input on NUL bytes; `tail' evidently carries a partial
## name over from the previous read -- TODO confirm against full source.
122 names = (tail + buf).split('\0')

## Input didn't end with a NUL: warn, but carry on.
129 moan("ignored trailing junk after last filename")
## Matches rsync's `\#OOO' escape for an awkward byte, capturing the three
## octal digits.  Compiled VERBOSE, so the spaces in the pattern are not
## significant.
131 RX_RSYNCESC = RX.compile(r'\\ \# ([0-7]{3})', RX.VERBOSE)
132 def enum_rsync(f, func):
## Parse rsync listing output from the stream F, calling FUNC on each
## named file.  (Interior lines are missing from this excerpt.)

134 ## The format is a little fiddly.  Each line consists of PERMS SIZE DATE
135 ## TIME NAME, separated by runs of whitespace, but the NAME starts exactly
136 ## one space character after the TIME and may begin with a space.
137 ## Sequences of the form `\#OOO' where OOO are three octal digits, stand
138 ## for a byte with that value.  Newlines and backslashes which would be
139 ## ambiguous are converted into this form; all other characters are

142 ## We ignore the stat information and retrieve it ourselves, because it's
143 ## incomplete.  Hopefully the dcache is still warm.

## Strip the newline terminator, if present.
146 if line.endswith('\n'): line = line[:-1]

148 ## Extract the escaped name.
## Split off the first three whitespace-separated fields (PERMS SIZE
## DATE); the remainder holds TIME and NAME.
149 ff = line.split(None, 3)
151 syserr("ignoring invalid line from rsync: `%s'" % line)
## NAME begins exactly one space after TIME, so find that first space
## rather than splitting on whitespace runs.
155 spc = tail.index(' ')
157 syserr("ignoring invalid line from rsync: `%s'" % line)
159 name = tail[spc + 1:]

161 ## Now translate escape sequences.
162 name = RX_RSYNCESC.sub(lambda m: chr(int(m.group(1), 8)), name)

## Report, but don't abort on, files we fail to stat ourselves.
168 syserr("failed to stat `%s': %s" % (name, err.strerror))
172 ###--------------------------------------------------------------------------
## Persistent cache of file hashes, keyed by inode number and stored in a
## SQLite database.  (Interior lines of this class are missing from this
## excerpt; comments describe only what is visible.)
175 class HashCache (object):

## Schema: a one-row `meta' table recording the cache format version (and,
## visibly below, the hash function in use), ...
181 """CREATE TABLE meta (
182 version INTEGER NOT NULL,
## ... and a `hash' table keyed by inode, remembering enough stat detail
## (mtime/ctime/size) to detect when a file has changed, plus a `seen'
## flag used below for mark-and-sweep expiry of stale entries.
185 """CREATE TABLE hash (
186 ino INTEGER PRIMARY KEY,
187 mtime INTEGER NOT NULL,
188 ctime INTEGER NOT NULL,
189 size INTEGER NOT NULL,
191 seen BOOLEAN NOT NULL DEFAULT TRUE
## Write-ahead logging for better concurrent read/write behaviour.
193 """PRAGMA journal_mode = WAL;"""

196 def __init__(me, file, hash = None):
## Open the cache database FILE, creating or upgrading it as necessary;
## HASH names the hash function to use.  (Interior lines missing.)

## No cache file named: operate without a cache, in which case a hash
## function must have been given explicitly.
200 ## We're going it alone, with no cache.
203 die("no hash specified and no database cache to read from")

206 ## Connect to the database.
207 db = DB.connect(file)
## Return plain byte strings from queries, not unicode (Python 2).
208 db.text_factory = str

210 ## See whether we can understand the cache database.
214 c.execute('SELECT version, hash FROM meta')
## `meta' must contain exactly one row.
## NOTE(review): the message below misspells `multiple' -- a runtime
## string, so left untouched here; fix it in a behavioural change.
216 if c.fetchone() is not None:
217 die("cache database corrupt: meta table has mutliple rows")
218 except (DB.Error, TypeError):

221 ## If that didn't work, we'd better clear the thing and start again.
222 ## But only if we know how to initialize it.

225 ## Explain the situation.
226 moan("cache version %s not understood" % v)
229 die("can't initialize cache: no hash function set")
235 die("unknown hash function `%s'" % hash)

## Drop every object in the database before rebuilding the schema.
## NOTE(review): string-built SQL -- acceptable only because the names
## come from sqlite_master of a database we're about to wipe anyway.
238 c.execute('SELECT type, name FROM sqlite_master')
239 for type, name in c.fetchall():
240 c.execute('DROP %s IF EXISTS %s' % (type, name))

242 ## Now we're ready to go.
245 c.execute('INSERT INTO meta VALUES (?, ?)', [me.VERSION, hash])

248 ## Check the hash function if necessary.
251 elif h is not None and h != hash:
252 die("hash mismatch: cache uses %s but %s requested" % (h, hash))

259 def hashfile(me, fi):
## Return the hash of the file described by FI, consulting and updating
## the cache.  (Interior lines missing.)

261 ## If this isn't a proper file then don't try to hash it.
262 if fi.err or not ST.S_ISREG(fi.st.st_mode):

265 ## See whether there's a valid entry in the cache.
269 'SELECT mtime, size, hash, seen FROM hash WHERE ino = ?;',
## A cache hit is valid if the stat details still match.
## NOTE(review): the schema stores ctime but this query doesn't fetch
## it, and only the mtime comparison is visible here -- confirm against
## the full source whether ctime is checked at all.
274 if mt == fi.st.st_mtime and \
277 c.execute('UPDATE hash SET seen = 1 WHERE ino = ?',

282 ## Hash the file.  Beware raciness: update the file information from the
283 ## open descriptor, but set the size from what we actually read.
286 with open(fi.name, 'rb') as f:
## Read in BUFSZ chunks so huge files don't balloon memory.
289 buf = f.read(me.BUFSZ)
294 fi.st = OS.fstat(f.fileno())
## Python 2 exception syntax; report read failures without aborting.
297 except (OSError, IOError), err:
## Python 2 idiom: hex-encode the binary digest.
301 hash = hash.encode('hex')

303 ## Insert a record into the database.
306 INSERT OR REPLACE INTO hash
307 (ino, mtime, ctime, size, hash, seen)

## Mark-and-sweep maintenance (enclosing method `def' lines are missing
## from this excerpt): these operations require a cache database ...
332 die("no cache database")
## ... clear every `seen' flag before a run ...
337 c.execute('UPDATE hash SET seen = 0 WHERE seen')
## ... and afterwards drop entries for files which no longer exist.
343 c.execute('DELETE FROM hash WHERE NOT seen')
346 ###--------------------------------------------------------------------------
## Base class turning a FileInfo into the report's display columns; the
## subclasses below specialize per file type.  (Interior lines are
## missing from this excerpt.)
349 class GenericFormatter (object):
350 def __init__(me, fi):

## Render a timestamp in a fixed ISO-8601-style form -- the `Z' suffix
## suggests UTC, but the conversion of T to TM is not visible; confirm.
352 def _fmt_time(me, t):
354 return T.strftime('%Y-%m-%dT%H:%M:%SZ', tm)

## Escape a filename for display (Python 2 `string_escape'), and turn
## any literal ` -> ' into ` \-> ' so it can't be mistaken for the
## symlink arrow used by LinkFormatter below.
355 def _enc_name(me, n):
356 return ' \\-> '.join(n.encode('string_escape').split(' -> '))

## Display columns (the `def' lines for these accessors are missing):
## escaped name, zero-padded octal mode, size, mtime, and `uid:gid'.
358 return me._enc_name(me.fi.name)
362 return '%06o' % me.fi.st.st_mode
364 return me.fi.st.st_size
366 return me._fmt_time(me.fi.st.st_mtime)
368 return '%5d:%d' % (me.fi.st.st_uid, me.fi.st.st_gid)
## Formatter for files we failed to stat: shows the errno and message,
## and collapses every other column to the placeholder `error'.
370 class ErrorFormatter (GenericFormatter):
## (The `def' line for this accessor is missing from this excerpt.)
372 return 'E%d %s' % (me.fi.err.errno, me.fi.err.strerror)
373 def error(me): return 'error'
## All the stat-derived columns alias the `error' placeholder.
374 mode = size = mtime = owner = error
## Trivial per-type formatters; their bodies (presumably just a TYPE
## attribute, as for the device formatters below) are missing from this
## excerpt.
376 class SocketFormatter (GenericFormatter):
378 class PipeFormatter (GenericFormatter):
## Formatter for symbolic links: shows `NAME -> TARGET', falling back to
## `NAME -> <Eerrno message>' if the link can't be read.
381 class LinkFormatter (GenericFormatter):
382 TYPE = 'symbolic-link'
## (The `def name(me):' line and the readlink error handler's `except'
## line are missing from this excerpt.)
384 n = GenericFormatter.name(me)
386 d = OS.readlink(me.fi.name)
## Escape the target the same way as names, so the arrow stays
## unambiguous.
387 return '%s -> %s' % (n, me._enc_name(d))
389 return '%s -> <E%d %s>' % (n, err.errno, err.strerror)
## Formatter for directories: a trailing `/' on the name, and `dir' in
## place of a meaningless size.
391 class DirectoryFormatter (GenericFormatter):
393 def name(me): return GenericFormatter.name(me) + '/'
394 def size(me): return 'dir'
## Common formatter for device nodes: shows the subclass's TYPE label and
## the major:minor device numbers.  (The accessor's `def' line is missing
## from this excerpt.)
396 class DeviceFormatter (GenericFormatter):
398 return '%s %d:%d' % (me.TYPE,
399 OS.major(me.fi.st.st_rdev),
400 OS.minor(me.fi.st.st_rdev))
## Block special files: inherits the `TYPE major:minor' rendering.
401 class BlockDeviceFormatter (DeviceFormatter):
402 TYPE = 'block-device'
## Character special files: inherits the `TYPE major:minor' rendering.
403 class CharDeviceFormatter (DeviceFormatter):
404 TYPE = 'character-device'
## Regular files.  (Any further body -- e.g. an `info' accessor -- is
## missing from this excerpt.)
406 class FileFormatter (GenericFormatter):
407 TYPE = 'regular-file'
## Drives the report: hashes each file via the cache and prints one
## formatted line per file.  (Interior lines are missing from this
## excerpt.)
409 class Reporter (object):

## Dispatch table from stat file-type bits to the formatter class above.
412 ST.S_IFSOCK: SocketFormatter,
413 ST.S_IFDIR: DirectoryFormatter,
414 ST.S_IFLNK: LinkFormatter,
415 ST.S_IFREG: FileFormatter,
416 ST.S_IFBLK: BlockDeviceFormatter,
417 ST.S_IFCHR: CharDeviceFormatter,
418 ST.S_IFIFO: PipeFormatter,

421 def __init__(me, db):
## Remember the hash's digest size so non-hash info can be padded to the
## same column width (see the `2*me._hsz - 2' below).
425 me._hsz = int(H.new(db.hash).digest_size)

## Report one file (enclosing `def' line missing from this excerpt).
428 h = me._db.hashfile(fi)
## Stat failure: fall back to the error formatter, else dispatch on the
## file-type bits.
430 fmt = ErrorFormatter(fi)
433 fmt = me.TYMAP[ST.S_IFMT(fi.st.st_mode)](fi)

## Map the real (device, inode) pair to a stable "virtual inode" string,
## so hard links are still recognizable without exposing raw inode
## numbers.  On a CRC collision, retry with a `\0SEQ' suffix appended to
## the name until the value is unused.
434 inoidx = fi.st.st_dev, fi.st.st_ino
436 vino = me._inomap[inoidx]
441 vino = '%08x' % (Z.crc32(fi.name + suffix) & 0xffffffff)
442 if vino not in me._vinomap: break
443 suffix = '\0%d' % seq
445 me._inomap[inoidx] = vino

## No hash available: show the formatter's info string, bracketed and
## padded to the width of a hex digest.
447 else: info = '[%-*s]' % (2*me._hsz - 2, fmt.info())
## Emit the report line (Python 2 `print' statement).
448 print '%s %8s %6s %-12s %-20s %20s %s' % (
449 info, vino, fmt.mode(), fmt.owner(),
450 fmt.mtime(), fmt.size(), fmt.name())
452 ###--------------------------------------------------------------------------
## Main program (interior lines are missing from this excerpt).

## Input-format dispatch: where to get the list of files from when `-f'
## is given.  Both read from standard input.
456 'rsync': lambda f: enum_rsync(stdin, f),
457 'find0': lambda f: enum_find0(stdin, f)

## Command-line interface.
459 op = OP.OptionParser(
460 usage = '%prog [-a] [-c CACHE] [-f FORMAT] [-H HASH] [FILE ...]',
461 version = '%%prog, version %s' % VERSION,
463 Print a digest of a filesystem (or a collection of specified files) to
464 standard output.  The idea is that the digest should be mostly /complete/
465 (i.e., any `interesting\' change to the filesystem results in a different
466 digest) and /canonical/ (i.e., identical filesystem contents result in

## Register the options from a table of (short, long, keyword) triples.
470 for short, long, props in [
471 ('-a', '--all', { 'action': 'store_true', 'dest': 'all',
472 'help': 'clear cache of all files not seen' }),
473 ('-c', '--cache', { 'dest': 'cache', 'metavar': 'FILE',
474 'help': 'use FILE as a cache for file hashes' }),
475 ('-f', '--files', { 'dest': 'files', 'metavar': 'FORMAT',
476 'type': 'choice', 'choices': FMTMAP.keys(),
477 'help': 'read files to report in the given FORMAT' }),
478 ('-H', '--hash', { 'dest': 'hash', 'metavar': 'HASH',
## NOTE(review): hash-name validation is deliberately disabled here;
## HashCache.__init__ rejects unknown hash functions instead.
479 ##'type': 'choice', 'choices': H.algorithms,
480 'help': 'use HASH as the hash function' })]:
481 op.add_option(short, long, **props)
## argv is passed whole, so args[0] is the program name -- hence the
## `<= 1' test below.
482 opts, args = op.parse_args(argv)

## Without `-f' we need at least one FILE argument to walk.
484 if not opts.files and len(args) <= 1:
485 die("no filename sources: nothing to do")

## Open (or create) the hash cache and run: either pull filenames from
## stdin in the chosen format, or walk each named tree directly.
486 db = HashCache(opts.cache, opts.hash)
491 FMTMAP[opts.files](rep.file)
493 enum_walk(dir, rep.file)
498 ###----- That's all, folks --------------------------------------------------