#! @PYTHON@
###
### Efficiently construct canonical digests of filesystems
###
### (c) 2012 Mark Wooding
###

###----- Licensing notice ---------------------------------------------------
###
### This file is part of the `rsync-backup' program.
###
### rsync-backup is free software; you can redistribute it and/or modify
### it under the terms of the GNU General Public License as published by
### the Free Software Foundation; either version 2 of the License, or
### (at your option) any later version.
###
### rsync-backup is distributed in the hope that it will be useful,
### but WITHOUT ANY WARRANTY; without even the implied warranty of
### MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
### GNU General Public License for more details.
###
### You should have received a copy of the GNU General Public License
### along with rsync-backup; if not, write to the Free Software Foundation,
### Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.

from sys import argv, exit, stdin, stdout, stderr
import os as OS
import re as RX
import time as T
import stat as ST
import optparse as OP
import hashlib as H
import sqlite3 as DB
import zlib as Z

## Substituted by the build system at install time.
PACKAGE = '@PACKAGE@'
VERSION = '@VERSION@'

###--------------------------------------------------------------------------
### Utilities.

## Program name, used as a prefix on diagnostic messages.
QUIS = OS.path.basename(argv[0])

def moan(msg):
  """Write the diagnostic MSG to stderr, prefixed with the program name."""
  stderr.write('%s: %s\n' % (QUIS, msg))

def die(msg, rc = 1):
  """Report MSG as a fatal error and exit with status RC (default 1)."""
  moan(msg)
  exit(rc)

## Count of non-fatal system errors reported so far.
SYSERR = 0
def syserr(msg):
  """Report the non-fatal system error MSG and bump the error counter."""
  global SYSERR
  moan(msg)
  SYSERR += 1

###--------------------------------------------------------------------------
### File system enumeration.
class FileInfo (object):
  """
  Capture the name and stat information of a filesystem object.

  Attributes: `name' is the file name; `st' is the lstat result (or None on
  error); `err' is the OSError raised by lstat (or None on success).
  """
  def __init__(me, file, st = None):
    me.name = file
    if st:
      me.st = st
      me.err = None
    else:
      try:
        me.st = OS.lstat(file)
        me.err = None
      except OSError as err:
        me.st = None
        me.err = err

def enum_walk(file, func):
  """
  Walk the filesystem rooted at FILE, calling FUNC(FILEINFO) for each object.

  If FILE ends in `/' then chdir into it and enumerate relative to it;
  otherwise report FILE itself (and recurse if it's a directory).  The walk
  does not cross device boundaries.
  """

  def dirents(name):
    ## List NAME's entries, reporting (and tolerating) read failures.
    try:
      return OS.listdir(name)
    except OSError as err:
      syserr("failed to read directory `%s': %s" % (name, err.strerror))
      return []

  def dir(ee, dev):
    ## Process the directory entries EE, staying on device DEV.  Files are
    ## reported first, then subdirectories (each followed by its contents),
    ## both in sorted order so the output is canonical.
    ff = []
    dd = []
    for e in ee:
      fi = FileInfo(e)
      ## NOTE(review): this first test is a no-op as written -- presumably
      ## cross-device entries were meant to be filtered here; as it stands
      ## they're dropped later by the `st_dev' check below.  Confirm intent
      ## before changing.
      if fi.st and fi.st.st_dev != dev: pass
      if fi.st and ST.S_ISDIR(fi.st.st_mode): dd.append(fi)
      else: ff.append(fi)
    ## Sort directories with a trailing `/' so ordering matches the names as
    ## they will be printed.
    ff.sort(key = lambda fi: fi.name)
    dd.sort(key = lambda fi: fi.name + '/')
    for f in ff:
      func(f)
    for d in dd:
      if d.st.st_dev == dev:
        func(d)
        dir([OS.path.join(d.name, e) for e in dirents(d.name)], dev)

  if file.endswith('/'):
    ## Work relative to FILE; restore the working directory afterwards.
    cwd = OS.open('.', OS.O_RDONLY)
    try:
      OS.chdir(file)
      fi = FileInfo('.')
      func(fi)
      dir(dirents('.'), fi.st.st_dev)
    finally:
      OS.fchdir(cwd)
      OS.close(cwd)
  else:
    fi = FileInfo(file)
    func(fi)
    if fi.st and ST.S_ISDIR(fi.st.st_mode):
      dir([OS.path.join(fi.name, e) for e in dirents(fi.name)],
          fi.st.st_dev)

def enum_find0(f, func):
  """
  Read NUL-terminated names from the stream F (e.g., `find ... -print0'
  output) and call FUNC(FILEINFO) for each.
  """
  tail = ""
  while True:
    buf = f.read(8192)
    last = len(buf) == 0
    names = (tail + buf).split('\0')
    ## The final fragment is an incomplete name: keep it for the next chunk.
    tail = names.pop()
    for n in names:
      func(FileInfo(n))
    if last:
      break
  if len(tail):
    moan("ignored trailing junk after last filename")

RX_RSYNCESC = RX.compile(r'\\ \# ([0-7]{3})', RX.VERBOSE)
def enum_rsync(f, func):
  """
  Read a file listing in rsync format from the stream F and call
  FUNC(FILEINFO) for each named object.
  """

  ## The format is a little fiddly.  Each line consists of PERMS SIZE DATE
  ## TIME NAME, separated by runs of whitespace, but the NAME starts exactly
  ## one space character after the TIME and may begin with a space.
  ## Sequences of the form `\#OOO' where OOO are three octal digits, stand
  ## for a byte with that value.  Newlines and backslashes which would be
  ## ambiguous are converted into this form; all other characters are
  ## literal.
  ##
  ## We ignore the stat information and retrieve it ourselves, because it's
  ## incomplete.  Hopefully the dcache is still warm.

  for line in f:
    if line.endswith('\n'): line = line[:-1]

    ## Extract the escaped name.
    ff = line.split(None, 3)
    if len(ff) != 4:
      syserr("ignoring invalid line from rsync: `%s'" % line)
      continue
    tail = ff[3]
    try:
      spc = tail.index(' ')
    except ValueError:
      syserr("ignoring invalid line from rsync: `%s'" % line)
      continue
    name = tail[spc + 1:]

    ## Now translate escape sequences.
    name = RX_RSYNCESC.sub(lambda m: chr(int(m.group(1), 8)), name)

    ## Call the client.
    try:
      fi = FileInfo(name)
    except OSError as err:
      syserr("failed to stat `%s': %s" % (name, err.strerror))
      continue
    func(fi)

###--------------------------------------------------------------------------
### The hash cache.

class HashCache (object):
  """
  Cache file hashes in an sqlite database, keyed by inode number.

  A cache entry is considered valid if the inode's mtime and size still
  match.  The `seen' flag supports pruning entries for files which have
  disappeared (see `reset' and `prune').
  """

  VERSION = 0
  BUFSZ = 128*1024

  ## Statements to initialize a fresh cache database.
  INIT = [
    """CREATE TABLE meta (
       version INTEGER NOT NULL,
       hash TEXT NOT NULL
    );""",
    """CREATE TABLE hash (
       ino INTEGER PRIMARY KEY,
       mtime INTEGER NOT NULL,
       ctime INTEGER NOT NULL,
       size INTEGER NOT NULL,
       hash TEXT NOT NULL,
       seen BOOLEAN NOT NULL DEFAULT TRUE
    );""",
    """PRAGMA journal_mode = WAL;"""
  ]

  def __init__(me, file, hash = None):
    """
    Open (or create) the cache database FILE, checking or recording the HASH
    function name.  FILE may be None to run without a cache, in which case
    HASH must be given.  Dies on version/hash mismatch or unknown hash.
    """

    if file is None:

      ## We're going this alone, with no cache.
      db = None
      if hash is None:
        die("no hash specified and no database cache to read from")
    else:

      ## Connect to the database.
      db = DB.connect(file)
      db.text_factory = str

      ## See whether we can understand the cache database.
      c = db.cursor()
      v = h = None
      try:
        c.execute('SELECT version, hash FROM meta')
        v, h = c.fetchone()
        if c.fetchone() is not None:
          die("cache database corrupt: meta table has multiple rows")
      except (DB.Error, TypeError):
        pass

      ## If that didn't work, we'd better clear the thing and start again.
      ## But only if we know how to initialize it.
      if v != me.VERSION:

        ## Explain the situation.
        moan("cache version %s not understood" % v)
        if hash is None:
          if h is None:
            die("can't initialize cache: no hash function set")
          else:
            hash = h
        try:
          H.new(hash)
        except Exception:
          die("unknown hash function `%s'" % hash)

        ## Drop old things.
        c.execute('SELECT type, name FROM sqlite_master')
        for type, name in c.fetchall():
          c.execute('DROP %s IF EXISTS %s' % (type, name))

        ## Now we're ready to go.
        for stmt in me.INIT:
          c.execute(stmt)
        c.execute('INSERT INTO meta VALUES (?, ?)', [me.VERSION, hash])
        db.commit()

      ## Check the hash function if necessary.
      if hash is None:
        hash = h
      elif h is not None and h != hash:
        die("hash mismatch: cache uses %s but %s requested" % (h, hash))

    ## All done.
    me.hash = hash
    me._db = db
    me._pend = 0

  def hashfile(me, fi):
    """
    Return the hex digest of the regular file described by FI, using the
    cache where possible; return None (and record the error in FI) for
    non-regular files or on read failure.
    """

    ## If this isn't a proper file then don't try to hash it.
    if fi.err or not ST.S_ISREG(fi.st.st_mode):
      return None

    ## See whether there's a valid entry in the cache.
    if me._db:
      c = me._db.cursor()
      c.execute(
        'SELECT mtime, size, hash, seen FROM hash WHERE ino = ?;',
        [fi.st.st_ino])
      r = c.fetchone()
      if r is not None:
        mt, sz, h, s = r
        if mt == fi.st.st_mtime and \
           sz == fi.st.st_size:
          if not s:
            c.execute('UPDATE hash SET seen = 1 WHERE ino = ?',
                      [fi.st.st_ino])
            me._update()
          return h

    ## Hash the file.  Beware raciness: update the file information from the
    ## open descriptor, but set the size from what we actually read.
    h = H.new(me.hash)
    try:
      with open(fi.name, 'rb') as f:
        sz = 0
        while True:
          buf = f.read(me.BUFSZ)
          if len(buf) == 0:
            break
          sz += len(buf)
          h.update(buf)
        fi.st = OS.fstat(f.fileno())
        ##fi.st.st_size = sz
      ## Use hexdigest() directly rather than digest().encode('hex'): same
      ## result, and not dependent on the Python-2-only `hex' codec.
      hash = h.hexdigest()
    except (OSError, IOError) as err:
      fi.st = None
      fi.err = err
      return None

    ## Insert a record into the database.
    if me._db:
      c.execute("""
              INSERT OR REPLACE INTO hash
                      (ino, mtime, ctime, size, hash, seen)
              VALUES
                      (?, ?, ?, ?, ?, 1);
      """, [fi.st.st_ino, fi.st.st_mtime, fi.st.st_ctime,
            fi.st.st_size, hash])
      me._update()

    ## Done.
    return hash

  def _update(me):
    ## Note a pending change; commit every so often to bound lost work.
    me._pend += 1
    if me._pend >= 1024:
      me.flush()

  def flush(me):
    """Commit any pending changes to the database."""
    if me._db:
      me._db.commit()
    me._pend = 0

  def need_db(me):
    """Die unless a cache database is attached."""
    if not me._db:
      die("no cache database")

  def reset(me):
    """Clear all `seen' flags, ready for a fresh enumeration."""
    me.need_db()
    c = me._db.cursor()
    c.execute('UPDATE hash SET seen = 0 WHERE seen')
    me.flush()

  def prune(me):
    """Delete cache entries not seen since the last `reset'."""
    me.need_db()
    c = me._db.cursor()
    c.execute('DELETE FROM hash WHERE NOT seen')
    me.flush()

###--------------------------------------------------------------------------
### Printing output.

class GenericFormatter (object):
  """Format the fields of a FileInfo for the report; subclasses set TYPE."""
  def __init__(me, fi):
    me.fi = fi
  def _fmt_time(me, t):
    ## Render timestamp T canonically, as UTC in ISO 8601 form.
    tm = T.gmtime(t)
    return T.strftime('%Y-%m-%dT%H:%M:%SZ', tm)
  def _enc_name(me, n):
    ## Escape N; also disguise any literal ` -> ' so symlink output stays
    ## unambiguous.
    return ' \\-> '.join(n.encode('string_escape').split(' -> '))
  def name(me):
    return me._enc_name(me.fi.name)
  def info(me):
    return me.TYPE
  def mode(me):
    return '%06o' % me.fi.st.st_mode
  def size(me):
    return me.fi.st.st_size
  def mtime(me):
    return me._fmt_time(me.fi.st.st_mtime)
  def owner(me):
    return '%5d:%d' % (me.fi.st.st_uid, me.fi.st.st_gid)

class ErrorFormatter (GenericFormatter):
  """Formatter for objects we failed to stat: all fields report the error."""
  def info(me):
    return 'E%d %s' % (me.fi.err.errno, me.fi.err.strerror)
  def error(me): return 'error'
  mode = size = mtime = owner = error

class SocketFormatter (GenericFormatter):
  TYPE = 'socket'
class PipeFormatter (GenericFormatter):
  TYPE = 'fifo'

class LinkFormatter (GenericFormatter):
  """Formatter for symbolic links: the name includes the link target."""
  TYPE = 'symbolic-link'
  def name(me):
    n = GenericFormatter.name(me)
    try:
      d = OS.readlink(me.fi.name)
      return '%s -> %s' % (n, me._enc_name(d))
    except OSError as err:
      ## Fixed: the original format string `%s -> ' took one argument but
      ## was given three, raising TypeError whenever readlink failed.
      return '%s -> <E%d %s>' % (n, err.errno, err.strerror)

class DirectoryFormatter (GenericFormatter):
  TYPE = 'directory'
  def name(me): return GenericFormatter.name(me) + '/'
  def size(me): return 'dir'

class DeviceFormatter (GenericFormatter):
  """Common formatter for device nodes: report major and minor numbers."""
  def info(me):
    return '%s %d:%d' % (me.TYPE,
                         OS.major(me.fi.st.st_rdev),
                         OS.minor(me.fi.st.st_rdev))
class BlockDeviceFormatter (DeviceFormatter):
  TYPE = 'block-device'
class CharDeviceFormatter (DeviceFormatter):
  TYPE = 'character-device'

class FileFormatter (GenericFormatter):
  TYPE = 'regular-file'

class Reporter (object):
  """
  Print one report line per file, with its hash (or type information if no
  hash is available) and a `virtual inode' number so that hard links can be
  recognized without exposing real inode numbers.
  """

  ## Map from stat file-type bits to the formatter class to use.
  TYMAP = {
    ST.S_IFSOCK: SocketFormatter,
    ST.S_IFDIR: DirectoryFormatter,
    ST.S_IFLNK: LinkFormatter,
    ST.S_IFREG: FileFormatter,
    ST.S_IFBLK: BlockDeviceFormatter,
    ST.S_IFCHR: CharDeviceFormatter,
    ST.S_IFIFO: PipeFormatter,
  }

  def __init__(me, db):
    me._inomap = {}                     # (dev, ino) -> virtual inode
    me._vinomap = {}                    # virtual inode -> (dev, ino)
    me._db = db
    me._hsz = int(H.new(db.hash).digest_size)

  def file(me, fi):
    """Hash and report the object described by FI on standard output."""
    h = me._db.hashfile(fi)
    if fi.err:
      fmt = ErrorFormatter(fi)
      vino = 'error'
    else:
      fmt = me.TYMAP[ST.S_IFMT(fi.st.st_mode)](fi)
      inoidx = fi.st.st_dev, fi.st.st_ino
      try:
        vino = me._inomap[inoidx]
      except KeyError:
        ## Derive a fresh virtual inode from the name's CRC32, perturbing it
        ## until it doesn't collide with one already issued.
        suffix = ''
        seq = 0
        while True:
          vino = '%08x' % (Z.crc32(fi.name + suffix) & 0xffffffff)
          if vino not in me._vinomap: break
          suffix = '\0%d' % seq
          seq += 1
        ## Fixed: record the issued vino in `_vinomap' -- it was never
        ## populated, so the collision check above could never fire.
        me._inomap[inoidx] = vino
        me._vinomap[vino] = inoidx
    if h: info = h
    else: info = '[%-*s]' % (2*me._hsz - 2, fmt.info())
    print('%s %8s %6s %-12s %-20s %20s %s' %
          (info, vino, fmt.mode(), fmt.owner(),
           fmt.mtime(), fmt.size(), fmt.name()))

###--------------------------------------------------------------------------
### Main program.

## Map from the `-f' FORMAT name to a function enumerating stdin in that
## format.
FMTMAP = {
  'rsync': lambda f: enum_rsync(stdin, f),
  'find0': lambda f: enum_find0(stdin, f)
}

op = OP.OptionParser(
  usage = '%prog [-a] [-c CACHE] [-f FORMAT] [-H HASH] [FILE ...]',
  version = '%%prog, version %s' % VERSION,
  description = '''\
Print a digest of a filesystem (or a collection of specified files) to
standard output.  The idea is that the digest should be mostly /complete/
(i.e., any `interesting\' change to the filesystem results in a different
digest) and /canonical/ (i.e., identical filesystem contents result in
identical output).
''')
for short, long, props in [
  ('-a', '--all', { 'action': 'store_true', 'dest': 'all',
                    'help': 'clear cache of all files not seen' }),
  ('-c', '--cache', { 'dest': 'cache', 'metavar': 'FILE',
                      'help': 'use FILE as a cache for file hashes' }),
  ('-f', '--files', { 'dest': 'files', 'metavar': 'FORMAT',
                      'type': 'choice', 'choices': FMTMAP.keys(),
                      'help': 'read files to report in the given FORMAT' }),
  ('-H', '--hash', { 'dest': 'hash', 'metavar': 'HASH',
                     ##'type': 'choice', 'choices': H.algorithms,
                     'help': 'use HASH as the hash function' })]:
  op.add_option(short, long, **props)
opts, args = op.parse_args(argv)

## Check that we have something to do: either a stdin format or explicit
## file arguments (args[0] is the program name).
if not opts.files and len(args) <= 1:
  die("no filename sources: nothing to do")
db = HashCache(opts.cache, opts.hash)
if opts.all:
  db.reset()
rep = Reporter(db)
if opts.files:
  FMTMAP[opts.files](rep.file)
for dir in args[1:]:
  enum_walk(dir, rep.file)
if opts.all:
  db.prune()
db.flush()

###----- That's all, folks --------------------------------------------------