3 ### Efficiently construct canonical digests of filesystems
5 ### (c) 2012 Mark Wooding
8 ###----- Licensing notice ---------------------------------------------------
10 ### This file is part of the `rsync-backup' program.
12 ### rsync-backup is free software; you can redistribute it and/or modify
13 ### it under the terms of the GNU General Public License as published by
14 ### the Free Software Foundation; either version 2 of the License, or
15 ### (at your option) any later version.
17 ### rsync-backup is distributed in the hope that it will be useful,
18 ### but WITHOUT ANY WARRANTY; without even the implied warranty of
19 ### MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 ### GNU General Public License for more details.
22 ### You should have received a copy of the GNU General Public License
23 ### along with rsync-backup; if not, write to the Free Software Foundation,
24 ### Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
26 from sys import argv, exit, stdin, stdout, stderr
## Package identity, reported by the `--version' option below.
36 PACKAGE = 'rsync-backup'
37 VERSION = '0.99.1-8-ga844'
39 ###--------------------------------------------------------------------------
## Error reporting.
## Program name (basename of argv[0]), used to prefix diagnostics.
42 QUIS = OS.path.basename(argv[0])
## NOTE(review): this `stderr' write sits inside an error-reporting helper
## (`moan'/`die'-style) whose `def' line is missing from this excerpt; it
## emits a `PROG: MESSAGE' diagnostic.
45 stderr.write('%s: %s\n' % (QUIS, msg))
57 ###--------------------------------------------------------------------------
58 ### File system enumeration.
## A per-file record.  Visible uses elsewhere in this excerpt read
## `fi.name', `fi.st' (an lstat result, or None) and `fi.err' (the error
## that prevented statting, if any) -- the full attribute set is not
## visible here.
60 class FileInfo (object):
61 def __init__(me, file, st = None):
## No stat supplied by the caller: look the file up ourselves.  `lstat'
## so that symbolic links are reported as themselves, not their targets.
68 me.st = OS.lstat(file)
74 def enum_walk(file, func):
## Walk the filesystem rooted at FILE, calling FUNC on each file found.
## (Interior lines of this function are missing from this excerpt; the
## comments below describe only what is visible.)

## Directory-listing helper: report a failure but don't abort the walk.
78 return OS.listdir(name)
80 syserr("failed to read directory `%s': %s" % (name, err.strerror))

## Partition entries: skip things on a different device (i.e. don't cross
## filesystem boundaries), and queue subdirectories for recursion.
88 if fi.st and fi.st.st_dev != dev: pass
89 if fi.st and ST.S_ISDIR(fi.st.st_mode): dd.append(fi)

## Sort for a canonical, reproducible output order.  Directories are
## keyed with a trailing `/' -- presumably so names sort the same way a
## recursive listing would; confirm against the full source.
91 ff.sort(key = lambda fi: fi.name)
92 dd.sort(key = lambda fi: fi.name + '/')

## Recurse into subdirectories, but only while still on the same device.
96 if d.st.st_dev == dev:
98 dir([OS.path.join(d.name, e) for e in dirents(d.name)], dev)

## A trailing `/' on FILE means enumerate the directory's contents:
## remember the current directory (so we can return to it -- the chdir
## itself is not visible here) and walk `.', pinned to its device.
100 if file.endswith('/'):
101 cwd = OS.open('.', OS.O_RDONLY)
106 dir(dirents('.'), fi.st.st_dev)

## Otherwise report FILE itself, recursing if it turns out to be a
## directory.
113 if fi.st and ST.S_ISDIR(fi.st.st_mode):
114 dir([OS.path.join(fi.name, e) for e in dirents(fi.name)],
117 def enum_find0(f, func):
## Read NUL-terminated filenames (as produced by `find ... -print0')
## from the stream F, calling FUNC on each.  (Interior lines are missing
## from this excerpt.)

## Split buffered input on NUL bytes; `tail' evidently carries a partial
## name over from the previous read -- TODO confirm against full source.
122 names = (tail + buf).split('\0')

## Input didn't end with a NUL: warn, but carry on.
129 moan("ignored trailing junk after last filename")
## Matches rsync's `\#OOO' escape for an awkward byte, capturing the three
## octal digits.  Compiled VERBOSE, so the spaces in the pattern are not
## significant.
131 RX_RSYNCESC = RX.compile(r'\\ \# ([0-7]{3})', RX.VERBOSE)
132 def enum_rsync(f, func):
## Parse rsync listing output from the stream F, calling FUNC on each
## named file.  (Interior lines are missing from this excerpt.)

134 ## The format is a little fiddly.  Each line consists of PERMS SIZE DATE
135 ## TIME NAME, separated by runs of whitespace, but the NAME starts exactly
136 ## one space character after the TIME and may begin with a space.
137 ## Sequences of the form `\#OOO' where OOO are three octal digits, stand
138 ## for a byte with that value.  Newlines and backslashes which would be
139 ## ambiguous are converted into this form; all other characters are

142 ## We ignore the stat information and retrieve it ourselves, because it's
143 ## incomplete.  Hopefully the dcache is still warm.

## Strip the newline terminator, if present.
146 if line.endswith('\n'): line = line[:-1]

148 ## Extract the escaped name.
## Split off the first three whitespace-separated fields (PERMS SIZE
## DATE); the remainder holds TIME and NAME.
149 ff = line.split(None, 3)
151 syserr("ignoring invalid line from rsync: `%s'" % line)
## NAME begins exactly one space after TIME, so find that first space
## rather than splitting on whitespace runs.
155 spc = tail.index(' ')
157 syserr("ignoring invalid line from rsync: `%s'" % line)
159 name = tail[spc + 1:]

161 ## Now translate escape sequences.
162 name = RX_RSYNCESC.sub(lambda m: chr(int(m.group(1), 8)), name)

## Report, but don't abort on, files we fail to stat ourselves.
168 syserr("failed to stat `%s': %s" % (name, err.strerror))
172 ###--------------------------------------------------------------------------
## Persistent cache of file hashes, keyed by inode number and stored in a
## SQLite database.  (Interior lines of this class are missing from this
## excerpt; comments describe only what is visible.)
175 class HashCache (object):

## Schema: a one-row `meta' table recording the cache format version (and,
## visibly below, the hash function in use), ...
181 """CREATE TABLE meta (
182 version INTEGER NOT NULL,
## ... and a `hash' table keyed by inode, remembering enough stat detail
## (mtime/ctime/size) to detect when a file has changed, plus a `seen'
## flag used below for mark-and-sweep expiry of stale entries.
185 """CREATE TABLE hash (
186 ino INTEGER PRIMARY KEY,
187 mtime INTEGER NOT NULL,
188 ctime INTEGER NOT NULL,
189 size INTEGER NOT NULL,
191 seen BOOLEAN NOT NULL DEFAULT TRUE
## Write-ahead logging for better concurrent read/write behaviour.
193 """PRAGMA journal_mode = WAL;"""

196 def __init__(me, file, hash = None):
## Open the cache database FILE, creating or upgrading it as necessary;
## HASH names the hash function to use.  (Interior lines missing.)

## No cache file named: operate without a cache, in which case a hash
## function must have been given explicitly.
200 ## We're going it alone, with no cache.
203 die("no hash specified and no database cache to read from")

206 ## Connect to the database.
207 db = DB.connect(file)
## Return plain byte strings from queries, not unicode (Python 2).
208 db.text_factory = str

210 ## See whether we can understand the cache database.
214 c.execute('SELECT version, hash FROM meta')
## `meta' must contain exactly one row.
## NOTE(review): the message below misspells `multiple' -- a runtime
## string, so left untouched here; fix it in a behavioural change.
216 if c.fetchone() is not None:
217 die("cache database corrupt: meta table has mutliple rows")
218 except (DB.Error, TypeError):

221 ## If that didn't work, we'd better clear the thing and start again.
222 ## But only if we know how to initialize it.

225 ## Explain the situation.
226 moan("cache version %s not understood" % v)
229 die("can't initialize cache: no hash function set")
235 die("unknown hash function `%s'" % hash)

## Drop every object in the database before rebuilding the schema.
## NOTE(review): string-built SQL -- acceptable only because the names
## come from sqlite_master of a database we're about to wipe anyway.
238 c.execute('SELECT type, name FROM sqlite_master')
239 for type, name in c.fetchall():
240 c.execute('DROP %s IF EXISTS %s' % (type, name))

242 ## Now we're ready to go.
245 c.execute('INSERT INTO meta VALUES (?, ?)', [me.VERSION, hash])

248 ## Check the hash function if necessary.
251 elif h is not None and h != hash:
252 die("hash mismatch: cache uses %s but %s requested" % (h, hash))

259 def hashfile(me, fi):
## Return the hash of the file described by FI, consulting and updating
## the cache.  (Interior lines missing.)

261 ## If this isn't a proper file then don't try to hash it.
262 if fi.err or not ST.S_ISREG(fi.st.st_mode):

265 ## See whether there's a valid entry in the cache.
269 'SELECT mtime, size, hash, seen FROM hash WHERE ino = ?;',
## A cache hit is valid if the stat details still match.
## NOTE(review): the schema stores ctime but this query doesn't fetch
## it, and only the mtime comparison is visible here -- confirm against
## the full source whether ctime is checked at all.
274 if mt == fi.st.st_mtime and \
277 c.execute('UPDATE hash SET seen = 1 WHERE ino = ?',

282 ## Hash the file.  Beware raciness: update the file information from the
283 ## open descriptor, but set the size from what we actually read.
286 with open(fi.name, 'rb') as f:
## Read in BUFSZ chunks so huge files don't balloon memory.
289 buf = f.read(me.BUFSZ)
294 fi.st = OS.fstat(f.fileno())
## Python 2 exception syntax; report read failures without aborting.
297 except (OSError, IOError), err:
## Python 2 idiom: hex-encode the binary digest.
301 hash = hash.encode('hex')

303 ## Insert a record into the database.
306 INSERT OR REPLACE INTO hash
307 (ino, mtime, ctime, size, hash, seen)

## Mark-and-sweep maintenance (enclosing method `def' lines are missing
## from this excerpt): these operations require a cache database ...
332 die("no cache database")
## ... clear every `seen' flag before a run ...
337 c.execute('UPDATE hash SET seen = 0 WHERE seen')
## ... and afterwards drop entries for files which no longer exist.
343 c.execute('DELETE FROM hash WHERE NOT seen')
346 ###--------------------------------------------------------------------------
## Base class turning a FileInfo into the report's display columns; the
## subclasses below specialize per file type.  (Interior lines are
## missing from this excerpt.)
349 class GenericFormatter (object):
350 def __init__(me, fi):

## Render a timestamp in a fixed ISO-8601-style form -- the `Z' suffix
## suggests UTC, but the conversion of T to TM is not visible; confirm.
352 def _fmt_time(me, t):
354 return T.strftime('%Y-%m-%dT%H:%M:%SZ', tm)

## Escape a filename for display (Python 2 `string_escape'), and turn
## any literal ` -> ' into ` \-> ' so it can't be mistaken for the
## symlink arrow used by LinkFormatter below.
355 def _enc_name(me, n):
356 return ' \\-> '.join(n.encode('string_escape').split(' -> '))

## Display columns (the `def' lines for these accessors are missing):
## escaped name, zero-padded octal mode, size, mtime, and `uid:gid'.
358 return me._enc_name(me.fi.name)
362 return '%06o' % me.fi.st.st_mode
364 return me.fi.st.st_size
366 return me._fmt_time(me.fi.st.st_mtime)
368 return '%5d:%d' % (me.fi.st.st_uid, me.fi.st.st_gid)
## Formatter for files we failed to stat: shows the errno and message,
## and collapses every other column to the placeholder `error'.
370 class ErrorFormatter (GenericFormatter):
## (The `def' line for this accessor is missing from this excerpt.)
372 return 'E%d %s' % (me.fi.err.errno, me.fi.err.strerror)
373 def error(me): return 'error'
## All the stat-derived columns alias the `error' placeholder.
374 mode = size = mtime = owner = error
## Trivial per-type formatters; their bodies (presumably just a TYPE
## attribute, as for the device formatters below) are missing from this
## excerpt.
376 class SocketFormatter (GenericFormatter):
378 class PipeFormatter (GenericFormatter):
## Formatter for symbolic links: shows `NAME -> TARGET', falling back to
## `NAME -> <Eerrno message>' if the link can't be read.
381 class LinkFormatter (GenericFormatter):
382 TYPE = 'symbolic-link'
## (The `def name(me):' line and the readlink error handler's `except'
## line are missing from this excerpt.)
384 n = GenericFormatter.name(me)
386 d = OS.readlink(me.fi.name)
## Escape the target the same way as names, so the arrow stays
## unambiguous.
387 return '%s -> %s' % (n, me._enc_name(d))
389 return '%s -> <E%d %s>' % (n, err.errno, err.strerror)
## Formatter for directories: a trailing `/' on the name, and `dir' in
## place of a meaningless size.
391 class DirectoryFormatter (GenericFormatter):
393 def name(me): return GenericFormatter.name(me) + '/'
394 def size(me): return 'dir'
## Common formatter for device nodes: shows the subclass's TYPE label and
## the major:minor device numbers.  (The accessor's `def' line is missing
## from this excerpt.)
396 class DeviceFormatter (GenericFormatter):
398 return '%s %d:%d' % (me.TYPE,
399 OS.major(me.fi.st.st_rdev),
400 OS.minor(me.fi.st.st_rdev))
## Block special files: inherits the `TYPE major:minor' rendering.
401 class BlockDeviceFormatter (DeviceFormatter):
402 TYPE = 'block-device'
## Character special files: inherits the `TYPE major:minor' rendering.
403 class CharDeviceFormatter (DeviceFormatter):
404 TYPE = 'character-device'
## Regular files.  (Any further body -- e.g. an `info' accessor -- is
## missing from this excerpt.)
406 class FileFormatter (GenericFormatter):
407 TYPE = 'regular-file'
## Drives the report: hashes each file via the cache and prints one
## formatted line per file.  (Interior lines are missing from this
## excerpt.)
409 class Reporter (object):

## Dispatch table from stat file-type bits to the formatter class above.
412 ST.S_IFSOCK: SocketFormatter,
413 ST.S_IFDIR: DirectoryFormatter,
414 ST.S_IFLNK: LinkFormatter,
415 ST.S_IFREG: FileFormatter,
416 ST.S_IFBLK: BlockDeviceFormatter,
417 ST.S_IFCHR: CharDeviceFormatter,
418 ST.S_IFIFO: PipeFormatter,

421 def __init__(me, db):
## Remember the hash's digest size so non-hash info can be padded to the
## same column width (see the `2*me._hsz - 2' below).
425 me._hsz = int(H.new(db.hash).digest_size)

## Report one file (enclosing `def' line missing from this excerpt).
428 h = me._db.hashfile(fi)
## Stat failure: fall back to the error formatter, else dispatch on the
## file-type bits.
430 fmt = ErrorFormatter(fi)
433 fmt = me.TYMAP[ST.S_IFMT(fi.st.st_mode)](fi)

## Map the real (device, inode) pair to a stable "virtual inode" string,
## so hard links are still recognizable without exposing raw inode
## numbers.  On a CRC collision, retry with a `\0SEQ' suffix appended to
## the name until the value is unused.
434 inoidx = fi.st.st_dev, fi.st.st_ino
436 vino = me._inomap[inoidx]
441 vino = '%08x' % (Z.crc32(fi.name + suffix) & 0xffffffff)
442 if vino not in me._vinomap: break
443 suffix = '\0%d' % seq
445 me._inomap[inoidx] = vino

## No hash available: show the formatter's info string, bracketed and
## padded to the width of a hex digest.
447 else: info = '[%-*s]' % (2*me._hsz - 2, fmt.info())
## Emit the report line (Python 2 `print' statement).
448 print '%s %8s %6s %-12s %-20s %20s %s' % (
449 info, vino, fmt.mode(), fmt.owner(),
450 fmt.mtime(), fmt.size(), fmt.name())
452 ###--------------------------------------------------------------------------
## Main program (interior lines are missing from this excerpt).

## Input-format dispatch: where to get the list of files from when `-f'
## is given.  Both read from standard input.
456 'rsync': lambda f: enum_rsync(stdin, f),
457 'find0': lambda f: enum_find0(stdin, f)

## Command-line interface.
459 op = OP.OptionParser(
460 usage = '%prog [-a] [-c CACHE] [-f FORMAT] [-H HASH] [FILE ...]',
461 version = '%%prog, version %s' % VERSION,
463 Print a digest of a filesystem (or a collection of specified files) to
464 standard output.  The idea is that the digest should be mostly /complete/
465 (i.e., any `interesting\' change to the filesystem results in a different
466 digest) and /canonical/ (i.e., identical filesystem contents result in

## Register the options from a table of (short, long, keyword) triples.
470 for short, long, props in [
471 ('-a', '--all', { 'action': 'store_true', 'dest': 'all',
472 'help': 'clear cache of all files not seen' }),
473 ('-c', '--cache', { 'dest': 'cache', 'metavar': 'FILE',
474 'help': 'use FILE as a cache for file hashes' }),
475 ('-f', '--files', { 'dest': 'files', 'metavar': 'FORMAT',
476 'type': 'choice', 'choices': FMTMAP.keys(),
477 'help': 'read files to report in the given FORMAT' }),
478 ('-H', '--hash', { 'dest': 'hash', 'metavar': 'HASH',
## NOTE(review): hash-name validation is deliberately disabled here;
## HashCache.__init__ rejects unknown hash functions instead.
479 ##'type': 'choice', 'choices': H.algorithms,
480 'help': 'use HASH as the hash function' })]:
481 op.add_option(short, long, **props)
## argv is passed whole, so args[0] is the program name -- hence the
## `<= 1' test below.
482 opts, args = op.parse_args(argv)

## Without `-f' we need at least one FILE argument to walk.
484 if not opts.files and len(args) <= 1:
485 die("no filename sources: nothing to do")

## Open (or create) the hash cache and run: either pull filenames from
## stdin in the chosen format, or walk each named tree directly.
486 db = HashCache(opts.cache, opts.hash)
491 FMTMAP[opts.files](rep.file)
493 enum_walk(dir, rep.file)
498 ###----- That's all, folks --------------------------------------------------