### Efficiently construct canonical digests of filesystems
###
### (c) 2012 Mark Wooding
###

###----- Licensing notice ---------------------------------------------------
###
### This file is part of the `rsync-backup' program.
###
### rsync-backup is free software; you can redistribute it and/or modify
### it under the terms of the GNU General Public License as published by
### the Free Software Foundation; either version 2 of the License, or
### (at your option) any later version.
###
### rsync-backup is distributed in the hope that it will be useful,
### but WITHOUT ANY WARRANTY; without even the implied warranty of
### MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
### GNU General Public License for more details.
###
### You should have received a copy of the GNU General Public License
### along with rsync-backup; if not, write to the Free Software Foundation,
### Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
from sys import argv, exc_info, exit, stdin, stdout, stderr
import binascii as B
import errno as E
import hashlib as H
import optparse as OP
import os as OS
import re as RX
import sqlite3 as DB
import stat as ST
import sys as _SYS
import time as T
import zlib as Z

###--------------------------------------------------------------------------
_PYVER = _SYS.version_info
if _PYVER >= (3,):
  _FSENC = _SYS.getfilesystemencoding()
  if _PYVER >= (3, 1): _FSENCERR = "surrogateescape"
  else: _FSENCERR = "strict"
  from io import BytesIO, StringIO
  def bin(x): return x.encode(_FSENC, _FSENCERR)
  def text(x): return x.decode(_FSENC, _FSENCERR)
  def bytechr(x): return bytes([x])
  def byteord(x): return x
else:
  from cStringIO import StringIO; BytesIO = StringIO
  def bin(x): return x
  def text(x): return x
  def bytechr(x): return chr(x)
  def byteord(x): return ord(x)
def excval(): return exc_info()[1]
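### As an illustration (not part of the program proper): on a UTF-8 system
### under Python 3 the helpers above behave like this -- the exact bytes
### depend on the filesystem encoding, so treat the values as examples only.
###
###     bin('caf\xe9')         # => b'caf\xc3\xa9'
###     text(b'caf\xc3\xa9')   # => 'caf\xe9'
###     text(b'\xff')          # => '\udcff'  (surrogate-escaped)
###     bin(text(b'\xff'))     # => b'\xff'   (undecodable bytes round-trip)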
QUIS = OS.path.basename(argv[0])

def moan(msg):
  stderr.write('%s: %s\n' % (QUIS, msg))

def die(msg, rc = 1):
  moan(msg)
  exit(rc)

def syserr(msg):
  moan(msg)
def escapify(x):
  out = StringIO()
  for ch in bin(x):
    k = byteord(ch)
    if k == 9: out.write("\\t")
    elif k == 10: out.write("\\n")
    elif k == 13: out.write("\\r")
    elif k == 39: out.write("\\'")
    elif k == 92: out.write("\\\\")
    elif 32 <= k <= 126: out.write(chr(k))
    else: out.write("\\x%02x" % k)
  return out.getvalue()
R_STRESC = RX.compile(r"\\ (?: x ([0-9A-Fa-f]{2}) | (.))",
                      RX.VERBOSE)
def unescapify(x):
  out = BytesIO()
  i = 0
  while True:
    m = R_STRESC.search(x, i)
    if m is not None: j = m.start(0)
    else: j = len(x)
    out.write(bin(x[i:j]))
    if m is None: break
    k, e = m.group(1), m.group(2)
    if k is not None: ch = int(k, 16)
    elif e == "a": ch = 7
    elif e == "b": ch = 8
    elif e == "f": ch = 12
    elif e == "n": ch = 10
    elif e == "r": ch = 13
    elif e == "t": ch = 9
    elif e == "v": ch = 11
    else: ch = byteord(e)
    out.write(bytechr(ch))
    i = m.end(0)
  return text(out.getvalue())
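### A rough sketch of the escaping in action (assuming a UTF-8 filesystem
### encoding; the exact escapes depend on the name's bytes), where
### `unescapify' is intended to undo `escapify':
###
###     escapify('tab\there')      # => 'tab\\there'
###     escapify('na\xefve')       # => 'na\\xc3\\xafve'
###     unescapify('tab\\there')   # => 'tab\there'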
###--------------------------------------------------------------------------
### File system enumeration.
class FileInfo (object):
  def __init__(me, file, st = None):
    me.name = file
    me.err = None
    try: me.st = st or OS.lstat(file)
    except OSError: me.st, me.err = None, excval()
def enum_walk(file, func):

  def dirents(name):
    try: return OS.listdir(name)
    except OSError:
      syserr("failed to read directory `%s': %s" % (name, excval().strerror))
      return []

  def dir(ee, dev):
    ff, dd = [], []
    for e in ee:
      fi = FileInfo(e)
      if fi.st and fi.st.st_dev != dev: pass
      elif fi.st and ST.S_ISDIR(fi.st.st_mode): dd.append(fi)
      else: ff.append(fi)
    ff.sort(key = lambda fi: fi.name)
    dd.sort(key = lambda fi: fi.name + '/')
    for f in ff: func(f)
    for d in dd:
      if d.st.st_dev == dev:
        func(d)
        dir([OS.path.join(d.name, e) for e in dirents(d.name)], dev)

  if file.endswith('/'):
    cwd = OS.open('.', OS.O_RDONLY)
    try:
      OS.chdir(file)
      fi = FileInfo('.'); func(fi)
      dir(dirents('.'), fi.st.st_dev)
    finally:
      OS.fchdir(cwd); OS.close(cwd)
  else:
    fi = FileInfo(file); func(fi)
    if fi.st and ST.S_ISDIR(fi.st.st_mode):
      dir([OS.path.join(fi.name, e) for e in dirents(fi.name)],
          fi.st.st_dev)
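### Reading the walker above (illustrative only): a trailing `/' makes it
### chdir into the directory and report entries by their relative names,
### with the top directory itself appearing as `.'; a plain path is reported
### under its given name.  Either way the walk declines to cross onto a
### different device.
###
###     enum_walk('/etc/', func)   # reports `.', `passwd', `ssh/', ...
###     enum_walk('/etc', func)    # reports `/etc/', `/etc/passwd', ...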
def enum_find0(f, func):
  tail = ""
  while True:
    buf = f.read(8192)
    names = (tail + buf).split('\0')
    tail = names.pop()
    for name in names: func(FileInfo(name))
    if not buf: break
  if tail:
    moan("ignored trailing junk after last filename")
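### The NUL-terminated format is what `find -print0' produces, so a typical
### pipeline might look something like this (the paths and hash choice are
### illustrative):
###
###     find /etc -print0 | fshash -f find0 -c /var/cache/fshash.db -H sha256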
R_RSYNCESC = RX.compile(r'\\ \# ([0-7]{3})', RX.VERBOSE)
def enum_rsync(f, func):

  ## The format is a little fiddly.  Each line consists of PERMS SIZE DATE
  ## TIME NAME, separated by runs of whitespace, but the NAME starts exactly
  ## one space character after the TIME and may begin with a space.
  ## Sequences of the form `\#OOO', where OOO are three octal digits, stand
  ## for a byte with that value.  Newlines, and backslashes which would be
  ## ambiguous, are converted into this form; all other characters are
  ## left alone.
  ##
  ## We ignore the stat information and retrieve it ourselves, because it's
  ## incomplete.  Hopefully the dcache is still warm.

  for line in f:
    if line.endswith('\n'): line = line[:-1]

    ## Extract the escaped name.
    ff = line.split(None, 3)
    if len(ff) != 4:
      syserr("ignoring invalid line from rsync: `%s'" % line)
      continue
    tail = ff[3]
    try:
      spc = tail.index(' ')
    except ValueError:
      syserr("ignoring invalid line from rsync: `%s'" % line)
      continue
    name = tail[spc + 1:]

    ## Now translate escape sequences.
    name = R_RSYNCESC.sub(lambda m: chr(int(m.group(1), 8)), name)

    ## Stat the file ourselves and report it.
    fi = FileInfo(name)
    if fi.err is not None:
      syserr("failed to stat `%s': %s" % (name, fi.err.strerror))
      continue
    func(fi)
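### For example, an rsync listing line such as
###
###     -rw-r--r--          1,234 2012/03/04 05:06:07 notes\#012draft.txt
###
### names a file whose name really contains a newline: `\#012' decodes to
### byte 10 (octal 012).  The values above are made up; the only property
### relied on is that NAME begins exactly one space after TIME.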
###--------------------------------------------------------------------------
### Hash caching.

class HashCache (object):

  VERSION = 0
  BUFSZ = 128*1024

  INIT = [
    """CREATE TABLE meta (
        version INTEGER NOT NULL,
        hash TEXT NOT NULL
    );""",
    """CREATE TABLE hash (
        ino INTEGER PRIMARY KEY,
        mtime INTEGER NOT NULL,
        ctime INTEGER NOT NULL,
        size INTEGER NOT NULL,
        hash TEXT NOT NULL,
        seen BOOLEAN NOT NULL DEFAULT TRUE
    );""",
    """PRAGMA journal_mode = WAL;"""
  ]

  def __init__(me, file, hash = None):

    if file is None:

      ## We're going this alone, with no cache.
      db = None
      if hash is None:
        die("no hash specified and no database cache to read from")

    else:

      ## Connect to the database.
      db = DB.connect(file)
      db.text_factory = str

      ## See whether we can understand the cache database.
      c = db.cursor()
      v = h = None
      try:
        c.execute('SELECT version, hash FROM meta')
        v, h = c.fetchone()
        if c.fetchone() is not None:
          die("cache database corrupt: meta table has multiple rows")
      except (DB.Error, TypeError):
        pass

      ## If that didn't work, we'd better clear the thing and start again.
      ## But only if we know how to initialize it.
      if v != me.VERSION:

        ## Explain the situation.
        moan("cache version %s not understood" % v)
        if hash is None: hash = h
        if hash is None:
          die("can't initialize cache: no hash function set")
        try: H.new(hash)
        except ValueError:
          die("unknown hash function `%s'" % hash)

        ## Drop whatever's there already.
        c.execute('SELECT type, name FROM sqlite_master')
        for type, name in c.fetchall():
          c.execute('DROP %s IF EXISTS %s' % (type, name))

        ## Now we're ready to go.
        for stmt in me.INIT: c.execute(stmt)
        c.execute('INSERT INTO meta VALUES (?, ?)', [me.VERSION, hash])
        db.commit()

      ## Check the hash function if necessary.
      elif hash is None:
        hash = h
      elif h is not None and h != hash:
        die("hash mismatch: cache uses %s but %s requested" % (h, hash))

    ## All set.
    me.hash = hash
    me._db = db
  def hashfile(me, fi):

    ## If this isn't a proper file then don't try to hash it.
    if fi.err or not ST.S_ISREG(fi.st.st_mode):
      return None

    ## See whether there's a valid entry in the cache.
    if me._db:
      c = me._db.cursor()
      c.execute(
        'SELECT mtime, size, hash, seen FROM hash WHERE ino = ?;',
        [fi.st.st_ino])
      r = c.fetchone()
      if r is not None:
        mt, sz, h, s = r
        if mt == fi.st.st_mtime and \
           sz == fi.st.st_size:
          if not s:
            c.execute('UPDATE hash SET seen = 1 WHERE ino = ?',
                      [fi.st.st_ino])
            me._db.commit()
          return h

    ## Hash the file.  Beware raciness: update the file information from the
    ## open descriptor, but set the size from what we actually read.
    try:
      h = H.new(me.hash); sz = 0
      with open(fi.name, 'rb') as f:
        while True:
          buf = f.read(me.BUFSZ)
          if not buf: break
          h.update(buf); sz += len(buf)
        fi.st = OS.fstat(f.fileno())
    except (OSError, IOError):
      fi.st = None; fi.err = excval()
      return None
    hash = h.digest()
    hash = text(B.hexlify(hash))

    ## Insert a record into the database.
    if me._db:
      c.execute("""
        INSERT OR REPLACE INTO hash
          (ino, mtime, ctime, size, hash, seen)
        VALUES (?, ?, ?, ?, ?, 1);
      """, [fi.st.st_ino, fi.st.st_mtime, fi.st.st_ctime, sz, hash])
      me._db.commit()
    return hash
  def need_db(me):
    if not me._db:
      die("no cache database")

  def forget(me, ino):
    me.need_db()
    c = me._db.cursor()
    c.execute('DELETE FROM hash WHERE ino = ?', [ino])
    me._db.commit()

  def reset(me):
    me.need_db()
    c = me._db.cursor()
    c.execute('UPDATE hash SET seen = 0 WHERE seen')
    me._db.commit()

  def prune(me):
    me.need_db()
    c = me._db.cursor()
    c.execute('DELETE FROM hash WHERE NOT seen')
    me._db.commit()
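### A sketch of a typical round trip through the cache, using the methods
### defined above (the file name and hash choice are illustrative):
###
###     db = HashCache('cache.db', 'sha256')
###     db.reset()              # mark every entry as not yet seen
###     h = db.hashfile(fi)     # hex digest: from the cache if the inode,
###                             # mtime and size still match, else recomputed
###                             # and stored
###     db.prune()              # discard entries that were never seen again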
###--------------------------------------------------------------------------
### Reporting.

class GenericFormatter (object):
  def __init__(me, fi):
    me.fi = fi
  def _fmt_time(me, t):
    tm = T.gmtime(t)
    return T.strftime('%Y-%m-%dT%H:%M:%SZ', tm)
  def _enc_name(me, n):
    return ' \\-> '.join(escapify(n).split(' -> '))
  def name(me):
    return me._enc_name(me.fi.name)
  def info(me):
    return me.TYPE
  def mode(me):
    return '%06o' % me.fi.st.st_mode
  def size(me):
    return me.fi.st.st_size
  def mtime(me):
    return me._fmt_time(me.fi.st.st_mtime)
  def owner(me):
    return '%5d:%d' % (me.fi.st.st_uid, me.fi.st.st_gid)

class ErrorFormatter (GenericFormatter):
  def info(me):
    return 'E%d %s' % (me.fi.err.errno, me.fi.err.strerror)
  def error(me): return 'error'
  mode = size = mtime = owner = error

class SocketFormatter (GenericFormatter):
  TYPE = 'socket'
class PipeFormatter (GenericFormatter):
  TYPE = 'fifo'

class LinkFormatter (GenericFormatter):
  TYPE = 'symbolic-link'
  def name(me):
    n = GenericFormatter.name(me)
    try:
      d = OS.readlink(me.fi.name)
      return '%s -> %s' % (n, me._enc_name(d))
    except OSError:
      err = excval()
      return '%s -> <E%d %s>' % (n, err.errno, err.strerror)

class DirectoryFormatter (GenericFormatter):
  TYPE = 'directory'
  def name(me): return GenericFormatter.name(me) + '/'
  def size(me): return 'dir'

class DeviceFormatter (GenericFormatter):
  def info(me):
    return '%s %d:%d' % (me.TYPE,
                         OS.major(me.fi.st.st_rdev),
                         OS.minor(me.fi.st.st_rdev))
class BlockDeviceFormatter (DeviceFormatter):
  TYPE = 'block-device'
class CharDeviceFormatter (DeviceFormatter):
  TYPE = 'character-device'

class FileFormatter (GenericFormatter):
  TYPE = 'regular-file'
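### As a reading of the formatters above: a regular file with mode 0644,
### owner 1000:1000, and a timestamp in March 2012 comes out with mode
### `100644', owner ` 1000:1000', and mtime `2012-03-04T05:06:07Z' (times
### are rendered in UTC with a trailing `Z').  Illustrative values only.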
class Reporter (object):

  TYMAP = {
    ST.S_IFSOCK: SocketFormatter,
    ST.S_IFDIR: DirectoryFormatter,
    ST.S_IFLNK: LinkFormatter,
    ST.S_IFREG: FileFormatter,
    ST.S_IFBLK: BlockDeviceFormatter,
    ST.S_IFCHR: CharDeviceFormatter,
    ST.S_IFIFO: PipeFormatter,
  }

  def __init__(me, db):
    me._inomap = {}
    me._vinomap = {}
    me._db = db
    me._hsz = int(H.new(db.hash).digest_size)

  def file(me, fi):
    h = me._db.hashfile(fi)
    if fi.err:
      fmt = ErrorFormatter(fi)
      vino = 'error'
    else:
      fmt = me.TYMAP[ST.S_IFMT(fi.st.st_mode)](fi)
      inoidx = fi.st.st_dev, fi.st.st_ino
      try:
        vino = me._inomap[inoidx]
      except KeyError:
        suffix = ''
        seq = 0
        while True:
          vino = '%08x' % (Z.crc32(bin(fi.name + suffix)) & 0xffffffff)
          if vino not in me._vinomap: break
          suffix = '\0%d' % seq
          seq += 1
        me._inomap[inoidx] = vino
        if OPTS.compat >= 2: me._vinomap[vino] = inoidx
    if h: info = h
    else: info = '[%-*s]' % (2*me._hsz - 2, fmt.info())
    print('%s %8s %6s %-12s %-20s %20s %s' %
          (info, vino, fmt.mode(), fmt.owner(),
           fmt.mtime(), fmt.size(), fmt.name()))
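### Each line of the report is, in order: the file's hex hash (or a
### bracketed `[TYPE]' note for things which can't be hashed), a virtual
### inode number, then mode, owner, mtime, size and name, e.g. (made up,
### with the hash shortened for display):
###
###     d131dd02c5e6eec4 00a1b2c3 100644  1000:1000 2012-03-04T05:06:07Z     1234 notes.txt
###
### The virtual inode is a CRC-32 of the name, tweaked until it is unique,
### so hard links share a value without leaking real inode numbers.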
###--------------------------------------------------------------------------
### Database clearing from diff files.

R_HUNK = RX.compile(r'^@@ -\d+,(\d+) \+\d+,(\d+) @@$')
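### For example, the hunk header `@@ -12,7 +12,8 @@' matches with groups
### ('7', '8'): seven lines from the old report and eight from the new one
### follow.  (Headers carrying function context after the final `@@', as
### produced by `diff -p', would not match.)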
def clear_entry(db, lno, line):

  good = True

  ## Parse the line into fields.
  if line.startswith('['):
    pos = line.find(']')
    if pos < 0:
      moan("failed to parse file entry (type field; line %d)" % lno)
      return False
    ty = line[1:pos].strip()
    rest = line[pos + 1:]
  else:
    ff = line.split(None, 1)
    if len(ff) != 2:
      moan("failed to parse file entry (field split; line %d)" % lno)
      return False
    ty, rest = 'regular-file', ff[1]
  ff = rest.split(None, 5)
  if len(ff) != 6:
    moan("failed to parse file entry (field split; line %d)" % lno)
    return False
  ino, mode, uidgid, mtime, sz, name = ff

  ## Symbolic links have the target appended to the name.
  if ty != 'symbolic-link': target = None
  else:
    nn = name.split(' -> ', 1)
    if len(nn) != 2:
      moan("failed to parse file entry (name split; line %d)" % lno)
      return False
    name, target = nn
    target = unescapify(target)
  name = unescapify(name)

  ## Stat the file and clear its cache entry.
  try:
    st = OS.lstat(name)
  except OSError:
    e = excval()
    moan("failed to stat `%s': %s" % (name, e.strerror))
    if e.errno != E.ENOENT: good = False
  else:
    print("Clear cache entry for `%s'" % name)
    db.forget(st.st_ino)
  return good
def clear_cache(db):

  ## Work through the input diff file one line at a time.
  good = True
  lno = 0
  diffstate = 'gap'
  for line in stdin:
    lno += 1
    if line.endswith('\n'): line = line[:-1]

    ## We're in a gap between hunks.  Find a hunk header and extract the line
    ## counts.
    if diffstate == 'gap':
      m = R_HUNK.match(line)
      if m is not None:
        oldlines = int(m.group(1))
        newlines = int(m.group(2))
        diffstate = 'hunk'
        hdrlno = lno

    ## We're in a hunk.  Keep track of whether we've reached the end, and
    ## discard entries from the cache for mismatching lines.
    elif diffstate == 'hunk':
      if len(line) == 0:
        moan("empty line in diff hunk (line %d)" % lno)
        return False
      elif line[0] == ' ':
        oldlines -= 1; newlines -= 1
      elif line[0] == '-':
        oldlines -= 1
        if not clear_entry(db, lno, line[1:]): good = False
      elif line[0] == '+':
        newlines -= 1
        if not clear_entry(db, lno, line[1:]): good = False
      else:
        moan("incomprehensible line in diff hunk (line %d)" % lno)
        return False
      if oldlines < 0 or newlines < 0:
        moan("inconsistent lengths in diff hunk header (line %d)" % hdrlno)
        return False
      if oldlines == newlines == 0:
        diffstate = 'gap'

  ## Make sure we saw the end of the final hunk.
  if diffstate == 'hunk':
    moan("truncated diff hunk (started at line %d)" % hdrlno)
    return False
  return good
###--------------------------------------------------------------------------
### Main program.

FMTMAP = {
  'rsync': lambda f: enum_rsync(stdin, f),
  'find0': lambda f: enum_find0(stdin, f)
}

op = OP.OptionParser(
  usage = '%prog [-au] [-c CACHE] [-f FORMAT] [-H HASH] [FILE ...]',
  version = '%%prog, version %s' % VERSION,
  description = '''\
Print a digest of a filesystem (or a collection of specified files) to
standard output.  The idea is that the digest should be mostly /complete/
(i.e., any `interesting\' change to the filesystem results in a different
digest) and /canonical/ (i.e., identical filesystem contents result in
the same digest).
''')
for short, long, props in [
  ('-a', '--all', { 'action': 'store_true', 'dest': 'all',
                    'help': 'clear cache of all files not seen' }),
  ('-c', '--cache', { 'dest': 'cache', 'metavar': 'FILE',
                      'help': 'use FILE as a cache for file hashes' }),
  ('-f', '--files', { 'dest': 'files', 'metavar': 'FORMAT',
                      'type': 'choice', 'choices': list(FMTMAP.keys()),
                      'help': 'read files to report in the given FORMAT' }),
  ('-u', '--udiff', { 'action': 'store_true', 'dest': 'udiff',
                      'help': 'read diff from stdin, clear cache entries' }),
  ('-C', '--compat', { 'dest': 'compat', 'metavar': 'VERSION',
                       'type': 'int', 'default': 2,
                       'help': 'produce output with given compatibility VERSION' }),
  ('-H', '--hash', { 'dest': 'hash', 'metavar': 'HASH',
                     ##'type': 'choice', 'choices': H.algorithms,
                     'help': 'use HASH as the hash function' })]:
  op.add_option(short, long, **props)
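### Example invocations (the paths are illustrative; any hash name that
### hashlib understands works for -H):
###
###     fshash -c /var/cache/fshash.db -H sha256 /home/
###     find /etc -print0 | fshash -f find0 -c cache.db -H sha256
###     fshash -u -c /var/cache/fshash.db <report.diff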
OPTS, args = op.parse_args(argv)
if not 1 <= OPTS.compat <= 2:
  die("unknown compatibility version %d" % OPTS.compat)
if OPTS.udiff:
  if OPTS.cache is None or OPTS.all or OPTS.files or len(args) > 2:
    die("incompatible options: `-u' requires `-c CACHE', forbids others")
  db = HashCache(OPTS.cache, OPTS.hash)
  if len(args) == 2: OS.chdir(args[1])
  good = True
  if not clear_cache(db): good = False
  if good: exit(0)
  else: exit(2)
else:
  if not OPTS.files and len(args) <= 1:
    die("no filename sources: nothing to do")
  db = HashCache(OPTS.cache, OPTS.hash)
  if OPTS.all: db.reset()
  print("## fshash report format version %d" % OPTS.compat)
  rep = Reporter(db)
  if OPTS.files:
    FMTMAP[OPTS.files](rep.file)
  for dir in args[1:]:
    enum_walk(dir, rep.file)
  if OPTS.all: db.prune()

###----- That's all, folks --------------------------------------------------