### Efficiently construct canonical digests of filesystems
###
### (c) 2012 Mark Wooding
###

###----- Licensing notice ---------------------------------------------------
###
### This file is part of the `rsync-backup' program.
###
### rsync-backup is free software; you can redistribute it and/or modify
### it under the terms of the GNU General Public License as published by
### the Free Software Foundation; either version 2 of the License, or
### (at your option) any later version.
###
### rsync-backup is distributed in the hope that it will be useful,
### but WITHOUT ANY WARRANTY; without even the implied warranty of
### MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
### GNU General Public License for more details.
###
### You should have received a copy of the GNU General Public License
### along with rsync-backup; if not, write to the Free Software Foundation,
### Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
from sys import argv, exc_info, exit, stdin, stdout, stderr

import binascii as B
import errno as E
import grp as GR
import hashlib as H
import optparse as OP
import os as OS
import pwd as PW
import re as RX
import sqlite3 as DB
import stat as ST
import sys as _SYS
import time as T
import zlib as Z
###--------------------------------------------------------------------------

_PYVER = _SYS.version_info
if _PYVER >= (3,):
  _FSENC = _SYS.getfilesystemencoding()
  if _PYVER >= (3, 1): _FSENCERR = "surrogateescape"
  else: _FSENCERR = "strict"
  from io import BytesIO, StringIO
  def bin(x): return x.encode(_FSENC, _FSENCERR)
  def text(x): return x.decode(_FSENC, _FSENCERR)
  def bytechr(x): return bytes([x])
  def byteord(x): return x
  def iterkeys(x): return x.keys()
else:
  from cStringIO import StringIO; BytesIO = StringIO
  def bin(x): return x
  def text(x): return x
  def bytechr(x): return chr(x)
  def byteord(x): return ord(x)
  def iterkeys(x): return x.iterkeys()
def excval(): return exc_info()[1]
QUIS = OS.path.basename(argv[0])

def moan(msg):
  stderr.write('%s: %s\n' % (QUIS, msg))
def die(msg): moan(msg); exit(1)

## Report a system error, but carry on.
def syserr(msg): moan(msg)

def escapify(x):
  out = StringIO()
  for ch in bin(x):
    k = byteord(ch)
    if k == 9: out.write("\\t")
    elif k == 10: out.write("\\n")
    elif k == 13: out.write("\\r")
    elif k == 39: out.write("\\'")
    elif k == 92: out.write("\\\\")
    elif 32 <= k <= 126: out.write(chr(k))
    else: out.write("\\x%02x" % k)
  return out.getvalue()
R_STRESC = RX.compile(r""" \\ (?: x ([0-9A-Fa-f]{2}) | (.)) """,
                      RX.VERBOSE)
def unescapify(x):
  out = BytesIO()
  i, n = 0, len(x)
  while i < n:
    m = R_STRESC.search(x, i)
    if m is not None: j = m.start(0)
    else: j = n
    out.write(bin(x[i:j]))
    if m is None: break
    k, e = m.group(1), m.group(2)
    if k is not None: ch = int(k, 16)
    elif e == "a": ch = 7
    elif e == "b": ch = 8
    elif e == "f": ch = 12
    elif e == "n": ch = 10
    elif e == "r": ch = 13
    elif e == "t": ch = 9
    elif e == "v": ch = 11
    else: ch = byteord(e)
    out.write(bytechr(ch))
    i = m.end(0)
  return text(out.getvalue())
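
## For example, given the helpers above: a tab becomes the two characters
## `\t', a byte outside printable ASCII becomes a `\xNN' hex escape, and
## unescapify() undoes the transformation again, e.g.
##
##      escapify("a\tb")    == "a\\tb"
##      unescapify("a\\tb") == "a\tb"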
def simple_memo(func):
  ## Remember the result of calling FUNC on each (hashable) argument, so
  ## that repeated user and group lookups don't hit the name service.
  memo = dict()
  def _wrapper(arg):
    try: return memo[arg]
    except KeyError:
      memo[arg] = value = func(arg)
      return value
  return _wrapper

@simple_memo
def name_uid(name):
  pw = PW.getpwnam(name)
  return pw.pw_uid

@simple_memo
def name_gid(name):
  gr = GR.getgrnam(name)
  return gr.gr_gid
###--------------------------------------------------------------------------
### Extended attributes.

def listxattr(f, follow_symlinks = True): return []

if hasattr(OS, "listxattr"):
  ## Python 3.3 and later have native support.
  getxattr, listxattr = OS.getxattr, OS.listxattr
else:
  try: import xattr as _XA
  except ImportError: pass
  else:
    if hasattr(_XA, "list"):
      def listxattr(f, follow_symlinks = True):
        return _XA.list(f, nofollow = not follow_symlinks)
      def getxattr(f, a, follow_symlinks = True):
        return _XA.get(f, a, nofollow = not follow_symlinks)
    else:
      def listxattr(f, follow_symlinks = True):
        return _XA.listxattr(f, nofollow = not follow_symlinks)
      def getxattr(f, a, follow_symlinks = True):
        return _XA.getxattr(f, a, nofollow = not follow_symlinks)
###--------------------------------------------------------------------------
### Access control lists.

ACL_ACC, ACL_DFLT = range(2)

## Stub, for systems without POSIX.1e ACL support.
HAVE_ACL_P = False
def getacl(f, which): return None

import posix1e as ACL
HAVE_ACL_P = True
## Match a line from the standard ACL text format.  This accepts blank
## lines and comments (which are ignored), and captures the tag type, the
## qualifier (which may be empty), and the permission string.
R_ACLENT = RX.compile(r"""^ \s*
  (?: (u | user | g | group | m | mask | o | other)
      \s* : \s*
      (| [^:\s] | [^:\s] [^:]* [^:\s])
      \s* : \s*
      ([-rwx]*)
      \s* )?
  (?: \# .* )? $""", RX.VERBOSE)
## Codes for the possible entry tag types.  These are ordered so that we
## can sort entries into a canonical order simply by comparing codes.
AT_OWNUID, AT_USER, AT_MASK, AT_OWNGID, AT_GROUP, AT_OTHER = range(1, 7)

## Output tags corresponding to the codes.
ACL_TAGMAP = [None, "u", "u", "m", "g", "g", "o"]
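
## For reference, the text form parsed here consists of lines such as
##
##      user::rw-
##      user:fred:rwx           #effective:r--
##      group::r-x
##      mask::r--
##      other::---
##
## where `fred' stands for any user name; blank lines, and trailing
## `#effective:' comments, are skipped.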
def getacl(f, which):

  ## Fetch the file ACL.
  if which == ACL_ACC: acl = ACL.ACL(file = f)
  elif which == ACL_DFLT: acl = ACL.ACL(filedef = f)
  else: raise ValueError("unexpected WHICH = %d" % which)

  ## For maximum portability, only use the text format, which is guaranteed
  ## to be supported if anything is.  We'll have to parse this ourselves.
  ## Honestly, an important part of what we're doing here is producing a
  ## /canonical/ presentation of the ACL, which doesn't seem to be
  ## something that even the less portable functions will do for us.
  s = str(acl)
  entries = []
  extp = False

  ## First pass: grind through the ACL entries and build a list of (TAG,
  ## QUAL, MODE) triples, where the TAG is an `AT_...' code, the QUAL is
  ## either `None' or a numeric ID, and the MODE is a bitmask of permission
  ## bits.
  for line in s.split("\n"):
    m = R_ACLENT.match(line)
    if m is None: raise ValueError("unexpected ACL line `%s'" % line)
    if not m.group(1): continue
    tag, qual, perm = m.group(1), m.group(2), m.group(3)
    if qual == "": qual = None

    ## Convert the tag and qualifier.
    if tag == "u" or tag == "user":
      if qual is None: pass
      elif qual.isdigit(): qual = int(qual, 10)
      else: qual = name_uid(qual)
      if qual is None: tag = AT_OWNUID
      else: tag = AT_USER; extp = True
    elif tag == "m" or tag == "mask":
      if qual is not None:
        raise ValueError("unexpected mask qualifier `%s'" % qual)
      tag = AT_MASK; extp = True
    elif tag == "g" or tag == "group":
      if qual is None: pass
      elif qual.isdigit(): qual = int(qual, 10)
      else: qual = name_gid(qual)
      if qual is None: tag = AT_OWNGID
      else: tag = AT_GROUP; extp = True
    elif tag == "o" or tag == "other":
      if qual is not None:
        raise ValueError("unexpected other qualifier `%s'" % qual)
      tag = AT_OTHER
    else:
      raise ValueError("unexpected tag type `%s'" % tag)

    ## Convert the permissions.
    mode = 0
    for ch in perm:
      if ch == "r": mode |= 4
      elif ch == "w": mode |= 2
      elif ch == "x": mode |= 1
      elif ch == "-": pass
      else: raise ValueError("unexpected permission character `%s'" % ch)

    ## Done: remember this entry.
    entries.append((tag, qual, mode))

  ## If the ACL is trivial then ignore it.  An access ACL is trivial if it
  ## contains only entries which are reflected in the traditional
  ## permission bits.  A default ACL is trivial if it's empty.
  if (which == ACL_ACC and not extp) or \
     (which == ACL_DFLT and not entries):
    return None

  ## Sort the entries.  The tag codes are arranged so that this is a useful
  ## ordering.
  entries.sort()

  ## Produce output.  This happens to be the standard short text format,
  ## with exclusively numeric IDs.
  out = StringIO()
  firstp = True
  for tag, qual, mode in entries:
    if firstp: firstp = False
    else: out.write(",")
    out.write(ACL_TAGMAP[tag])
    out.write(":")
    if qual is not None: out.write(str(qual))
    out.write(":")
    if mode&4: out.write("r")
    else: out.write("-")
    if mode&2: out.write("w")
    else: out.write("-")
    if mode&1: out.write("x")
    else: out.write("-")

  return out.getvalue()
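
## So, for instance, an access ACL granting the owner read/write, a named
## user (uid 1000) read, and nobody else anything comes out roughly as
##
##      u::rw-,u:1000:r--,m::r--,g::---,o::---
##
## and, crucially, identical ACLs always produce identical strings.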
###--------------------------------------------------------------------------
### File system enumeration.

class FileAttr (object):
  def __init__(me, file, attr):
    try: value = getxattr(file, attr, follow_symlinks = False)
    except (OSError, IOError): me.value, me.err = None, excval()
    else: me.value, me.err = value, None

class FileInfo (object):
  def __init__(me, file, st = None):
    me.name = file
    me.err = None
    if st is not None:
      me.st = st
    else:
      try: me.st = OS.lstat(file)
      except OSError: me.st, me.err = None, excval()

    ## Collect ACLs and extended attributes if we can.
    me.xa, me.xa_err = dict(), None
    me.acl_acc = me.aclerr_acc = None
    me.acl_dflt = me.aclerr_dflt = None

    if me.st is not None:

      def collect_acl(which):
        try:
          return getacl(file, which), None
        except (OSError, IOError):
          err = excval()
          if err.errno == E.ENOTSUP: return None, None
          else: return None, excval()

      if not ST.S_ISLNK(me.st.st_mode):
        me.acl_acc, me.aclerr_acc = collect_acl(ACL_ACC)
        if ST.S_ISDIR(me.st.st_mode):
          me.acl_dflt, me.aclerr_dflt = collect_acl(ACL_DFLT)

      try: names = listxattr(file, follow_symlinks = False)
      except (OSError, IOError): me.xa_err = excval()
      else:
        for name in names:
          ## ACLs also show up as system xattrs; skip them here, since
          ## they're reported separately.
          if HAVE_ACL_P and (name == "system.posix_acl_access" or
                             name == "system.posix_acl_default"):
            continue
          me.xa[name] = FileAttr(file, name)
def enum_walk(file, func):

  def dirents(name):
    try:
      return OS.listdir(name)
    except OSError:
      syserr("failed to read directory `%s': %s" % (name, excval().strerror))
      return []

  def dir(names, dev):
    ff, dd = [], []
    for name in names:
      fi = FileInfo(name)
      if fi.st and fi.st.st_dev != dev: pass
      if fi.st and ST.S_ISDIR(fi.st.st_mode): dd.append(fi)
      else: ff.append(fi)
    ff.sort(key = lambda fi: fi.name)
    dd.sort(key = lambda fi: fi.name + '/')
    for f in ff: func(f)
    for d in dd:
      func(d)
      if d.st.st_dev == dev:
        dir([OS.path.join(d.name, e) for e in dirents(d.name)], dev)

  if file.endswith('/'):
    cwd = OS.open('.', OS.O_RDONLY)
    try:
      OS.chdir(file)
      fi = FileInfo('.')
      func(fi)
      dir(dirents('.'), fi.st.st_dev)
    finally:
      OS.fchdir(cwd)
      OS.close(cwd)
  else:
    fi = FileInfo(file)
    func(fi)
    if fi.st and ST.S_ISDIR(fi.st.st_mode):
      dir([OS.path.join(fi.name, e) for e in dirents(fi.name)],
          fi.st.st_dev)
def enum_find0(f, func):
  tail = ""
  while True:
    buf = f.read(8192)
    names = (tail + buf).split('\0')
    tail = names.pop()
    for name in names: func(FileInfo(name))
    if not buf: break
  if len(tail):
    moan("ignored trailing junk after last filename")
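
## Input in this format is easily produced with find(1), e.g.
##
##      find /home -xdev -print0 | fshash -c cache.db -f find0
##
## each name being terminated by a NUL byte.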
R_RSYNCESC = RX.compile(r'\\ \# ([0-7]{3})', RX.VERBOSE)
def enum_rsync(f, func):

  ## The format is a little fiddly.  Each line consists of PERMS SIZE DATE
  ## TIME NAME, separated by runs of whitespace, but the NAME starts exactly
  ## one space character after the TIME and may begin with a space.
  ## Sequences of the form `\#OOO', where OOO are three octal digits, stand
  ## for a byte with that value.  Newlines, and backslashes which would be
  ## ambiguous, are converted into this form; all other characters are
  ## passed through unchanged.
  ##
  ## We ignore the stat information and retrieve it ourselves, because it's
  ## incomplete.  Hopefully the dcache is still warm.
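  ##
  ## So a typical input line looks something like
  ##
  ##      -rw-r--r--          1,234 2012/03/04 05:06:07 some file name
  ##
  ## where everything after the single space following the time is the name.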
  for line in f:
    if line.endswith('\n'): line = line[:-1]

    ## Extract the escaped name.
    ff = line.split(None, 3)
    if len(ff) != 4:
      syserr("ignoring invalid line from rsync: `%s'" % line)
      continue
    tail = ff[3]
    try:
      spc = tail.index(' ')
    except ValueError:
      syserr("ignoring invalid line from rsync: `%s'" % line)
      continue
    name = tail[spc + 1:]

    ## Now translate escape sequences.
    name = R_RSYNCESC.sub(lambda m: chr(int(m.group(1), 8)), name)

    ## Stat the file, and report it.
    try:
      st = OS.lstat(name)
    except OSError:
      syserr("failed to stat `%s': %s" % (name, excval().strerror))
      continue
    func(FileInfo(name, st))
###--------------------------------------------------------------------------

class HashCache (object):

  INIT = [
    """CREATE TABLE meta (
           version INTEGER NOT NULL,
           hash TEXT NOT NULL
       );""",
    """CREATE TABLE hash (
           ino INTEGER PRIMARY KEY,
           mtime INTEGER NOT NULL,
           ctime INTEGER NOT NULL,
           size INTEGER NOT NULL,
           hash TEXT NOT NULL,
           seen BOOLEAN NOT NULL DEFAULT TRUE
       );""",
    """PRAGMA journal_mode = WAL;""",
  ]
  def __init__(me, file, hash = None):

    if file is None:

      ## We're going it alone, with no cache.
      if hash is None:
        die("no hash specified and no database cache to read from")
      me.db = None
      me.hash = hash
      return

    ## Connect to the database.
    db = DB.connect(file)
    db.text_factory = str

    ## See whether we can understand the cache database.
    c = db.cursor()
    v = h = None
    try:
      c.execute('SELECT version, hash FROM meta')
      v, h = c.fetchone()
      if c.fetchone() is not None:
        die("cache database corrupt: meta table has multiple rows")
    except (DB.Error, TypeError):
      pass

    ## If that didn't work, we'd better clear the thing and start again.
    ## But only if we know how to initialize it.
    if v != me.VERSION:

      ## Explain the situation.
      moan("cache version %s not understood" % v)
      if hash is None:
        if h is None:
          die("can't initialize cache: no hash function set")
        hash = h
      try: H.new(hash)
      except ValueError:
        die("unknown hash function `%s'" % hash)

      ## Throw away the old contents and start from scratch.
      c.execute('SELECT type, name FROM sqlite_master')
      for type, name in c.fetchall():
        c.execute('DROP %s IF EXISTS %s' % (type, name))

      ## Now we're ready to go.
      for stmt in me.INIT: c.execute(stmt)
      c.execute('INSERT INTO meta VALUES (?, ?)', [me.VERSION, hash])
      db.commit()

    ## Check the hash function if necessary.
    elif hash is None:
      hash = h
    elif h is not None and h != hash:
      die("hash mismatch: cache uses %s but %s requested" % (h, hash))

    me.db = db
    me.hash = hash
  def hashblob(me, blob):
    h = H.new(me.hash)
    h.update(blob)
    return text(B.hexlify(h.digest()))
  def hashfile(me, fi):

    ## If this isn't a proper file then don't try to hash it.
    if fi.err or not ST.S_ISREG(fi.st.st_mode):
      return None

    ## See whether there's a valid entry in the cache.
    if me.db is not None:
      c = me.db.cursor()
      c.execute(
        'SELECT mtime, size, hash, seen FROM hash WHERE ino = ?;',
        [fi.st.st_ino])
      r = c.fetchone()
      if r is not None:
        mt, sz, h, seen = r
        if mt == fi.st.st_mtime and \
           sz == fi.st.st_size:
          if not seen:
            c.execute('UPDATE hash SET seen = 1 WHERE ino = ?',
                      [fi.st.st_ino])
            me.db.commit()
          return h

    ## Hash the file.  Beware raciness: update the file information from the
    ## open descriptor, but set the size from what we actually read.
    try:
      with open(fi.name, 'rb') as f:
        h = H.new(me.hash)
        sz = 0
        buf = f.read(me.BUFSZ)
        while len(buf):
          h.update(buf); sz += len(buf)
          buf = f.read(me.BUFSZ)
        fi.st = OS.fstat(f.fileno())
        hash = h.digest()
    except (OSError, IOError):
      fi.st, fi.err = None, excval()
      return None
    hash = text(B.hexlify(hash))

    ## Insert a record into the database.
    if me.db is not None:
      c.execute("""
        INSERT OR REPLACE INTO hash
                (ino, mtime, ctime, size, hash, seen)
        VALUES (?, ?, ?, ?, ?, 1);
      """, [fi.st.st_ino, fi.st.st_mtime, fi.st.st_ctime, sz, hash])
      me.db.commit()
    return hash
  ## Forget the cache entry for a single inode.
  def forget(me, ino):
    if me.db is None:
      die("no cache database")
    c = me.db.cursor()
    c.execute('DELETE FROM hash WHERE ino = ?', [ino])
    me.db.commit()

  ## Mark every entry as not seen.
  def reset(me):
    c = me.db.cursor()
    c.execute('UPDATE hash SET seen = 0 WHERE seen')
    me.db.commit()

  ## Drop the entries which weren't seen since the last reset.
  def prune(me):
    c = me.db.cursor()
    c.execute('DELETE FROM hash WHERE NOT seen')
    me.db.commit()
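
## The `seen' flag gives the cache a simple lifecycle (using the methods
## above): clear the flags, hash a whole tree -- hashfile() marks each entry
## it finds or creates -- and then prune whatever wasn't revisited, e.g.
##
##      db = HashCache('cache.db', 'sha256')
##      db.reset()
##      ## ... hash files via db.hashfile(...) ...
##      db.prune()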
###--------------------------------------------------------------------------

class GenericFormatter (object):
  def __init__(me, fi):
    me.fi = fi
  def _fmt_time(me, t):
    tm = T.gmtime(t)
    return T.strftime('%Y-%m-%dT%H:%M:%SZ', tm)
  def _enc_name(me, n):
    return ' \\-> '.join(escapify(n).split(' -> '))
  def name(me):
    return me._enc_name(me.fi.name)
  def info(me):
    return me.TYPE
  def mode(me):
    return '%06o' % me.fi.st.st_mode
  def size(me):
    return me.fi.st.st_size
  def mtime(me):
    return me._fmt_time(me.fi.st.st_mtime)
  def owner(me):
    return '%5d:%d' % (me.fi.st.st_uid, me.fi.st.st_gid)

class ErrorFormatter (GenericFormatter):
  def info(me):
    return 'E%d %s' % (me.fi.err.errno, me.fi.err.strerror)
  def error(me): return 'error'
  mode = size = mtime = owner = error

class SocketFormatter (GenericFormatter):
  TYPE = 'socket'
class PipeFormatter (GenericFormatter):
  TYPE = 'named-pipe'

class LinkFormatter (GenericFormatter):
  TYPE = 'symbolic-link'
  def name(me):
    n = GenericFormatter.name(me)
    try:
      d = OS.readlink(me.fi.name)
      return '%s -> %s' % (n, me._enc_name(d))
    except OSError:
      err = excval()
      return '%s -> <E%d %s>' % (n, err.errno, err.strerror)

class DirectoryFormatter (GenericFormatter):
  TYPE = 'directory'
  def name(me): return GenericFormatter.name(me) + '/'
  def size(me): return 'dir'

class DeviceFormatter (GenericFormatter):
  def info(me):
    return '%s %d:%d' % (me.TYPE,
                         OS.major(me.fi.st.st_rdev),
                         OS.minor(me.fi.st.st_rdev))
class BlockDeviceFormatter (DeviceFormatter):
  TYPE = 'block-device'
class CharDeviceFormatter (DeviceFormatter):
  TYPE = 'character-device'

class FileFormatter (GenericFormatter):
  TYPE = 'regular-file'
class Reporter (object):

  TYMAP = {
    ST.S_IFSOCK: SocketFormatter,
    ST.S_IFDIR: DirectoryFormatter,
    ST.S_IFLNK: LinkFormatter,
    ST.S_IFREG: FileFormatter,
    ST.S_IFBLK: BlockDeviceFormatter,
    ST.S_IFCHR: CharDeviceFormatter,
    ST.S_IFIFO: PipeFormatter,
  }

  def __init__(me, db):
    me._db = db
    me._inomap = {}
    me._vinomap = {}
    me._hsz = int(H.new(db.hash).digest_size)

  def file(me, fi):
    h = me._db.hashfile(fi)
    if fi.err:
      fmt = ErrorFormatter(fi)
      vino = 'error'
    else:
      fmt = me.TYMAP[ST.S_IFMT(fi.st.st_mode)](fi)
      inoidx = fi.st.st_dev, fi.st.st_ino
      try:
        vino = me._inomap[inoidx]
      except KeyError:
        suffix = ''
        seq = 0
        while True:
          vino = '%08x' % (Z.crc32(bin(fi.name + suffix)) & 0xffffffff)
          if vino not in me._vinomap: break
          suffix = '\0%d' % seq
          seq += 1
        me._inomap[inoidx] = vino
        if OPTS.compat >= 2: me._vinomap[vino] = inoidx
    if h: info = h
    else: info = '[%-*s]' % (2*me._hsz - 2, fmt.info())
    print('%s %8s %6s %-12s %-20s %20s %s' %
          (info, vino, fmt.mode(), fmt.owner(),
           fmt.mtime(), fmt.size(), fmt.name()))
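
    ## A report line therefore looks roughly like
    ##
    ##      <hash-or-[type]> <vino> <mode> <uid:gid> <mtime> <size> <name>
    ##
    ## i.e. the file hash (or, for things which can't be hashed, a bracketed
    ## tag such as `[symbolic-link]'), an eight-hex-digit virtual inode
    ## number, the mode in octal, the owner, the mtime in UTC, the size, and
    ## the escapified name.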
    ## Report the ACLs.
    for which, acl, err in \
        [("posix-access", fi.acl_acc, fi.aclerr_acc),
         ("posix-default", fi.acl_dflt, fi.aclerr_dflt)]:
      if acl is not None:
        print("\tacl %s %s" % (which, acl))
      elif err is not None:
        print("\tacl %s <E%d %s>" % (which, err.errno, err.strerror))

    ## Report the extended attributes.
    if fi.xa_err is not None:
      print("\txattr <E%d %s>" % (fi.xa_err.errno, fi.xa_err.strerror))
    else:
      for name in sorted(iterkeys(fi.xa)):
        attr = fi.xa[name]
        if attr.err is None:
          print("\txattr %s %s" %
                (escapify(name), me._db.hashblob(attr.value)))
        else:
          print("\txattr %s <E%d %s>" %
                (escapify(name), attr.err.errno, attr.err.strerror))
###--------------------------------------------------------------------------
### Database clearing from diff files.

R_HUNK = RX.compile(r'^@@ -\d+,(\d+) \+\d+,(\d+) @@$')
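
## For example, the hunk header `@@ -105,7 +105,9 @@' carries the old and
## new line counts 7 and 9, which is all we need in order to tell when a
## hunk ends.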
def clear_entry(db, lno, line):

  good = True

  ## Parse the type field, if there is one.
  if line.startswith('['):
    pos = line.find(']')
    if pos < 0:
      moan("failed to parse file entry (type field; line %d)" % lno)
      return False
    ty = line[1:pos].strip()
    rest = line[pos + 1:]
  else:
    ff = line.split(None, 1)
    if len(ff) != 2:
      moan("failed to parse file entry (field split; line %d)" % lno)
      return False
    ty = 'regular-file'
    rest = ff[1]

  ## Split out the remaining fields.
  ff = rest.split(None, 5)
  if len(ff) != 6:
    moan("failed to parse file entry (field split; line %d)" % lno)
    return False
  ino, mode, uidgid, mtime, sz, name = ff

  ## Symbolic links have the target appended to the name.
  if ty != 'symbolic-link':
    target = None
  else:
    nn = name.split(' -> ', 1)
    if len(nn) != 2:
      moan("failed to parse file entry (name split; line %d)" % lno)
      return False
    name, target = nn
    target = unescapify(target)
  name = unescapify(name)

  ## Drop the cache entry for this file, if there is one.
  try:
    st = OS.lstat(name)
  except OSError:
    e = excval()
    moan("failed to stat `%s': %s" % (name, e.strerror))
    if e.errno != E.ENOENT: good = False
  else:
    print("Clear cache entry for `%s'" % name)
    db.forget(st.st_ino)

  return good
def clear_cache(db):

  ## Work through the input diff file one line at a time.
  diffstate = 'gap'
  lno = 0
  good = True
  for line in stdin:
    lno += 1
    if line.endswith('\n'): line = line[:-1]

    ## We're in a gap between hunks.  Find a hunk header and extract the line
    ## counts.
    if diffstate == 'gap':
      m = R_HUNK.match(line)
      if m is not None:
        oldlines = int(m.group(1))
        newlines = int(m.group(2))
        diffstate = 'hunk'
        hdrlno = lno

    ## We're in a hunk.  Keep track of whether we've reached the end, and
    ## discard entries from the cache for mismatching lines.
    elif diffstate == 'hunk':
      if len(line) == 0:
        moan("empty line in diff hunk (line %d)" % lno)
        good = False
      else:
        ty = line[0]
        if ty == ' ':
          oldlines -= 1; newlines -= 1
        elif ty == '-':
          oldlines -= 1
          if not clear_entry(db, lno, line[1:]): good = False
        elif ty == '+':
          newlines -= 1
          if not clear_entry(db, lno, line[1:]): good = False
        else:
          moan("incomprehensible line in diff hunk (line %d)" % lno)
          good = False
        if oldlines < 0 or newlines < 0:
          moan("inconsistent lengths in diff hunk header (line %d)" % hdrlno)
          good = False
        if oldlines == newlines == 0:
          diffstate = 'gap'

  ## Check that we finished cleanly.
  if diffstate == 'hunk':
    moan("truncated diff hunk (started at line %d)" % hdrlno)
    good = False
  return good
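
## An illustrative way to drive this: take a unified diff between the report
## you expected and the one you observed, and feed it back in so that the
## entries for any mismatched files are dropped from the cache and rehashed
## on the next run, e.g.
##
##      diff -u expected.fshash observed.fshash | fshash -u -c cache.db /backup/mirror
##
## (the optional directory argument is chdir'd into first, so relative names
## in the report resolve correctly).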
###--------------------------------------------------------------------------

FMTMAP = {
  'rsync': lambda f: enum_rsync(stdin, f),
  'find0': lambda f: enum_find0(stdin, f)
}

op = OP.OptionParser(
  usage = '%prog [-au] [-c CACHE] [-f FORMAT] [-H HASH] [FILE ...]',
  version = '%%prog, version %s' % VERSION,
  description = '''\
Print a digest of a filesystem (or a collection of specified files) to
standard output.  The idea is that the digest should be mostly /complete/
(i.e., any `interesting\' change to the filesystem results in a different
digest) and /canonical/ (i.e., identical filesystem contents result in
the same digest).''')

for short, long, props in [
  ('-a', '--all', { 'action': 'store_true', 'dest': 'all',
                    'help': 'clear cache of all files not seen' }),
  ('-c', '--cache', { 'dest': 'cache', 'metavar': 'FILE',
                      'help': 'use FILE as a cache for file hashes' }),
  ('-f', '--files', { 'dest': 'files', 'metavar': 'FORMAT',
                      'type': 'choice', 'choices': list(FMTMAP.keys()),
                      'help': 'read files to report in the given FORMAT' }),
  ('-u', '--udiff', { 'action': 'store_true', 'dest': 'udiff',
                      'help': 'read diff from stdin, clear cache entries' }),
  ('-C', '--compat', { 'dest': 'compat', 'metavar': 'VERSION',
                       'type': 'int', 'default': 3,
                       'help': 'produce output with given compatibility VERSION' }),
  ('-H', '--hash', { 'dest': 'hash', 'metavar': 'HASH',
                     ##'type': 'choice', 'choices': H.algorithms,
                     'help': 'use HASH as the hash function' })]:
  op.add_option(short, long, **props)
OPTS, args = op.parse_args(argv)

if not 1 <= OPTS.compat <= 3:
  die("unknown compatibility version %d" % OPTS.compat)
if OPTS.udiff:

  ## Clear cache entries named in a diff read from stdin.
  if OPTS.cache is None or OPTS.all or OPTS.files or len(args) > 2:
    die("incompatible options: `-u' requires `-c CACHE', forbids others")
  db = HashCache(OPTS.cache, OPTS.hash)
  if len(args) == 2: OS.chdir(args[1])
  good = True
  if not clear_cache(db): good = False
  if not good: exit(1)

else:

  ## Produce a report of the filesystem.
  if not OPTS.files and len(args) <= 1:
    die("no filename sources: nothing to do")
  db = HashCache(OPTS.cache, OPTS.hash)
  if OPTS.all: db.reset()
  print("## fshash report format version %d" % OPTS.compat)
  rep = Reporter(db)
  if OPTS.files:
    FMTMAP[OPTS.files](rep.file)
  for dir in args[1:]:
    enum_walk(dir, rep.file)
  if OPTS.all: db.prune()
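
## An illustrative invocation: walk a directory tree, hashing into a cache,
## e.g.
##
##      fshash -c cache.db -H sha256 /backup/mirror/ >mirror.fshash
##
## (`-H' matters when the cache is first created; later runs can omit it,
## since the hash function's name is recorded in the cache's meta table.
## Adding `-a' additionally drops cache entries for files not seen during
## the run.)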
###----- That's all, folks --------------------------------------------------