chiark - git - mdw - rsync-backup/blame_incremental

... / ...

Commit	Line	Data
	1	#! @PYTHON@
	2	###
	3	### Efficiently construct canonical digests of filesystems
	4	###
	5	### (c) 2012 Mark Wooding
	6	###
	7
	8	###----- Licensing notice ---------------------------------------------------
	9	###
	10	### This file is part of the `rsync-backup' program.
	11	###
	12	### rsync-backup is free software; you can redistribute it and/or modify
	13	### it under the terms of the GNU General Public License as published by
	14	### the Free Software Foundation; either version 2 of the License, or
	15	### (at your option) any later version.
	16	###
	17	### rsync-backup is distributed in the hope that it will be useful,
	18	### but WITHOUT ANY WARRANTY; without even the implied warranty of
	19	### MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	20	### GNU General Public License for more details.
	21	###
	22	### You should have received a copy of the GNU General Public License
	23	### along with rsync-backup; if not, write to the Free Software Foundation,
	24	### Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
	25
	26	from sys import argv, exit, stdin, stdout, stderr
	27	import os as OS
	28	import re as RX
	29	import time as T
	30	import errno as E
	31	import stat as ST
	32	import optparse as OP
	33	import hashlib as H
	34	import sqlite3 as DB
	35	import zlib as Z
	36
	37	PACKAGE = '@PACKAGE@'
	38	VERSION = '@VERSION@'
	39
	40	###--------------------------------------------------------------------------
	41	### Utilities.
	42
	43	QUIS = OS.path.basename(argv[0])
	44
	45	def moan(msg):
	46	stderr.write('%s: %s\n' % (QUIS, msg))
	47
	48	def die(msg, rc = 1):
	49	moan(msg)
	50	exit(rc)
	51
	52	SYSERR = 0
	53	def syserr(msg):
	54	global SYSERR
	55	moan(msg)
	56	SYSERR += 1
	57
	58	###--------------------------------------------------------------------------
	59	### File system enumeration.
	60
	61	class FileInfo (object):
	62	def __init__(me, file, st = None):
	63	me.name = file
	64	if st:
	65	me.st = st
	66	me.err = None
	67	else:
	68	try:
	69	me.st = OS.lstat(file)
	70	me.err = None
	71	except OSError, err:
	72	me.st = None
	73	me.err = err
	74
	75	def enum_walk(file, func):
	76
	77	def dirents(name):
	78	try:
	79	return OS.listdir(name)
	80	except OSError, err:
	81	syserr("failed to read directory `%s': %s" % (name, err.strerror))
	82	return []
	83
	84	def dir(ee, dev):
	85	ff = []
	86	dd = []
	87	for e in ee:
	88	fi = FileInfo(e)
	89	if fi.st and fi.st.st_dev != dev: pass
	90	if fi.st and ST.S_ISDIR(fi.st.st_mode): dd.append(fi)
	91	else: ff.append(fi)
	92	ff.sort(key = lambda fi: fi.name)
	93	dd.sort(key = lambda fi: fi.name + '/')
	94	for f in ff:
	95	func(f)
	96	for d in dd:
	97	if d.st.st_dev == dev:
	98	func(d)
	99	dir([OS.path.join(d.name, e) for e in dirents(d.name)], dev)
	100
	101	if file.endswith('/'):
	102	cwd = OS.open('.', OS.O_RDONLY)
	103	try:
	104	OS.chdir(file)
	105	fi = FileInfo('.')
	106	func(fi)
	107	dir(dirents('.'), fi.st.st_dev)
	108	finally:
	109	OS.fchdir(cwd)
	110	OS.close(cwd)
	111	else:
	112	fi = FileInfo(file)
	113	func(fi)
	114	if fi.st and ST.S_ISDIR(fi.st.st_mode):
	115	dir([OS.path.join(fi.name, e) for e in dirents(fi.name)],
	116	fi.st.st_dev)
	117
	118	def enum_find0(f, func):
	119	tail = ""
	120	while True:
	121	buf = f.read(8192)
	122	last = len(buf) == 0
	123	names = (tail + buf).split('\0')
	124	tail = names.pop()
	125	for n in names:
	126	func(FileInfo(n))
	127	if last:
	128	break
	129	if len(tail):
	130	moan("ignored trailing junk after last filename")
	131
	132	RX_RSYNCESC = RX.compile(r'\\ \# ([0-7]{3})', RX.VERBOSE)
	133	def enum_rsync(f, func):
	134
	135	## The format is a little fiddly. Each line consists of PERMS SIZE DATE
	136	## TIME NAME, separated by runs of whitespace, but the NAME starts exactly
	137	## one space character after the TIME and may begin with a space.
	138	## Sequences of the form `\#OOO' where OOO are three octal digits, stand
	139	## for a byte with that value. Newlines and backslashes which would be
	140	## ambiguous are converted into this form; all other characters are
	141	## literal.
	142	##
	143	## We ignore the stat information and retrieve it ourselves, because it's
	144	## incomplete. Hopefully the dcache is still warm.
	145
	146	for line in f:
	147	if line.endswith('\n'): line = line[:-1]
	148
	149	## Extract the escaped name.
	150	ff = line.split(None, 3)
	151	if len(ff) != 4:
	152	syserr("ignoring invalid line from rsync: `%s'" % line)
	153	continue
	154	tail = ff[3]
	155	try:
	156	spc = tail.index(' ')
	157	except ValueError:
	158	syserr("ignoring invalid line from rsync: `%s'" % line)
	159	continue
	160	name = tail[spc + 1:]
	161
	162	## Now translate escape sequences.
	163	name = RX_RSYNCESC.sub(lambda m: chr(int(m.group(1), 8)), name)
	164
	165	## Call the client.
	166	try:
	167	fi = FileInfo(name)
	168	except OSError, err:
	169	syserr("failed to stat `%s': %s" % (name, err.strerror))
	170	continue
	171	func(fi)
	172
	173	###--------------------------------------------------------------------------
	174	### The hash cache.
	175
	176	class HashCache (object):
	177
	178	VERSION = 0
	179	BUFSZ = 128*1024
	180
	181	INIT = [
	182	"""CREATE TABLE meta (
	183	version INTEGER NOT NULL,
	184	hash TEXT NOT NULL
	185	);""",
	186	"""CREATE TABLE hash (
	187	ino INTEGER PRIMARY KEY,
	188	mtime INTEGER NOT NULL,
	189	ctime INTEGER NOT NULL,
	190	size INTEGER NOT NULL,
	191	hash TEXT NOT NULL,
	192	seen BOOLEAN NOT NULL DEFAULT TRUE
	193	);""",
	194	"""PRAGMA journal_mode = WAL;"""
	195	]
	196
	197	def __init__(me, file, hash = None):
	198
	199	if file is None:
	200
	201	## We're going this alone, with no cache.
	202	db = None
	203	if hash is None:
	204	die("no hash specified and no database cache to read from")
	205	else:
	206
	207	## Connect to the database.
	208	db = DB.connect(file)
	209	db.text_factory = str
	210
	211	## See whether we can understand the cache database.
	212	c = db.cursor()
	213	v = h = None
	214	try:
	215	c.execute('SELECT version, hash FROM meta')
	216	v, h = c.fetchone()
	217	if c.fetchone() is not None:
	218	die("cache database corrupt: meta table has mutliple rows")
	219	except (DB.Error, TypeError):
	220	pass
	221
	222	## If that didn't work, we'd better clear the thing and start again.
	223	## But only if we know how to initialize it.
	224	if v != me.VERSION:
	225
	226	## Explain the situation.
	227	moan("cache version %s not understood" % v)
	228	if hash is None:
	229	if h is None:
	230	die("can't initialize cache: no hash function set")
	231	else:
	232	hash = h
	233	try:
	234	H.new(hash)
	235	except Exception:
	236	die("unknown hash function `%s'" % hash)
	237
	238	## Drop old things.
	239	c.execute('SELECT type, name FROM sqlite_master')
	240	for type, name in c.fetchall():
	241	c.execute('DROP %s IF EXISTS %s' % (type, name))
	242
	243	## Now we're ready to go.
	244	for stmt in me.INIT:
	245	c.execute(stmt)
	246	c.execute('INSERT INTO meta VALUES (?, ?)', [me.VERSION, hash])
	247	db.commit()
	248
	249	## Check the hash function if necessary.
	250	if hash is None:
	251	hash = h
	252	elif h is not None and h != hash:
	253	die("hash mismatch: cache uses %s but %s requested" % (h, hash))
	254
	255	## All done.
	256	me.hash = hash
	257	me._db = db
	258	me._pend = 0
	259
	260	def hashfile(me, fi):
	261
	262	## If this isn't a proper file then don't try to hash it.
	263	if fi.err or not ST.S_ISREG(fi.st.st_mode):
	264	return None
	265
	266	## See whether there's a valid entry in the cache.
	267	if me._db:
	268	c = me._db.cursor()
	269	c.execute(
	270	'SELECT mtime, size, hash, seen FROM hash WHERE ino = ?;',
	271	[fi.st.st_ino])
	272	r = c.fetchone()
	273	if r is not None:
	274	mt, sz, h, s = r
	275	if mt == fi.st.st_mtime and \
	276	sz == fi.st.st_size:
	277	if not s:
	278	c.execute('UPDATE hash SET seen = 1 WHERE ino = ?',
	279	[fi.st.st_ino])
	280	me._update()
	281	return h
	282
	283	## Hash the file. Beware raciness: update the file information from the
	284	## open descriptor, but set the size from what we actually read.
	285	h = H.new(me.hash)
	286	try:
	287	with open(fi.name, 'rb') as f:
	288	sz = 0
	289	while True:
	290	buf = f.read(me.BUFSZ)
	291	if len(buf) == 0:
	292	break
	293	sz += len(buf)
	294	h.update(buf)
	295	fi.st = OS.fstat(f.fileno())
	296	##fi.st.st_size = sz
	297	hash = h.digest()
	298	except (OSError, IOError), err:
	299	fi.st = None
	300	fi.err = err
	301	return None
	302	hash = hash.encode('hex')
	303
	304	## Insert a record into the database.
	305	if me._db:
	306	c.execute("""
	307	INSERT OR REPLACE INTO hash
	308	(ino, mtime, ctime, size, hash, seen)
	309	VALUES
	310	(?, ?, ?, ?, ?, 1);
	311	""", [fi.st.st_ino,
	312	fi.st.st_mtime,
	313	fi.st.st_ctime,
	314	fi.st.st_size,
	315	hash])
	316	me._update()
	317
	318	## Done.
	319	return hash
	320
	321	def _update(me):
	322	me._pend += 1
	323	if me._pend >= 1024:
	324	me.flush()
	325
	326	def flush(me):
	327	if me._db:
	328	me._db.commit()
	329	me._pend = 0
	330
	331	def need_db(me):
	332	if not me._db:
	333	die("no cache database")
	334
	335	def forget(me, ino):
	336	me.need_db()
	337	c = me._db.cursor()
	338	c.execute('DELETE FROM hash WHERE ino = ?', [ino])
	339
	340	def reset(me):
	341	me.need_db()
	342	c = me._db.cursor()
	343	c.execute('UPDATE hash SET seen = 0 WHERE seen')
	344	me.flush()
	345
	346	def prune(me):
	347	me.need_db()
	348	c = me._db.cursor()
	349	c.execute('DELETE FROM hash WHERE NOT seen')
	350	me.flush()
	351
	352	###--------------------------------------------------------------------------
	353	### Printing output.
	354
	355	class GenericFormatter (object):
	356	def __init__(me, fi):
	357	me.fi = fi
	358	def _fmt_time(me, t):
	359	tm = T.gmtime(t)
	360	return T.strftime('%Y-%m-%dT%H:%M:%SZ', tm)
	361	def _enc_name(me, n):
	362	return ' \\-> '.join(n.encode('string_escape').split(' -> '))
	363	def name(me):
	364	return me._enc_name(me.fi.name)
	365	def info(me):
	366	return me.TYPE
	367	def mode(me):
	368	return '%06o' % me.fi.st.st_mode
	369	def size(me):
	370	return me.fi.st.st_size
	371	def mtime(me):
	372	return me._fmt_time(me.fi.st.st_mtime)
	373	def owner(me):
	374	return '%5d:%d' % (me.fi.st.st_uid, me.fi.st.st_gid)
	375
	376	class ErrorFormatter (GenericFormatter):
	377	def info(me):
	378	return 'E%d %s' % (me.fi.err.errno, me.fi.err.strerror)
	379	def error(me): return 'error'
	380	mode = size = mtime = owner = error
	381
	382	class SocketFormatter (GenericFormatter):
	383	TYPE = 'socket'
	384	class PipeFormatter (GenericFormatter):
	385	TYPE = 'fifo'
	386
	387	class LinkFormatter (GenericFormatter):
	388	TYPE = 'symbolic-link'
	389	def name(me):
	390	n = GenericFormatter.name(me)
	391	try:
	392	d = OS.readlink(me.fi.name)
	393	return '%s -> %s' % (n, me._enc_name(d))
	394	except OSError, err:
	395	return '%s -> <E%d %s>' % (n, err.errno, err.strerror)
	396
	397	class DirectoryFormatter (GenericFormatter):
	398	TYPE = 'directory'
	399	def name(me): return GenericFormatter.name(me) + '/'
	400	def size(me): return 'dir'
	401
	402	class DeviceFormatter (GenericFormatter):
	403	def info(me):
	404	return '%s %d:%d' % (me.TYPE,
	405	OS.major(me.fi.st.st_rdev),
	406	OS.minor(me.fi.st.st_rdev))
	407	class BlockDeviceFormatter (DeviceFormatter):
	408	TYPE = 'block-device'
	409	class CharDeviceFormatter (DeviceFormatter):
	410	TYPE = 'character-device'
	411
	412	class FileFormatter (GenericFormatter):
	413	TYPE = 'regular-file'
	414
	415	class Reporter (object):
	416
	417	TYMAP = {
	418	ST.S_IFSOCK: SocketFormatter,
	419	ST.S_IFDIR: DirectoryFormatter,
	420	ST.S_IFLNK: LinkFormatter,
	421	ST.S_IFREG: FileFormatter,
	422	ST.S_IFBLK: BlockDeviceFormatter,
	423	ST.S_IFCHR: CharDeviceFormatter,
	424	ST.S_IFIFO: PipeFormatter,
	425	}
	426
	427	def __init__(me, db):
	428	me._inomap = {}
	429	me._vinomap = {}
	430	me._db = db
	431	me._hsz = int(H.new(db.hash).digest_size)
	432
	433	def file(me, fi):
	434	h = me._db.hashfile(fi)
	435	if fi.err:
	436	fmt = ErrorFormatter(fi)
	437	vino = 'error'
	438	else:
	439	fmt = me.TYMAP[ST.S_IFMT(fi.st.st_mode)](fi)
	440	inoidx = fi.st.st_dev, fi.st.st_ino
	441	try:
	442	vino = me._inomap[inoidx]
	443	except KeyError:
	444	suffix = ''
	445	seq = 0
	446	while True:
	447	vino = '%08x' % (Z.crc32(fi.name + suffix) & 0xffffffff)
	448	if vino not in me._vinomap: break
	449	suffix = '\0%d' % seq
	450	seq += 1
	451	me._inomap[inoidx] = vino
	452	if h: info = h
	453	else: info = '[%-s]' % (2me._hsz - 2, fmt.info())
	454	print '%s %8s %6s %-12s %-20s %20s %s' % (
	455	info, vino, fmt.mode(), fmt.owner(),
	456	fmt.mtime(), fmt.size(), fmt.name())
	457
	458	###--------------------------------------------------------------------------
	459	### Database clearing from diff files.
	460
	461	R_HUNK = RX.compile(r'^@@ -\d+,(\d+) \+\d+,(\d+) @@$')
	462
	463	def clear_entry(db, lno, line):
	464
	465	good = True
	466
	467	if line.startswith('['):
	468	pos = line.find(']')
	469	if pos < 0:
	470	moan("failed to parse file entry (type field; line %d)" % lno)
	471	return False
	472	ty = line[1:pos].strip()
	473	rest = line[pos + 1:]
	474	hash = None
	475	else:
	476	ff = line.split(None, 1)
	477	if len(ff) != 2:
	478	moan("failed to parse file entry (field split; line %d)" % lno)
	479	return False
	480	ty = 'regular-file'
	481	hash, rest = ff
	482
	483	ff = rest.split(None, 5)
	484	if len(ff) != 6:
	485	moan("failed to parse file entry (field split; line %d)" % lno)
	486	return False
	487	ino, mode, uidgid, mtime, sz, name = ff
	488
	489	if ty != 'symbolic-link':
	490	target = None
	491	else:
	492	nn = name.split(' -> ', 1)
	493	if len(nn) != 2:
	494	moan("failed to parse file entry (name split; line %d)" % lno)
	495	return False
	496	name, target = nn
	497	target = target.decode('string_escape')
	498	name = name.decode('string_escape')
	499
	500	try:
	501	st = OS.lstat(name)
	502	except OSError, e:
	503	moan("failed to stat `%s': %s" % (name, e.strerror))
	504	if e.errno != E.ENOENT: good = False
	505	else:
	506	print "Clear cache entry for `%s'" % name
	507	db.forget(st.st_ino)
	508
	509	return good
	510
	511	def clear_cache(db):
	512
	513	## Work through the input diff file one line at a time.
	514	diffstate = 'gap'
	515	lno = 0
	516	good = True
	517	for line in stdin:
	518	if line.endswith('\n'): line = line[:-1]
	519	lno += 1
	520
	521	## We're in a gap between hunks. Find a hunk header and extract the line
	522	## counts.
	523	if diffstate == 'gap':
	524	m = R_HUNK.match(line)
	525	if m:
	526	oldlines = int(m.group(1))
	527	newlines = int(m.group(2))
	528	diffstate = 'hunk'
	529	hdrlno = lno
	530
	531	## We're in a hunk. Keep track of whether we've reached the end, and
	532	## discard entries from the cache for mismatching lines.
	533	elif diffstate == 'hunk':
	534	if len(line) == 0:
	535	moan("empty line in diff hunk (line %d)" % lno)
	536	good = False
	537	ty = line[0]
	538	if ty == ' ':
	539	oldlines -= 1; newlines -= 1
	540	elif ty == '+':
	541	newlines -= 1
	542	if not clear_entry(db, lno, line[1:]): good = False
	543	elif ty == '-':
	544	oldlines -= 1
	545	if not clear_entry(db, lno, line[1:]): good = False
	546	else:
	547	moan("incomprehensible line in diff hunk (line %d)" % lno)
	548	good = false
	549	if oldlines < 0 or newlines < 0:
	550	moan("inconsistent lengths in diff hunk header (line %d)" % hdrlno)
	551	good = False
	552	if oldlines == newlines == 0:
	553	diffstate = 'gap'
	554
	555	if diffstate == 'hunk':
	556	moan("truncated diff hunk (started at line %d)" % hdrlno)
	557	good = False
	558
	559	return good
	560
	561	###--------------------------------------------------------------------------
	562	### Main program.
	563
	564	FMTMAP = {
	565	'rsync': lambda f: enum_rsync(stdin, f),
	566	'find0': lambda f: enum_find0(stdin, f)
	567	}
	568	op = OP.OptionParser(
	569	usage = '%prog [-au] [-c CACHE] [-f FORMAT] [-H HASH] [FILE ...]',
	570	version = '%%prog, version %s' % VERSION,
	571	description = '''\
	572	Print a digest of a filesystem (or a collection of specified files) to
	573	standard output. The idea is that the digest should be mostly /complete/
	574	(i.e., any `interesting\' change to the filesystem results in a different
	575	digest) and /canonical/ (i.e., identical filesystem contents result in
	576	identical output).
	577	''')
	578
	579	for short, long, props in [
	580	('-a', '--all', { 'action': 'store_true', 'dest': 'all',
	581	'help': 'clear cache of all files not seen' }),
	582	('-c', '--cache', { 'dest': 'cache', 'metavar': 'FILE',
	583	'help': 'use FILE as a cache for file hashes' }),
	584	('-f', '--files', { 'dest': 'files', 'metavar': 'FORMAT',
	585	'type': 'choice', 'choices': FMTMAP.keys(),
	586	'help': 'read files to report in the given FORMAT' }),
	587	('-u', '--udiff', { 'action': 'store_true', 'dest': 'udiff',
	588	'help': 'read diff from stdin, clear cache entries' }),
	589	('-H', '--hash', { 'dest': 'hash', 'metavar': 'HASH',
	590	##'type': 'choice', 'choices': H.algorithms,
	591	'help': 'use HASH as the hash function' })]:
	592	op.add_option(short, long, **props)
	593	opts, args = op.parse_args(argv)
	594
	595	if opts.udiff:
	596	if opts.cache is None or opts.all or opts.files or len(args) > 2:
	597	die("incompatible options: `-u' requires `-c CACHE', forbids others")
	598	db = HashCache(opts.cache, opts.hash)
	599	if len(args) == 2: OS.chdir(args[1])
	600	good = True
	601	if not clear_cache(db): good = False
	602	if good: db.flush()
	603	else: exit(2)
	604	else:
	605	if not opts.files and len(args) <= 1:
	606	die("no filename sources: nothing to do")
	607	db = HashCache(opts.cache, opts.hash)
	608	if opts.all:
	609	db.reset()
	610	rep = Reporter(db)
	611	if opts.files:
	612	FMTMAP[opts.files](rep.file)
	613	for dir in args[1:]:
	614	enum_walk(dir, rep.file)
	615	if opts.all:
	616	db.prune()
	617	db.flush()
	618
	619	###----- That's all, folks --------------------------------------------------