chiark - git - mdw - rsync-backup/blame_incremental

... / ...

Commit	Line	Data
	1	#! @PYTHON@
	2	###
	3	### Efficiently construct canonical digests of filesystems
	4	###
	5	### (c) 2012 Mark Wooding
	6	###
	7
	8	###----- Licensing notice ---------------------------------------------------
	9	###
	10	### This file is part of the `rsync-backup' program.
	11	###
	12	### rsync-backup is free software; you can redistribute it and/or modify
	13	### it under the terms of the GNU General Public License as published by
	14	### the Free Software Foundation; either version 2 of the License, or
	15	### (at your option) any later version.
	16	###
	17	### rsync-backup is distributed in the hope that it will be useful,
	18	### but WITHOUT ANY WARRANTY; without even the implied warranty of
	19	### MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	20	### GNU General Public License for more details.
	21	###
	22	### You should have received a copy of the GNU General Public License
	23	### along with rsync-backup; if not, write to the Free Software Foundation,
	24	### Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
	25
	26	from sys import argv, exit, stdin, stdout, stderr
	27	import os as OS
	28	import re as RX
	29	import time as T
	30	import stat as ST
	31	import optparse as OP
	32	import hashlib as H
	33	import sqlite3 as DB
	34	import zlib as Z
	35
	36	PACKAGE = '@PACKAGE@'
	37	VERSION = '@VERSION@'
	38
	39	###--------------------------------------------------------------------------
	40	### Utilities.
	41
	42	QUIS = OS.path.basename(argv[0])
	43
	44	def moan(msg):
	45	stderr.write('%s: %s\n' % (QUIS, msg))
	46
	47	def die(msg, rc = 1):
	48	moan(msg)
	49	exit(rc)
	50
	51	SYSERR = 0
	52	def syserr(msg):
	53	global SYSERR
	54	moan(msg)
	55	SYSERR += 1
	56
	57	###--------------------------------------------------------------------------
	58	### File system enumeration.
	59
	60	class FileInfo (object):
	61	def __init__(me, file, st = None):
	62	me.name = file
	63	if st:
	64	me.st = st
	65	me.err = None
	66	else:
	67	try:
	68	me.st = OS.lstat(file)
	69	me.err = None
	70	except OSError, err:
	71	me.st = None
	72	me.err = err
	73
	74	def enum_walk(file, func):
	75
	76	def dirents(name):
	77	try:
	78	return OS.listdir(name)
	79	except OSError, err:
	80	syserr("failed to read directory `%s': %s" % (name, err.strerror))
	81	return []
	82
	83	def dir(ee, dev):
	84	ff = []
	85	dd = []
	86	for e in ee:
	87	fi = FileInfo(e)
	88	if fi.st and fi.st.st_dev != dev: pass
	89	if fi.st and ST.S_ISDIR(fi.st.st_mode): dd.append(fi)
	90	else: ff.append(fi)
	91	ff.sort(key = lambda fi: fi.name)
	92	dd.sort(key = lambda fi: fi.name + '/')
	93	for f in ff:
	94	func(f)
	95	for d in dd:
	96	if d.st.st_dev == dev:
	97	func(d)
	98	dir([OS.path.join(d.name, e) for e in dirents(d.name)], dev)
	99
	100	if file.endswith('/'):
	101	cwd = OS.open('.', OS.O_RDONLY)
	102	try:
	103	OS.chdir(file)
	104	fi = FileInfo('.')
	105	func(fi)
	106	dir(dirents('.'), fi.st.st_dev)
	107	finally:
	108	OS.fchdir(cwd)
	109	OS.close(cwd)
	110	else:
	111	fi = FileInfo(file)
	112	func(fi)
	113	if fi.st and ST.S_ISDIR(fi.st.st_mode):
	114	dir([OS.path.join(fi.name, e) for e in dirents(fi.name)],
	115	fi.st.st_dev)
	116
	117	def enum_find0(f, func):
	118	tail = ""
	119	while True:
	120	buf = f.read(8192)
	121	last = len(buf) == 0
	122	names = (tail + buf).split('\0')
	123	tail = names.pop()
	124	for n in names:
	125	func(FileInfo(n))
	126	if last:
	127	break
	128	if len(tail):
	129	moan("ignored trailing junk after last filename")
	130
	131	RX_RSYNCESC = RX.compile(r'\\ \# ([0-7]{3})', RX.VERBOSE)
	132	def enum_rsync(f, func):
	133
	134	## The format is a little fiddly. Each line consists of PERMS SIZE DATE
	135	## TIME NAME, separated by runs of whitespace, but the NAME starts exactly
	136	## one space character after the TIME and may begin with a space.
	137	## Sequences of the form `\#OOO' where OOO are three octal digits, stand
	138	## for a byte with that value. Newlines and backslashes which would be
	139	## ambiguous are converted into this form; all other characters are
	140	## literal.
	141	##
	142	## We ignore the stat information and retrieve it ourselves, because it's
	143	## incomplete. Hopefully the dcache is still warm.
	144
	145	for line in f:
	146	if line.endswith('\n'): line = line[:-1]
	147
	148	## Extract the escaped name.
	149	ff = line.split(None, 3)
	150	if len(ff) != 4:
	151	syserr("ignoring invalid line from rsync: `%s'" % line)
	152	continue
	153	tail = ff[3]
	154	try:
	155	spc = tail.index(' ')
	156	except ValueError:
	157	syserr("ignoring invalid line from rsync: `%s'" % line)
	158	continue
	159	name = tail[spc + 1:]
	160
	161	## Now translate escape sequences.
	162	name = RX_RSYNCESC.sub(lambda m: chr(int(m.group(1), 8)), name)
	163
	164	## Call the client.
	165	try:
	166	fi = FileInfo(name)
	167	except OSError, err:
	168	syserr("failed to stat `%s': %s" % (name, err.strerror))
	169	continue
	170	func(fi)
	171
	172	###--------------------------------------------------------------------------
	173	### The hash cache.
	174
	175	class HashCache (object):
	176
	177	VERSION = 0
	178	BUFSZ = 128*1024
	179
	180	INIT = [
	181	"""CREATE TABLE meta (
	182	version INTEGER NOT NULL,
	183	hash TEXT NOT NULL
	184	);""",
	185	"""CREATE TABLE hash (
	186	ino INTEGER PRIMARY KEY,
	187	mtime INTEGER NOT NULL,
	188	ctime INTEGER NOT NULL,
	189	size INTEGER NOT NULL,
	190	hash TEXT NOT NULL,
	191	seen BOOLEAN NOT NULL DEFAULT TRUE
	192	);""",
	193	"""PRAGMA journal_mode = WAL;"""
	194	]
	195
	196	def __init__(me, file, hash = None):
	197
	198	if file is None:
	199
	200	## We're going this alone, with no cache.
	201	db = None
	202	if hash is None:
	203	die("no hash specified and no database cache to read from")
	204	else:
	205
	206	## Connect to the database.
	207	db = DB.connect(file)
	208	db.text_factory = str
	209
	210	## See whether we can understand the cache database.
	211	c = db.cursor()
	212	v = h = None
	213	try:
	214	c.execute('SELECT version, hash FROM meta')
	215	v, h = c.fetchone()
	216	if c.fetchone() is not None:
	217	die("cache database corrupt: meta table has mutliple rows")
	218	except (DB.Error, TypeError):
	219	pass
	220
	221	## If that didn't work, we'd better clear the thing and start again.
	222	## But only if we know how to initialize it.
	223	if v != me.VERSION:
	224
	225	## Explain the situation.
	226	moan("cache version %s not understood" % v)
	227	if hash is None:
	228	if h is None:
	229	die("can't initialize cache: no hash function set")
	230	else:
	231	hash = h
	232	try:
	233	H.new(hash)
	234	except Exception:
	235	die("unknown hash function `%s'" % hash)
	236
	237	## Drop old things.
	238	c.execute('SELECT type, name FROM sqlite_master')
	239	for type, name in c.fetchall():
	240	c.execute('DROP %s IF EXISTS %s' % (type, name))
	241
	242	## Now we're ready to go.
	243	for stmt in me.INIT:
	244	c.execute(stmt)
	245	c.execute('INSERT INTO meta VALUES (?, ?)', [me.VERSION, hash])
	246	db.commit()
	247
	248	## Check the hash function if necessary.
	249	if hash is None:
	250	hash = h
	251	elif h is not None and h != hash:
	252	die("hash mismatch: cache uses %s but %s requested" % (h, hash))
	253
	254	## All done.
	255	me.hash = hash
	256	me._db = db
	257	me._pend = 0
	258
	259	def hashfile(me, fi):
	260
	261	## If this isn't a proper file then don't try to hash it.
	262	if fi.err or not ST.S_ISREG(fi.st.st_mode):
	263	return None
	264
	265	## See whether there's a valid entry in the cache.
	266	if me._db:
	267	c = me._db.cursor()
	268	c.execute(
	269	'SELECT mtime, size, hash, seen FROM hash WHERE ino = ?;',
	270	[fi.st.st_ino])
	271	r = c.fetchone()
	272	if r is not None:
	273	mt, sz, h, s = r
	274	if mt == fi.st.st_mtime and \
	275	sz == fi.st.st_size:
	276	if not s:
	277	c.execute('UPDATE hash SET seen = 1 WHERE ino = ?',
	278	[fi.st.st_ino])
	279	me._update()
	280	return h
	281
	282	## Hash the file. Beware raciness: update the file information from the
	283	## open descriptor, but set the size from what we actually read.
	284	h = H.new(me.hash)
	285	try:
	286	with open(fi.name, 'rb') as f:
	287	sz = 0
	288	while True:
	289	buf = f.read(me.BUFSZ)
	290	if len(buf) == 0:
	291	break
	292	sz += len(buf)
	293	h.update(buf)
	294	fi.st = OS.fstat(f.fileno())
	295	##fi.st.st_size = sz
	296	hash = h.digest()
	297	except (OSError, IOError), err:
	298	fi.st = None
	299	fi.err = err
	300	return None
	301	hash = hash.encode('hex')
	302
	303	## Insert a record into the database.
	304	if me._db:
	305	c.execute("""
	306	INSERT OR REPLACE INTO hash
	307	(ino, mtime, ctime, size, hash, seen)
	308	VALUES
	309	(?, ?, ?, ?, ?, 1);
	310	""", [fi.st.st_ino,
	311	fi.st.st_mtime,
	312	fi.st.st_ctime,
	313	fi.st.st_size,
	314	hash])
	315	me._update()
	316
	317	## Done.
	318	return hash
	319
	320	def _update(me):
	321	me._pend += 1
	322	if me._pend >= 1024:
	323	me.flush()
	324
	325	def flush(me):
	326	if me._db:
	327	me._db.commit()
	328	me._pend = 0
	329
	330	def need_db(me):
	331	if not me._db:
	332	die("no cache database")
	333
	334	def reset(me):
	335	me.need_db()
	336	c = me._db.cursor()
	337	c.execute('UPDATE hash SET seen = 0 WHERE seen')
	338	me.flush()
	339
	340	def prune(me):
	341	me.need_db()
	342	c = me._db.cursor()
	343	c.execute('DELETE FROM hash WHERE NOT seen')
	344	me.flush()
	345
	346	###--------------------------------------------------------------------------
	347	### Printing output.
	348
	349	class GenericFormatter (object):
	350	def __init__(me, fi):
	351	me.fi = fi
	352	def _fmt_time(me, t):
	353	tm = T.gmtime(t)
	354	return T.strftime('%Y-%m-%dT%H:%M:%SZ', tm)
	355	def _enc_name(me, n):
	356	return ' \\-> '.join(n.encode('string_escape').split(' -> '))
	357	def name(me):
	358	return me._enc_name(me.fi.name)
	359	def info(me):
	360	return me.TYPE
	361	def mode(me):
	362	return '%06o' % me.fi.st.st_mode
	363	def size(me):
	364	return me.fi.st.st_size
	365	def mtime(me):
	366	return me._fmt_time(me.fi.st.st_mtime)
	367	def owner(me):
	368	return '%5d:%d' % (me.fi.st.st_uid, me.fi.st.st_gid)
	369
	370	class ErrorFormatter (GenericFormatter):
	371	def info(me):
	372	return 'E%d %s' % (me.fi.err.errno, me.fi.err.strerror)
	373	def error(me): return 'error'
	374	mode = size = mtime = owner = error
	375
	376	class SocketFormatter (GenericFormatter):
	377	TYPE = 'socket'
	378	class PipeFormatter (GenericFormatter):
	379	TYPE = 'fifo'
	380
	381	class LinkFormatter (GenericFormatter):
	382	TYPE = 'symbolic-link'
	383	def name(me):
	384	n = GenericFormatter.name(me)
	385	try:
	386	d = OS.readlink(me.fi.name)
	387	return '%s -> %s' % (n, me._enc_name(d))
	388	except OSError, err:
	389	return '%s -> <E%d %s>' % (n, err.errno, err.strerror)
	390
	391	class DirectoryFormatter (GenericFormatter):
	392	TYPE = 'directory'
	393	def name(me): return GenericFormatter.name(me) + '/'
	394	def size(me): return 'dir'
	395
	396	class DeviceFormatter (GenericFormatter):
	397	def info(me):
	398	return '%s %d:%d' % (me.TYPE,
	399	OS.major(me.fi.st.st_rdev),
	400	OS.minor(me.fi.st.st_rdev))
	401	class BlockDeviceFormatter (DeviceFormatter):
	402	TYPE = 'block-device'
	403	class CharDeviceFormatter (DeviceFormatter):
	404	TYPE = 'character-device'
	405
	406	class FileFormatter (GenericFormatter):
	407	TYPE = 'regular-file'
	408
	409	class Reporter (object):
	410
	411	TYMAP = {
	412	ST.S_IFSOCK: SocketFormatter,
	413	ST.S_IFDIR: DirectoryFormatter,
	414	ST.S_IFLNK: LinkFormatter,
	415	ST.S_IFREG: FileFormatter,
	416	ST.S_IFBLK: BlockDeviceFormatter,
	417	ST.S_IFCHR: CharDeviceFormatter,
	418	ST.S_IFIFO: PipeFormatter,
	419	}
	420
	421	def __init__(me, db):
	422	me._inomap = {}
	423	me._vinomap = {}
	424	me._db = db
	425	me._hsz = int(H.new(db.hash).digest_size)
	426
	427	def file(me, fi):
	428	h = me._db.hashfile(fi)
	429	if fi.err:
	430	fmt = ErrorFormatter(fi)
	431	vino = 'error'
	432	else:
	433	fmt = me.TYMAP[ST.S_IFMT(fi.st.st_mode)](fi)
	434	inoidx = fi.st.st_dev, fi.st.st_ino
	435	try:
	436	vino = me._inomap[inoidx]
	437	except KeyError:
	438	suffix = ''
	439	seq = 0
	440	while True:
	441	vino = '%08x' % (Z.crc32(fi.name + suffix) & 0xffffffff)
	442	if vino not in me._vinomap: break
	443	suffix = '\0%d' % seq
	444	seq += 1
	445	me._inomap[inoidx] = vino
	446	if h: info = h
	447	else: info = '[%-s]' % (2me._hsz - 2, fmt.info())
	448	print '%s %8s %6s %-12s %-20s %20s %s' % (
	449	info, vino, fmt.mode(), fmt.owner(),
	450	fmt.mtime(), fmt.size(), fmt.name())
	451
	452	###--------------------------------------------------------------------------
	453	### Main program.
	454
	455	FMTMAP = {
	456	'rsync': lambda f: enum_rsync(stdin, f),
	457	'find0': lambda f: enum_find0(stdin, f)
	458	}
	459	op = OP.OptionParser(
	460	usage = '%prog [-a] [-c CACHE] [-f FORMAT] [-H HASH] [FILE ...]',
	461	version = '%%prog, version %s' % VERSION,
	462	description = '''\
	463	Print a digest of a filesystem (or a collection of specified files) to
	464	standard output. The idea is that the digest should be mostly /complete/
	465	(i.e., any `interesting\' change to the filesystem results in a different
	466	digest) and /canonical/ (i.e., identical filesystem contents result in
	467	identical output).
	468	''')
	469
	470	for short, long, props in [
	471	('-a', '--all', { 'action': 'store_true', 'dest': 'all',
	472	'help': 'clear cache of all files not seen' }),
	473	('-c', '--cache', { 'dest': 'cache', 'metavar': 'FILE',
	474	'help': 'use FILE as a cache for file hashes' }),
	475	('-f', '--files', { 'dest': 'files', 'metavar': 'FORMAT',
	476	'type': 'choice', 'choices': FMTMAP.keys(),
	477	'help': 'read files to report in the given FORMAT' }),
	478	('-H', '--hash', { 'dest': 'hash', 'metavar': 'HASH',
	479	##'type': 'choice', 'choices': H.algorithms,
	480	'help': 'use HASH as the hash function' })]:
	481	op.add_option(short, long, **props)
	482	opts, args = op.parse_args(argv)
	483
	484	if not opts.files and len(args) <= 1:
	485	die("no filename sources: nothing to do")
	486	db = HashCache(opts.cache, opts.hash)
	487	if opts.all:
	488	db.reset()
	489	rep = Reporter(db)
	490	if opts.files:
	491	FMTMAP[opts.files](rep.file)
	492	for dir in args[1:]:
	493	enum_walk(dir, rep.file)
	494	if opts.all:
	495	db.prune()
	496	db.flush()
	497
	498	###----- That's all, folks --------------------------------------------------