From: Ian Jackson
Date: Fri, 15 May 2009 18:24:37 +0000 (+0100)
Subject: WIP can find crew and flag
X-Git-Tag: 1.0~75
X-Git-Url: http://www.chiark.greenend.org.uk/ucgi/~yarrgweb/git?p=ypp-sc-tools.main.git;a=commitdiff_plain;h=c1ddacde91a4fbc9dc3b6b9fc3741d3f6564ba5f

WIP can find crew and flag
---

diff --git a/yoweb-scrape b/yoweb-scrape
index 844cd46..0866667 100755
--- a/yoweb-scrape
+++ b/yoweb-scrape
@@ -1,5 +1,8 @@
 #!/usr/bin/python
+import signal
+signal.signal(signal.SIGINT, signal.SIG_DFL)
+
 import os
 import time
 import urllib
@@ -7,155 +10,248 @@ import urllib2
 import errno
 import sys
 import re as regexp
-import optparse
+from optparse import OptionParser
 
 from BeautifulSoup import BeautifulSoup
 
-max_age = 120
-ocean = 'ice'
-
-now = time.time()
+opts = None
 
 duties = ('Swordfighting/Bilging/Sailing/Rigging/Navigating'+
         '/Battle Navigation/Gunning/Carpentry/Rumble/Treasure Haul'+
         '/Drinking/Spades/Hearts/Treasure Drop/Poker/Distilling'+
         '/Alchemistry/Shipwrightery/Blacksmithing/Foraging').split('/')
 
-standings = ('Able/Distinguished/Respected/Master/Renowned'+
+standingvals = ('Able/Distinguished/Respected/Master/Renowned'+
         '/Grand-Master/Legendary/Ultimate').split('/')
 
-def fetch(url):
-       cache_corename = urllib.quote_plus(url)
-       cache_basename = "#%s#" % cache_corename
-       try: f = file(cache_basename, 'r')
-       except (OSError,IOError), oe:
-               if oe.errno != errno.ENOENT: raise
-               f = None
-       if f is not None:
-               s = os.fstat(f.fileno())
-               if now > s.st_mtime + max_age:
+def debug(m):
+       if opts.debug:
+               print >>sys.stderr, m
+
+class Fetcher:
+       def __init__(self, ocean, cachedir):
+               debug('Fetcher init %s' % cachedir)
+               self.ocean = ocean
+               self.cachedir = cachedir
+               try: os.mkdir(cachedir)
+               except (OSError,IOError), oe:
+                       if oe.errno != errno.EEXIST: raise
+
+       def _rate_limit_cache_clean(self, now):
+               ages = []
+               for path in os.listdir(self.cachedir):
+                       if not path.startswith('#'): continue
+                       path = self.cachedir + '/' + path
+                       try: s = os.stat(path)
+                       except (OSError,IOError), oe:
+                               if oe.errno != errno.ENOENT: raise
+                               continue
+                       age = now - s.st_mtime
+                       if age > opts.max_age:
+                               debug('Fetcher expire %d %s' % (age, path))
+                               try: os.remove(path)
+                               except (OSError,IOError), oe:
+                                       if oe.errno != errno.ENOENT: raise
+                               continue
+                       ages.append(age)
+               ages.sort()
+               debug('Fetcher ages ' + `ages`)
+               min_age = 1
+               need_wait = 0
+               for age in ages:
+                       if age < min_age:
+                               debug('Fetcher morewait min=%d age=%d' %
+                                       (min_age, age))
+                               need_wait = max(need_wait, min_age - age)
+                       min_age *= 2
+                       min_age += 1
+               if need_wait:
+                       debug('Fetcher wait %d' % need_wait)
+                       time.sleep(need_wait)
+
+       def fetch(self, url):
+               debug('Fetcher fetch %s' % url)
+               cache_corename = urllib.quote_plus(url)
+               cache_item = "%s/#%s#" % (self.cachedir, cache_corename)
+               try: f = file(cache_item, 'r')
+               except (OSError,IOError), oe:
+                       if oe.errno != errno.ENOENT: raise
                        f = None
-       if f is not None:
-               data = f.read()
-               f.close()
-       else:
-               os.sleep(1)
+               now = time.time()
+               if f is not None:
+                       s = os.fstat(f.fileno())
+                       if now > s.st_mtime + opts.max_age:
+                               debug('Fetcher stale')
+                               f = None
+               if f is not None:
+                       data = f.read()
+                       f.close()
+                       debug('Fetcher cached')
+                       return data
+
+               debug('Fetcher fetch')
+               self._rate_limit_cache_clean(now)
+
                stream = urllib2.urlopen(url)
                data = stream.read()
-               cache_ourname = "#%s~%d#" % (cache_corename, os.getpid())
-               f = file(cache_ourname, 'w')
+               cache_tmp = "%s/#%s~%d#" % (
+                       self.cachedir, cache_corename, os.getpid())
+               f = file(cache_tmp, 'w')
                f.write(data)
                f.close()
-               os.rename(cache_ourname, cache_basename)
-       return data
+               os.rename(cache_tmp, cache_item)
+               debug('Fetcher stored')
+               return data
 
-def yoweb_fetch(kind, tail):
-       url = 'http://%s.puzzlepirates.com/yoweb/%s%s' % (ocean, kind, tail)
-       return fetch(url)
+       def yoweb(self, kind, tail):
+               url = 'http://%s.puzzlepirates.com/yoweb/%s%s' % (
+                       self.ocean, kind, tail)
+               return self.fetch(url)
 
-class PirateInfo:
-       # Public data members:
-       # pi.skills = { 'Treasure Haul': 'Able' ... }
-       # pi.msgs = [ 'message describing problem with scrape' ]
-       def _log(self, m):
+class SoupLog:
+       def __init__(self):
+               self.msgs = [ ]
+       def msg(self, m):
                self.msgs.append(m)
+       def soupm(self, obj, m):
+               self.msg(m + '; in ' + `obj`)
+       def needs_msgs(self, child_souplog):
+               self.msgs += child_souplog.msgs
+               child_souplog.msgs = [ ]
 
-       def _logsoup(self, soup, m):
-               self._log(m + '; in ' + `soup`)
+class PirateInfo(SoupLog):
+       # Public data members:
+       # pi.standings = { 'Treasure Haul': 'Able' ... }
+       # pi.crew = (id, name)
+       # pi.flag = (id, name)
+       # pi.msgs = [ 'message describing problem with scrape' ]
 
-       def __init__(self, pirate):
-               html = yoweb_fetch('pirate.wm?target=', pirate)
-               soup = BeautifulSoup(html,
-#                      convertEntities=BeautifulSoup.HTML_ENTITIES
-                       )
-               imgs = soup.findAll('img',
+       def _find_standings(self):
+               imgs = self.soup.findAll('img',
                        src=regexp.compile('/yoweb/images/stat.*'))
                re = regexp.compile(
-u'\s*\S*/([-A-Za-z]+)\s*$|\s*\S*/\S*\s*\(ocean\-wide\ \;([-A-Za-z]+)\)\s*$'
+u'\\s*\\S*/([-A-Za-z]+)\\s*$|\\s*\\S*/\\S*\\s*\\(ocean\\-wide\\ \\;([-A-Za-z]+)\\)\\s*$'
                        )
-               skills = { }
-               self.msgs = [ ]
+               standings = { }
                for skill in duties:
-                       skills[skill] = [ ]
+                       standings[skill] = [ ]
+
+               skl = SoupLog()
 
                for img in imgs:
                        try: duty = img['alt']
                        except KeyError: continue
                        if not duty in duties:
-                               self._logsoup(img, 'unknown duty: "%s"' % duty)
+                               skl.soupm(img, 'unknown duty: "%s"' % duty)
                                continue
                        key = img.findParent('td')
                        if key is None:
-                               self._logsoup(img, 'duty at root! "%s"' % duty)
+                               skl.soupm(img, 'duty at root! "%s"' % duty)
"%s"' % duty) continue valelem = key.findNextSibling('td') if valelem is None: - self._logsoup(key, 'duty missing sibling "%s"' + skl.soupm(key, 'duty missing sibling "%s"' % duty) continue valstr = ''.join(valelem.findAll(text=True)) match = re.match(valstr) if match is None: - self._logsoup(key, 'duty "%s" unparseable'+ + skl.soupm(key, 'duty "%s" unparseable'+ ' standing "%s"' % (duty, valstr)) continue standing = match.group(match.lastindex) - skills[duty].append(standing) + standings[duty].append(standing) - self.skills = { } + self.standings = { } for duty in duties: - sl = skills[duty] + sl = standings[duty] if len(sl) > 1: - self.log('duty "%s" multiple standings %s' % + skl.msg('duty "%s" multiple standings %s' % (duty, `sl`)) continue if not len(sl): - self.log('duty "%s" no standing found' % duty) + skl.msg('duty "%s" no standing found' % duty) continue standing = sl[0] - for i in range(0, len(standings)): - if standing == standings[i]: - self.skills[duty] = i - if not duty in self.skills: - self.log('duty "%s" unknown standing "%s"' % + for i in range(0, len(standingvals)-1): + if standing == standingvals[i]: + self.standings[duty] = i + if not duty in self.standings: + skl.msg('duty "%s" unknown standing "%s"' % (duty, standing)) - all_skills_ok = True + + all_standings_ok = True for duty in duties: - if not duty in self.skills: - all_skills_ok = False - if all_skills_ok: - self.msgs = [ ] + if not duty in self.standings: + self.needs_msgs(skl) + + def _find_crewflag(self, cf, yoweb_re): + things = self.soup.findAll('a', href=regexp.compile(yoweb_re)) + if len(things) != 1: + self.msg('zero or several %s id references found' % cf) + return None + thing = things[0] + id_re = '\\b%sid\\=(\\w+)$' % cf + id_haystack = thing['href'] + match = regexp.compile(id_re).search(id_haystack) + if match is None: + self.soupm(thing, ('incomprehensible %s id ref'+ + ' (%s in %s)') % (cf, id_re, id_haystack)) + return None + name = ''.join(thing.findAll(text=True)) + return (match.group(1), name) + + def __init__(self, pirate): + SoupLog.__init__(self) + + html = fetcher.yoweb('pirate.wm?target=', pirate) + self.soup = BeautifulSoup(html, +# convertEntities=BeautifulSoup.HTML_ENTITIES + ) + + self._find_standings() + + self.crew = self._find_crewflag('crew', + '^/yoweb/crew/info\\.wm') + self.flag = self._find_crewflag('flag', + '^/yoweb/flag/info\\.wm') def __str__(self): - return `self.skills` + return `(self.crew, self.flag, self.standings, self.msgs)` def main(): - os.chdir(os.getenv('HOME')) - cache_dir = '.yoweb-scrape-cache' - try: - os.chdir(cache_dir) - except (OSError,IOError), oe: - if oe.errno != errno.ENOENT: raise - os.mkdir(cache_dir) - os.chdir(cache_dir) - - for path in os.listdir('.'): - if not path.startswith('#'): continue - max_time = max_age - if '~' in path: max_time = 10 - try: - s = os.stat(path) - if now > s.st_mtime + max_time: - os.remove(path) - except (OSError,IOError), oe: - if oe.errno != errno.ENOENT: raise + global opts, fetcher + + pa = OptionParser( +'''usage: .../yoweb-scrape [OPTION...] ACTION [ARGS...] +actions: + yoweb-scrape [--ocean OCEAN ...] pirate PIRATE + yoweb-scrape [--ocean OCEAN ...] crew-of PIRATE + yoweb-scrape [--ocean OCEAN ...] 
+''')
+       ao = pa.add_option
+       ao('-O','--ocean',dest='ocean', metavar='OCEAN',
+               default='ice',
+               help='select ocean OCEAN')
+       ao('--cache-dir', dest='cache_dir', metavar='DIR',
+               default='~/.yoweb-scrape-cache',
+               help='cache yoweb pages in DIR')
+       ao('-D','--debug', action='store_true', dest='debug', default=False,
+               help='enable debugging output')
+       ao('-q','--quiet', action='store_true', dest='quiet',
+               help='suppress warning output')
+       (opts,args) = pa.parse_args()
+
+       # fixed parameters
+       opts.max_age = 240
+
+       if opts.cache_dir.startswith('~/'):
+               opts.cache_dir = os.getenv('HOME') + opts.cache_dir[1:]
+
+       fetcher = Fetcher(opts.ocean, opts.cache_dir)
 
        # test program:
-       global ocean
-       ocean = 'midnight'
        test = PirateInfo('Anaplian')
        print test
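
At this stage the option parser is wired up, but none of the ACTIONs in the
usage string (pirate, crew-of, dutytab-crew-of) are dispatched yet: main()
still ends in the hard-wired test, which fetches the pirate page for Anaplian
on the selected ocean and prints the scraped result. So an invocation would
presumably look like

    $ ./yoweb-scrape --ocean midnight --debug pirate Anaplian

with the trailing arguments parsed but otherwise ignored; the output is the
repr of (crew, flag, standings, msgs) from PirateInfo.__str__, and the
'Fetcher ...' trace goes to stderr when --debug is given.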
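
Fetcher._rate_limit_cache_clean doubles as the rate limiter: the ages of the
cached pages, sorted youngest first, are checked against an exponential
schedule (the k-th youngest entry must be at least 1, 3, 7, 15, ... seconds
old, min_age stepping to 2*min_age+1 after each entry), and the fetcher
sleeps long enough to satisfy the schedule before fetching anything new. A
minimal standalone sketch of just that arithmetic (needed_wait is a
hypothetical helper; the directory scan and debug tracing are omitted):

    # Sketch of the rate-limit schedule used by _rate_limit_cache_clean.
    def needed_wait(ages):
        # ages of the cached fetches, in seconds
        ages = sorted(ages)
        min_age = 1
        need_wait = 0
        for age in ages:
            if age < min_age:
                # this entry is too young: wait out the shortfall
                need_wait = max(need_wait, min_age - age)
            min_age = min_age * 2 + 1
        return need_wait

    print needed_wait([0.5, 2.0, 10.0])  # 1.0: the 2.0s entry is 1s short of 3s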
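
The cache layout is worth spelling out: each page is stored in the cache
directory under its quote_plus-encoded URL wrapped in '#'s, and fetch()
writes to a '#...~pid#' temporary first and then os.rename()s it into place,
so a concurrent scraper never reads a half-written file; entries older than
opts.max_age (240s here) are expired by the cleaner. For example (Python 2):

    >>> import urllib
    >>> urllib.quote_plus('http://ice.puzzlepirates.com/yoweb/pirate.wm?target=Anaplian')
    'http%3A%2F%2Fice.puzzlepirates.com%2Fyoweb%2Fpirate.wm%3Ftarget%3DAnaplian'

so that page would be cached as
~/.yoweb-scrape-cache/#http%3A%2F%2Fice.puzzlepirates.com%2Fyoweb%2Fpirate.wm%3Ftarget%3DAnaplian#.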