X-Git-Url: http://www.chiark.greenend.org.uk/ucgi/~yarrgweb/git?p=ypp-sc-tools.db-live.git;a=blobdiff_plain;f=yoweb-scrape;h=90a5677c50e3ff1b1a01fb5427d98ff5ebf564ca;hp=68df2163812afe1b409937bd3fff9c24147073f7;hb=bb6f3057cff743f25c2ef95a734b9c0f05c97ff6;hpb=5ee6146a73510ae17e9dc8d78d46ec0098548c49

diff --git a/yoweb-scrape b/yoweb-scrape
index 68df216..90a5677 100755
--- a/yoweb-scrape
+++ b/yoweb-scrape
@@ -110,19 +110,14 @@ def yppsc_dir():
 #---------- caching and rate-limiting data fetcher ----------
 
 class Fetcher:
-	def __init__(self, ocean, cachedir):
+	def __init__(self, cachedir):
 		debug('Fetcher init %s' % cachedir)
-		self.ocean = ocean
 		self.cachedir = cachedir
 		try: os.mkdir(cachedir)
 		except (OSError,IOError), oe:
 			if oe.errno != errno.EEXIST: raise
 		self._cache_scan(time.time())
 
-	def default_ocean(self, ocean='ice'):
-		if self.ocean is None:
-			self.ocean = ocean
-
 	def _cache_scan(self, now):
 		# returns list of ages, unsorted
 		ages = []
@@ -204,12 +199,38 @@ class Fetcher:
 		debug('Fetcher  stored')
 		return data
 
+class Yoweb(Fetcher):
+	def __init__(self, ocean, cachedir):
+		debug('Yoweb init %s' % cachedir)
+		self.ocean = ocean
+		Fetcher.__init__(self, cachedir)
+
+	def default_ocean(self, ocean='ice'):
+		if self.ocean is None:
+			self.ocean = ocean
+
 	def yoweb(self, kind, tail, max_age):
 		self.default_ocean()
+		assert(self.ocean)
 		url = 'http://%s.puzzlepirates.com/yoweb/%s%s' % (
 			self.ocean, kind, tail)
 		return self.fetch(url, max_age)
 
+class Yppedia(Fetcher):  # caching fetcher for YPPedia wiki pages
+	def __init__(self, cachedir):
+		debug('Yppedia init %s' % cachedir)
+		self.base = 'http://yppedia.puzzlepirates.com/'
+		self.localhtml = opts.localhtml  # None => fetch over HTTP
+		Fetcher.__init__(self, cachedir)
+
+	def __call__(self, rhs):  # rhs: page path relative to self.base
+		if self.localhtml is None:
+			url = self.base + rhs
+			debug('Yppedia retrieving YPP '+url)
+			return self.fetch(url, 3000)
+		else:  # use the directory captured at init, not the global
+			return file(self.localhtml + '/' + rhs, 'r')
+
 #---------- logging assistance for troubled screenscrapers ----------
 
 class SoupLog:
@@ -429,18 +450,13 @@ class FlagInfo(SomethingSoupInfo):
 				(`head`, ``waritem``))
 
 		def wihelp_item(waritem, thing):
-			if waritem.name == 'a':
-				url = waritem.get('href', None)
-				if url is None:
-					return ('no url for '+thing,None,None)
-			else:
-				hr = waritem.find('a',{'href':True})
-				if not hr: return ('no a for '+thing,None,None)
-				url = hr['href']
+			url = waritem.get('href', None)
+			if url is None:
+				return ('no url for '+thing,None,None)
 			m = regexp.search('\?'+thing+'id=(\d+)$', url)
 			if not m: return ('no '+thing+'id',None,None)
 			tid = m.group(1)
-			tname = m.string
+			tname = waritem.string
 			if tname is None:
 				return (thing+' name not just string',None,None)
 			return (None,tid,tname)
@@ -452,7 +468,7 @@ class FlagInfo(SomethingSoupInfo):
 			if rel: return 'flag id twice!'
 			if flagname in self.relation_byname:
 				return 'flag name twice!'
-			rel = (flagname,flagid,[], thisdecl,othermin,othermax)
+			rel = (flagname,flagid,head, thisdecl,othermin,othermax)
 			self.relations.append(rel)
 			self.relation_byid[flagid] = rel
 			self.relation_byname[flagid] = rel
@@ -472,20 +488,24 @@ class FlagInfo(SomethingSoupInfo):
 
 		how = (wi_warn, None)
 
-		for waritem in warinfo.contents:
-			debug('WARITEM '+``waritem``)
-			if isinstance(waritem, unicode):
-				waritem = waritem.strip()
-				if waritem: warn('unknown waritem '+``waritem``)
-				continue
-			if waritem.name == 'br':
-				continue
-			if waritem.name == 'b':
-				head = ''.join(waritem.findAll(text=True))
+		for waritem in warinfo.findAll(['font','a']):
+			if waritem is None: break
+			if waritem.name == 'font':
+				colour = waritem.get('color',None)  # None if no color=
+				if colour is None or colour.lstrip('#') != '958A5F':
+					warn('strange colour %s in %s' %
+						(colour,``waritem``))
+					continue
+				head = waritem.string
+				if head is None:
+					warn('no head string in '+``waritem``)
+					continue
 				head = regexp.sub('\\s+', ' ', head).strip()
 				head = head.rstrip(':')
 				how = (head,) + warmap.get(head, (wi_warn,))
 				continue
+			assert(waritem.name == 'a')				
+
 			debug('WARHOW %s(%s, waritem, *%s)' %
 				(how[1], `how[0]`, `how[2:]`))
 			bad = how[1](how[0], waritem, *how[2:])
@@ -498,49 +518,139 @@ class FlagInfo(SomethingSoupInfo):
 
 #---------- scraper for ocean info incl. embargoes etc. ----------
 
-class IslandInfo():
+class IslandBasicInfo():
+	# Public data attributes:
+	#  ocean
+	#  name
+	# Public data attributes maybe set by caller:
+	#  arch
 	def __init__(self, ocean, islename):
 		self.ocean = ocean
 		self.name = islename
-	def collect(self):
-		pass
-	def yppedia_dataf(self):
+	def yppedia(self):
 		def q(x): return urllib.quote(x.replace(' ','_'))
 		url_rhs = q(self.name) + '_(' + q(self.ocean) + ')'
-		if opts.localhtml is None:
-			url = 'http://yppedia.puzzlepirates.com/' + url_rhs
-			debug('IslandInfo retrieving YPP '+url);
-			return urllib.urlopen(url)
-		else:
-			return file(opts.localhtml + '/' + url_rhs, 'r')
-	def yoweb_url(self):
-		soup = BeautifulSoup(self.yppedia_dataf())
+		return yppedia(url_rhs)
+	def __str__(self):
+		return `(self.ocean, self.name)`
+
+class IslandExtendedInfo(IslandBasicInfo):
+	# Public data attributes (inherited):
+	#  ocean
+	#  name
+	# Public data attributes (additional):
+	#  islandid
+	#  yoweb_url
+	#  flagid
+	def __init__(self, ocean, islename):
+		IslandBasicInfo.__init__(self, ocean, islename)
+		self.islandid = None
+		self.yoweb_url = None
+		self._collect_yoweb()
+		self._collect_flagid()
+
+	def _collect_yoweb(self):
+		debug('IEI COLLECT YOWEB '+`self.name`)
+		self.islandid = None
+		self.yoweb_url = None
+
+		soup = BeautifulSoup(self.yppedia())
 		content = soup.find('div', attrs = {'id': 'content'})
 		yoweb_re = regexp.compile('^http://\w+\.puzzlepirates\.com/'+
-			'yoweb/island/info\.wm\?islandid=\d+$')
+			'yoweb/island/info\.wm\?islandid=(\d+)$')
 		a = soup.find('a', attrs = { 'href': yoweb_re })
-		if a is None: return None
-		return a['href']
-	def ruling_flag_id(self):
-		yo = self.yoweb_url()
+		if a is None:
+			debug('IEI COLLECT YOWEB '+`self.name`+' NONE')
+			return
+
+		debug('IEI COLLECT YOWEB '+`self.name`+' GOT '+``a``)
+		self.yoweb_url = a['href']
+		m = yoweb_re.search(self.yoweb_url)
+		self.islandid = m.group(1)
+
+	def _collect_flagid(self):
+		self.flagid = None
+
+		yo = self.yoweb_url
+		debug('IEI COLLECT FLAGID '+`self.name`+' URL '+`yo`)
 		if yo is None: return None
-		dataf = fetcher.fetch(yo, 600)
+		dataf = fetcher.fetch(yo, 1800)
 		soup = BeautifulSoup(dataf)
-		ruler_re = regexp.compile('http://\w+\.puzzlepirates\.com/'+
-			'yoweb/flag/info\.wm\?flagid=(\d+)$')
+		ruler_re = regexp.compile(
+			'/yoweb/flag/info\.wm\?flagid=(\d+)$')
 		ruler = soup.find('a', attrs = { 'href': ruler_re })
-		if not ruler: return None
-		m = ruler_re.find(ruler['href'])
-		return m.group(1)
+		if not ruler: 
+			debug('IEI COLLECT FLAGID '+`self.name`+' NONE')
+			return
+		debug('IEI COLLECT FLAGID '+`self.name`+' GOT '+``ruler``)
+		m = ruler_re.search(ruler['href'])
+		self.flagid = m.group(1)
+
+	def __str__(self):
+		return `(self.ocean, self.islandid, self.name,
+			self.yoweb_url, self.flagid)`
+
+class IslandFlagInfo(IslandExtendedInfo):
+	# Public data attributes (inherited):
+	#  ocean
+	#  name
+	#  islandid
+	#  yoweb_url
+	#  flagid
+	# Public data attributes (additional):
+	#  flag
+	def __init__(self, ocean, islename):
+		IslandExtendedInfo.__init__(self, ocean, islename)
+		self.flag = None
+		self._collect_flag()
+
+	def _collect_flag(self):
+		if self.flagid is None: return
+		self.flag = FlagInfo(self.flagid, 1800)
+
+	def __str__(self):
+		return IslandExtendedInfo.__str__(self) + '; ' + str(self.flag)
+
+class NullProgressReporter():
+	def start(self): pass
+	def doing(self, msg): pass
+	def stop(self): pass
+
+class TypewriterProgressReporter():
+	def start(self):
+		self._l = 0  # length of the message currently on screen
+	def doing(self,m):
+		self._doing(m + '...')
+	def stop(self):
+		self._doing('')
+		self._l = 0
+	def _write(self,t):
+		sys.stdout.write(t)
+	def _doing(self,text):
+		surplus = self._l - len(text)
+		for piece in ('\r', text):
+			self._write(piece)
+		if surplus > 0:  # blank the tail of a longer old message
+			for piece in (' ' * surplus, '\b' * surplus):
+				self._write(piece)
+		self._l = len(text)
+		sys.stdout.flush()
 
 class OceanInfo():
-	# Public data attributes (valid after collect()):
+	# Public data attributes:
 	#   oi.islands[islename] = IslandInfo(...)
 	#   oi.arches[archname][islename] = IslandInfo(...)
-	def __init__(self):
-		self.isleclass = IslandInfo
+	def __init__(self, isleclass=IslandBasicInfo, progressreporter=None):
+		if progressreporter is None:
+			if opts.debug: progressreporter = NullProgressReporter()
+			else: progressreporter = TypewriterProgressReporter()
+
+		self.isleclass = isleclass
 		self.ocean = fetcher.ocean.lower().capitalize()
-	def collect(self):
+
+		progressreporter.start()
+		progressreporter.doing('fetching ocean info')
+
 		cmdl = ['./yppedia-ocean-scraper']
 		if opts.localhtml is not None:
 			cmdl += ['--local-html-dir',opts.localhtml]
@@ -558,10 +668,16 @@ class OceanInfo():
 		arch_re = regexp.compile('^ (\S.*)')
 		island_re = regexp.compile('^  (\S.*)')
 
+		oscraper.wait()  # NOTE(review): stdout is only read further down; wait() first may deadlock if the child fills the pipe -- confirm
+		assert(oscraper.returncode == 0)
+
 		self.islands = { }
 		self.arches = { }
 		archname = None
 
+		isles = [ ]
+		progressreporter.doing('parsing ocean info')
+
 		for l in oscraper.stdout:
 			debug('OceanInfo collect l '+`l`)
 			l = l.rstrip('\n')
@@ -569,10 +685,7 @@ class OceanInfo():
 			if m:
 				assert(archname is not None)
 				islename = m.group(1)
-				isle = self.isleclass(self.ocean, islename)
-				isle.arch = archname
-				self.islands[islename] = isle
-				self.arches[archname][islename] = isle
+				isles.append((archname, islename))
 				continue
 			m = arch_re.match(l)
 			if m:
@@ -581,8 +694,21 @@ class OceanInfo():
 				self.arches[archname] = { }
 				continue
 			assert(False)
-		oscraper.wait()
-		assert(oscraper.returncode == 0)
+
+		for i in xrange(len(isles)):  # every isle, including the last
+			(archname, islename) = isles[i]
+			progressreporter.doing(
+				'fetching isle info %2d/%d (%s: %s)'
+				% (i, len(isles), archname, islename))
+			isle = self.isleclass(self.ocean, islename)
+			isle.arch = archname
+			self.islands[islename] = isle
+			self.arches[archname][islename] = isle
+
+		progressreporter.stop()
+
+	def __str__(self):
+		return `(self.islands, self.arches)`
 
 #---------- pretty-printer for tables of pirate puzzle standings ----------
 
@@ -1249,7 +1375,7 @@ def do_flag_of(args, bu):
 	pi = PirateInfo(args[0], max_age)
 	if pi.flag is None: fi = None
 	else: fi = FlagInfo(pi.flag[0], max_age)
-	print `fi`
+	print fi
 
 def do_standings_crew_of(args, bu):
 	ci = prep_crew_of(args, bu, 60)
@@ -1266,12 +1392,43 @@ def do_standings_crew_of(args, bu):
 def do_ocean(args, bu):
 	if (len(args)): bu('ocean takes no further arguments')
 	fetcher.default_ocean()
-	oi = OceanInfo()
-	oi.collect()
+	oi = OceanInfo(IslandFlagInfo)
+	print oi
 	for islename in sorted(oi.islands.keys()):
 		isle = oi.islands[islename]
-		yoweb_url = isle.yoweb_url()
-		print " %s -- %s" % (islename, yoweb_url)
+		print isle
+
+def do_embargoes(args, bu):  # print island / owning flag / embargoed flags
+	if (len(args)): bu('embargoes takes no further arguments')
+	fetcher.default_ocean()
+	oi = OceanInfo(IslandFlagInfo)
+	wr = sys.stdout.write
+	print ('EMBARGOES:  Island    | Owning flag'+
+		'                    | Embargoed flags')
+
+	def getflname(isle):  # owning flag name, or why there is none
+		if isle.islandid is None: return 'uncolonisable'
+		if isle.flag is None: return 'uncolonised'
+		return isle.flag.name
+
+	for archname in sorted(oi.arches.keys()):
+		print 'ARCHIPELAGO: ',archname
+		for islename in sorted(oi.arches[archname].keys()):
+			isle = oi.islands[islename]
+			wr(' %-20s | ' % isle.name)
+			flname = getflname(isle)
+			wr('%-30s | ' % flname)
+			flag = isle.flag
+			if flag is None: print ''; continue
+			delim = ''
+			for rel in flag.relations:
+				(oname, oid, dummy, thisdeclaring,
+					odeclaringmin,odeclaringmax) = rel
+				if thisdeclaring >= 0: continue  # only negative listed
+				wr(delim)
+				wr(oname)
+				delim = '; '
+			print ''
 
 #----- modes which use the chat log parser are quite complex -----
 
@@ -1594,7 +1751,7 @@ class KeystrokeReader(DummyKeystrokeReader):
 #---------- main program ----------
 
 def main():
-	global opts, fetcher
+	global opts, fetcher, yppedia
 
 	pa = OptionParser(
 '''usage: .../yoweb-scrape [OPTION...] ACTION [ARGS...]
@@ -1676,7 +1833,8 @@ display modes (for --display) apply to ship-aid:
 		else:
 			opts.display = 'overwrite'
 
-	fetcher = Fetcher(opts.ocean, opts.cache_dir)
+	fetcher = Yoweb(opts.ocean, opts.cache_dir)
+	yppedia = Yppedia(opts.cache_dir)
 
 	mode_fn(args[1:], pa.error)