#---------- caching and rate-limiting data fetcher ----------
class Fetcher:
+ # Base class: caching, rate-limiting HTTP fetcher.  This patch removes
+ # the 'ocean' argument — per-ocean state now lives in the Yoweb
+ # subclass — so Fetcher itself needs only the cache directory.
- def __init__(self, ocean, cachedir):
+ def __init__(self, cachedir):
 debug('Fetcher init %s' % cachedir)
- self.ocean = ocean
 self.cachedir = cachedir
+ # Create the cache dir if absent; an existing dir is fine,
+ # any other mkdir failure is re-raised.
 try: os.mkdir(cachedir)
 except (OSError,IOError), oe:
 if oe.errno != errno.EEXIST: raise
 self._cache_scan(time.time())
- def default_ocean(self, ocean='ice'):
- if self.ocean is None:
- self.ocean = ocean
-
 def _cache_scan(self, now):
 # returns list of ages, unsorted
+ # NOTE(review): body elided in this hunk; 'data' is presumably
+ # built in the omitted lines — confirm against the full file.
 ages = []
 debug('Fetcher stored')
 return data
+class Yoweb(Fetcher):
+ # Fetcher specialised for the per-ocean yoweb.puzzlepirates.com site.
+ # Holds the selected ocean name (may be None until default_ocean runs).
+ def __init__(self, ocean, cachedir):
+ debug('Yoweb init %s' % cachedir)
+ self.ocean = ocean
+ Fetcher.__init__(self, cachedir)
+
+ def default_ocean(self, ocean='ice'):
+ # If no ocean was chosen by the caller, fall back to 'ice'.
+ if self.ocean is None:
+ self.ocean = ocean
+
 def yoweb(self, kind, tail, max_age):
+ # Fetch http://<ocean>.puzzlepirates.com/yoweb/<kind><tail>,
+ # cached for up to max_age seconds (delegates to Fetcher.fetch).
 self.default_ocean()
+ assert(self.ocean)
 url = 'http://%s.puzzlepirates.com/yoweb/%s%s' % (
 self.ocean, kind, tail)
 return self.fetch(url, max_age)
+class Yppedia(Fetcher):
+ # Fetcher for the (ocean-independent) YPPedia wiki; reads from a
+ # local HTML tree instead when --local-html was given.
+ def __init__(self, cachedir):
+ # BUGFIX: debug message said 'Yoweb init' — copy-paste from the
+ # Yoweb class; corrected so logs identify the right fetcher.
+ debug('Yppedia init %s' % cachedir)
+ self.base = 'http://yppedia.puzzlepirates.com/'
+ self.localhtml = opts.localhtml
+ Fetcher.__init__(self, cachedir)
+
+ def __call__(self, rhs):
+ # Return a file-like object for the page BASE/<rhs>, cached for
+ # up to 3000s, or the corresponding local file under --local-html.
+ if self.localhtml is None:
+ url = self.base + rhs
+ debug('Yppedia retrieving YPP '+url);
+ return self.fetch(url, 3000)
+ else:
+ # consistency: use the value cached in __init__, not opts
+ return file(self.localhtml + '/' + rhs, 'r')
+
#---------- logging assistance for troubled screenscrapers ----------
class SoupLog:
#---------- scraper for ocean info incl. embargoes etc. ----------
-class IslandInfo():
+class IslandBasicInfo():
+ # Minimal per-island record: just identity, plus a YPPedia page
+ # fetch helper.  Renamed from IslandInfo; the collect() stub is gone.
+ # Public data attributes:
+ # ocean
+ # name
+ # Public data attributes maybe set by caller:
+ # arch
 def __init__(self, ocean, islename):
 self.ocean = ocean
 self.name = islename
- def collect(self):
- pass
- def yppedia_dataf(self):
+ def yppedia(self):
+ # Return a file-like object for this island's YPPedia page via
+ # the global Yppedia fetcher (which handles caching/--local-html),
+ # replacing the old inline urlopen/local-file logic below.
 def q(x): return urllib.quote(x.replace(' ','_'))
 url_rhs = q(self.name) + '_(' + q(self.ocean) + ')'
- if opts.localhtml is None:
- url = 'http://yppedia.puzzlepirates.com/' + url_rhs
- debug('IslandInfo retrieving YPP '+url);
- return urllib.urlopen(url)
- else:
- return file(opts.localhtml + '/' + url_rhs, 'r')
- def yoweb_url(self):
- soup = BeautifulSoup(self.yppedia_dataf())
+ return yppedia(url_rhs)
+ def __str__(self):
+ # Python 2 backquotes == repr()
+ return `(self.ocean, self.name)`
+
+class IslandExtendedInfo(IslandBasicInfo):
+ # IslandBasicInfo plus data scraped from YPPedia/yoweb at
+ # construction time.
+ # Public data attributes (inherited):
+ # ocean
+ # name
+ # Public data attributes (additional):
+ # islandid
+ # yoweb_url
+ # flagid
+ def __init__(self, ocean, islename):
+ IslandBasicInfo.__init__(self, ocean, islename)
+ self.islandid = None
+ self.yoweb_url = None
+ self._collect_yoweb()
+ self._collect_flagid()
+
+ def _collect_yoweb(self):
+ # Scrape this island's YPPedia page for its yoweb island-info
+ # link; sets self.yoweb_url and self.islandid (None if absent).
+ debug('IEI COLLECT YOWEB '+`self.name`)
+ self.islandid = None
+ self.yoweb_url = None
+
+ soup = BeautifulSoup(self.yppedia())
 content = soup.find('div', attrs = {'id': 'content'})
+ # capture group added so the numeric islandid can be extracted
 yoweb_re = regexp.compile('^http://\w+\.puzzlepirates\.com/'+
- 'yoweb/island/info\.wm\?islandid=\d+$')
+ 'yoweb/island/info\.wm\?islandid=(\d+)$')
 a = soup.find('a', attrs = { 'href': yoweb_re })
- if a is None: return None
- return a['href']
- def ruling_flag_id(self):
- yo = self.yoweb_url()
+ if a is None:
+ debug('IEI COLLECT YOWEB '+`self.name`+' NONE')
+ return
+
+ debug('IEI COLLECT YOWEB '+`self.name`+' GOT '+``a``)
+ self.yoweb_url = a['href']
+ m = yoweb_re.search(self.yoweb_url)
+ self.islandid = m.group(1)
+
+ def _collect_flagid(self):
+ # Fetch the island's yoweb page and extract the ruling flag's
+ # id into self.flagid (None when there is no ruler link).
+ self.flagid = None
+
+ yo = self.yoweb_url
+ debug('IEI COLLECT FLAGID '+`self.name`+' URL '+`yo`)
 if yo is None: return None
+ # cache age raised 600 -> 1800s
- dataf = fetcher.fetch(yo, 600)
+ dataf = fetcher.fetch(yo, 1800)
 soup = BeautifulSoup(dataf)
- ruler_re = regexp.compile('http://\w+\.puzzlepirates\.com/'+
- 'yoweb/flag/info\.wm\?flagid=(\d+)$')
+ ruler_re = regexp.compile(
+ '/yoweb/flag/info\.wm\?flagid=(\d+)$')
 ruler = soup.find('a', attrs = { 'href': ruler_re })
- if not ruler: return None
- m = ruler_re.find(ruler['href'])
- return m.group(1)
+ if not ruler:
+ debug('IEI COLLECT FLAGID '+`self.name`+' NONE')
+ return
+ debug('IEI COLLECT FLAGID '+`self.name`+' GOT '+``ruler``)
+ # also fixes the old ruler_re.find() call — compiled re
+ # patterns have search(), not find()
+ m = ruler_re.search(ruler['href'])
+ self.flagid = m.group(1)
+
+ def __str__(self):
+ return `(self.ocean, self.islandid, self.name,
+ self.yoweb_url, self.flagid)`
class OceanInfo():
- # Public data attributes (valid after collect()):
+ # Public data attributes:
 # oi.islands[islename] = IslandInfo(...)
 # oi.arches[archname][islename] = IslandInfo(...)
- def __init__(self):
- self.isleclass = IslandInfo
+ def __init__(self, isleclass=IslandBasicInfo):
+ # isleclass: per-island factory; callers may pass
+ # IslandExtendedInfo for richer (scraped) per-island data.
+ self.isleclass = isleclass
 self.ocean = fetcher.ocean.lower().capitalize()
- def collect(self):
+
+ # NOTE(review): collect() is folded into the constructor — the
+ # ocean scraper now runs at construction time.  Context around the
+ # scraper invocation (where 'oscraper' is started) is elided here.
 cmdl = ['./yppedia-ocean-scraper']
 if opts.localhtml is not None:
 cmdl += ['--local-html-dir',opts.localhtml]
 oscraper.wait()
 assert(oscraper.returncode == 0)
+ def __str__(self):
+ return `(self.islands, self.arches)`
+
#---------- pretty-printer for tables of pirate puzzle standings ----------
class StandingsTable:
 def do_ocean(args, bu):
+ # 'ocean' mode: print info for every island on the selected ocean.
+ # bu() is the usage-error reporter (pa.error from main).
 if (len(args)): bu('ocean takes no further arguments')
 fetcher.default_ocean()
- oi = OceanInfo()
- oi.collect()
+ # Use the extended island class so each entry carries islandid,
+ # yoweb_url and flagid; OceanInfo now collects in its constructor.
+ oi = OceanInfo(IslandExtendedInfo)
+ print oi
 for islename in sorted(oi.islands.keys()):
 isle = oi.islands[islename]
- yoweb_url = isle.yoweb_url()
- print " %s -- %s" % (islename, yoweb_url)
+ print isle
#----- modes which use the chat log parser are quite complex -----
#---------- main program ----------
 def main():
- global opts, fetcher
+ global opts, fetcher, yppedia
 pa = OptionParser(
 '''usage: .../yoweb-scrape [OPTION...] ACTION [ARGS...]
 else:
 opts.display = 'overwrite'
+ # Two global fetchers now: 'fetcher' (per-ocean yoweb) and 'yppedia'
+ # (ocean-independent wiki); both share the same cache directory.
- fetcher = Fetcher(opts.ocean, opts.cache_dir)
+ fetcher = Yoweb(opts.ocean, opts.cache_dir)
+ yppedia = Yppedia(opts.cache_dir)
 mode_fn(args[1:], pa.error)