From: Ian Jackson
Date: Sun, 16 Jan 2011 15:30:41 +0000 (+0000)
Subject: yoweb-scrape: wip new flag and ocean functionality - can fetch island ownerships too
X-Git-Tag: 6.8.0~5
X-Git-Url: http://www.chiark.greenend.org.uk/ucgi/~yarrgweb/git?p=ypp-sc-tools.main.git;a=commitdiff_plain;h=f0eb636cf4249dbc353f24a74287913f8c88fcc0

yoweb-scrape: wip new flag and ocean functionality - can fetch island ownerships too
---

diff --git a/yoweb-scrape b/yoweb-scrape
index ba828f5..d2557b5 100755
--- a/yoweb-scrape
+++ b/yoweb-scrape
@@ -110,19 +110,14 @@ def yppsc_dir():
 #---------- caching and rate-limiting data fetcher ----------
 
 class Fetcher:
-	def __init__(self, ocean, cachedir):
+	def __init__(self, cachedir):
 		debug('Fetcher init %s' % cachedir)
-		self.ocean = ocean
 		self.cachedir = cachedir
 		try: os.mkdir(cachedir)
 		except (OSError,IOError), oe:
 			if oe.errno != errno.EEXIST: raise
 		self._cache_scan(time.time())
 
-	def default_ocean(self, ocean='ice'):
-		if self.ocean is None:
-			self.ocean = ocean
-
 	def _cache_scan(self, now):
 		# returns list of ages, unsorted
 		ages = []
@@ -204,12 +199,38 @@ class Fetcher:
 		debug('Fetcher stored')
 		return data
 
+class Yoweb(Fetcher):
+	def __init__(self, ocean, cachedir):
+		debug('Yoweb init %s' % cachedir)
+		self.ocean = ocean
+		Fetcher.__init__(self, cachedir)
+
+	def default_ocean(self, ocean='ice'):
+		if self.ocean is None:
+			self.ocean = ocean
+
 	def yoweb(self, kind, tail, max_age):
 		self.default_ocean()
+		assert(self.ocean)
 		url = 'http://%s.puzzlepirates.com/yoweb/%s%s' % (
 			self.ocean, kind, tail)
 		return self.fetch(url, max_age)
 
+class Yppedia(Fetcher):
+	def __init__(self, cachedir):
+		debug('Yoweb init %s' % cachedir)
+		self.base = 'http://yppedia.puzzlepirates.com/'
+		self.localhtml = opts.localhtml
+		Fetcher.__init__(self, cachedir)
+
+	def __call__(self, rhs):
+		if self.localhtml is None:
+			url = self.base + rhs
+			debug('Yppedia retrieving YPP '+url);
+			return self.fetch(url, 3000)
+		else:
+			return file(opts.localhtml + '/' + rhs, 'r')
+
 #---------- logging assistance for troubled screenscrapers ----------
 
 class SoupLog:
@@ -497,47 +518,84 @@ class FlagInfo(SomethingSoupInfo):
 
 #---------- scraper for ocean info incl. embargoes etc. ----------
 
-class IslandInfo():
+class IslandBasicInfo():
+	# Public members:
+	#  ocean
+	#  name
+	# Public members maybe set by caller:
+	#  arch
 	def __init__(self, ocean, islename):
 		self.ocean = ocean
 		self.name = islename
 	def collect(self):
 		pass
-	def yppedia_dataf(self):
+	def yppedia(self):
 		def q(x): return urllib.quote(x.replace(' ','_'))
 		url_rhs = q(self.name) + '_(' + q(self.ocean) + ')'
-		if opts.localhtml is None:
-			url = 'http://yppedia.puzzlepirates.com/' + url_rhs
-			debug('IslandInfo retrieving YPP '+url);
-			return urllib.urlopen(url)
-		else:
-			return file(opts.localhtml + '/' + url_rhs, 'r')
-	def yoweb_url(self):
-		soup = BeautifulSoup(self.yppedia_dataf())
+		return yppedia(url_rhs)
+	def __str__(self):
+		return `(self.ocean, self.name)`
+
+class IslandExtendedInfo(IslandBasicInfo):
+	# Public members (inherited):
+	#  ocean
+	#  name
+	# Public members (additional):
+	#  islandid
+	#  yoweb_url
+	#  flagid
+	def collect(self):
+		IslandBasicInfo.collect(self)
+		self._collect_yoweb()
+		self._collect_flagid()
+
+	def _collect_yoweb(self):
+		debug('IEI COLLECT YOWEB '+`self.name`)
+		self.islandid = None
+		self.yoweb_url = None
+
+		soup = BeautifulSoup(self.yppedia())
 		content = soup.find('div', attrs = {'id': 'content'})
 		yoweb_re = regexp.compile('^http://\w+\.puzzlepirates\.com/'+
-			'yoweb/island/info\.wm\?islandid=\d+$')
+			'yoweb/island/info\.wm\?islandid=(\d+)$')
 		a = soup.find('a', attrs = { 'href': yoweb_re })
-		if a is None: return None
-		return a['href']
-	def ruling_flag_id(self):
-		yo = self.yoweb_url()
+		if a is None:
+			debug('IEI COLLECT YOWEB '+`self.name`+' NONE')
+			return
+
+		debug('IEI COLLECT YOWEB '+`self.name`+' GOT '+``a``)
+		self.yoweb_url = a['href']
+		m = yoweb_re.search(self.yoweb_url)
+		self.islandid = m.group(1)
+
+	def _collect_flagid(self):
+		self.flagid = None
+
+		yo = self.yoweb_url
+		debug('IEI COLLECT FLAGID '+`self.name`+' URL '+`yo`)
 		if yo is None: return None
-		dataf = fetcher.fetch(yo, 600)
+		dataf = fetcher.fetch(yo, 1800)
 		soup = BeautifulSoup(dataf)
-		ruler_re = regexp.compile('http://\w+\.puzzlepirates\.com/'+
-			'yoweb/flag/info\.wm\?flagid=(\d+)$')
+		ruler_re = regexp.compile(
+			'/yoweb/flag/info\.wm\?flagid=(\d+)$')
 		ruler = soup.find('a', attrs = { 'href': ruler_re })
-		if not ruler: return None
-		m = ruler_re.find(ruler['href'])
-		return m.group(1)
+		if not ruler:
+			debug('IEI COLLECT FLAGID '+`self.name`+' NONE')
+			return
+		debug('IEI COLLECT FLAGID '+`self.name`+' GOT '+``ruler``)
+		m = ruler_re.search(ruler['href'])
+		self.flagid = m.group(1)
+
+	def __str__(self):
+		return `(self.ocean, self.islandid, self.name,
+			self.yoweb_url, self.flagid)`
 
 class OceanInfo():
 	# Public data attributes (valid after collect()):
 	#  oi.islands[islename] = IslandInfo(...)
 	#  oi.arches[archname][islename] = IslandInfo(...)
-	def __init__(self):
-		self.isleclass = IslandInfo
+	def __init__(self, isleclass=IslandBasicInfo):
+		self.isleclass = isleclass
 		self.ocean = fetcher.ocean.lower().capitalize()
 	def collect(self):
 		cmdl = ['./yppedia-ocean-scraper']
@@ -570,6 +628,7 @@ class OceanInfo():
 			islename = m.group(1)
 			isle = self.isleclass(self.ocean, islename)
 			isle.arch = archname
+			isle.collect()
 			self.islands[islename] = isle
 			self.arches[archname][islename] = isle
 			continue
@@ -582,6 +641,8 @@ class OceanInfo():
 			assert(False)
 		oscraper.wait()
 		assert(oscraper.returncode == 0)
+	def __str__(self):
+		return `(self.islands, self.arches)`
 
 #---------- pretty-printer for tables of pirate puzzle standings ----------
 
@@ -1265,12 +1326,12 @@ def do_standings_crew_of(args, bu):
 def do_ocean(args, bu):
 	if (len(args)): bu('ocean takes no further arguments')
 	fetcher.default_ocean()
-	oi = OceanInfo()
+	oi = OceanInfo(IslandExtendedInfo)
 	oi.collect()
+	print oi
 	for islename in sorted(oi.islands.keys()):
 		isle = oi.islands[islename]
-		yoweb_url = isle.yoweb_url()
-		print " %s -- %s" % (islename, yoweb_url)
+		print isle
 
 #----- modes which use the chat log parser are quite complex -----
 
@@ -1593,7 +1654,7 @@ class KeystrokeReader(DummyKeystrokeReader):
 #---------- main program ----------
 
 def main():
-	global opts, fetcher
+	global opts, fetcher, yppedia
 	pa = OptionParser(
 '''usage: .../yoweb-scrape [OPTION...] ACTION [ARGS...]
 
@@ -1675,7 +1736,8 @@ display modes (for --display) apply to ship-aid:
 	else:
 		opts.display = 'overwrite'
 
-	fetcher = Fetcher(opts.ocean, opts.cache_dir)
+	fetcher = Yoweb(opts.ocean, opts.cache_dir)
+	yppedia = Yppedia(opts.cache_dir)
 
 	mode_fn(args[1:], pa.error)
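
For orientation (a sketch, not part of the commit): the caching Fetcher base class now
has two subclasses, a per-ocean Yoweb fetcher and an ocean-independent Yppedia fetcher,
and OceanInfo takes an isleclass argument so callers choose how much per-island data is
scraped. Roughly how do_ocean drives this, assuming the script's globals (opts, fetcher,
yppedia) have already been set up by main() as in the hunks above:

	# sketch only - relies on opts.ocean, opts.cache_dir and opts.localhtml
	# having been filled in by main()'s OptionParser
	fetcher = Yoweb(opts.ocean, opts.cache_dir)   # yoweb pages, needs an ocean
	yppedia = Yppedia(opts.cache_dir)             # yppedia pages, ocean-independent

	fetcher.default_ocean()               # falls back to 'ice' if --ocean was not given
	oi = OceanInfo(IslandExtendedInfo)    # isleclass: how much to collect per island
	oi.collect()                          # runs ./yppedia-ocean-scraper, then calls
	                                      # isle.collect() on every island found
	for islename in sorted(oi.islands.keys()):
		print oi.islands[islename]    # IslandExtendedInfo.__str__ includes flagid

Passing IslandBasicInfo instead (the default) keeps the ocean/arch/island structure but
skips the per-island yppedia and yoweb fetches, since IslandBasicInfo.collect() is a no-op.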