X-Git-Url: http://www.chiark.greenend.org.uk/ucgi/~yarrgweb/git?p=ypp-sc-tools.db-live.git;a=blobdiff_plain;f=yoweb-scrape;h=90a5677c50e3ff1b1a01fb5427d98ff5ebf564ca;hp=68df2163812afe1b409937bd3fff9c24147073f7;hb=bb6f3057cff743f25c2ef95a734b9c0f05c97ff6;hpb=5ee6146a73510ae17e9dc8d78d46ec0098548c49 diff --git a/yoweb-scrape b/yoweb-scrape index 68df216..90a5677 100755 --- a/yoweb-scrape +++ b/yoweb-scrape @@ -110,19 +110,14 @@ def yppsc_dir(): #---------- caching and rate-limiting data fetcher ---------- class Fetcher: - def __init__(self, ocean, cachedir): + def __init__(self, cachedir): debug('Fetcher init %s' % cachedir) - self.ocean = ocean self.cachedir = cachedir try: os.mkdir(cachedir) except (OSError,IOError), oe: if oe.errno != errno.EEXIST: raise self._cache_scan(time.time()) - def default_ocean(self, ocean='ice'): - if self.ocean is None: - self.ocean = ocean - def _cache_scan(self, now): # returns list of ages, unsorted ages = [] @@ -204,12 +199,38 @@ class Fetcher: debug('Fetcher stored') return data +class Yoweb(Fetcher): + def __init__(self, ocean, cachedir): + debug('Yoweb init %s' % cachedir) + self.ocean = ocean + Fetcher.__init__(self, cachedir) + + def default_ocean(self, ocean='ice'): + if self.ocean is None: + self.ocean = ocean + def yoweb(self, kind, tail, max_age): self.default_ocean() + assert(self.ocean) url = 'http://%s.puzzlepirates.com/yoweb/%s%s' % ( self.ocean, kind, tail) return self.fetch(url, max_age) +class Yppedia(Fetcher): + def __init__(self, cachedir): + debug('Yoweb init %s' % cachedir) + self.base = 'http://yppedia.puzzlepirates.com/' + self.localhtml = opts.localhtml + Fetcher.__init__(self, cachedir) + + def __call__(self, rhs): + if self.localhtml is None: + url = self.base + rhs + debug('Yppedia retrieving YPP '+url); + return self.fetch(url, 3000) + else: + return file(opts.localhtml + '/' + rhs, 'r') + #---------- logging assistance for troubled screenscrapers ---------- class SoupLog: @@ -429,18 +450,13 @@ class FlagInfo(SomethingSoupInfo): (`head`, ``waritem``)) def wihelp_item(waritem, thing): - if waritem.name == 'a': - url = waritem.get('href', None) - if url is None: - return ('no url for '+thing,None,None) - else: - hr = waritem.find('a',{'href':True}) - if not hr: return ('no a for '+thing,None,None) - url = hr['href'] + url = waritem.get('href', None) + if url is None: + return ('no url for '+thing,None,None) m = regexp.search('\?'+thing+'id=(\d+)$', url) if not m: return ('no '+thing+'id',None,None) tid = m.group(1) - tname = m.string + tname = waritem.string if tname is None: return (thing+' name not just string',None,None) return (None,tid,tname) @@ -452,7 +468,7 @@ class FlagInfo(SomethingSoupInfo): if rel: return 'flag id twice!' if flagname in self.relation_byname: return 'flag name twice!' - rel = (flagname,flagid,[], thisdecl,othermin,othermax) + rel = (flagname,flagid,head, thisdecl,othermin,othermax) self.relations.append(rel) self.relation_byid[flagid] = rel self.relation_byname[flagid] = rel @@ -472,20 +488,24 @@ class FlagInfo(SomethingSoupInfo): how = (wi_warn, None) - for waritem in warinfo.contents: - debug('WARITEM '+``waritem``) - if isinstance(waritem, unicode): - waritem = waritem.strip() - if waritem: warn('unknown waritem '+``waritem``) - continue - if waritem.name == 'br': - continue - if waritem.name == 'b': - head = ''.join(waritem.findAll(text=True)) + for waritem in warinfo.findAll(['font','a']): + if waritem is None: break + if waritem.name == 'font': + colour = waritem.get('color',None) + if colour.lstrip('#') != '958A5F': + warn('strange colour %s in %s' % + (colour,``waritem``)) + continue + head = waritem.string + if head is None: + warn('no head string in '+``waritem``) + continue head = regexp.sub('\\s+', ' ', head).strip() head = head.rstrip(':') how = (head,) + warmap.get(head, (wi_warn,)) continue + assert(waritem.name == 'a') + debug('WARHOW %s(%s, waritem, *%s)' % (how[1], `how[0]`, `how[2:]`)) bad = how[1](how[0], waritem, *how[2:]) @@ -498,49 +518,139 @@ class FlagInfo(SomethingSoupInfo): #---------- scraper for ocean info incl. embargoes etc. ---------- -class IslandInfo(): +class IslandBasicInfo(): + # Public data attributes: + # ocean + # name + # Public data attributes maybe set by caller: + # arch def __init__(self, ocean, islename): self.ocean = ocean self.name = islename - def collect(self): - pass - def yppedia_dataf(self): + def yppedia(self): def q(x): return urllib.quote(x.replace(' ','_')) url_rhs = q(self.name) + '_(' + q(self.ocean) + ')' - if opts.localhtml is None: - url = 'http://yppedia.puzzlepirates.com/' + url_rhs - debug('IslandInfo retrieving YPP '+url); - return urllib.urlopen(url) - else: - return file(opts.localhtml + '/' + url_rhs, 'r') - def yoweb_url(self): - soup = BeautifulSoup(self.yppedia_dataf()) + return yppedia(url_rhs) + def __str__(self): + return `(self.ocean, self.name)` + +class IslandExtendedInfo(IslandBasicInfo): + # Public data attributes (inherited): + # ocean + # name + # Public data attributes (additional): + # islandid + # yoweb_url + # flagid + def __init__(self, ocean, islename): + IslandBasicInfo.__init__(self, ocean, islename) + self.islandid = None + self.yoweb_url = None + self._collect_yoweb() + self._collect_flagid() + + def _collect_yoweb(self): + debug('IEI COLLECT YOWEB '+`self.name`) + self.islandid = None + self.yoweb_url = None + + soup = BeautifulSoup(self.yppedia()) content = soup.find('div', attrs = {'id': 'content'}) yoweb_re = regexp.compile('^http://\w+\.puzzlepirates\.com/'+ - 'yoweb/island/info\.wm\?islandid=\d+$') + 'yoweb/island/info\.wm\?islandid=(\d+)$') a = soup.find('a', attrs = { 'href': yoweb_re }) - if a is None: return None - return a['href'] - def ruling_flag_id(self): - yo = self.yoweb_url() + if a is None: + debug('IEI COLLECT YOWEB '+`self.name`+' NONE') + return + + debug('IEI COLLECT YOWEB '+`self.name`+' GOT '+``a``) + self.yoweb_url = a['href'] + m = yoweb_re.search(self.yoweb_url) + self.islandid = m.group(1) + + def _collect_flagid(self): + self.flagid = None + + yo = self.yoweb_url + debug('IEI COLLECT FLAGID '+`self.name`+' URL '+`yo`) if yo is None: return None - dataf = fetcher.fetch(yo, 600) + dataf = fetcher.fetch(yo, 1800) soup = BeautifulSoup(dataf) - ruler_re = regexp.compile('http://\w+\.puzzlepirates\.com/'+ - 'yoweb/flag/info\.wm\?flagid=(\d+)$') + ruler_re = regexp.compile( + '/yoweb/flag/info\.wm\?flagid=(\d+)$') ruler = soup.find('a', attrs = { 'href': ruler_re }) - if not ruler: return None - m = ruler_re.find(ruler['href']) - return m.group(1) + if not ruler: + debug('IEI COLLECT FLAGID '+`self.name`+' NONE') + return + debug('IEI COLLECT FLAGID '+`self.name`+' GOT '+``ruler``) + m = ruler_re.search(ruler['href']) + self.flagid = m.group(1) + + def __str__(self): + return `(self.ocean, self.islandid, self.name, + self.yoweb_url, self.flagid)` + +class IslandFlagInfo(IslandExtendedInfo): + # Public data attributes (inherited): + # ocean + # name + # islandid + # yoweb_url + # flagid + # Public data attributes (additional): + # flag + def __init__(self, ocean, islename): + IslandExtendedInfo.__init__(self, ocean, islename) + self.flag = None + self._collect_flag() + + def _collect_flag(self): + if self.flagid is None: return + self.flag = FlagInfo(self.flagid, 1800) + + def __str__(self): + return IslandExtendedInfo.__str__(self) + '; ' + str(self.flag) + +class NullProgressReporter(): + def start(self): pass + def doing(self, msg): pass + def stop(self): pass + +class TypewriterProgressReporter(): + def start(self): + self._l = 0 + def doing(self,m): + self._doing(m + '...') + def _doing(self,m): + self._write('\r') + self._write(m) + less = self._l - len(m) + if less > 0: + self._write(' ' * less) + self._write('\b' * less) + self._l = len(m) + sys.stdout.flush() + def stop(self): + self._doing('') + self._l = 0 + def _write(self,t): + sys.stdout.write(t) class OceanInfo(): - # Public data attributes (valid after collect()): + # Public data attributes: # oi.islands[islename] = IslandInfo(...) # oi.arches[archname][islename] = IslandInfo(...) - def __init__(self): - self.isleclass = IslandInfo + def __init__(self, isleclass=IslandBasicInfo, progressreporter=None): + if progressreporter is None: + if opts.debug: progressreporter = NullProgressReporter() + else: progressreporter = TypewriterProgressReporter() + + self.isleclass = isleclass self.ocean = fetcher.ocean.lower().capitalize() - def collect(self): + + progressreporter.start() + progressreporter.doing('fetching ocean info') + cmdl = ['./yppedia-ocean-scraper'] if opts.localhtml is not None: cmdl += ['--local-html-dir',opts.localhtml] @@ -558,10 +668,16 @@ class OceanInfo(): arch_re = regexp.compile('^ (\S.*)') island_re = regexp.compile('^ (\S.*)') + oscraper.wait() + assert(oscraper.returncode == 0) + self.islands = { } self.arches = { } archname = None + isles = [ ] + progressreporter.doing('parsing ocean info') + for l in oscraper.stdout: debug('OceanInfo collect l '+`l`) l = l.rstrip('\n') @@ -569,10 +685,7 @@ class OceanInfo(): if m: assert(archname is not None) islename = m.group(1) - isle = self.isleclass(self.ocean, islename) - isle.arch = archname - self.islands[islename] = isle - self.arches[archname][islename] = isle + isles.append((archname, islename)) continue m = arch_re.match(l) if m: @@ -581,8 +694,21 @@ class OceanInfo(): self.arches[archname] = { } continue assert(False) - oscraper.wait() - assert(oscraper.returncode == 0) + + for i in xrange(0, len(isles)-1): + (archname, islename) = isles[i] + progressreporter.doing( + 'fetching isle info %2d/%d (%s: %s)' + % (i, len(isles), archname, islename)) + isle = self.isleclass(self.ocean, islename) + isle.arch = archname + self.islands[islename] = isle + self.arches[archname][islename] = isle + + progressreporter.stop() + + def __str__(self): + return `(self.islands, self.arches)` #---------- pretty-printer for tables of pirate puzzle standings ---------- @@ -1249,7 +1375,7 @@ def do_flag_of(args, bu): pi = PirateInfo(args[0], max_age) if pi.flag is None: fi = None else: fi = FlagInfo(pi.flag[0], max_age) - print `fi` + print fi def do_standings_crew_of(args, bu): ci = prep_crew_of(args, bu, 60) @@ -1266,12 +1392,43 @@ def do_standings_crew_of(args, bu): def do_ocean(args, bu): if (len(args)): bu('ocean takes no further arguments') fetcher.default_ocean() - oi = OceanInfo() - oi.collect() + oi = OceanInfo(IslandFlagInfo) + print oi for islename in sorted(oi.islands.keys()): isle = oi.islands[islename] - yoweb_url = isle.yoweb_url() - print " %s -- %s" % (islename, yoweb_url) + print isle + +def do_embargoes(args, bu): + if (len(args)): bu('ocean takes no further arguments') + fetcher.default_ocean() + oi = OceanInfo(IslandFlagInfo) + wr = sys.stdout.write + print ('EMBARGOES: Island | Owning flag'+ + ' | Embargoed flags') + + def getflname(isle): + if isle.islandid is None: return 'uncolonisable' + if isle.flag is None: return 'uncolonised' + return isle.flag.name + + for archname in sorted(oi.arches.keys()): + print 'ARCHIPELAGO: ',archname + for islename in sorted(oi.arches[archname].keys()): + isle = oi.islands[islename] + wr(' %-20s | ' % isle.name) + flname = getflname(isle) + wr('%-30s | ' % flname) + flag = isle.flag + if flag is None: print ''; continue + delim = '' + for rel in flag.relations: + (oname, oid, dummy, thisdeclaring, + odeclaringmin,odeclaringmax) = rel + if thisdeclaring >= 0: continue + wr(delim) + wr(oname) + delim = '; ' + print '' #----- modes which use the chat log parser are quite complex ----- @@ -1594,7 +1751,7 @@ class KeystrokeReader(DummyKeystrokeReader): #---------- main program ---------- def main(): - global opts, fetcher + global opts, fetcher, yppedia pa = OptionParser( '''usage: .../yoweb-scrape [OPTION...] ACTION [ARGS...] @@ -1676,7 +1833,8 @@ display modes (for --display) apply to ship-aid: else: opts.display = 'overwrite' - fetcher = Fetcher(opts.ocean, opts.cache_dir) + fetcher = Yoweb(opts.ocean, opts.cache_dir) + yppedia = Yppedia(opts.cache_dir) mode_fn(args[1:], pa.error)