def yppsc_dir():
    # Return the base directory of the ypp-sc-tools source tree.
    #
    # The YPPSC_YARRG_SRCBASE environment variable wins if it is set.
    # Otherwise the directory part of sys.argv[0] is used, and the result
    # is stored back into os.environ so that later calls return the same
    # value.
    lib = os.getenv("YPPSC_YARRG_SRCBASE")
    if lib is not None:
        return lib
    # A script name without any '/' has an empty directory part; that
    # means "the current directory", not a directory named after the
    # script itself.
    lib = os.path.dirname(sys.argv[0]) or '.'
    os.environ["YPPSC_YARRG_SRCBASE"] = lib
    return lib
class Yppedia(Fetcher):
    # Fetcher for YPPedia pages.
    #
    # Calling the instance with the right-hand part of a page URL returns
    # either the fetched page data (via Fetcher.fetch, max age 3000) or,
    # when a local HTML directory was configured, an open file object for
    # the corresponding file in that directory.

    def __init__(self, cachedir):
        debug('Yppedia init %s' % cachedir)
        self.base = 'http://yppedia.puzzlepirates.com/'
        # Snapshot of the command line option; None means "use HTTP".
        self.localhtml = opts.localhtml
        Fetcher.__init__(self, cachedir)

    def __call__(self, rhs):
        if self.localhtml is None:
            url = self.base + rhs
            debug('Yppedia retrieving YPP '+url)
            return self.fetch(url, 3000)
        # Use the directory captured at construction time, the same value
        # the test above looked at, rather than re-reading the global.
        return open(self.localhtml + '/' + rhs, 'r')
class FlagInfo(SomethingSoupInfo):
    # Scraper for a flag's yoweb info page.
    #
    # Public data members (after init):
    #
    #   flagid
    #   name                    string
    #
    #   relations[n] = FlagRelation
    #   relation_byname[otherflagname] = relations[some_n]
    #   relation_byid[otherflagid] = relations[some_n]
    #
    #   islands[n] = (islandid, islandname)
    #
    def __init__(self, flagid, max_age=600):
        self.flagid = flagid
        SomethingSoupInfo.__init__(self,
            'flag/info.wm?flagid=', flagid, max_age)
        self._find_flag()

    def _find_flag(self):
        # Fill in name, relations, relation_by{name,id} and islands from
        # the parsed page in self._soup.  Anything unexpected in the
        # war/alliance section is reported on stderr and skipped.
        font2 = self._soup.find('font', {'size': '+2'})
        self.name = font2.find('b').contents[0]

        self.relations = [ ]
        self.relation_byname = { }
        self.relation_byid = { }
        self.islands = [ ]

        # The war/alliance cell is located relative to the reputation icon.
        magnate = self._soup.find('img', {'src':
            '/yoweb/images/repute-MAGNATE.png'})
        warinfo = (magnate.findParent('table').findParent('tr').
            findNextSibling('tr').findNext('td', {'align': 'left'}))

        def warn(m):
            print >>sys.stderr, 'WARNING: '+m

        def wi_warn(head, waritem):
            # Handler for links under a heading we do not recognise.
            warn('unknown warmap item: %s: %s' %
                (repr(head), repr(repr(waritem))))

        def wihelp_item(waritem, thing):
            # Split a link into (error, id, name); error is None on success.
            url = waritem.get('href', None)
            if url is None:
                return ('no url for '+thing, None, None)
            m = regexp.search(r'\?'+thing+r'id=(\d+)$', url)
            if not m:
                return ('no '+thing+'id', None, None)
            tid = m.group(1)
            tname = waritem.string
            if tname is None:
                return (thing+' name not just string', None, None)
            return (None, tid, tname)

        def wi_alwar(head, waritem, thisdecl, othermin, othermax):
            # Handler for a link to another flag under an alliance/war
            # heading; returns an error string, or None on success.
            (err, flagid, flagname) = wihelp_item(waritem, 'flag')
            if err:
                return err
            if flagid in self.relation_byid:
                return 'flag id twice!'
            if flagname in self.relation_byname:
                return 'flag name twice!'
            rel = FlagRelation()
            rel.other_flagname = flagname
            rel.other_flagid = flagid
            rel.yoweb_heading = head
            rel.this_declaring = thisdecl
            rel.other_declaring_min = othermin
            rel.other_declaring_max = othermax
            self.relations.append(rel)
            self.relation_byid[flagid] = rel
            # Keyed by name (not id), so that the duplicate-name check
            # above and lookups by flag name actually work.
            self.relation_byname[flagname] = rel

        def wi_isle(head, waritem):
            # Handler for a link to an island controlled by this flag.
            (err, isleid, islename) = wihelp_item(waritem, 'island')
            if err:
                return err
            self.islands.append((isleid, islename))

        # heading text -> (handler, extra handler arguments...)
        warmap = {
            'Allied with':                     (wi_alwar, +1, +1, +1),
            'Declaring war against':           (wi_alwar, -1,  0, +1),
            'At war with':                     (wi_alwar, -1, -1, -1),
            'Trying to form an alliance with': (wi_alwar, +1, -1,  0),
            'Islands controlled by this flag': (wi_isle,),
            }

        # `how' is always (heading, handler, extra args...).  Until the
        # first heading is seen, links are reported via wi_warn.
        how = (None, wi_warn)

        for waritem in warinfo.findAll(['font', 'a']):
            if waritem is None:
                break
            if waritem.name == 'font':
                # <font> elements of the expected colour are headings.
                colour = waritem.get('color', None)
                if colour is None or colour.lstrip('#') != '958A5F':
                    warn('strange colour %s in %s' %
                        (colour, repr(repr(waritem))))
                    continue
                head = waritem.string
                if head is None:
                    warn('no head string in '+repr(repr(waritem)))
                    continue
                head = regexp.sub('\\s+', ' ', head).strip()
                head = head.rstrip(':')
                how = (head,) + warmap.get(head, (wi_warn,))
                continue
            assert(waritem.name == 'a')

            debug('WARHOW %s(%s, waritem, *%s)' %
                (how[1], repr(how[0]), repr(how[2:])))
            bad = how[1](how[0], waritem, *how[2:])
            if bad:
                warn('bad waritem %s: %s: %s' % (repr(how[0]),
                    bad, repr(repr(waritem))))

    def __str__(self):
        return repr((self.name, self.islands, self.relations))
class TypewriterProgressReporter():
    # Progress reporter which keeps rewriting a single line on stdout,
    # returning to the start of the line with '\r' for every new message.

    def __init__(self):
        # Length of the message most recently written.
        self._l = 0

    def doing(self, m):
        # Announce activity `m'; an ellipsis is appended.
        self._doing(m + '...')

    def stop(self):
        # Wipe whatever message is showing and forget its length.
        self._doing('')
        self._l = 0

    def _doing(self, m):
        shown = len(m)
        surplus = self._l - shown
        pieces = ['\r', m]
        if surplus > 0:
            # The previous message was longer: overwrite its tail with
            # blanks, then back up over those blanks again.
            pieces.append(' ' * surplus)
            pieces.append('\b' * surplus)
        for piece in pieces:
            self._write(piece)
        self._l = shown
        sys.stdout.flush()

    def _write(self, t):
        sys.stdout.write(t)
+ def __init__(self, isleclass=IslandBasicInfo): + self.isleclass = isleclass + self.ocean = fetcher.ocean.lower().capitalize() + + progressreporter.doing('fetching ocean info') + + cmdl = ['./yppedia-ocean-scraper'] + if opts.localhtml is not None: + cmdl += ['--local-html-dir',opts.localhtml] + cmdl += [self.ocean] + debug('OceanInfo collect running ' + `cmdl`) + oscraper = subprocess.Popen( + cmdl, + stdout = subprocess.PIPE, + cwd = yppsc_dir()+'/yarrg', + shell=False, stderr=None, + ) + h = oscraper.stdout.readline() + debug('OceanInfo collect h '+`h`) + assert(regexp.match('^ocean ', h)) + arch_re = regexp.compile('^ (\S.*)') + island_re = regexp.compile('^ (\S.*)') + + oscraper.wait() + assert(oscraper.returncode == 0) + + self.islands = { } + self.arches = { } + archname = None + + isles = [ ] + progressreporter.doing('parsing ocean info') + + for l in oscraper.stdout: + debug('OceanInfo collect l '+`l`) + l = l.rstrip('\n') + m = island_re.match(l) + if m: + assert(archname is not None) + islename = m.group(1) + isles.append((archname, islename)) + continue + m = arch_re.match(l) + if m: + archname = m.group(1) + assert(archname not in self.arches) + self.arches[archname] = { } + continue + assert(False) + + for i in xrange(0, len(isles)-1): + (archname, islename) = isles[i] + progressreporter.doing( + 'fetching isle info %2d/%d (%s: %s)' + % (i, len(isles), archname, islename)) + isle = self.isleclass(self.ocean, islename) + isle.arch = archname + self.islands[islename] = isle + self.arches[archname][islename] = isle + + def __str__(self): + return `(self.islands, self.arches)` + #---------- pretty-printer for tables of pirate puzzle standings ---------- class StandingsTable: @@ -1026,16 +1396,29 @@ def do_pirate(pirates, bu): print '%s: %s,' % (`pirate`, info) print '}' -def prep_crew_of(args, bu, max_age=300): - if len(args) != 1: bu('crew-of takes one pirate name') +def prep_crewflag_of(args, bu, max_age, selector, constructor): + if len(args) != 1: 
def do_embargoes(args, bu):
    # `embargoes' action: for every island of the ocean, grouped by
    # archipelago, print the island, its owning flag and the names of the
    # flags for which the owner's this_declaring is negative.
    #
    # args  remaining command line arguments (must be empty)
    # bu    bad-usage callback, called with a message
    #
    # NOTE(review): column padding inside the string literals below is
    # copied as it appears in the reviewed text, where runs of spaces may
    # have been collapsed; check against the pristine file.
    if args:
        bu('embargoes takes no further arguments')
    fetcher.default_ocean()
    oi = OceanInfo(IslandFlagInfo)
    wr = sys.stdout.write

    def getflname(isle):
        if isle.islandid is None: return 'uncolonisable'
        if isle.flag is None: return 'uncolonised'
        return isle.flag.name

    # Clear the progress line before any real output is written, so the
    # table header does not land on top of the last progress message.
    progressreporter.stop()

    print('EMBARGOES: Island | Owning flag'+
        ' | Embargoed flags')

    for archname in sorted(oi.arches.keys()):
        print('%s %s' % ('ARCHIPELAGO: ', archname))
        for islename in sorted(oi.arches[archname].keys()):
            isle = oi.islands[islename]
            wr(' %-20s | ' % isle.name)
            wr('%-30s | ' % getflname(isle))
            flag = isle.flag
            if flag is None:
                print('')
                continue
            delim = ''
            for rel in flag.relations:
                if rel.this_declaring >= 0: continue
                wr(delim)
                wr(rel.other_flagname)
                delim = '; '
            print('')
def war_flag(fi):
    # Print the flags against which `fi' has this_declaring < 0, in two
    # groups: first those whose other_declaring_max is negative
    # ('SINKING PvP'), then the rest ('RISK OF SINKING PvP').  Each group
    # header carries the yoweb heading of its first entry.
    #
    # fi  an object with a `relations' list, or None when the pirate is
    #     not in a flag (prep_flag_of can return None).
    if fi is None:
        print('Pirate is not in a flag.')
        return

    found = False
    for (certain, title) in ((True, 'SINKING PvP'),
                             (False, 'RISK OF SINKING PvP')):
        header_done = False
        for rel in fi.relations:
            if rel.this_declaring >= 0: continue
            if (rel.other_declaring_max < 0) != certain: continue
            if not header_done:
                print('%s (%s):' % (title, rel.yoweb_heading))
                header_done = True
            found = True
            print('%s %s' % (" ", rel.other_flagname))
    if not found:
        print('No sinking PvP.')
track-chat-log CHAT-LOG + yoweb-scrape [--ocean OCEAN ...] ocean|embargoes + yoweb-scrape [--ocean OCEAN ...] war-flag-of|embargoes-flag-of PIRATE yoweb-scrape [options] ship-aid CHAT-LOG (must be .../PIRATE_OCEAN_chat-log*) display modes (for --display) apply to ship-aid: @@ -1398,6 +1876,9 @@ display modes (for --display) apply to ship-aid: ao('--display', action='store', dest='display', type='choice', choices=['dumb','overwrite'], help='how to display ship aid') + ao('--local-ypp-dir', action='store', dest='localhtml', + help='get yppedia pages from local directory LOCALHTML'+ + ' instead of via HTTP') ao_jt = lambda wh, t: ao( '--timeout-sa-'+wh, action='store', dest='timeout_'+wh, @@ -1448,7 +1929,13 @@ display modes (for --display) apply to ship-aid: else: opts.display = 'overwrite' - fetcher = Fetcher(opts.ocean, opts.cache_dir) + fetcher = Yoweb(opts.ocean, opts.cache_dir) + yppedia = Yppedia(opts.cache_dir) + + if opts.debug or not os.isatty(0): + progressreporter = NullProgressReporter() + else: + progressreporter = TypewriterProgressReporter() mode_fn(args[1:], pa.error)