X-Git-Url: http://www.chiark.greenend.org.uk/ucgi/~yarrgweb/git?p=ypp-sc-tools.main.git;a=blobdiff_plain;f=yoweb-scrape;h=2a3559009ef23e941e17bed9fc35298634251c4d;hp=54ef0fe14bd62ade32cf38a1b9c2b9c78f88ffeb;hb=6024b8b7e54143104a1f29d3976c5194c889f5ba;hpb=46a1fd2eb3beef6334ea615c1dc97596ea08c35c diff --git a/yoweb-scrape b/yoweb-scrape index 54ef0fe..2a35590 100755 --- a/yoweb-scrape +++ b/yoweb-scrape @@ -1,5 +1,7 @@ #!/usr/bin/python +#---------- setup ---------- + import signal signal.signal(signal.SIGINT, signal.SIG_DFL) @@ -10,25 +12,41 @@ import urllib2 import errno import sys import re as regexp +import random from optparse import OptionParser from BeautifulSoup import BeautifulSoup opts = None -duties = ('Swordfighting/Bilging/Sailing/Rigging/Navigating'+ +#---------- YPP parameters and arrays ---------- + +puzzles = ('Swordfighting/Bilging/Sailing/Rigging/Navigating'+ '/Battle Navigation/Gunning/Carpentry/Rumble/Treasure Haul'+ '/Drinking/Spades/Hearts/Treasure Drop/Poker/Distilling'+ '/Alchemistry/Shipwrightery/Blacksmithing/Foraging').split('/') -standingvals = ('Able/Distinguished/Respected/Master/Renowned'+ - '/Grand-Master/Legendary/Ultimate').split('/') +standingvals = ('Able/Distinguished/Respected/Master'+ + '/Renowned/Grand-Master/Legendary/Ultimate').split('/') pirate_ref_re = regexp.compile('^/yoweb/pirate\\.wm') +max_pirate_namelen = 12 + + +#---------- general utilities ---------- + def debug(m): - if opts.debug: - print >>sys.stderr, m + if opts.debug > 0: + print m + +def format_time_interval(ti): + if ti < 120: return '%d:%02d' % (ti / 60, ti % 60) + if ti < 7200: return '%2dm' % (ti / 60) + if ti < 86400: return '%dh' % (ti / 3600) + return '%dd' % (ti / 86400) + +#---------- caching and rate-limiting data fetcher ---------- class Fetcher: def __init__(self, ocean, cachedir): @@ -40,6 +58,10 @@ class Fetcher: if oe.errno != errno.EEXIST: raise self._cache_scan(time.time()) + def _default_ocean(self): + if self.ocean is None: + self.ocean = 'ice' + def _cache_scan(self, now): # returns list of ages, unsorted ages = [] @@ -52,7 +74,7 @@ class Fetcher: if oe.errno != errno.ENOENT: raise continue age = now - s.st_mtime - if age > opts.max_age: + if age > opts.expire_age: debug('Fetcher expire %d %s' % (age, path)) try: os.remove(path) except (OSError,IOError), oe: @@ -61,24 +83,28 @@ class Fetcher: ages.append(age) return ages - def _rate_limit_cache_clean(self, now): + def need_wait(self, now): ages = self._cache_scan(now) ages.sort() debug('Fetcher ages ' + `ages`) min_age = 1 need_wait = 0 for age in ages: - if age < min_age: + if age < min_age and age < 300: debug('Fetcher morewait min=%d age=%d' % (min_age, age)) need_wait = max(need_wait, min_age - age) - min_age *= 2 - min_age += 1 + min_age += 3 + min_age *= 1.25 + return need_wait + + def _rate_limit_cache_clean(self, now): + need_wait = self.need_wait(now) if need_wait > 0: debug('Fetcher wait %d' % need_wait) time.sleep(need_wait) - def fetch(self, url): + def fetch(self, url, max_age): debug('Fetcher fetch %s' % url) cache_corename = urllib.quote_plus(url) cache_item = "%s/#%s#" % (self.cachedir, cache_corename) @@ -87,15 +113,17 @@ class Fetcher: if oe.errno != errno.ENOENT: raise f = None now = time.time() + max_age = max(opts.min_max_age, min(max_age, opts.expire_age)) if f is not None: s = os.fstat(f.fileno()) - if now > s.st_mtime + opts.max_age: - debug('Fetcher stale') + age = now - s.st_mtime + if age > max_age: + debug('Fetcher stale %d < %d'% (max_age, age)) f = None if f is not None: data = f.read() f.close() - debug('Fetcher cached') + debug('Fetcher cached %d > %d' % (max_age, age)) return data debug('Fetcher fetch') @@ -112,10 +140,13 @@ class Fetcher: debug('Fetcher stored') return data - def yoweb(self, kind, tail): + def yoweb(self, kind, tail, max_age): + self._default_ocean() url = 'http://%s.puzzlepirates.com/yoweb/%s%s' % ( self.ocean, kind, tail) - return self.fetch(url) + return self.fetch(url, max_age) + +#---------- logging assistance for troubled screenscrapers ---------- class SoupLog: def __init__(self): @@ -133,23 +164,27 @@ def soup_text(obj): return str.strip() class SomethingSoupInfo(SoupLog): - def __init__(self, kind, tail): + def __init__(self, kind, tail, max_age): SoupLog.__init__(self) - html = fetcher.yoweb(kind, tail) - self.soup = BeautifulSoup(html, + html = fetcher.yoweb(kind, tail, max_age) + self._soup = BeautifulSoup(html, convertEntities=BeautifulSoup.HTML_ENTITIES ) +#---------- scraper for pirate pages ---------- + class PirateInfo(SomethingSoupInfo): # Public data members: # pi.standings = { 'Treasure Haul': 'Able' ... } + # pi.name = name # pi.crew = (id, name) # pi.flag = (id, name) # pi.msgs = [ 'message describing problem with scrape' ] - def __init__(self, pirate): + def __init__(self, pirate, max_age=300): SomethingSoupInfo.__init__(self, - 'pirate.wm?target=', pirate) + 'pirate.wm?target=', pirate, max_age) + self.name = pirate self._find_standings() self.crew = self._find_crewflag('crew', '^/yoweb/crew/info\\.wm') @@ -157,69 +192,69 @@ class PirateInfo(SomethingSoupInfo): '^/yoweb/flag/info\\.wm') def _find_standings(self): - imgs = self.soup.findAll('img', + imgs = self._soup.findAll('img', src=regexp.compile('/yoweb/images/stat.*')) re = regexp.compile( u'\\s*\\S*/([-A-Za-z]+)\\s*$|\\s*\\S*/\\S*\\s*\\(ocean\\-wide(?:\\s|\\xa0)+([-A-Za-z]+)\\)\\s*$' ) standings = { } - for skill in duties: + for skill in puzzles: standings[skill] = [ ] skl = SoupLog() for img in imgs: - try: duty = img['alt'] + try: puzzle = img['alt'] except KeyError: continue - if not duty in duties: - skl.soupm(img, 'unknown duty: "%s"' % duty) + if not puzzle in puzzles: + skl.soupm(img, 'unknown puzzle: "%s"' % puzzle) continue key = img.findParent('td') if key is None: - skl.soupm(img, 'duty at root! "%s"' % duty) + skl.soupm(img, 'puzzle at root! "%s"' % puzzle) continue valelem = key.findNextSibling('td') if valelem is None: - skl.soupm(key, 'duty missing sibling "%s"' - % duty) + skl.soupm(key, 'puzzle missing sibling "%s"' + % puzzle) continue valstr = soup_text(valelem) match = re.match(valstr) if match is None: - skl.soupm(key, ('duty "%s" unparseable'+ - ' standing "%s"') % (duty, valstr)) + skl.soupm(key, ('puzzle "%s" unparseable'+ + ' standing "%s"') % (puzzle, valstr)) continue standing = match.group(match.lastindex) - standings[duty].append(standing) + standings[puzzle].append(standing) self.standings = { } - for duty in duties: - sl = standings[duty] + for puzzle in puzzles: + sl = standings[puzzle] if len(sl) > 1: - skl.msg('duty "%s" multiple standings %s' % - (duty, `sl`)) + skl.msg('puzzle "%s" multiple standings %s' % + (puzzle, `sl`)) continue - if not len(sl): - skl.msg('duty "%s" no standing found' % duty) + if not sl: + skl.msg('puzzle "%s" no standing found' % puzzle) continue standing = sl[0] for i in range(0, len(standingvals)-1): if standing == standingvals[i]: - self.standings[duty] = i - if not duty in self.standings: - skl.msg('duty "%s" unknown standing "%s"' % - (duty, standing)) + self.standings[puzzle] = i + if not puzzle in self.standings: + skl.msg('puzzle "%s" unknown standing "%s"' % + (puzzle, standing)) all_standings_ok = True - for duty in duties: - if not duty in self.standings: + for puzzle in puzzles: + if not puzzle in self.standings: self.needs_msgs(skl) def _find_crewflag(self, cf, yoweb_re): - things = self.soup.findAll('a', href=regexp.compile(yoweb_re)) + things = self._soup.findAll('a', href=regexp.compile(yoweb_re)) if len(things) != 1: self.msg('zero or several %s id references found' % cf) return None @@ -237,6 +272,8 @@ u'\\s*\\S*/([-A-Za-z]+)\\s*$|\\s*\\S*/\\S*\\s*\\(ocean\\-wide(?:\\s|\\xa0)+([-A- def __str__(self): return `(self.crew, self.flag, self.standings, self.msgs)` +#---------- scraper for crew pages ---------- + class CrewInfo(SomethingSoupInfo): # Public data members: # ci.crew = [ ('Captain', ['Pirate', ...]), @@ -244,14 +281,14 @@ class CrewInfo(SomethingSoupInfo): # ... ] # pi.msgs = [ 'message describing problem with scrape' ] - def __init__(self, crewid): + def __init__(self, crewid, max_age=300): SomethingSoupInfo.__init__(self, - 'crew/info.wm?crewid=', crewid) + 'crew/info.wm?crewid=', crewid, max_age) self._find_crew() def _find_crew(self): self.crew = [] - capts = self.soup.findAll('img', + capts = self._soup.findAll('img', src='/yoweb/images/crew-captain.png') if len(capts) != 1: self.msg('crew members: no. of captain images != 1') @@ -266,7 +303,7 @@ class CrewInfo(SomethingSoupInfo): crew_rank_re = regexp.compile('/yoweb/images/crew') for row in tbl.contents: # findAll(recurse=False) - if isinstance(row, unicode): + if isinstance(row,basestring): continue is_rank = row.find('img', attrs={'src': crew_rank_re}) @@ -285,6 +322,338 @@ class CrewInfo(SomethingSoupInfo): def __str__(self): return `(self.crew, self.msgs)` +#---------- pretty-printer for tables of pirate puzzle standings ---------- + +class StandingsTable: + def __init__(self, use_puzzles=None, col_width=6): + if use_puzzles is None: + if opts.ship_duty: + use_puzzles=[ + 'Navigating','Battle Navigation', + 'Gunning', + ['Sailing','Rigging'], + 'Bilging', + 'Carpentry', + 'Treasure Haul' + ] + else: + use_puzzles=puzzles + self._puzzles = use_puzzles + self.s = '' + self._cw = col_width-1 + + def _pline(self, pirate, puzstrs, extra): + self.s += ' %-*s' % (max(max_pirate_namelen, 14), pirate) + for v in puzstrs: + self.s += ' %-*.*s' % (self._cw,self._cw, v) + if extra: + self.s += ' ' + extra + self.s += '\n' + + def _puzstr(self, pi, puzzle): + if not isinstance(puzzle,list): puzzle = [puzzle] + try: standing = max([pi.standings[p] for p in puzzle]) + except KeyError: return '?' + if not standing: return '' + s = '' + if self._cw > 4: + c1 = standingvals[standing][0] + if standing < 3: c1 = c1.lower() # 3 = Master + s += `standing` + if self._cw > 5: + s += ' ' + s += '*' * (standing / 2) + s += '+' * (standing % 2) + return s + + def headings(self): + def puzn_redact(name): + if isinstance(name,list): + return '/'.join( + ["%.*s" % (self._cw/2, puzn_redact(n)) + for n in name]) + spc = name.find(' ') + if spc < 0: return name + return name[0:min(4,spc)] + name[spc+1:] + self._pline('', map(puzn_redact, self._puzzles), None) + def literalline(self, line): + self.s += line + '\n' + def pirate_dummy(self, name, standingstring, extra=None): + self._pline(name, standingstring * len(self._puzzles), extra) + def pirate(self, pi, extra=None): + puzstrs = [self._puzstr(pi,puz) for puz in self._puzzles] + self._pline(pi.name, puzstrs, extra) + + def results(self): + return self.s + +#---------- chat log parser ---------- + +class PirateAboard: + # This is essentially a transparent, dumb, data class. + # pa.v + # pa.name + # pa.last_time + # pa.last_event + # pa.gunner + # pa.last_chat_time + # pa.last_chat_chan + # pa.pi + + def __init__(pa, pn, v, time, event): + pa.name = pn + pa.v = v + pa.last_time = time + pa.last_event = event + pa.last_chat_time = None + pa.last_chat_chan = None + pa.gunner = False + pa.pi = None + + def pirate_info(pa): + if not pa.pi and not fetcher.need_wait(time.time()): + pa.pi = PirateInfo(pa.name, 3600) + return pa.pi + +class ChatLogTracker: + # This is quite complex so we make it opaque. Use the + # official invokers, accessors etc. + + def __init__(self, myself_pi, logfn): + self._pl = {} # self._pl['Pirate'] = + self._vl = {} # self._vl['Vessel']['Pirate'] = PirateAboard + # self._vl['Vessel']['#lastaboard'] + self._v = None # self._v = + self._vessel = None # self._vl[self._vessel] + self._date = None + self._myself = myself_pi + self._need_redisplay = False + self._f = file(logfn) + self._lbuf = '' + self._progress = [0, os.fstat(self._f.fileno()).st_size] + + def _refresh(self): + self._need_redisplay = True + + def _onboard_event(self,timestamp,pirate,event): + try: pa = self._pl[pirate] + except KeyError: pa = None + if pa is not None and pa.v is self._v: + pa.last_time = timestamp + pa.last_event = event + else: + if pa is not None: del pa.v[pirate] + pa = PirateAboard(pirate, self._v, timestamp, event) + self._pl[pirate] = pa + self._v[pirate] = pa + self._v['#lastaboard'] = timestamp + self._refresh() + return pa + + def _trash_vessel(self, v): + for pn in v: + if pn.startswith('#'): continue + del self._pl[pn] + self._refresh() + + def expire_garbage(self, timestamp): + for (vn,v) in list(self._vl.iteritems()): + la = v['#lastaboard'] + if timestamp - la > opts.ship_reboard_clearout: + self._debug_line_disposition(timestamp,'', + 'stale reset '+vn) + self._trash_vessel(v) + del self._vl[vn] + + def clear_vessel(self, timestamp): + if self._v is not None: + self._trash_vessel(self._v) + self._v = {'#lastaboard': timestamp} + self._vl[self._vessel] = self._v + + def _debug_line_disposition(self,timestamp,l,m): + debug('CLT %13s %-30s %s' % (timestamp,m,l)) + + def chatline(self,l): + rm = lambda re: regexp.match(re,l) + d = lambda m: self._debug_line_disposition(timestamp,l,m) + timestamp = None + + m = rm('=+ (\\d+)/(\\d+)/(\\d+) =+$') + if m: + self._date = m.groups() + return d('date '+`self._date`) + + if self._date is None: + return d('date unset') + + m = rm('\\[(\d\d):(\d\d):(\d\d)\\] ') + if not m: + return d('no timestamp') + + time_tuple = [int(x) for x in self._date + m.groups()] + time_tuple += (-1,-1,-1) + timestamp = time.mktime(time_tuple) + l = l[l.find(' ')+1:] + + def ob_x(who,event): + return self._onboard_event(timestamp, who, event) + def ob1(did): ob_x(m.group(1), did); return d(did) + def oba(did): return ob1('%s %s' % (did, m.group(2))) + + m = rm('Going aboard the (\\S.*\\S)\\.\\.\\.$') + if m: + pn = self._myself.name + self._vessel = m.group(1) + dm = 'boarding' + + try: self._v = self._vl[self._vessel] + except KeyError: self._v = None; dm += ' new' + + if self._v is not None: la = self._v['#lastaboard'] + else: la = 0; dm += ' ?la' + + if timestamp - la > opts.ship_reboard_clearout: + self.clear_vessel(timestamp) + dm += ' stale' + + ob_x(pn, 'we boarded') + self.expire_garbage(timestamp) + return d(dm) + + if self._v is None: + return d('no vessel') + + m = rm('(\\w+) has come aboard\\.$') + if m: return ob1('boarded'); + + m = rm('You have ordered (\\w+) to do some (\\S.*\\S)\\.$') + if m: + (who,what) = m.groups() + pa = ob_x(who,'ord '+what) + if what == 'Gunning': + pa.gunner = True + return d('duty order') + + m = rm('(\\w+) abandoned a (\\S.*\\S) station\\.$') + if m: oba('stopped'); return d("end") + + def chat(what): + who = m.group(1) + try: pa = self._pl[who] + except KeyError: return d('chat mystery') + if pa.v is self._v: + pa.last_chat_time = timestamp + pa.last_chat_chan = what + self._refresh() + return d(what+' chat') + + m = rm('(\\w+) (?:issued an order|ordered everyone) "') + if m: return ob1('general order'); + + m = rm('(\\w+) says, "') + if m: return chat('public') + + m = rm('(\\w+) tells ye, "') + if m: return chat('private') + + m = rm('(\\w+) flag officer chats, "') + if m: return chat('flag officer') + + m = rm('(\\w+) officer chats, "') + if m: return chat('officer') + + m = rm('Game over\\. Winners: ([A-Za-z, ]+)\\.$') + if m: + pl = m.group(1).split(', ') + if not self._myself.name in pl: + return d('lost boarding battle') + for pn in pl: + if ' ' in pn: continue + ob_x(pn,'won boarding battle') + return d('won boarding battle') + + m = rm('(\\w+) is eliminated\\!') + if m: return ob1('eliminated in fray'); + + m = rm('(\\w+) has left the vessel\.') + if m: + who = m.group(1) + ob_x(who, 'disembarked') + del self._v[who] + del self._pl[who] + return d('disembarked') + + return d('not matched') + + def _str_vessel(self, vn, v): + s = ' vessel %s\n' % vn + s += ' '*20 + "%-*s %13s\n" % ( + max_pirate_namelen, '#lastaboard', + v['#lastaboard']) + for pn in sorted(v.keys()): + if pn.startswith('#'): continue + pa = v[pn] + assert pa.v == v + assert self._pl[pn] == pa + s += ' '*20 + "%s %-*s %13s %-30s %13s %s\n" % ( + (' ','G')[pa.gunner], + max_pirate_namelen, pn, + pa.last_time, pa.last_event, + pa.last_chat_time, pa.last_chat_chan) + return s + + def __str__(self): + s = ''' 0 and opts.debug_fd is None) + or not os.isatty(sys.stdout.fileno())): + opts.display = 'dumb' + else: + opts.display = 'overwrite' + fetcher = Fetcher(opts.ocean, opts.cache_dir) mode_fn(args[1:], pa.error)