X-Git-Url: http://www.chiark.greenend.org.uk/ucgi/~yarrgweb/git?p=ypp-sc-tools.main.git;a=blobdiff_plain;f=yoweb-scrape;h=462bd01c0366a42e877d8b47b8c4dade0a9c34ec;hp=844cd46139095cfcfde1e6095fc3c44ac531c1f5;hb=2b1646498eea5609775d17e121937feea4aa1196;hpb=d599b52a3b142bd27995ab65e67246343790a727 diff --git a/yoweb-scrape b/yoweb-scrape index 844cd46..462bd01 100755 --- a/yoweb-scrape +++ b/yoweb-scrape @@ -1,5 +1,8 @@ #!/usr/bin/python +import signal +signal.signal(signal.SIGINT, signal.SIG_DFL) + import os import time import urllib @@ -7,156 +10,568 @@ import urllib2 import errno import sys import re as regexp -import optparse +import random +from optparse import OptionParser from BeautifulSoup import BeautifulSoup -max_age = 120 -ocean = 'ice' +opts = None -now = time.time() -duties = ('Swordfighting/Bilging/Sailing/Rigging/Navigating'+ +puzzles = ('Swordfighting/Bilging/Sailing/Rigging/Navigating'+ '/Battle Navigation/Gunning/Carpentry/Rumble/Treasure Haul'+ '/Drinking/Spades/Hearts/Treasure Drop/Poker/Distilling'+ '/Alchemistry/Shipwrightery/Blacksmithing/Foraging').split('/') -standings = ('Able/Distinguished/Respected/Master/Renowned'+ - '/Grand-Master/Legendary/Ultimate').split('/') - -def fetch(url): - cache_corename = urllib.quote_plus(url) - cache_basename = "#%s#" % cache_corename - try: f = file(cache_basename, 'r') - except (OSError,IOError), oe: - if oe.errno != errno.ENOENT: raise - f = None - if f is not None: - s = os.fstat(f.fileno()) - if now > s.st_mtime + max_age: +standingvals = ('Able/Distinguished/Respected/Master'+ + '/Renowned/Grand-Master/Legendary/Ultimate').split('/') + +pirate_ref_re = regexp.compile('^/yoweb/pirate\\.wm') + +max_pirate_namelen = 12 + + +def debug(m): + if opts.debug: + print >>sys.stderr, m + +class Fetcher: + def __init__(self, ocean, cachedir): + debug('Fetcher init %s' % cachedir) + self.ocean = ocean + self.cachedir = cachedir + try: os.mkdir(cachedir) + except (OSError,IOError), oe: + if oe.errno != errno.EEXIST: raise + self._cache_scan(time.time()) + + def _default_ocean(self): + if self.ocean is None: + self.ocean = 'ice' + + def _cache_scan(self, now): + # returns list of ages, unsorted + ages = [] + debug('Fetcher scan_cache') + for leaf in os.listdir(self.cachedir): + if not leaf.startswith('#'): continue + path = self.cachedir + '/' + leaf + try: s = os.stat(path) + except (OSError,IOError), oe: + if oe.errno != errno.ENOENT: raise + continue + age = now - s.st_mtime + if age > opts.expire_age: + debug('Fetcher expire %d %s' % (age, path)) + try: os.remove(path) + except (OSError,IOError), oe: + if oe.errno != errno.ENOENT: raise + continue + ages.append(age) + return ages + + def _rate_limit_cache_clean(self, now): + ages = self._cache_scan(now) + ages.sort() + debug('Fetcher ages ' + `ages`) + min_age = 1 + need_wait = 0 + for age in ages: + if age < min_age and age < 300: + debug('Fetcher morewait min=%d age=%d' % + (min_age, age)) + need_wait = max(need_wait, min_age - age) + min_age += 3 + min_age *= 1.25 + if need_wait > 0: + debug('Fetcher wait %d' % need_wait) + time.sleep(need_wait) + + def fetch(self, url, max_age): + debug('Fetcher fetch %s' % url) + cache_corename = urllib.quote_plus(url) + cache_item = "%s/#%s#" % (self.cachedir, cache_corename) + try: f = file(cache_item, 'r') + except (OSError,IOError), oe: + if oe.errno != errno.ENOENT: raise f = None - if f is not None: - data = f.read() - f.close() - else: - os.sleep(1) + now = time.time() + max_age = max(opts.min_max_age, min(max_age, opts.expire_age)) + if f is not None: + s = os.fstat(f.fileno()) + age = now - s.st_mtime + if age > max_age: + debug('Fetcher stale %d < %d'% (max_age, age)) + f = None + if f is not None: + data = f.read() + f.close() + debug('Fetcher cached %d > %d' % (max_age, age)) + return data + + debug('Fetcher fetch') + self._rate_limit_cache_clean(now) + stream = urllib2.urlopen(url) data = stream.read() - cache_ourname = "#%s~%d#" % (cache_corename, os.getpid()) - f = file(cache_ourname, 'w') + cache_tmp = "%s/#%s~%d#" % ( + self.cachedir, cache_corename, os.getpid()) + f = file(cache_tmp, 'w') f.write(data) f.close() - os.rename(cache_ourname, cache_basename) - return data + os.rename(cache_tmp, cache_item) + debug('Fetcher stored') + return data -def yoweb_fetch(kind, tail): - url = 'http://%s.puzzlepirates.com/yoweb/%s%s' % (ocean, kind, tail) - return fetch(url) + def yoweb(self, kind, tail, max_age): + self._default_ocean() + url = 'http://%s.puzzlepirates.com/yoweb/%s%s' % ( + self.ocean, kind, tail) + return self.fetch(url, max_age) -class PirateInfo: - # Public data members: - # pi.skills = { 'Treasure Haul': 'Able' ... } - # pi.msgs = [ 'message describing problem with scrape' ] - def _log(self, m): +class SoupLog: + def __init__(self): + self.msgs = [ ] + def msg(self, m): self.msgs.append(m) + def soupm(self, obj, m): + self.msg(m + '; in ' + `obj`) + def needs_msgs(self, child_souplog): + self.msgs += child_souplog.msgs + child_souplog.msgs = [ ] - def _logsoup(self, soup, m): - self._log(m + '; in ' + `soup`) +def soup_text(obj): + str = ''.join(obj.findAll(text=True)) + return str.strip() - def __init__(self, pirate): - html = yoweb_fetch('pirate.wm?target=', pirate) - soup = BeautifulSoup(html, -# convertEntities=BeautifulSoup.HTML_ENTITIES +class SomethingSoupInfo(SoupLog): + def __init__(self, kind, tail, max_age): + SoupLog.__init__(self) + html = fetcher.yoweb(kind, tail, max_age) + self._soup = BeautifulSoup(html, + convertEntities=BeautifulSoup.HTML_ENTITIES ) - imgs = soup.findAll('img', + +class PirateInfo(SomethingSoupInfo): + # Public data members: + # pi.standings = { 'Treasure Haul': 'Able' ... } + # pi.name = name + # pi.crew = (id, name) + # pi.flag = (id, name) + # pi.msgs = [ 'message describing problem with scrape' ] + + def __init__(self, pirate, max_age=300): + SomethingSoupInfo.__init__(self, + 'pirate.wm?target=', pirate, max_age) + self.name = pirate + self._find_standings() + self.crew = self._find_crewflag('crew', + '^/yoweb/crew/info\\.wm') + self.flag = self._find_crewflag('flag', + '^/yoweb/flag/info\\.wm') + + def _find_standings(self): + imgs = self._soup.findAll('img', src=regexp.compile('/yoweb/images/stat.*')) re = regexp.compile( -u'\s*\S*/([-A-Za-z]+)\s*$|\s*\S*/\S*\s*\(ocean\-wide\ \;([-A-Za-z]+)\)\s*$' +u'\\s*\\S*/([-A-Za-z]+)\\s*$|\\s*\\S*/\\S*\\s*\\(ocean\\-wide(?:\\s|\\xa0)+([-A-Za-z]+)\\)\\s*$' ) - skills = { } - self.msgs = [ ] + standings = { } + + for skill in puzzles: + standings[skill] = [ ] - for skill in duties: - skills[skill] = [ ] + skl = SoupLog() for img in imgs: - try: duty = img['alt'] + try: puzzle = img['alt'] except KeyError: continue - if not duty in duties: - self._logsoup(img, 'unknown duty: "%s"' % duty) + if not puzzle in puzzles: + skl.soupm(img, 'unknown puzzle: "%s"' % puzzle) continue key = img.findParent('td') if key is None: - self._logsoup(img, 'duty at root! "%s"' % duty) + skl.soupm(img, 'puzzle at root! "%s"' % puzzle) continue valelem = key.findNextSibling('td') if valelem is None: - self._logsoup(key, 'duty missing sibling "%s"' - % duty) + skl.soupm(key, 'puzzle missing sibling "%s"' + % puzzle) continue - valstr = ''.join(valelem.findAll(text=True)) + valstr = soup_text(valelem) match = re.match(valstr) if match is None: - self._logsoup(key, 'duty "%s" unparseable'+ - ' standing "%s"' % (duty, valstr)) + skl.soupm(key, ('puzzle "%s" unparseable'+ + ' standing "%s"') % (puzzle, valstr)) continue standing = match.group(match.lastindex) - skills[duty].append(standing) + standings[puzzle].append(standing) - self.skills = { } + self.standings = { } - for duty in duties: - sl = skills[duty] + for puzzle in puzzles: + sl = standings[puzzle] if len(sl) > 1: - self.log('duty "%s" multiple standings %s' % - (duty, `sl`)) + skl.msg('puzzle "%s" multiple standings %s' % + (puzzle, `sl`)) continue - if not len(sl): - self.log('duty "%s" no standing found' % duty) + if not sl: + skl.msg('puzzle "%s" no standing found' % puzzle) continue standing = sl[0] - for i in range(0, len(standings)): - if standing == standings[i]: - self.skills[duty] = i - if not duty in self.skills: - self.log('duty "%s" unknown standing "%s"' % - (duty, standing)) - all_skills_ok = True - for duty in duties: - if not duty in self.skills: - all_skills_ok = False - if all_skills_ok: - self.msgs = [ ] + for i in range(0, len(standingvals)-1): + if standing == standingvals[i]: + self.standings[puzzle] = i + if not puzzle in self.standings: + skl.msg('puzzle "%s" unknown standing "%s"' % + (puzzle, standing)) + + all_standings_ok = True + for puzzle in puzzles: + if not puzzle in self.standings: + self.needs_msgs(skl) + + def _find_crewflag(self, cf, yoweb_re): + things = self._soup.findAll('a', href=regexp.compile(yoweb_re)) + if len(things) != 1: + self.msg('zero or several %s id references found' % cf) + return None + thing = things[0] + id_re = '\\b%sid\\=(\\w+)$' % cf + id_haystack = thing['href'] + match = regexp.compile(id_re).search(id_haystack) + if match is None: + self.soupm(thing, ('incomprehensible %s id ref'+ + ' (%s in %s)') % (cf, id_re, id_haystack)) + return None + name = soup_text(thing) + return (match.group(1), name) + + def __str__(self): + return `(self.crew, self.flag, self.standings, self.msgs)` + +class CrewInfo(SomethingSoupInfo): + # Public data members: + # ci.crew = [ ('Captain', ['Pirate', ...]), + # ('Senior Officer', [...]), + # ... ] + # pi.msgs = [ 'message describing problem with scrape' ] + + def __init__(self, crewid, max_age=300): + SomethingSoupInfo.__init__(self, + 'crew/info.wm?crewid=', crewid, max_age) + self._find_crew() + + def _find_crew(self): + self.crew = [] + capts = self._soup.findAll('img', + src='/yoweb/images/crew-captain.png') + if len(capts) != 1: + self.msg('crew members: no. of captain images != 1') + return + tbl = capts[0] + while not tbl.find('a', href=pirate_ref_re): + tbl = tbl.findParent('table') + if not tbl: + self.msg('crew members: cannot find table') + return + current_rank_crew = None + crew_rank_re = regexp.compile('/yoweb/images/crew') + for row in tbl.contents: + # findAll(recurse=False) + if isinstance(row, unicode): + continue + + is_rank = row.find('img', attrs={'src': crew_rank_re}) + if is_rank: + rank = soup_text(row) + current_rank_crew = [] + self.crew.append((rank, current_rank_crew)) + continue + for cell in row.findAll('a', href=pirate_ref_re): + if current_rank_crew is None: + self.soupm(cell, 'crew members: crew' + ' before rank') + continue + current_rank_crew.append(soup_text(cell)) def __str__(self): - return `self.skills` + return `(self.crew, self.msgs)` + +class StandingsTable: + def __init__(self, use_puzzles=None, col_width=6): + if use_puzzles is None: + if opts.ship_duty: + use_puzzles=[ + 'Navigating','Battle Navigation', + 'Gunning', + ['Sailing','Rigging'], + 'Bilging', + 'Carpentry', + 'Treasure Haul' + ] + else: + use_puzzles=puzzles + self._puzzles = use_puzzles + self.s = '' + self._cw = col_width-1 + + def _pline(self, pirate, puzstrs): + self.s += ' %-*s' % (max(max_pirate_namelen, 14), pirate) + for v in puzstrs: + self.s += ' %-*.*s' % (self._cw,self._cw, v) + self.s += '\n' + + def _puzstr(self, pi, puzzle): + if not isinstance(puzzle,list): puzzle = [puzzle] + try: standing = max([pi.standings[p] for p in puzzle]) + except KeyError: return '?' + if not standing: return '' + s = '' + if self._cw > 4: + c1 = standingvals[standing][0] + if standing < 3: c1 = c1.lower() # 3 = Master + s += `standing` + if self._cw > 5: + s += ' ' + s += '*' * (standing / 2) + s += '+' * (standing % 2) + return s + + def headings(self): + def puzn_redact(name): + if isinstance(name,list): + return '/'.join( + ["%.*s" % (self._cw/2, puzn_redact(n)) + for n in name]) + spc = name.find(' ') + if spc < 0: return name + return name[0:min(4,spc)] + name[spc+1:] + self._pline('', map(puzn_redact, self._puzzles)) + def literalline(self, line): + self.s += line + '\n' + def pirate(self, pi): + puzstrs = [self._puzstr(pi,puz) for puz in self._puzzles] + self._pline(pi.name, puzstrs) + + def results(self): + return self.s + +def do_pirate(pirates, bu): + print '{' + for pirate in pirates: + info = PirateInfo(pirate) + print '%s: %s,' % (`pirate`, info) + print '}' + +def prep_crew_of(args, bu, max_age=300): + if len(args) != 1: bu('crew-of takes one pirate name') + pi = PirateInfo(args[0], max_age) + return CrewInfo(pi.crew[0], max_age) + +def do_crew_of(args, bu): + ci = prep_crew_of(args, bu) + print ci + +def do_standings_crew_of(args, bu): + ci = prep_crew_of(args, bu, 60) + tab = StandingsTable() + tab.headings() + for (rank, members) in ci.crew: + if not members: continue + tab.literalline('%s:' % rank) + for p in members: + pi = PirateInfo(p, random.randint(900,1800)) + tab.pirate(pi) + print tab.results() + +class PirateAboard: + # pa.v + # pa.last_time + # pa.last_event + # pa.gunner + def __init__(pa, v, time, event): + pa.v = v + pa.last_time = time + pa.last_event = event + pa.gunner = False + +class ShipCrewTracker: + def __init__(self, myself_pi): + self._pl = {} # self._pl['Pirate'] = + self._vl = {} # self._vl['Vessel']['Pirate'] = PirateAboard + # self._vl['Vessel']['#lastaboard'] + self._v = None # self._v = + self._vessel = None # self._vl[self._vessel] + self._date = None + self._myself = myself_pi + self._need_redisplay = False + + def _refresh(self): + self._need_redisplay = True + + def _onboard_event(self,timestamp,pirate,event): + try: pa = self._pl[pirate] + except KeyError: pa = None + if pa is not None and pa.v is self._v: + pa.last_time = timestamp + pa.last_event = event + else: + if pa is not None: del pa.v[pirate] + pa = PirateAboard(self._v, timestamp, event) + self._pl[pirate] = pa + self._v[pirate] = pa + self._v['#lastaboard'] = timestamp + self._refresh() + return pa + + def clear_vessel(self, timestamp): + if self._v is not None: + for p in self._v: + if p.startswith('#'): continue + del self._pl[p] + self._v = {'#lastaboard': timestamp} + + def _debug_line_disposition(self,timestamp,l,m): + debug('SCT %-13s %-30s %s' % (timestamp,m,l)) + + def chatline(self,l): + rm = lambda re: regexp.match(re,l) + d = lambda m: self._debug_line_disposition(timestamp,l,m) + timestamp = None + + m = rm('=+ (\\d+)/(\\d+)/(\\d+) =+$') + if m: + self._date = m.groups() + return d('date '+`self._date`) + + if self._date is None: + return d('date unset') + + m = rm('\\[(\d\d):(\d\d):(\d\d)\\] ') + if not m: + return d('no timestamp') + + time_tuple = [int(x) for x in self._date + m.groups()] + time_tuple += (-1,-1,-1) + timestamp = time.mktime(time_tuple) + l = l[l.find(' ')+1:] + + ob = lambda who, event: self._onboard_event( + timestamp, who, event) + oba = lambda m, did: ob( + m.group(1), '%s %s' % (did, m.group(2))) + + m = rm('Going aboard the (\\S.*\\S)\\.\\.\\.$') + if m: + self._vessel = m.group(1) + dm = 'boarding' + try: self._v = self._vl[self._vessel] + except KeyError: self._v = None; dm += ' new' + if self._v is not None: la = self._v['#lastaboard'] + else: la = 0; dm += ' ?la' + if timestamp - la > 3600: + self.clear_vessel(timestamp) + dm += ' stale' + self._vl[self._vessel] = self._v + ob(self._myself.name, 'we boarded') + return d(dm) + + if self._v is None: + return d('no vessel') + + m = rm('You have ordered (\\w+) to do some (\\S.*\\S)\\.$') + if m: + pa = oba(m, 'ordered') + if m.group(2) == 'Gunning': + pa.gunner = True + return d('duty order') + + m = rm('(\\w+) abandoned a (\\S.*\\S) station\\.$') + if m: oba(m,'abandoned'); return d('abandoned') + + m = rm('(\\w+) says, "') + if m: ob(m.group(1), 'talked'); return d('talked') + + m = rm('(\\w+) has left the vessel\.') + if m: + who = m.group(1) + ob(who, 'disembarked') + del self._v[who] + del self._pl[who] + return d('disembarked') + + return d('not matched') + +def do_ship_aid(args, bu): + if len(args) != 1: bu('ship-aid takes only chat log filename') + logfn = args[0] + logfn_re = '(?:.*/)?([A-Z][a-z]+)_([a-z]+)_chat-log-\\w+$' + match = regexp.match(logfn_re, logfn) + if not match: bu('ship-aid chat log filename is not in default format') + (pirate, fetcher.ocean) = match.groups() + myself_pi = PirateInfo(pirate,3600) + track = ShipCrewTracker(myself_pi) + f = file(logfn) + l = '' + while True: + l += f.readline() + if l.endswith('\n'): + track.chatline(l.rstrip()) +# print `track.__dict__` + l = '' + continue + if l: + continue + print `track.__dict__` + os.sleep(1) def main(): - os.chdir(os.getenv('HOME')) - cache_dir = '.yoweb-scrape-cache' - try: - os.chdir(cache_dir) - except (OSError,IOError), oe: - if oe.errno != errno.ENOENT: raise - os.mkdir(cache_dir) - os.chdir(cache_dir) - - for path in os.listdir('.'): - if not path.startswith('#'): continue - max_time = max_age - if '~' in path: max_time = 10 - try: - s = os.stat(path) - if now > s.st_mtime + max_time: - os.remove(path) - except (OSError,IOError), oe: - if oe.errno != errno.ENOENT: raise + global opts, fetcher + + pa = OptionParser( +'''usage: .../yoweb-scrape [OPTION...] ACTION [ARGS...] +actions: + yoweb-scrape [--ocean OCEAN ...] pirate PIRATE + yoweb-scrape [--ocean OCEAN ...] crew-of PIRATE + yoweb-scrape [--ocean OCEAN ...] standings-crew-of PIRATE + yoweb-scrape [--ocean OCEAN ...] ship-aid CHAT-LOG +''') + ao = pa.add_option + ao('-O','--ocean',dest='ocean', metavar='OCEAN', default=None, + help='select ocean OCEAN') + ao('--cache-dir', dest='cache_dir', metavar='DIR', + default='~/.yoweb-scrape-cache', + help='cache yoweb pages in DIR') + ao('-D','--debug', action='store_true', dest='debug', default=False, + help='enable debugging output') + ao('-q','--quiet', action='store_true', dest='quiet', + help='suppress warning output') + + ao('--ship-duty', action='store_true', dest='ship_duty', + help='show ship duty station puzzles') + + (opts,args) = pa.parse_args() + random.seed() + + if len(args) < 1: + pa.error('need a mode argument') + + mode = args[0] + mode_fn_name = 'do_' + mode.replace('_','#').replace('-','_') + try: mode_fn = globals()[mode_fn_name] + except KeyError: pa.error('unknown mode "%s"' % mode) + + # fixed parameters + opts.min_max_age = 60 + opts.expire_age = 3600 + + if opts.cache_dir.startswith('~/'): + opts.cache_dir = os.getenv('HOME') + opts.cache_dir[1:] + + fetcher = Fetcher(opts.ocean, opts.cache_dir) - # test program: - global ocean - ocean = 'midnight' - test = PirateInfo('Anaplian') - print test + mode_fn(args[1:], pa.error) main()