From: Ian Jackson
Date: Fri, 15 May 2009 18:24:37 +0000 (+0100)
Subject: WIP can find crew and flag
X-Git-Tag: 1.0~75
X-Git-Url: http://www.chiark.greenend.org.uk/ucgi/~yarrgweb/git?p=ypp-sc-tools.main.git;a=commitdiff_plain;h=c1ddacde91a4fbc9dc3b6b9fc3741d3f6564ba5f

WIP can find crew and flag
---

diff --git a/yoweb-scrape b/yoweb-scrape
index 844cd46..0866667 100755
--- a/yoweb-scrape
+++ b/yoweb-scrape
@@ -1,5 +1,8 @@
 #!/usr/bin/python
+import signal
+signal.signal(signal.SIGINT, signal.SIG_DFL)
+
 import os
 import time
 import urllib
@@ -7,155 +10,248 @@ import urllib2
 import errno
 import sys
 import re as regexp
-import optparse
+from optparse import OptionParser
 
 from BeautifulSoup import BeautifulSoup
 
-max_age = 120
-ocean = 'ice'
-
-now = time.time()
+opts = None
 
 duties = ('Swordfighting/Bilging/Sailing/Rigging/Navigating'+
         '/Battle Navigation/Gunning/Carpentry/Rumble/Treasure Haul'+
         '/Drinking/Spades/Hearts/Treasure Drop/Poker/Distilling'+
         '/Alchemistry/Shipwrightery/Blacksmithing/Foraging').split('/')
 
-standings = ('Able/Distinguished/Respected/Master/Renowned'+
+standingvals = ('Able/Distinguished/Respected/Master/Renowned'+
         '/Grand-Master/Legendary/Ultimate').split('/')
 
-def fetch(url):
-       cache_corename = urllib.quote_plus(url)
-       cache_basename = "#%s#" % cache_corename
-       try: f = file(cache_basename, 'r')
-       except (OSError,IOError), oe:
-               if oe.errno != errno.ENOENT: raise
-               f = None
-       if f is not None:
-               s = os.fstat(f.fileno())
-               if now > s.st_mtime + max_age:
+def debug(m):
+       if opts.debug:
+               print >>sys.stderr, m
+
+class Fetcher:
+       def __init__(self, ocean, cachedir):
+               debug('Fetcher init %s' % cachedir)
+               self.ocean = ocean
+               self.cachedir = cachedir
+               try: os.mkdir(cachedir)
+               except (OSError,IOError), oe:
+                       if oe.errno != errno.EEXIST: raise
+
+       def _rate_limit_cache_clean(self, now):
+               ages = []
+               for path in os.listdir(self.cachedir):
+                       if not path.startswith('#'): continue
+                       path = self.cachedir + '/' + path
+                       try: s = os.stat(path)
+                       except (OSError,IOError), oe:
+                               if oe.errno != errno.ENOENT: raise
+                               continue
+                       age = now - s.st_mtime
+                       if age > opts.max_age:
+                               debug('Fetcher expire %d %s' % (age, path))
+                               try: os.remove(path)
+                               except (OSError,IOError), oe:
+                                       if oe.errno != errno.ENOENT: raise
+                               continue
+                       ages.append(age)
+               ages.sort()
+               debug('Fetcher ages ' + `ages`)
+               min_age = 1
+               need_wait = 0
+               for age in ages:
+                       if age < min_age:
+                               debug('Fetcher morewait min=%d age=%d' %
+                                       (min_age, age))
+                               need_wait = max(need_wait, min_age - age)
+                       min_age *= 2
+                       min_age += 1
+               if need_wait:
+                       debug('Fetcher wait %d' % need_wait)
+                       time.sleep(need_wait)
+
+       def fetch(self, url):
+               debug('Fetcher fetch %s' % url)
+               cache_corename = urllib.quote_plus(url)
+               cache_item = "%s/#%s#" % (self.cachedir, cache_corename)
+               try: f = file(cache_item, 'r')
+               except (OSError,IOError), oe:
+                       if oe.errno != errno.ENOENT: raise
                        f = None
-       if f is not None:
-               data = f.read()
-               f.close()
-       else:
-               os.sleep(1)
+               now = time.time()
+               if f is not None:
+                       s = os.fstat(f.fileno())
+                       if now > s.st_mtime + opts.max_age:
+                               debug('Fetcher stale')
+                               f = None
+               if f is not None:
+                       data = f.read()
+                       f.close()
+                       debug('Fetcher cached')
+                       return data
+
+               debug('Fetcher fetch')
+               self._rate_limit_cache_clean(now)
+
                stream = urllib2.urlopen(url)
                data = stream.read()
-               cache_ourname = "#%s~%d#" % (cache_corename, os.getpid())
-               f = file(cache_ourname, 'w')
+               cache_tmp = "%s/#%s~%d#" % (
+                       self.cachedir, cache_corename, os.getpid())
+               f = file(cache_tmp, 'w')
                f.write(data)
                f.close()
-               os.rename(cache_ourname, cache_basename)
-       return data
+               os.rename(cache_tmp, cache_item)
+               debug('Fetcher stored')
+               return data
 
-def yoweb_fetch(kind, tail):
-       url = 'http://%s.puzzlepirates.com/yoweb/%s%s' % (ocean, kind, tail)
-       return fetch(url)
+       def yoweb(self, kind, tail):
+               url = 'http://%s.puzzlepirates.com/yoweb/%s%s' % (
+                       self.ocean, kind, tail)
+               return self.fetch(url)
 
-class PirateInfo:
-       # Public data members:
-       # pi.skills = { 'Treasure Haul': 'Able' ... }
-       # pi.msgs = [ 'message describing problem with scrape' ]
-       def _log(self, m):
+class SoupLog:
+       def __init__(self):
+               self.msgs = [ ]
+       def msg(self, m):
                self.msgs.append(m)
+       def soupm(self, obj, m):
+               self.msg(m + '; in ' + `obj`)
+       def needs_msgs(self, child_souplog):
+               self.msgs += child_souplog.msgs
+               child_souplog.msgs = [ ]
 
-       def _logsoup(self, soup, m):
-               self._log(m + '; in ' + `soup`)
+class PirateInfo(SoupLog):
+       # Public data members:
+       # pi.standings = { 'Treasure Haul': 'Able' ... }
+       # pi.crew = (id, name)
+       # pi.flag = (id, name)
+       # pi.msgs = [ 'message describing problem with scrape' ]
 
-       def __init__(self, pirate):
-               html = yoweb_fetch('pirate.wm?target=', pirate)
-               soup = BeautifulSoup(html,
-#                      convertEntities=BeautifulSoup.HTML_ENTITIES
-                       )
-               imgs = soup.findAll('img',
+       def _find_standings(self):
+               imgs = self.soup.findAll('img',
                        src=regexp.compile('/yoweb/images/stat.*'))
                re = regexp.compile(
-u'\s*\S*/([-A-Za-z]+)\s*$|\s*\S*/\S*\s*\(ocean\-wide\ \;([-A-Za-z]+)\)\s*$'
+u'\\s*\\S*/([-A-Za-z]+)\\s*$|\\s*\\S*/\\S*\\s*\\(ocean\\-wide\\ \\;([-A-Za-z]+)\\)\\s*$'
                        )
-               skills = { }
-               self.msgs = [ ]
+               standings = { }
                for skill in duties:
-                       skills[skill] = [ ]
+                       standings[skill] = [ ]
+
+               skl = SoupLog()
 
                for img in imgs:
                        try: duty = img['alt']
                        except KeyError: continue
                        if not duty in duties:
-                               self._logsoup(img, 'unknown duty: "%s"' % duty)
+                               skl.soupm(img, 'unknown duty: "%s"' % duty)
                                continue
                        key = img.findParent('td')
                        if key is None:
-                               self._logsoup(img, 'duty at root! "%s"' % duty)
+                               skl.soupm(img, 'duty at root! "%s"' % duty)
"%s"' % duty) continue valelem = key.findNextSibling('td') if valelem is None: - self._logsoup(key, 'duty missing sibling "%s"' + skl.soupm(key, 'duty missing sibling "%s"' % duty) continue valstr = ''.join(valelem.findAll(text=True)) match = re.match(valstr) if match is None: - self._logsoup(key, 'duty "%s" unparseable'+ + skl.soupm(key, 'duty "%s" unparseable'+ ' standing "%s"' % (duty, valstr)) continue standing = match.group(match.lastindex) - skills[duty].append(standing) + standings[duty].append(standing) - self.skills = { } + self.standings = { } for duty in duties: - sl = skills[duty] + sl = standings[duty] if len(sl) > 1: - self.log('duty "%s" multiple standings %s' % + skl.msg('duty "%s" multiple standings %s' % (duty, `sl`)) continue if not len(sl): - self.log('duty "%s" no standing found' % duty) + skl.msg('duty "%s" no standing found' % duty) continue standing = sl[0] - for i in range(0, len(standings)): - if standing == standings[i]: - self.skills[duty] = i - if not duty in self.skills: - self.log('duty "%s" unknown standing "%s"' % + for i in range(0, len(standingvals)-1): + if standing == standingvals[i]: + self.standings[duty] = i + if not duty in self.standings: + skl.msg('duty "%s" unknown standing "%s"' % (duty, standing)) - all_skills_ok = True + + all_standings_ok = True for duty in duties: - if not duty in self.skills: - all_skills_ok = False - if all_skills_ok: - self.msgs = [ ] + if not duty in self.standings: + self.needs_msgs(skl) + + def _find_crewflag(self, cf, yoweb_re): + things = self.soup.findAll('a', href=regexp.compile(yoweb_re)) + if len(things) != 1: + self.msg('zero or several %s id references found' % cf) + return None + thing = things[0] + id_re = '\\b%sid\\=(\\w+)$' % cf + id_haystack = thing['href'] + match = regexp.compile(id_re).search(id_haystack) + if match is None: + self.soupm(thing, ('incomprehensible %s id ref'+ + ' (%s in %s)') % (cf, id_re, id_haystack)) + return None + name = ''.join(thing.findAll(text=True)) + return (match.group(1), name) + + def __init__(self, pirate): + SoupLog.__init__(self) + + html = fetcher.yoweb('pirate.wm?target=', pirate) + self.soup = BeautifulSoup(html, +# convertEntities=BeautifulSoup.HTML_ENTITIES + ) + + self._find_standings() + + self.crew = self._find_crewflag('crew', + '^/yoweb/crew/info\\.wm') + self.flag = self._find_crewflag('flag', + '^/yoweb/flag/info\\.wm') def __str__(self): - return `self.skills` + return `(self.crew, self.flag, self.standings, self.msgs)` def main(): - os.chdir(os.getenv('HOME')) - cache_dir = '.yoweb-scrape-cache' - try: - os.chdir(cache_dir) - except (OSError,IOError), oe: - if oe.errno != errno.ENOENT: raise - os.mkdir(cache_dir) - os.chdir(cache_dir) - - for path in os.listdir('.'): - if not path.startswith('#'): continue - max_time = max_age - if '~' in path: max_time = 10 - try: - s = os.stat(path) - if now > s.st_mtime + max_time: - os.remove(path) - except (OSError,IOError), oe: - if oe.errno != errno.ENOENT: raise + global opts, fetcher + + pa = OptionParser( +'''usage: .../yoweb-scrape [OPTION...] ACTION [ARGS...] +actions: + yoweb-scrape [--ocean OCEAN ...] pirate PIRATE + yoweb-scrape [--ocean OCEAN ...] crew-of PIRATE + yoweb-scrape [--ocean OCEAN ...] 
+''')
+       ao = pa.add_option
+       ao('-O','--ocean',dest='ocean', metavar='OCEAN',
+               default='ice',
+               help='select ocean OCEAN')
+       ao('--cache-dir', dest='cache_dir', metavar='DIR',
+               default='~/.yoweb-scrape-cache',
+               help='cache yoweb pages in DIR')
+       ao('-D','--debug', action='store_true', dest='debug', default=False,
+               help='enable debugging output')
+       ao('-q','--quiet', action='store_true', dest='quiet',
+               help='suppress warning output')
+       (opts,args) = pa.parse_args()
+
+       # fixed parameters
+       opts.max_age = 240
+
+       if opts.cache_dir.startswith('~/'):
+               opts.cache_dir = os.getenv('HOME') + opts.cache_dir[1:]
+
+       fetcher = Fetcher(opts.ocean, opts.cache_dir)
 
        # test program:
-       global ocean
-       ocean = 'midnight'
        test = PirateInfo('Anaplian')
        print test
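
At this stage the option parser is wired up, but none of the ACTIONs in the
usage string (pirate, crew-of, dutytab-crew-of) are dispatched yet: main()
still ends in the hard-wired test, which fetches the pirate page for Anaplian
on the selected ocean and prints the scraped result. So an invocation would
presumably look like

    $ ./yoweb-scrape --ocean midnight --debug pirate Anaplian

with the trailing arguments parsed but otherwise ignored; the output is the
repr of (crew, flag, standings, msgs) from PirateInfo.__str__, and the
'Fetcher ...' trace goes to stderr when --debug is given.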
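
Fetcher._rate_limit_cache_clean doubles as the rate limiter: the ages of the
cached pages, sorted youngest first, are checked against an exponential
schedule (the k-th youngest entry must be at least 1, 3, 7, 15, ... seconds
old, min_age stepping to 2*min_age+1 after each entry), and the fetcher
sleeps long enough to satisfy the schedule before fetching anything new. A
minimal standalone sketch of just that arithmetic (needed_wait is a
hypothetical helper; the directory scan and debug tracing are omitted):

    # Sketch of the rate-limit schedule used by _rate_limit_cache_clean.
    def needed_wait(ages):
        # ages of the cached fetches, in seconds
        ages = sorted(ages)
        min_age = 1
        need_wait = 0
        for age in ages:
            if age < min_age:
                # this entry is too young: wait out the shortfall
                need_wait = max(need_wait, min_age - age)
            min_age = min_age * 2 + 1
        return need_wait

    print needed_wait([0.5, 2.0, 10.0])  # 1.0: the 2.0s entry is 1s short of 3s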
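
The cache layout is worth spelling out: each page is stored in the cache
directory under its quote_plus-encoded URL wrapped in '#'s, and fetch()
writes to a '#...~pid#' temporary first and then os.rename()s it into place,
so a concurrent scraper never reads a half-written file; entries older than
opts.max_age (240s here) are expired by the cleaner. For example (Python 2):

    >>> import urllib
    >>> urllib.quote_plus('http://ice.puzzlepirates.com/yoweb/pirate.wm?target=Anaplian')
    'http%3A%2F%2Fice.puzzlepirates.com%2Fyoweb%2Fpirate.wm%3Ftarget%3DAnaplian'

so that page would be cached as
~/.yoweb-scrape-cache/#http%3A%2F%2Fice.puzzlepirates.com%2Fyoweb%2Fpirate.wm%3Ftarget%3DAnaplian#.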