X-Git-Url: http://www.chiark.greenend.org.uk/ucgi/~yarrgweb/git?a=blobdiff_plain;ds=sidebyside;f=yoweb-scrape;h=844cd46139095cfcfde1e6095fc3c44ac531c1f5;hb=d599b52a3b142bd27995ab65e67246343790a727;hp=0a54940cc81878d3cd938ab2cad5bbdd1e856829;hpb=ed951644b1bdacabaa82cd5e854e567752101cfd;p=ypp-sc-tools.db-live.git

diff --git a/yoweb-scrape b/yoweb-scrape
index 0a54940..844cd46 100755
--- a/yoweb-scrape
+++ b/yoweb-scrape
@@ -5,6 +5,9 @@ import time
 import urllib
 import urllib2
 import errno
+import sys
+import re as regexp
+import optparse
 
 from BeautifulSoup import BeautifulSoup
 
@@ -13,6 +16,14 @@ ocean = 'ice'
 
 now = time.time()
 
+duties = ('Swordfighting/Bilging/Sailing/Rigging/Navigating'+
+	'/Battle Navigation/Gunning/Carpentry/Rumble/Treasure Haul'+
+	'/Drinking/Spades/Hearts/Treasure Drop/Poker/Distilling'+
+	'/Alchemistry/Shipwrightery/Blacksmithing/Foraging').split('/')
+
+standings = ('Able/Distinguished/Respected/Master/Renowned'+
+	'/Grand-Master/Legendary/Ultimate').split('/')
+
 def fetch(url):
 	cache_corename = urllib.quote_plus(url)
 	cache_basename = "#%s#" % cache_corename
@@ -28,6 +39,7 @@ def fetch(url):
 		data = f.read()
 		f.close()
 	else:
+		time.sleep(1)
 		stream = urllib2.urlopen(url)
 		data = stream.read()
 		cache_ourname = "#%s~%d#" % (cache_corename, os.getpid())
@@ -41,10 +53,94 @@ def yoweb_fetch(kind, tail):
 	url = 'http://%s.puzzlepirates.com/yoweb/%s%s' % (ocean, kind, tail)
 	return fetch(url)
 
-def get_pirate_info(pirate):
-	html = yoweb_fetch('pirate.wm?target=', pirate)
-	soup = BeautifulSoup(html)
-	return `soup`
+class PirateInfo:
+	# Scrapes one pirate's yoweb page and records, for each entry in
+	# duties, the standing found there.
+	#
+	# Public data members:
+	#   pi.skills = { 'Treasure Haul': 0 ... }   (index into standings)
+	#   pi.msgs = [ 'message describing problem with scrape' ]
+	#     (msgs is emptied again if every duty got a standing)
+	def _log(self, m):
+		self.msgs.append(m)
+
+	def _logsoup(self, soup, m):
+		self._log(m + '; in ' + `soup`)
+
+	def __init__(self, pirate):
+		html = yoweb_fetch('pirate.wm?target=', pirate)
+		soup = BeautifulSoup(html,
+#			convertEntities=BeautifulSoup.HTML_ENTITIES
+			)
+		# each stat icon's alt text names a duty
+		imgs = soup.findAll('img',
+			src=regexp.compile('/yoweb/images/stat.*'))
+		# value text is either ".../STANDING" or
+		# ".../... (ocean-wide&nbsp;STANDING)"; entities are not decoded
+		re = regexp.compile(
+u'\s*\S*/([-A-Za-z]+)\s*$|\s*\S*/\S*\s*\(ocean\-wide\&nbsp\;([-A-Za-z]+)\)\s*$'
+			)
+		skills = { }
+		self.msgs = [ ]
+
+		for skill in duties:
+			skills[skill] = [ ]
+
+		for img in imgs:
+			try: duty = img['alt']
+			except KeyError: continue
+
+			if not duty in duties:
+				self._logsoup(img, 'unknown duty: "%s"' % duty)
+				continue
+			key = img.findParent('td')
+			if key is None:
+				self._logsoup(img, 'duty at root! "%s"' % duty)
+				continue
+			valelem = key.findNextSibling('td')
+			if valelem is None:
+				self._logsoup(key, 'duty missing sibling "%s"'
+					% duty)
+				continue
+			valstr = ''.join(valelem.findAll(text=True))
+			match = re.match(valstr)
+			if match is None:
+				# parenthesised because % binds tighter than +
+				self._logsoup(key, ('duty "%s" unparseable'+
+					' standing "%s"') % (duty, valstr))
+				continue
+			standing = match.group(match.lastindex)
+			skills[duty].append(standing)
+
+		self.skills = { }
+
+		# map each duty's single standing name to its index in standings
+		for duty in duties:
+			sl = skills[duty]
+			if len(sl) > 1:
+				self._log('duty "%s" multiple standings %s' %
+					(duty, `sl`))
+				continue
+			if not len(sl):
+				self._log('duty "%s" no standing found' % duty)
+				continue
+			standing = sl[0]
+			for i in range(0, len(standings)):
+				if standing == standings[i]:
+					self.skills[duty] = i
+			if not duty in self.skills:
+				self._log('duty "%s" unknown standing "%s"' %
+					(duty, standing))
+		# a scrape that placed every duty discards any logged messages
+		all_skills_ok = True
+		for duty in duties:
+			if not duty in self.skills:
+				all_skills_ok = False
+		if all_skills_ok:
+			self.msgs = [ ]
+
+	def __str__(self):
+		return `self.skills`
 
 def main():
 	os.chdir(os.getenv('HOME'))
@@ -70,7 +166,7 @@ def main():
 	# test program:
 	global ocean
 	ocean = 'midnight'
-	test = get_pirate_info('Aristarchus')
+	test = PirateInfo('Anaplian')
 	print test
 
 main()