# Reconstructed from the post-image of the yoweb-scrape patch
# (hb=d599b52a..., "index 6a066d0..844cd46"): the duty/standing tables
# and class PirateInfo, which the patch shows from its header through
# __str__.  fetch() and yoweb_fetch() are visible only as partial hunks
# and are not reproduced; PirateInfo still expects yoweb_fetch and
# BeautifulSoup to be supplied by the rest of the script.
#
# NOTE(review): the fetch() hunk of the patch adds `os.sleep(1)` before
# the urlopen call.  The os module has no sleep(); that line needs
# `time.sleep(1)` (the script already imports time).

import re as regexp

duties = ('Swordfighting/Bilging/Sailing/Rigging/Navigating'+
          '/Battle Navigation/Gunning/Carpentry/Rumble/Treasure Haul'+
          '/Drinking/Spades/Hearts/Treasure Drop/Poker/Distilling'+
          '/Alchemistry/Shipwrightery/Blacksmithing/Foraging').split('/')

standings = ('Able/Distinguished/Respected/Master/Renowned'+
             '/Grand-Master/Legendary/Ultimate').split('/')


class PirateInfo:
    """Skill standings scraped from one pirate's yoweb page.

    Public data members:
      pi.skills = { 'Treasure Haul': 0, ... }
          duty name -> index into the module-level `standings` list;
          a duty is absent when no single known standing was found.
      pi.msgs = [ 'message describing problem with scrape' ]
          reset to [] once every entry of `duties` has a standing.
    """

    # Matches "<anything>/<Standing>" or
    # "<anything>/<anything> (ocean-wide ;<Standing>)"; exactly one of
    # the two groups is set on a match.
    # NOTE(review): the "\ \;" segment looks like an HTML-rendered
    # "\&nbsp\;" -- confirm against the upstream file before relying on
    # the ocean-wide alternative.
    _standing_re = regexp.compile(
        r'\s*\S*/([-A-Za-z]+)\s*$'
        r'|\s*\S*/\S*\s*\(ocean\-wide\ \;([-A-Za-z]+)\)\s*$')

    def _log(self, m):
        """Append one scrape-problem message to self.msgs."""
        self.msgs.append(m)

    def _logsoup(self, soup, m):
        """Log m together with the repr of the element it concerns."""
        self._log(m + '; in ' + repr(soup))

    def __init__(self, pirate):
        """Fetch and parse the yoweb page for `pirate`.

        Fills in self.skills and self.msgs (see the class docstring).
        """
        html = yoweb_fetch('pirate.wm?target=', pirate)
        # convertEntities=BeautifulSoup.HTML_ENTITIES stays disabled,
        # as in the original call.
        soup = BeautifulSoup(html)
        self.msgs = []
        self._resolve_skills(self._collect_standings(soup))

    def _collect_standings(self, soup):
        """Return { duty: [standing text, ...] } for every known duty.

        Only <img> elements whose src matches /yoweb/images/stat.* are
        examined; the standing is read from the <td> following the
        <td> that contains the image.  Anything unexpected is logged
        and skipped.
        """
        found = dict((duty, []) for duty in duties)
        stat_imgs = soup.findAll('img',
                                 src=regexp.compile('/yoweb/images/stat.*'))
        for img in stat_imgs:
            try:
                duty = img['alt']
            except KeyError:
                continue

            if duty not in duties:
                self._logsoup(img, 'unknown duty: "%s"' % duty)
                continue
            key = img.findParent('td')
            if key is None:
                self._logsoup(img, 'duty at root! "%s"' % duty)
                continue
            valelem = key.findNextSibling('td')
            if valelem is None:
                self._logsoup(key, 'duty missing sibling "%s"' % duty)
                continue
            valstr = ''.join(valelem.findAll(text=True))
            standing = self._parse_standing(valstr)
            if standing is None:
                # Format the whole message at once: applying % to only
                # the second half of a '+'-joined literal raises
                # TypeError because of operator precedence.
                self._logsoup(key, 'duty "%s" unparseable standing "%s"'
                              % (duty, valstr))
                continue
            found[duty].append(standing)
        return found

    def _parse_standing(self, valstr):
        """Return the standing name embedded in valstr, or None."""
        match = self._standing_re.match(valstr)
        if match is None:
            return None
        # lastindex is whichever alternative's group took part.
        return match.group(match.lastindex)

    def _resolve_skills(self, found):
        """Turn { duty: [standing, ...] } into self.skills.

        A duty is recorded only when exactly one standing was found
        and that standing is listed in `standings`; every other case
        is logged.  When all duties resolve, self.msgs is emptied.
        """
        self.skills = {}
        for duty in duties:
            sl = found[duty]
            if len(sl) > 1:
                self._log('duty "%s" multiple standings %s' %
                          (duty, repr(sl)))
                continue
            if not sl:
                self._log('duty "%s" no standing found' % duty)
                continue
            standing = sl[0]
            try:
                self.skills[duty] = standings.index(standing)
            except ValueError:
                self._log('duty "%s" unknown standing "%s"' %
                          (duty, standing))

        if all(duty in self.skills for duty in duties):
            self.msgs = []

    def __str__(self):
        return repr(self.skills)