chiark / gitweb /
WIP before option parser
[ypp-sc-tools.db-live.git] / yoweb-scrape
index 0a54940cc81878d3cd938ab2cad5bbdd1e856829..844cd46139095cfcfde1e6095fc3c44ac531c1f5 100755 (executable)
@@ -5,6 +5,9 @@ import time
 import urllib
 import urllib2
 import errno
+import sys
+import re as regexp
+import optparse
 
 from BeautifulSoup import BeautifulSoup
 
@@ -13,6 +16,14 @@ ocean = 'ice'
 
 now = time.time()
 
+# Every duty name that PirateInfo looks for; compared against the
+# alt text of the stat images on a pirate's page.
+duties = [
+       'Swordfighting', 'Bilging', 'Sailing', 'Rigging', 'Navigating',
+       'Battle Navigation', 'Gunning', 'Carpentry', 'Rumble',
+       'Treasure Haul', 'Drinking', 'Spades', 'Hearts', 'Treasure Drop',
+       'Poker', 'Distilling', 'Alchemistry', 'Shipwrightery',
+       'Blacksmithing', 'Foraging',
+]
+
+# Standing names.  PirateInfo records a standing as its index in
+# this list, so the order of the entries matters.
+standings = [
+       'Able', 'Distinguished', 'Respected', 'Master', 'Renowned',
+       'Grand-Master', 'Legendary', 'Ultimate',
+]
+
 def fetch(url):
        cache_corename = urllib.quote_plus(url)
        cache_basename = "#%s#" % cache_corename
@@ -28,6 +39,7 @@ def fetch(url):
                data = f.read()
                f.close()
        else:
+               os.sleep(1)
                stream = urllib2.urlopen(url)
                data = stream.read()
                cache_ourname = "#%s~%d#" % (cache_corename, os.getpid())
@@ -41,10 +53,84 @@ def yoweb_fetch(kind, tail):
        url = 'http://%s.puzzlepirates.com/yoweb/%s%s' % (ocean, kind, tail)
        return fetch(url)
 
-def get_pirate_info(pirate):
-       html = yoweb_fetch('pirate.wm?target=', pirate)
-       soup = BeautifulSoup(html)
-       return `soup`
+class PirateInfo:
+       """Duty standings scraped from one pirate's yoweb page.
+
+       Public data members:
+        pi.skills = { 'Treasure Haul': 0 ... }
+               maps each duty whose standing could be parsed to the
+               index of that standing in the module-level standings list
+        pi.msgs = [ 'message describing problem with scrape' ]
+               reset to [ ] if a standing was found for every duty
+       """
+
+       def _log(self, m):
+               # Record one scrape problem.
+               self.msgs.append(m)
+
+       def _logsoup(self, soup, m):
+               # Record a scrape problem together with the markup it
+               # was found in.
+               self._log(m + '; in ' + `soup`)
+
+       def __init__(self, pirate):
+               """Fetch the page for pirate and fill in skills and msgs.
+
+               Problems found while parsing the page are appended to
+               self.msgs rather than raised.
+               """
+               html = yoweb_fetch('pirate.wm?target=', pirate)
+               # Entity conversion is left switched off, so the regexp
+               # below sees the text '&nbsp;' literally.
+               soup = BeautifulSoup(html,
+#                      convertEntities=BeautifulSoup.HTML_ENTITIES
+                       )
+               imgs = soup.findAll('img',
+                       src=regexp.compile('/yoweb/images/stat.*'))
+               # The value text is either "<x>/<standing>" or
+               # "<x>/<y> (ocean-wide&nbsp;<standing>)"; whichever
+               # alternative matches captures the standing as its
+               # only group.
+               standing_re = regexp.compile(
+u'\s*\S*/([-A-Za-z]+)\s*$|\s*\S*/\S*\s*\(ocean\-wide\&nbsp\;([-A-Za-z]+)\)\s*$'
+                       )
+               self.msgs = [ ]
+
+               # Every standing string seen for each duty; more or fewer
+               # than one per duty is reported below.
+               found = { }
+               for duty in duties:
+                       found[duty] = [ ]
+
+               for img in imgs:
+                       try: duty = img['alt']
+                       except KeyError: continue
+
+                       if duty not in duties:
+                               self._logsoup(img, 'unknown duty: "%s"' % duty)
+                               continue
+                       key = img.findParent('td')
+                       if key is None:
+                               self._logsoup(img, 'duty at root! "%s"' % duty)
+                               continue
+                       valelem = key.findNextSibling('td')
+                       if valelem is None:
+                               self._logsoup(key, 'duty missing sibling "%s"'
+                                       % duty)
+                               continue
+                       valstr = ''.join(valelem.findAll(text=True))
+                       match = standing_re.match(valstr)
+                       if match is None:
+                               # One literal, so that % sees both
+                               # placeholders (% binds tighter than +).
+                               self._logsoup(key,
+                                       'duty "%s" unparseable standing "%s"'
+                                       % (duty, valstr))
+                               continue
+                       # lastindex is the group of the alternative
+                       # that matched.
+                       found[duty].append(match.group(match.lastindex))
+
+               self.skills = { }
+
+               for duty in duties:
+                       sl = found[duty]
+                       if len(sl) > 1:
+                               self._log('duty "%s" multiple standings %s' %
+                                               (duty, `sl`))
+                               continue
+                       if not sl:
+                               self._log('duty "%s" no standing found' % duty)
+                               continue
+                       standing = sl[0]
+                       try:
+                               self.skills[duty] = standings.index(standing)
+                       except ValueError:
+                               self._log('duty "%s" unknown standing "%s"' %
+                                       (duty, standing))
+
+               # If every duty was resolved, discard whatever messages
+               # were collected along the way.
+               all_skills_ok = True
+               for duty in duties:
+                       if duty not in self.skills:
+                               all_skills_ok = False
+               if all_skills_ok:
+                       self.msgs = [ ]
+
+       def __str__(self):
+               # The repr of the duty -> standing-index mapping.
+               return `self.skills`
 
 def main():
        os.chdir(os.getenv('HOME'))
@@ -70,7 +156,7 @@ def main():
        # test program:
        global ocean
        ocean = 'midnight'
-       test = get_pirate_info('Aristarchus')
+       test = PirateInfo('Anaplian')
        print test
 
 main()