chiark / gitweb /
WIP before option parser
[ypp-sc-tools.db-live.git] / yoweb-scrape
index 6a066d0..844cd46 100755 (executable)
@@ -5,7 +5,9 @@ import time
 import urllib
 import urllib2
 import errno
+import sys
 import re as regexp
+import optparse
 
 from BeautifulSoup import BeautifulSoup
 
@@ -14,8 +16,10 @@ ocean = 'ice'
 
 now = time.time()
 
-duties = ('Swordfighting/Bilging/Sailing/Rigging/Navigation'+
-       '/Battle Navigation/Carpentry/Rumble/Treasure Haul').split('/')
+# Skill (duty) names the scraper recognises; PirateInfo compares these
+# against the alt text of the stat images on a pirate's page.
+duties = ('Swordfighting/Bilging/Sailing/Rigging/Navigating'+
+       '/Battle Navigation/Gunning/Carpentry/Rumble/Treasure Haul'+
+       '/Drinking/Spades/Hearts/Treasure Drop/Poker/Distilling'+
+       '/Alchemistry/Shipwrightery/Blacksmithing/Foraging').split('/')
 
+# Standing names; PirateInfo stores each skill as an index into this list.
 standings = ('Able/Distinguished/Respected/Master/Renowned'+
                '/Grand-Master/Legendary/Ultimate').split('/')
@@ -35,6 +39,7 @@ def fetch(url):
                data = f.read()
                f.close()
        else:
+               time.sleep(1)  # os has no sleep(); pause 1s before the urlopen below
                stream = urllib2.urlopen(url)
                data = stream.read()
                cache_ourname = "#%s~%d#" % (cache_corename, os.getpid())
@@ -49,46 +54,81 @@ def yoweb_fetch(kind, tail):
        return fetch(url)
 
 class PirateInfo:
+       # Public data members:
+       #  pi.skills = { 'Treasure Haul': 'Able' ... }
+       #  pi.msgs = [ 'message describing problem with scrape' ]
+       def _log(self, m):
+               # Record problem message m by appending it to self.msgs.
+               self.msgs.append(m)
+
+       def _logsoup(self, soup, m):
+               # Log m followed by '; in ' and the repr of the soup element
+               # concerned (backticks are Python 2 repr syntax).
+               self._log(m + '; in ' + `soup`)
+
        def __init__(self, pirate):
                html = yoweb_fetch('pirate.wm?target=', pirate)
                soup = BeautifulSoup(html,
 #                      convertEntities=BeautifulSoup.HTML_ENTITIES
                        )
-               imgs = soup.findAll('img')
+               imgs = soup.findAll('img',
+                       src=regexp.compile('/yoweb/images/stat.*'))
                re = regexp.compile(
 u'\s*\S*/([-A-Za-z]+)\s*$|\s*\S*/\S*\s*\(ocean\-wide\&nbsp\;([-A-Za-z]+)\)\s*$'
                        )
                skills = { }
+               self.msgs = [ ]
+
                for skill in duties:
                        skills[skill] = [ ]
+
                for img in imgs:
                        try: duty = img['alt']
                        except KeyError: continue
-                       print `duty`
-                       if not duty in duties: continue
+
+                       if not duty in duties:
+                               self._logsoup(img, 'unknown duty: "%s"' % duty)
+                               continue
                        key = img.findParent('td')
-                       if key is None: continue
+                       if key is None:
+                               self._logsoup(img, 'duty at root! "%s"' % duty)
+                               continue
                        valelem = key.findNextSibling('td')
-                       if valelem is None: continue
+                       if valelem is None:
+                               self._logsoup(key, 'duty missing sibling "%s"'
+                                       % duty)
+                               continue
                        valstr = ''.join(valelem.findAll(text=True))
-                       print `duty`, `valstr`
                        match = re.match(valstr)
-                       if match is None: continue
-                       standing = match.group(1)
-                       skills[duty] = standing
-
-#                      print `duty`, `standing`
-#                      if standing not in standings: continue
-#                      for i in range(0, len(standings)):
-#                              print `duty`, `standing`, i
-#                              if standing == standings[i]:
-#                                      print `skills[duty]`
-#                                      skills[duty].append(i)
-
-#              self.skills = { }
-#              for skill in duties:
-                       
-               self.skills = skills
+                       if match is None:
+                               self._logsoup(key, 'duty "%s" unparseable'+
+                                       ' standing "%s"' % (duty, valstr))
+                               continue
+                       standing = match.group(match.lastindex)
+                       skills[duty].append(standing)
+
+               self.skills = { }
+
+               for duty in duties:
+                       sl = skills[duty]
+                       if len(sl) > 1:
+                               self.log('duty "%s" multiple standings %s' %
+                                               (duty, `sl`))
+                               continue
+                       if not len(sl):
+                               self.log('duty "%s" no standing found' % duty)
+                               continue
+                       standing = sl[0]
+                       for i in range(0, len(standings)):
+                               if standing == standings[i]:
+                                       self.skills[duty] = i
+                       if not duty in self.skills:
+                               self.log('duty "%s" unknown standing "%s"' %
+                                       (duty, standing))
+               all_skills_ok = True
+               for duty in duties:
+                       if not duty in self.skills:
+                               all_skills_ok = False
+               if all_skills_ok:
+                       self.msgs = [ ]
+
        def __str__(self):
+               # The string form is the repr of the skills dict.
                return `self.skills`