WIP before option parser

author Ian Jackson <ian@liberator.relativity.greenend.org.uk>

Fri, 15 May 2009 18:23:14 +0000 (19:23 +0100)

committer Ian Jackson <ian@liberator.relativity.greenend.org.uk>

Fri, 15 May 2009 18:23:14 +0000 (19:23 +0100)
author Ian Jackson <ian@liberator.relativity.greenend.org.uk>
Fri, 15 May 2009 18:23:14 +0000 (19:23 +0100)
committer Ian Jackson <ian@liberator.relativity.greenend.org.uk>
Fri, 15 May 2009 18:23:14 +0000 (19:23 +0100)
diff --git a/yoweb-scrape b/yoweb-scrape

index 6a066d0d5bd234d1a9b50a40b699cd74fcf03aab..844cd46139095cfcfde1e6095fc3c44ac531c1f5 100755 (executable)
--- a/yoweb-scrape
+++ b/yoweb-scrape
@@ -5,7 +5,9 @@ import time
  import urllib
  import urllib2
  import errno
+import sys
  import re as regexp
+import optparse
  
  from BeautifulSoup import BeautifulSoup
  
@@ -14,8 +16,10 @@ ocean = 'ice'
  
  now = time.time()
  
-duties = ('Swordfighting/Bilging/Sailing/Rigging/Navigation'+
-       '/Battle Navigation/Carpentry/Rumble/Treasure Haul').split('/')
+duties = ('Swordfighting/Bilging/Sailing/Rigging/Navigating'+
+       '/Battle Navigation/Gunning/Carpentry/Rumble/Treasure Haul'+
+       '/Drinking/Spades/Hearts/Treasure Drop/Poker/Distilling'+
+       '/Alchemistry/Shipwrightery/Blacksmithing/Foraging').split('/')
  
  standings = ('Able/Distinguished/Respected/Master/Renowned'+
                 '/Grand-Master/Legendary/Ultimate').split('/')
@@ -35,6 +39,7 @@ def fetch(url):
                 data = f.read()
                 f.close()
         else:
+               os.sleep(1)
                 stream = urllib2.urlopen(url)
                 data = stream.read()
                 cache_ourname = "#%s~%d#" % (cache_corename, os.getpid())
@@ -49,46 +54,81 @@ def yoweb_fetch(kind, tail):
         return fetch(url)
  
  class PirateInfo:
+       # Public data members:
+       #  pi.skills = { 'Treasure Haul': 'Able' ... }
+       #  pi.msgs = [ 'message describing problem with scrape' ]
+       def _log(self, m):
+               self.msgs.append(m)
+
+       def _logsoup(self, soup, m):
+               self._log(m + '; in ' + `soup`)
+
         def __init__(self, pirate):
                 html = yoweb_fetch('pirate.wm?target=', pirate)
                 soup = BeautifulSoup(html,
  #                      convertEntities=BeautifulSoup.HTML_ENTITIES
                         )
-               imgs = soup.findAll('img')
+               imgs = soup.findAll('img',
+                       src=regexp.compile('/yoweb/images/stat.*'))
                 re = regexp.compile(
  u'\s*\S*/([-A-Za-z]+)\s*$|\s*\S*/\S*\s*\(ocean\-wide\&nbsp\;([-A-Za-z]+)\)\s*$'
                         )
                 skills = { }
+               self.msgs = [ ]
+
                 for skill in duties:
                         skills[skill] = [ ]
+
                 for img in imgs:
                         try: duty = img['alt']
                         except KeyError: continue
-                       print `duty`
-                       if not duty in duties: continue
+
+                       if not duty in duties:
+                               self._logsoup(img, 'unknown duty: "%s"' % duty)
+                               continue
                         key = img.findParent('td')
-                       if key is None: continue
+                       if key is None:
+                               self._logsoup(img, 'duty at root! "%s"' % duty)
+                               continue
                         valelem = key.findNextSibling('td')
-                       if valelem is None: continue
+                       if valelem is None:
+                               self._logsoup(key, 'duty missing sibling "%s"'
+                                       % duty)
+                               continue
                         valstr = ''.join(valelem.findAll(text=True))
-                       print `duty`, `valstr`
                         match = re.match(valstr)
-                       if match is None: continue
-                       standing = match.group(1)
-                       skills[duty] = standing
-
-#                      print `duty`, `standing`
-#                      if standing not in standings: continue
-#                      for i in range(0, len(standings)):
-#                              print `duty`, `standing`, i
-#                              if standing == standings[i]:
-#                                      print `skills[duty]`
-#                                      skills[duty].append(i)
-
-#              self.skills = { }
-#              for skill in duties:
-                       
-               self.skills = skills
+                       if match is None:
+                               self._logsoup(key, 'duty "%s" unparseable'+
+                                       ' standing "%s"' % (duty, valstr))
+                               continue
+                       standing = match.group(match.lastindex)
+                       skills[duty].append(standing)
+
+               self.skills = { }
+
+               for duty in duties:
+                       sl = skills[duty]
+                       if len(sl) > 1:
+                               self.log('duty "%s" multiple standings %s' %
+                                               (duty, `sl`))
+                               continue
+                       if not len(sl):
+                               self.log('duty "%s" no standing found' % duty)
+                               continue
+                       standing = sl[0]
+                       for i in range(0, len(standings)):
+                               if standing == standings[i]:
+                                       self.skills[duty] = i
+                       if not duty in self.skills:
+                               self.log('duty "%s" unknown standing "%s"' %
+                                       (duty, standing))
+               all_skills_ok = True
+               for duty in duties:
+                       if not duty in self.skills:
+                               all_skills_ok = False
+               if all_skills_ok:
+                       self.msgs = [ ]
+
         def __str__(self):
                 return `self.skills`
author	Ian Jackson <ian@liberator.relativity.greenend.org.uk>
	Fri, 15 May 2009 18:23:14 +0000 (19:23 +0100)
committer	Ian Jackson <ian@liberator.relativity.greenend.org.uk>
	Fri, 15 May 2009 18:23:14 +0000 (19:23 +0100)