chiark / gitweb /
WIP
author Ian Jackson <ian@liberator.relativity.greenend.org.uk>
Fri, 15 May 2009 00:21:45 +0000 (01:21 +0100)
committer Ian Jackson <ian@liberator.relativity.greenend.org.uk>
Fri, 15 May 2009 00:21:45 +0000 (01:21 +0100)
yoweb-scrape

index 0a54940cc81878d3cd938ab2cad5bbdd1e856829..6a066d0d5bd234d1a9b50a40b699cd74fcf03aab 100755 (executable)
@@ -5,6 +5,7 @@ import time
 import urllib
 import urllib2
 import errno
+import re as regexp
 
 from BeautifulSoup import BeautifulSoup
 
@@ -13,6 +14,12 @@ ocean = 'ice'
 
 now = time.time()
 
+duties = ('Swordfighting/Bilging/Sailing/Rigging/Navigation'+
+       '/Battle Navigation/Carpentry/Rumble/Treasure Haul').split('/')
+
+standings = ('Able/Distinguished/Respected/Master/Renowned'+
+               '/Grand-Master/Legendary/Ultimate').split('/')
+
 def fetch(url):
        cache_corename = urllib.quote_plus(url)
        cache_basename = "#%s#" % cache_corename
@@ -41,10 +48,49 @@ def yoweb_fetch(kind, tail):
        url = 'http://%s.puzzlepirates.com/yoweb/%s%s' % (ocean, kind, tail)
        return fetch(url)
 
-def get_pirate_info(pirate):
-       html = yoweb_fetch('pirate.wm?target=', pirate)
-       soup = BeautifulSoup(html)
-       return `soup`
+class PirateInfo:
+       def __init__(self, pirate):
+               html = yoweb_fetch('pirate.wm?target=', pirate)
+               soup = BeautifulSoup(html,
+#                      convertEntities=BeautifulSoup.HTML_ENTITIES
+                       )
+               imgs = soup.findAll('img')
+               re = regexp.compile(
+u'\s*\S*/([-A-Za-z]+)\s*$|\s*\S*/\S*\s*\(ocean\-wide\&nbsp\;([-A-Za-z]+)\)\s*$'
+                       )
+               skills = { }
+               for skill in duties:
+                       skills[skill] = [ ]
+               for img in imgs:
+                       try: duty = img['alt']
+                       except KeyError: continue
+                       print `duty`
+                       if not duty in duties: continue
+                       key = img.findParent('td')
+                       if key is None: continue
+                       valelem = key.findNextSibling('td')
+                       if valelem is None: continue
+                       valstr = ''.join(valelem.findAll(text=True))
+                       print `duty`, `valstr`
+                       match = re.match(valstr)
+                       if match is None: continue
+                       standing = match.group(1)
+                       skills[duty] = standing
+
+#                      print `duty`, `standing`
+#                      if standing not in standings: continue
+#                      for i in range(0, len(standings)):
+#                              print `duty`, `standing`, i
+#                              if standing == standings[i]:
+#                                      print `skills[duty]`
+#                                      skills[duty].append(i)
+
+#              self.skills = { }
+#              for skill in duties:
+                       
+               self.skills = skills
+       def __str__(self):
+               return `self.skills`
 
 def main():
        os.chdir(os.getenv('HOME'))
@@ -70,7 +116,7 @@ def main():
        # test program:
        global ocean
        ocean = 'midnight'
-       test = get_pirate_info('Aristarchus')
+       test = PirateInfo('Anaplian')
        print test
 
 main()