#!/usr/bin/python
"""Scrape pirate skill standings from the Puzzle Pirates "yoweb" pages.

Fetched pages are cached as files named '#<quoted-url>#' in
~/.yoweb-scrape-cache and reused for up to `max_age` seconds.
"""

import os
import time
import urllib
import urllib2
import errno
import sys
import re as regexp
import optparse

from BeautifulSoup import BeautifulSoup

# Maximum age, in seconds, of a cache file before it is refetched/removed.
max_age = 120
# Ocean name; used as the hostname prefix of the yoweb URL.
ocean = 'ice'

# Timestamp taken once at start-up; all cache-age checks compare against it.
now = time.time()

duties = ('Swordfighting/Bilging/Sailing/Rigging/Navigating'+
        '/Battle Navigation/Gunning/Carpentry/Rumble/Treasure Haul'+
        '/Drinking/Spades/Hearts/Treasure Drop/Poker/Distilling'+
        '/Alchemistry/Shipwrightery/Blacksmithing/Foraging').split('/')

# Standing names; PirateInfo.skills stores indexes into this list.
standings = ('Able/Distinguished/Respected/Master/Renowned'+
        '/Grand-Master/Legendary/Ultimate').split('/')


def fetch(url):
    """Return the body of `url`, using the on-disk cache when fresh.

    The cache lives in the current working directory (main() chdirs into
    the cache directory first).  A cache file older than `max_age` seconds
    (relative to `now`) is ignored and the URL is fetched again.

    Raises OSError/IOError for cache file errors other than "not found",
    and whatever urllib2.urlopen raises on network failure.
    """
    cache_corename = urllib.quote_plus(url)
    cache_basename = "#%s#" % cache_corename

    try:
        f = open(cache_basename, 'r')
    except (OSError, IOError) as oe:
        if oe.errno != errno.ENOENT:
            raise
        f = None

    if f is not None:
        # Always close the handle, including when the entry turns out stale.
        try:
            s = os.fstat(f.fileno())
            if now <= s.st_mtime + max_age:
                return f.read()
        finally:
            f.close()

    # Cache miss or stale entry: pause for a second before hitting the server.
    time.sleep(1)
    stream = urllib2.urlopen(url)
    try:
        data = stream.read()
    finally:
        stream.close()

    # Write to a per-process temporary name, then rename into place so that
    # a reader never opens a partially written cache file.
    cache_ourname = "#%s~%d#" % (cache_corename, os.getpid())
    f = open(cache_ourname, 'w')
    try:
        f.write(data)
    finally:
        f.close()
    os.rename(cache_ourname, cache_basename)
    return data


def yoweb_fetch(kind, tail):
    """Fetch the yoweb page `kind` + `tail` for the current `ocean`."""
    url = 'http://%s.puzzlepirates.com/yoweb/%s%s' % (ocean, kind, tail)
    return fetch(url)


class PirateInfo:
    """Skill standings scraped from one pirate's yoweb page.

    Public data members:
      pi.skills = { 'Treasure Haul': 0, ... }
          maps a duty name (from `duties`) to the index of its standing
          in `standings`; duties that could not be determined are absent.
      pi.msgs = [ 'message describing problem with scrape' ]
          emptied again if every duty in `duties` got a standing.
    """

    def _log(self, m):
        """Record a scrape problem message."""
        self.msgs.append(m)

    def _logsoup(self, soup, m):
        """Record a scrape problem together with the offending markup."""
        self._log(m + '; in ' + repr(soup))

    def __init__(self, pirate):
        """Fetch and parse the page for the pirate named `pirate`."""
        html = yoweb_fetch('pirate.wm?target=', pirate)
        # Entity conversion is deliberately left disabled:
        #   convertEntities=BeautifulSoup.HTML_ENTITIES
        soup = BeautifulSoup(html)
        imgs = soup.findAll('img',
                src=regexp.compile('/yoweb/images/stat.*'))
        # The standing is the word after the last '/', or, in the second
        # alternative, the word inside "(ocean-wide ...)".
        # NOTE(review): the separator after "ocean-wide" is matched
        # literally; confirm it still matches what the live page emits.
        standing_re = regexp.compile(
u'\s*\S*/([-A-Za-z]+)\s*$|\s*\S*/\S*\s*\(ocean\-wide\ \;([-A-Za-z]+)\)\s*$'
            )

        self.msgs = [ ]
        # Every standing string found for each duty, in page order.
        found = dict((duty, [ ]) for duty in duties)

        for img in imgs:
            try:
                duty = img['alt']
            except KeyError:
                continue

            if duty not in duties:
                self._logsoup(img, 'unknown duty: "%s"' % duty)
                continue
            key = img.findParent('td')
            if key is None:
                self._logsoup(img, 'duty at root! "%s"' % duty)
                continue
            valelem = key.findNextSibling('td')
            if valelem is None:
                self._logsoup(key,
                    'duty missing sibling "%s"' % duty)
                continue
            valstr = ''.join(valelem.findAll(text=True))
            match = standing_re.match(valstr)
            if match is None:
                # Join the message halves before formatting; '%' binds
                # tighter than '+'.
                self._logsoup(key,
                    'duty "%s" unparseable standing "%s"'
                    % (duty, valstr))
                continue
            # Exactly one of the two groups matched; lastindex picks it.
            found[duty].append(match.group(match.lastindex))

        self.skills = { }

        for duty in duties:
            sl = found[duty]
            if len(sl) > 1:
                self._log('duty "%s" multiple standings %s' %
                        (duty, repr(sl)))
                continue
            if not sl:
                self._log('duty "%s" no standing found' % duty)
                continue
            standing = sl[0]
            if standing in standings:
                self.skills[duty] = standings.index(standing)
            else:
                self._log('duty "%s" unknown standing "%s"' %
                    (duty, standing))

        # A fully successful scrape discards any messages collected on
        # the way (e.g. about unrelated images).
        if all(duty in self.skills for duty in duties):
            self.msgs = [ ]

    def __str__(self):
        return repr(self.skills)


def main():
    """Enter the cache directory, expire old cache files, run the test."""
    global ocean

    os.chdir(os.getenv('HOME'))
    cache_dir = '.yoweb-scrape-cache'
    try:
        os.chdir(cache_dir)
    except (OSError, IOError) as oe:
        if oe.errno != errno.ENOENT:
            raise
        os.mkdir(cache_dir)
        os.chdir(cache_dir)

    for path in os.listdir('.'):
        if not path.startswith('#'):
            continue
        # Names containing '~' are the temporary files fetch() writes
        # before renaming; they expire after 10 seconds.
        max_time = max_age
        if '~' in path:
            max_time = 10
        try:
            s = os.stat(path)
            if now > s.st_mtime + max_time:
                os.remove(path)
        except (OSError, IOError) as oe:
            # The file may vanish between listdir() and stat()/remove().
            if oe.errno != errno.ENOENT:
                raise

    # test program:
    ocean = 'midnight'
    test = PirateInfo('Anaplian')
    print(test)


main()