#!/usr/bin/python
+import signal
+signal.signal(signal.SIGINT, signal.SIG_DFL)
+
import os
import time
import urllib
import urllib2
import errno
+import sys
+import re as regexp
+from optparse import OptionParser
from BeautifulSoup import BeautifulSoup
# Global option object; populated by main() after argument parsing.
opts = None

# Duty puzzles exactly as they are named on a pirate's yoweb page.
duties = ('Swordfighting/Bilging/Sailing/Rigging/Navigating'+
	'/Battle Navigation/Gunning/Carpentry/Rumble/Treasure Haul'+
	'/Drinking/Spades/Hearts/Treasure Drop/Poker/Distilling'+
	'/Alchemistry/Shipwrightery/Blacksmithing/Foraging').split('/')

# Standing names in ascending order; a duty's numeric standing is the
# index into this list.
standingvals = ('Able/Distinguished/Respected/Master/Renowned'+
	'/Grand-Master/Legendary/Ultimate').split('/')

def debug(msg):
	# Emit a diagnostic line on stderr, but only when --debug was given.
	if opts.debug:
		sys.stderr.write('%s\n' % msg)
+
class Fetcher:
	"""Fetches yoweb pages for one ocean, with a filesystem cache.

	Pages are cached in `cachedir` for opts.max_age seconds.  The ages
	of the cached files also drive a crude rate limiter: the i'th most
	recently fetched page must be at least ~2^i seconds old before we
	hit the server again, otherwise we sleep.
	"""
	def __init__(self, ocean, cachedir):
		debug('Fetcher init %s' % cachedir)
		self.ocean = ocean
		self.cachedir = cachedir
		try: os.mkdir(cachedir)
		except (OSError,IOError) as oe:
			# a pre-existing cache directory is fine
			if oe.errno != errno.EEXIST: raise

	def _rate_limit_cache_clean(self, now):
		# Expire stale cache entries, then sleep long enough that
		# the history of recent fetches respects exponential spacing.
		ages = []
		for leaf in os.listdir(self.cachedir):
			if not leaf.startswith('#'): continue
			# bug fix: listdir yields leafnames; stat/remove must
			# be applied relative to the cache directory (we never
			# chdir there)
			path = self.cachedir + '/' + leaf
			try: s = os.stat(path)
			except (OSError,IOError) as oe:
				if oe.errno != errno.ENOENT: raise
				continue
			age = now - s.st_mtime
			if age > opts.max_age:
				debug('Fetcher expire %d %s' % (age, path))
				try: os.remove(path)
				except (OSError,IOError) as oe:
					if oe.errno != errno.ENOENT: raise
				continue
			ages.append(age)
		ages.sort()
		debug('Fetcher ages ' + repr(ages))
		min_age = 1
		need_wait = 0
		for age in ages:
			if age < min_age:
				debug('Fetcher morewait min=%d age=%d' %
					(min_age, age))
				# bug fix: wait until this entry reaches
				# min_age (the original computed age-min_age,
				# which is negative here, so it never waited)
				need_wait = max(need_wait, min_age - age)
			min_age *= 2
			min_age += 1
		if need_wait:
			debug('Fetcher wait %d' % need_wait)
			# bug fix: os.sleep does not exist; time.sleep does
			time.sleep(need_wait)

	def fetch(self, url):
		"""Return the body of `url`, via the cache."""
		debug('Fetcher fetch %s' % url)
		cache_corename = urllib.quote_plus(url)
		cache_item = "%s/#%s#" % (self.cachedir, cache_corename)
		try: f = open(cache_item, 'r')
		except (OSError,IOError) as oe:
			if oe.errno != errno.ENOENT: raise
			f = None
		now = time.time()
		if f is not None:
			s = os.fstat(f.fileno())
			if now > s.st_mtime + opts.max_age:
				debug('Fetcher stale')
				f.close()	# don't leak the stale handle
				f = None
		if f is not None:
			data = f.read()
			f.close()
			debug('Fetcher cached')
			return data

		debug('Fetcher fetch')
		self._rate_limit_cache_clean(now)

		stream = urllib2.urlopen(url)
		data = stream.read()
		# write to a per-pid temporary and rename into place, so a
		# concurrent invocation never reads a half-written entry
		cache_tmp = "%s/#%s~%d#" % (
			self.cachedir, cache_corename, os.getpid())
		f = open(cache_tmp, 'w')
		f.write(data)
		f.close()
		os.rename(cache_tmp, cache_item)
		debug('Fetcher stored')
		return data

	def yoweb(self, kind, tail):
		"""Fetch /yoweb/<kind><tail> from this ocean's webserver."""
		url = 'http://%s.puzzlepirates.com/yoweb/%s%s' % (
			self.ocean, kind, tail)
		return self.fetch(url)
+
class SoupLog:
	"""Accumulates messages describing problems found while scraping."""
	def __init__(self):
		self.msgs = []
	def msg(self, m):
		# record one problem message
		self.msgs.append(m)
	def soupm(self, obj, m):
		# record a message annotated with the soup object involved
		self.msg('%s; in %s' % (m, repr(obj)))
	def needs_msgs(self, child_souplog):
		# absorb (and clear out) the messages collected by a child
		self.msgs.extend(child_souplog.msgs)
		child_souplog.msgs = []
+
class PirateInfo(SoupLog):
	"""Scrape one pirate's yoweb page.

	Public data members:
	  pi.standings = { 'Treasure Haul': 3, ... }
	      (values are integer indices into standingvals, NOT the
	      standing names themselves)
	  pi.crew = (id, name) or None
	  pi.flag = (id, name) or None
	  pi.msgs = [ 'message describing problem with scrape' ]
	"""

	def _find_standings(self):
		# Each duty appears as a stat image whose alt text names the
		# duty; the standing text lives in the next <td> sibling.
		imgs = self.soup.findAll('img',
			src=regexp.compile('/yoweb/images/stat.*'))
		standing_re = regexp.compile(
u'\\s*\\S*/([-A-Za-z]+)\\s*$|\\s*\\S*/\\S*\\s*\\(ocean\\-wide\\ \\;([-A-Za-z]+)\\)\\s*$'
			)
		standings = { }
		for skill in duties:
			standings[skill] = [ ]

		skl = SoupLog()

		for img in imgs:
			try: duty = img['alt']
			except KeyError: continue

			if not duty in duties:
				skl.soupm(img, 'unknown duty: "%s"' % duty)
				continue
			key = img.findParent('td')
			if key is None:
				skl.soupm(img, 'duty at root! "%s"' % duty)
				continue
			valelem = key.findNextSibling('td')
			if valelem is None:
				skl.soupm(key, 'duty missing sibling "%s"'
					% duty)
				continue
			valstr = ''.join(valelem.findAll(text=True))
			match = standing_re.match(valstr)
			if match is None:
				# bug fix: parenthesize the concatenation;
				# the original applied % to the second
				# literal only, raising TypeError here
				skl.soupm(key, ('duty "%s" unparseable'+
					' standing "%s"') % (duty, valstr))
				continue
			standing = match.group(match.lastindex)
			standings[duty].append(standing)

		self.standings = { }

		for duty in duties:
			sl = standings[duty]
			if len(sl) > 1:
				skl.msg('duty "%s" multiple standings %s' %
					(duty, repr(sl)))
				continue
			if not len(sl):
				skl.msg('duty "%s" no standing found' % duty)
				continue
			standing = sl[0]
			# bug fix: search the whole list; the original's
			# range(0, len-1) could never match the final
			# standing ('Ultimate')
			for i in range(len(standingvals)):
				if standing == standingvals[i]:
					self.standings[duty] = i
			if not duty in self.standings:
				skl.msg('duty "%s" unknown standing "%s"' %
					(duty, standing))

		# propagate the detail messages only if some duty's
		# standing could not be determined (once is enough;
		# needs_msgs empties the child's list)
		for duty in duties:
			if not duty in self.standings:
				self.needs_msgs(skl)
				break

	def _find_crewflag(self, cf, yoweb_re):
		# Find the single <a href=...> matching yoweb_re and return
		# (id, name); on failure record a message and return None.
		things = self.soup.findAll('a', href=regexp.compile(yoweb_re))
		if len(things) != 1:
			self.msg('zero or several %s id references found' % cf)
			return None
		thing = things[0]
		id_re = '\\b%sid\\=(\\w+)$' % cf
		id_haystack = thing['href']
		match = regexp.compile(id_re).search(id_haystack)
		if match is None:
			self.soupm(thing, ('incomprehensible %s id ref'+
				' (%s in %s)') % (cf, id_re, id_haystack))
			return None
		name = ''.join(thing.findAll(text=True))
		return (match.group(1), name)

	def __init__(self, pirate):
		SoupLog.__init__(self)

		html = fetcher.yoweb('pirate.wm?target=', pirate)
		self.soup = BeautifulSoup(html,
#			convertEntities=BeautifulSoup.HTML_ENTITIES
			)

		self._find_standings()

		self.crew = self._find_crewflag('crew',
			'^/yoweb/crew/info\\.wm')
		self.flag = self._find_crewflag('flag',
			'^/yoweb/flag/info\\.wm')

	def __str__(self):
		return repr((self.crew, self.flag, self.standings, self.msgs))
def main():
	"""Parse the command line, build the global Fetcher, and run.

	Currently only the hard-wired test scrape is implemented; the
	actions in the usage string are not yet dispatched on `args`.
	"""
	global opts, fetcher

	pa = OptionParser(
'''usage: .../yoweb-scrape [OPTION...] ACTION [ARGS...]
actions:
 yoweb-scrape [--ocean OCEAN ...] pirate PIRATE
 yoweb-scrape [--ocean OCEAN ...] crew-of PIRATE
 yoweb-scrape [--ocean OCEAN ...] dutytab-crew-of PIRATE
''')
	ao = pa.add_option
	ao('-O','--ocean',dest='ocean', metavar='OCEAN',
		default='ice',
		help='select ocean OCEAN')
	ao('--cache-dir', dest='cache_dir', metavar='DIR',
		default='~/.yoweb-scrape-cache',
		help='cache yoweb pages in DIR')
	ao('-D','--debug', action='store_true', dest='debug', default=False,
		help='enable debugging output')
	ao('-q','--quiet', action='store_true', dest='quiet',
		help='suppress warning output')
	(opts,args) = pa.parse_args()

	# fixed parameters
	opts.max_age = 240

	# robustness fix: expanduser copes with an unset $HOME (the
	# original concatenated os.getenv('HOME'), which raises
	# TypeError when HOME is missing) and with ~user paths
	opts.cache_dir = os.path.expanduser(opts.cache_dir)

	fetcher = Fetcher(opts.ocean, opts.cache_dir)
	# test program:
	test = PirateInfo('Anaplian')
	print(test)

# idiom fix: guard the entry point so importing this module has no
# side effects
if __name__ == '__main__':
	main()