import termios
import random
import subprocess
+import copy
from optparse import OptionParser
from StringIO import StringIO
os.environ["YPPSC_YARRG_SRCBASE"] = lib
return lib
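+# Yoweb apparently emits malformed table cells in which a bare "center"
+# token follows another attribute; rewrite it to align="center" so
+# BeautifulSoup can parse the row.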
+soup_massage = copy.copy(BeautifulSoup.MARKUP_MASSAGE)
+soup_massage.append(
+    (regexp.compile('(\<td.*") ("center")'),
+     lambda m: m.group(1)+' align='+m.group(2))
+    )
+
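+# Construct every soup through one helper, so all pages get the same
+# entity conversion and markup massage.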
+def make_soup(*args, **kwargs):
+    return BeautifulSoup(*args,
+        convertEntities=BeautifulSoup.HTML_ENTITIES,
+        markupMassage=soup_massage,
+        **kwargs)
+
#---------- caching and rate-limiting data fetcher ----------
class Fetcher:
            if oe.errno != errno.EEXIST: raise
        self._cache_scan(time.time())
-    def _cache_scan(self, now):
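+    # Reduce a URL to its bare hostname; scheme and path are irrelevant
+    # when deciding whether two fetches hit the same site.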
+    def _match_url_normalise(self, url):
+        without_scheme = regexp.sub('^[a-z]+://', '', url)
+        without_tail = regexp.sub('/.*', '', without_scheme)
+        return without_tail
+
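+    # When match_url is supplied, only cache entries for the same host
+    # are counted, making the rate limit per-site.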
+    def _cache_scan(self, now, match_url=None):
        # returns list of ages, unsorted
+        if match_url is not None:
+            match_url = self._match_url_normalise(match_url)
        ages = []
        debug('Fetcher scan_cache')
        for leaf in os.listdir(self.cachedir):
            if not leaf.startswith('#'): continue
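+            # Cache leafnames are the quoted URL wrapped in '#'; decode
+            # each one and skip entries that belong to a different site.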
+            if match_url is not None:
+                leaf_url = urllib.unquote_plus(leaf.strip('#'))
+                leaf_url = self._match_url_normalise(leaf_url)
+                if leaf_url != match_url:
+                    continue
            path = self.cachedir + '/' + leaf
            try: s = os.stat(path)
            except (OSError,IOError), oe:
            ages.append(age)
        return ages
-    def need_wait(self, now, imaginary=[]):
-        ages = self._cache_scan(now)
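+    # next_url, when given, limits the age list to fetches from the same
+    # site, so unrelated cached pages no longer delay this fetch.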
+    def need_wait(self, now, imaginary=[], next_url=None):
+        ages = self._cache_scan(now, match_url=next_url)
        ages += imaginary
        ages.sort()
        debug('Fetcher ages ' + `ages`)
                    (min_age, age))
                need_wait = max(need_wait, min_age - age)
            min_age += 3
-            min_age *= 1.25
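+        # Jitter the wait by up to half a second either way so the
+        # requests do not land at perfectly regular intervals.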
        if need_wait > 0:
            need_wait += random.random() - 0.5
        return need_wait
-    def _rate_limit_cache_clean(self, now):
-        need_wait = self.need_wait(now)
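+    # Thread the upcoming URL through so the wait is computed per-site.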
+    def _rate_limit_cache_clean(self, now, next_url=None):
+        need_wait = self.need_wait(now, next_url=next_url)
        if need_wait > 0:
-            debug('Fetcher wait %d' % need_wait)
+            debug('Fetcher wait %f' % need_wait)
            sleep(need_wait)
    def fetch(self, url, max_age):
            return data
        debug('Fetcher fetch')
-        self._rate_limit_cache_clean(now)
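+        # Apply the (now per-site) rate limit before touching the network.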
+        self._rate_limit_cache_clean(now, next_url=url)
        stream = urllib2.urlopen(url)
        data = stream.read()
    def __init__(self, kind, tail, max_age):
        SoupLog.__init__(self)
        html = fetcher.yoweb(kind, tail, max_age)
-        self._soup = BeautifulSoup(html,
-            convertEntities=BeautifulSoup.HTML_ENTITIES
-            )
+        self._soup = make_soup(html)
#---------- scraper for pirate pages ----------
        self.islandid = None
        self.yoweb_url = None
-        soup = BeautifulSoup(self.yppedia())
+        soup = make_soup(self.yppedia())
        content = soup.find('div', attrs = {'id': 'content'})
        yoweb_re = regexp.compile('^http://\w+\.puzzlepirates\.com/'+
            'yoweb/island/info\.wm\?islandid=(\d+)$')
        debug('IEI COLLECT FLAGID '+`self.name`+' URL '+`yo`)
        if yo is None: return None
        dataf = fetcher.fetch(yo, 1800)
-        soup = BeautifulSoup(dataf)
+        soup = make_soup(dataf)
        ruler_re = regexp.compile(
            '/yoweb/flag/info\.wm\?flagid=(\d+)$')
        ruler = soup.find('a', attrs = { 'href': ruler_re })