X-Git-Url: http://www.chiark.greenend.org.uk/ucgi/~yarrgweb/git?p=ypp-sc-tools.web-test.git;a=blobdiff_plain;f=yoweb-scrape;h=717b539131d8ffb0e4ba6abba5f6cddc3b38b199;hp=db5dc1338298a47a2c9120d2f93b168d07842ba5;hb=59824f48d09192d1c535adecf5ee2e4ee2ed47d5;hpb=e772c13a54236d87fede06c8f3364549b1c8e070 diff --git a/yoweb-scrape b/yoweb-scrape index db5dc13..717b539 100755 --- a/yoweb-scrape +++ b/yoweb-scrape @@ -45,6 +45,7 @@ import curses import termios import random import subprocess +import copy from optparse import OptionParser from StringIO import StringIO @@ -107,6 +108,18 @@ def yppsc_dir(): os.environ["YPPSC_YARRG_SRCBASE"] = lib return lib +soup_massage = copy.copy(BeautifulSoup.MARKUP_MASSAGE) +soup_massage.append( + (regexp.compile('(\ 0: need_wait += random.random() - 0.5 return need_wait - def _rate_limit_cache_clean(self, now): - need_wait = self.need_wait(now) + def _rate_limit_cache_clean(self, now, next_url=None): + need_wait = self.need_wait(now, next_url=next_url) if need_wait > 0: - debug('Fetcher wait %d' % need_wait) + debug('Fetcher wait %f' % need_wait) sleep(need_wait) def fetch(self, url, max_age): @@ -186,7 +210,7 @@ class Fetcher: return data debug('Fetcher fetch') - self._rate_limit_cache_clean(now) + self._rate_limit_cache_clean(now, next_url=url) stream = urllib2.urlopen(url) data = stream.read() @@ -252,9 +276,7 @@ class SomethingSoupInfo(SoupLog): def __init__(self, kind, tail, max_age): SoupLog.__init__(self) html = fetcher.yoweb(kind, tail, max_age) - self._soup = BeautifulSoup(html, - convertEntities=BeautifulSoup.HTML_ENTITIES - ) + self._soup = make_soup(html) #---------- scraper for pirate pages ---------- @@ -577,7 +599,7 @@ class IslandExtendedInfo(IslandBasicInfo): self.islandid = None self.yoweb_url = None - soup = BeautifulSoup(self.yppedia()) + soup = make_soup(self.yppedia()) content = soup.find('div', attrs = {'id': 'content'}) yoweb_re = regexp.compile('^http://\w+\.puzzlepirates\.com/'+ 'yoweb/island/info\.wm\?islandid=(\d+)$') @@ -598,7 +620,7 @@ class IslandExtendedInfo(IslandBasicInfo): debug('IEI COLLECT FLAGID '+`self.name`+' URL '+`yo`) if yo is None: return None dataf = fetcher.fetch(yo, 1800) - soup = BeautifulSoup(dataf) + soup = make_soup(dataf) ruler_re = regexp.compile( '/yoweb/flag/info\.wm\?flagid=(\d+)$') ruler = soup.find('a', attrs = { 'href': ruler_re })