X-Git-Url: http://www.chiark.greenend.org.uk/ucgi/~yarrgweb/git?p=ypp-sc-tools.web-test.git;a=blobdiff_plain;f=yoweb-scrape;h=717b539131d8ffb0e4ba6abba5f6cddc3b38b199;hp=db5dc1338298a47a2c9120d2f93b168d07842ba5;hb=59824f48d09192d1c535adecf5ee2e4ee2ed47d5;hpb=e772c13a54236d87fede06c8f3364549b1c8e070
diff --git a/yoweb-scrape b/yoweb-scrape
index db5dc13..717b539 100755
--- a/yoweb-scrape
+++ b/yoweb-scrape
@@ -45,6 +45,7 @@ import curses
import termios
import random
import subprocess
+import copy
from optparse import OptionParser
from StringIO import StringIO
@@ -107,6 +108,18 @@ def yppsc_dir():
os.environ["YPPSC_YARRG_SRCBASE"] = lib
return lib
+soup_massage = copy.copy(BeautifulSoup.MARKUP_MASSAGE)
+soup_massage.append(
+ (regexp.compile('(\
0:
need_wait += random.random() - 0.5
return need_wait
- def _rate_limit_cache_clean(self, now):
- need_wait = self.need_wait(now)
+ def _rate_limit_cache_clean(self, now, next_url=None):
+ need_wait = self.need_wait(now, next_url=next_url)
if need_wait > 0:
- debug('Fetcher wait %d' % need_wait)
+ debug('Fetcher wait %f' % need_wait)
sleep(need_wait)
def fetch(self, url, max_age):
@@ -186,7 +210,7 @@ class Fetcher:
return data
debug('Fetcher fetch')
- self._rate_limit_cache_clean(now)
+ self._rate_limit_cache_clean(now, next_url=url)
stream = urllib2.urlopen(url)
data = stream.read()
@@ -252,9 +276,7 @@ class SomethingSoupInfo(SoupLog):
def __init__(self, kind, tail, max_age):
SoupLog.__init__(self)
html = fetcher.yoweb(kind, tail, max_age)
- self._soup = BeautifulSoup(html,
- convertEntities=BeautifulSoup.HTML_ENTITIES
- )
+ self._soup = make_soup(html)
#---------- scraper for pirate pages ----------
@@ -577,7 +599,7 @@ class IslandExtendedInfo(IslandBasicInfo):
self.islandid = None
self.yoweb_url = None
- soup = BeautifulSoup(self.yppedia())
+ soup = make_soup(self.yppedia())
content = soup.find('div', attrs = {'id': 'content'})
yoweb_re = regexp.compile('^http://\w+\.puzzlepirates\.com/'+
'yoweb/island/info\.wm\?islandid=(\d+)$')
@@ -598,7 +620,7 @@ class IslandExtendedInfo(IslandBasicInfo):
debug('IEI COLLECT FLAGID '+`self.name`+' URL '+`yo`)
if yo is None: return None
dataf = fetcher.fetch(yo, 1800)
- soup = BeautifulSoup(dataf)
+ soup = make_soup(dataf)
ruler_re = regexp.compile(
'/yoweb/flag/info\.wm\?flagid=(\d+)$')
ruler = soup.find('a', attrs = { 'href': ruler_re })