import termios
import random
import subprocess
+import copy
from optparse import OptionParser
from StringIO import StringIO
os.environ["YPPSC_YARRG_SRCBASE"] = lib
return lib
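+# Yoweb apparently emits malformed table cells in which a bare "center"
+# token follows another attribute; rewrite it to align="center" so
+# BeautifulSoup can parse the row.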
+soup_massage = copy.copy(BeautifulSoup.MARKUP_MASSAGE)
+soup_massage.append(
+    (regexp.compile('(\<td.*") ("center")'),
+     lambda m: m.group(1)+' align='+m.group(2))
+    )
+
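+# Construct every soup through one helper, so all pages get the same
+# entity conversion and markup massage.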
+def make_soup(*args, **kwargs):
+    return BeautifulSoup(*args,
+        convertEntities=BeautifulSoup.HTML_ENTITIES,
+        markupMassage=soup_massage,
+        **kwargs)
+
#---------- caching and rate-limiting data fetcher ----------
class Fetcher:
            if oe.errno != errno.EEXIST: raise
        self._cache_scan(time.time())
-    def _cache_scan(self, now):
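+    # Reduce a URL to its bare hostname; scheme and path are irrelevant
+    # when deciding whether two fetches hit the same site.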
+    def _match_url_normalise(self, url):
+        without_scheme = regexp.sub('^[a-z]+://', '', url)
+        without_tail = regexp.sub('/.*', '', without_scheme)
+        return without_tail
+
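+    # When match_url is supplied, only cache entries for the same host
+    # are counted, making the rate limit per-site.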
+    def _cache_scan(self, now, match_url=None):
        # returns list of ages, unsorted
+        if match_url is not None:
+            match_url = self._match_url_normalise(match_url)
        ages = []
        debug('Fetcher scan_cache')
        for leaf in os.listdir(self.cachedir):
            if not leaf.startswith('#'): continue
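+            # Cache leafnames are the quoted URL wrapped in '#'; decode
+            # each one and skip entries that belong to a different site.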
+            if match_url is not None:
+                leaf_url = urllib.unquote_plus(leaf.strip('#'))
+                leaf_url = self._match_url_normalise(leaf_url)
+                if leaf_url != match_url:
+                    continue
            path = self.cachedir + '/' + leaf
            try: s = os.stat(path)
            except (OSError,IOError), oe:
            ages.append(age)
        return ages
-    def need_wait(self, now, imaginary=[]):
-        ages = self._cache_scan(now)
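+    # next_url, when given, limits the age list to fetches from the same
+    # site, so unrelated cached pages no longer delay this fetch.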
+    def need_wait(self, now, imaginary=[], next_url=None):
+        ages = self._cache_scan(now, match_url=next_url)
        ages += imaginary
        ages.sort()
        debug('Fetcher ages ' + `ages`)
                    (min_age, age))
                need_wait = max(need_wait, min_age - age)
            min_age += 3
-            min_age *= 1.25
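+        # Jitter the wait by up to half a second either way so the
+        # requests do not land at perfectly regular intervals.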
        if need_wait > 0:
            need_wait += random.random() - 0.5
        return need_wait
-    def _rate_limit_cache_clean(self, now):
-        need_wait = self.need_wait(now)
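+    # Thread the upcoming URL through so the wait is computed per-site.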
+    def _rate_limit_cache_clean(self, now, next_url=None):
+        need_wait = self.need_wait(now, next_url=next_url)
        if need_wait > 0:
-            debug('Fetcher wait %d' % need_wait)
+            debug('Fetcher wait %f' % need_wait)
            sleep(need_wait)
    def fetch(self, url, max_age):
            return data
        debug('Fetcher fetch')
-        self._rate_limit_cache_clean(now)
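+        # Apply the (now per-site) rate limit before touching the network.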
+        self._rate_limit_cache_clean(now, next_url=url)
        stream = urllib2.urlopen(url)
        data = stream.read()
    def __init__(self, kind, tail, max_age):
        SoupLog.__init__(self)
        html = fetcher.yoweb(kind, tail, max_age)
-        self._soup = BeautifulSoup(html,
-            convertEntities=BeautifulSoup.HTML_ENTITIES
-            )
+        self._soup = make_soup(html)
#---------- scraper for pirate pages ----------
        self.islandid = None
        self.yoweb_url = None
-        soup = BeautifulSoup(self.yppedia())
+        soup = make_soup(self.yppedia())
        content = soup.find('div', attrs = {'id': 'content'})
        yoweb_re = regexp.compile('^http://\w+\.puzzlepirates\.com/'+
            'yoweb/island/info\.wm\?islandid=(\d+)$')
        debug('IEI COLLECT FLAGID '+`self.name`+' URL '+`yo`)
        if yo is None: return None
        dataf = fetcher.fetch(yo, 1800)
-        soup = BeautifulSoup(dataf)
+        soup = make_soup(dataf)
        ruler_re = regexp.compile(
            '/yoweb/flag/info\.wm\?flagid=(\d+)$')
        ruler = soup.find('a', attrs = { 'href': ruler_re })