chiark / gitweb /
yoweb-scrape: add a markup massager to cope with broken html from yoweb
author Ian Jackson <ijackson@chiark.greenend.org.uk>
Sun, 30 Oct 2011 13:29:41 +0000 (13:29 +0000)
committer Ian Jackson <ijackson@chiark.greenend.org.uk>
Sun, 30 Oct 2011 13:29:41 +0000 (13:29 +0000)
yoweb-scrape

index db5dc13..2bcd518 100755 (executable)
@@ -45,6 +45,7 @@ import curses
 import termios
 import random
 import subprocess
+import copy
 from optparse import OptionParser
 from StringIO import StringIO
 
@@ -107,6 +108,18 @@ def yppsc_dir():
        os.environ["YPPSC_YARRG_SRCBASE"] = lib
        return lib
 
+soup_massage = copy.copy(BeautifulSoup.MARKUP_MASSAGE)
+soup_massage.append(
+               (regexp.compile('(\<td.*") ("center")'),
+                lambda m: m.group(1)+' align='+m.group(2))
+       )
+
+def make_soup(*args, **kwargs):
+       return BeautifulSoup(*args,
+               convertEntities=BeautifulSoup.HTML_ENTITIES,
+               markupMassage=soup_massage,
+                        **kwargs)
+
 #---------- caching and rate-limiting data fetcher ----------
 
 class Fetcher:
@@ -252,9 +265,7 @@ class SomethingSoupInfo(SoupLog):
        def __init__(self, kind, tail, max_age):
                SoupLog.__init__(self)
                html = fetcher.yoweb(kind, tail, max_age)
-               self._soup = BeautifulSoup(html,
-                       convertEntities=BeautifulSoup.HTML_ENTITIES
-                       )
+               self._soup = make_soup(html)
 
 #---------- scraper for pirate pages ----------
 
@@ -577,7 +588,7 @@ class IslandExtendedInfo(IslandBasicInfo):
                self.islandid = None
                self.yoweb_url = None
 
-               soup = BeautifulSoup(self.yppedia())
+               soup = make_soup(self.yppedia())
                content = soup.find('div', attrs = {'id': 'content'})
                yoweb_re = regexp.compile('^http://\w+\.puzzlepirates\.com/'+
                        'yoweb/island/info\.wm\?islandid=(\d+)$')
@@ -598,7 +609,7 @@ class IslandExtendedInfo(IslandBasicInfo):
                debug('IEI COLLECT FLAGID '+`self.name`+' URL '+`yo`)
                if yo is None: return None
                dataf = fetcher.fetch(yo, 1800)
-               soup = BeautifulSoup(dataf)
+               soup = make_soup(dataf)
                ruler_re = regexp.compile(
                        '/yoweb/flag/info\.wm\?flagid=(\d+)$')
                ruler = soup.find('a', attrs = { 'href': ruler_re })