chiark
/
gitweb
/
~ijackson
/
ypp-sc-tools.git
/ commitdiff
commit
grep
author
committer
pickaxe
?
search:
re
summary
|
shortlog
|
log
|
commit
| commitdiff |
tree
raw
|
patch
|
inline
| side by side (parent:
f7cbc65
)
yoweb-scrape: add a markup massager to cope with broken html from yoweb
author
Ian Jackson
<ijackson@chiark.greenend.org.uk>
Sun, 30 Oct 2011 13:29:41 +0000
(13:29 +0000)
committer
Ian Jackson
<ijackson@chiark.greenend.org.uk>
Sun, 30 Oct 2011 13:29:41 +0000
(13:29 +0000)
yoweb-scrape
patch
|
blob
|
history
diff --git
a/yoweb-scrape
b/yoweb-scrape
index db5dc1338298a47a2c9120d2f93b168d07842ba5..2bcd518c44d036ad8d990305ab7e03488ba195e5 100755
(executable)
--- a/
yoweb-scrape
+++ b/
yoweb-scrape
@@
-45,6
+45,7
@@
import curses
import termios
import random
import subprocess
import termios
import random
import subprocess
+import copy
from optparse import OptionParser
from StringIO import StringIO
from optparse import OptionParser
from StringIO import StringIO
@@
-107,6
+108,18
@@
def yppsc_dir():
os.environ["YPPSC_YARRG_SRCBASE"] = lib
return lib
os.environ["YPPSC_YARRG_SRCBASE"] = lib
return lib
+soup_massage = copy.copy(BeautifulSoup.MARKUP_MASSAGE)
+soup_massage.append(
+ (regexp.compile('(\<td.*") ("center")'),
+ lambda m: m.group(1)+' align='+m.group(2))
+ )
+
+def make_soup(*args, **kwargs):
+ return BeautifulSoup(*args,
+ convertEntities=BeautifulSoup.HTML_ENTITIES,
+ markupMassage=soup_massage,
+ **kwargs)
+
#---------- caching and rate-limiting data fetcher ----------
class Fetcher:
#---------- caching and rate-limiting data fetcher ----------
class Fetcher:
@@
-252,9
+265,7
@@
class SomethingSoupInfo(SoupLog):
def __init__(self, kind, tail, max_age):
SoupLog.__init__(self)
html = fetcher.yoweb(kind, tail, max_age)
def __init__(self, kind, tail, max_age):
SoupLog.__init__(self)
html = fetcher.yoweb(kind, tail, max_age)
- self._soup = BeautifulSoup(html,
- convertEntities=BeautifulSoup.HTML_ENTITIES
- )
+ self._soup = make_soup(html)
#---------- scraper for pirate pages ----------
#---------- scraper for pirate pages ----------
@@
-577,7
+588,7
@@
class IslandExtendedInfo(IslandBasicInfo):
self.islandid = None
self.yoweb_url = None
self.islandid = None
self.yoweb_url = None
-        soup = BeautifulSoup(self.yppedia())
+        soup = make_soup(self.yppedia())
content = soup.find('div', attrs = {'id': 'content'})
yoweb_re = regexp.compile('^http://\w+\.puzzlepirates\.com/'+
'yoweb/island/info\.wm\?islandid=(\d+)$')
content = soup.find('div', attrs = {'id': 'content'})
yoweb_re = regexp.compile('^http://\w+\.puzzlepirates\.com/'+
'yoweb/island/info\.wm\?islandid=(\d+)$')
@@
-598,7
+609,7
@@
class IslandExtendedInfo(IslandBasicInfo):
debug('IEI COLLECT FLAGID '+`self.name`+' URL '+`yo`)
if yo is None: return None
dataf = fetcher.fetch(yo, 1800)
debug('IEI COLLECT FLAGID '+`self.name`+' URL '+`yo`)
if yo is None: return None
dataf = fetcher.fetch(yo, 1800)
-        soup = BeautifulSoup(dataf)
+        soup = make_soup(dataf)
ruler_re = regexp.compile(
'/yoweb/flag/info\.wm\?flagid=(\d+)$')
ruler = soup.find('a', attrs = { 'href': ruler_re })
ruler_re = regexp.compile(
'/yoweb/flag/info\.wm\?flagid=(\d+)$')
ruler = soup.find('a', attrs = { 'href': ruler_re })