chiark
/
gitweb
/
~yarrgweb
/
ypp-sc-tools.db-test.git
/ blobdiff
commit
grep
author
committer
pickaxe
?
search:
re
summary
|
shortlog
|
log
|
commit
|
commitdiff
|
tree
raw
|
inline
| side by side
WIP fixes and also do entity conversion
[ypp-sc-tools.db-test.git]
/
yoweb-scrape
diff --git
a/yoweb-scrape
b/yoweb-scrape
index 0866667d7dd6982d20548f1393875e7bf282f8ef..4c7cee50db2024c42735f1259d715d88b74af855 100755
(executable)
--- a/
yoweb-scrape
+++ b/
yoweb-scrape
@@
-129,7
+129,7
@@
class PirateInfo(SoupLog):
imgs = self.soup.findAll('img',
src=regexp.compile('/yoweb/images/stat.*'))
re = regexp.compile(
imgs = self.soup.findAll('img',
src=regexp.compile('/yoweb/images/stat.*'))
re = regexp.compile(
-u'\\s*\\S*/([-A-Za-z]+)\\s*$|\\s*\\S*/\\S*\\s*\\(ocean\\-wide
\\&nbsp\\;
([-A-Za-z]+)\\)\\s*$'
+u'\\s*\\S*/([-A-Za-z]+)\\s*$|\\s*\\S*/\\S*\\s*\\(ocean\\-wide
(?:\\s|\\xa0)+
([-A-Za-z]+)\\)\\s*$'
)
standings = { }
)
standings = { }
@@
-157,8
+157,8
@@
u'\\s*\\S*/([-A-Za-z]+)\\s*$|\\s*\\S*/\\S*\\s*\\(ocean\\-wide\\&nbsp\\;([-A-Za-z
valstr = ''.join(valelem.findAll(text=True))
match = re.match(valstr)
if match is None:
valstr = ''.join(valelem.findAll(text=True))
match = re.match(valstr)
if match is None:
- skl.soupm(key, 'duty "%s" unparseable'+
- ' standing "%s"' % (duty, valstr))
+ skl.soupm(key,
(
'duty "%s" unparseable'+
+ ' standing "%s"'
)
% (duty, valstr))
continue
standing = match.group(match.lastindex)
standings[duty].append(standing)
continue
standing = match.group(match.lastindex)
standings[duty].append(standing)
@@
-208,7
+208,7
@@
u'\\s*\\S*/([-A-Za-z]+)\\s*$|\\s*\\S*/\\S*\\s*\\(ocean\\-wide\\&nbsp\\;([-A-Za-z
html = fetcher.yoweb('pirate.wm?target=', pirate)
self.soup = BeautifulSoup(html,
html = fetcher.yoweb('pirate.wm?target=', pirate)
self.soup = BeautifulSoup(html,
-
#
convertEntities=BeautifulSoup.HTML_ENTITIES
+
convertEntities=BeautifulSoup.HTML_ENTITIES
)
self._find_standings()
)
self._find_standings()