chiark / gitweb /
WIP fixes and also do entity conversion
author Ian Jackson <ian@liberator.relativity.greenend.org.uk>
Fri, 15 May 2009 18:24:37 +0000 (19:24 +0100)
committer Ian Jackson <ian@liberator.relativity.greenend.org.uk>
Fri, 15 May 2009 18:24:37 +0000 (19:24 +0100)
yoweb-scrape

index 0866667..4c7cee5 100755 (executable)
@@ -129,7 +129,7 @@ class PirateInfo(SoupLog):
                imgs = self.soup.findAll('img',
                        src=regexp.compile('/yoweb/images/stat.*'))
                re = regexp.compile(
-u'\\s*\\S*/([-A-Za-z]+)\\s*$|\\s*\\S*/\\S*\\s*\\(ocean\\-wide\\&nbsp\\;([-A-Za-z]+)\\)\\s*$'
+u'\\s*\\S*/([-A-Za-z]+)\\s*$|\\s*\\S*/\\S*\\s*\\(ocean\\-wide(?:\\s|\\xa0)+([-A-Za-z]+)\\)\\s*$'
                        )
                standings = { }
 
@@ -157,8 +157,8 @@ u'\\s*\\S*/([-A-Za-z]+)\\s*$|\\s*\\S*/\\S*\\s*\\(ocean\\-wide\\&nbsp\\;([-A-Za-z
                        valstr = ''.join(valelem.findAll(text=True))
                        match = re.match(valstr)
                        if match is None:
-                               skl.soupm(key, 'duty "%s" unparseable'+
-                                       ' standing "%s"' % (duty, valstr))
+                               skl.soupm(key, ('duty "%s" unparseable'+
+                                       ' standing "%s"') % (duty, valstr))
                                continue
                        standing = match.group(match.lastindex)
                        standings[duty].append(standing)
@@ -208,7 +208,7 @@ u'\\s*\\S*/([-A-Za-z]+)\\s*$|\\s*\\S*/\\S*\\s*\\(ocean\\-wide\\&nbsp\\;([-A-Za-z
 
                html = fetcher.yoweb('pirate.wm?target=', pirate)
                self.soup = BeautifulSoup(html,
-#                      convertEntities=BeautifulSoup.HTML_ENTITIES
+                       convertEntities=BeautifulSoup.HTML_ENTITIES
                        )
 
                self._find_standings()