From: Ian Jackson Date: Fri, 15 May 2009 18:24:37 +0000 (+0100) Subject: WIP fixes and also do entity conversion X-Git-Tag: 1.0~74 X-Git-Url: https://www.chiark.greenend.org.uk/ucgi/~yarrgweb/git?p=ypp-sc-tools.db-test.git;a=commitdiff_plain;h=ccf8df182d4d7b5258b87ed6907920f738ff32d2 WIP fixes and also do entity conversion --- diff --git a/yoweb-scrape b/yoweb-scrape index 0866667..4c7cee5 100755 --- a/yoweb-scrape +++ b/yoweb-scrape @@ -129,7 +129,7 @@ class PirateInfo(SoupLog): imgs = self.soup.findAll('img', src=regexp.compile('/yoweb/images/stat.*')) re = regexp.compile( -u'\\s*\\S*/([-A-Za-z]+)\\s*$|\\s*\\S*/\\S*\\s*\\(ocean\\-wide\\ \\;([-A-Za-z]+)\\)\\s*$' +u'\\s*\\S*/([-A-Za-z]+)\\s*$|\\s*\\S*/\\S*\\s*\\(ocean\\-wide(?:\\s|\\xa0)+([-A-Za-z]+)\\)\\s*$' ) standings = { } @@ -157,8 +157,8 @@ u'\\s*\\S*/([-A-Za-z]+)\\s*$|\\s*\\S*/\\S*\\s*\\(ocean\\-wide\\ \\;([-A-Za-z valstr = ''.join(valelem.findAll(text=True)) match = re.match(valstr) if match is None: - skl.soupm(key, 'duty "%s" unparseable'+ - ' standing "%s"' % (duty, valstr)) + skl.soupm(key, ('duty "%s" unparseable'+ + ' standing "%s"') % (duty, valstr)) continue standing = match.group(match.lastindex) standings[duty].append(standing) @@ -208,7 +208,7 @@ u'\\s*\\S*/([-A-Za-z]+)\\s*$|\\s*\\S*/\\S*\\s*\\(ocean\\-wide\\ \\;([-A-Za-z html = fetcher.yoweb('pirate.wm?target=', pirate) self.soup = BeautifulSoup(html, -# convertEntities=BeautifulSoup.HTML_ENTITIES + convertEntities=BeautifulSoup.HTML_ENTITIES ) self._find_standings()