From: Ian Jackson Date: Mon, 31 Aug 2009 15:14:51 +0000 (+0100) Subject: WIP ocean scraper before redo parser X-Git-Tag: 3.4~62 X-Git-Url: http://www.chiark.greenend.org.uk/ucgi/~yarrgweb/git?p=ypp-sc-tools.db-live.git;a=commitdiff_plain;h=1d5d1c30049c38ef1fc002b7403c695a6bfc469b WIP ocean scraper before redo parser --- diff --git a/yarrg/yppedia-ocean-scraper b/yarrg/yppedia-ocean-scraper index d6d2f24..104a408 100755 --- a/yarrg/yppedia-ocean-scraper +++ b/yarrg/yppedia-ocean-scraper @@ -15,6 +15,7 @@ ocean = 'Opal' soup = None def debug(k,v): +# print k,`v` pass def fetch(): @@ -25,21 +26,22 @@ def fetch(): soup = BeautifulSoup(dataf) -def title_arch_arch(t): +title_arch_re = regexp.compile('(\\S+) Archipelago \\((\\S+)\\)$') +title_any_re = regexp.compile('(\\S.*\\S) \((\\S+)\\)$') +href_img_re = regexp.compile('\\.png$') + +def title_arch_info(t): + # returns (arch,ocean) debug('checking',t) - if t is None: return None + if t is None: return (None,None) m = title_arch_re.match(t) - if not m: return None - return m.group(1) + if not m: return (None,None) + return m.groups() def title_arch_ok(t): - a = title_arch_arch(t) - if a is None: return False - return a == ocean - -title_arch_re = regexp.compile('\\S+ Archipelago \\((\\S+)\\)$') -title_any_re = regexp.compile('(\\S.*\\S) \((\\S+)\\)$') -href_img_re = regexp.compile('\\.png$') + (a,o) = title_arch_info(t) + if o is None: return False + return o == ocean def parse(): firstarch = soup.find('a', attrs = {'title': title_arch_ok}) @@ -68,8 +70,8 @@ def parse(): for arch in arches: links = arch.findAll('a', href=True) debug('links',links) - a = title_arch_arch(links[0]['title']) - assert(a) + (a,o) = title_arch_info(links[0]['title']) + assert(o == ocean) print 'arch', a for link in links[1:]: debug('link',link)