From 1d5d1c30049c38ef1fc002b7403c695a6bfc469b Mon Sep 17 00:00:00 2001 From: Ian Jackson Date: Mon, 31 Aug 2009 16:14:51 +0100 Subject: [PATCH] WIP ocean scraper before redo parser --- yarrg/yppedia-ocean-scraper | 28 +++++++++++++++------------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/yarrg/yppedia-ocean-scraper b/yarrg/yppedia-ocean-scraper index d6d2f24..104a408 100755 --- a/yarrg/yppedia-ocean-scraper +++ b/yarrg/yppedia-ocean-scraper @@ -15,6 +15,7 @@ ocean = 'Opal' soup = None def debug(k,v): +# print k,`v` pass def fetch(): @@ -25,21 +26,22 @@ def fetch(): soup = BeautifulSoup(dataf) -def title_arch_arch(t): +title_arch_re = regexp.compile('(\\S+) Archipelago \\((\\S+)\\)$') +title_any_re = regexp.compile('(\\S.*\\S) \((\\S+)\\)$') +href_img_re = regexp.compile('\\.png$') + +def title_arch_info(t): + # returns (arch,ocean) debug('checking',t) - if t is None: return None + if t is None: return (None,None) m = title_arch_re.match(t) - if not m: return None - return m.group(1) + if not m: return (None,None) + return m.groups() def title_arch_ok(t): - a = title_arch_arch(t) - if a is None: return False - return a == ocean - -title_arch_re = regexp.compile('\\S+ Archipelago \\((\\S+)\\)$') -title_any_re = regexp.compile('(\\S.*\\S) \((\\S+)\\)$') -href_img_re = regexp.compile('\\.png$') + (a,o) = title_arch_info(t) + if o is None: return False + return o == ocean def parse(): firstarch = soup.find('a', attrs = {'title': title_arch_ok}) @@ -68,8 +70,8 @@ def parse(): for arch in arches: links = arch.findAll('a', href=True) debug('links',links) - a = title_arch_arch(links[0]['title']) - assert(a) + (a,o) = title_arch_info(links[0]['title']) + assert(o == ocean) print 'arch', a for link in links[1:]: debug('link',link) -- 2.30.2