chiark / gitweb /
WIP ocean scraper before redo parser
author Ian Jackson <ian@liberator.relativity.greenend.org.uk>
Mon, 31 Aug 2009 15:14:51 +0000 (16:14 +0100)
committer Ian Jackson <ian@liberator.relativity.greenend.org.uk>
Mon, 31 Aug 2009 15:14:51 +0000 (16:14 +0100)
yarrg/yppedia-ocean-scraper

index d6d2f24..104a408 100755 (executable)
@@ -15,6 +15,7 @@ ocean = 'Opal'
 soup = None
 
 def debug(k,v):
+#      print k,`v`
        pass
 
 def fetch():
@@ -25,21 +26,22 @@ def fetch():
        soup = BeautifulSoup(dataf)
 
 
-def title_arch_arch(t):
+title_arch_re = regexp.compile('(\\S+) Archipelago \\((\\S+)\\)$')
+title_any_re = regexp.compile('(\\S.*\\S) \((\\S+)\\)$')
+href_img_re = regexp.compile('\\.png$')
+
+def title_arch_info(t):
+       # returns (arch,ocean)
        debug('checking',t)
-       if t is None: return None
+       if t is None: return (None,None)
        m = title_arch_re.match(t)
-       if not m: return None
-       return m.group(1)
+       if not m: return (None,None)
+       return m.groups()
 
 def title_arch_ok(t):
-       a = title_arch_arch(t)
-       if a is None: return False
-       return a == ocean
-
-title_arch_re = regexp.compile('\\S+ Archipelago \\((\\S+)\\)$')
-title_any_re = regexp.compile('(\\S.*\\S) \((\\S+)\\)$')
-href_img_re = regexp.compile('\\.png$')
+       (a,o) = title_arch_info(t)
+       if o is None: return False
+       return o == ocean
 
 def parse():
        firstarch = soup.find('a', attrs = {'title': title_arch_ok})
@@ -68,8 +70,8 @@ def parse():
        for arch in arches:
                links = arch.findAll('a', href=True)
                debug('links',links)
-               a = title_arch_arch(links[0]['title'])
-               assert(a)
+               (a,o) = title_arch_info(links[0]['title'])
+               assert(o == ocean)
                print 'arch', a
                for link in links[1:]:
                        debug('link',link)