chiark / gitweb /
WIP ocean scraper seems to mostly work, need to do arg parsing and IO
author Ian Jackson <ian@liberator.relativity.greenend.org.uk>
Mon, 31 Aug 2009 15:21:13 +0000 (16:21 +0100)
committer Ian Jackson <ian@liberator.relativity.greenend.org.uk>
Mon, 31 Aug 2009 15:21:13 +0000 (16:21 +0100)
yarrg/yppedia-ocean-scraper

index 104a408047ba15adfeaa7d09de4c015827d3f03d..d53e2365da98491bf1b6204a131295cf4fef3d5c 100755 (executable)
@@ -54,22 +54,23 @@ def parse():
                if u.name != 'table': return False
                return len(findall_title_arch_ok(u)) > 1
 
-       archestable = firstarch.findParent(is_archestable)
+       archestable = firstarch.findParent('table', attrs={'border':'1'})
        debug('at',archestable)
 
-       arches = findall_title_arch_ok(archestable)
+       arches = []
+       for row in archestable.findAll('tr',recursive=False):
+               arches += row.findAll('td',recursive=False)
        debug('ac',arches)
 
        def is_island(v):
                return len(v.findAll(text = regexp.compile('.*Large'))) > 0
        def arch_up_map(u):
                return u.findParent(is_island)
-       arches = map(arch_up_map, arches)
-       debug('ac2',arches)
 
        for arch in arches:
                links = arch.findAll('a', href=True)
                debug('links',links)
+               if not links: continue
                (a,o) = title_arch_info(links[0]['title'])
                assert(o == ocean)
                print 'arch', a