X-Git-Url: http://www.chiark.greenend.org.uk/ucgi/~yarrgweb/git?p=ypp-sc-tools.db-test.git;a=blobdiff_plain;f=yarrg%2Fyppedia-ocean-scraper;fp=yarrg%2Fyppedia-ocean-scraper;h=d53e2365da98491bf1b6204a131295cf4fef3d5c;hp=104a408047ba15adfeaa7d09de4c015827d3f03d;hb=cf91713ebb03e007c03dc70867c73db3a1a754a9;hpb=1d5d1c30049c38ef1fc002b7403c695a6bfc469b diff --git a/yarrg/yppedia-ocean-scraper b/yarrg/yppedia-ocean-scraper index 104a408..d53e236 100755 --- a/yarrg/yppedia-ocean-scraper +++ b/yarrg/yppedia-ocean-scraper @@ -54,22 +54,23 @@ def parse(): if u.name != 'table': return False return len(findall_title_arch_ok(u)) > 1 - archestable = firstarch.findParent(is_archestable) + archestable = firstarch.findParent('table', attrs={'border':'1'}) debug('at',archestable) - arches = findall_title_arch_ok(archestable) + arches = [] + for row in archestable.findAll('tr',recursive=False): + arches += row.findAll('td',recursive=False) debug('ac',arches) def is_island(v): return len(v.findAll(text = regexp.compile('.*Large'))) > 0 def arch_up_map(u): return u.findParent(is_island) - arches = map(arch_up_map, arches) - debug('ac2',arches) for arch in arches: links = arch.findAll('a', href=True) debug('links',links) + if not links: continue (a,o) = title_arch_info(links[0]['title']) assert(o == ocean) print 'arch', a