chiark
/
gitweb
/
~yarrgweb
/
ypp-sc-tools.db-test.git
/ blobdiff
commit
grep
author
committer
pickaxe
?
search:
re
summary
|
shortlog
|
log
|
commit
|
commitdiff
|
tree
raw
|
inline
| side by side
WIP ocean scraper seems to mostly work, need to do arg parsing and IO
[ypp-sc-tools.db-test.git]
/
yarrg
/
yppedia-ocean-scraper
diff --git
a/yarrg/yppedia-ocean-scraper
b/yarrg/yppedia-ocean-scraper
index 104a408047ba15adfeaa7d09de4c015827d3f03d..d53e2365da98491bf1b6204a131295cf4fef3d5c 100755
(executable)
--- a/
yarrg/yppedia-ocean-scraper
+++ b/
yarrg/yppedia-ocean-scraper
@@
-54,22
+54,23
@@
def parse():
if u.name != 'table': return False
return len(findall_title_arch_ok(u)) > 1
if u.name != 'table': return False
return len(findall_title_arch_ok(u)) > 1
- archestable = firstarch.findParent(
is_archestable
)
+ archestable = firstarch.findParent(
'table', attrs={'border':'1'}
)
debug('at',archestable)
debug('at',archestable)
- arches = findall_title_arch_ok(archestable)
+ arches = []
+ for row in archestable.findAll('tr',recursive=False):
+ arches += row.findAll('td',recursive=False)
debug('ac',arches)
def is_island(v):
return len(v.findAll(text = regexp.compile('.*Large'))) > 0
def arch_up_map(u):
return u.findParent(is_island)
debug('ac',arches)
def is_island(v):
return len(v.findAll(text = regexp.compile('.*Large'))) > 0
def arch_up_map(u):
return u.findParent(is_island)
- arches = map(arch_up_map, arches)
- debug('ac2',arches)
for arch in arches:
links = arch.findAll('a', href=True)
debug('links',links)
for arch in arches:
links = arch.findAll('a', href=True)
debug('links',links)
+ if not links: continue
(a,o) = title_arch_info(links[0]['title'])
assert(o == ocean)
print 'arch', a
(a,o) = title_arch_info(links[0]['title'])
assert(o == ocean)
print 'arch', a