From: Ian Jackson Date: Mon, 31 Aug 2009 15:04:52 +0000 (+0100) Subject: WIP scraper still not working right X-Git-Tag: 3.4~64 X-Git-Url: http://www.chiark.greenend.org.uk/ucgi/~yarrgweb/git?p=ypp-sc-tools.web-live.git;a=commitdiff_plain;h=4c12ab3580b4ee4a5580f29105b309113a77ac7a WIP scraper still not working right --- diff --git a/yarrg/yppedia-ocean-scraper b/yarrg/yppedia-ocean-scraper index c99a84e..d6d2f24 100755 --- a/yarrg/yppedia-ocean-scraper +++ b/yarrg/yppedia-ocean-scraper @@ -12,18 +12,21 @@ import re as regexp from BeautifulSoup import BeautifulSoup ocean = 'Opal' +soup = None -url = 'http://yppedia.puzzlepirates.com/%s_Ocean' % urllib.quote(ocean,'') -dataf = urllib2.urlopen(url) +def debug(k,v): + pass -soup = BeautifulSoup(dataf) +def fetch(): + global soup + url = ('http://yppedia.puzzlepirates.com/%s_Ocean' % + urllib.quote(ocean,'')) + dataf = urllib2.urlopen(url) + soup = BeautifulSoup(dataf) -title_arch_re = regexp.compile('\\S+ Archipelago \\((\\S+)\\)$') -title_any_re = regexp.compile('(\\S.*\\S) \((\\S+)\\)$') -href_img_re = regexp.compile('\\.png$') def title_arch_arch(t): - print 'checking ',t + debug('checking',t) if t is None: return None m = title_arch_re.match(t) if not m: return None @@ -34,33 +37,49 @@ def title_arch_ok(t): if a is None: return False return a == ocean -firstarch = soup.find('a', attrs = {'title': title_arch_ok}) -print 'fa',`firstarch` - -archestable = firstarch.findParent( - lambda u: u.name == 'table' and - len(u.findAll('a', attrs = {'title': title_arch_ok})) > 1 - ) - -print 'at',`archestable` - -arches = archestable.findAll('a', attrs = {'title': title_arch_ok}) -print 'ac', `arches` - -arches = map((lambda u: u.findParent( - lambda v: len(v.findAll(text= regexp.compile('.*Large'))) > 0 - )), arches) -print 'ac2', `arches` - -for arch in arches: - links = arch.findAll('a', href=True) - print 'links', `links` - a = title_arch_arch(links[0]['title']) - assert(a) - print a - for link in links[1:]: - print 'link', `link` - if href_img_re.search(link['href']): continue - m = title_any_re.match(link['title']) - assert(m.group(2) == ocean) - print m.group(1) +title_arch_re = regexp.compile('\\S+ Archipelago \\((\\S+)\\)$') +title_any_re = regexp.compile('(\\S.*\\S) \((\\S+)\\)$') +href_img_re = regexp.compile('\\.png$') + +def parse(): + firstarch = soup.find('a', attrs = {'title': title_arch_ok}) + debug('fa',firstarch) + + def findall_title_arch_ok(t): + return t.findAll('a', attrs = {'title': title_arch_ok}) + + def is_archestable(u): + if u.name != 'table': return False + return len(findall_title_arch_ok(u)) > 1 + + archestable = firstarch.findParent(is_archestable) + debug('at',archestable) + + arches = findall_title_arch_ok(archestable) + debug('ac',arches) + + def is_island(v): + return len(v.findAll(text = regexp.compile('.*Large'))) > 0 + def arch_up_map(u): + return u.findParent(is_island) + arches = map(arch_up_map, arches) + debug('ac2',arches) + + for arch in arches: + links = arch.findAll('a', href=True) + debug('links',links) + a = title_arch_arch(links[0]['title']) + assert(a) + print 'arch', a + for link in links[1:]: + debug('link',link) + if href_img_re.search(link['href']): continue + m = title_any_re.match(link['title']) + assert(m.group(2) == ocean) + print 'island', m.group(1) + +def main(): + fetch() + parse() + +main()