#!/usr/bin/python
# Reconstructed from the gitweb blobdiff of yarrg/yppedia-ocean-scraper
# (ypp-sc-tools.main.git, hb=8fe4fd369f976fad429139aca1885af6a67e983d).
"""List the archipelagoes and islands of one Puzzle Pirates ocean.

Fetches the YPPedia "<ocean>_Ocean" page, finds the table that holds more
than one link titled "<arch> Archipelago (<ocean>)", and for every such
link prints the archipelago name followed by the names taken from the
titles "<name> (<ocean>)" of the remaining non-image links in the same
container.

Results go to stdout; diagnostic tracing goes to stderr (see
`debug_enabled`).  Any page structure the script does not understand
aborts it with a message and a non-zero exit status.
"""

import signal
# Restore the default SIGINT action first, so that Ctrl-C terminates the
# process outright instead of raising KeyboardInterrupt mid-download.
signal.signal(signal.SIGINT, signal.SIG_DFL)

import os
import sys
import urllib
import urllib2
import re as regexp
#from optparse import OptionParser

from BeautifulSoup import BeautifulSoup

ocean = 'Opal'

# Set to False to silence the diagnostic trace written to stderr.
debug_enabled = True

# safe='' so that every reserved character in the ocean name is quoted.
url = 'http://yppedia.puzzlepirates.com/%s_Ocean' % urllib.quote(ocean, '')

# Group 1 is the archipelago name, group 2 the ocean.
title_arch_re = regexp.compile(r'(\S+) Archipelago \((\S+)\)$')
# Group 1 is the page name, group 2 the ocean.
title_any_re = regexp.compile(r'(\S.*\S) \((\S+)\)$')
href_img_re = regexp.compile(r'\.png$')
# Text marker used to recognise the element enclosing one archipelago
# (presumably the "Large" island-size heading -- confirm against the page).
large_re = regexp.compile('.*Large')


def debug(k, v):
    """Write the label `k` and repr(`v`) to stderr when tracing is on."""
    if debug_enabled:
        sys.stderr.write('%s %s\n' % (k, repr(v)))


def fail(msg):
    """Abort the script: print `msg` on stderr and exit with status 1."""
    sys.exit('yppedia-ocean-scraper: %s' % msg)


def title_arch_info(t):
    """Split a link title of the form '<arch> Archipelago (<ocean>)'.

    Returns the pair (arch, ocean), or (None, None) when `t` is None or
    does not have that form.
    """
    debug('checking', t)
    if t is None:
        return (None, None)
    m = title_arch_re.match(t)
    if not m:
        return (None, None)
    return m.groups()


def title_arch_arch(t):
    """Return the archipelago name from title `t`, or None if no match."""
    return title_arch_info(t)[0]


def title_arch_ok(t):
    """Return True iff `t` is an archipelago title for the chosen ocean."""
    o = title_arch_info(t)[1]
    return o is not None and o == ocean


def fetch_soup():
    """Download `url` and return it parsed, always closing the connection."""
    dataf = urllib2.urlopen(url)
    try:
        return BeautifulSoup(dataf)
    finally:
        dataf.close()


def is_arches_table(u):
    """True for a <table> holding more than one matching archipelago link."""
    return (u.name == 'table' and
            len(u.findAll('a', attrs={'title': title_arch_ok})) > 1)


def has_large_text(v):
    """True for an element that contains text matching `large_re`."""
    return len(v.findAll(text=large_re)) > 0


def main():
    """Scrape the ocean page and print archipelago and island names."""
    soup = fetch_soup()

    firstarch = soup.find('a', attrs={'title': title_arch_ok})
    debug('fa', firstarch)
    if firstarch is None:
        fail('no archipelago link for ocean %r found at %s' % (ocean, url))

    archestable = firstarch.findParent(is_arches_table)
    debug('at', archestable)
    if archestable is None:
        fail('no table with several archipelago links found at %s' % url)

    arches = archestable.findAll('a', attrs={'title': title_arch_ok})
    debug('ac', arches)

    # Widen each archipelago link to its nearest ancestor containing the
    # 'Large' marker text; that ancestor's links are processed below.
    arches = [u.findParent(has_large_text) for u in arches]
    debug('ac2', arches)

    for arch in arches:
        if arch is None:
            fail('archipelago link has no enclosing element with "Large"')

        links = arch.findAll('a', href=True)
        debug('links', links)
        if not links:
            fail('archipelago element contains no links')

        # The first link is expected to be the archipelago itself.
        a = title_arch_arch(links[0].get('title'))
        if not a:
            fail('first link is not an archipelago link: %r' % (links[0],))
        print(a)

        for link in links[1:]:
            debug('link', link)
            # Skip links whose target is an image rather than a page.
            if href_img_re.search(link['href']):
                continue
            title = link.get('title')
            m = None
            if title is not None:
                m = title_any_re.match(title)
            if m is None or m.group(2) != ocean:
                fail('unexpected link in archipelago %s: %r' % (a, link))
            print(m.group(1))


if __name__ == '__main__':
    main()