From 8fe4fd369f976fad429139aca1885af6a67e983d Mon Sep 17 00:00:00 2001
From: Ian Jackson
Date: Mon, 31 Aug 2009 14:43:11 +0100
Subject: [PATCH] Initial version of scraper for oceans' arches and islands

---
 yarrg/yppedia-ocean-scraper | 66 +++++++++++++++++++++++++++++++++++++
 1 file changed, 66 insertions(+)
 create mode 100755 yarrg/yppedia-ocean-scraper

diff --git a/yarrg/yppedia-ocean-scraper b/yarrg/yppedia-ocean-scraper
new file mode 100755
index 0000000..c99a84e
--- /dev/null
+++ b/yarrg/yppedia-ocean-scraper
@@ -0,0 +1,66 @@
+#!/usr/bin/python
+
+import signal
+signal.signal(signal.SIGINT, signal.SIG_DFL)
+
+import os
+import urllib
+import urllib2
+import re as regexp
+#from optparse import OptionParser
+
+from BeautifulSoup import BeautifulSoup
+
+ocean = 'Opal'
+
+url = 'http://yppedia.puzzlepirates.com/%s_Ocean' % urllib.quote(ocean,'')
+dataf = urllib2.urlopen(url)
+
+soup = BeautifulSoup(dataf)
+
+title_arch_re = regexp.compile('\\S+ Archipelago \\((\\S+)\\)$')
+title_any_re = regexp.compile('(\\S.*\\S) \((\\S+)\\)$')
+href_img_re = regexp.compile('\\.png$')
+
+def title_arch_arch(t):
+    print 'checking ',t
+    if t is None: return None
+    m = title_arch_re.match(t)
+    if not m: return None
+    return m.group(1)
+
+def title_arch_ok(t):
+    a = title_arch_arch(t)
+    if a is None: return False
+    return a == ocean
+
+firstarch = soup.find('a', attrs = {'title': title_arch_ok})
+print 'fa',`firstarch`
+
+archestable = firstarch.findParent(
+    lambda u: u.name == 'table' and
+        len(u.findAll('a', attrs = {'title': title_arch_ok})) > 1
+    )
+
+print 'at',`archestable`
+
+arches = archestable.findAll('a', attrs = {'title': title_arch_ok})
+print 'ac', `arches`
+
+arches = map((lambda u: u.findParent(
+    lambda v: len(v.findAll(text= regexp.compile('.*Large'))) > 0
+    )), arches)
+print 'ac2', `arches`
+
+for arch in arches:
+    links = arch.findAll('a', href=True)
+    print 'links', `links`
+    a = title_arch_arch(links[0]['title'])
+    assert(a)
+    print a
+    for link in links[1:]:
+        print 'link', `link`
+        if href_img_re.search(link['href']): continue
+        m = title_any_re.match(link['title'])
+        assert(m.group(2) == ocean)
+        print m.group(1)
-- 
2.30.2
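
The scraper above leans on BeautifulSoup accepting callables both as attribute matchers (the {'title': title_arch_ok} lookups) and as findParent/findAll predicates. Below is a minimal, self-contained sketch of that technique in the same Python 2 / BeautifulSoup 3 style as the patch; the inline HTML snippet and the title_is_arch matcher are hypothetical stand-ins for the live YPPedia page and the patch's own helpers.

# Sketch only: the same BeautifulSoup 3 calls the scraper uses, run against
# hypothetical inline HTML instead of fetching a YPPedia ocean page.
from BeautifulSoup import BeautifulSoup

html = '''<table><tr><td>
  <a title="Jet Archipelago (Opal)">Jet</a>
  <a href="/Terra_Island_(Opal)" title="Terra Island (Opal)">Terra</a>
</td></tr></table>'''
soup = BeautifulSoup(html)

def title_is_arch(t):
    # Attribute matcher: BeautifulSoup passes the title string (or None if
    # the tag has no title) and keeps the tag only if this returns True.
    return t is not None and 'Archipelago' in t

arch_link = soup.find('a', attrs={'title': title_is_arch})
print arch_link['title']        # Jet Archipelago (Opal)

# findParent also accepts a callable; here it climbs to the enclosing table,
# mirroring how the scraper locates the table holding the archipelago links.
table = arch_link.findParent(lambda tag: tag.name == 'table')
for link in table.findAll('a', href=True):
    print link['href']          # /Terra_Island_(Opal)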