chiark / gitweb /
Initial version of scraper for oceans' arches and islands
authorIan Jackson <ian@liberator.relativity.greenend.org.uk>
Mon, 31 Aug 2009 13:43:11 +0000 (14:43 +0100)
committerIan Jackson <ian@liberator.relativity.greenend.org.uk>
Mon, 31 Aug 2009 13:43:11 +0000 (14:43 +0100)
yarrg/yppedia-ocean-scraper [new file with mode: 0755]

diff --git a/yarrg/yppedia-ocean-scraper b/yarrg/yppedia-ocean-scraper
new file mode 100755 (executable)
index 0000000..c99a84e
--- /dev/null
@@ -0,0 +1,66 @@
+#!/usr/bin/python
+# Scraper for YPPedia ocean pages: fetches one ocean's wiki page and
+# prints its archipelagoes and the islands in each (debug-heavy version).
+
+import signal
+# Restore default SIGINT behaviour so Ctrl-C kills the script outright
+# instead of raising KeyboardInterrupt with a traceback.
+signal.signal(signal.SIGINT, signal.SIG_DFL)
+
+import os
+import urllib
+import urllib2
+import re as regexp
+#from optparse import OptionParser
+
+from BeautifulSoup import BeautifulSoup
+
+# Ocean to scrape; hard-coded for now (option parsing stubbed out above).
+ocean = 'Opal'
+
+# Fetch e.g. http://yppedia.puzzlepirates.com/Opal_Ocean
+# (urllib.quote with safe='' escapes any special characters in the name).
+url = 'http://yppedia.puzzlepirates.com/%s_Ocean' % urllib.quote(ocean,'')
+dataf = urllib2.urlopen(url)
+
+soup = BeautifulSoup(dataf)
+
+# Archipelago link titles look like "Jet Archipelago (Opal)";
+# group 1 captures the ocean name in parentheses.
+title_arch_re = regexp.compile('\\S+ Archipelago \\((\\S+)\\)$')
+# Any "<Name> (<Ocean>)" title; group 1 = name, group 2 = ocean.
+title_any_re = regexp.compile('(\\S.*\\S) \((\\S+)\\)$')
+# Hrefs ending in .png are image links and get skipped below.
+href_img_re = regexp.compile('\\.png$')
+
+# Extract the ocean name from an archipelago link title, or return None
+# if the title does not match "<Name> Archipelago (<Ocean>)".
+# NOTE(review): the debug print runs before the None guard, so "checking None"
+# is printed for title-less candidates; harmless but noisy.
+def title_arch_arch(t):
+       print 'checking ',t
+       if t is None: return None
+       m = title_arch_re.match(t)
+       if not m: return None
+       return m.group(1)
+
+# Predicate for BeautifulSoup attr matching: True iff the title names an
+# archipelago belonging to the ocean we are scraping.
+def title_arch_ok(t):
+       a = title_arch_arch(t)
+       if a is None: return False
+       return a == ocean
+
+# Locate any one archipelago link for this ocean, then walk up to the
+# smallest enclosing <table> containing more than one such link -- that
+# table is the page's archipelago/island navigation table.
+firstarch = soup.find('a', attrs = {'title': title_arch_ok})
+print 'fa',`firstarch`
+
+archestable = firstarch.findParent(
+       lambda u: u.name == 'table' and
+               len(u.findAll('a', attrs = {'title': title_arch_ok})) > 1
+       )
+
+print 'at',`archestable`
+
+# All archipelago links within that table.
+arches = archestable.findAll('a', attrs = {'title': title_arch_ok})
+print 'ac', `arches`
+
+# Replace each archipelago link by its enclosing element containing a
+# "Large" text node -- presumably the per-archipelago table cell listing
+# island sizes; TODO confirm against the actual page markup.
+arches = map((lambda u: u.findParent(
+       lambda v: len(v.findAll(text= regexp.compile('.*Large'))) > 0
+       )), arches)
+print 'ac2', `arches`
+
+# For each archipelago cell: the first link is the archipelago itself,
+# the remaining non-image links are its islands.  Print the names.
+for arch in arches:
+       links = arch.findAll('a', href=True)
+       print 'links', `links`
+       # First link must be the archipelago heading; assert it parses.
+       a = title_arch_arch(links[0]['title'])
+       assert(a)
+       print a
+       for link in links[1:]:
+               print 'link', `link`
+               # Skip image links (.png hrefs).
+               if href_img_re.search(link['href']): continue
+               m = title_any_re.match(link['title'])
+               # Every island title should carry this ocean's name.
+               assert(m.group(2) == ocean)
+               print m.group(1)