From 8fe4fd369f976fad429139aca1885af6a67e983d Mon Sep 17 00:00:00 2001
From: Ian Jackson
Date: Mon, 31 Aug 2009 14:43:11 +0100
Subject: [PATCH] Initial version of scraper for oceans' arches and islands

---
 yarrg/yppedia-ocean-scraper | 66 +++++++++++++++++++++++++++++++++++++
 1 file changed, 66 insertions(+)
 create mode 100755 yarrg/yppedia-ocean-scraper

diff --git a/yarrg/yppedia-ocean-scraper b/yarrg/yppedia-ocean-scraper
new file mode 100755
index 0000000..c99a84e
--- /dev/null
+++ b/yarrg/yppedia-ocean-scraper
@@ -0,0 +1,66 @@
+#!/usr/bin/python
+
+import signal
+signal.signal(signal.SIGINT, signal.SIG_DFL)
+
+import os
+import urllib
+import urllib2
+import re as regexp
+#from optparse import OptionParser
+
+from BeautifulSoup import BeautifulSoup
+
+ocean = 'Opal'
+
+url = 'http://yppedia.puzzlepirates.com/%s_Ocean' % urllib.quote(ocean,'')
+dataf = urllib2.urlopen(url)
+
+soup = BeautifulSoup(dataf)
+
+title_arch_re = regexp.compile('\\S+ Archipelago \\((\\S+)\\)$')
+title_any_re = regexp.compile('(\\S.*\\S) \((\\S+)\\)$')
+href_img_re = regexp.compile('\\.png$')
+
+def title_arch_arch(t):
+    print 'checking ',t
+    if t is None: return None
+    m = title_arch_re.match(t)
+    if not m: return None
+    return m.group(1)
+
+def title_arch_ok(t):
+    a = title_arch_arch(t)
+    if a is None: return False
+    return a == ocean
+
+firstarch = soup.find('a', attrs = {'title': title_arch_ok})
+print 'fa',`firstarch`
+
+archestable = firstarch.findParent(
+    lambda u: u.name == 'table' and
+        len(u.findAll('a', attrs = {'title': title_arch_ok})) > 1
+    )
+
+print 'at',`archestable`
+
+arches = archestable.findAll('a', attrs = {'title': title_arch_ok})
+print 'ac', `arches`
+
+arches = map((lambda u: u.findParent(
+    lambda v: len(v.findAll(text= regexp.compile('.*Large'))) > 0
+    )), arches)
+print 'ac2', `arches`
+
+for arch in arches:
+    links = arch.findAll('a', href=True)
+    print 'links', `links`
+    a = title_arch_arch(links[0]['title'])
+    assert(a)
+    print a
+    for link in links[1:]:
+        print 'link', `link`
+        if href_img_re.search(link['href']): continue
+        m = title_any_re.match(link['title'])
+        assert(m.group(2) == ocean)
+        print m.group(1)
-- 
2.30.2
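
The scraper above leans on BeautifulSoup accepting callables both as attribute matchers (the {'title': title_arch_ok} lookups) and as findParent/findAll predicates. Below is a minimal, self-contained sketch of that technique in the same Python 2 / BeautifulSoup 3 style as the patch; the inline HTML snippet and the title_is_arch matcher are hypothetical stand-ins for the live YPPedia page and the patch's own helpers.

# Sketch only: the same BeautifulSoup 3 calls the scraper uses, run against
# hypothetical inline HTML instead of fetching a YPPedia ocean page.
from BeautifulSoup import BeautifulSoup

html = '''<table><tr><td>
  <a title="Jet Archipelago (Opal)">Jet</a>
  <a href="/Terra_Island_(Opal)" title="Terra Island (Opal)">Terra</a>
</td></tr></table>'''
soup = BeautifulSoup(html)

def title_is_arch(t):
    # Attribute matcher: BeautifulSoup passes the title string (or None if
    # the tag has no title) and keeps the tag only if this returns True.
    return t is not None and 'Archipelago' in t

arch_link = soup.find('a', attrs={'title': title_is_arch})
print arch_link['title']        # Jet Archipelago (Opal)

# findParent also accepts a callable; here it climbs to the enclosing table,
# mirroring how the scraper locates the table holding the archipelago links.
table = arch_link.findParent(lambda tag: tag.name == 'table')
for link in table.findAll('a', href=True):
    print link['href']          # /Terra_Island_(Opal)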