X-Git-Url: http://www.chiark.greenend.org.uk/ucgi/~yarrgweb/git?p=ypp-sc-tools.main.git;a=blobdiff_plain;f=yarrg%2Fyppedia-ocean-scraper;fp=yarrg%2Fyppedia-ocean-scraper;h=c99a84efe5dbfcb0dc556886dfc7d7797e30b387;hp=0000000000000000000000000000000000000000;hb=8fe4fd369f976fad429139aca1885af6a67e983d;hpb=35dc4e063467490048e83f1d9d519cd3652d870e

diff --git a/yarrg/yppedia-ocean-scraper b/yarrg/yppedia-ocean-scraper
new file mode 100755
index 0000000..c99a84e
--- /dev/null
+++ b/yarrg/yppedia-ocean-scraper
@@ -0,0 +1,66 @@
+#!/usr/bin/python
+
+import signal
+signal.signal(signal.SIGINT, signal.SIG_DFL)	# restore the default SIGINT action: Ctrl-C ends the script at once, with no KeyboardInterrupt traceback
+
+import os
+import urllib
+import urllib2
+import re as regexp
+#from optparse import OptionParser
+
+from BeautifulSoup import BeautifulSoup
+
+ocean = 'Opal'	# ocean whose YPPedia page is scraped
+url = 'http://yppedia.puzzlepirates.com/%s_Ocean' % urllib.quote(ocean,'')
+dataf = urllib2.urlopen(url)
+try: soup = BeautifulSoup(dataf)
+finally: dataf.close()	# the page has been read and parsed by now; release the connection
+
+# Link-title patterns, written as raw strings so that every backslash reaches the regexp engine exactly as typed.
+title_arch_re = regexp.compile(r'\S+ Archipelago \((\S+)\)$')
+title_any_re = regexp.compile(r'(\S.*\S) \((\S+)\)$')
+href_img_re = regexp.compile(r'\.png$')
+
+def title_arch_arch(t):
+	# Ocean named in parentheses by an "<word> Archipelago (<ocean>)" title;
+	# None when t is None or is not such a title.
+	print 'checking ',t
+	found = t is not None and title_arch_re.match(t)
+	return found.group(1) if found else None
+
+def title_arch_ok(t):
+	# True only when t is an archipelago title naming the configured ocean.
+	# A None from title_arch_arch never equals the ocean name, so it yields False.
+	return title_arch_arch(t) == ocean
+
+# Take any one archipelago link for this ocean, then climb to the nearest
+# enclosing table that holds more than one of them.
+firstarch = soup.find('a', attrs = {'title': title_arch_ok})
+print 'fa',`firstarch`
+if firstarch is None: raise SystemExit('no archipelago link found at ' + url)
+archestable = firstarch.findParent(lambda u: u.name == 'table' and
+		len(u.findAll('a', attrs = {'title': title_arch_ok})) > 1)
+print 'at',`archestable`
+if archestable is None: raise SystemExit('no archipelago table found at ' + url)
+arches = archestable.findAll('a', attrs = {'title': title_arch_ok})
+print 'ac', `arches`
+# Swap each link for its nearest ancestor containing text that matches
+# 'Large' (None where a link has no such ancestor).
+arches = map((lambda u: u.findParent(
+	lambda v: len(v.findAll(text= regexp.compile('.*Large'))) > 0
+	)), arches)
+print 'ac2', `arches`
+
+for arch in arches:
+	links = arch.findAll('a', href=True)	# links[0] must be the archipelago link; the rest are listed under it
+	print 'links', `links`
+	a = links and title_arch_arch(links[0].get('title'))
+	if not a: raise SystemExit('no archipelago link at head of: ' + `links`)
+	print a
+	for link in links[1:]:
+		print 'link', `link`
+		if href_img_re.search(link['href']): continue	# skip links whose href ends in .png
+		m = title_any_re.match(link.get('title') or '')	# a missing title is treated as a non-match
+		if not m or m.group(2) != ocean: raise SystemExit('unexpected link: ' + `link`)
+		print m.group(1)