#!/usr/bin/python import signal signal.signal(signal.SIGINT, signal.SIG_DFL) import os import urllib import urllib2 import re as regexp #from optparse import OptionParser from BeautifulSoup import BeautifulSoup ocean = 'Opal' soup = None def debug(k,v): pass def fetch(): global soup url = ('http://yppedia.puzzlepirates.com/%s_Ocean' % urllib.quote(ocean,'')) dataf = urllib2.urlopen(url) soup = BeautifulSoup(dataf) def title_arch_arch(t): debug('checking',t) if t is None: return None m = title_arch_re.match(t) if not m: return None return m.group(1) def title_arch_ok(t): a = title_arch_arch(t) if a is None: return False return a == ocean title_arch_re = regexp.compile('\\S+ Archipelago \\((\\S+)\\)$') title_any_re = regexp.compile('(\\S.*\\S) \((\\S+)\\)$') href_img_re = regexp.compile('\\.png$') def parse(): firstarch = soup.find('a', attrs = {'title': title_arch_ok}) debug('fa',firstarch) def findall_title_arch_ok(t): return t.findAll('a', attrs = {'title': title_arch_ok}) def is_archestable(u): if u.name != 'table': return False return len(findall_title_arch_ok(u)) > 1 archestable = firstarch.findParent(is_archestable) debug('at',archestable) arches = findall_title_arch_ok(archestable) debug('ac',arches) def is_island(v): return len(v.findAll(text = regexp.compile('.*Large'))) > 0 def arch_up_map(u): return u.findParent(is_island) arches = map(arch_up_map, arches) debug('ac2',arches) for arch in arches: links = arch.findAll('a', href=True) debug('links',links) a = title_arch_arch(links[0]['title']) assert(a) print 'arch', a for link in links[1:]: debug('link',link) if href_img_re.search(link['href']): continue m = title_any_re.match(link['title']) assert(m.group(2) == ocean) print 'island', m.group(1) def main(): fetch() parse() main()