#!/usr/bin/python
"""Print the archipelagoes and islands listed on a YPPedia ocean page.

The page ``http://yppedia.puzzlepirates.com/<ocean>_Ocean`` is downloaded
and parsed with BeautifulSoup.  For every cell of the table that contains
the archipelago links, one ``arch <name>`` line is printed, followed by one
``island <name>`` line per non-image link in that cell.

This is a Python 2 script (it relies on ``urllib2`` and BeautifulSoup 3).
"""

import signal
# Restore the default SIGINT action first, so that Ctrl-C terminates the
# script immediately instead of raising KeyboardInterrupt.
signal.signal(signal.SIGINT, signal.SIG_DFL)

import os
import urllib
import urllib2
import re as regexp
#from optparse import OptionParser

from BeautifulSoup import BeautifulSoup

# Name of the ocean whose page is fetched and whose links are accepted.
ocean = 'Opal'
# Parsed page; set by fetch() and read by parse().
soup = None


class ParseError(Exception):
    """Raised when the page does not have the structure parse() expects."""


def debug(k, v):
    """Tracing hook; a no-op unless the line below is enabled.

    k is a short label and v the value to show.
    """
    # To enable tracing, uncomment:
    # print('%s %r' % (k, v))
    pass


def fetch():
    """Download the page for ``ocean`` and store its parse tree in ``soup``."""
    global soup
    url = ('http://yppedia.puzzlepirates.com/%s_Ocean'
           % urllib.quote(ocean, ''))
    dataf = urllib2.urlopen(url)
    try:
        soup = BeautifulSoup(dataf)
    finally:
        # Release the HTTP connection even if parsing fails.
        dataf.close()


# '<arch> Archipelago (<ocean>)', matched from the start of the title.
title_arch_re = regexp.compile(r'(\S+) Archipelago \((\S+)\)$')
# '<name> (<ocean>)' for any other link title.
title_any_re = regexp.compile(r'(\S.*\S) \((\S+)\)$')
# Links whose href ends in '.png' are skipped when listing islands.
href_img_re = regexp.compile(r'\.png$')


def title_arch_info(t):
    """Split an archipelago link title into ``(arch, ocean)``.

    Returns ``(None, None)`` when t is None or does not match
    ``title_arch_re``.
    """
    debug('checking', t)
    if t is None:
        return (None, None)
    m = title_arch_re.match(t)
    if not m:
        return (None, None)
    return m.groups()


def title_arch_ok(t):
    """Return True if t is an archipelago title belonging to ``ocean``."""
    (a, o) = title_arch_info(t)
    return o is not None and o == ocean


def parse():
    """Print the archipelagoes and islands found in the fetched page.

    Reads the module-level ``soup`` (set by fetch()) and writes the
    ``arch ...`` / ``island ...`` lines to standard output.

    Raises ParseError if no page has been fetched, if the table holding
    the archipelago links cannot be located, or if a link title does not
    have the expected form for ``ocean``.
    """
    if soup is None:
        raise ParseError('no page loaded; call fetch() before parse()')

    # Locate the table through the first archipelago link for this ocean.
    firstarch = soup.find('a', attrs={'title': title_arch_ok})
    debug('fa', firstarch)
    if firstarch is None:
        raise ParseError('no archipelago link found for ocean %r' % ocean)

    archestable = firstarch.findParent('table', attrs={'border': '1'})
    debug('at', archestable)
    if archestable is None:
        raise ParseError('archipelago link %r is not inside a table with '
                         'border="1"' % firstarch.get('title'))

    # Only direct rows and direct cells are considered, not nested tables.
    arches = []
    for row in archestable.findAll('tr', recursive=False):
        arches.extend(row.findAll('td', recursive=False))
    debug('ac', arches)

    for arch in arches:
        links = arch.findAll('a', href=True)
        debug('links', links)
        if not links:
            continue

        # The first link of a cell names the archipelago itself.
        arch_title = links[0].get('title')
        (a, o) = title_arch_info(arch_title)
        if o != ocean:
            raise ParseError('expected an archipelago title for ocean %r, '
                             'got %r' % (ocean, arch_title))
        print('arch %s' % a)

        for link in links[1:]:
            debug('link', link)
            if href_img_re.search(link['href']):
                continue
            title = link.get('title')
            m = title_any_re.match(title) if title is not None else None
            if m is None or m.group(2) != ocean:
                raise ParseError('expected an island title for ocean %r, '
                                 'got %r' % (ocean, title))
            print('island %s' % m.group(1))


def main():
    """Fetch the ocean page and print its archipelagoes and islands."""
    fetch()
    parse()


if __name__ == '__main__':
    main()