#!/usr/bin/python
"""Scrape a Puzzle Pirates ocean page on YPPedia and list its archipelagoes.

The script fetches ``http://yppedia.puzzlepirates.com/<ocean>_Ocean``, finds
the table that contains more than one link titled
"<something> Archipelago (<ocean>)", and for every such link prints the
parenthesised suffix of the first link in its section followed by the names
of the other non-image links in that section (presumably the islands of
that archipelago -- confirm against the wiki layout).

Diagnostic lines ('checking', 'fa', 'at', 'ac', 'ac2', 'links', 'link') are
written to stdout interleaved with the results.

This is a Python 2 script (urllib2, BeautifulSoup 3).  A ValueError is raised
when the page does not have the expected structure.
"""

import signal
# Restore the default SIGINT action straight away so that Ctrl-C terminates
# the process even while the remaining imports or the download are running.
signal.signal(signal.SIGINT, signal.SIG_DFL)

import os
import urllib
import urllib2
import re as regexp
#from optparse import OptionParser

from BeautifulSoup import BeautifulSoup

ocean = 'Opal'

url = 'http://yppedia.puzzlepirates.com/%s_Ocean' % urllib.quote(ocean, '')
dataf = urllib2.urlopen(url)
try:
    # The BeautifulSoup constructor consumes the response, so the connection
    # can be released as soon as it returns.
    soup = BeautifulSoup(dataf)
finally:
    dataf.close()

# "<word> Archipelago (<suffix>)" -- group 1 is the parenthesised suffix.
title_arch_re = regexp.compile(r'\S+ Archipelago \((\S+)\)$')
# "<name> (<suffix>)" -- group 1 is the name, group 2 the suffix.
title_any_re = regexp.compile(r'(\S.*\S) \((\S+)\)$')
href_img_re = regexp.compile(r'\.png$')
# Text marker used to pick the enclosing element of each archipelago link.
_large_text_re = regexp.compile('.*Large')


def title_arch_arch(t):
    """Return the parenthesised suffix of an archipelago link title.

    ``t`` is a link's title attribute, or None when the link has none.
    Returns group 1 of ``title_arch_re`` when ``t`` matches it, otherwise
    None.  Every call writes a 'checking' diagnostic line to stdout.
    """
    print('checking  %s' % (t,))
    if t is None:
        return None
    m = title_arch_re.match(t)
    if not m:
        return None
    return m.group(1)


def title_arch_ok(t):
    """Return True when ``t`` is an archipelago title whose suffix is ``ocean``."""
    a = title_arch_arch(t)
    if a is None:
        return False
    return a == ocean


def _is_arches_table(tag):
    """Return True for a <table> holding more than one matching archipelago link."""
    return (tag.name == 'table' and
            len(tag.findAll('a', attrs={'title': title_arch_ok})) > 1)


def _mentions_large(tag):
    """Return True when some text node under ``tag`` matches ``.*Large``."""
    return len(tag.findAll(text=_large_text_re)) > 0


firstarch = soup.find('a', attrs={'title': title_arch_ok})
print('fa ' + repr(firstarch))
if firstarch is None:
    raise ValueError('no archipelago link for ocean %r found at %s'
                     % (ocean, url))

archestable = firstarch.findParent(_is_arches_table)
print('at ' + repr(archestable))
if archestable is None:
    raise ValueError('no table with several archipelago links for ocean %r '
                     'found at %s' % (ocean, url))

arches = archestable.findAll('a', attrs={'title': title_arch_ok})
print('ac ' + repr(arches))

# Replace each archipelago link by its nearest ancestor containing the
# 'Large' text marker; that ancestor is treated as the archipelago's section.
arches = [u.findParent(_mentions_large) for u in arches]
print('ac2 ' + repr(arches))

for arch in arches:
    if arch is None:
        raise ValueError("an archipelago link has no enclosing element "
                         "containing 'Large' text")

    links = arch.findAll('a', href=True)
    print('links ' + repr(links))
    if not links:
        raise ValueError('archipelago section contains no links')

    # The first link of a section must be the archipelago link itself.
    a = title_arch_arch(links[0].get('title'))
    if not a:
        raise ValueError('first link of section is not an archipelago link: %r'
                         % (links[0],))
    print(a)

    for link in links[1:]:
        print('link ' + repr(link))
        if href_img_re.search(link['href']):
            # Links to .png files are skipped.
            continue
        title = link.get('title')
        m = None
        if title is not None:
            m = title_any_re.match(title)
        if m is None:
            raise ValueError('link title is missing or not of the form '
                             '"name (ocean)": %r' % (link,))
        if m.group(2) != ocean:
            raise ValueError('link %r belongs to %r, expected ocean %r'
                             % (m.group(1), m.group(2), ocean))
        print(m.group(1))