X-Git-Url: http://www.chiark.greenend.org.uk/ucgi/~yarrgweb/git?p=ypp-sc-tools.db-live.git;a=blobdiff_plain;f=yarrg%2Fyppedia-ocean-scraper;h=30d0c4a8de8d895292879a074a6bf654ecdec4c5;hp=d6d2f24a857d82185878706db643e3888125769a;hb=6a03862e8e23287617045432cc14fb15305c2261;hpb=4c12ab3580b4ee4a5580f29105b309113a77ac7a;ds=sidebyside

diff --git a/yarrg/yppedia-ocean-scraper b/yarrg/yppedia-ocean-scraper
index d6d2f24..30d0c4a 100755
--- a/yarrg/yppedia-ocean-scraper
+++ b/yarrg/yppedia-ocean-scraper
@@ -1,47 +1,99 @@
 #!/usr/bin/python
 
+# helper program for getting information from yppedia
+
+# This is part of ypp-sc-tools, a set of third-party tools for assisting
+# players of Yohoho Puzzle Pirates.
+#
+# Copyright (C) 2009 Ian Jackson
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+#
+# Yohoho and Puzzle Pirates are probably trademarks of Three Rings and
+# are used without permission. This program is not endorsed or
+# sponsored by Three Rings.
+
+copyright_info = '''
+yppedia-ocean-scraper is part of ypp-sc-tools Copyright (C) 2009 Ian Jackson
+This program comes with ABSOLUTELY NO WARRANTY; this is free software,
+and you are welcome to redistribute it under certain conditions.  For
+details, read the top of the yppedia-ocean-scraper file.
+'''
+
 import signal
 signal.signal(signal.SIGINT, signal.SIG_DFL)
+import sys
 import os
 import urllib
 import urllib2
 import re as regexp
-#from optparse import OptionParser
-
+from optparse import OptionParser
 from BeautifulSoup import BeautifulSoup
 
-ocean = 'Opal'
+ocean = None
 soup = None
+opts = None
+arches = {}
 
 def debug(k,v):
-	pass
+	if opts.debug:
+		print >>sys.stderr, k,`v`
 
 def fetch():
 	global soup
-	url = ('http://yppedia.puzzlepirates.com/%s_Ocean' %
-		urllib.quote(ocean,''))
+	if opts.chart:
+		url_base = 'index.php?title=Template:Map:%s_Ocean&action=edit'
+	else:
+		url_base = '%s_Ocean'
+	url = ('http://yppedia.puzzlepirates.com/' +
+		(url_base % urllib.quote(ocean,'')))
+	debug('fetching',url)
 	dataf = urllib2.urlopen(url)
+	debug('fetched',dataf)
 	soup = BeautifulSoup(dataf)
 
-def title_arch_arch(t):
+title_arch_re = regexp.compile('(\\S.*\\S) Archipelago \\((\\S+)\\)$')
+title_any_re = regexp.compile('(\\S.*\\S) \((\\S+)\\)$')
+href_img_re = regexp.compile('\\.png$')
+
+def title_arch_info(t):
+	# returns (arch,ocean)
 	debug('checking',t)
-	if t is None: return None
+	if t is None: return (None,None)
 	m = title_arch_re.match(t)
-	if not m: return None
-	return m.group(1)
+	if not m: return (None,None)
+	return m.groups()
 
 def title_arch_ok(t):
-	a = title_arch_arch(t)
-	if a is None: return False
-	return a == ocean
-
-title_arch_re = regexp.compile('\\S+ Archipelago \\((\\S+)\\)$')
-title_any_re = regexp.compile('(\\S.*\\S) \((\\S+)\\)$')
-href_img_re = regexp.compile('\\.png$')
-
-def parse():
+	(a,o) = title_arch_info(t)
+	if o is None: return False
+	return o == ocean
+
+def parse_chart():
+	ta = soup.find('textarea')
+	debug('ta',ta)
+	s = ta.string
+	debug('s',s)
+	s = regexp.sub(r'\&lt\;', '<', s)
+	s = regexp.sub(r'\&gt\;', '>', s)
+	s = regexp.sub(r'\&amp\;', '&', s)
+	debug('s',s)
+	return s
+
+def parse_ocean():
 	firstarch = soup.find('a', attrs = {'title': title_arch_ok})
 	debug('fa',firstarch)
 
@@ -52,34 +104,72 @@
 		if u.name != 'table': return False
 		return len(findall_title_arch_ok(u)) > 1
 
-	archestable = firstarch.findParent(is_archestable)
+	archestable = firstarch.findParent('table', attrs={'border':'1'})
 	debug('at',archestable)
 
-	arches = findall_title_arch_ok(archestable)
-	debug('ac',arches)
+	archsoups = []
+	for row in archestable.findAll('tr',recursive=False):
+		archsoups += row.findAll('td',recursive=False)
+	debug('ac',archsoups)
 
 	def is_island(v):
 		return len(v.findAll(text = regexp.compile('.*Large'))) > 0
 	def arch_up_map(u):
 		return u.findParent(is_island)
-	arches = map(arch_up_map, arches)
-	debug('ac2',arches)
 
-	for arch in arches:
+	for arch in archsoups:
 		links = arch.findAll('a', href=True)
 		debug('links',links)
-		a = title_arch_arch(links[0]['title'])
-		assert(a)
-		print 'arch', a
+		if not links: continue
+		(a,o) = title_arch_info(links[0]['title'])
+		debug('arch-ocean', (a,o))
+		assert(o == ocean)
+		assert(a not in arches)
+		isles = []
 		for link in links[1:]:
 			debug('link',link)
 			if href_img_re.search(link['href']): continue
			m = title_any_re.match(link['title'])
 			assert(m.group(2) == ocean)
-			print 'island', m.group(1)
+			island = m.group(1)
+			debug('island', island)
+			isles.append(island)
+		isles.sort()
+		arches[a] = isles
+
+def output():
+	print 'ocean',ocean
+	al = arches.keys()
+	al.sort()
+	for a in al:
+		print '',a
+		for island in arches[a]:
+			print ' ',island
 
 def main():
+	global ocean
+	global opts
+
+	pa = OptionParser(
+'''usage: .../yppedia-ocean-scraper [--debug] [--chart] OCEAN''')
+	ao = pa.add_option
+
+	ao('--chart', action='store_true', dest='chart',
+	   help='print chart source rather than arch/island info')
+	ao('--debug', action='count', dest='debug', default=0,
+	   help='enable debugging output')
+
+	(opts,args) = pa.parse_args()
+	if len(args) != 1:
+		print >>sys.stderr, copyright_info
+		pa.error('need an ocean argument')
+	ocean = args[0]
+
 	fetch()
-	parse()
+	if opts.chart:
+		print parse_chart()
+	else:
+		parse_ocean()
+		output()
 
 main()
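
For reference, a small standalone sketch (not part of the commit) of how the new title_arch_re / title_any_re patterns classify yppedia link titles. The sample title strings below are invented for illustration; the patterns are written as raw strings but are equivalent to the ones compiled in the diff.

# Standalone sketch, not from the repository: exercises the two title
# patterns introduced above on made-up example titles.
import re as regexp

title_arch_re = regexp.compile(r'(\S.*\S) Archipelago \((\S+)\)$')
title_any_re = regexp.compile(r'(\S.*\S) \((\S+)\)$')

def title_arch_info(t):
    # returns (arch, ocean), or (None, None) for non-archipelago titles
    if t is None: return (None, None)
    m = title_arch_re.match(t)
    if not m: return (None, None)
    return m.groups()

# An archipelago link title yields (archipelago, ocean):
print(title_arch_info('Diamond Archipelago (Midnight)'))   # ('Diamond', 'Midnight')
# An island link title does not match the archipelago pattern:
print(title_arch_info('Turtle Island (Midnight)'))         # (None, None)
# title_any_re still extracts (name, ocean) from any "Name (Ocean)" title:
print(title_any_re.match('Turtle Island (Midnight)').groups())

With the OptionParser handling added to main(), the scraper would presumably be invoked as ./yarrg/yppedia-ocean-scraper OCEAN for the arch/island listing, or with --chart to print the chart template source instead, per the usage string in the diff.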