X-Git-Url: http://www.chiark.greenend.org.uk/ucgi/~yarrgweb/git?p=ypp-sc-tools.db-test.git;a=blobdiff_plain;f=yarrg%2Fyppedia-ocean-scraper;h=476c1cd9b381bb0d5d5ec1979cc4928bff9fa9f4;hp=c99a84efe5dbfcb0dc556886dfc7d7797e30b387;hb=555b3391b3cd9967a29b219fff242b583137d2b8;hpb=8fe4fd369f976fad429139aca1885af6a67e983d diff --git a/yarrg/yppedia-ocean-scraper b/yarrg/yppedia-ocean-scraper index c99a84e..476c1cd 100755 --- a/yarrg/yppedia-ocean-scraper +++ b/yarrg/yppedia-ocean-scraper @@ -1,66 +1,192 @@ #!/usr/bin/python +# helper program for getting information from yppedia + +# This is part of ypp-sc-tools, a set of third-party tools for assisting +# players of Yohoho Puzzle Pirates. +# +# Copyright (C) 2009 Ian Jackson +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . +# +# Yohoho and Puzzle Pirates are probably trademarks of Three Rings and +# are used without permission. This program is not endorsed or +# sponsored by Three Rings. + +copyright_info = ''' +yppedia-ocean-scraper is part of ypp-sc-tools Copyright (C) 2009 Ian Jackson +This program comes with ABSOLUTELY NO WARRANTY; this is free software, +and you are welcome to redistribute it under certain conditions. For +details, read the top of the yppedia-ocean-scraper file. +''' + import signal signal.signal(signal.SIGINT, signal.SIG_DFL) +import sys import os import urllib import urllib2 import re as regexp -#from optparse import OptionParser - +from optparse import OptionParser from BeautifulSoup import BeautifulSoup -ocean = 'Opal' -url = 'http://yppedia.puzzlepirates.com/%s_Ocean' % urllib.quote(ocean,'') -dataf = urllib2.urlopen(url) +# For fuck's sake! +import codecs +import locale +def fix_stdout(): + sys.stdout = codecs.EncodedFile(sys.stdout, locale.getpreferredencoding()) + def null_decode(input, errors='strict'): + return input, len(input) + sys.stdout.decode = null_decode +# From +# http://ewx.livejournal.com/457086.html?thread=3016574 +# http://ewx.livejournal.com/457086.html?thread=3016574 +# lightly modified. +# See also Debian #415968. +fix_stdout() + + +ocean = None +soup = None +opts = None +arches = {} -soup = BeautifulSoup(dataf) +def debug(k,v): + if opts.debug: + print >>sys.stderr, k,`v` -title_arch_re = regexp.compile('\\S+ Archipelago \\((\\S+)\\)$') +def fetch(): + global soup + if opts.chart: + url_base = 'index.php?title=Template:Map:%s_Ocean&action=edit' + else: + url_base = '%s_Ocean' + url = ('http://yppedia.puzzlepirates.com/' + + (url_base % urllib.quote(ocean,''))) + debug('fetching',url) + dataf = urllib2.urlopen(url) + debug('fetched',dataf) + soup = BeautifulSoup(dataf) + + +title_arch_re = regexp.compile('(\\S.*\\S) Archipelago \\((\\S+)\\)$') title_any_re = regexp.compile('(\\S.*\\S) \((\\S+)\\)$') href_img_re = regexp.compile('\\.png$') -def title_arch_arch(t): - print 'checking ',t - if t is None: return None +def title_arch_info(t): + # returns (arch,ocean) + debug('checking',t) + if t is None: return (None,None) m = title_arch_re.match(t) - if not m: return None - return m.group(1) + if not m: return (None,None) + return m.groups() def title_arch_ok(t): - a = title_arch_arch(t) - if a is None: return False - return a == ocean - -firstarch = soup.find('a', attrs = {'title': title_arch_ok}) -print 'fa',`firstarch` - -archestable = firstarch.findParent( - lambda u: u.name == 'table' and - len(u.findAll('a', attrs = {'title': title_arch_ok})) > 1 - ) - -print 'at',`archestable` - -arches = archestable.findAll('a', attrs = {'title': title_arch_ok}) -print 'ac', `arches` - -arches = map((lambda u: u.findParent( - lambda v: len(v.findAll(text= regexp.compile('.*Large'))) > 0 - )), arches) -print 'ac2', `arches` - -for arch in arches: - links = arch.findAll('a', href=True) - print 'links', `links` - a = title_arch_arch(links[0]['title']) - assert(a) - print a - for link in links[1:]: - print 'link', `link` - if href_img_re.search(link['href']): continue - m = title_any_re.match(link['title']) - assert(m.group(2) == ocean) - print m.group(1) + (a,o) = title_arch_info(t) + if o is None: return False + return o == ocean + +def parse_chart(): + ta = soup.find('textarea') + debug('ta',ta) + s = ta.string + debug('s',s) + s = regexp.sub(r'\<\;', '<', s) + s = regexp.sub(r'\>\;', '>', s) + s = regexp.sub(r'\"\;', '"', s) + s = regexp.sub(r'\&\;', '&', s) + debug('s',s) + return s + +def parse_ocean(): + content = soup.find('div', attrs = {'id': 'content'}) + + def findall_title_arch_ok(t): + return t.findAll('a', attrs = {'title': title_arch_ok}) + + def is_archestable(u): + if u.name != 'table': return False + return len(findall_title_arch_ok(u)) > 1 + + archestable = content.findChild('table', attrs={'border':'1'}) + debug('at',archestable) + + archsoups = [] + for row in archestable.findAll('tr',recursive=False): + archsoups += row.findAll('td',recursive=False) + debug('ac',archsoups) + + def is_island(v): + return len(v.findAll(text = regexp.compile('.*Large'))) > 0 + def arch_up_map(u): + return u.findParent(is_island) + + for arch in archsoups: + links = arch.findAll('a', href=True) + debug('links',links) + if not links: continue + (a,o) = title_arch_info(links[0]['title']) + debug('arch-ocean', (a,o)) + assert(o == ocean) + assert(a not in arches) + isles = [] + for link in links[1:]: + debug('link',link) + if href_img_re.search(link['href']): continue + m = title_any_re.match(link['title']) + assert(m.group(2) == ocean) + island = m.group(1) + debug('island', island) + isles.append(island) + isles.sort() + arches[a] = isles + +def output(): + print 'ocean',ocean + al = arches.keys() + al.sort() + for a in al: + print '',a + for island in arches[a]: + print ' ',island + +def main(): + global ocean + global opts + + pa = OptionParser( +'''usage: .../yppedia-ocean-scraper [--debug] [--chart] OCEAN''') + ao = pa.add_option + + ao('--chart', action='store_true', dest='chart', + help='print chart source rather than arch/island info') + ao('--debug', action='count', dest='debug', default=0, + help='enable debugging output') + + (opts,args) = pa.parse_args() + if len(args) != 1: + print >>sys.stderr, copyright_info + pa.error('need an ocean argument') + ocean = args[0] + + fetch() + if opts.chart: + print parse_chart() + else: + parse_ocean() + output() + +main()