X-Git-Url: http://www.chiark.greenend.org.uk/ucgi/~yarrgweb/git?p=ypp-sc-tools.db-live.git;a=blobdiff_plain;f=yarrg%2Fyppedia-ocean-scraper;h=ba145eafcc247b6bae90d13244b159d62570928d;hp=104a408047ba15adfeaa7d09de4c015827d3f03d;hb=0505b9e0fd61bfaa4c84968dfba65404b408cbf6;hpb=1d5d1c30049c38ef1fc002b7403c695a6bfc469b diff --git a/yarrg/yppedia-ocean-scraper b/yarrg/yppedia-ocean-scraper index 104a408..ba145ea 100755 --- a/yarrg/yppedia-ocean-scraper +++ b/yarrg/yppedia-ocean-scraper @@ -1,32 +1,104 @@ #!/usr/bin/python +# helper program for getting information from yppedia + +# This is part of the YARRG website. YARRG is a tool and website +# for assisting players of Yohoho Puzzle Pirates. +# +# Copyright (C) 2009 Ian Jackson +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see <http://www.gnu.org/licenses/>. +# +# Yohoho and Puzzle Pirates are probably trademarks of Three Rings and +# are used without permission. This program is not endorsed or +# sponsored by Three Rings. + +copyright_info = ''' +yppedia-ocean-scraper is part of ypp-sc-tools Copyright (C) 2009 Ian Jackson +This program comes with ABSOLUTELY NO WARRANTY; this is free software, +and you are welcome to redistribute it under certain conditions. For +details, read the top of the yppedia-ocean-scraper file. 
+''' + import signal signal.signal(signal.SIGINT, signal.SIG_DFL) +import sys import os import urllib -import urllib2 import re as regexp -#from optparse import OptionParser - +import subprocess +from optparse import OptionParser from BeautifulSoup import BeautifulSoup -ocean = 'Opal' + +# For fuck's sake! +import codecs +import locale +def fix_stdout(): + sys.stdout = codecs.EncodedFile(sys.stdout, locale.getpreferredencoding()) + def null_decode(input, errors='strict'): + return input, len(input) + sys.stdout.decode = null_decode +# From +# http://ewx.livejournal.com/457086.html?thread=3016574 +# http://ewx.livejournal.com/457086.html?thread=3016574 +# lightly modified. +# See also Debian #415968. +fix_stdout() + + +# User agent: +class YarrgURLopener(urllib.FancyURLopener): + base_version= urllib.URLopener().version + proc= subprocess.Popen( + ["./database-info-fetch", "useragentstringmap", + base_version, "manual islands/topology fetch"], + shell=False, + stderr=None, + stdout=subprocess.PIPE, + ) + version = proc.communicate()[0].rstrip('\n'); + assert(proc.returncode is not None and proc.returncode == 0) +urllib._urlopener = YarrgURLopener() + +ocean = None soup = None +opts = None +arches = {} def debug(k,v): -# print k,`v` - pass + if opts.debug: + print >>sys.stderr, k,`v` def fetch(): global soup - url = ('http://yppedia.puzzlepirates.com/%s_Ocean' % - urllib.quote(ocean,'')) - dataf = urllib2.urlopen(url) + if opts.chart: + url_base = 'index.php?title=Template:Map:%s_Ocean&action=edit' + else: + url_base = '%s_Ocean' + url_base = url_base % urllib.quote(ocean,'') + if opts.localhtml is None: + url = ('http://yppedia.puzzlepirates.com/' + url_base) + debug('fetching',url) + dataf = urllib.urlopen(url) + debug('fetched',dataf) + else: + dataf = file(opts.localhtml + '/' + url_base, 'r') soup = BeautifulSoup(dataf) - -title_arch_re = regexp.compile('(\\S+) Archipelago \\((\\S+)\\)$') +title_arch_re = regexp.compile('(\\S.*\\S) Archipelago \\((\\S+)\\)$') 
title_any_re = regexp.compile('(\\S.*\\S) \((\\S+)\\)$') href_img_re = regexp.compile('\\.png$') @@ -43,9 +115,20 @@ def title_arch_ok(t): if o is None: return False return o == ocean -def parse(): - firstarch = soup.find('a', attrs = {'title': title_arch_ok}) - debug('fa',firstarch) +def parse_chart(): + ta = soup.find('textarea') + debug('ta',ta) + s = ta.string + debug('s',s) + s = regexp.sub(r'\&lt\;', '<', s) + s = regexp.sub(r'\&gt\;', '>', s) + s = regexp.sub(r'\&quot\;', '"', s) + s = regexp.sub(r'\&amp\;', '&', s) + debug('s',s) + return s + +def parse_ocean(): + content = soup.find('div', attrs = {'id': 'content'}) def findall_title_arch_ok(t): return t.findAll('a', attrs = {'title': title_arch_ok}) @@ -54,34 +137,75 @@ def parse(): if u.name != 'table': return False return len(findall_title_arch_ok(u)) > 1 - archestable = firstarch.findParent(is_archestable) + archestable = content.findChild('table', attrs={'border':'1'}) debug('at',archestable) - arches = findall_title_arch_ok(archestable) - debug('ac',arches) + archsoups = [] + for row in archestable.findAll('tr',recursive=False): + archsoups += row.findAll('td',recursive=False) + debug('ac',archsoups) def is_island(v): return len(v.findAll(text = regexp.compile('.*Large'))) > 0 def arch_up_map(u): return u.findParent(is_island) - arches = map(arch_up_map, arches) - debug('ac2',arches) - for arch in arches: + for arch in archsoups: links = arch.findAll('a', href=True) debug('links',links) + if not links: continue (a,o) = title_arch_info(links[0]['title']) + debug('arch-ocean', (a,o)) assert(o == ocean) - print 'arch', a + assert(a not in arches) + isles = [] for link in links[1:]: debug('link',link) if href_img_re.search(link['href']): continue m = title_any_re.match(link['title']) assert(m.group(2) == ocean) - print 'island', m.group(1) + island = m.group(1) + debug('island', island) + isles.append(island) + isles.sort() + arches[a] = isles + +def output(): + print 'ocean',ocean + al = arches.keys() + al.sort() 
+ for a in al: + print '',a + for island in arches[a]: + print ' ',island def main(): + global ocean + global opts + + pa = OptionParser( +'''usage: .../yppedia-ocean-scraper [--debug] [--chart] OCEAN''') + ao = pa.add_option + + ao('--chart', action='store_true', dest='chart', + help='print chart source rather than arch/island info') + ao('--debug', action='count', dest='debug', default=0, + help='enable debugging output') + ao('--local-html-dir', action='store', dest='localhtml', + help='get yppedia pages from local directory LOCALHTML'+ + ' instead of via HTTP') + + (opts,args) = pa.parse_args() + if len(args) != 1: + print >>sys.stderr, copyright_info + pa.error('need an ocean argument') + ocean = args[0] + fetch() - parse() + if opts.chart: + print parse_chart() + else: + parse_ocean() + output() main()