yarrg/yppedia-ocean-scraper

   1 #!/usr/bin/python
   2
   3 # helper program for getting information from yppedia
   4
   5 # This is part of the YARRG website.  YARRG is a tool and website
   6 # for assisting players of Yohoho Puzzle Pirates.
   7 #
   8 # Copyright (C) 2009 Ian Jackson <ijackson@chiark.greenend.org.uk>
   9 #
  10 # This program is free software: you can redistribute it and/or modify
  11 # it under the terms of the GNU Affero General Public License as
  12 # published by the Free Software Foundation, either version 3 of the
  13 # License, or (at your option) any later version.
  14 #
  15 # This program is distributed in the hope that it will be useful,
  16 # but WITHOUT ANY WARRANTY; without even the implied warranty of
  17 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  18 # GNU Affero General Public License for more details.
  19 #
  20 # You should have received a copy of the GNU Affero General Public License
  21 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
  22 #
  23 # Yohoho and Puzzle Pirates are probably trademarks of Three Rings and
  24 # are used without permission.  This program is not endorsed or
  25 # sponsored by Three Rings.
  26
# Licence notice.  main() prints this to stderr when the command line
# does not consist of exactly one OCEAN argument.
copyright_info = '''
yppedia-ocean-scraper is part of ypp-sc-tools Copyright (C) 2009 Ian Jackson
This program comes with ABSOLUTELY NO WARRANTY; this is free software,
and you are welcome to redistribute it under certain conditions.  For
details, read the top of the yppedia-ocean-scraper file.
'''
  33
  34 import signal
# Put SIGINT back to the system default action, so that an interrupt
# terminates the process instead of raising KeyboardInterrupt.
signal.signal(signal.SIGINT, signal.SIG_DFL)
  36
  37 import sys
  38 import os
  39 import urllib
  40 import re as regexp
  41 import subprocess
  42 from optparse import OptionParser
  43 from BeautifulSoup import BeautifulSoup
  44
  45
# Make sys.stdout encode what is written to it using the locale's
# preferred encoding:
  47 import codecs
  48 import locale
  49 def fix_stdout():
  50     sys.stdout = codecs.EncodedFile(sys.stdout, locale.getpreferredencoding())
  51     def null_decode(input, errors='strict'):
  52         return input, len(input)
  53     sys.stdout.decode = null_decode
# fix_stdout is taken from
#  http://ewx.livejournal.com/457086.html?thread=3016574
# lightly modified.
# See also Debian #415968.
fix_stdout()
  60
  61
  62 # User agent:
  63 class YarrgURLopener(urllib.FancyURLopener):
  64         base_version= urllib.URLopener().version
  65         proc= subprocess.Popen(
  66                 ["./database-info-fetch", "useragentstringmap",
  67                  base_version, "manual islands/topology fetch"],
  68                 shell=False,
  69                 stderr=None,
  70                 stdout=subprocess.PIPE,
  71                 )
  72         version = proc.communicate()[0].rstrip('\n');
  73         assert(proc.returncode is not None and proc.returncode == 0)
  74 urllib._urlopener = YarrgURLopener()
  75
# Name of the ocean to look up; set by main() from the command line.
ocean = None
# BeautifulSoup parse of the fetched page; set by fetch().
soup = None
# Parsed command line options; set by main().
opts = None
# Maps archipelago name to its sorted list of island names;
# filled in by parse_ocean().
arches = {}
  80
  81 def debug(k,v):
  82         if opts.debug:
  83                 print >>sys.stderr, k,`v`
  84
  85 def fetch():
  86         global soup
  87         if opts.chart:
  88                 url_base = 'index.php?title=Template:Map:%s_Ocean&action=edit'
  89         else:
  90                 url_base = '%s_Ocean'
  91         url_base = url_base % urllib.quote(ocean,'')
  92         if opts.localhtml is None:
  93                 url = ('http://yppedia.puzzlepirates.com/' + url_base)
  94                 debug('fetching',url)
  95                 dataf = urllib.urlopen(url)
  96                 debug('fetched',dataf)
  97         else:
  98                 dataf = file(opts.localhtml + '/' + url_base, 'r')
  99         soup = BeautifulSoup(dataf)
 100
 101 title_arch_re = regexp.compile('(\\S.*\\S) Archipelago \\((\\S+)\\)$')
 102 title_any_re = regexp.compile('(\\S.*\\S) \((\\S+)\\)$')
 103 href_img_re = regexp.compile('\\.png$')
 104
 105 def title_arch_info(t):
 106         # returns (arch,ocean)
 107         debug('checking',t)
 108         if t is None: return (None,None)
 109         m = title_arch_re.match(t)
 110         if not m: return (None,None)
 111         return m.groups()
 112
 113 def title_arch_ok(t):
 114         (a,o) = title_arch_info(t)
 115         if o is None: return False
 116         return o == ocean
 117
 118 def parse_chart():
 119         ta = soup.find('textarea')
 120         debug('ta',ta)
 121         s = ta.string
 122         debug('s',s)
 123         s = regexp.sub(r'\&lt\;', '<', s)
 124         s = regexp.sub(r'\&gt\;', '>', s)
 125         s = regexp.sub(r'\&quot\;', '"', s)
 126         s = regexp.sub(r'\&amp\;', '&', s)
 127         debug('s',s)
 128         return s
 129
 130 def parse_ocean():
 131         content = soup.find('div', attrs = {'id': 'content'})
 132
 133         def findall_title_arch_ok(t):
 134                 return t.findAll('a', attrs = {'title': title_arch_ok})
 135
 136         def is_archestable(u):
 137                 if u.name != 'table': return False
 138                 return len(findall_title_arch_ok(u)) > 1
 139
 140         archestable = content.findChild('table', attrs={'border':'1'})
 141         debug('at',archestable)
 142
 143         archsoups = []
 144         for row in archestable.findAll('tr',recursive=False):
 145                 archsoups += row.findAll('td',recursive=False)
 146         debug('ac',archsoups)
 147
 148         def is_island(v):
 149                 return len(v.findAll(text = regexp.compile('.*Large'))) > 0
 150         def arch_up_map(u):
 151                 return u.findParent(is_island)
 152
 153         for arch in archsoups:
 154                 links = arch.findAll('a', href=True)
 155                 debug('links',links)
 156                 if not links: continue
 157                 (a,o) = title_arch_info(links[0]['title'])
 158                 debug('arch-ocean', (a,o))
 159                 assert(o == ocean)
 160                 assert(a not in arches)
 161                 isles = []
 162                 for link in links[1:]:
 163                         debug('link',link)
 164                         if href_img_re.search(link['href']): continue
 165                         m = title_any_re.match(link['title'])
 166                         assert(m.group(2) == ocean)
 167                         island = m.group(1)
 168                         debug('island', island)
 169                         isles.append(island)
 170                 isles.sort()
 171                 arches[a] = isles
 172
 173 def output():
 174         print 'ocean',ocean
 175         al = arches.keys()
 176         al.sort()
 177         for a in al:
 178                 print '',a
 179                 for island in arches[a]:
 180                         print ' ',island
 181
 182 def main():
 183         global ocean
 184         global opts
 185
 186         pa = OptionParser(
 187 '''usage: .../yppedia-ocean-scraper [--debug] [--chart] OCEAN''')
 188         ao = pa.add_option
 189
 190         ao('--chart', action='store_true', dest='chart',
 191                 help='print chart source rather than arch/island info')
 192         ao('--debug', action='count', dest='debug', default=0,
 193                 help='enable debugging output')
 194         ao('--local-html-dir', action='store', dest='localhtml',
 195                 help='get yppedia pages from local directory LOCALHTML'+
 196                         ' instead of via HTTP')
 197
 198         (opts,args) = pa.parse_args()
 199         if len(args) != 1:
 200                 print >>sys.stderr, copyright_info
 201                 pa.error('need an ocean argument')
 202         ocean = args[0]
 203
 204         fetch()
 205         if opts.chart:
 206                 print parse_chart()
 207         else:
 208                 parse_ocean()
 209                 output()
 210
 211 main()