yarrg/yppedia-ocean-scraper

   1 #!/usr/bin/python
   2
   3 # helper program for getting information from yppedia
   4
   5 # This is part of ypp-sc-tools, a set of third-party tools for assisting
   6 # players of Yohoho Puzzle Pirates.
   7 #
   8 # Copyright (C) 2009 Ian Jackson <ijackson@chiark.greenend.org.uk>
   9 #
  10 # This program is free software: you can redistribute it and/or modify
  11 # it under the terms of the GNU General Public License as published by
  12 # the Free Software Foundation, either version 3 of the License, or
  13 # (at your option) any later version.
  14 #
  15 # This program is distributed in the hope that it will be useful,
  16 # but WITHOUT ANY WARRANTY; without even the implied warranty of
  17 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  18 # GNU General Public License for more details.
  19 #
  20 # You should have received a copy of the GNU General Public License
  21 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
  22 #
  23 # Yohoho and Puzzle Pirates are probably trademarks of Three Rings and
  24 # are used without permission.  This program is not endorsed or
  25 # sponsored by Three Rings.
  26
  27 copyright_info = '''
  28 yppedia-ocean-scraper is part of ypp-sc-tools Copyright (C) 2009 Ian Jackson
  29 This program comes with ABSOLUTELY NO WARRANTY; this is free software,
  30 and you are welcome to redistribute it under certain conditions.  For
  31 details, read the top of the yppedia-ocean-scraper file.
  32 '''
  33
  34 import signal
  35 signal.signal(signal.SIGINT, signal.SIG_DFL)
  36
  37 import sys
  38 import os
  39 import urllib
  40 import urllib2
  41 import re as regexp
  42 from optparse import OptionParser
  43 from BeautifulSoup import BeautifulSoup
  44
  45 ocean = None
  46 soup = None
  47 opts = None
  48 arches = {}
  49
  50 def debug(k,v):
  51         if opts.debug:
  52                 print >>sys.stderr, k,`v`
  53
  54 def fetch():
  55         global soup
  56         url = ('http://yppedia.puzzlepirates.com/%s_Ocean' %
  57                 urllib.quote(ocean,''))
  58         debug('fetching',url)
  59         dataf = urllib2.urlopen(url)
  60         debug('fetched',dataf)
  61         soup = BeautifulSoup(dataf)
  62
  63
  64 title_arch_re = regexp.compile('(\\S.*\\S) Archipelago \\((\\S+)\\)$')
  65 title_any_re = regexp.compile('(\\S.*\\S) \((\\S+)\\)$')
  66 href_img_re = regexp.compile('\\.png$')
  67
  68 def title_arch_info(t):
  69         # returns (arch,ocean)
  70         debug('checking',t)
  71         if t is None: return (None,None)
  72         m = title_arch_re.match(t)
  73         if not m: return (None,None)
  74         return m.groups()
  75
  76 def title_arch_ok(t):
  77         (a,o) = title_arch_info(t)
  78         if o is None: return False
  79         return o == ocean
  80
  81 def parse():
  82         firstarch = soup.find('a', attrs = {'title': title_arch_ok})
  83         debug('fa',firstarch)
  84
  85         def findall_title_arch_ok(t):
  86                 return t.findAll('a', attrs = {'title': title_arch_ok})
  87
  88         def is_archestable(u):
  89                 if u.name != 'table': return False
  90                 return len(findall_title_arch_ok(u)) > 1
  91
  92         archestable = firstarch.findParent('table', attrs={'border':'1'})
  93         debug('at',archestable)
  94
  95         archsoups = []
  96         for row in archestable.findAll('tr',recursive=False):
  97                 archsoups += row.findAll('td',recursive=False)
  98         debug('ac',archsoups)
  99
 100         def is_island(v):
 101                 return len(v.findAll(text = regexp.compile('.*Large'))) > 0
 102         def arch_up_map(u):
 103                 return u.findParent(is_island)
 104
 105         for arch in archsoups:
 106                 links = arch.findAll('a', href=True)
 107                 debug('links',links)
 108                 if not links: continue
 109                 (a,o) = title_arch_info(links[0]['title'])
 110                 debug('arch-ocean', (a,o))
 111                 assert(o == ocean)
 112                 assert(a not in arches)
 113                 isles = []
 114                 for link in links[1:]:
 115                         debug('link',link)
 116                         if href_img_re.search(link['href']): continue
 117                         m = title_any_re.match(link['title'])
 118                         assert(m.group(2) == ocean)
 119                         island = m.group(1)
 120                         debug('island', island)
 121                         isles.append(island)
 122                 isles.sort()
 123                 arches[a] = isles
 124
 125 def output():
 126         print 'ocean',ocean
 127         al = arches.keys()
 128         al.sort()
 129         for a in al:
 130                 print '',a
 131                 for island in arches[a]:
 132                         print ' ',island
 133
 134 def main():
 135         global ocean
 136         global opts
 137
 138         pa = OptionParser(
 139                 '''usage: .../yppedia-ocean-scraper [--debug] OCEAN''')
 140         ao = pa.add_option
 141         ao('--debug', action='count', dest='debug', default=0,
 142                 help='enable debugging output')
 143
 144         (opts,args) = pa.parse_args()
 145         if len(args) != 1:
 146                 print >>sys.stderr, copyright_info
 147                 pa.error('need an ocean argument')
 148         ocean = args[0]
 149
 150         fetch()
 151         parse()
 152         output()
 153
 154 main()