yarrg/yppedia-ocean-scraper

   1 #!/usr/bin/python
   2
   3 import signal
   4 signal.signal(signal.SIGINT, signal.SIG_DFL)
   5
   6 import os
   7 import urllib
   8 import urllib2
   9 import re as regexp
  10 #from optparse import OptionParser
  11
  12 from BeautifulSoup import BeautifulSoup
  13
  14 ocean = 'Opal'
  15 soup = None
  16
  17 def debug(k,v):
  18         pass
  19
  20 def fetch():
  21         global soup
  22         url = ('http://yppedia.puzzlepirates.com/%s_Ocean' %
  23                 urllib.quote(ocean,''))
  24         dataf = urllib2.urlopen(url)
  25         soup = BeautifulSoup(dataf)
  26
  27
  28 def title_arch_arch(t):
  29         debug('checking',t)
  30         if t is None: return None
  31         m = title_arch_re.match(t)
  32         if not m: return None
  33         return m.group(1)
  34
  35 def title_arch_ok(t):
  36         a = title_arch_arch(t)
  37         if a is None: return False
  38         return a == ocean
  39
  40 title_arch_re = regexp.compile('\\S+ Archipelago \\((\\S+)\\)$')
  41 title_any_re = regexp.compile('(\\S.*\\S) \((\\S+)\\)$')
  42 href_img_re = regexp.compile('\\.png$')
  43
  44 def parse():
  45         firstarch = soup.find('a', attrs = {'title': title_arch_ok})
  46         debug('fa',firstarch)
  47
  48         def findall_title_arch_ok(t):
  49                 return t.findAll('a', attrs = {'title': title_arch_ok})
  50
  51         def is_archestable(u):
  52                 if u.name != 'table': return False
  53                 return len(findall_title_arch_ok(u)) > 1
  54
  55         archestable = firstarch.findParent(is_archestable)
  56         debug('at',archestable)
  57
  58         arches = findall_title_arch_ok(archestable)
  59         debug('ac',arches)
  60
  61         def is_island(v):
  62                 return len(v.findAll(text = regexp.compile('.*Large'))) > 0
  63         def arch_up_map(u):
  64                 return u.findParent(is_island)
  65         arches = map(arch_up_map, arches)
  66         debug('ac2',arches)
  67
  68         for arch in arches:
  69                 links = arch.findAll('a', href=True)
  70                 debug('links',links)
  71                 a = title_arch_arch(links[0]['title'])
  72                 assert(a)
  73                 print 'arch', a
  74                 for link in links[1:]:
  75                         debug('link',link)
  76                         if href_img_re.search(link['href']): continue
  77                         m = title_any_re.match(link['title'])
  78                         assert(m.group(2) == ocean)
  79                         print 'island', m.group(1)
  80
  81 def main():
  82         fetch()
  83         parse()
  84
  85 main()