chiark / gitweb /
104a408047ba15adfeaa7d09de4c015827d3f03d
[ypp-sc-tools.db-live.git] / yarrg / yppedia-ocean-scraper
1 #!/usr/bin/python
2
3 import signal
4 signal.signal(signal.SIGINT, signal.SIG_DFL)
5
6 import os
7 import urllib
8 import urllib2
9 import re as regexp
10 #from optparse import OptionParser
11
12 from BeautifulSoup import BeautifulSoup
13
# Name of the ocean whose YPPedia page is scraped; it is also the ocean
# name that link titles must carry in their trailing parentheses.
ocean = "Opal"

# Parsed page; stays None until fetch() has stored a BeautifulSoup tree.
soup = None
16
def debug(k,v):
        """Tracing hook; does nothing and returns None.

        k is a short label and v the value that would be shown.  To turn
        tracing on, re-enable the print statement kept below.
        """
        # print k,`v`
20
def fetch():
        """Download the YPPedia page for ``ocean`` and parse it into ``soup``.

        Builds the page URL from the module-level ``ocean`` (URL-quoted with
        no characters treated as safe), opens it with urllib2 and stores the
        resulting BeautifulSoup tree in the module-level ``soup``.  Returns
        nothing; errors raised while opening or parsing propagate.
        """
        global soup
        url = ('http://yppedia.puzzlepirates.com/%s_Ocean' %
                urllib.quote(ocean,''))
        dataf = urllib2.urlopen(url)
        try:
                soup = BeautifulSoup(dataf)
        finally:
                # Always release the HTTP response, even if parsing fails;
                # previously it was left open until garbage collection.
                dataf.close()
27
28
# Patterns applied to link attributes on the ocean page.  Raw strings are
# used throughout so that every backslash reaches the regex engine as
# written (the old '\(' relied on Python passing an unknown string escape
# through unchanged, which newer interpreters warn about).

# "<arch> Archipelago (<ocean>)": groups are (arch, ocean); the arch name
# is a single run of non-space characters.
title_arch_re = regexp.compile(r'(\S+) Archipelago \((\S+)\)$')
# "<name> (<ocean>)": groups are (name, ocean); the name may contain spaces
# but must begin and end with a non-space character.
title_any_re = regexp.compile(r'(\S.*\S) \((\S+)\)$')
# Matches an href that ends in ".png".
href_img_re = regexp.compile(r'\.png$')
32
def title_arch_info(t):
        """Split an archipelago link title into (arch, ocean).

        t is a title string or None.  Returns the two groups captured by
        title_arch_re when t matches it from the start, and (None, None)
        when t is None or does not match.
        """
        debug('checking',t)
        found = title_arch_re.match(t) if t is not None else None
        if found is None:
                return (None,None)
        return found.groups()
40
def title_arch_ok(t):
        """Return True iff t is an archipelago title naming the current ocean.

        t is passed to title_arch_info(); the result is False when no ocean
        could be extracted, otherwise whether that ocean equals ``ocean``.
        """
        (_arch,found_ocean) = title_arch_info(t)
        return found_ocean is not None and found_ocean == ocean
45
def parse():
        """Print the archipelagoes and islands listed in the fetched page.

        Reads the module-level ``soup`` (set by fetch()) and ``ocean``.
        For each archipelago section found it prints ``arch <name>``,
        followed by ``island <name>`` for every further link in that section
        whose href does not end in ".png".

        Raises ValueError when the page does not have the structure this
        scraper expects.  Explicit checks are used instead of ``assert`` so
        that they still run under ``python -O``.
        """
        def fail(what):
                # Single place to report any structural surprise.
                raise ValueError('unexpected page structure for %s ocean: %s'
                                 % (ocean, what))

        # Any link whose title is "<arch> Archipelago (<ocean>)" serves as
        # the starting point for locating the table of archipelagoes.
        firstarch = soup.find('a', attrs = {'title': title_arch_ok})
        debug('fa',firstarch)
        if firstarch is None:
                fail('no archipelago link found')

        def findall_title_arch_ok(t):
                return t.findAll('a', attrs = {'title': title_arch_ok})

        def is_archestable(u):
                # An enclosing <table> holding more than one archipelago link.
                if u.name != 'table': return False
                return len(findall_title_arch_ok(u)) > 1

        archestable = firstarch.findParent(is_archestable)
        debug('at',archestable)
        if archestable is None:
                fail('no table containing several archipelago links')

        arches = findall_title_arch_ok(archestable)
        debug('ac',arches)

        # Compiled once here instead of on every is_island() call.
        large_re = regexp.compile('.*Large')

        def is_island(v):
                # True for an element containing text that matches large_re.
                # NOTE(review): presumably the "Large" island-size heading
                # marks an archipelago's section - confirm against the page.
                return len(v.findAll(text = large_re)) > 0
        def arch_up_map(u):
                return u.findParent(is_island)
        arches = [arch_up_map(u) for u in arches]
        debug('ac2',arches)

        for arch in arches:
                if arch is None:
                        fail('archipelago link has no enclosing section')
                links = arch.findAll('a', href=True)
                debug('links',links)
                if not links:
                        fail('archipelago section contains no links')
                # The first link of a section must be the archipelago's own;
                # a missing title yields (None, None) and is rejected below.
                (a,o) = title_arch_info(links[0].get('title'))
                if o != ocean:
                        fail('section does not start with an archipelago link')
                # Single-argument parenthesised print behaves the same under
                # the print statement and the print function.
                print('arch %s' % a)
                for link in links[1:]:
                        debug('link',link)
                        if href_img_re.search(link['href']): continue
                        title = link.get('title')
                        m = None
                        if title is not None:
                                m = title_any_re.match(title)
                        if m is None or m.group(2) != ocean:
                                fail('island link with unexpected title %r'
                                     % (title,))
                        print('island %s' % m.group(1))
82
def main():
        """Run the scraper: fetch the ocean page, then print its contents."""
        # Order matters: parse() reads the tree that fetch() stores.
        fetch()
        parse()

# Script entry point.
main()