chiark / gitweb /
WIP ocean scraper seems to mostly work, need to do arg parsing and IO
[ypp-sc-tools.db-test.git] / yarrg / yppedia-ocean-scraper
1 #!/usr/bin/python
2
3 import signal
# Install the default SIGINT disposition in place of Python's own handler.
signal.signal(signal.SIGINT, signal.SIG_DFL)
5
6 import os
7 import urllib
8 import urllib2
9 import re as regexp
10 #from optparse import OptionParser
11
12 from BeautifulSoup import BeautifulSoup
13
# Name of the ocean to scrape: fetch() puts it in the page URL, and the
# title checks compare the parenthesised suffix of link titles against it.
ocean = 'Opal'
# Parsed page tree; assigned by fetch() and read by parse().
soup = None
16
def debug(k,v):
        """Trace hook taking a label `k` and a value `v`; currently a no-op."""
        # Tracing is switched off.  To enable it, restore the statement below.
        #       print k,`v`
        return None
20
def fetch():
        """Download the YPPedia page for `ocean` and parse it into `soup`.

        The URL is built from the module-level `ocean` name (URL-quoted with
        no characters treated as safe).  The parsed tree is stored in the
        module-level `soup`; nothing is returned.  Errors from urlopen or
        from the parser propagate to the caller.
        """
        global soup
        url = ('http://yppedia.puzzlepirates.com/%s_Ocean' %
                urllib.quote(ocean,''))
        dataf = urllib2.urlopen(url)
        try:
                soup = BeautifulSoup(dataf)
        finally:
                # Release the HTTP response even if parsing raises.
                dataf.close()
27
28
# Raw strings keep the backslashes literal, so every regex escape is written
# once and no pattern depends on Python passing unknown escapes through.

# "<arch> Archipelago (<ocean>)": captures arch and ocean, each a single
# run of non-whitespace characters.
title_arch_re = regexp.compile(r'(\S+) Archipelago \((\S+)\)$')
# "<name> (<ocean>)": name may contain spaces but must start and end with a
# non-whitespace character; ocean is a single run of non-whitespace.
title_any_re = regexp.compile(r'(\S.*\S) \((\S+)\)$')
# Matches hrefs that end in ".png".
href_img_re = regexp.compile(r'\.png$')
32
def title_arch_info(t):
        """Split an archipelago link title into (arch, ocean).

        `t` is a title string or None.  A title matching title_arch_re
        yields its two captured groups; None or a non-matching title
        yields (None, None).
        """
        debug('checking',t)
        nothing = (None,None)
        if t is None:
                return nothing
        found = title_arch_re.match(t)
        if found:
                return found.groups()
        return nothing
40
def title_arch_ok(t):
        """Return True iff `t` is an archipelago title naming the current `ocean`."""
        parsed_ocean = title_arch_info(t)[1]
        return parsed_ocean is not None and parsed_ocean == ocean
45
def parse():
        """Print the archipelagoes and islands found in the parsed page.

        Reads the module-level `soup` (set by fetch()) and `ocean`.  For each
        table cell that contains links, the first link is taken as the
        archipelago and printed as "arch <name>"; every later link whose href
        does not end in ".png" is printed as "island <name>".

        Raises ValueError when the page does not have the expected shape: no
        archipelago link for `ocean`, no enclosing border=1 table, or a link
        title that does not name `ocean`.
        """
        # Any link titled "<arch> Archipelago (<ocean>)" locates the table.
        firstarch = soup.find('a', attrs = {'title': title_arch_ok})
        debug('fa',firstarch)
        if firstarch is None:
                raise ValueError(
                        'no archipelago link found for ocean %r' % (ocean,))

        archestable = firstarch.findParent('table', attrs={'border':'1'})
        debug('at',archestable)
        if archestable is None:
                raise ValueError(
                        'archipelago link is not inside a border=1 table')

        # Only direct rows and direct cells are considered, not nested tables.
        arches = []
        for row in archestable.findAll('tr',recursive=False):
                arches.extend(row.findAll('td',recursive=False))
        debug('ac',arches)

        for arch in arches:
                links = arch.findAll('a', href=True)
                debug('links',links)
                if not links: continue
                arch_title = links[0]['title']
                (a,o) = title_arch_info(arch_title)
                # Explicit check rather than assert, so it survives python -O.
                if o != ocean:
                        raise ValueError(
                                'unexpected archipelago link title %r' %
                                (arch_title,))
                print('arch %s' % a)
                for link in links[1:]:
                        debug('link',link)
                        # Image links are skipped before their title is read.
                        if href_img_re.search(link['href']): continue
                        island_title = link['title']
                        m = title_any_re.match(island_title)
                        if m is None or m.group(2) != ocean:
                                raise ValueError(
                                        'unexpected island link title %r' %
                                        (island_title,))
                        print('island %s' % m.group(1))
83
def main():
        """Fetch the ocean page, then parse it and print what was found."""
        fetch()
        parse()

# Runs unconditionally whenever this file is executed.
main()