chiark / gitweb /
WIP scraper still not working right
[ypp-sc-tools.web-live.git] / yarrg / yppedia-ocean-scraper
1 #!/usr/bin/python
2
3 import signal
# Install the default SIGINT disposition (SIG_DFL) in place of Python's
# own handler, so an interrupt ends the process directly instead of being
# turned into a KeyboardInterrupt exception.
4 signal.signal(signal.SIGINT, signal.SIG_DFL)
5
6 import os
7 import urllib
8 import urllib2
9 import re as regexp
10 #from optparse import OptionParser
11
12 from BeautifulSoup import BeautifulSoup
13
# Name of the ocean to scrape; fetch() builds the page URL from it and
# title_arch_ok() / parse() compare link titles against it.
ocean = "Opal"

# Parsed page tree; stays None until fetch() has stored a result here.
soup = None
16
def debug(k, v):
        """Debugging hook that currently does nothing.

        k is a short label and v the value being traced; both are
        ignored and None is returned.
        """
        return None
19
def fetch():
        """Download the YPPedia page for the current ocean and parse it.

        The URL is built from the module-level ``ocean`` name, percent-quoted
        with no characters treated as safe.  The page is retrieved with
        urllib2 and the resulting BeautifulSoup tree is stored in the
        module-level ``soup``.  Exceptions raised by urllib2.urlopen or by
        BeautifulSoup are not caught here.
        """
        global soup
        url = ('http://yppedia.puzzlepirates.com/%s_Ocean' %
                urllib.quote(ocean,''))
        dataf = urllib2.urlopen(url)
        try:
                # BeautifulSoup reads the whole response while constructing
                # the tree, so the handle is no longer needed afterwards.
                soup = BeautifulSoup(dataf)
        finally:
                # Release the HTTP connection even if parsing fails.
                dataf.close()
26
27
def title_arch_arch(t):
        """Extract the parenthesised word from an archipelago link title.

        t is a title string or None.  When t matches title_arch_re from
        its start, the word captured inside the trailing parentheses is
        returned (title_arch_ok compares that word with ``ocean``).
        Returns None when t is None or does not match.
        """
        debug('checking',t)
        found = None
        if t is not None:
                found = title_arch_re.match(t)
        if found:
                return found.group(1)
        return None
34
def title_arch_ok(t):
        """Tell whether t is an archipelago title belonging to ``ocean``.

        Returns True only when title_arch_arch(t) yields a value and that
        value equals the module-level ``ocean``; otherwise False.  parse()
        passes this function to BeautifulSoup as a 'title' attribute
        matcher.
        """
        found = title_arch_arch(t)
        return found is not None and found == ocean
39
# Raw strings keep every backslash literal, so the patterns read exactly as
# the regex engine sees them and no escape depends on Python passing an
# unrecognised '\(' through unchanged.

# "<word> Archipelago (<word>)": group 1 is the word inside the parentheses.
title_arch_re = regexp.compile(r'\S+ Archipelago \((\S+)\)$')
# "<name> (<word>)": group 1 is the name (at least two characters, may
# contain spaces), group 2 the word inside the parentheses.
title_any_re = regexp.compile(r'(\S.*\S) \((\S+)\)$')
# hrefs ending in ".png"; parse() skips links whose href matches.
href_img_re = regexp.compile(r'\.png$')
43
44 def parse():
45         firstarch = soup.find('a', attrs = {'title': title_arch_ok})
46         debug('fa',firstarch)
47
48         def findall_title_arch_ok(t):
49                 return t.findAll('a', attrs = {'title': title_arch_ok})
50
51         def is_archestable(u):
52                 if u.name != 'table': return False
53                 return len(findall_title_arch_ok(u)) > 1
54
55         archestable = firstarch.findParent(is_archestable)
56         debug('at',archestable)
57
58         arches = findall_title_arch_ok(archestable)
59         debug('ac',arches)
60
61         def is_island(v):
62                 return len(v.findAll(text = regexp.compile('.*Large'))) > 0
63         def arch_up_map(u):
64                 return u.findParent(is_island)
65         arches = map(arch_up_map, arches)
66         debug('ac2',arches)
67
68         for arch in arches:
69                 links = arch.findAll('a', href=True)
70                 debug('links',links)
71                 a = title_arch_arch(links[0]['title'])
72                 assert(a)
73                 print 'arch', a
74                 for link in links[1:]:
75                         debug('link',link)
76                         if href_img_re.search(link['href']): continue
77                         m = title_any_re.match(link['title'])
78                         assert(m.group(2) == ocean)
79                         print 'island', m.group(1)
80
def main():
        """Run the scraper: fetch() the ocean page, then parse() it."""
        fetch()
        parse()

# Run only when executed as a script, so that importing this file does not
# trigger a network fetch.
if __name__ == '__main__':
        main()