chiark / gitweb /
WIP fixes and also do entity conversion
[ypp-sc-tools.db-live.git] / yoweb-scrape
1 #!/usr/bin/python
2
3 import signal
4 signal.signal(signal.SIGINT, signal.SIG_DFL)
5
6 import os
7 import time
8 import urllib
9 import urllib2
10 import errno
11 import sys
12 import re as regexp
13 from optparse import OptionParser
14
15 from BeautifulSoup import BeautifulSoup
16
# Global options namespace; populated by OptionParser in main().
opts = None

# Every duty/puzzle name that can appear on a pirate's yoweb page.
duties = ['Swordfighting', 'Bilging', 'Sailing', 'Rigging', 'Navigating',
          'Battle Navigation', 'Gunning', 'Carpentry', 'Rumble',
          'Treasure Haul', 'Drinking', 'Spades', 'Hearts', 'Treasure Drop',
          'Poker', 'Distilling', 'Alchemistry', 'Shipwrightery',
          'Blacksmithing', 'Foraging']

# Standing names in ascending order; a pirate's standing is stored as
# an index into this list.
standingvals = ['Able', 'Distinguished', 'Respected', 'Master', 'Renowned',
                'Grand-Master', 'Legendary', 'Ultimate']
26
def debug(m):
        # Emit one diagnostic line to stderr, but only when --debug is on.
        if not opts.debug:
                return
        sys.stderr.write('%s\n' % m)
30
class Fetcher:
        """Disk-cached, rate-limited fetcher of yoweb pages.

        Pages are cached in cachedir as '#<quoted-url>#' files and reused
        until they are opts.max_age seconds old.
        """
        def __init__(self, ocean, cachedir):
                debug('Fetcher init %s' % cachedir)
                self.ocean = ocean
                self.cachedir = cachedir
                try: os.mkdir(cachedir)
                except (OSError, IOError) as oe:
                        # A pre-existing cache directory is fine.
                        if oe.errno != errno.EEXIST: raise

        def _rate_limit_cache_clean(self, now):
                # Expire stale cache entries, then sleep long enough that the
                # n-th most recent fetch is at least ~2^n seconds old (a
                # geometric back-off against hammering the server).
                ages = []
                for leaf in os.listdir(self.cachedir):
                        if not leaf.startswith('#'): continue
                        # BUGFIX: stat/remove the full path; the bare leafname
                        # was only correct when cwd happened to be cachedir.
                        path = os.path.join(self.cachedir, leaf)
                        try: s = os.stat(path)
                        except (OSError, IOError) as oe:
                                if oe.errno != errno.ENOENT: raise
                                continue
                        age = now - s.st_mtime
                        if age > opts.max_age:
                                debug('Fetcher expire %d %s' % (age, path))
                                try: os.remove(path)
                                except (OSError, IOError) as oe:
                                        if oe.errno != errno.ENOENT: raise
                                continue
                        ages.append(age)
                ages.sort()
                debug('Fetcher ages ' + repr(ages))
                min_age = 1
                need_wait = 0
                for age in ages:
                        if age < min_age:
                                debug('Fetcher morewait min=%d age=%d' %
                                        (min_age, age))
                                # BUGFIX: was age - min_age, which is always
                                # negative here, so no wait ever happened.
                                need_wait = max(need_wait, min_age - age)
                        min_age *= 2
                        min_age += 1
                if need_wait:
                        debug('Fetcher wait %d' % need_wait)
                        # BUGFIX: os.sleep does not exist; use time.sleep.
                        time.sleep(need_wait)

        def fetch(self, url):
                """Return the body of url, from cache when fresh enough."""
                debug('Fetcher fetch %s' % url)
                cache_corename = urllib.quote_plus(url)
                cache_item = "%s/#%s#" % (self.cachedir, cache_corename)
                try: f = open(cache_item, 'r')
                except (OSError, IOError) as oe:
                        if oe.errno != errno.ENOENT: raise
                        f = None
                now = time.time()
                if f is not None:
                        s = os.fstat(f.fileno())
                        if now > s.st_mtime + opts.max_age:
                                debug('Fetcher  stale')
                                f = None
                if f is not None:
                        data = f.read()
                        f.close()
                        debug('Fetcher  cached')
                        return data

                debug('Fetcher  fetch')
                self._rate_limit_cache_clean(now)

                stream = urllib2.urlopen(url)
                data = stream.read()
                # Write via a pid-unique temp file, then rename, so a
                # concurrent scraper never sees a half-written cache entry.
                cache_tmp = "%s/#%s~%d#" % (
                        self.cachedir, cache_corename, os.getpid())
                f = open(cache_tmp, 'w')
                f.write(data)
                f.close()
                os.rename(cache_tmp, cache_item)
                debug('Fetcher  stored')
                return data

        def yoweb(self, kind, tail):
                # Convenience wrapper: fetch a yoweb page on our ocean.
                url = 'http://%s.puzzlepirates.com/yoweb/%s%s' % (
                        self.ocean, kind, tail)
                return self.fetch(url)
109
class SoupLog:
        """Accumulator for human-readable scrape-problem messages."""
        def __init__(self):
                self.msgs = []

        def msg(self, m):
                # Record one problem message.
                self.msgs.append(m)

        def soupm(self, obj, m):
                # Record a message annotated with the soup object it concerns.
                self.msg('%s; in %s' % (m, repr(obj)))

        def needs_msgs(self, child_souplog):
                # Absorb, and clear, all messages from a child SoupLog.
                self.msgs.extend(child_souplog.msgs)
                child_souplog.msgs = []
120
class PirateInfo(SoupLog):
        # Public data members:
        #  pi.standings = { 'Treasure Haul': <index into standingvals> ... }
        #    (indices, not names -- the old comment showing 'Able' was wrong)
        #  pi.crew = (id, name)
        #  pi.flag = (id, name)
        #  pi.msgs = [ 'message describing problem with scrape' ]

        def _find_standings(self):
                # Each duty's standing sits next to a stat image whose alt
                # text names the duty; the sibling cell's text is either
                # ".../Standing" or ".../... (ocean-wide Standing)".
                imgs = self.soup.findAll('img',
                        src=regexp.compile('/yoweb/images/stat.*'))
                re = regexp.compile(
u'\\s*\\S*/([-A-Za-z]+)\\s*$|\\s*\\S*/\\S*\\s*\\(ocean\\-wide(?:\\s|\\xa0)+([-A-Za-z]+)\\)\\s*$'
                        )
                standings = { }

                for skill in duties:
                        standings[skill] = [ ]

                # Collect problems in a child log; only propagate them if
                # some duty ends up unresolved (see below).
                skl = SoupLog()

                for img in imgs:
                        try: duty = img['alt']
                        except KeyError: continue

                        if not duty in duties:
                                skl.soupm(img, 'unknown duty: "%s"' % duty)
                                continue
                        key = img.findParent('td')
                        if key is None:
                                skl.soupm(img, 'duty at root! "%s"' % duty)
                                continue
                        valelem = key.findNextSibling('td')
                        if valelem is None:
                                skl.soupm(key, 'duty missing sibling "%s"'
                                        % duty)
                                continue
                        valstr = ''.join(valelem.findAll(text=True))
                        match = re.match(valstr)
                        if match is None:
                                skl.soupm(key, ('duty "%s" unparseable'+
                                        ' standing "%s"') % (duty, valstr))
                                continue
                        # Whichever regexp alternative matched captured
                        # the standing name.
                        standing = match.group(match.lastindex)
                        standings[duty].append(standing)

                self.standings = { }

                for duty in duties:
                        sl = standings[duty]
                        if len(sl) > 1:
                                skl.msg('duty "%s" multiple standings %s' %
                                                (duty, repr(sl)))
                                continue
                        if not len(sl):
                                skl.msg('duty "%s" no standing found' % duty)
                                continue
                        standing = sl[0]
                        # BUGFIX: was range(0, len(standingvals)-1), which
                        # skipped the last entry so 'Ultimate' was always
                        # reported as an unknown standing.
                        for i in range(len(standingvals)):
                                if standing == standingvals[i]:
                                        self.standings[duty] = i
                        if not duty in self.standings:
                                skl.msg('duty "%s" unknown standing "%s"' %
                                        (duty, standing))

                # Propagate the collected problems if any duty failed to
                # resolve.  (Replaces a loop that set an unused flag and
                # re-called needs_msgs per missing duty; needs_msgs drains
                # skl, so one call is enough.)
                for duty in duties:
                        if not duty in self.standings:
                                self.needs_msgs(skl)
                                break

        def _find_crewflag(self, cf, yoweb_re):
                # Find the single <a href> matching yoweb_re and pull the
                # crew/flag id out of its ?...id= query parameter.
                # Returns (id, name) or None (with a message logged).
                things = self.soup.findAll('a', href=regexp.compile(yoweb_re))
                if len(things) != 1:
                        self.msg('zero or several %s id references found' % cf)
                        return None
                thing = things[0]
                id_re = '\\b%sid\\=(\\w+)$' % cf
                id_haystack = thing['href']
                match = regexp.compile(id_re).search(id_haystack)
                if match is None:
                        self.soupm(thing, ('incomprehensible %s id ref'+
                                ' (%s in %s)') % (cf, id_re, id_haystack))
                        return None
                name = ''.join(thing.findAll(text=True))
                return (match.group(1), name)

        def __init__(self, pirate):
                """Fetch and scrape the yoweb page for the named pirate."""
                SoupLog.__init__(self)

                html = fetcher.yoweb('pirate.wm?target=', pirate)
                self.soup = BeautifulSoup(html,
                        convertEntities=BeautifulSoup.HTML_ENTITIES
                        )

                self._find_standings()

                self.crew = self._find_crewflag('crew',
                        '^/yoweb/crew/info\\.wm')
                self.flag = self._find_crewflag('flag',
                        '^/yoweb/flag/info\\.wm')

        def __str__(self):
                return repr((self.crew, self.flag, self.standings, self.msgs))
223
def main():
        """Parse command-line options, set up the fetcher, and run.

        Currently only runs the built-in test scrape of one pirate.
        """
        global opts, fetcher

        pa = OptionParser(
'''usage: .../yoweb-scrape [OPTION...] ACTION [ARGS...]
actions:
 yoweb-scrape [--ocean OCEAN ...] pirate PIRATE
 yoweb-scrape [--ocean OCEAN ...] crew-of PIRATE
 yoweb-scrape [--ocean OCEAN ...] dutytab-crew-of PIRATE
''')
        ao = pa.add_option
        ao('-O','--ocean',dest='ocean', metavar='OCEAN',
                default='ice',
                help='select ocean OCEAN')
        ao('--cache-dir', dest='cache_dir', metavar='DIR',
                default='~/.yoweb-scrape-cache',
                help='cache yoweb pages in DIR')
        ao('-D','--debug', action='store_true', dest='debug', default=False,
                help='enable debugging output')
        ao('-q','--quiet', action='store_true', dest='quiet',
                help='suppress warning output')
        (opts,args) = pa.parse_args()

        # fixed parameters
        opts.max_age = 240
        # BUGFIX: use expanduser rather than hand-splicing $HOME, which
        # crashed (None + str) when HOME was unset and ignored '~user'.
        opts.cache_dir = os.path.expanduser(opts.cache_dir)

        fetcher = Fetcher(opts.ocean, opts.cache_dir)

        # test program:
        test = PirateInfo('Anaplian')
        print(test)

if __name__ == '__main__':
        main()