# Restore the default SIGINT handler so ^C kills the script immediately
# instead of raising KeyboardInterrupt.
4 signal.signal(signal.SIGINT, signal.SIG_DFL)
13 from optparse import OptionParser
# BeautifulSoup 3 (Python 2-era HTML parser; module name differs from bs4).
15 from BeautifulSoup import BeautifulSoup
# Duty-puzzle names exactly as they appear on a pirate's yoweb page.
duties = [
    'Swordfighting', 'Bilging', 'Sailing', 'Rigging', 'Navigating',
    'Battle Navigation', 'Gunning', 'Carpentry', 'Rumble', 'Treasure Haul',
    'Drinking', 'Spades', 'Hearts', 'Treasure Drop', 'Poker', 'Distilling',
    'Alchemistry', 'Shipwrightery', 'Blacksmithing', 'Foraging',
]

# Standing names in ascending order; the list index is the numeric rank.
standingvals = [
    'Able', 'Distinguished', 'Respected', 'Master', 'Renowned',
    'Grand-Master', 'Legendary', 'Ultimate',
]
# Fetcher.__init__: remember the cache directory and make sure it exists.
# NOTE(review): this is a gapped listing -- original line 34 (presumably
# "self.ocean = ocean", since self.ocean is used in yoweb()) is not visible.
32 def __init__(self, ocean, cachedir):
33 debug('Fetcher init %s' % cachedir)
35 self.cachedir = cachedir
# EAFP: create the cache dir, tolerating only "already exists".
36 try: os.mkdir(cachedir)
37 except (OSError,IOError), oe:
38 if oe.errno != errno.EEXIST: raise
# Scan the cache directory: expire entries older than opts.max_age and
# work out how long to sleep so fetches respect the rate limit.
# NOTE(review): gapped listing -- original lines 41, 47, 51, 54-56,
# 58-61, 63, 65-67 are not visible here.
40 def _rate_limit_cache_clean(self, now):
42 for path in os.listdir(self.cachedir):
# Cache entries are named '#...#'; skip anything else (e.g. temp files).
43 if not path.startswith('#'): continue
# NOTE(review): stats the bare listdir name, not a cachedir-relative
# path; only correct if CWD is the cache dir -- confirm in missing lines.
44 try: s = os.stat(path)
45 except (OSError,IOError), oe:
46 if oe.errno != errno.ENOENT: raise
48 age = now - s.st_mtime
49 if age > opts.max_age:
50 debug('Fetcher expire %d %s' % (age, path))
# Removal races with other processes: a vanished file is fine.
52 except (OSError,IOError), oe:
53 if oe.errno != errno.ENOENT: raise
57 debug('Fetcher ages ' + `ages`)
62 debug('Fetcher morewait min=%d age=%d' %
# Wait until the youngest cached fetch is at least min_age old.
64 need_wait = max(need_wait, age - min_age)
68 debug('Fetcher wait %d' % need_wait)
# Fragment of Fetcher.fetch(url): serve the page from the on-disk cache
# when fresh, otherwise fetch via urllib2 and store it atomically.
# NOTE(review): the def line (orig ~70-71) and several body lines
# (78-80, 84-87, 89-90, 93, 95, 99-100, 103) are missing from this view.
72 debug('Fetcher fetch %s' % url)
# quote_plus makes the URL safe to use as a filename component.
73 cache_corename = urllib.quote_plus(url)
74 cache_item = "%s/#%s#" % (self.cachedir, cache_corename)
# EAFP open of the cache entry; only "no such file" falls through to fetch.
75 try: f = file(cache_item, 'r')
76 except (OSError,IOError), oe:
77 if oe.errno != errno.ENOENT: raise
# fstat the open handle (not the path) to decide staleness race-free.
81 s = os.fstat(f.fileno())
82 if now > s.st_mtime + opts.max_age:
83 debug('Fetcher stale')
88 debug('Fetcher cached')
91 debug('Fetcher fetch')
# Expire old entries and honour the rate limit before hitting the network.
92 self._rate_limit_cache_clean(now)
94 stream = urllib2.urlopen(url)
# Write to a pid-suffixed temp name, then rename over the real name:
# atomic cache update, safe against concurrent fetchers.
96 cache_tmp = "%s/#%s~%d#" % (
97 self.cachedir, cache_corename, os.getpid())
98 f = file(cache_tmp, 'w')
101 os.rename(cache_tmp, cache_item)
102 debug('Fetcher stored')
def yoweb(self, kind, tail):
    """Fetch one yoweb page for this ocean.

    kind and tail are concatenated to form the path under /yoweb/
    (e.g. kind='pirate.wm?target=', tail=<pirate name>).
    """
    base = 'http://%s.puzzlepirates.com/yoweb/' % self.ocean
    return self.fetch(base + kind + tail)
def soupm(self, obj, m):
    """Log message m, annotated with the repr of the soup element obj."""
    self.msg('%s; in %r' % (m, obj))
def needs_msgs(self, child_souplog):
    """Adopt (move) all accumulated messages from child_souplog into self."""
    self.msgs.extend(child_souplog.msgs)
    child_souplog.msgs = []
# Scrapes a single pirate's yoweb page into structured data.
121 class PirateInfo(SoupLog):
122 # Public data members:
123 # pi.standings = { 'Treasure Haul': 'Able' ... }
# NOTE(review): the example above looks stale -- _find_standings stores
# the numeric index into standingvals (self.standings[duty] = i), not
# the standing name. Confirm and update.
124 # pi.crew = (id, name)
125 # pi.flag = (id, name)
126 # pi.msgs = [ 'message describing problem with scrape' ]
# Parse the duty-standings table from self.soup into self.standings
# (duty name -> numeric rank, the index into standingvals).
# NOTE(review): gapped listing -- many original lines (131, 133-136,
# 138-141, 144, 147, 149, 151, 153, 155-156, 159, 162, 165-170,
# 172-174, 176-177, 183-184, 186, 188-189) are not visible here.
128 def _find_standings(self):
# Standing rows are marked by /yoweb/images/stat* icons.
129 imgs = self.soup.findAll('img',
130 src=regexp.compile('/yoweb/images/stat.*'))
# Pattern text: matches either ".../Standing" or
# "... (ocean-wide Standing)", capturing the standing name.
132 u'\\s*\\S*/([-A-Za-z]+)\\s*$|\\s*\\S*/\\S*\\s*\\(ocean\\-wide(?:\\s|\\xa0)+([-A-Za-z]+)\\)\\s*$'
137 standings[skill] = [ ]
# The img alt attribute names the duty; icons without alt are skipped.
142 try: duty = img['alt']
143 except KeyError: continue
145 if not duty in duties:
146 skl.soupm(img, 'unknown duty: "%s"' % duty)
# The icon's enclosing <td> is the key cell; its <td> sibling holds
# the standing text.
148 key = img.findParent('td')
150 skl.soupm(img, 'duty at root! "%s"' % duty)
152 valelem = key.findNextSibling('td')
154 skl.soupm(key, 'duty missing sibling "%s"'
157 valstr = ''.join(valelem.findAll(text=True))
# 're' here is a compiled pattern object (the re module is imported
# as 'regexp' in this file), not the stdlib re module.
158 match = re.match(valstr)
160 skl.soupm(key, ('duty "%s" unparseable'+
161 ' standing "%s"') % (duty, valstr))
# Whichever alternative matched, take its captured group.
163 standing = match.group(match.lastindex)
164 standings[duty].append(standing)
171 skl.msg('duty "%s" multiple standings %s' %
175 skl.msg('duty "%s" no standing found' % duty)
# Map the standing name to its numeric rank.
# NOTE(review): range(0, len(standingvals)-1) never tests the final
# entry ('Ultimate'), which would then be reported as unknown --
# looks like an off-by-one; confirm against the full source.
178 for i in range(0, len(standingvals)-1):
179 if standing == standingvals[i]:
180 self.standings[duty] = i
181 if not duty in self.standings:
182 skl.msg('duty "%s" unknown standing "%s"' %
185 all_standings_ok = True
187 if not duty in self.standings:
# Find the single crew/flag link on the page and return (id, name),
# logging a scrape problem otherwise. cf is 'crew' or 'flag';
# yoweb_re matches the link's href.
# NOTE(review): gapped listing -- original lines 192, 194-195, 199
# and 202 are missing (including, presumably, the len(things) test,
# "thing = things[0]", and the failure returns).
190 def _find_crewflag(self, cf, yoweb_re):
191 things = self.soup.findAll('a', href=regexp.compile(yoweb_re))
193 self.msg('zero or several %s id references found' % cf)
# Extract the crewid=/flagid= value from the link target.
196 id_re = '\\b%sid\\=(\\w+)$' % cf
197 id_haystack = thing['href']
198 match = regexp.compile(id_re).search(id_haystack)
200 self.soupm(thing, ('incomprehensible %s id ref'+
201 ' (%s in %s)') % (cf, id_re, id_haystack))
# The link text is the crew/flag display name.
203 name = ''.join(thing.findAll(text=True))
204 return (match.group(1), name)
# PirateInfo.__init__: fetch the named pirate's yoweb page via the
# module-level fetcher, parse it, and populate standings/crew/flag.
# NOTE(review): gapped listing -- original lines 208, 212-213, 215 and
# 220-221 are missing (including the "def __repr__" line whose body
# appears at the end of this span).
206 def __init__(self, pirate):
207 SoupLog.__init__(self)
209 html = fetcher.yoweb('pirate.wm?target=', pirate)
210 self.soup = BeautifulSoup(html,
# Convert HTML entities up front so scraped text compares cleanly.
211 convertEntities=BeautifulSoup.HTML_ENTITIES
214 self._find_standings()
216 self.crew = self._find_crewflag('crew',
217 '^/yoweb/crew/info\\.wm')
218 self.flag = self._find_crewflag('flag',
219 '^/yoweb/flag/info\\.wm')
# Body of __repr__ (backticks are Python 2 repr; def line not visible).
222 return `(self.crew, self.flag, self.standings, self.msgs)`
# main(): usage text, option parsing, and script entry actions.
# NOTE(review): gapped listing -- the enclosing def line and original
# lines 229, 233-234, 236, 246-248, 251, 253-254 are not visible here.
228 '''usage: .../yoweb-scrape [OPTION...] ACTION [ARGS...]
230 yoweb-scrape [--ocean OCEAN ...] pirate PIRATE
231 yoweb-scrape [--ocean OCEAN ...] crew-of PIRATE
232 yoweb-scrape [--ocean OCEAN ...] dutytab-crew-of PIRATE
# 'ao' is presumably a local alias for pa.add_option (pa being the
# OptionParser) -- the binding lines are missing; confirm upstream.
235 ao('-O','--ocean',dest='ocean', metavar='OCEAN',
237 help='select ocean OCEAN')
238 ao('--cache-dir', dest='cache_dir', metavar='DIR',
239 default='~/.yoweb-scrape-cache',
240 help='cache yoweb pages in DIR')
241 ao('-D','--debug', action='store_true', dest='debug', default=False,
242 help='enable debugging output')
243 ao('-q','--quiet', action='store_true', dest='quiet',
244 help='suppress warning output')
245 (opts,args) = pa.parse_args()
# Hand-rolled '~/' expansion for the cache path.
# NOTE(review): os.getenv('HOME') can be None (TypeError on +), and a
# bare '~' is not handled -- os.path.expanduser would be more robust.
249 if opts.cache_dir.startswith('~/'):
250 opts.cache_dir = os.getenv('HOME') + opts.cache_dir[1:]
# Module-level fetcher used by PirateInfo.__init__.
252 fetcher = Fetcher(opts.ocean, opts.cache_dir)
# Smoke-test scrape of a known pirate.
255 test = PirateInfo('Anaplian')