From 46a1fd2eb3beef6334ea615c1dc97596ea08c35c Mon Sep 17 00:00:00 2001 From: Ian Jackson Date: Fri, 15 May 2009 19:24:37 +0100 Subject: [PATCH] Can get crew info --- yoweb-scrape | 148 +++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 119 insertions(+), 29 deletions(-) diff --git a/yoweb-scrape b/yoweb-scrape index 4c7cee5..54ef0fe 100755 --- a/yoweb-scrape +++ b/yoweb-scrape @@ -24,6 +24,8 @@ duties = ('Swordfighting/Bilging/Sailing/Rigging/Navigating'+ standingvals = ('Able/Distinguished/Respected/Master/Renowned'+ '/Grand-Master/Legendary/Ultimate').split('/') +pirate_ref_re = regexp.compile('^/yoweb/pirate\\.wm') + def debug(m): if opts.debug: print >>sys.stderr, m @@ -36,37 +38,45 @@ class Fetcher: try: os.mkdir(cachedir) except (OSError,IOError), oe: if oe.errno != errno.EEXIST: raise + self._cache_scan(time.time()) - def _rate_limit_cache_clean(self, now): + def _cache_scan(self, now): + # returns list of ages, unsorted ages = [] - for path in os.listdir(self.cachedir): - if not path.startswith('#'): continue + debug('Fetcher scan_cache') + for leaf in os.listdir(self.cachedir): + if not leaf.startswith('#'): continue + path = self.cachedir + '/' + leaf try: s = os.stat(path) except (OSError,IOError), oe: if oe.errno != errno.ENOENT: raise continue age = now - s.st_mtime if age > opts.max_age: - debug('Fetcher expire %d %s' % (age, path)) + debug('Fetcher expire %d %s' % (age, path)) try: os.remove(path) except (OSError,IOError), oe: if oe.errno != errno.ENOENT: raise continue ages.append(age) + return ages + + def _rate_limit_cache_clean(self, now): + ages = self._cache_scan(now) ages.sort() - debug('Fetcher ages ' + `ages`) + debug('Fetcher ages ' + `ages`) min_age = 1 need_wait = 0 for age in ages: if age < min_age: - debug('Fetcher morewait min=%d age=%d' % + debug('Fetcher morewait min=%d age=%d' % (min_age, age)) - need_wait = max(need_wait, age - min_age) + need_wait = max(need_wait, min_age - age) min_age *= 2 min_age += 1 - if 
need_wait: - debug('Fetcher wait %d' % need_wait) - os.sleep(need_wait) + if need_wait > 0: + debug('Fetcher wait %d' % need_wait) + time.sleep(need_wait) def fetch(self, url): debug('Fetcher fetch %s' % url) @@ -118,12 +128,33 @@ class SoupLog: self.msgs += child_souplog.msgs child_souplog.msgs = [ ] -class PirateInfo(SoupLog): +def soup_text(obj): + str = ''.join(obj.findAll(text=True)) + return str.strip() + +class SomethingSoupInfo(SoupLog): + def __init__(self, kind, tail): + SoupLog.__init__(self) + html = fetcher.yoweb(kind, tail) + self.soup = BeautifulSoup(html, + convertEntities=BeautifulSoup.HTML_ENTITIES + ) + +class PirateInfo(SomethingSoupInfo): # Public data members: # pi.standings = { 'Treasure Haul': 'Able' ... } # pi.crew = (id, name) # pi.flag = (id, name) # pi.msgs = [ 'message describing problem with scrape' ] + + def __init__(self, pirate): + SomethingSoupInfo.__init__(self, + 'pirate.wm?target=', pirate) + self._find_standings() + self.crew = self._find_crewflag('crew', + '^/yoweb/crew/info\\.wm') + self.flag = self._find_crewflag('flag', + '^/yoweb/flag/info\\.wm') def _find_standings(self): imgs = self.soup.findAll('img', @@ -154,7 +185,7 @@ u'\\s*\\S*/([-A-Za-z]+)\\s*$|\\s*\\S*/\\S*\\s*\\(ocean\\-wide(?:\\s|\\xa0)+([-A- skl.soupm(key, 'duty missing sibling "%s"' % duty) continue - valstr = ''.join(valelem.findAll(text=True)) + valstr = soup_text(valelem) match = re.match(valstr) if match is None: skl.soupm(key, ('duty "%s" unparseable'+ @@ -200,26 +231,78 @@ u'\\s*\\S*/([-A-Za-z]+)\\s*$|\\s*\\S*/\\S*\\s*\\(ocean\\-wide(?:\\s|\\xa0)+([-A- self.soupm(thing, ('incomprehensible %s id ref'+ ' (%s in %s)') % (cf, id_re, id_haystack)) return None - name = ''.join(thing.findAll(text=True)) + name = soup_text(thing) return (match.group(1), name) - - def __init__(self, pirate): - SoupLog.__init__(self) - html = fetcher.yoweb('pirate.wm?target=', pirate) - self.soup = BeautifulSoup(html, - convertEntities=BeautifulSoup.HTML_ENTITIES - ) + def 
__str__(self): + return `(self.crew, self.flag, self.standings, self.msgs)` - self._find_standings() +class CrewInfo(SomethingSoupInfo): + # Public data members: + # ci.crew = [ ('Captain', ['Pirate', ...]), + # ('Senior Officer', [...]), + # ... ] + # ci.msgs = [ 'message describing problem with scrape' ] - self.crew = self._find_crewflag('crew', - '^/yoweb/crew/info\\.wm') - self.flag = self._find_crewflag('flag', - '^/yoweb/flag/info\\.wm') + def __init__(self, crewid): + SomethingSoupInfo.__init__(self, + 'crew/info.wm?crewid=', crewid) + self._find_crew() + + def _find_crew(self): + self.crew = [] + capts = self.soup.findAll('img', + src='/yoweb/images/crew-captain.png') + if len(capts) != 1: + self.msg('crew members: no. of captain images != 1') + return + tbl = capts[0] + while not tbl.find('a', href=pirate_ref_re): + tbl = tbl.findParent('table') + if not tbl: + self.msg('crew members: cannot find table') + return + current_rank_crew = None + crew_rank_re = regexp.compile('/yoweb/images/crew') + for row in tbl.contents: + # findAll(recurse=False) + if isinstance(row, unicode): + continue + + is_rank = row.find('img', attrs={'src': crew_rank_re}) + if is_rank: + rank = soup_text(row) + current_rank_crew = [] + self.crew.append((rank, current_rank_crew)) + continue + for cell in row.findAll('a', href=pirate_ref_re): + if current_rank_crew is None: + self.soupm(cell, 'crew members: crew' + ' before rank') + continue + current_rank_crew.append(soup_text(cell)) def __str__(self): - return `(self.crew, self.flag, self.standings, self.msgs)` + return `(self.crew, self.msgs)` + +def do_pirate(pirates, bu): + print '{' + for pirate in pirates: + info = PirateInfo(pirate) + print '%s: %s,' % (`pirate`, info) + print '}' + +def prep_crew_of(args, bu): + if len(args) != 1: bu('crew-of takes one pirate name') + pi = PirateInfo(args[0]) + return CrewInfo(pi.crew[0]) + +def do_crew_of(args, bu): + ci = prep_crew_of(args, bu) + print ci + +#def do_dutytab_crew_of(pirates, 
badusage): +# if len(pirates) != 1: badusage('dutytab-crew-of takes one pirate name') def main(): global opts, fetcher @@ -244,15 +327,22 @@ actions: help='suppress warning output') (opts,args) = pa.parse_args() + if len(args) < 1: + pa.error('need a mode argument') + + mode = args[0] + mode_fn_name = 'do_' + mode.replace('_','#').replace('-','_') + try: mode_fn = globals()[mode_fn_name] + except KeyError: pa.error('unknown mode "%s"' % mode) + # fixed parameters opts.max_age = 240 + if opts.cache_dir.startswith('~/'): opts.cache_dir = os.getenv('HOME') + opts.cache_dir[1:] fetcher = Fetcher(opts.ocean, opts.cache_dir) - # test program: - test = PirateInfo('Anaplian') - print test + mode_fn(args[1:], pa.error) main() -- 2.30.2