chiark / gitweb /
yoweb-scrape: wip new flag and ocean functionality - can fetch island ownerships too
[ypp-sc-tools.web-live.git] / yoweb-scrape
index 68df2163812afe1b409937bd3fff9c24147073f7..d2557b594a86b8811c3323c375bd4f5cd14c1b57 100755 (executable)
@@ -110,19 +110,14 @@ def yppsc_dir():
 #---------- caching and rate-limiting data fetcher ----------
 
 class Fetcher:
-       def __init__(self, ocean, cachedir):
+       def __init__(self, cachedir):
                debug('Fetcher init %s' % cachedir)
-               self.ocean = ocean
                self.cachedir = cachedir
                try: os.mkdir(cachedir)
                except (OSError,IOError), oe:
                        if oe.errno != errno.EEXIST: raise
                self._cache_scan(time.time())
 
-       def default_ocean(self, ocean='ice'):
-               if self.ocean is None:
-                       self.ocean = ocean
-
        def _cache_scan(self, now):
                # returns list of ages, unsorted
                ages = []
@@ -204,12 +199,38 @@ class Fetcher:
                debug('Fetcher  stored')
                return data
 
+class Yoweb(Fetcher):
+       def __init__(self, ocean, cachedir):
+               debug('Yoweb init %s' % cachedir)
+               self.ocean = ocean
+               Fetcher.__init__(self, cachedir)
+
+       def default_ocean(self, ocean='ice'):
+               if self.ocean is None:
+                       self.ocean = ocean
+
        def yoweb(self, kind, tail, max_age):
                self.default_ocean()
+               assert(self.ocean)
                url = 'http://%s.puzzlepirates.com/yoweb/%s%s' % (
                        self.ocean, kind, tail)
                return self.fetch(url, max_age)
 
+class Yppedia(Fetcher):
+       # Callable page source: base URL + rhs, or opts.localhtml dir if set.
+       def __init__(self, cachedir):
+               debug('Yppedia init %s' % cachedir)
+               self.base = 'http://yppedia.puzzlepirates.com/'
+               self.localhtml = opts.localhtml
+               Fetcher.__init__(self, cachedir)
+
+       def __call__(self, rhs):
+               if self.localhtml is not None:
+                       return file(self.localhtml + '/' + rhs, 'r')
+               url = self.base + rhs
+               debug('Yppedia retrieving YPP '+url)
+               return self.fetch(url, 3000)
+
 #---------- logging assistance for troubled screenscrapers ----------
 
 class SoupLog:
@@ -429,18 +450,13 @@ class FlagInfo(SomethingSoupInfo):
                                (`head`, ``waritem``))
 
                def wihelp_item(waritem, thing):
-                       if waritem.name == 'a':
-                               url = waritem.get('href', None)
-                               if url is None:
-                                       return ('no url for '+thing,None,None)
-                       else:
-                               hr = waritem.find('a',{'href':True})
-                               if not hr: return ('no a for '+thing,None,None)
-                               url = hr['href']
+                       url = waritem.get('href', None)
+                       if url is None:
+                               return ('no url for '+thing,None,None)
                        m = regexp.search('\?'+thing+'id=(\d+)$', url)
                        if not m: return ('no '+thing+'id',None,None)
                        tid = m.group(1)
-                       tname = m.string
+                       tname = waritem.string
                        if tname is None:
                                return (thing+' name not just string',None,None)
                        return (None,tid,tname)
@@ -452,7 +468,7 @@ class FlagInfo(SomethingSoupInfo):
                        if rel: return 'flag id twice!'
                        if flagname in self.relation_byname:
                                return 'flag name twice!'
-                       rel = (flagname,flagid,[], thisdecl,othermin,othermax)
+                       rel = (flagname,flagid,head, thisdecl,othermin,othermax)
                        self.relations.append(rel)
                        self.relation_byid[flagid] = rel
                        self.relation_byname[flagid] = rel
@@ -472,20 +488,24 @@ class FlagInfo(SomethingSoupInfo):
 
                how = (wi_warn, None)
 
-               for waritem in warinfo.contents:
-                       debug('WARITEM '+``waritem``)
-                       if isinstance(waritem, unicode):
-                               waritem = waritem.strip()
-                               if waritem: warn('unknown waritem '+``waritem``)
-                               continue
-                       if waritem.name == 'br':
-                               continue
-                       if waritem.name == 'b':
-                               head = ''.join(waritem.findAll(text=True))
+               for waritem in warinfo.findAll(['font','a']):
+                       if waritem is None: break
+                       if waritem.name == 'font':
+                               colour = waritem.get('color',None)
+                               if colour.lstrip('#') != '958A5F':
+                                       warn('strange colour %s in %s' %
+                                               (colour,``waritem``))
+                                       continue
+                               head = waritem.string
+                               if head is None:
+                                       warn('no head string in '+``waritem``)
+                                       continue
                                head = regexp.sub('\\s+', ' ', head).strip()
                                head = head.rstrip(':')
                                how = (head,) + warmap.get(head, (wi_warn,))
                                continue
+                       assert(waritem.name == 'a')                             
+
                        debug('WARHOW %s(%s, waritem, *%s)' %
                                (how[1], `how[0]`, `how[2:]`))
                        bad = how[1](how[0], waritem, *how[2:])
@@ -498,47 +518,84 @@ class FlagInfo(SomethingSoupInfo):
 
 #---------- scraper for ocean info incl. embargoes etc. ----------
 
-class IslandInfo():
+class IslandBasicInfo():
+       # Public members:
+       #  ocean
+       #  name
+       # Public members maybe set by caller:
+       #  arch
        def __init__(self, ocean, islename):
                self.ocean = ocean
                self.name = islename
        def collect(self):
                pass
-       def yppedia_dataf(self):
+       def yppedia(self):
                def q(x): return urllib.quote(x.replace(' ','_'))
                url_rhs = q(self.name) + '_(' + q(self.ocean) + ')'
-               if opts.localhtml is None:
-                       url = 'http://yppedia.puzzlepirates.com/' + url_rhs
-                       debug('IslandInfo retrieving YPP '+url);
-                       return urllib.urlopen(url)
-               else:
-                       return file(opts.localhtml + '/' + url_rhs, 'r')
-       def yoweb_url(self):
-               soup = BeautifulSoup(self.yppedia_dataf())
+               return yppedia(url_rhs)
+       def __str__(self):
+               return `(self.ocean, self.name)`
+
+class IslandExtendedInfo(IslandBasicInfo):
+       # Public members (inherited):
+       #  ocean
+       #  name
+       # Public members (additional):
+       #  islandid
+       #  yoweb_url
+       #  flagid
+       def collect(self):
+               IslandBasicInfo.collect(self)
+               self._collect_yoweb()
+               self._collect_flagid()
+
+       def _collect_yoweb(self):
+               debug('IEI COLLECT YOWEB '+`self.name`)
+               self.islandid = None
+               self.yoweb_url = None
+
+               soup = BeautifulSoup(self.yppedia())
                content = soup.find('div', attrs = {'id': 'content'})
                yoweb_re = regexp.compile('^http://\w+\.puzzlepirates\.com/'+
-                       'yoweb/island/info\.wm\?islandid=\d+$')
+                       'yoweb/island/info\.wm\?islandid=(\d+)$')
                a = soup.find('a', attrs = { 'href': yoweb_re })
-               if a is None: return None
-               return a['href']
-       def ruling_flag_id(self):
-               yo = self.yoweb_url()
+               if a is None:
+                       debug('IEI COLLECT YOWEB '+`self.name`+' NONE')
+                       return
+
+               debug('IEI COLLECT YOWEB '+`self.name`+' GOT '+``a``)
+               self.yoweb_url = a['href']
+               m = yoweb_re.search(self.yoweb_url)
+               self.islandid = m.group(1)
+
+       def _collect_flagid(self):
+               self.flagid = None
+
+               yo = self.yoweb_url
+               debug('IEI COLLECT FLAGID '+`self.name`+' URL '+`yo`)
                if yo is None: return None
-               dataf = fetcher.fetch(yo, 600)
+               dataf = fetcher.fetch(yo, 1800)
                soup = BeautifulSoup(dataf)
-               ruler_re = regexp.compile('http://\w+\.puzzlepirates\.com/'+
-                       'yoweb/flag/info\.wm\?flagid=(\d+)$')
+               ruler_re = regexp.compile(
+                       '/yoweb/flag/info\.wm\?flagid=(\d+)$')
                ruler = soup.find('a', attrs = { 'href': ruler_re })
-               if not ruler: return None
-               m = ruler_re.find(ruler['href'])
-               return m.group(1)
+               if not ruler: 
+                       debug('IEI COLLECT FLAGID '+`self.name`+' NONE')
+                       return
+               debug('IEI COLLECT FLAGID '+`self.name`+' GOT '+``ruler``)
+               m = ruler_re.search(ruler['href'])
+               self.flagid = m.group(1)
+
+       def __str__(self):
+               return `(self.ocean, self.islandid, self.name,
+                       self.yoweb_url, self.flagid)`
 
 class OceanInfo():
        # Public data attributes (valid after collect()):
        #   oi.islands[islename] = IslandInfo(...)
        #   oi.arches[archname][islename] = IslandInfo(...)
-       def __init__(self):
-               self.isleclass = IslandInfo
+       def __init__(self, isleclass=IslandBasicInfo):
+               self.isleclass = isleclass
                self.ocean = fetcher.ocean.lower().capitalize()
        def collect(self):
                cmdl = ['./yppedia-ocean-scraper']
@@ -571,6 +628,7 @@ class OceanInfo():
                                islename = m.group(1)
                                isle = self.isleclass(self.ocean, islename)
                                isle.arch = archname
+                               isle.collect()
                                self.islands[islename] = isle
                                self.arches[archname][islename] = isle
                                continue
@@ -583,6 +641,8 @@ class OceanInfo():
                        assert(False)
                oscraper.wait()
                assert(oscraper.returncode == 0)
+       def __str__(self):
+               return `(self.islands, self.arches)`
 
 #---------- pretty-printer for tables of pirate puzzle standings ----------
 
@@ -1249,7 +1309,7 @@ def do_flag_of(args, bu):
        pi = PirateInfo(args[0], max_age)
        if pi.flag is None: fi = None
        else: fi = FlagInfo(pi.flag[0], max_age)
-       print `fi`
+       print fi
 
 def do_standings_crew_of(args, bu):
        ci = prep_crew_of(args, bu, 60)
@@ -1266,12 +1326,12 @@ def do_standings_crew_of(args, bu):
 def do_ocean(args, bu):
        if (len(args)): bu('ocean takes no further arguments')
        fetcher.default_ocean()
-       oi = OceanInfo()
+       oi = OceanInfo(IslandExtendedInfo)
        oi.collect()
+       print oi
        for islename in sorted(oi.islands.keys()):
                isle = oi.islands[islename]
-               yoweb_url = isle.yoweb_url()
-               print " %s -- %s" % (islename, yoweb_url)
+               print isle
 
 #----- modes which use the chat log parser are quite complex -----
 
@@ -1594,7 +1654,7 @@ class KeystrokeReader(DummyKeystrokeReader):
 #---------- main program ----------
 
 def main():
-       global opts, fetcher
+       global opts, fetcher, yppedia
 
        pa = OptionParser(
 '''usage: .../yoweb-scrape [OPTION...] ACTION [ARGS...]
@@ -1676,7 +1736,8 @@ display modes (for --display) apply to ship-aid:
                else:
                        opts.display = 'overwrite'
 
-       fetcher = Fetcher(opts.ocean, opts.cache_dir)
+       fetcher = Yoweb(opts.ocean, opts.cache_dir)
+       yppedia = Yppedia(opts.cache_dir)
 
        mode_fn(args[1:], pa.error)