yoweb-scrape: apply rate limiter per http:// site
[ypp-sc-tools.main.git] / yoweb-scrape
index cee5c1781aafbb9505f25647a8d10ed1f6fc10de..717b539131d8ffb0e4ba6abba5f6cddc3b38b199 100755 (executable)
--- a/yoweb-scrape
+++ b/yoweb-scrape
@@ -131,12 +131,24 @@ class Fetcher:
                        if oe.errno != errno.EEXIST: raise
                self._cache_scan(time.time())
 
-       def _cache_scan(self, now):
+       def _match_url_normalise(self, url):
+               without_scheme = regexp.sub('^[a-z]+://', '', url)
+               without_tail = regexp.sub('/.*', '', without_scheme)
+               return without_tail
+
+       def _cache_scan(self, now, match_url=None):
                # returns list of ages, unsorted
+               if match_url is not None:
+                       match_url = self._match_url_normalise(match_url)
                ages = []
                debug('Fetcher   scan_cache')
                for leaf in os.listdir(self.cachedir):
                        if not leaf.startswith('#'): continue
+                       if match_url is not None:
+                               leaf_url = urllib.unquote_plus(leaf.strip('#'))
+                               leaf_url = self._match_url_normalise(leaf_url)
+                               if leaf_url != match_url:
+                                       continue
                        path = self.cachedir + '/' + leaf
                        try: s = os.stat(path)
                        except (OSError,IOError), oe:
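
The new _match_url_normalise() reduces a URL to the bare site name, so that cache entries for different pages on the same server are grouped together for rate-limiting purposes. A minimal standalone sketch of that normalisation (the example URLs are hypothetical, and it calls the stdlib re module directly rather than the regexp name the script uses):

    import re

    def match_url_normalise(url):
        # Drop the scheme (e.g. "http://") and everything from the first
        # "/" onwards, leaving only the site name.
        without_scheme = re.sub(r'^[a-z]+://', '', url)
        without_tail = re.sub(r'/.*', '', without_scheme)
        return without_tail

    # Both of these normalise to "alpha.example.com", so their cached
    # fetches count against the same per-site rate limit:
    print(match_url_normalise('http://alpha.example.com/yoweb/pirate.wm?target=x'))
    print(match_url_normalise('alpha.example.com/econ/stats'))
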
@@ -152,8 +164,8 @@ class Fetcher:
                        ages.append(age)
                return ages
 
-       def need_wait(self, now, imaginary=[]):
-               ages = self._cache_scan(now)
+       def need_wait(self, now, imaginary=[], next_url=None):
+               ages = self._cache_scan(now, match_url=next_url)
                ages += imaginary
                ages.sort()
                debug('Fetcher   ages ' + `ages`)
@@ -169,8 +181,8 @@ class Fetcher:
                        need_wait += random.random() - 0.5
                return need_wait
 
-       def _rate_limit_cache_clean(self, now):
-               need_wait = self.need_wait(now)
+       def _rate_limit_cache_clean(self, now, next_url=None):
+               need_wait = self.need_wait(now, next_url=next_url)
                if need_wait > 0:
                        debug('Fetcher   wait %f' % need_wait)
                        sleep(need_wait)
@@ -198,7 +210,7 @@ class Fetcher:
                        return data
 
                debug('Fetcher  fetch')
-               self._rate_limit_cache_clean(now)
+               self._rate_limit_cache_clean(now, next_url=url)
 
                stream = urllib2.urlopen(url)
                data = stream.read()
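
With these changes fetch() passes the URL it is about to retrieve down through _rate_limit_cache_clean() and need_wait() to _cache_scan(), which then only counts cache entries for that same site, so traffic to one server no longer delays fetches from another. A rough self-contained sketch of the filtering step, assuming cache filenames have the form '#' + quote_plus(url) + '#' (as the unquote_plus call above suggests) and written against modern urllib.parse rather than the script's Python 2 urllib:

    import re
    from urllib.parse import quote_plus, unquote_plus

    def site_of(url):
        # Same normalisation as _match_url_normalise: drop scheme and path.
        return re.sub(r'/.*', '', re.sub(r'^[a-z]+://', '', url))

    def cache_leaves_for_site(leaves, next_url):
        # Keep only the cache entries whose decoded URL normalises to the
        # same site as the URL about to be fetched.
        want = site_of(next_url)
        for leaf in leaves:
            if not leaf.startswith('#'):
                continue
            if site_of(unquote_plus(leaf.strip('#'))) == want:
                yield leaf

    # Hypothetical cache directory contents:
    leaves = ['#' + quote_plus(u) + '#' for u in
              ['http://alpha.example.com/page/1',
               'http://alpha.example.com/page/2',
               'http://beta.example.com/page/1']]
    print(list(cache_leaves_for_site(leaves, 'http://alpha.example.com/page/3')))
    # -> only the two alpha.example.com entries; the beta.example.com entry
    #    no longer contributes to the wait before fetching from alpha.
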