yoweb-scrape: apply rate limiter per http:// site

[ypp-sc-tools.web-live.git] / yoweb-scrape
diff --git a/yoweb-scrape b/yoweb-scrape

index b4a5d6375f00662866a1a48ad5f71e719ac1e666..717b539131d8ffb0e4ba6abba5f6cddc3b38b199 100755 (executable)
--- a/yoweb-scrape
+++ b/yoweb-scrape
@@ -131,12 +131,24 @@ class Fetcher:
                         if oe.errno != errno.EEXIST: raise
                 self._cache_scan(time.time())
  
-       def _cache_scan(self, now):
+       def _match_url_normalise(self, url):  # reduce a URL to the text before its first '/' (scheme removed)
+               without_scheme = regexp.sub('^[a-z]+://', '', url)  # drop a leading lowercase 'scheme://' prefix, if any; NOTE(review): 'regexp' is presumably the re module under an alias -- confirm at file top
+               without_tail = regexp.sub('/.*', '', without_scheme)  # discard everything from the first '/' onwards
+               return without_tail  # e.g. 'http://host/a/b' -> 'host'; any ':port' or 'user@' part is kept as-is
+
+       def _cache_scan(self, now, match_url=None):
                 # returns list of ages, unsorted
+               if match_url is not None:
+                       match_url = self._match_url_normalise(match_url)
                 ages = []
                 debug('Fetcher   scan_cache')
                 for leaf in os.listdir(self.cachedir):
                         if not leaf.startswith('#'): continue
+                       if match_url is not None:
+                               leaf_url = urllib.unquote_plus(leaf.strip('#'))
+                               leaf_url = self._match_url_normalise(leaf_url)
+                               if leaf_url != match_url:
+                                       continue
                         path = self.cachedir + '/' + leaf
                         try: s = os.stat(path)
                         except (OSError,IOError), oe:
@@ -152,8 +164,8 @@ class Fetcher:
                         ages.append(age)
                 return ages
  
-       def need_wait(self, now, imaginary=[]):
-               ages = self._cache_scan(now)
+       def need_wait(self, now, imaginary=[], next_url=None):
+               ages = self._cache_scan(now, match_url=next_url)
                 ages += imaginary
                 ages.sort()
                 debug('Fetcher   ages ' + `ages`)
@@ -165,13 +177,12 @@ class Fetcher:
                                         (min_age, age))
                                 need_wait = max(need_wait, min_age - age)
                         min_age += 3
-                       min_age *= 1.25
                 if need_wait > 0:
                         need_wait += random.random() - 0.5
                 return need_wait
  
-       def _rate_limit_cache_clean(self, now):
-               need_wait = self.need_wait(now)
+       def _rate_limit_cache_clean(self, now, next_url=None):
+               need_wait = self.need_wait(now, next_url=next_url)
                 if need_wait > 0:
                         debug('Fetcher   wait %f' % need_wait)
                         sleep(need_wait)
@@ -199,7 +210,7 @@ class Fetcher:
                         return data
  
                 debug('Fetcher  fetch')
-               self._rate_limit_cache_clean(now)
+               self._rate_limit_cache_clean(now, next_url=url)
  
                 stream = urllib2.urlopen(url)
                 data = stream.read()