From ecf2c0123bbeda951c5e7d9a832d20eaf99291ce Mon Sep 17 00:00:00 2001
From: Ian Jackson
Date: Sun, 30 Oct 2011 13:46:23 +0000
Subject: [PATCH] yoweb-scrape: apply rate limiter per http:// site

---
 yoweb-scrape | 24 ++++++++++++++++++------
 1 file changed, 18 insertions(+), 6 deletions(-)

diff --git a/yoweb-scrape b/yoweb-scrape
index cee5c17..717b539 100755
--- a/yoweb-scrape
+++ b/yoweb-scrape
@@ -131,12 +131,24 @@ class Fetcher:
 			if oe.errno != errno.EEXIST: raise
 		self._cache_scan(time.time())
 
-	def _cache_scan(self, now):
+	def _match_url_normalise(self, url):
+		without_scheme = regexp.sub('^[a-z]+://', '', url)
+		without_tail = regexp.sub('/.*', '', without_scheme)
+		return without_tail
+
+	def _cache_scan(self, now, match_url=None):
 		# returns list of ages, unsorted
+		if match_url is not None:
+			match_url = self._match_url_normalise(match_url)
 		ages = []
 		debug('Fetcher scan_cache')
 		for leaf in os.listdir(self.cachedir):
 			if not leaf.startswith('#'): continue
+			if match_url is not None:
+				leaf_url = urllib.unquote_plus(leaf.strip('#'))
+				leaf_url = self._match_url_normalise(leaf_url)
+				if leaf_url != match_url:
+					continue
 			path = self.cachedir + '/' + leaf
 			try: s = os.stat(path)
 			except (OSError,IOError), oe:
@@ -152,8 +164,8 @@ class Fetcher:
 			ages.append(age)
 		return ages
 
-	def need_wait(self, now, imaginary=[]):
-		ages = self._cache_scan(now)
+	def need_wait(self, now, imaginary=[], next_url=None):
+		ages = self._cache_scan(now, match_url=next_url)
 		ages += imaginary
 		ages.sort()
 		debug('Fetcher ages ' + `ages`)
@@ -169,8 +181,8 @@ class Fetcher:
 			need_wait += random.random() - 0.5
 		return need_wait
 
-	def _rate_limit_cache_clean(self, now):
-		need_wait = self.need_wait(now)
+	def _rate_limit_cache_clean(self, now, next_url=None):
+		need_wait = self.need_wait(now, next_url=next_url)
 		if need_wait > 0:
 			debug('Fetcher wait %f' % need_wait)
 			sleep(need_wait)
@@ -198,7 +210,7 @@ class Fetcher:
 			return data
 
 		debug('Fetcher fetch')
-		self._rate_limit_cache_clean(now)
+		self._rate_limit_cache_clean(now, next_url=url)
 
 		stream = urllib2.urlopen(url)
 		data = stream.read()
-- 
2.30.2
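
The heart of the change is _match_url_normalise, which reduces a URL to its
host part so that need_wait only counts cached fetches of the site about to
be hit, rather than all sites sharing the cache directory. Below is a minimal
standalone sketch of that normalisation, assuming the patch's `regexp` name
refers to Python's re module imported under that alias elsewhere in
yoweb-scrape; the example hostnames are made up for illustration:

    import re

    def match_url_normalise(url):
        # Drop the scheme ("http://", "https://", ...) and then
        # everything from the first "/" onwards, leaving host[:port].
        without_scheme = re.sub(r'^[a-z]+://', '', url)
        without_tail = re.sub(r'/.*', '', without_scheme)
        return without_tail

    # URLs on the same site normalise to the same key, so they share
    # one rate-limit bucket; a different site gets its own bucket.
    assert match_url_normalise('http://example.test/yoweb/pirate.wm') == 'example.test'
    assert match_url_normalise('http://example.test/') == 'example.test'
    assert match_url_normalise('http://other.test/x') == 'other.test'

In the patch itself the same key is recovered from each cache entry by
undoing the filename encoding (urllib.unquote_plus on the leaf name, minus
its '#' delimiters) before normalising, so on-disk entries and the URL about
to be fetched are compared in the same form.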