chiark
/
gitweb
/
~yarrgweb
/
ypp-sc-tools.db-live.git
/ blobdiff
commit
grep
author
committer
pickaxe
?
search:
re
summary
|
shortlog
|
log
|
commit
|
commitdiff
|
tree
raw
|
inline
| side by side
source-info: add Kraken's ink
[ypp-sc-tools.db-live.git]
/
yoweb-scrape
diff --git a/yoweb-scrape b/yoweb-scrape
index 2bcd518c44d036ad8d990305ab7e03488ba195e5..717b539131d8ffb0e4ba6abba5f6cddc3b38b199 100755
(executable)
--- a/yoweb-scrape
+++ b/yoweb-scrape
@@ -131,12 +131,24 @@ class Fetcher:
if oe.errno != errno.EEXIST: raise
self._cache_scan(time.time())
if oe.errno != errno.EEXIST: raise
self._cache_scan(time.time())
- def _cache_scan(self, now):
+ def _match_url_normalise(self, url):
+ without_scheme = regexp.sub('^[a-z]+://', '', url)
+ without_tail = regexp.sub('/.*', '', without_scheme)
+ return without_tail
+
+ def _cache_scan(self, now, match_url=None):
# returns list of ages, unsorted
# returns list of ages, unsorted
+ if match_url is not None:
+ match_url = self._match_url_normalise(match_url)
ages = []
debug('Fetcher scan_cache')
for leaf in os.listdir(self.cachedir):
if not leaf.startswith('#'): continue
ages = []
debug('Fetcher scan_cache')
for leaf in os.listdir(self.cachedir):
if not leaf.startswith('#'): continue
+ if match_url is not None:
+ leaf_url = urllib.unquote_plus(leaf.strip('#'))
+ leaf_url = self._match_url_normalise(leaf_url)
+ if leaf_url != match_url:
+ continue
path = self.cachedir + '/' + leaf
try: s = os.stat(path)
except (OSError,IOError), oe:
path = self.cachedir + '/' + leaf
try: s = os.stat(path)
except (OSError,IOError), oe:
@@ -152,8 +164,8 @@ class Fetcher:
ages.append(age)
return ages
ages.append(age)
return ages
- def need_wait(self, now, imaginary=[]):
- ages = self._cache_scan(now)
+	def need_wait(self, now, imaginary=[], next_url=None):
+		ages = self._cache_scan(now, match_url=next_url)
ages += imaginary
ages.sort()
debug('Fetcher ages ' + `ages`)
ages += imaginary
ages.sort()
debug('Fetcher ages ' + `ages`)
@@ -165,15 +177,14 @@ class Fetcher:
(min_age, age))
need_wait = max(need_wait, min_age - age)
min_age += 3
(min_age, age))
need_wait = max(need_wait, min_age - age)
min_age += 3
- min_age *= 1.25
if need_wait > 0:
need_wait += random.random() - 0.5
return need_wait
if need_wait > 0:
need_wait += random.random() - 0.5
return need_wait
- def _rate_limit_cache_clean(self, now):
- need_wait = self.need_wait(now)
+	def _rate_limit_cache_clean(self, now, next_url=None):
+		need_wait = self.need_wait(now, next_url=next_url)
if need_wait > 0:
if need_wait > 0:
-			debug('Fetcher wait %d' % need_wait)
+			debug('Fetcher wait %f' % need_wait)
sleep(need_wait)
def fetch(self, url, max_age):
sleep(need_wait)
def fetch(self, url, max_age):
@@ -199,7 +210,7 @@ class Fetcher:
return data
debug('Fetcher fetch')
return data
debug('Fetcher fetch')
- self._rate_limit_cache_clean(now)
+		self._rate_limit_cache_clean(now, next_url=url)
stream = urllib2.urlopen(url)
data = stream.read()
stream = urllib2.urlopen(url)
data = stream.read()