From: Ian Jackson Date: Thu, 14 May 2009 19:20:24 +0000 (+0100) Subject: Can fetch X-Git-Tag: 1.0~78 X-Git-Url: http://www.chiark.greenend.org.uk/ucgi/~yarrgweb/git?p=ypp-sc-tools.web-live.git;a=commitdiff_plain;h=ed951644b1bdacabaa82cd5e854e567752101cfd Can fetch --- ed951644b1bdacabaa82cd5e854e567752101cfd diff --git a/yoweb-scrape b/yoweb-scrape new file mode 100755 index 0000000..0a54940 --- /dev/null +++ b/yoweb-scrape @@ -0,0 +1,76 @@ +#!/usr/bin/python + +import os +import time +import urllib +import urllib2 +import errno + +from BeautifulSoup import BeautifulSoup + +max_age = 120 +ocean = 'ice' + +now = time.time() + +def fetch(url): + cache_corename = urllib.quote_plus(url) + cache_basename = "#%s#" % cache_corename + try: f = file(cache_basename, 'r') + except (OSError,IOError), oe: + if oe.errno != errno.ENOENT: raise + f = None + if f is not None: + s = os.fstat(f.fileno()) + if now > s.st_mtime + max_age: + f = None + if f is not None: + data = f.read() + f.close() + else: + stream = urllib2.urlopen(url) + data = stream.read() + cache_ourname = "#%s~%d#" % (cache_corename, os.getpid()) + f = file(cache_ourname, 'w') + f.write(data) + f.close() + os.rename(cache_ourname, cache_basename) + return data + +def yoweb_fetch(kind, tail): + url = 'http://%s.puzzlepirates.com/yoweb/%s%s' % (ocean, kind, tail) + return fetch(url) + +def get_pirate_info(pirate): + html = yoweb_fetch('pirate.wm?target=', pirate) + soup = BeautifulSoup(html) + return `soup` + +def main(): + os.chdir(os.getenv('HOME')) + cache_dir = '.yoweb-scrape-cache' + try: + os.chdir(cache_dir) + except (OSError,IOError), oe: + if oe.errno != errno.ENOENT: raise + os.mkdir(cache_dir) + os.chdir(cache_dir) + + for path in os.listdir('.'): + if not path.startswith('#'): continue + max_time = max_age + if '~' in path: max_time = 10 + try: + s = os.stat(path) + if now > s.st_mtime + max_time: + os.remove(path) + except (OSError,IOError), oe: + if oe.errno != errno.ENOENT: raise + + # test program: + global ocean + ocean = 'midnight' + test = get_pirate_info('Aristarchus') + print test + +main()