chiark / gitweb /
Can fetch
author Ian Jackson <ian@liberator.relativity.greenend.org.uk>
Thu, 14 May 2009 19:20:24 +0000 (20:20 +0100)
committer Ian Jackson <ian@liberator.relativity.greenend.org.uk>
Thu, 14 May 2009 19:20:24 +0000 (20:20 +0100)
yoweb-scrape [new file with mode: 0755]

diff --git a/yoweb-scrape b/yoweb-scrape
new file mode 100755 (executable)
index 0000000..0a54940
--- /dev/null
@@ -0,0 +1,76 @@
+#!/usr/bin/python
+
+import os
+import time
+import urllib
+import urllib2
+import errno
+
+from BeautifulSoup import BeautifulSoup
+
+# Maximum age, in seconds, of a cache file before it is treated as stale.
+max_age = 120
+# Substituted as the first hostname label of every yoweb URL that is built.
+ocean = 'ice'
+
+# Timestamp sampled once at startup; all cache-age checks compare against it.
+now = time.time()
+
+def fetch(url):
+       cache_corename = urllib.quote_plus(url)
+       cache_basename = "#%s#" % cache_corename
+       try: f = file(cache_basename, 'r')
+       except (OSError,IOError), oe:
+               if oe.errno != errno.ENOENT: raise
+               f = None
+       if f is not None:
+               s = os.fstat(f.fileno())
+               if now > s.st_mtime + max_age:
+                       f = None
+       if f is not None:
+               data = f.read()
+               f.close()
+       else:
+               stream = urllib2.urlopen(url)
+               data = stream.read()
+               cache_ourname = "#%s~%d#" % (cache_corename, os.getpid())
+               f = file(cache_ourname, 'w')
+               f.write(data)
+               f.close()
+               os.rename(cache_ourname, cache_basename)
+       return data
+
+def yoweb_fetch(kind, tail):
+       url = 'http://%s.puzzlepirates.com/yoweb/%s%s' % (ocean, kind, tail)
+       return fetch(url)
+
+def get_pirate_info(pirate):
+       html = yoweb_fetch('pirate.wm?target=', pirate)
+       soup = BeautifulSoup(html)
+       return `soup`
+
+def main():
+       os.chdir(os.getenv('HOME'))
+       cache_dir = '.yoweb-scrape-cache'
+       try:
+               os.chdir(cache_dir)
+       except (OSError,IOError), oe:
+               if oe.errno != errno.ENOENT: raise
+               os.mkdir(cache_dir)
+               os.chdir(cache_dir)
+
+       for path in os.listdir('.'):
+               if not path.startswith('#'): continue
+               max_time = max_age
+               if '~' in path: max_time = 10
+               try:
+                       s = os.stat(path)
+                       if now > s.st_mtime + max_time:
+                               os.remove(path)
+               except (OSError,IOError), oe:
+                       if oe.errno != errno.ENOENT: raise
+
+       # test program:
+       global ocean
+       ocean = 'midnight'
+       test = get_pirate_info('Aristarchus')
+       print test
+
+main()