chiark / gitweb /
Can fetch
[ypp-sc-tools.main.git] / yoweb-scrape
1 #!/usr/bin/python
2
3 import os
4 import time
5 import urllib
6 import urllib2
7 import errno
8
9 from BeautifulSoup import BeautifulSoup
10
# Seconds a cached page may be reused before it is downloaded again.
max_age = 120
# Subdomain of puzzlepirates.com that yoweb requests are sent to.
ocean = "ice"

# Timestamp taken once at start-up; every cache-age check compares to it.
now = time.time()
15
16 def fetch(url):
17         cache_corename = urllib.quote_plus(url)
18         cache_basename = "#%s#" % cache_corename
19         try: f = file(cache_basename, 'r')
20         except (OSError,IOError), oe:
21                 if oe.errno != errno.ENOENT: raise
22                 f = None
23         if f is not None:
24                 s = os.fstat(f.fileno())
25                 if now > s.st_mtime + max_age:
26                         f = None
27         if f is not None:
28                 data = f.read()
29                 f.close()
30         else:
31                 stream = urllib2.urlopen(url)
32                 data = stream.read()
33                 cache_ourname = "#%s~%d#" % (cache_corename, os.getpid())
34                 f = file(cache_ourname, 'w')
35                 f.write(data)
36                 f.close()
37                 os.rename(cache_ourname, cache_basename)
38         return data
39
def yoweb_fetch(kind, tail):
        """Fetch a yoweb page from the currently selected ocean.

        kind and tail are concatenated after ".../yoweb/" to form the
        request path; the host is "<ocean>.puzzlepirates.com", where ocean
        is the module-level setting.  Returns whatever fetch() returns for
        the resulting URL.
        """
        return fetch('http://%s.puzzlepirates.com/yoweb/%s%s'
                     % (ocean, kind, tail))
43
def get_pirate_info(pirate):
        """Fetch the yoweb pirate page for pirate and return it parsed.

        The page is requested as 'pirate.wm?target=<pirate>', parsed with
        BeautifulSoup, and the repr() of the resulting soup is returned.
        """
        page = yoweb_fetch('pirate.wm?target=', pirate)
        return repr(BeautifulSoup(page))
48
49 def main():
50         os.chdir(os.getenv('HOME'))
51         cache_dir = '.yoweb-scrape-cache'
52         try:
53                 os.chdir(cache_dir)
54         except (OSError,IOError), oe:
55                 if oe.errno != errno.ENOENT: raise
56                 os.mkdir(cache_dir)
57                 os.chdir(cache_dir)
58
59         for path in os.listdir('.'):
60                 if not path.startswith('#'): continue
61                 max_time = max_age
62                 if '~' in path: max_time = 10
63                 try:
64                         s = os.stat(path)
65                         if now > s.st_mtime + max_time:
66                                 os.remove(path)
67                 except (OSError,IOError), oe:
68                         if oe.errno != errno.ENOENT: raise
69
70         # test program:
71         global ocean
72         ocean = 'midnight'
73         test = get_pirate_info('Aristarchus')
74         print test
75
# Run only when executed as a script, so that importing this file does not
# touch the filesystem or the network.
if __name__ == '__main__':
        main()