chiark / gitweb /
WIP
[ypp-sc-tools.db-test.git] / yoweb-scrape
1 #!/usr/bin/python
2
3 import os
4 import time
5 import urllib
6 import urllib2
7 import errno
8 import re as regexp
9
10 from BeautifulSoup import BeautifulSoup
11
# Seconds for which a cached page is served by fetch() before it is
# downloaded again (compared against the cache file's mtime).
max_age = 120

# Subdomain of puzzlepirates.com that yoweb_fetch() builds its URLs from.
ocean = 'ice'

# Timestamp taken once at start-up; every cache-age comparison uses this
# same value rather than re-reading the clock.
now = time.time()

# <img alt=...> texts that PirateInfo recognises as duty names.
duties = [
    'Swordfighting',
    'Bilging',
    'Sailing',
    'Rigging',
    'Navigation',
    'Battle Navigation',
    'Carpentry',
    'Rumble',
    'Treasure Haul',
]

# Standing names.  NOTE(review): presumably ordered lowest to highest --
# confirm before relying on the index as a rank.
standings = [
    'Able',
    'Distinguished',
    'Respected',
    'Master',
    'Renowned',
    'Grand-Master',
    'Legendary',
    'Ultimate',
]
22
def fetch(url):
    """Return the body of *url*, using a file cache in the current directory.

    The cache entry for a URL is a file named ``#<quote_plus(url)>#``.  It is
    used when its mtime is no more than ``max_age`` seconds older than the
    module-level ``now`` timestamp; otherwise the URL is downloaded and the
    cache entry is replaced.

    Args:
        url: the URL to retrieve.

    Returns:
        The page contents, exactly as read from the cache file or from the
        network response.

    Raises:
        OSError, IOError: on cache-file errors other than the entry simply
            not existing (ENOENT).
        Whatever ``urllib2.urlopen`` raises if the download fails.
    """
    cache_corename = urllib.quote_plus(url)
    cache_basename = "#%s#" % cache_corename

    data = None
    try:
        cached = open(cache_basename, 'r')
    except (OSError, IOError) as oe:
        # A missing cache entry just means we have to download.
        if oe.errno != errno.ENOENT:
            raise
    else:
        # Close the handle whether the entry turns out fresh or stale;
        # previously a stale entry's handle was dropped without closing.
        try:
            s = os.fstat(cached.fileno())
            if now <= s.st_mtime + max_age:
                data = cached.read()
        finally:
            cached.close()

    if data is not None:
        return data

    stream = urllib2.urlopen(url)
    try:
        data = stream.read()
    finally:
        stream.close()

    # Write under a per-process temporary name and rename into place, so a
    # partially written file never appears under the final cache name.
    cache_ourname = "#%s~%d#" % (cache_corename, os.getpid())
    out = open(cache_ourname, 'w')
    try:
        out.write(data)
    finally:
        out.close()
    os.rename(cache_ourname, cache_basename)
    return data
46
def yoweb_fetch(kind, tail):
    """Fetch a yoweb page for the currently selected ocean.

    The URL is built from the module-level ``ocean`` (used as the host's
    subdomain) followed by *kind* and *tail* concatenated verbatim; neither
    is quoted here.  Returns whatever ``fetch`` returns for that URL.
    """
    parts = (ocean, kind, tail)
    return fetch('http://%s.puzzlepirates.com/yoweb/%s%s' % parts)
50
class PirateInfo:
    """Duty standings scraped from one pirate's yoweb page.

    Attributes:
        skills: dict with one key per entry in ``duties``.  A duty whose
            table cell was found and parsed maps to the standing name
            (a string); a duty that was not found keeps its initial value,
            an empty list.
    """

    # Accepted cell texts, anchored at the start by match() and at the end
    # by '$':
    #   "<something>/<Standing>"                          -> group 1
    #   "<something>/<something> (ocean-wide&nbsp;<Standing>)" -> group 2
    # The entity is matched literally because the page is parsed without
    # entity conversion.
    _standing_re = regexp.compile(
u'\s*\S*/([-A-Za-z]+)\s*$|\s*\S*/\S*\s*\(ocean\-wide\&nbsp\;([-A-Za-z]+)\)\s*$'
        )

    def __init__(self, pirate):
        """Fetch and parse the page for *pirate*.

        Args:
            pirate: pirate name, appended as-is to the
                ``pirate.wm?target=`` query.

        Prints the repr of every image alt text seen, and of each
        recognised duty together with its cell text, to stdout.
        """
        html = yoweb_fetch('pirate.wm?target=', pirate)
        # Entities are deliberately left unconverted; the standing regex
        # expects a literal "&nbsp;".
        soup = BeautifulSoup(html)

        skills = {}
        for skill in duties:
            skills[skill] = []

        for img in soup.findAll('img'):
            try:
                duty = img['alt']
            except KeyError:
                continue
            print(repr(duty))
            if duty not in duties:
                continue
            # The standing text sits in the <td> following the one that
            # holds the duty's icon.
            key = img.findParent('td')
            if key is None:
                continue
            valelem = key.findNextSibling('td')
            if valelem is None:
                continue
            valstr = ''.join(valelem.findAll(text=True))
            print('%s %s' % (repr(duty), repr(valstr)))
            match = self._standing_re.match(valstr)
            if match is None:
                continue
            # Exactly one of the two groups is set, depending on which
            # alternative matched.  Previously only group 1 was read, so
            # an "ocean-wide" cell stored None instead of the standing.
            standing = match.group(1)
            if standing is None:
                standing = match.group(2)
            skills[duty] = standing

        # TODO: an earlier draft intended to record each standing as its
        # index in `standings`; that conversion is not implemented.
        self.skills = skills

    def __str__(self):
        """Return the repr of the ``skills`` dict."""
        return repr(self.skills)
94
def main():
    """Enter the cache directory, expire old cache files, run the test scrape.

    Steps:
      1. chdir to ``~/.yoweb-scrape-cache``, creating it if it is missing.
      2. Remove files there whose names start with '#' and whose mtime is
         older than ``max_age`` seconds (10 seconds for names containing
         '~', i.e. fetch()'s temporary files), measured against ``now``.
      3. Test program: set the global ``ocean`` to 'midnight', scrape the
         pirate 'Anaplian' and print the result.

    Raises:
        OSError, IOError: for filesystem errors other than ENOENT in the
            places where a missing file or directory is expected.
    """
    global ocean

    # expanduser() also copes with HOME being unset, where
    # os.getenv('HOME') would hand None to chdir().
    os.chdir(os.path.expanduser('~'))
    cache_dir = '.yoweb-scrape-cache'
    try:
        os.chdir(cache_dir)
    except (OSError, IOError) as oe:
        if oe.errno != errno.ENOENT:
            raise
        os.mkdir(cache_dir)
        os.chdir(cache_dir)

    for path in os.listdir('.'):
        if not path.startswith('#'):
            continue
        max_time = max_age
        if '~' in path:
            max_time = 10
        try:
            s = os.stat(path)
            if now > s.st_mtime + max_time:
                os.remove(path)
        except (OSError, IOError) as oe:
            # The file may vanish between listdir() and stat()/remove();
            # that is fine, anything else is a real error.
            if oe.errno != errno.ENOENT:
                raise

    # test program:
    ocean = 'midnight'
    test = PirateInfo('Anaplian')
    print(test)
121
# Run only when executed as a script, so that importing this file does not
# trigger a network scrape and directory changes.
if __name__ == '__main__':
    main()