chiark / gitweb /
WIP before option parser
[ypp-sc-tools.db-live.git] / yoweb-scrape
1 #!/usr/bin/python
2
3 import os
4 import time
5 import urllib
6 import urllib2
7 import errno
8 import sys
9 import re as regexp
10 import optparse
11
12 from BeautifulSoup import BeautifulSoup
13
# Age in seconds beyond which fetch() treats a cached page as stale.
max_age = 120
# Ocean name used as the host-name prefix by yoweb_fetch(); main() overrides it.
ocean = 'ice'

# Reference time for all cache-age comparisons, taken once at start-up.
now = time.time()

# Every duty name (the 'alt' text of a stat image) the scraper recognises.
duties = [
        'Swordfighting', 'Bilging', 'Sailing', 'Rigging', 'Navigating',
        'Battle Navigation', 'Gunning', 'Carpentry', 'Rumble', 'Treasure Haul',
        'Drinking', 'Spades', 'Hearts', 'Treasure Drop', 'Poker', 'Distilling',
        'Alchemistry', 'Shipwrightery', 'Blacksmithing', 'Foraging',
]

# Standing names; PirateInfo stores a standing as its index in this list.
standings = [
        'Able', 'Distinguished', 'Respected', 'Master', 'Renowned',
        'Grand-Master', 'Legendary', 'Ultimate',
]
26
def fetch(url):
        """Return the body of `url`, going through a file cache in the cwd.

        The cache entry for a URL is the file "#<quote_plus(url)>#" in the
        current directory.  It is used if its mtime is no more than
        `max_age` seconds before `now`; otherwise the URL is downloaded and
        the cache entry is replaced.

        Raises OSError/IOError for any cache-file failure other than the
        entry not existing, and whatever urllib2.urlopen() raises on a
        failed download.
        """
        cache_corename = urllib.quote_plus(url)
        cache_basename = "#%s#" % cache_corename

        data = None
        try:
                f = open(cache_basename, 'r')
        except (OSError, IOError) as oe:
                # A missing cache entry just means we have to download.
                if oe.errno != errno.ENOENT: raise
        else:
                # Close the handle whether or not the entry is fresh enough
                # to be used.
                try:
                        s = os.fstat(f.fileno())
                        if now <= s.st_mtime + max_age:
                                data = f.read()
                finally:
                        f.close()
        if data is not None:
                return data

        # Pause before every real request -- presumably to rate-limit our
        # hits on the server.
        time.sleep(1)
        stream = urllib2.urlopen(url)
        try:
                data = stream.read()
        finally:
                stream.close()

        # Write under a per-process temporary name and rename into place, so
        # the cache entry is only ever replaced by a completely written file.
        cache_ourname = "#%s~%d#" % (cache_corename, os.getpid())
        f = open(cache_ourname, 'w')
        try:
                f.write(data)
        finally:
                f.close()
        os.rename(cache_ourname, cache_basename)
        return data
51
def yoweb_fetch(kind, tail):
        """Fetch the yoweb page `kind` + `tail` for the current `ocean`.

        The URL is built from the module-level `ocean` and handed to
        fetch(), whose result is returned unchanged.
        """
        page = '%s%s' % (kind, tail)
        return fetch('http://%s.puzzlepirates.com/yoweb/%s' % (ocean, page))
55
class PirateInfo:
        """Skill standings of one pirate, scraped from the pirate's yoweb page.

        Public data members:
          pi.skills = { 'Treasure Haul': 0, ... }
                maps a duty name to the index of its standing in the
                module-level `standings` list; a duty whose standing could
                not be determined is absent
          pi.msgs = [ 'message describing problem with scrape' ]
                emptied again if every duty in `duties` got a standing
        """

        # Matches the text of a standing cell.  The standing is the word
        # after the '/', or, when the cell has an "(ocean-wide ...)" suffix,
        # the word inside that suffix.  HTML entities are not converted when
        # parsing, so the pattern matches a literal "&nbsp;".
        _standing_re = regexp.compile(
                r'\s*\S*/([-A-Za-z]+)\s*$'
                r'|\s*\S*/\S*\s*\(ocean\-wide\&nbsp\;([-A-Za-z]+)\)\s*$'
                )

        def _log(self, m):
                # Record one scrape problem.
                self.msgs.append(m)

        def _logsoup(self, soup, m):
                # Record a scrape problem together with the offending markup.
                self._log(m + '; in ' + repr(soup))

        def __init__(self, pirate):
                """Fetch and parse the yoweb page of the pirate named `pirate`.

                Scrape problems are collected in self.msgs rather than
                raised; errors from yoweb_fetch() propagate.
                """
                self.msgs = [ ]
                self.skills = { }

                html = yoweb_fetch('pirate.wm?target=', pirate)
                soup = BeautifulSoup(html)

                self._resolve_skills(self._scrape_standings(soup))

                # Messages are only of interest if some duty is unresolved.
                if all(duty in self.skills for duty in duties):
                        self.msgs = [ ]

        def _scrape_standings(self, soup):
                # Return { duty: [standing strings found for it in soup] }.
                found = dict((duty, [ ]) for duty in duties)

                imgs = soup.findAll('img',
                        src=regexp.compile('/yoweb/images/stat.*'))
                for img in imgs:
                        # The duty name is the image's alt text; images
                        # without one are skipped.
                        try: duty = img['alt']
                        except KeyError: continue

                        if duty not in duties:
                                self._logsoup(img, 'unknown duty: "%s"' % duty)
                                continue
                        key = img.findParent('td')
                        if key is None:
                                self._logsoup(img, 'duty at root! "%s"' % duty)
                                continue
                        # The standing text is in the cell after the image's.
                        valelem = key.findNextSibling('td')
                        if valelem is None:
                                self._logsoup(key, 'duty missing sibling "%s"'
                                        % duty)
                                continue
                        valstr = ''.join(valelem.findAll(text=True))
                        match = self._standing_re.match(valstr)
                        if match is None:
                                self._logsoup(key,
                                        'duty "%s" unparseable standing "%s"'
                                        % (duty, valstr))
                                continue
                        # lastindex is the group of whichever alternative
                        # of the pattern matched.
                        found[duty].append(match.group(match.lastindex))
                return found

        def _resolve_skills(self, found):
                # Fill self.skills from the scraped standing strings in found.
                for duty in duties:
                        sl = found[duty]
                        if len(sl) > 1:
                                self._log('duty "%s" multiple standings %s' %
                                                (duty, repr(sl)))
                                continue
                        if not sl:
                                self._log('duty "%s" no standing found' % duty)
                                continue
                        standing = sl[0]
                        try:
                                self.skills[duty] = standings.index(standing)
                        except ValueError:
                                self._log('duty "%s" unknown standing "%s"' %
                                        (duty, standing))

        def __str__(self):
                return repr(self.skills)
134
def main():
        """Enter the cache directory, expire old cache files, run the test scrape.

        The cache lives in '.yoweb-scrape-cache' under the user's home
        directory and is created if missing.  The process's working
        directory is left there, which is where fetch() keeps its files.
        """
        global ocean

        # expanduser() uses $HOME when it is set, and still yields a
        # directory when it is not (os.getenv('HOME') would give None).
        os.chdir(os.path.expanduser('~'))
        cache_dir = '.yoweb-scrape-cache'
        try:
                os.chdir(cache_dir)
        except (OSError, IOError) as oe:
                if oe.errno != errno.ENOENT: raise
                os.mkdir(cache_dir)
                os.chdir(cache_dir)

        # Expire cache files, all of which have names starting with '#'.
        for path in os.listdir('.'):
                if not path.startswith('#'): continue
                # Names containing '~' are fetch()'s temporary files; they
                # get a much shorter lifetime than finished cache entries.
                max_time = max_age
                if '~' in path: max_time = 10
                try:
                        s = os.stat(path)
                        if now > s.st_mtime + max_time:
                                os.remove(path)
                except (OSError, IOError) as oe:
                        # The file vanishing underneath us is harmless.
                        if oe.errno != errno.ENOENT: raise

        # test program:
        ocean = 'midnight'
        test = PirateInfo('Anaplian')
        print(test)
160         print test
161
162 main()