adding cgiFiles to repo
[familyTree.git] / familyTree / checkWiki.py
diff --git a/familyTree/checkWiki.py b/familyTree/checkWiki.py
new file mode 100755 (executable)
index 0000000..4cba4ed
--- /dev/null
@@ -0,0 +1,239 @@
+#!/usr/bin/python
+import askQuestion as aQ
+import englishUtils as eU
+from string import Template
+import re
+import urllib2
+import json
+import codecs
+import pickle
+
+def rem_space(text):
+       # trim leading/trailing spaces (field names and values arrive padded around '=')
+       return text.strip(' ')
+
+def print_utf(text):
+       print codecs.encode(text,'utf-8')
+
+def split_info(info):
+       #each field has the form:  | field = value
+       #the value may itself contain '|' (and '=') inside [[...]] links or {{...}} templates,
+       #so only split on '|' at nesting depth zero
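+       # illustrative (hypothetical) input and result, using the end-of-string sentinel below:
+       #   '| name = [[John Smith|John]] | born = 1900'
+       #   -> ['', ' name = [[John Smith|John]] ', ' born = 1900']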
+
+       countSq = 0
+       countCur = 0
+       breaks = []
+       for i in range(len(info)):
+               if info[i]=='|' and countSq==0 and countCur==0:
+                       breaks.append(i)
+               if info[i]=='[':
+                       countSq+=1
+               if info[i]==']':
+                       countSq-=1
+               if info[i]=='{':
+                       countCur+=1
+               if info[i]=='}':
+                       countCur-=1
+
+       breaks.append(len(info))        # end-of-string sentinel so the final field is kept whole
+
+       out =[]
+       s = 0
+       for i in range(len(breaks)):
+               out.append(info[s:breaks[i]])
+               s = breaks[i]+1
+
+
+       return out
+
+def check():
+       file = open(filename2)
+       dicts = pickle.load(file)
+       file.close()
+
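+       # s: the person's own row; u: spouses (marriages may record the pair in either
+       #    column order, hence the UNION); v: the person's parents; c: their children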
+       s = 'SELECT url,name,born,died FROM people WHERE id=?'
+       u = 'SELECT url,name from marriages INNER JOIN people'\
+               +' ON people.id = marriages.idb'\
+               +' WHERE marriages.ida=?'\
+               +' UNION'\
+               +' SELECT url,name from marriages INNER JOIN people'\
+               +' ON people.id = marriages.ida'\
+               +' WHERE marriages.idb = ?'
+       v = 'SELECT url,name from parents INNER JOIN people'\
+               +' ON people.id = parentID'\
+               +' WHERE parents.id = ?'
+       c = 'SELECT url,name from parents INNER JOIN people'\
+               +' ON people.id = parents.ID'\
+               +' WHERE parents.parentID = ?'
+
+#      for id in range(1,len(dicts)):
+       for id in range(1,3):   # temporarily limited to the first two ids; the commented line above covers everyone
+               t = (id,)
+
+               for r in aQ.run_query(s,t):
+                       url = r[0]
+                       name = r[1]
+                       born = r[2]
+                       died = r[3]
+               spurls=[]
+               sp=[]
+               for r in aQ.run_query(u,(id,id)):
+                       spurls.append(r[0])
+                       sp.append(r[1])
+               if url=='.':
+                       continue
+
+               purls=[]
+               ps=[]
+               for r in aQ.run_query(v,t):
+                       purls.append(r[0])
+                       ps.append(r[1])
+
+               curls = []
+               cs=[]
+               for r in aQ.run_query(c,t):
+                       curls.append(r[0])
+                       cs.append(r[1])                 
+
+               struct = dicts[id]
+
+               print 'born'
+               print born
+               if struct.has_key('birth_date'):
+                       print_utf(struct['birth_date'])
+#              print 'died'
+#              print died
+#              if struct.has_key('death_date'):
+#                      print_utf(struct['death_date'])
+#              print 'parents'
+#              print ps
+#              print purls
+#              if struct.has_key('father'):
+#                      print_utf(struct['father'])
+#              if struct.has_key('mother'):
+#                      print_utf(struct['mother'])
+#              print 'spouses'
+#              print sp
+#              print spurls
+#              if struct.has_key('spouse'):
+#                      print_utf(struct['spouse'])
+#              elif struct.has_key('spouses'):
+#                      print_utf(struct['spouses'])                    
+#              print 'children'
+#              print cs
+#              if struct.has_key('issue'):
+#                      print_utf(struct['issue'])
+
+def understand():
+
+       file = open(filename)
+       data = pickle.load(file)
+       file.close()
+
+       dicts = ['None']        # index 0 is a placeholder; person ids start at 1
+       for id in range(1,len(data)):
+               t = (id,)
+               info = data[id]                 
+               info = split_info(info)
+               struct={}
+               for bit in info:
+                       line = bit.split('=',1)
+                       if len(line)<2:
+                               continue
+                       field = rem_space(line[0])
+                       value = rem_space(line[1])
+                       value = re.sub('\n',' ',value)
+                       struct[field]=value
+               dicts.append(struct)
+                               
+
+       file = open(filename2,'w')
+       pickle.dump(dicts,file)
+       file.close()
+
+
+def get():
+       s = 'SELECT url,name,born,died FROM people WHERE id=?'
+
+
+
+       data = ['blank']        # index 0 is a placeholder; person ids start at 1
+       for id in range(1,aQ.number_people()):
+               t = (id,)
+               for r in aQ.run_query(s,t):
+                       url = r[0]
+               if len(url)==0 or url[0]=='.':  # skip people with no usable Wikipedia url
+                       continue
+               
+               title = url.split('/')[-1]
+               print_utf(title)
+               url = 'http://en.wikipedia.org/w/api.php?'\
+               +'format=json&action=query'\
+               +'&titles='+title\
+               +'&prop=revisions&rvprop=content&redirects'
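+               # e.g. (hypothetical title) http://en.wikipedia.org/w/api.php?format=json&action=query&titles=Ada_Lovelace&prop=revisions&rvprop=content&redirects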
+
+               r = urllib2.urlopen(url)
+               struct = json.loads(r.read())
+               pages = struct['query']['pages'].keys()
+
+               startPatt = re.compile('{{',re.DOTALL)
+               endPatt = re.compile('}}',re.DOTALL)
+               infoboxPatt = re.compile('{{Infobox',re.DOTALL)
+               for p in pages:
+                       title = struct['query']['pages'][p]['title']
+
+                       try:
+                               page = struct['query']['pages'][p]['revisions'][0]['*']
+                       except (KeyError, IndexError):
+                               # no revision content returned for this page
+                               data.append('None')
+                               continue
+                       iBox = re.search(infoboxPatt,page)
+                       starts = re.finditer(startPatt,page)
+                       ends = re.finditer(endPatt,page)
+
+                       if iBox is None:
+                               data.append('None')
+                               continue
+       
+                       myStart = iBox.start()
+
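+                       # walk the '{{' and '}}' markers in order, keeping a nesting count,
+                       # until the '}}' that closes this infobox is reached (its position is myEnd)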
+                       countMe = 0
+                       start = -1
+                       while start<myStart:
+                               start = starts.next().start()
+                       end = -1
+                       while end<myStart:
+                               end = ends.next().start()
+                       while True:
+                               if start<end:
+                                       # another '{{' opens before the next '}}': one level deeper
+                                       countMe+=1
+                                       start = starts.next().start()
+                               else:
+                                       # a '}}' closes first: one level shallower
+                                       countMe-=1
+                                       myEnd = end
+                                       if countMe==0:
+                                               break
+                                       end = ends.next().start()
+
+               
+                       info = page[myStart+2:myEnd]
+                       data.append(info)
+
+       file = open(filename,'w')
+       pickle.dump(data,file)
+       file.close()
+
+
+aQ.connect()
+
+filename = 'wikiData'          # pickled list of raw infobox text, one entry per person id
+filename2 = 'wikiDicts'        # pickled list of parsed field->value dicts, one entry per person id
+#get()
+understand()
+check()
+
+aQ.close()
+
+