#!/usr/bin/python
# Tested with Python 2.6 on Ubuntu. Run in the Essential_Japanese_Verbs/
# directory and send stdout to a file. Prints verb numbers to stderr as a
# progress indication.
#
# Output: one '|'-separated line per example block found in each verb page.

import codecs
import glob
import re
import sys

from BeautifulSoup import BeautifulSoup

# Force UTF-8 on stdout instead of relying on the locale-derived encoding
# (the original author's workaround for locale-dependent output encoding).
streamWriter = codecs.getwriter('utf-8')
sys.stdout = streamWriter(sys.stdout)

# Matches every text node except an empty one or a bare "<digit>." label.
_NOT_NUMBERING = re.compile(r'^(?!([0-9]\.)?$)')

# (image file name, label) pairs looked up in the kanji page.
_VERB_TYPES = (('bt-vi.gif', 'Intransitive'), ('bt-vt.gif', 'Transitive'))
_VERB_GROUPS = (('bt-grp1.gif', 'Group1'), ('bt-grp2.gif', 'Group2'))

# <div> classes that hold one example each.
_EXAMPLE_DIV_ATTRS = {
    'class': ['keypadd', 'keypadd2', 'sdlgpadd', 'sdlgpadd2'],
}


def gettext(tag, c, sep=' '):
    """Return the text of all <td> cells of class `c` below `tag`.

    Text nodes that are empty or consist only of a single digit followed by
    a full stop (the numbering) are dropped. The remaining nodes are joined
    with `sep` and the result is stripped of surrounding whitespace.
    """
    pieces = []
    for td in tag.findAll('td', c):
        pieces.extend(td.findAll(text=_NOT_NUMBERING))
    return sep.join(pieces).strip()


def _last_matching_label(soup, choices):
    """Return the label of the last (image, label) pair referenced in soup.

    A pair matches when some element has a `src` attribute containing the
    image file name. Returns '' when no pair matches.
    """
    label = ''
    for image, candidate in choices:
        # re.escape: the file name is literal text, '.' must not be a wildcard.
        if soup.findAll(src=re.compile(re.escape(image))):
            label = candidate
    return label


def main(args=None):
    """Convert every kanji/kana verb page pair into '|'-separated lines.

    `args` is accepted for call compatibility but is not used.
    """
    # glob returns paths in arbitrary order; sort both lists so that the
    # kanji and kana pages of the same verb are paired and the output order
    # is reproducible.
    kanji_paths = sorted(glob.glob('data/250vb/*/c-vb*-body.html'))
    kana_paths = sorted(glob.glob('data/250vb/*/k-vb*-body.html'))

    for kanji, kana in zip(kanji_paths, kana_paths):
        # The three characters just before the 10-character '-body.html'
        # suffix, without leading zeros.
        verbno = kanji[-13:-10].lstrip('0')
        sys.stderr.write(verbno + '\n')

        # Both documents are fully parsed on construction, so the files can
        # be closed before the soups are queried.
        with open(kanji) as kanjifile:
            with open(kana) as kanafile:
                kanjisoup = BeautifulSoup(kanjifile)
                kanasoup = BeautifulSoup(kanafile)

        transitivity = _last_matching_label(kanjisoup, _VERB_TYPES)
        group = _last_matching_label(kanjisoup, _VERB_GROUPS)
        meaning = gettext(kanjisoup, 'vbimi', '; ')

        example_pairs = zip(kanjisoup.findAll('div', _EXAMPLE_DIV_ATTRS),
                            kanasoup.findAll('div', _EXAMPLE_DIV_ATTRS))
        for ks, kks in example_pairs:
            link = ks.find('a', href=re.compile('^JAVASCRIPT:playAudio'))
            # 'JAVASCRIPT:playAudio(' is 21 characters; [22:-2] drops it plus
            # one more leading and two trailing characters -- presumably the
            # quotes and closing parenthesis around the file name.
            audiofile = link['href'][22:-2]
            fields = [
                audiofile,
                verbno,
                kanjisoup.title.string,
                gettext(ks, 'excap'),
                gettext(ks, 'exjpk'),   # from the kanji page
                gettext(kks, 'exjpk'),  # same cell class, from the kana page
                meaning,
                gettext(ks, 'exeng'),
                group,
                transitivity,
            ]
            sys.stdout.write('|'.join(fields) + '\n')


if __name__ == '__main__':
    main()