#!/usr/bin/python
"""Filter that processes ruby-annotation markers in UTF-8 text.

Usage: script [infile [outfile]]

Input is read from ``infile`` (or stdin) and the result is written to
``outfile`` (or stdout).  Three marker characters are recognised:

* U+FF5C  explicit start of the base text a ruby annotation applies to
* U+300A  end of the base text / start of the ruby text
* U+300B  end of the ruby text

When no explicit start marker precedes U+300A, the annotation is applied to
the immediately preceding run of characters of the same class (kanji,
hiragana, katakana or other).  The marker characters themselves are never
copied to the output.
"""

import codecs
import io
import sys
from sys import stdout

# We always do I/O in utf-8 regardless of locale. This is
# a bit broken but not as broken as Python's stupid ideas
# about ignoring locale if stdout is a terminal.

# Character classes returned by lex().
KANJI = 1
HIRAGANA = 2
KATAKANA = 3
RUBY_OPEN = 4
RUBY_CLOSE = 5
RUBY_START = 6
OTHER = 99

# Strings emitted around a ruby annotation.
# NOTE(review): every one of these is empty in this file, so the net effect
# of the script is only to drop the marker characters.  Presumably they once
# held markup that was lost -- confirm the intended values before relying on
# the output format.
RUBY_BASE_BEGIN = u""   # emitted before the annotated base text
RUBY_TEXT_BEGIN = u""   # emitted between the base text and the ruby text
RUBY_TEXT_END = u""     # emitted after the ruby text

_MARKERS = (RUBY_START, RUBY_OPEN, RUBY_CLOSE)


def lex(i):
    """Return the character class constant for the single character *i*."""
    if u'\u4e00' <= i <= u'\u9fbf' or i == u'\u3005':
        return KANJI
    if u'\u3040' <= i <= u'\u309f':
        return HIRAGANA
    if u'\u30a0' <= i <= u'\u30ff':
        return KATAKANA
    if i == u'\u300a':
        return RUBY_OPEN
    if i == u'\u300b':
        return RUBY_CLOSE
    if i == u'\uff5c':
        return RUBY_START
    return OTHER


def convert(text):
    """Return *text* with its ruby markers processed.

    The text is scanned once.  ``start`` is the index of the first character
    that has not been emitted yet; text is flushed whenever the character
    class changes or a marker is met, so at a marker the pending slice is
    exactly the last run of same-class characters.
    """
    out = []
    start = 0
    current = OTHER
    # True between an explicit RUBY_START marker and the next RUBY_CLOSE.
    ruby_start = False

    for offset, char in enumerate(text):
        charclass = lex(char)
        if charclass in _MARKERS:
            pending = text[start:offset]
            start = offset + 1  # skip the marker character itself
            if charclass == RUBY_START:
                out.append(pending)
                ruby_start = True
                out.append(RUBY_BASE_BEGIN)
            elif charclass == RUBY_OPEN:
                if not ruby_start:
                    # No explicit start marker: the ruby applies to the last
                    # section of contiguous same-type chars, i.e. `pending`.
                    out.append(RUBY_BASE_BEGIN)
                out.append(pending)
                out.append(RUBY_TEXT_BEGIN)
            else:  # RUBY_CLOSE
                out.append(pending)
                out.append(RUBY_TEXT_END)
                ruby_start = False
        elif charclass != current:
            # Start of a different kind of character string.
            out.append(text[start:offset])
            start = offset
            current = charclass

    out.append(text[start:])
    return u"".join(out)


def _binary_stream(stream):
    """Return the byte-level stream behind *stream* (Python 3) or *stream*."""
    return getattr(stream, "buffer", stream)


def main(argv=None):
    """Run the filter; return the process exit status.

    *argv* defaults to ``sys.argv``: ``argv[1]`` is the optional input path
    and ``argv[2]`` the optional output path.
    """
    if argv is None:
        argv = sys.argv
    argc = len(argv)
    if argc > 3:
        sys.stderr.write("Usage: %s [infile [outfile]]\n" % argv[0])
        return 1

    # Read the whole input BEFORE opening the output: opening the output
    # truncates it, which would destroy the data when both paths name the
    # same file or when the input turns out to be unreadable.
    if argc < 2:
        text = codecs.getreader('utf-8')(_binary_stream(sys.stdin)).read()
    else:
        # newline="" keeps line endings untranslated.
        with io.open(argv[1], "r", encoding="utf-8", newline="") as infile:
            text = infile.read()

    result = convert(text)

    if argc < 3:
        writer = codecs.getwriter('utf-8')(_binary_stream(sys.stdout))
        writer.write(result)
        writer.flush()
    else:
        with io.open(argv[2], "w", encoding="utf-8", newline="") as outfile:
            outfile.write(result)
    return 0


if __name__ == "__main__":
    sys.exit(main())