#!/usr/bin/python
import codecs
import sys
from sys import stdout

# We always do I/O in utf-8 regardless of locale. This is
# a bit broken but not as broken as Python's stupid ideas
# about ignoring locale if stdout is a terminal.

# Command-line handling and stream setup.
#
#   script [infile [outfile]]
#
# A missing infile means stdin and a missing outfile means stdout. Every
# stream is wrapped in a UTF-8 codec, independent of the locale.
argc = len(sys.argv)
if argc < 1 or argc > 3:
        # Usage errors go to stderr so they can never end up in piped output.
        sys.stderr.write("Usage: %s [infile [outfile]]\n" % sys.argv[0])
        sys.exit(1)

# The input is opened and read completely *before* the output is opened:
# opening the output truncates it, which would destroy the data when both
# arguments name the same file.
if argc < 2:
        streamReader = codecs.getreader('utf-8')
        infile = streamReader(sys.stdin)
else:
        # Plain "r": the input is only ever read, so it must not require
        # write permission (the former "r+" failed on read-only files).
        infile = codecs.open(sys.argv[1], "r", "utf-8")

# The whole document is held in memory; the scanner indexes into it.
text = infile.read()

if argc < 3:
        streamWriter = codecs.getwriter('utf-8')
        outfile = streamWriter(sys.stdout)
else:
        outfile = codecs.open(sys.argv[2], "w+", "utf-8")

# Character classes returned by lex().
KANJI      = 1    # ideographs and ideographic iteration/closing/zero marks
HIRAGANA   = 2    # U+3040..U+309F
KATAKANA   = 3    # U+30A0..U+30FF
RUBY_OPEN  = 4    # U+300A, starts the ruby (reading) text
RUBY_CLOSE = 5    # U+300B, ends the ruby text
RUBY_START = 6    # U+FF5C, explicit start of the ruby base text
OTHER      = 99   # anything else

# Inclusive code point ranges classified as KANJI. Escapes are used instead
# of literal characters because the file carries no source-encoding line.
_KANJI_RANGES = (
        (u'\u3005', u'\u3007'),   # iteration mark, closing mark, number zero
        (u'\u3400', u'\u4dbf'),   # CJK Unified Ideographs Extension A
        (u'\u4e00', u'\u9fff'),   # CJK Unified Ideographs (entire block)
        (u'\uf900', u'\ufaff'),   # CJK Compatibility Ideographs
)

def lex(i):
        """Return the character class constant for the single character i.

        The result is one of KANJI, HIRAGANA, KATAKANA, RUBY_OPEN,
        RUBY_CLOSE, RUBY_START or OTHER.  Kanji detection covers the whole
        CJK Unified Ideographs block plus Extension A, the compatibility
        ideographs and the marks U+3005..U+3007, so that such characters do
        not split a run of ruby base text.
        """
        for low, high in _KANJI_RANGES:
                if low <= i <= high:
                        return KANJI
        if u'\u3040' <= i <= u'\u309f':
                return HIRAGANA
        if u'\u30a0' <= i <= u'\u30ff':
                return KATAKANA
        if i == u'\u300a':
                return RUBY_OPEN
        if i == u'\u300b':
                return RUBY_CLOSE
        if i == u'\uff5c':
                return RUBY_START
        return OTHER

# Scanner state shared by the main loop and writesection().
start = 0            # index of the first character not yet written out
offset = 0           # index of the character currently being examined
current = OTHER      # class of the previously examined character
ruby_start = False   # True from an explicit ruby start marker until ruby close

def writesection():
        """Write the pending text text[start:offset] and mark it as written.

        The character at offset itself is not included.  Nothing is written
        when no text is pending; start is moved up to offset either way.
        """
        global start
        begin, end = start, offset
        if begin < end:
                outfile.write(text[begin:end])
        start = end

# Main scan.  Walk the text one character at a time, copying it through in
# sections and replacing the ruby markers with HTML ruby tags.  Each marker
# branch flushes the pending section and then advances start by one so the
# marker character itself is never written.
#
# Indentation is spaces only: the former tab/space mix is rejected by
# Python 3 and by "python -tt".
try:
        while offset < len(text):
                charclass = lex(text[offset])

                if charclass == RUBY_START:
                        # explicit start of the ruby base text
                        writesection()
                        start = start + 1
                        ruby_start = True
                        outfile.write("<ruby><rb>")

                elif charclass == RUBY_OPEN:
                        if not ruby_start:
                                # ruby applies to last section of contiguous
                                # same-type chars, which is still pending, so
                                # the opening tags go out before it
                                outfile.write("<ruby><rb>")
                        writesection()
                        start = start + 1
                        outfile.write("</rb><rt>")

                elif charclass == RUBY_CLOSE:
                        writesection()
                        start = start + 1
                        outfile.write("</rt></ruby>")
                        ruby_start = False

                elif charclass != current:
                        # start of a different kind of character string
                        writesection()

                current = charclass
                offset += 1

        # flush whatever follows the last section boundary
        writesection()
finally:
        # close the streams even if scanning or writing failed part-way
        infile.close()
        outfile.close()
