#!/usr/bin/env python3

import sys
import string
import os
import codecs
try:
    import dbm # Python 3
except ImportError:
    import anydbm as dbm # Python 2
import zipfile
import io
try:
    from urllib.request import urlopen # Python 3
except ImportError:
    from urllib import urlopen # Python 2

def hexstr(x):
    """Return the hexadecimal digits of integer x with no decoration.

    The "0x"/"0X" prefix produced by hex() is removed, and so is a
    trailing "L"/"l" (the suffix Python 2 puts on long integers).
    Letter case is left exactly as hex() produced it.
    """
    digits = hex(x)
    if digits[-1:] in ("L", "l"):
        digits = digits[:-1]
    if digits[:2] in ("0x", "0X"):
        digits = digits[2:]
    return digits

def charname(x):
    """Return a display name for code point x.

    Hangul syllables are named arithmetically. Anything else is looked
    up in the module-level unicode database `db`, keyed by the
    upper-case hex code point padded to at least four digits; the name
    is the second ';'-separated field of the stored record. When
    `han_translations` is set, an entry in `handb` takes priority and
    is returned prefixed with "<han> ".

    Returns "" when no database is loaded and "<no name available>"
    when the database has no entry for x.
    """
    hangul_base = 0xAC00
    if hangul_base <= x < hangul_base + 11172:
        # The Hangul block holds 19*21*28 syllables starting at 0xAC00.
        # A syllable's offset from the base is L*21*28 + V*28 + T, where
        # L, V and T index the initial consonant, the vowel and the
        # final consonant, so two divisions recover all three.
        #
        # The Latin transliterations below are taken from
        # https://www.unicode.org/Public/13.0.0/ucd/Jamo.txt
        initials = ("G", "GG", "N", "D", "DD", "R", "M", "B", "BB", "S",
                    "SS", "", "J", "JJ", "C", "K", "T", "P", "H")
        vowels = ("A", "AE", "YA", "YAE", "EO", "E", "YEO", "YE", "O",
                  "WA", "WAE", "OE", "YO", "U", "WEO", "WE", "WI", "YU",
                  "EU", "YI", "I")
        finals = ("", "G", "GG", "GS", "N", "NJ", "NH", "D", "L", "LG",
                  "LM", "LB", "LS", "LT", "LP", "LH", "M", "B", "BS", "S",
                  "SS", "NG", "J", "C", "K", "T", "P", "H")
        lead_idx, remainder = divmod(x - hangul_base, 21 * 28)
        vowel_idx, tail_idx = divmod(remainder, 28)
        syllable = initials[lead_idx] + vowels[vowel_idx] + finals[tail_idx]
        return "HANGUL SYLLABLE " + syllable

    if db is None:
        return ""

    key = hexstr(x).rjust(4, "0").upper()

    if han_translations:
        try:
            return "<han> " + handb[key].decode('UTF-8')
        except KeyError:
            # No Han definition stored; fall back to the plain name.
            pass

    try:
        record = db[key].decode('UTF-8')
    except KeyError:
        return "<no name available>"
    return record.split(";")[1]

def output(char, bytevals, errors):
    """Emit one decoded item to standard output.

    char     -- the code point, or -1 when the bytes did not decode.
    bytevals -- list of the byte values that make up the item.
    errors   -- diagnostic text; "" means nothing was wrong.

    With the module-level `output_analysis` flag set, one text line is
    written: the code point, the bytes in hex padded to six columns,
    the character name if there is one, and the diagnostics. Otherwise
    raw bytes are written: the item's own bytes if it was clean, or
    the UTF-8 encoding of U+FFFD REPLACEMENT CHARACTER if it was not.
    """
    if not output_analysis:
        clean = char != -1 and errors == ""
        sys.stdout.buffer.write(bytes(bytevals) if clean else b"\xEF\xBF\xBD")
        return

    decoded = char != -1
    # The blank field is as wide as "U-XXXXXXXX " so columns line up.
    pieces = ["U-%08X " % char if decoded else "           "]
    pieces.extend(" %02X" % value for value in bytevals)
    # Pad the byte column out to six entries of three characters each.
    pieces.append("   " * (6 - len(bytevals)))

    name = charname(char) if decoded else ""
    if name != "":
        pieces.append(" " + name)
    pieces.append(errors)
    pieces.append("\n")
    sys.stdout.write("".join(pieces))

def process_ucs(x, bytes=None, errors=""):
    """Encode code point x as UTF-8, diagnose it, and hand it to output().

    x      -- integer code point, 0 <= x < 0x80000000 (everything the
              one- to six-byte encoding built here can express).
    bytes  -- optional list of the byte values x was decoded from. If
              it is longer than the shortest encoding of x, the item
              is reported as overlong and these bytes are the ones
              passed on to output(). (The parameter name shadows the
              builtin inside this function; it is kept for callers.)
    errors -- diagnostic text gathered so far; more may be appended.

    Raises ValueError if x is outside the encodable range.
    """
    # Checked explicitly rather than with assert: an assert disappears
    # under "python -O", and it never caught negative values at all.
    if not 0 <= x < 0x80000000:
        raise ValueError("code point %r cannot be encoded as UTF-8" % (x,))

    if x < 0x80:
        utf8 = [x]
    else:
        # lead is the marker bits of the first byte, trail the number
        # of continuation bytes that follow it.
        if x < 0x800:
            lead, trail = 0xC0, 1
        elif x < 0x10000:
            lead, trail = 0xE0, 2
        elif x < 0x200000:
            lead, trail = 0xF0, 3
        elif x < 0x4000000:
            lead, trail = 0xF8, 4
        else:
            lead, trail = 0xFC, 5
        utf8 = [lead + (x >> (6 * trail))]
        # Each continuation byte carries six bits, most significant first.
        for shift in range(6 * (trail - 1), -1, -6):
            utf8.append(0x80 + (0x3F & (x >> shift)))

    if bytes and len(bytes) > len(utf8):
        shortest = "".join(" %02X" % value for value in utf8)
        errors = errors + " (overlong form of" + shortest + ")"
        # Report the bytes actually seen, not the canonical encoding.
        utf8 = bytes
    if 0xD800 <= x <= 0xDFFF:
        errors = errors + " (surrogate)"
    if 0xFFFE <= x <= 0xFFFF:
        errors = errors + " (invalid char)"

    output(x, utf8, errors)

def process_utf8(next):
    """Decode a stream of byte values as UTF-8 and report each item.

    next -- zero-argument callable returning the next byte value as an
            integer in 0..0xFF, or None once the input is exhausted.
            (The name shadows the builtin inside this function; it is
            kept for callers.)

    Complete sequences are passed to process_ucs() together with the
    bytes they were read from. Malformed input (an impossible byte, a
    stray continuation byte, or a sequence cut short) is passed to
    output() with a code point of -1 and a diagnostic. A byte that
    cuts a sequence short is not consumed by that sequence: decoding
    resumes with it.

    Raises ValueError if next() yields an integer outside 0..0xFF.
    """
    c = next()
    while c is not None:
        # Without this check an out-of-range value matches none of the
        # lead-byte ranges below and would be decoded using whatever
        # state the previous sequence left behind.
        if not 0 <= c <= 0xFF:
            raise ValueError("byte value %r is outside 0..0xFF" % (c,))

        char = [c]
        if c < 0x80:
            process_ucs(c) # single-byte char
            c = next()
        elif c >= 0xFE:
            output(-1, char, " (invalid UTF-8 byte)")
            c = next()
        elif c <= 0xBF:
            output(-1, char, " (unexpected continuation byte)")
            c = next()
        else:
            # Lead byte 0xC0..0xFD: marker is its fixed high bits and
            # cbytes the number of continuation bytes it announces.
            if c <= 0xDF:
                marker, cbytes = 0xC0, 1
            elif c <= 0xEF:
                marker, cbytes = 0xE0, 2
            elif c <= 0xF7:
                marker, cbytes = 0xF0, 3
            elif c <= 0xFB:
                marker, cbytes = 0xF8, 4
            else:
                marker, cbytes = 0xFC, 5
            acc = c & ~marker

            # Set when a byte has been read that does not belong to
            # this sequence and so is still waiting to be processed.
            lookahead = False
            while cbytes > 0:
                c = next()
                if c is None or c < 0x80 or c > 0xBF:
                    lookahead = True
                    break
                char.append(c)
                acc = (acc << 6) + (c & 0x3F)
                cbytes = cbytes - 1

            if cbytes > 0:
                output(-1, char, " (incomplete sequence)")
            else:
                process_ucs(acc, char)
            if not lookahead:
                c = next()

def do(args):
    """Analyse a list of command-line style arguments.

    Each argument is one of:
      U+XXXX or U-XXXX   a code point in hex (the "U" may be lower case)
      &#NNN; or &#xHH;   an SGML character entity, trailing ';' optional
      anything else      one UTF-8 byte value in hex, 00..FF

    Consecutive byte arguments are collected and decoded together by
    process_utf8(); code points go straight to process_ucs(). Bytes
    collected so far are decoded before a following code point is
    processed, so items are handled in the order given.

    Raises ValueError for an argument that does not parse, including a
    "U" not followed by "+" or "-" and a byte value outside 00..FF.
    """
    def stepper(values):
        # Adapt a list to the "next value, or None when exhausted"
        # callable that process_utf8() expects.
        feed = iter(values)
        return lambda: next(feed, None)

    pending = []    # byte values waiting to be decoded as one run
    for arg in args:
        if arg[:1].upper() == "U":
            # Raised rather than asserted: this is user input, and an
            # assert would be skipped entirely under "python -O".
            if arg[1:2] not in ("+", "-"):
                raise ValueError(
                    "expected U+<hex> or U-<hex>, got %r" % (arg,))
            ucs = int(arg[2:], 16)
        elif arg[:2] == "&#":
            # SGML character entity. Either &# followed by a
            # number, or &#x followed by a hex number.
            entity = arg
            if entity[-1:] == ";":
                entity = entity[:-1]
            if entity[:3].upper() == "&#X":
                ucs = int(entity[3:], 16)
            else:
                ucs = int(entity[2:], 10)
        else:
            value = int(arg, 16)
            if not 0 <= value <= 0xFF:
                raise ValueError(
                    "byte value out of range 00..FF: %r" % (arg,))
            pending.append(value)
            continue

        # A code point ends the current run of bytes.
        if pending:
            process_utf8(stepper(pending))
            pending = []
        process_ucs(ucs)

    if pending:
        process_utf8(stepper(pending))

def usage(arg):
    """Write the help text to standard output.

    arg -- the option that asked for help. When it is exactly
           "--help-admin" the database build and fetch commands are
           listed too; any other value gives the ordinary user help.
    """
    # The --test line used to end with a stray '" #' that was printed
    # as part of the help text; it has been removed.
    sys.stdout.write("""\
usage: cvt-utf8 [flags] <hex UTF-8 bytes, U+codepoints, SGML entities>
  e.g. cvt-utf8 e2 82 ac
    or cvt-utf8 U+20ac
    or cvt-utf8 U-10ffff
    or cvt-utf8 '&#8211;'

where: -o or --output        just output well-formed UTF-8 instead of
                             an analysis of the input data
       -h or --han           also give Han definitions from unihan db

 also: cvt-utf8 --test       run Markus Kuhn's decoder stress tests
       cvt-utf8 --input (or -i)
                             read, analyse and decode UTF-8 from stdin
""")

    if arg == "--help-admin":
        sys.stdout.write("""\
       cvt-utf8 --help       display user help text
       cvt-utf8 --help-admin display admin help text (this one)
       cvt-utf8 --build <infile> <outfile>
                             convert UnicodeData.txt to unicode db
       cvt-utf8 --build-unihan <infile> <outfile>
                             convert Unihan.txt to unihan db
       cvt-utf8 --fetch <outfile>
                             just download UnicodeData.txt
       cvt-utf8 --fetch-build <outfile>
                             build unicode db by download from unicode.org
       cvt-utf8 --fetch-build-unihan <outfile>
                             build Unihan db by download from unicode.org
""")
    else:
        sys.stdout.write("""\
       cvt-utf8 --help       display this help text
       cvt-utf8 --help-admin display admin help text
""")

    sys.stdout.write("""\
       cvt-utf8 --version    report version number
       cvt-utf8 --licence    display (MIT) licence text
""")

def licence():
    """Write the copyright notice and MIT licence text to standard output."""
    text = """\
cvt-utf8 is copyright 2002-2004 Simon Tatham.

Permission is hereby granted, free of charge, to any person
obtaining a copy of this software and associated documentation files
(the "Software"), to deal in the Software without restriction,
including without limitation the rights to use, copy, modify, merge,
publish, distribute, sublicense, and/or sell copies of the Software,
and to permit persons to whom the Software is furnished to do so,
subject to the following conditions:

The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
"""
    sys.stdout.write(text)

def version():
    """Write the program name and version stamp to standard output."""
    banner = "cvt-utf8, version 20260326.66a3363\n"
    sys.stdout.write(banner)

# Settings that the option flags below may change.
mode = "cmdline"        # "cmdline", "test" or "input"
han_translations = 0    # set by -h/--han
output_analysis = 1     # cleared by -o/--output
args = sys.argv[1:]

# Running with no arguments at all just prints the user help.
if not args:
    usage("")
    sys.exit(0)

# Consume leading option flags. Flags that only change a setting are
# removed from args and the loop carries on; the help, licence, version,
# build and fetch commands do their work and exit. Whatever is left in
# args afterwards is data for the analyser.
while len(args) > 0 and args[0][:1] == "-":
    if args[0] == "--help" or args[0] == "--help-admin":
        usage(args[0])
        sys.exit(0)

    elif args[0] == "--licence" or args[0] == "--license":
        licence()
        sys.exit(0)

    elif args[0] == "--version":
        version()
        sys.exit(0)

    elif args[0] == "-o" or args[0] == "--output":
        output_analysis = 0
        args = args[1:]

    elif args[0] == "-h" or args[0] == "--han":
        han_translations = 1
        args = args[1:]

    elif args[0] == "--build" or args[0] == "--fetch-build":
        if args[0] == "--build":
            if len(args) != 3:
                sys.exit("cvt-utf8: --build expects two filename arguments")
            infile = open(args[1], "rb")
            outfile = args[2]
        else:
            if len(args) != 2:
                sys.exit("cvt-utf8: --fetch-build expects one filename"
                         " argument")
            infile = urlopen("https://www.unicode.org/Public/UNIDATA/UnicodeData.txt")
            outfile = args[1]
        # Now build the database. try/finally makes sure the input and
        # the database are both closed even if a line fails to decode.
        try:
            if outfile[-3:] == ".db":
                sys.stderr.write("cvt-utf8: warning: you should not append"
                                 " .db to db name\n")

            db = dbm.open(outfile, "n")
            try:
                # Each line is stored whole, keyed by its first
                # ';'-separated field.
                for raw in iter(infile.readline, b""):
                    s = raw.decode("UTF-8")
                    db[s.split(";")[0]] = s
            finally:
                db.close()
        finally:
            infile.close()
        sys.exit(0)

    elif args[0] == "--fetch":
        if len(args) != 2:
            sys.exit("cvt-utf8: --fetch expects one filename argument")
        infile = urlopen("https://www.unicode.org/Public/UNIDATA/UnicodeData.txt")
        outfile = args[1]
        try:
            with open(outfile, "wb") as outfh:
                # Copy the download to disk in 64 KiB pieces.
                for chunk in iter(lambda: infile.read(65536), b""):
                    outfh.write(chunk)
        finally:
            infile.close()
        sys.exit(0)

    elif args[0] == "--build-unihan" or args[0] == "--fetch-build-unihan":
        if args[0] == "--build-unihan":
            if len(args) != 3:
                sys.exit("cvt-utf8: --build-unihan expects two filename"
                         " arguments")
            infile = open(args[1], "rb")
            outfile = args[2]
        else:
            if len(args) != 2:
                sys.exit("cvt-utf8: --fetch-build-unihan expects one filename"
                         " argument")
            infile = urlopen("https://www.unicode.org/Public/UCD/latest/ucd/Unihan.zip")
            outfile = args[1]
        try:
            data = infile.read()
        finally:
            infile.close()

        # Now build the database.
        if outfile[-3:] == ".db":
            sys.stderr.write("cvt-utf8: warning: you should not append .db to"
                             " db name\n")
        db = dbm.open(outfile, "n")
        try:
            def process_file(text):
                # Keep only the kDefinition records: three tab-separated
                # fields whose first is "U+" followed by the code point.
                for line in text.splitlines():
                    sa = line.split("\t")
                    if (len(sa) == 3 and sa[1] == "kDefinition" and
                        sa[0][:2] == "U+"):
                        db[sa[0][2:]] = sa[2].encode('UTF-8')

            if data[:2] != b"PK":
                # Not a zip archive: treat it as one plain text file.
                process_file(data.decode('UTF-8'))
            else:
                with zipfile.ZipFile(io.BytesIO(data), "r") as zf:
                    for name in zf.namelist():
                        process_file(zf.read(name).decode('UTF-8'))
        finally:
            db.close()
        sys.exit(0)

    elif args[0] == "--test":
        mode = "test"
        args = args[1:]

    elif args[0] == "--input" or args[0] == "-i":
        mode = "input"
        args = args[1:]

    else:
        sys.stderr.write("cvt-utf8: unknown argument '%s'\n" % args[0])
        sys.exit(1)

# Places to look for the unicode name database, in order of preference.
locations = [
    "/usr/share/unicode/unicode",
    "/usr/lib/unicode/unicode",
    "/usr/local/share/unicode/unicode",
    "/usr/local/lib/unicode/unicode",
]
# HOME may be unset (for example under a stripped-down service
# environment); in that case only the system-wide locations are tried
# instead of failing with KeyError.
home = os.environ.get("HOME")
if home is not None:
    locations.append(home + "/share/unicode/unicode")
    locations.append(home + "/lib/unicode/unicode")

# Open the first database that exists. db stays None when none can be
# opened, and charname() then returns empty names. loc is left at the
# location that worked, or at the last one tried.
db = None
for loc in locations:
    try:
        db = dbm.open(loc, "r")
    except IOError:
        continue
    except dbm.error:
        continue
    break
if han_translations:
    # The unihan database lives beside the unicode one.
    i = loc.rfind("/")
    assert i >= 0
    hanloc = loc[:i+1] + "unihan"
    handb = dbm.open(hanloc, "r")
    # this has been explicitly required, so we don't squelch exceptions

# Run whichever mode the options selected: the built-in decoder stress
# tests, decoding of standard input, or analysis of the remaining
# command-line arguments.
if mode == "test":
    # A short piece of correct UTF-8 text.
    do(["CE","BA","E1","BD","B9","CF","83","CE","BC","CE","B5"])
    # First possible sequence of each length, one to six bytes.
    do(["00"])
    do(["C2","80"])
    do(["E0","A0","80"])
    do(["F0","90","80","80"])
    do(["F8","88","80","80","80"])
    do(["FC","84","80","80","80","80"])
    # Last possible sequence of each length.
    do(["7F"])
    do(["DF","BF"])
    do(["EF","BF","BF"])
    do(["F7","BF","BF","BF"])
    do(["FB","BF","BF","BF","BF"])
    do(["FD","BF","BF","BF","BF","BF"])
    # Other boundaries: U+D7FF, U+E000, U+FFFD, U+10FFFF, U+110000.
    do(["ED","9F","BF"])
    do(["EE","80","80"])
    do(["EF","BF","BD"])
    do(["F4","8F","BF","BF"])
    do(["F4","90","80","80"])
    # Continuation bytes with no lead byte: singly, in runs of two to
    # seven, and then all 64 of them in a row.
    do(["80"])
    do(["BF"])
    do(["80","BF"])
    do(["80","BF","80"])
    do(["80","BF","80","BF"])
    do(["80","BF","80","BF","80"])
    do(["80","BF","80","BF","80","BF"])
    do(["80","BF","80","BF","80","BF","80"])
    do(["80","81","82","83","84","85","86","87",
    "88","89","8A","8B","8C","8D","8E","8F",
    "90","91","92","93","94","95","96","97",
    "98","99","9A","9B","9C","9D","9E","9F",
    "A0","A1","A2","A3","A4","A5","A6","A7",
    "A8","A9","AA","AB","AC","AD","AE","AF",
    "B0","B1","B2","B3","B4","B5","B6","B7",
    "B8","B9","BA","BB","BC","BD","BE","BF"])
    # Every lead byte, each followed by a space instead of the
    # continuation bytes it calls for, grouped by sequence length.
    do(["C0","20","C1","20","C2","20","C3","20",
    "C4","20","C5","20","C6","20","C7","20",
    "C8","20","C9","20","CA","20","CB","20",
    "CC","20","CD","20","CE","20","CF","20",
    "D0","20","D1","20","D2","20","D3","20",
    "D4","20","D5","20","D6","20","D7","20",
    "D8","20","D9","20","DA","20","DB","20",
    "DC","20","DD","20","DE","20","DF","20"])
    do(["E0","20","E1","20","E2","20","E3","20",
    "E4","20","E5","20","E6","20","E7","20",
    "E8","20","E9","20","EA","20","EB","20",
    "EC","20","ED","20","EE","20","EF","20"])
    do(["F0","20","F1","20","F2","20","F3","20",
    "F4","20","F5","20","F6","20","F7","20"])
    do(["F8","20","F9","20","FA","20","FB","20"])
    do(["FC","20","FD","20"])
    # Sequences with the last continuation byte missing.
    do(["C0"])
    do(["E0","80"])
    do(["F0","80","80"])
    do(["F8","80","80","80"])
    do(["FC","80","80","80","80"])
    do(["DF"])
    do(["EF","BF"])
    do(["F7","BF","BF"])
    do(["FB","BF","BF","BF"])
    do(["FD","BF","BF","BF","BF"])
    # The ten incomplete sequences above, run together.
    do(["C0","E0","80","F0","80","80","F8","80",
    "80","80","FC","80","80","80","80",
    "DF","EF","BF","F7","BF","BF","FB",
    "BF","BF","BF","FD","BF","BF","BF","BF"])
    # Bytes that can never appear in UTF-8.
    do(["FE"])
    do(["FF"])
    do(["FE","FE","FF","FF"])
    # Overlong encodings of "/" (0x2F) in two to six bytes.
    do(["C0","AF"])
    do(["E0","80","AF"])
    do(["F0","80","80","AF"])
    do(["F8","80","80","80","AF"])
    do(["FC","80","80","80","80","AF"])
    # The largest value that is still overlong at each length.
    do(["C1","BF"])
    do(["E0","9F","BF"])
    do(["F0","8F","BF","BF"])
    do(["F8","87","BF","BF","BF"])
    do(["FC","83","BF","BF","BF","BF"])
    # Overlong encodings of NUL.
    do(["C0","80"])
    do(["E0","80","80"])
    do(["F0","80","80","80"])
    do(["F8","80","80","80","80"])
    do(["FC","80","80","80","80","80"])
    # Single UTF-16 surrogates: U+D800, U+DB7F, U+DB80, U+DBFF,
    # U+DC00, U+DF80, U+DFFF.
    do(["ED","A0","80"])
    do(["ED","AD","BF"])
    do(["ED","AE","80"])
    do(["ED","AF","BF"])
    do(["ED","B0","80"])
    do(["ED","BE","80"])
    do(["ED","BF","BF"])
    # Paired surrogates: each of the high surrogates U+D800, U+DB7F,
    # U+DB80, U+DBFF followed by the low surrogates U+DC00 and U+DFFF.
    do(["ED","A0","80","ED","B0","80"])
    do(["ED","A0","80","ED","BF","BF"])
    do(["ED","AD","BF","ED","B0","80"])
    do(["ED","AD","BF","ED","BF","BF"])
    do(["ED","AE","80","ED","B0","80"])
    do(["ED","AE","80","ED","BF","BF"])
    do(["ED","AF","BF","ED","B0","80"])
    # The last byte here used to be 8F, which broke the pattern above;
    # the pair is U+DBFF U+DFFF.
    do(["ED","AF","BF","ED","BF","BF"])
    # U+FFFE and U+FFFF.
    do(["EF","BF","BE"])
    do(["EF","BF","BF"])
elif mode == "input":
    def getchar():
        # Return the next byte of standard input as an integer, or
        # None at end of input.
        s = sys.stdin.buffer.read(1)
        if s == b"":
            return None
        return s[0] & 0xFF   # ensure it isn't negative
    process_utf8(getchar)
else:
    do(args)
