#!/usr/bin/env python

# Using the database presented in
# http://www.bortzmeyer.org/unicode-to-sql.html, this program
# decomposes a Unicode string into its fully decomposed form.

# By default, it does canonical decomposition only. Option -o also
# performs compatibility decomposition.

# By default, the argument must be a Unicode string in the encoding
# specified by locale_charset. Option -n changes that: the arguments
# must be a group of Unicode codepoints expressed in the standard
# syntax (U+nnnn).

# With option -r, full treatment of RFC 5051 ("i;unicode-casemap -
# Simple Unicode Collation Algorithm") is done (in practice, it means
# we titlecase first).

# TODO: allow specifying the encoding on the CLI

import psycopg2
import getopt
import sys
import re
import quopri

# Fixed settings.
# Database name handed to psycopg2.connect().
db = "ucd"
# Encoding used to decode the command-line argument when -n is not given.
# TODO: get it from the environment
locale_charset = "latin-1"
# When true, decompose() traces every step on standard output.
debug = False

# Flags that the command-line options can switch on:
# -n (arguments are U+nnnn code points), -o (compatibility decomposition
# as well) and -r (RFC 5051: titlecase first, implies -o).
num_codepoints = compatibility_decomp = titlecase = False

def decompose(str, cursor, canonical_only=True, titlecase=False):
    """Return the fully (recursively) decomposed form of a Unicode string.

    Each character is looked up in the Characters table through `cursor`
    and replaced by its decomposition, which is itself decomposed in turn
    until nothing can be decomposed any further.

    Arguments:
    str -- the unicode string to decompose
    cursor -- an open DB-API cursor on the Unicode database
    canonical_only -- if true, a character whose decomposition_type is not
        NULL is kept as is (presumably a NULL type marks a canonical
        decomposition -- confirm against the database schema)
    titlecase -- if true, each character of `str` is first replaced by its
        simple titlecase mapping when it has one; this is not applied
        again to the characters produced by a decomposition

    Raises Exception if a character is not found in the database.
    """
    pieces = []
    for char in str:
        if debug:
            sys.stdout.write("Handling character %s...\n" % unicode_syntax(char))
        if titlecase:
            # TODO: implement SpecialCasing for titlecase, not just simple titlecase
            cursor.execute("""
             SELECT titlecase FROM Characters
                        WHERE codepoint=%s
                        """, (ord(char), ))
            title_tuple = cursor.fetchone()
            if title_tuple is None:
                raise Exception("No such Unicode character " + unicode_syntax(char))
            if title_tuple[0] is not None:
                char = unichr(title_tuple[0])
                if debug:
                    sys.stdout.write("Titlecasing to %s\n" % unicode_syntax(char))
        cursor.execute("""
             SELECT decomposition_type,decomposition FROM Characters
                        WHERE codepoint=%s
                        """, (ord(char), ))
        decomp_tuple = cursor.fetchone()
        if decomp_tuple is None:
            # Unknown code point: report it the same way as the titlecase
            # lookup does instead of failing on decomp_tuple[0].
            raise Exception("No such Unicode character " + unicode_syntax(char))
        decomposition_type, decomposition = decomp_tuple
        if decomposition is None or (canonical_only and
                                     decomposition_type is not None):
            if debug:
                sys.stdout.write("Character %s not decomposed\n" % unicode_syntax(char))
            pieces.append(char)
        else:
            # The decomposition may itself contain decomposable characters,
            # hence the recursion. Titlecasing is deliberately not passed on.
            decomposition_result = decompose(unicode_string(decomposition),
                                             cursor, canonical_only)
            if debug:
                sys.stdout.write("Character %s decomposed in %s\n" %
                                 (unicode_syntax(char),
                                  [unicode_syntax(c) for c in decomposition_result]))
            pieces.append(decomposition_result)
    return u"".join(pieces)

def unicode_syntax(char):
    """Return the code point of `char` as "U+" followed by six uppercase
    hexadecimal digits (zero-padded), e.g. U+0000E9."""
    codepoint = ord(char)
    return "U+%06X" % codepoint

def unicode_string(int_array):
    """Return the unicode string whose characters are the code points
    (integers) of `int_array`, in the same order."""
    return u"".join([unichr(codepoint) for codepoint in int_array])

def make_unicode_string(cp_array):
    """Make a Unicode string from an array of strings representing
    Unicode code points in the standard syntax, U+nnnn.

    Each item is a hexadecimal number, with or without the "U+" prefix
    (both are case-insensitive), e.g. ["U+0041", "e9"].

    Returns the unicode string made of these characters, in order. On the
    first item that is not a valid code point (bad syntax, or a value
    that unichr() rejects), an error message and the usage are written to
    stderr and the program exits with status 1.
    """
    unicode_cp_re = re.compile(r"^(U\+)?([0-9a-f]+)$", re.IGNORECASE)
    chars = []
    for cp in cp_array:
        char = None
        match = unicode_cp_re.match(cp)
        if match:
            try:
                # int(..., 16) replaces the undocumented quopri.unhex helper
                char = unichr(int(match.group(2), 16))
            except (ValueError, OverflowError):
                # Well-formed number, but not a code point unichr() accepts
                char = None
        if char is None:
            sys.stderr.write("Invalid Unicode code point \"%s\"\n" % cp)
            usage()
            sys.exit(1)
        chars.append(char)
    return u"".join(chars)

def disassemble(str):
    """Return the U+nnnnnn notation of every character of `str`, each one
    followed by a space (so the result ends with a space unless `str` is
    empty)."""
    return "".join([unicode_syntax(char) + " " for char in str])

def usage(msg=None):
    """Write the usage line to stderr, preceded by `msg` on its own line
    when one is given."""
    err = sys.stderr
    if msg is not None:
        err.write("%s\n" % msg)
    synopsis = "Usage: %s [-n] [-o] [-r] string (Unicode codepoints if -n)\n"
    err.write(synopsis % sys.argv[0])
    
# Command-line parsing. Only the getopt call can raise getopt.error, so it
# is the only statement guarded by the try.
try:
    optlist, args = getopt.getopt(sys.argv[1:], "hnor",
                                  ["help", "numeric-codepoints",
                                   "compatibility", "rfc5051"])
except getopt.error as reason:
    usage(reason)
    sys.exit(1)
for option, value in optlist:
    if option in ("--help", "-h"):
        usage()
        sys.exit(0)
    elif option in ("--numeric-codepoints", "-n"):
        num_codepoints = True
    elif option in ("--compatibility", "-o"):
        compatibility_decomp = True
    elif option in ("--rfc5051", "-r"):
        # RFC 5051: titlecase first, then compatibility decomposition
        compatibility_decomp = True
        titlecase = True
if not num_codepoints and len(args) != 1:
    usage("Without -n, only one argument, the string to decompose")
    sys.exit(1)
if num_codepoints and not args:
    usage()
    sys.exit(1)

# Build the unicode string to decompose, either from U+nnnn code points
# or by decoding the single argument.
if num_codepoints:
    query = make_unicode_string(args)
else:
    query = args[0].decode(locale_charset)

connection = psycopg2.connect("dbname=%s" % db)
try:
    cursor = connection.cursor()
    try:
        result = decompose(query, cursor, not compatibility_decomp, titlecase)
        print(disassemble(result))
    finally:
        # Release the cursor even if the decomposition failed
        cursor.close()
finally:
    connection.close()