#!/usr/bin/python

# Try to make the tables of PVALID or DISALLOWED according to RFC 5892

import psycopg2
import unicodedata
import sys

# Relies on the database presented in <http://www.bortzmeyer.org/unicode-to-sql.html>
db = "ucd"
debug = True

# Section 2.6
# Table of code points whose status is fixed by hand. Isolated code points
# are listed one by one; contiguous runs are written as ranges.
exceptions = {}
exceptions.update(dict.fromkeys(
    [0x00DF, 0x03C2, 0x06FD, 0x06FE, 0x0F0B, 0x3007], "PVALID"))
exceptions.update(dict.fromkeys(
    [0x00B7, 0x0375, 0x05F3, 0x05F4, 0x30FB], "CONTEXTO"))
exceptions.update(dict.fromkeys(range(0x0660, 0x0669 + 1), "CONTEXTO"))
exceptions.update(dict.fromkeys(range(0x06F0, 0x06F9 + 1), "CONTEXTO"))
exceptions.update(dict.fromkeys(
    [0x0640, 0x07FA, 0x302E, 0x302F, 0x303B], "DISALLOWED"))
exceptions.update(dict.fromkeys(range(0x3031, 0x3035 + 1), "DISALLOWED"))

# Section 2.7
# Same layout as "exceptions" (code point -> status); currently empty
backwards_compatible = dict()

# Section 2.4
# Names of the blocks whose characters are all DISALLOWED
ignorable_blocks = [
    "Combining Diacritical Marks for Symbols",
    "Musical Symbols",
    "Ancient Greek Musical Notation",
]

# Open the database. Two cursors are created on the same connection:
# "cursor" for the main queries, "cursor2" for the per-character lookups
# made while the results of "cursor" are being processed.
connection = psycopg2.connect("dbname=%s" % db)
cursor = connection.cursor()
cursor2 = connection.cursor()

# Highest code point stored in the Characters table
cursor.execute("""SELECT max(codepoint) FROM Characters""")
max_cp = int(cursor.fetchone()[0])

def tocasefold(str):
    """Return the case-folded form of a Unicode string, using the database.

    Every character of the string is looked up in the Characters table
    (through the module-level cursor2). When its "casefolding" column holds
    a non-empty sequence of code points, the character is replaced by the
    corresponding characters; otherwise it is kept unchanged.

    A character which has no row at all in Characters is also kept
    unchanged. This can happen because the caller normalizes with the
    unicodedata module, which may produce characters the database does
    not know about.

    str -- the Unicode string to fold (the name shadows the builtin; it is
           kept so that existing callers are not affected)

    Returns the folded Unicode string.
    """
    pieces = []
    for char in str:
        cursor2.execute("""SELECT casefolding FROM Characters 
             WHERE codepoint = %s""", (ord(char), ))
        row = cursor2.fetchone()
        # No row: the code point is unknown to the database, no folding
        casefold = row[0] if row is not None else None
        if not casefold:
            pieces.append(char)
            continue
        # presumably an array of integer code points -- verify against
        # the database schema
        for cchar in casefold:
            pieces.append(unichr(int(cchar)))
    return u"".join(pieces)

def utext(str):
    """Return a printable form of a string, one "<U+hex>" tag per character.

    The code point of each character is written in lowercase hexadecimal,
    without padding. An empty string gives an empty result.
    """
    # TODO: standard presentation, uppercase and padding
    tags = ["<U+%x>" % ord(char) for char in str]
    return "".join(tags)


# Translate the names of the ignorable blocks (Section 2.4) into the
# identifiers stored in the Blocks table, so that they can be compared
# with the "block" value of each character.
ignorable_blocks_id = []
for block in ignorable_blocks:
    # Block names are case-insensitive
    cursor.execute("""SELECT id FROM Blocks WHERE name ILIKE %s""", (block, ))
    row = cursor.fetchone()
    if row is None:
        raise Exception("Cannot get the ID of block \"%s\"" % block)
    # fetchone() returns the whole row as a tuple. Keep only the id itself:
    # a tuple would never compare equal to the scalar block value of a
    # character, and the IgnorableBlocks test would silently never match.
    block_id = row[0]
    ignorable_blocks_id.append(block_id)

# Ask for every character of the database, in code point order. The rows
# are left pending on "cursor"; they are fetched by the classification loop.
cursor.execute("""SELECT codepoint, name, category, block FROM Characters 
                                ORDER BY codepoint;""")

# One slot per code point from 0 to max_cp, indexed by the code point.
# A slot stays None when no status has been computed for that code point.
all_characters = [None] * (max_cp + 1)
if debug:
    sys.stderr.write(
        "Handling %i characters, the maximum code point is %x\n"
        % (cursor.rowcount, max_cp))

# IgnorableProperties (Section 2.3): a code point carrying any of these
# properties is DISALLOWED.
ignorable_properties = frozenset(["Default_Ignorable_Code_Point",
                                  "White_Space",
                                  "Noncharacter_Code_Point"])

# Classify every character returned by the pending query on "cursor" and
# store its status in all_characters, at the index of its code point.
# The tests follow the order of the algorithm of RFC 5892, Section 3:
# the first test which matches decides the status.
for character in cursor.fetchall():

    cp = int(character[0])
    name = character[1]
    category = character[2]
    block = character[3]

    # A code point may have several rows in Characters_properties (several
    # properties). Collect them all: looking only at the first row
    # returned would make the property tests below depend on row order.
    cursor2.execute("""SELECT property FROM 
                          Characters_properties
                        WHERE codepoint = %s""", (cp, ))
    properties = set(row[0] for row in cursor2.fetchall())
    if debug:
        sys.stderr.write("Testing U+%x (%s, category %s)... " % (cp, name, category))

    #    o  If the codepoint is in Exceptions the value is
    #      according to the table in Section 2.6
    if cp in exceptions:
        status = exceptions[cp]
        if debug:
            sys.stderr.write("Character %s because Exception\n" % status)
        all_characters[cp] = status
        continue

    #  o If the codepoint is in BackwardCompatible, the
    #      value is according to the table in Section 2.7.
    if cp in backwards_compatible:
        status = backwards_compatible[cp]
        if debug:
            sys.stderr.write("Character %s because Backward Compatible\n" % status)
        all_characters[cp] = status
        continue

    #   o  If the codepoint is in Unassigned (Section 2.10), the value is
    #      UNASSIGNED.
    # With the regular UnicodeData.txt, it never matches, because this file 
    # does not contain Unassigned characters. U+038B, for instance.
    if category == "Cn" and "Noncharacter_Code_Point" not in properties:
        if debug:
            sys.stderr.write("Character UNASSIGNED because Unassigned\n")
        all_characters[cp] = "UNASSIGNED"
        continue

    #   o  If the codepoint is in LDH (Section 2.5), the value is PVALID.
    # LDH: the hyphen, the digits 0-9 and the lowercase letters a-z
    if cp == 0x002D or (0x0030 <= cp <= 0x0039) or (0x0061 <= cp <= 0x007A):
        if debug:
            sys.stderr.write("Character PVALID because LDH\n")
        all_characters[cp] = "PVALID"
        continue

    #  o  If the codepoint is in JoinControl (Section 2.8), the value is
    #   CONTEXTJ.
    if "Join_Control" in properties:
        if debug:
            sys.stderr.write("Character CONTEXTJ because Join Control\n")
        all_characters[cp] = "CONTEXTJ"
        continue

    # o  If the codepoint is in Unstable (Section 2.2), the value is
    #   DISALLOWED.
    # A code point is stable when normalizing it (NFKC), case-folding the
    # result and normalizing again gives back the code point itself.
    # Warning: using the unicodedata module makes us depend on a
    # version of Unicode which may be different from the database. We
    # should use the database instead. TODO
    normalized = unicodedata.normalize("NFKC", unichr(cp))
    folded = tocasefold(normalized)
    renormalized = unicodedata.normalize("NFKC", folded)
    if len(renormalized) != 1 or ord(renormalized) != cp:
        if debug:
            sys.stderr.write("Character DISALLOWED because Unstable (%s != %s)\n" % \
                      (utext(renormalized), utext(unichr(cp))))
        all_characters[cp] = "DISALLOWED"
        continue

    # o  If the codepoint is in IgnorableProperties (Section 2.3), the
    #  value is DISALLOWED.
    if properties & ignorable_properties:
        if debug:
            sys.stderr.write("Character DISALLOWED because Ignorable Properties\n")
        all_characters[cp] = "DISALLOWED"
        continue

    # o  If the codepoint is in IgnorableBlocks (Section 2.4), the value
    #  is DISALLOWED.
    if block in ignorable_blocks_id:
        if debug:
            sys.stderr.write("Character DISALLOWED because Ignorable Blocks\n")
        all_characters[cp] = "DISALLOWED"
        continue

    # TODO: test HangulJamo (OldHangulJamo, Section 2.9, comes at this
    # place in the algorithm and is not implemented)
    
    # o  If the codepoint is in LetterDigits (Section 2.1), the value is
    #  PVALID.
    if category in ["Ll", "Lu", "Lo", "Nd", "Lm", "Mn", "Mc"]:
        if debug:
            sys.stderr.write("Character PVALID because LetterDigits\n")
        all_characters[cp] = "PVALID"
        continue

    # o  If the codepoint is still there, the value
    #   is DISALLOWED.
    # This is the last test
    if debug:
        sys.stderr.write("Character DISALLOWED by default\n")
    all_characters[cp] = "DISALLOWED"
 
# Print the resulting table on standard output, one line per code point
# which received a status, in increasing code point order. Code points
# whose slot is still None (never classified) are skipped.
for cp, status in enumerate(all_characters):
    if status is not None:
        # TODO: better presentation of Unicode code points, see the utext routine
        sys.stdout.write("U+%x ; %s\n" % (cp, status))

# Release both cursors, then the connection
cursor.close()
cursor2.close()
connection.close()


