#!/usr/bin/env python

"""

A partial implementation of the RFC 5147 "URI Fragment Identifiers for the
text/plain Media Type", hereafter named "the specification".

Use as you please, under whatever licence you choose.

Stephane Bortzmeyer <bortzmeyer@nic.fr>

"""

import string
import cStringIO as StringIO

class ParsingError(Exception):
    """Raised by the parse_* functions when the input does not have the
    expected syntax."""

class SemanticError(Exception):
    """Raised when the input is syntactically valid but meaningless, for
    instance a range whose upper bound is below its lower bound."""

class HashError(Exception):
    """Raised when an integrity check carried by a fragment identifier
    does not match the document it is applied to."""

class TextFragment:
    """The result of applying a TextFragmentIdentifier to a string.

    A fragment built with the "range" method holds a single string
    (attribute "content"). A fragment built with the "position" method
    holds the text found before and after the position (attributes
    "before" and "after"); either of them may be None.
    """

    def __init__(self, type, method, str1, str2=None, spaces_padding=20):
        """Build a fragment.

        type is "char" or "line"; method is "range" or "position".
        For "range", str1 is the content and str2 must be None. For
        "position", str1 is the text before the position and str2 the
        text after it. spaces_padding is the total number of spaces drawn
        around the cursor marker by __str__ (half on each side).

        Raises Exception on an unknown type or method, or when str2 is
        given for a "range" fragment.
        """
        self.method = method
        if type not in ("char", "line"):
            # The offending value is now interpolated in the message (it
            # used to show a literal "%s").
            raise Exception("Invalid type \"%s\" for a text fragment" % type)
        self.type = type
        self.spaces_padding = spaces_padding
        if self.method == "range":
            if str2 is not None:
                raise Exception("str2 is not None but method is range")
            self.content = str1
        elif self.method == "position":
            self.before = str1
            self.after = str2
        else:
            raise Exception("Invalid method \"%s\" for a text fragment" % method)

    def __str__(self):
        """Return the content of a range, or the text around a position
        with a visible cursor marker in the middle."""
        if self.method == "range":
            return self.content
        padding = " " * int(self.spaces_padding / 2)
        # This is our replacement for a display cursor (section 2.1.1
        # of the specification)
        full_padding = padding + "<<<|>>>" + padding
        if self.type == "line":
            # A line position sits between two lines: give the marker a
            # line of its own.
            full_padding = full_padding + "\n"
        if self.before is not None:
            before = self.before
        else:
            before = "*** START ***\n"
        if self.after is not None:
            after = self.after
        else:
            after = "*** END ***"
        return before + full_padding + after
        
class TextFragmentIdentifier:
    """A parsed fragment identifier ("char=..." or "line=...").

    Attributes set by the constructor:
      type              -- "char" or "line".
      characters_around -- default number of characters shown on each
                           side of a character position.
      hashes            -- list of integrity checks; get_fragment reads
                           the "type" and "value" attributes of the
                           first one.

    Attributes that are NOT set by the constructor and must be assigned
    before __str__ or get_fragment is used:
      method   -- "position" or "range".
      position -- integer, when method is "position".
      start    -- integer, when method is "range".
      stop     -- integer, or None for "until the end", when method is
                  "range".
    """

    def __init__(self, type, characters_around=10):
        self.type = type
        self.characters_around = characters_around
        self.hashes = []

    def __str__(self):
        """Return the textual form of the identifier, without hashes.

        Raises Exception if type or method holds an unknown value.
        """
        if self.type not in ("char", "line") or \
           self.method not in ("position", "range"):
            raise Exception("Inconsistent TextFragmentIdentifier")
        if self.method == "position":
            return "%s=%i" % (self.type, self.position)
        if self.stop is not None:
            return "%s=%i,%i" % (self.type, self.start, self.stop)
        return "%s=%i,MAX" % (self.type, self.start)

    def get_fragment(self, str, characters_around=None):
        """ Applies the text fragment self to the string str.

        str is expected to be an Unicode string (but this is not
        tested).

        str is expected to be normalized, i.e. all line-endings
        reduced to a *single* character (see section 4.1 of the spec).

        get_fragment returns a TextFragment: a "range" one for range
        identifiers, a "position" one (text before / text after) for
        position identifiers and for ranges starting at or beyond the end
        of the document. The amount of characters on each side of a
        character position is given by the characters_around attribute
        (which can be overridden by the parameter of the same name).

        For unknown reasons, there is no similar parameter "lines_around":
        a line position yields exactly one line on each side.

        Raises HashError when a length check does not match len(str) and
        Exception when type or method holds an unknown value.
        """
        if characters_around is None:
            characters_around = self.characters_around
        self._check_hashes(str)
        method = self.method
        position = start = stop = None
        if method == "position":
            position = self.position
        elif method == "range":
            start = self.start
            stop = self.stop
        if self.type == "char":
            return self._char_fragment(str, method, position, start, stop,
                                       characters_around)
        elif self.type == "line":
            return self._line_fragment(str, method, position, start, stop)
        raise Exception("Inconsistent TextFragmentIdentifier")

    def _check_hashes(self, text):
        """Verify the first integrity check, if any, against text."""
        if not self.hashes:
            return
        # We are lazy, we use only the first one, the specification is
        # not clear, anyway, on how to interpret several hashes
        first = self.hashes[0]
        if first.type == "length" and first.value != len(text):
            raise HashError
        # TODO: implement the "md5" check. It and any other scheme are
        # silently ignored for now.

    def _char_fragment(self, text, method, position, start, stop, around):
        """Build the TextFragment for a "char" identifier."""
        length = len(text)
        if method == "range" and start >= length:
            # Small special case, rare and annoying: both bounds are
            # clamped to the end of the document, so they are equal and
            # the range is really a position (sections 2.1.1 and 4.2, not
            # obvious in the specification).
            method = "position"
            position = start
        if method == "range":
            # A stop of None means "until the end", which is exactly what
            # slicing does.
            return TextFragment("char", "range", text[start:stop])
        if method != "position":
            raise Exception("Inconsistent TextFragmentIdentifier")
        # Section 2.1.1 and 4.2: a position beyond the end of the
        # document is the position after its last character.
        cursor = max(0, min(position, length))
        first = max(0, cursor - around)
        last = min(length, cursor + around)
        # None tells TextFragment that the cursor touches the start or
        # the end of the document.
        before = None
        after = None
        if cursor > 0:
            before = text[first:cursor]
        if cursor < length:
            after = text[cursor:last]
        return TextFragment("char", "position", before, after)

    def _line_fragment(self, text, method, position, start, stop):
        """Build the TextFragment for a "line" identifier."""
        if method not in ("position", "range"):
            raise Exception("Inconsistent TextFragmentIdentifier")
        # Every line keeps its line terminator.
        lines = list(StringIO.StringIO(text))
        count = len(lines)
        if method == "range" and start >= count: # Section 2.1.1 and 4.2
            method = "position"
            position = start
        if method == "range":
            if stop is None or stop > count: # Section 2.1.1 and 4.2
                stop = count
            return TextFragment("line", "range", "".join(lines[start:stop]))
        cursor = max(0, min(position, count))
        # One line on each side of the position; None at the very start
        # or end of the document (both for an empty document).
        before = None
        after = None
        if cursor > 0:
            before = lines[cursor - 1]
        if cursor < count:
            after = lines[cursor]
        return TextFragment("line", "position", before, after)

# End of class TextFragmentIdentifier

class HashInfo:
    """An integrity check attached to a fragment identifier.

    type is "length" or "md5"; value is stored converted to a number.
    Any other type raises Exception.
    """

    def __init__(self, type, value):
        self.type = type
        if type == "length":
            number = int(value)
        elif type == "md5":
            number = long(value)
        else:
            raise Exception("Invalid type \"%s\" for a hash" % type)
        self.value = number
        
def lines_of(str):
    """Return the number of lines in str. A last line without a line
    terminator counts as a line; an empty string has no line."""
    return sum(1 for _ in StringIO.StringIO(str))

def isidchar(ch):
    """Tell whether ch may appear in an encoding name: a digit, a letter
    or a hyphen."""
    return isdigit(ch) or isletter(ch) or ch == "-"

def isletter(ch):
    """Tell whether ch is a single ASCII letter (A-Z or a-z).

    The characters are compared directly instead of going through the
    deprecated string.lower function: no case folding takes place, so a
    non-ASCII character whose lowercase form is an ASCII letter is not
    mistaken for a letter. Anything that is not exactly one character
    long is not a letter.
    """
    if len(ch) != 1:
        return False
    return ("a" <= ch <= "z") or ("A" <= ch <= "Z")

def isdigit(ch):
    """Tell whether ch is one of the ASCII decimal digits 0 to 9."""
    return ch in tuple("0123456789")
        
def ishexdigit(ch):
    """Tell whether ch is an ASCII hexadecimal digit (0-9, A-F or a-f)."""
    return ch in tuple("0123456789ABCDEFabcdef")
        
def parse_number(str):
    """Read the decimal number at the beginning of str.

    Returns a couple (number, remaining text). Raises ParsingError if
    str does not start with a digit.
    """
    end = 0
    while end < len(str) and isdigit(str[end]):
        end = end + 1
    if end == 0:
        raise ParsingError
    return (int(str[:end]), str[end:])
            
def parse_hexnumber(str):
    """Read the hexadecimal number at the beginning of str.

    Returns a couple (number, remaining text). Raises ParsingError if
    str does not start with a hexadecimal digit.
    """
    end = 0
    while end < len(str) and ishexdigit(str[end]):
        end = end + 1
    if end == 0:
        raise ParsingError
    return (long(str[:end], 16), str[end:])
            
def parse_position(str):
    """Read a position, which is a plain number, at the beginning of str.

    Returns a couple (position, remaining text); ParsingError raised by
    parse_number is passed through.
    """
    position, remainder = parse_number(str)
    return (position, remainder)

def parse_range(str):
    """Read a range ("m,n", "m," or ",n") at the beginning of str.

    Returns a triple (m, n, remaining text). m is 0 when the first bound
    is omitted; n is None when the second bound is omitted, which stands
    for the end of the document (section 2.1.1).

    Raises ParsingError when str does not start with a range (including
    when it is empty) and SemanticError when n is lower than m.
    """
    # Slices are used rather than indexing so that an empty string ends
    # up as a ParsingError and not as an IndexError.
    if str[0:1] == ",":
        # The first bound can be omitted...
        m = 0
        rest = str
    else:
        (m, rest) = parse_position(str)
        if rest[0:1] != ",":
            raise ParsingError
    rest = rest[1:] # Skip the comma
    try:
        (n, rest) = parse_position(rest)
    except ParsingError:
        n = None # We use None to represent the upper bound of the future
        # string, Section 2.1.1
    # Section 2.1.1
    if n is not None and n < m:
        raise SemanticError("Lower bound must be lower or equal than the upper bound")
    return (m, n, rest)

def parse_length_scheme(str):
    """Read a "length=<number>" integrity check at the beginning of str.

    The keyword is matched case-insensitively. Returns a couple
    (HashInfo, remaining text). Raises ParsingError if the keyword or
    the number is missing.
    """
    # The str.lower method replaces the deprecated string.lower function.
    if str[0:7].lower() != "length=":
        raise ParsingError
    (n, rest) = parse_number(str[7:])
    return (HashInfo("length", n), rest)

def parse_md5_scheme(str):
    """Read a "md5=<hexadecimal number>" integrity check at the beginning
    of str.

    The keyword is matched case-insensitively. Returns a couple
    (HashInfo, remaining text). Raises ParsingError if the keyword or
    the number is missing.
    """
    # The str.lower method replaces the deprecated string.lower function.
    if str[0:4].lower() != "md5=":
        raise ParsingError
    (n, rest) = parse_hexnumber(str[4:])
    return (HashInfo("md5", n), rest)

def parse_char_scheme(str):
    """Read a "char=" position or range at the beginning of str.

    The keyword is matched case-insensitively. Returns a couple
    (TextFragmentIdentifier, remaining text). A range whose two bounds
    are equal is turned into a position. Raises ParsingError if str does
    not start with a valid "char=" scheme.
    """
    # The str.lower method replaces the deprecated string.lower function.
    if str[0:5].lower() != "char=":
        raise ParsingError
    str = str[5:]
    text_fragment = TextFragmentIdentifier("char")
    try:
        (m, n, rest) = parse_range(str)
    except ParsingError:
        # Not a range: it has to be a single position
        (n, rest) = parse_position(str)
        m = n
    if m != n:
        text_fragment.method = "range"
        text_fragment.start = m
        text_fragment.stop = n
    else:
        text_fragment.method = "position"
        text_fragment.position = n
    return (text_fragment, rest)

def parse_line_scheme(str):
    """Read a "line=" position or range at the beginning of str.

    The keyword is matched case-insensitively. Returns a couple
    (TextFragmentIdentifier, remaining text). A range whose two bounds
    are equal is turned into a position. Raises ParsingError if str does
    not start with a valid "line=" scheme.
    """
    # The str.lower method replaces the deprecated string.lower function.
    if str[0:5].lower() != "line=":
        raise ParsingError
    str = str[5:]
    text_fragment = TextFragmentIdentifier("line")
    try:
        (m, n, rest) = parse_range(str)
    except ParsingError:
        # Not a range: it has to be a single position
        (n, rest) = parse_position(str)
        m = n
    if m != n:
        text_fragment.method = "range"
        text_fragment.start = m
        text_fragment.stop = n
    else:
        text_fragment.method = "position"
        text_fragment.position = n
    return (text_fragment, rest)

# "mime-charset" in the specification
def parse_encoding(str):
    """Read a ",<encoding name>" suffix at the beginning of str.

    Returns a couple (encoding name, remaining text). The name is made of
    the longest run of digits, letters and hyphens following the comma;
    it is empty if there is no such character, including when nothing at
    all follows the comma. Raises ParsingError if str does not start with
    a comma.
    """
    # TODO: a better grammar, from RFC 2978
    if str[0:1] != ",":
        raise ParsingError
    str = str[1:]
    i = 0
    # The bound is tested before indexing, so that a comma at the very
    # end of the input does not raise IndexError.
    while i < len(str) and isidchar(str[i]):
        i = i + 1
    return (str[:i], str[i:])

def parse_hash_scheme(str):
    """Read a ";<integrity check>[,<encoding>]" group at the beginning of
    str.

    The integrity check is tried as a length first, then as an MD5.
    Returns a couple (HashInfo, remaining text). Raises ParsingError if
    str does not start with a semicolon or if neither scheme matches.
    """
    if str[:1] != ";":
        raise ParsingError
    body = str[1:]
    try:
        parsed = parse_length_scheme(body)
    except ParsingError:
        parsed = parse_md5_scheme(body)
    (hash_info, rest) = parsed
    try:
        # The encoding is optional: when it is absent, rest is left
        # untouched.
        (_encoding, rest) = parse_encoding(rest)
        # TODO: do something with the encoding
    except ParsingError:
        pass
    return (hash_info, rest)

def parse_text_scheme(str):
    """Read a "char=" or, failing that, a "line=" scheme at the beginning
    of str.

    Returns a couple (TextFragmentIdentifier, remaining text). Raises
    ParsingError if neither scheme matches.
    """
    try:
        parsed = parse_char_scheme(str)
    except ParsingError:
        parsed = parse_line_scheme(str)
    (identifier, remainder) = parsed
    return (identifier, remainder)

def parse_text_fragment(str, accept_trailer=False):
    """Parse a complete fragment identifier: a text scheme followed by any
    number of ";"-introduced integrity checks.

    When accept_trailer is false, returns the TextFragmentIdentifier and
    raises ParsingError if some text is left after it. When
    accept_trailer is true, returns a couple (TextFragmentIdentifier,
    remaining text) instead.
    """
    (text_fragment, rest) = parse_text_scheme(str)
    # An empty rest yields an empty slice, which ends the loop too.
    while rest[0:1] == ';':
        (hash_info, rest) = parse_hash_scheme(rest)
        text_fragment.hashes.append(hash_info)
    if accept_trailer:
        return (text_fragment, rest)
    if rest:
        raise ParsingError
    return text_fragment

# Command-line test tool: retrieves the URL given as the only argument and
# displays the part of the document designated by its fragment identifier.
if __name__ == "__main__":
    import sys
    import urllib
    import urlparse
    if len(sys.argv) != 2:
        print >> sys.stderr, ("Usage: %s URL" % sys.argv[0]) 
        sys.exit(1)
    url = sys.argv[1]
    url_components = urlparse.urlparse(url)
    fragment = url_components.fragment
    if fragment == "":
        print >> sys.stderr, ("URL %s has no fragment (after the #)" % url) 
        sys.exit(1)
    try:
        (text_fragment, rest) = parse_text_fragment(fragment, accept_trailer=True)
        if rest:
            print >> sys.stderr, ("Warning: spurious text ignored: \"%s\"" % rest) 
        print("Text fragment \"%s\" for URL %s:" % (text_fragment, url))
    except ParsingError:
        print >> sys.stderr, ("Invalid syntax for fragment \"%s\"" % fragment)
        # A real client MUST not exit for a syntax error (section
        # 4.4), but this program is a testing tool...
        sys.exit(1)
    # urllib and urllib2 do not properly handle fragments in the
    # "file" scheme, they believe it is part of the path :-( Is it a bug, or
    # does the scheme disallow fragments?
    if url_components.scheme == "file":
        url = "file:///%s" % url_components.path
    url_input = urllib.urlopen(url)
    # TODO: test that the MIME type is really text/plain?
    # TODO: test the encoding because we need to convert to Unicode
    # characters. This is a MUST in the specification (sect. 2)
    try:
        contents = url_input.read()
    finally:
        # Release the connection (or the file) even if reading fails
        url_input.close()
    result = text_fragment.get_fragment(contents)
    print(result)