#!/usr/bin/python

import xml.etree.ElementTree as ElementTree
import os
import sys
import re

db = "so"
encoding = "UTF-8"

def escape(str):
    str = re.sub("\s", " ", str)
    # \ are special for Python strings *and* for regexps. Hence the
    # multiple escaping. Here,we just replace every \ by \\ for
    # PostgreSQL
    str = re.sub("\\\\", "\\\\\\\\", str)
    return str

def tag_parse(str):
    index = 0
    while index < len(str):
        if str[index] == '<':
            try:
                end_tag = str[index:].index('>')
                yield str[(index+1):(index+end_tag)]
                index += end_tag + 1
            except ValueError:
                raise Exception("Tag parsing error in \"%s\"" % str);
        else:
            raise Exception("Tag parsing error in \"%s\"" % str);

if len(sys.argv) != 2:
    raise Exception("Usage: %s so-files-directory" % sys.argv[0])

os.chdir(sys.argv[1])

filename = "users.xml"
tree = ElementTree.parse(filename)
users = tree.getroot()
print "COPY users (id, name, creation, reputation, website) FROM stdin;"
for user in users:
    id = int(user.attrib["Id"])
    if user.attrib.has_key("DisplayName"): # Yes, some users have no name, for instance 155 :-(
        name = escape(user.attrib["DisplayName"])
    else:
        name = "\\N"
    reputation = int(user.attrib["Reputation"])
    creation = user.attrib["CreationDate"]
    if user.attrib.has_key("WebsiteUrl"):
        website = user.attrib["WebsiteUrl"]
    else:
        website = "\\N"
    print "%i\t%s\t%s\t%i\t%s" % (id, name.encode(encoding), creation, reputation, website)
print "\."

filename = "comments.xml"
tree = ElementTree.parse(filename)
comments = tree.getroot()
print "COPY comments (id, post, creation, owner) FROM stdin;"
for comment in comments:
    id = int(comment.attrib["Id"])
    post = int(comment.attrib["PostId"])
    creation = comment.attrib["CreationDate"]
    if comment.attrib.has_key("UserId"): # Yes, we can have anonymous comments, although I don't know how
        user = comment.attrib["UserId"]
    else:
        user = "\\N"
    print "%i\t%i\t%s\t%s" % (id, post, creation, user)
print "\."

filename = "posts.xml"
tree = ElementTree.parse(filename)
posts = tree.getroot()
tags = {}
tag_id = 1
print "COPY posts (id, type, title, creation, owner, accepted_answer) FROM stdin;"
for post in posts:
    id = int(post.attrib["Id"])
    if post.attrib.has_key("PostTypeId"):
        type = int(post.attrib["PostTypeId"])
    else:
        type = "\\N"
    creation = post.attrib["CreationDate"]
    if post.attrib.has_key("OwnerUserId"):
        owner = post.attrib["OwnerUserId"]
    else:
        owner = "\\N"
    if post.attrib.has_key("Title"):
        title = escape(post.attrib["Title"])
    else:
        title = "\\N"
    if post.attrib.has_key("AcceptedAnswerId"):
        accepted_answer = post.attrib["AcceptedAnswerId"]
    else:
        accepted_answer = "\\N"
    print "%i\t%s\t%s\t%s\t%s\t%s" % (id, type, title.encode(encoding), creation, owner, accepted_answer)
    if post.attrib.has_key("Tags"):
        for tag in tag_parse(post.attrib["Tags"]):
            if tag in tags:
                tags[tag]["posts"].append(id)
            else:
                tags[tag] = {"name": tag, "id": tag_id}
                tags[tag]["posts"] = [id, ]
            tag_id += 1
print "\."

print "COPY tags (id, name) FROM stdin;"
for tag in tags:
    print "%i\t%s" % (tags[tag]["id"], tag.encode(encoding))
print "\."

print "COPY tagging (tag, post) FROM stdin;"
for tag in tags:
    for post in tags[tag]["posts"]:
        print "%i\t%i" % (tags[tag]["id"], post)
print "\."

filename = "votes.xml"
tree = ElementTree.parse(filename)
votes = tree.getroot()
print "COPY votes (id, type, post, creation) FROM stdin;"
for vote in votes:
    id = int(vote.attrib["Id"])
    type = int(vote.attrib["VoteTypeId"])
    post = int(vote.attrib["PostId"])
    creation = vote.attrib["CreationDate"]
    print "%i\t%i\t%i\t%s" % (id, type, post, creation)
print "\."

