#!/usr/bin/python

# Identity of the program. program_name is interpolated into the help
# texts below and into the author's address.
program_name = "disastrous"
reference_url = "http://www.bortzmeyer.org/disastrous.html"
__author__ = "Stephane Bortzmeyer <stephane+%s@bortzmeyer.fr>" % program_name
__version__ = "1.0.5"
__license__ = "GPL"
__copyright__ = "Copyright 2007 Stephane Bortzmeyer"
__docformat__ = 'restructuredtext'

# General presentation of the program, in reStructuredText. The two
# hyperlink targets at the end must be written ".. _name: URL" (one colon
# only), otherwise the references in the text do not resolve.
main_doc = """

================================================
 %s, a del.icio.us link checker
================================================

%s is a link-checker for the del.icio.us_ service. It tries all
the URLs stored at your del.icio.us_ account and reports those that are
broken. The main features:

* it has a memory. It stores the results in a file and flags URLs as
  "broken" only after N consecutive tests failed.

* tags the URLs at del.icio.us_ to report the failures.

* designed to be run unattended (typically from cron on an Unix
  machine)

%s depends on a configuration file, $HOME/.%src, using the INI
format. 

%s is free software, distributed under the `GPL licence`_. The source
code is available and you can modify and redistribute it.

Author and maintainer: %s

.. _`GPL licence`: http://www.gnu.org/licenses/licenses.html#GPL
.. _del.icio.us: http://del.icio.us/

""" % (program_name, program_name,
       program_name, program_name,
       program_name, __author__)

# Help text about the command line, displayed by usage(). Each of its
# seven placeholders receives the program name.
usage_doc = """

%s has very little command-line options. Intended to be run
automatically (for instance with cron on Unix systems), it is
configured in $HOME/.%src, a file in the INI format.

Among the command-line options:

-h: prints the complete help

-r filename: use this file name as the list of URLs to check, instead
of retrieving it through del.icio.us. This file must be in the XML
format, as used by del.icio.us.

-w filename: store the list of URLs returned by del.icio.us in this file.

-n: dry run, do not update the database

-t: do not perform the network tests, just read the bookmarks and update
the database

-s: synchronize your del.icio.us bookmarks from the content of the
local database.  Useful if, for a reason or another, del.icio.us
bookmarks were not tagged properly.

-d N: set the debug level

%s depends on a database (to keep the result of previous tests),
which is stored in $HOME/.%s_db

%s is typically run from cron on an Unix system. A typical cron configuration
is::

 30 3 * * * %s -d 2

It will run %s every day at 3:30 with the debug level set to 2.

""" % ((program_name, ) * 7)

# Example of configuration file, displayed with the help and when the
# configuration file is missing. The commented-out values are announced as
# the defaults, so they have to agree with the default values assigned to
# the module-level variables (web_delay defaults to 1, not 2).
rcfile_sample = """
Sample configuration file::

 # %s personal configuration file
 [%s]

 # Your account at del.icio.us
 name = smith
 password = MySecretPassword
 
 # The other options have sensible default values (displayed in the comment) 
 # but feel free to change them

 # The string to use for tagging 
 # broken_tag = broken
 # The number of tests failed in a row before we declare the link broken
 # failed_tests_required = 3

 # All delays are in seconds
 # The delay between two checking Web requests
 # web_delay = 1
 # The delay between two del.icio.us requests. Warning: you can increase it
 # but it is not recommended to lower it, or you could be throttled and may 
 # be banned by del.icio.us.
 # delicious_delay = 2
 # The delay between two taggings: we have to make it long because otherwise, 
 # we are throttled when we synchronize (tested).
 # tagging_delay = 120

""" % (program_name, program_name)

# Imports
# All these are available in the standard library
import string
import os
import sys
import ConfigParser
import getopt
import time
import httplib
import urllib
import urllib2
import base64
# The others are imported later, with more care

# A few utilities we need right now

def fatal(msg="Unknown error"):
    """ Writes the message on the standard error, prefixed with the program
    name, then terminates the program with the exit status 1 """
    line = "%s: [fatal] %s\n" % (program_name, msg)
    sys.stderr.write(line)
    sys.exit(1)

def debug(msg, level=3):
    """ Writes the message on the standard error, but only when the current
    debug level (global "debug_level") is at least "level" """
    if debug_level < level:
        return
    sys.stderr.write("%s: [debug] %s\n" % (program_name, msg))

"""
SQLite (http://www.sqlite.org/) is used to store the results of the
previous tests. People used to other SQL databases should note that
the "lite" in its name means something: SQLite is very limited. For
instance, it has no real boolean type, hence the comparisons with 0 or 1
which are common in our code.
"""
# The database binding: the separate pysqlite package is preferred but
# Python >= 2.5 ships the same DB-API module in its standard library as
# "sqlite3", so it is used as a fallback.
try:
    from pysqlite2 import dbapi2 as sqlite
except ImportError:
    try:
        from sqlite3 import dbapi2 as sqlite
    except ImportError:
        fatal("You need the pysqlite module, available at " + \
              "http://initd.org/pub/software/pysqlite or in Python >= 2.5")

# The XML parser. Three places are tried in turn: the standalone C
# implementation, the standalone Python implementation and finally the
# version bundled in the standard library.
try:
    import cElementTree as ElementTree
except ImportError:
    try:
        import ElementTree
    except ImportError:
        try:
            import xml.etree.ElementTree as ElementTree
        except ImportError:
            fatal("You need the ElementTree module, available at "
                  "http://effbot.org/zone/element-index.htm or in Python >= 2.5")

# Constants
# Location of the configuration file and of the database
if sys.platform.startswith("win"): # MS-Windows, untested
    import win32api
    config_file_name = os.path.join(win32api.ExpandEnvironmentStrings("%HOMEPATH%"),
                                    "%s.INI" % program_name)
    database_name = os.path.join(win32api.ExpandEnvironmentStrings("%HOMEPATH%"),
                                 "%s.SQL" % program_name)
else:
    # expanduser("~") gives $HOME when it is set and otherwise looks the home
    # directory up in the password database, while a direct
    # os.environ['HOME'] stops the program with a KeyError when the variable
    # is missing from the environment.
    config_file_name = os.path.join(os.path.expanduser("~"), ".%src" % program_name)
    database_name = os.path.join(os.path.expanduser("~"), ".%s_db" % program_name) # TODO: make it configurable
config_section = program_name
delicious_base_url = "https://api.del.icio.us/v1/" # http://del.icio.us/help/api/
# Deliberately unfinished (see the open parenthesis): the main program
# appends the name of the del.icio.us user and the closing parenthesis.
user_agent = "%s/%s (running with Python %s; %s; " % \
             (program_name, __version__, sys.version.split()[0], reference_url)

# Variables. Only the default values are set here: the configuration file
# and/or the command line can override most of them.

# Set from the command line
debug_level = 0
dry_run = False
synchronize_only = False
do_network_tests = True
filename_from_read = None
filename_to_write = None

# How the links are tested and how the broken ones are reported
test_command = "HEAD" # TODO: read it in the config file
n_failed_tests_required = 3
broken_tag = "broken"

# Delays, all in seconds
web_delay = 1
delicious_delay = 2
# Otherwise, 503 errors come very quick... Yes, this is a long time but we
# need it.
tagging_delay = 120

class MyConfigParser(ConfigParser.SafeConfigParser):
    """ Thin layer over SafeConfigParser: options are always read from the
    section of the program, a missing option ends the program with an
    explanation and the values which are set go to the DEFAULT section """

    def get(self, option):
        """ Returns the value of the option, or exits through fatal() when
        the option is not found """
        parent = ConfigParser.SafeConfigParser
        try:
            return parent.get(self, config_section, option)
        except ConfigParser.NoOptionError:
            fatal("Error reading the configuration file \"%s\": \"%s\" is mandatory but was not found.\nRun %s with the -h option to have an example of configuration file." %
                  (config_file_name, option, program_name))

    def set(self, option, value):
        """ Stores the value, converted to a string, as the default value of
        the option """
        text = str(value)
        ConfigParser.SafeConfigParser.set(self, "DEFAULT", option, text)

class HTTPrequest(urllib2.Request):
    """ A urllib2 request whose HTTP method is the one named by the global
    "test_command" ("HEAD" by default) instead of the usual "GET" """

    def get_method(self):
        # Looked up at each call, so a later change of test_command is honored
        method = test_command
        return method

def usage(msg=None):
    """ Displays the optional message, then the help about the command
    line, on the standard error """
    output = sys.stderr
    if msg:
        output.write("%s: %s\n" % (program_name, msg))
    output.write("%s\n" % usage_doc)

def retrieve_bookmarks(name, password):
    """ Downloads the list of bookmarks from del.icio.us

    Returns the raw body of the reply to "posts/all". The parameters "name"
    and "password" are not used here: the request goes through
    urllib2.urlopen() and the credentials are not attached to it by this
    function. Any HTTP or connection error ends the program through
    fatal(). """
    request = urllib2.Request(delicious_base_url + "posts/all")
    request.add_header('User-Agent', user_agent)
    try:
        reply = urllib2.urlopen(request)
    except urllib2.HTTPError, error:
        if error.code == 503:
            fatal("Probably throttled by del.icio.us (%s), try again but much later" % \
                  error.msg)
        elif error.code == 401:
            fatal("Authentification error at del.icio.us, are you sure of the name and password?")
        else:
            fatal("HTTP error retrieving bookmarks %i: %s" % (error.code, error.msg))
    except urllib2.URLError, error:
        reason = error.reason
        # When the reason is an exception, its last argument is the readable
        # message: args[1] for (errno, message). Indexing [1] blindly raised
        # an IndexError for exceptions built with one argument only.
        reason_args = getattr(reason, "args", None)
        if reason_args:
            reason = reason_args[-1]
        fatal("Connection error retrieving bookmarks: %s" % reason)
    try:
        return reply.read()
    finally:
        # Do not leave the connection open once the body has been read
        reply.close()

def create_database(name):
    """ Creates the database file "name" with its two empty tables,
    Bookmarks and Tests """
    # Connecting is enough to create the file. It may be created with
    # read-for-all protections :-( TODO: fix it
    connection = sqlite.connect(name, isolation_level=None)
    creator = connection.cursor()
    # One row per URL: still bookmarked (in_use)? still working (valid)?
    # TODO: add a table to store the last run and the last update of bookmarks
    creator.execute("""
      CREATE TABLE Bookmarks (
           url TEXT,
           in_use INT,
           valid INT);
    """)
    # One row per test of an URL
    creator.execute("""
      CREATE TABLE Tests (
           url TEXT,
           date TIMESTAMP,
           result INT,
           details TEXT);
           """)
    creator.close()
    connection.close()
    debug("Database \"%s\" created" % name, 1)

def check_or_add(cursor, url):
    """ Makes sure the URL is recorded as a bookmark in use: a known URL
    which was no longer in use is reactivated, an unknown URL is added
    (valid, in use) together with a first, successful, test """
    cursor.execute("SELECT url, in_use  FROM Bookmarks WHERE url=?;", (url,))
    known = cursor.fetchone()
    if known:
        if known[1] == 0:
            debug("\"%s\" in use again" % url, 2)
            cursor.execute("UPDATE Bookmarks SET in_use=1 WHERE url=?;", (url, ))
        return
    # A new URL. TODO: may be it would be better to run these two SQL
    # statements in a transaction
    cursor.execute("INSERT INTO Bookmarks (url, in_use, valid) VALUES (?, 1, 1);",
                   (url, ))
    now = time.time()
    cursor.execute("INSERT INTO Tests (url, date, result) VALUES (?, ?, 1);",
                   (url, now))
        
def mark_not_in_use(cursor, cursor2, urls):
    """ Flags URLs which are no longer in the del.icio.us bookmarks

    "urls" is the collection of the URLs currently bookmarked. Every URL
    recorded as in use in the database and absent from "urls" gets
    in_use=0. "cursor" is used to read and "cursor2" to update. """
    # A set, built once, makes each membership test cheap: looking up a list
    # for every row is quadratic in the number of bookmarks.
    current = set(urls)
    cursor.execute("SELECT url FROM Bookmarks WHERE in_use=1;")
    for result in cursor.fetchall():
        url = result[0]
        if url not in current:
            debug("%s no longer in use" % url, 2)
            cursor2.execute("UPDATE Bookmarks SET in_use=0 WHERE url=?;", (url, ))

def test_urls(cursor):
    """ Tests every bookmark in use and stores the results in the database

    Each URL recorded as in use is tested with test_url() and the outcome is
    added to the Tests table. An URL which was flagged as not valid and now
    works is immediately flagged as valid again and untagged at
    del.icio.us. The function sleeps "web_delay" seconds after each test. """
    cursor.execute("SELECT url,valid FROM Bookmarks WHERE in_use = 1;")
    # fetchall() reads the complete result at once, so the same cursor can be
    # used for the updates inside the loop. (This function formerly relied
    # on a global "cursor2" which it did not receive as a parameter.)
    for url, valid in cursor.fetchall():
        debug("Testing connectivity of %s" % url)
        success, details = test_url(url)
        if success:
            result_boolean = 1
            if valid == 0: # One success is enough to declare it OK
                debug("%s works again" % url, 1)
                cursor.execute("UPDATE Bookmarks SET valid=1 WHERE url=?;", (url, ))
                untag_bookmark(url)
        else:
            result_boolean = 0
        cursor.execute("INSERT INTO Tests (url, date, result, details) VALUES (?, ?, ?, ?);",
                       (url, time.time(), result_boolean, details))
        # Be nice with the Web
        time.sleep(web_delay)
        
def mark_broken(cursor, cursor2):
    """ Flags as broken the URLs in use which have at least
    "n_failed_tests_required" failed tests more recent than their last
    successful test. "cursor" runs the query, "cursor2" the updates. """
    # For each URL, Last_ok gives the date of its last successful test; the
    # failures which came after it are then counted.
    # Thanks to Samuel Tardieu for the nice SQL expression
    cursor.execute("""
        SELECT Tests.url, Bookmarks.valid, count(*) AS count FROM Bookmarks, Tests, 
              (SELECT url, max(date) AS m from Tests WHERE result = 1 GROUP BY url) AS Last_ok 
          WHERE Tests.url=Last_ok.url AND result = 0 AND date > Last_ok.m AND in_use=1 AND
                Tests.url=Bookmarks.url
            GROUP BY Tests.url HAVING count >= ?;
            """, (n_failed_tests_required, ))
    # TODO: if the result of the last test is a 410 ("Gone"), it is enough to
    # declare it broken. But this complicates the code. Anyway, 410 seems to be
    # extremely rare in practice.
    for row in cursor.fetchall():
        url, was_valid = row[0], row[1]
        if was_valid != 1:
            # Already flagged during a previous run
            continue
        # TODO: test there are at least N days between the first and the last test
        debug("\"%s\" is now broken" % url, 1)
        cursor2.execute("UPDATE Bookmarks SET valid=0 WHERE url=?;", (url, ))
        if dry_run:
            continue
        # If del.icio.us is down, tagging will fail and, since the database
        # has been updated, it will not be done again. In that case, you
        # have to run the program again with the --synchronize option to
        # force resynchronization.
        tag_broken_bookmark(url)

def synchronize(cursor):
    """ Sends the state of the local database to del.icio.us: the bookmarks
    in use are tagged again as broken when they are flagged as not valid,
    then untagged when they are flagged as valid. The function sleeps
    "tagging_delay" seconds after each bookmark. """
    # Two passes: (the query selecting the URLs, the debug message, the
    # function to apply to each URL)
    passes = (
        ("""SELECT url FROM Bookmarks
                         WHERE in_use = 1 AND valid = 0;""",
         "(Re)tagging %s as broken", tag_broken_bookmark),
        ("""SELECT url FROM Bookmarks
                         WHERE in_use = 1 AND valid = 1;""",
         "(Re)tagging %s as OK", untag_bookmark),
    )
    for query, message, action in passes:
        cursor.execute(query)
        for row in cursor.fetchall():
            address = row[0]
            debug(message % address, 3)
            action(address)
            time.sleep(tagging_delay)
        
def test_url(url):
    """ Actual test of the connectivity

    Sends one request to "url", with the HTTP method named by
    "test_command", and returns a tuple (success, details): (True, "") when
    a reply was obtained, (False, explanation) when the request failed. """
    request = HTTPrequest(url)
    request.add_header('User-Agent', user_agent)
    # TODO: it goes through the http_proxy, if it is set. May be warn the user or clean the
    # environment variable?
    try:
        # TODO: add an If-Modified-Since since we have the date in the Tests table?
        connection = urllib2.urlopen(request)
    except urllib2.HTTPError, error:
        return (False, "HTTP error %i" % error.code)
    except urllib2.URLError, error:
        reason = error.reason
        # When the reason is an exception, its last argument is the readable
        # message: args[1] for (errno, message). Indexing [1] blindly raised
        # an IndexError for exceptions built with one argument only and,
        # being raised inside this handler, it was not caught by the
        # "except Exception" below.
        reason_args = getattr(reason, "args", None)
        if reason_args:
            reason = reason_args[-1]
        return (False, "Network problem: %s" % reason)
    except httplib.BadStatusLine: # Yes, seen in the wild
        return (False, "HTTP error: Bad status line")
    except Exception:
        # sys.exc_info() replaces the deprecated sys.exc_type / sys.exc_value
        exception_type, exception_value = sys.exc_info()[:2]
        return (False, "Unknown exception: %s - %s" % (str(exception_type),
                                                       str(exception_value)))
    # Many URLs are tested in a row: do not keep the connections open
    connection.close()
    return (True, "")

def tag_broken_bookmark(bookmark):
    """ Adds a tag at del.icio.us showing that the URL is broken

    "bookmark" is the URL of the post. Nothing is sent back when the post
    already carries the tag "broken_tag". The program ends through fatal()
    if del.icio.us does not know the bookmark or refuses the update. The
    function sleeps "delicious_delay" seconds after each request. """
    quoted_url = urllib.quote_plus(bookmark.encode("UTF-8"))
    # First, we need to retrieve the current content (there is no
    # addition in del.icio.us API, just complete replacement)
    request = urllib2.Request(delicious_base_url + "posts/get" + \
                              "?url=%s" % quoted_url)
    request.add_header('User-Agent', user_agent)
    reply = urllib2.urlopen(request)
    xml = ElementTree.fromstring(reply.read())
    post = xml.find("post")
    if post is None:
        fatal("Internal error: no such bookmark %s when trying to tag it" % bookmark)
    description = post.attrib["description"]
    tags = post.attrib["tag"].split()
    time.sleep(delicious_delay)
    if broken_tag in tags:
        # Sending the post back would only store the tag twice
        debug("%s is already tagged \"%s\"" % (bookmark, broken_tag), 3)
        return
    tags.append(broken_tag)
    # Now, send it back, with the new tag
    # NOTE(review): only the URL, the tags and the description are sent
    # back; check against the API whether the other attributes of the post
    # survive the replacement.
    request = urllib2.Request(delicious_base_url + "posts/add" + \
                              "?url=%s&tags=%s&description=%s" % \
                              (quoted_url,
                               urllib.quote_plus(" ".join(tags).encode("UTF-8")),
                               urllib.quote_plus(description.encode("UTF-8"))))
    request.add_header('User-Agent', user_agent)
    reply = urllib2.urlopen(request)
    xml = ElementTree.fromstring(reply.read())
    if xml.attrib["code"] != "done":
        # The URL was formerly missing from this message (literal "%s")
        fatal("Update of %s on del.icio.us failed" % bookmark)
    time.sleep(delicious_delay) # del.icio.us advice, to avoid being throttled.
    
def untag_bookmark(bookmark):
    """ Removes a tag at del.icio.us showing that the URL is now OK

    "bookmark" is the URL of the post. Nothing is done if del.icio.us does
    not know the bookmark (a debug message is displayed) or if the post
    does not carry the tag "broken_tag". The program ends through fatal()
    if del.icio.us refuses the update. The function sleeps
    "delicious_delay" seconds after each request. """
    quoted_url = urllib.quote_plus(bookmark.encode("UTF-8"))
    # There is no removal in the API, just complete replacement: retrieve the
    # current content first.
    request = urllib2.Request(delicious_base_url + "posts/get" + \
                              "?url=%s" % quoted_url)
    request.add_header('User-Agent', user_agent)
    reply = urllib2.urlopen(request)
    time.sleep(delicious_delay)
    xml = ElementTree.fromstring(reply.read())
    post = xml.find("post")
    if post is None:
        debug("Internal error: no such bookmark %s when trying to untag it" % bookmark, 1)
        return
    description = post.attrib["description"]
    tags = post.attrib["tag"].split(' ')
    if broken_tag not in tags:
        # Nothing to remove, so no need to send the post back
        debug("%s is not tagged \"%s\"" % (bookmark, broken_tag), 3)
        return
    tags = " ".join([tag for tag in tags if tag != broken_tag])
    # NOTE(review): only the URL, the tags and the description are sent
    # back; check against the API whether the other attributes of the post
    # survive the replacement.
    request = urllib2.Request(delicious_base_url + "posts/add" + \
                              "?url=%s&tags=%s&description=%s" % \
                              (quoted_url,
                               urllib.quote_plus(tags.encode("UTF-8")),
                               urllib.quote_plus(description.encode("UTF-8"))))
    request.add_header('User-Agent', user_agent)
    reply = urllib2.urlopen(request)
    xml = ElementTree.fromstring(reply.read())
    if xml.attrib["code"] != "done":
        # The URL was formerly missing from this message (literal "%s")
        fatal("Update of %s on del.icio.us failed" % bookmark)
    time.sleep(delicious_delay) # del.icio.us advice, to avoid being throttled.
    
if __name__ == '__main__':
    # Everything here stays at the module level (no main() function) because
    # the functions above read these variables as globals.

    # 1. The command line. It is parsed before the configuration file is
    # read, so that -h works even without a usable configuration file.
    try:
        optlist, args = getopt.getopt (sys.argv[1:], "hd:r:w:nts",
                          ["help", "debug=", "dry-run", "synchronize",
                           "no-network-tests",
                           "read-from-file=", "write-to-file="])
    except getopt.error, reason:
        usage("Bad usage: %s" % reason)
        fatal("Exiting")
    for option, value in optlist:
        if option in ['-h', '--help']:
            sys.stderr.write(main_doc)
            usage()
            sys.stderr.write(rcfile_sample)
            sys.exit(0)
        elif option in ['-d', '--debug']:
            try:
                debug_level = int(value)
            except ValueError:
                fatal("-d number: number is an integer")
        elif option in ['-r', '--read-from-file']:
            filename_from_read = value
        elif option in ['-w', '--write-to-file']:
            filename_to_write = value
        elif option in ['-s', '--synchronize']:
            synchronize_only = True
        elif option in ['-n', '--dry-run']:
            dry_run = True
        elif option in ['-t', '--no-network-tests']:
            do_network_tests = False
        else:
            fatal("Unknown option \"%s\"" % option)
    if dry_run and synchronize_only:
        # The database is not opened during a dry run while the
        # synchronization reads it: this combination formerly crashed with a
        # NameError.
        fatal("Options --dry-run and --synchronize cannot be used together")

    # 2. The configuration file
    if not os.path.exists(config_file_name):
        fatal("Configuration file \"%s\" is mandatory but was not found.\n%s" %
                        (config_file_name, rcfile_sample))
    config = MyConfigParser()
    config_file = open(config_file_name)
    try:
        config.readfp(config_file)
    finally:
        config_file.close()
    if not config.has_section(program_name):
        fatal("No section named %s found in the configuration file %s" % \
              (program_name, config_file_name))
    # Set default values
    config.set("broken_tag", broken_tag)
    config.set("failed_tests_required", n_failed_tests_required)
    config.set("web_delay", web_delay)
    config.set("tagging_delay", tagging_delay)
    config.set("delicious_delay", delicious_delay)

    # Retrieve the values
    name = config.get("name")
    password = config.get("password")
    broken_tag = config.get("broken_tag")
    try:
        n_failed_tests_required = int(config.get("failed_tests_required"))
        web_delay = int(config.get("web_delay"))
        tagging_delay = int(config.get("tagging_delay"))
        delicious_delay = int(config.get("delicious_delay"))
    except ValueError, error:
        fatal("Error reading the configuration file \"%s\": %s" %
              (config_file_name, error))

    # 3. The database: create it if necessary and check it is usable
    if not dry_run:
        if not os.path.exists(database_name):
            create_database(database_name)
        database= sqlite.connect(database_name)
        cursor = database.cursor()
        # Test it
        cursor.execute("SELECT * FROM Bookmarks;")
        cursor.close()
        database.close()

    # 4. Authentication for all the requests sent to del.icio.us
    debug("Working for del.icio.us user \"%s\"..." % name)
    user_agent = user_agent + "%s@del.icio.us)" % name
    password_manager = urllib2.HTTPPasswordMgrWithDefaultRealm()
    password_manager.add_password(
        None, delicious_base_url, name, password)
    auth_handler = urllib2.HTTPBasicAuthHandler(password_manager)
    opener = urllib2.build_opener(auth_handler)
    urllib2.install_opener(opener)
    # TODO: store automatically the bookmarks and use Update
    # (http://del.icio.us/help/api/update) to see if there's something
    # new?

    # 5. The actual work
    if not dry_run:
        # isolation_level=None: no implicit transaction, each statement is
        # applied immediately
        database = sqlite.connect(database_name, isolation_level=None)
        cursor = database.cursor()
        cursor2 = database.cursor()
    if not synchronize_only:
        if not filename_from_read:
            bookmarks_raw = retrieve_bookmarks(name, password)
            if filename_to_write:
                file_to_write = open(filename_to_write, "w")
                try:
                    file_to_write.write(bookmarks_raw)
                finally:
                    file_to_write.close()
                debug("Bookmarks stored in %s" % filename_to_write, 2)
        else:
            file_from_read = open(filename_from_read, "r")
            try:
                bookmarks_raw = file_from_read.read()
            finally:
                file_from_read.close()
        bookmarks = ElementTree.fromstring(bookmarks_raw)
        debug("Last update of bookmarks on %s" % bookmarks.attrib["update"], 3)
        posts = bookmarks.getchildren()
        current_urls = {}
        for post in posts:
            url = post.attrib["href"]
            current_urls[url] = True
            debug("Handling \"%s\"" % url, 3)
            if not dry_run:
                check_or_add(cursor, url)
        if not dry_run:
            mark_not_in_use(cursor, cursor2, current_urls.keys())
            if do_network_tests:
                test_urls(cursor)
            mark_broken(cursor, cursor2)
    else: # Synchronize only
        synchronize(cursor)

    # 6. Clean shutdown of the database
    if not dry_run:
        cursor.close()
        cursor2.close()
        database.close()
        
# Notes about similar programs. Not displayed by the code in this file.
details_doc = """

Known competitors:

* Dead.licious http://www.malarkeysoftware.com/projects_dead-licious.html
Only MacOS. Probably non-free

* post checker http://www.tiede.dk:8080/roller/kim/entry/del_icio_us_post_checker
In Java. Code available at
 http://code.google.com/p/delicious-post-checker/downloads/list


* Fresh delicious http://freshdelicious.googlepages.com/ Non-free, it seems

"""
