#!/usr/bin/env python

"""
Get a developer API key from:

  http://developer.opencalais.com

Put it in a file:

  ~/.calais

And run this script on a file:

  calais.py file.txt 

Or use it as a library to generate an RDF graph for querying:

  from calais import calais_graph 
  g = calais_graph(content)
  for row in g.query('...'):
      ...

"""

from sys import argv
from os import path, environ
from urllib import urlencode
from httplib import HTTPConnection
from tempfile import NamedTemporaryFile
from re import sub
from xml.sax.saxutils import escape, unescape

import hashlib
import md5

from rdflib import ConjunctiveGraph

# Template for the 'paramsXML' form field sent by calais(). The %(id)s and
# %(submitter)s placeholders are filled in per request; the processing
# directives declare the input content type and ask for XML/RDF output.
config = """
<c:params 
  xmlns:c="http://s.opencalais.com/1/pred/"
  xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#">
  <c:processingDirectives c:contentType="text/txt" c:outputFormat="XML/RDF" />
  <c:userDirectives c:allowDistribution="true" c:allowSearch="true" 
    c:externalID="%(id)s" c:submitter="%(submitter)s" />
  <c:externalMetadata />
</c:params>
"""

# HTTP headers for the POST made by calais(); the body is form-urlencoded.
headers = {'Content-Type': 'application/x-www-form-urlencoded'}

def calais(content, id=None, submitter=None, license=None):
    """
    Submit content to the OpenCalais Enlighten web service and return the
    rdf/xml it sends back as a string.

    content   -- the text to analyse
    id        -- external id for the content; defaults to gen_id(content)
    submitter -- submitter name; defaults to the user name found in the
                 environment
    license   -- the license key; defaults to find_license()

    Returns the response body with the surrounding wrapper element removed
    and its XML entities unescaped.
    """

    if id is None:
        id = gen_id(content)

    if submitter is None:
        # USER is not set in every environment (Windows uses USERNAME), so
        # fall back instead of dying with a KeyError.
        submitter = environ.get('USER') or environ.get('USERNAME') or 'unknown'

    if license is None:
        license = find_license()

    def attr(value):
        # id and submitter are placed inside double-quoted XML attributes in
        # the config template, so markup characters (e.g. an '&' or '"' in a
        # file name) must be escaped to keep paramsXML well formed.
        return escape('%s' % (value,), {'"': '&quot;'})

    params = {
        'licenseID':    license,
        'content':      content,
        'paramsXML':    config % {'id': attr(id),
                                  'submitter': attr(submitter)}}

    conn = HTTPConnection('api.opencalais.com')
    try:
        conn.request('POST', '/enlighten/calais.asmx/Enlighten',
                     urlencode(params), headers)
        rdf = conn.getresponse().read()
    finally:
        # always release the socket, even if the request fails
        conn.close()

    # total hack because calais seems to be returning the rdf/xml
    # escaped within a single xml element, hopefully they'll fix
    # this and this hack to remove surrounding element and unescape
    # can be removed.
    # NOTE(review): the fixed offsets assume an 80 character prologue
    # (presumably xml declaration + opening wrapper tag) and a 9 character
    # closing tag -- confirm against a live response.
    rdf = rdf[80:-9]
    # unescape() restores &lt; and &gt; and then &amp; (in that order), so
    # entities that were part of the original rdf/xml come back intact
    # instead of staying double-escaped.
    return unescape(rdf)

def calais_graph(content, id=None, submitter=None, license=None):
    """
    Run content through calais() and load the resulting rdf/xml into an
    rdflib ConjunctiveGraph.

    Takes the same arguments as calais(): the content, an optional id for
    the content, and optionally the submitter name and the license key.
    Returns the populated ConjunctiveGraph.
    """
    rdf = calais(content, id, submitter, license)
    g = ConjunctiveGraph()
    tmp = NamedTemporaryFile()
    try:
        tmp.write(rdf)
        # make sure the bytes are on disk before rdflib opens the file by name
        tmp.flush()
        # The temp file has no extension to guess a format from, so state
        # explicitly that the payload is RDF/XML.
        g.parse(tmp.name, format='xml')
    finally:
        # closing deletes the temp file; do it even when parsing fails
        tmp.close()
    return g

def gen_id(content):
    """
    calculates an md5 checksum for the content to serve as an id
    when an id is not supplied to calais()

    Text (unicode) content is encoded as UTF-8 before hashing; byte strings
    are hashed as they are. Returns the checksum as a hex string.
    """
    if not isinstance(content, bytes):
        # hashing works on bytes; an implicit ascii encode would blow up on
        # any non-ascii character
        content = content.encode('utf-8')
    return hashlib.md5(content).hexdigest()
    
def find_license():
    """
    Looks for and returns a license found in ~/.calais, or throws
    an exception.

    The file content is returned with surrounding whitespace stripped.
    Raises Exception when the file cannot be read or contains no key.
    """
    try:
        with open(path.expanduser('~/.calais')) as f:
            key = f.read().strip()
    except (IOError, OSError):
        # missing or unreadable file: reported below with the same message
        key = ''
    if not key:
        raise Exception("unable to find calais license key in ~/.calais")
    return key

def normalize(s):
    """
    Collapse every run of spaces and/or newlines in s into a single space.
    Useful for tidying literal text that calais returns.
    """
    space_or_newline_run = r'[ \n]+'
    single_space = ' '
    return sub(space_or_newline_run, single_space, s)

if __name__ == '__main__':
    # Command-line entry point: send the named file to calais, using the
    # file name as the content id, and print the resulting graph as n3.
    if len(argv) < 2:
        raise SystemExit("usage: calais.py file.txt")
    filename = argv[1]
    with open(filename) as f:
        content = f.read()
    g = calais_graph(content, filename)
    # parenthesised single argument prints the same under the print statement
    print(g.serialize(format='n3'))


