#!/usr/bin/env python

"""
Get a developer API key from:

    http://developer.opencalais.com

Put it in a file:

    ~/.calais

And run this script on a file:

    calais.py file.txt

Or use it as a library to generate an rdf graph for querying:

    from calais import calais_graph
    g = calais_graph(content)
    for row in g.query('...'):
        ...
"""

from sys import argv
from os import path, environ
from tempfile import NamedTemporaryFile
from re import sub
import hashlib

try:  # Python 2 module layout
    from urllib import urlencode
    from httplib import HTTPConnection
except ImportError:  # Python 3 module layout
    from urllib.parse import urlencode
    from http.client import HTTPConnection

try:
    # Legacy Python 2 module. gen_id() uses hashlib instead; the import is
    # kept only so the module-level name `md5` keeps resolving.
    import md5
except ImportError:
    md5 = None

try:
    from rdflib import ConjunctiveGraph
except ImportError:
    # rdflib is only needed by calais_graph(); calais() works without it.
    ConjunctiveGraph = None


# Template for the 'paramsXML' request field. calais() interpolates it with a
# mapping holding the keys 'id' and 'submitter'.
# NOTE(review): this template is blank, so the id and submitter are currently
# not sent at all. It presumably once held an XML document with %(id)s and
# %(submitter)s placeholders that was lost -- confirm against the upstream
# source / the OpenCalais API documentation before relying on it.
config = """ """

headers = {'Content-Type': 'application/x-www-form-urlencoded'}


def _unwrap_rdf(payload):
    """
    Strip the wrapper element from a raw service response and unescape it.

    The first 80 and the last 9 characters are dropped (the original author
    identified these as the element surrounding the escaped rdf/xml), then
    '&lt;' and '&gt;' are turned back into '<' and '>'. '&amp;' is left
    untouched. Works on byte strings and on text; the return value has the
    same type as the input.
    """
    rdf = payload[80:-9]
    if isinstance(rdf, bytes):
        return rdf.replace(b'&lt;', b'<').replace(b'&gt;', b'>')
    return rdf.replace('&lt;', '<').replace('&gt;', '>')


def calais(content, id=None, submitter=None, license=None):
    """
    Submit content to the calais web service and return rdf/xml.

    content   -- the text to analyze
    id        -- an identifier for the content; defaults to gen_id(content)
    submitter -- the submitter name; defaults to the USER environment
                 variable (KeyError if it is not set)
    license   -- the license key; defaults to find_license()

    Returns the response body with the wrapper element removed and the
    '<' / '>' entities unescaped: a byte string when the HTTP library
    returns bytes (a plain str on Python 2).
    """
    if id is None:
        id = gen_id(content)
    if submitter is None:
        submitter = environ['USER']
    if license is None:
        license = find_license()

    params = {
        'licenseID': license,
        'content': content,
        'paramsXML': config % {'id': id, 'submitter': submitter},
    }

    conn = HTTPConnection('api.opencalais.com')
    try:
        conn.request(
            'POST',
            '/enlighten/calais.asmx/Enlighten',
            urlencode(params),
            headers)
        raw = conn.getresponse().read()
    finally:
        # release the socket even when the request or the read fails
        conn.close()

    # total hack because calais seems to be returning the rdf/xml
    # escaped within a single xml element, hopefully they'll fix
    # this and this hack to remove surrounding element and unescape
    # can be removed.
    return _unwrap_rdf(raw)


def calais_graph(content, id=None, submitter=None, license=None):
    """
    Submit content to the calais web service and return an rdflib graph.

    Takes the same arguments as calais(). The rdf/xml that calais() returns
    is written to a temporary file and parsed into a new rdflib
    ConjunctiveGraph, which is returned.

    Raises ImportError if rdflib is not installed.
    """
    if ConjunctiveGraph is None:
        raise ImportError("rdflib is required to use calais_graph()")

    rdf = calais(content, id, submitter, license)
    tmp = NamedTemporaryFile()
    try:
        tmp.write(rdf)
        # make sure the data is on disk before rdflib opens the file by name
        tmp.flush()
        g = ConjunctiveGraph()
        g.parse(tmp.name)
    finally:
        # closing the temporary file also deletes it, even if parsing failed
        tmp.close()
    return g


def gen_id(content):
    """
    Return the md5 hex digest of content, used as an id when none is
    supplied to calais().

    Byte strings are hashed as-is; text is encoded as UTF-8 first.
    """
    if not isinstance(content, bytes):
        content = content.encode('utf-8')
    return hashlib.md5(content).hexdigest()


def find_license():
    """
    Return the license key stored in ~/.calais with surrounding whitespace
    stripped, or raise an Exception if the file cannot be read.
    """
    try:
        with open(path.expanduser('~/.calais')) as key_file:
            return key_file.read().strip()
    except (IOError, OSError):
        raise Exception("unable to find calais license key in ~/.calais")


def normalize(s):
    """
    Collapse every run of spaces and newlines in s into a single space.
    Handy for cleaning up whitespace in some literal text that calais
    returns.
    """
    return sub(r'[ \n]+', ' ', s)


if __name__ == '__main__':
    if len(argv) < 2:
        raise SystemExit("usage: calais.py file.txt")
    filename = argv[1]
    with open(filename) as source:
        content = source.read()
    # the filename doubles as the content id
    g = calais_graph(content, filename)
    print(g.serialize(format='n3'))