#!/usr/bin/env python

"""
Crawl the RDFa published at data.australia.gov.au and store the resulting
triples in a BerkeleyDB triplestore.
"""

import re
import urllib

from rdflib.graph import ConjunctiveGraph
from rdflib.term import URIRef

# Open the 'Sleepycat' (BerkeleyDB) store under ./store, creating it if needed.
graph = ConjunctiveGraph('Sleepycat')
graph.open('store', create=True)
try:
    # Download the catalogue page; close the HTTP response once it is read.
    response = urllib.urlopen('http://data.australia.gov.au/catalogue')
    try:
        html = response.read()
    finally:
        response.close()

    # The same dataset link can appear more than once in the page markup;
    # fetch each dataset only once, in order of first appearance, so a
    # repeated link does not trigger a redundant download and re-parse.
    seen = set()
    for dataset_url in re.findall(r'"(http://data.australia.gov.au/\d+)"', html):
        if dataset_url in seen:
            continue
        seen.add(dataset_url)
        print("fetching dataset %s" % dataset_url)
        graph.parse(location=dataset_url, format='rdfa', lax=True)

    # no sense in keeping tons of css stylesheet assertions is there?
    # Collect the matching triples first: removing from the graph while
    # iterating over it would mutate the store underneath the iterator.
    unwanted = [(s, p, o) for s, p, o in graph if 'xhtml' in p]
    for triple in unwanted:
        graph.remove(triple)

    # Dump the remaining triples to data.rdf (serializer's default format),
    # making sure the file is flushed and closed.
    with open('data.rdf', 'w') as rdf_file:
        graph.serialize(rdf_file)
finally:
    # Always release the on-disk store, even if a fetch or parse failed.
    graph.close()
