#!/usr/bin/env python

"""
A simplistic linked data crawler. 

  crawl.py http://libris.kb.se/resource/bib/5060570

When the crawl finishes you will have an rdflib Berkeley DB triple store in a
directory called 'store', which you can then explore:

  >>> import rdflib
  >>> g = rdflib.ConjunctiveGraph('Sleepycat')
  >>> g.open('store')
  >>> g.serialize(file('dump.rdf', 'w'))

"""

import rdflib
import sys
import time

def crawl(uri):
    """Load the RDF document at ``uri`` and merge it into the shared store.

    The document is parsed into a temporary in-memory graph.  Every URI
    reference that occurs as a subject or an object of a loaded triple, and
    that has not been recorded before, is added to the module-level ``seen``
    set and appended to the module-level ``uris`` work list.  Finally all
    loaded triples are added to the module-level ``graph``.

    Only ``rdflib.URIRef`` nodes are queued.  Other node kinds (for example
    blank nodes or literals) are not dereferenceable, so queueing them would
    only make the caller waste time on entries it can never fetch.

    Any exception raised while loading ``uri`` propagates to the caller;
    nothing is queued or stored in that case.
    """
    # Parenthesised single-argument form prints identically under the
    # Python 2 print statement and the Python 3 print function.
    print(uri)
    g = rdflib.ConjunctiveGraph()
    g.load(uri)

    # Subjects first, then objects, so the crawl order of newly discovered
    # URIs is stable from run to run.
    for nodes in (g.subjects(), g.objects()):
        for node in nodes:
            if isinstance(node, rdflib.URIRef) and node not in seen:
                seen.add(node)
                uris.append(node)

    for triple in g:
        graph.add(triple)

# --- Script driver -------------------------------------------------------
# Module-level state shared with crawl():
#   seen  - URI references that have already been queued at some point
#   uris  - FIFO work list of URIs still to be fetched
#   graph - persistent triple store that accumulates every loaded triple

if len(sys.argv) < 2:
    # Fail with a usage message instead of an IndexError traceback, and do
    # so before the on-disk store is created.
    sys.exit("usage: crawl.py <start-uri>")

# Seed `seen` with the start URI (as a URIRef, the type crawl() compares
# against) so it is not queued and fetched a second time when it shows up
# as a subject in its own document.
seen = set([rdflib.URIRef(sys.argv[1])])
uris = [sys.argv[1]]
graph = rdflib.ConjunctiveGraph('Sleepycat')
graph.open('store', create=True)

try:
    while uris:
        uri = uris.pop(0)
        # Stay on the libris.kb.se host; skip everything else without
        # paying the politeness delay.
        if 'libris.kb.se' not in uri:
            continue
        # Throttle: wait between actual fetches only.
        time.sleep(2)
        try:
            crawl(uri)
        except Exception as e:
            # Best effort: report the failure and continue with the next URI.
            print(e)
except KeyboardInterrupt:
    # Ctrl-C (during a fetch or during the delay) stops the crawl cleanly.
    pass
finally:
    # Always close the store so data written so far is properly flushed.
    graph.close()
