#!/usr/bin/env python
"""
A simplistic linked data crawler.

    crawl.py http://libris.kb.se/resource/bib/5060570

Then you'll end up with an rdflib Berkeley DB triple store in a directory
called 'store' which you can play with.

    >>> import rdflib
    >>> g = rdflib.ConjunctiveGraph('Sleepycat')
    >>> g.open('store')
    >>> g.serialize(destination='dump.rdf')
"""

from collections import deque
import sys
import time

import rdflib

# Every node that has ever been queued; prevents fetching a resource twice.
seen = set()
# FIFO work queue of nodes still to be dereferenced (breadth-first crawl).
uris = deque()
# Persistent store that accumulates the triples of every crawled document.
graph = rdflib.ConjunctiveGraph('Sleepycat')


def _enqueue(node):
    """Queue *node* for crawling if it is a URI that has not been seen yet."""
    # Blank nodes and literals cannot be dereferenced, so only URIRefs qualify.
    if isinstance(node, rdflib.URIRef) and node not in seen:
        seen.add(node)
        uris.append(node)


def crawl(uri):
    """Fetch the RDF document at *uri* and merge it into the global store.

    The document is loaded into a temporary in-memory graph. Every URI that
    appears as a subject or object and has not been seen before is appended
    to the global ``uris`` queue, and all triples are copied into the global
    persistent ``graph``.

    Any exception raised while loading or parsing propagates to the caller.
    """
    print(uri)
    g = rdflib.ConjunctiveGraph()
    g.load(uri)
    # Subjects first, then objects: keeps the original discovery order.
    for s in g.subjects():
        _enqueue(s)
    for o in g.objects():
        _enqueue(o)
    for t in g:
        graph.add(t)


def main():
    """Crawl outwards from the URI given as the first command-line argument."""
    if len(sys.argv) < 2:
        sys.exit('usage: crawl.py <start-uri>')

    # Wrap the seed in URIRef and mark it as seen: a plain str seed would not
    # match the URIRef terms found while parsing, so the start document would
    # be queued and fetched a second time.
    start = rdflib.URIRef(sys.argv[1])
    seen.add(start)
    uris.append(start)

    graph.open('store', create=True)
    try:
        while uris:
            uri = uris.popleft()
            if 'libris.kb.se' not in uri:
                continue
            # Be polite to the server: pause before every actual request.
            # Skipped URIs no longer cost a delay.
            time.sleep(2)
            try:
                crawl(uri)
            except Exception as e:
                # Best effort: report the failure and carry on with the queue.
                print(e)
    except KeyboardInterrupt:
        # Ctrl-C (also while sleeping) stops the crawl; fall through to close.
        pass
    finally:
        # Always close the on-disk store so it is left in a consistent state.
        graph.close()


if __name__ == '__main__':
    main()