#!/usr/bin/env python

"""
Crawl O'Reilly's Book Catalog, extract RDFa about the books, and stash away
triples in an rdflib BerkeleyDB store.

You will need the trunk version of rdflib installed, or otherwise available.
You also will need html5lib for the lax, tagsoup parsing--O'Reilly's html
for its book pages isn't well-formed at the moment.
"""

import re
import urllib

from rdflib.graph import ConjunctiveGraph
from rdflib.term import URIRef

# Pages of O'Reilly's complete catalog listing; each one is downloaded and
# scanned for links to individual book pages.
catalog_urls = [
    "http://oreilly.com/store/complete.html",
    "http://oreilly.com/store/complete2.html",
    "http://oreilly.com/store/complete3.html",
    "http://oreilly.com/store/complete4.html",
]

# Persistent triple store using rdflib's 'Sleepycat' (BerkeleyDB) backend,
# opened at the path 'store' and created if it does not exist yet.
graph = ConjunctiveGraph('Sleepycat')
graph.open('store', create=True)

try:
    # Book URLs already handled during this run. The same link can occur more
    # than once in the catalog pages; without this, a URL whose fetch fails
    # (and therefore never shows up in the graph) would be retried for every
    # occurrence.
    attempted = set()

    for catalog_url in catalog_urls:
        # Download the catalog page, making sure the connection is released
        # even if reading it fails.
        response = urllib.urlopen(catalog_url)
        try:
            html = response.read()
        finally:
            response.close()

        for book_url in re.findall(r'"(http://oreilly.com/catalog/\d+/)"', html):
            if book_url in attempted:
                continue
            attempted.add(book_url)

            # TODO: make this smarter, crawl if running at a different time
            # Skip books that already have triples in the store. The triple
            # pattern lets the store look the subject up directly instead of
            # iterating over every subject in the graph for each URL.
            if (URIRef(book_url), None, None) in graph:
                continue

            print("fetching url=%s [current graph size=%s]" % (book_url, len(graph)))
            # some urls in the catalog 404 believe it or not
            try:
                graph.parse(location=book_url, format='rdfa', lax=True)
            except Exception as e:
                # Best-effort crawl: report the failure and carry on with the
                # next book rather than aborting the whole run.
                print(e)

    # Dump everything collected so far to catalog.rdf, closing the file so the
    # output is fully flushed to disk.
    out = open('catalog.rdf', 'w')
    try:
        graph.serialize(out)
    finally:
        out.close()
finally:
    # Always release the on-disk store, even if the crawl is interrupted.
    graph.close()
