#!/usr/bin/env python

"""
Little hack to scrape the topics out of the New York Times and emit them as
SKOS. The program basically GETs URLs like:

  http://topics.nytimes.com/top/reference/timestopics/all/[a-z]

and scrape the topics out of them, and then persist the SKOS data as
timestopics.rdf. Here's an example chunk of RDF in Turtle for Ray Bradbury:

    @prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .
    @prefix skos: <http://www.w3.org/2004/02/skos/core#> .

    <http://topics.nytimes.com/top/reference/timestopics/people/b/ray_bradbury#concept> a skos:Concept;
        skos:prefLabel "Bradbury, Ray";
        skos:broader <http://topics.nytimes.com/top/reference/timestopics/people#concept>;
        skos:inScheme <http://topics.nytimes.com/top/reference/timestopics#conceptScheme> 
        .
"""

import re
from urllib import urlopen

from BeautifulSoup import BeautifulSoup
from rdflib import ConjunctiveGraph, Literal, Namespace, RDF, URIRef

# Namespaces used throughout the script, plus the two top-level concepts
# (people and organizations) that scraped topics are attached to.

SKOS = Namespace('http://www.w3.org/2004/02/skos/core#')
TOPICS = Namespace('http://topics.nytimes.com/top/reference/timestopics/')
PEOPLE = TOPICS['people#concept']
ORGANIZATIONS = TOPICS['organizations#concept']

# The graph that accumulates every triple; seed it by typing both top-level
# concepts as skos:Concept.
g = ConjunctiveGraph()
for _top_concept in (PEOPLE, ORGANIZATIONS):
    g.add((_top_concept, RDF.type, SKOS.Concept))

# do the scrape
#
# For each letter a-z, fetch the "all topics" index page for that letter, pick
# out every anchor that links to a topic page, and record it in the graph as a
# skos:Concept with a prefLabel, its concept scheme, and (for people and
# organizations) broader/narrower links to the matching top-level concept.

url_template = 'http://topics.nytimes.com/top/reference/timestopics/all/%s'

# Compiled once up front instead of on every page / every link.
topic_link = re.compile('^http://topics.*html$')
index_suffix = re.compile(r'/index\.html$')

# The object of skos:inScheme has to be a resource, so it is wrapped in a
# URIRef rather than handed to the graph as a bare Python string.
concept_scheme = URIRef(
    'http://topics.nytimes.com/top/reference/timestopics#conceptScheme')

for lowercase_letter in 'abcdefghijklmnopqrstuvwxyz':
    response = urlopen(url_template % lowercase_letter)
    try:
        html = response.read()
    finally:
        # release the connection even if the read fails
        response.close()

    soup = BeautifulSoup(html)
    for a in soup.findAll('a', href=topic_link):
        # the concept URI is the topic URL minus any trailing /index.html
        uri = URIRef(index_suffix.sub('', a['href']) + '#concept')

        text = a.string
        # pull the text out of <i> tag or whatever is there if necessary;
        # an anchor with no children at all simply has no label
        if text is None and a.contents:
            text = a.contents[0].string

        g.add((uri, RDF.type, SKOS.Concept))
        if text is not None:
            # labels must be rdflib Literals, not raw parser strings
            g.add((uri, SKOS.prefLabel, Literal(text)))
        g.add((uri, SKOS.inScheme, concept_scheme))

        if 'timestopics/people' in uri:
            g.add((uri, SKOS.broader, PEOPLE))
            g.add((PEOPLE, SKOS.narrower, uri))

        elif 'timestopics/organizations' in uri:
            g.add((uri, SKOS.broader, ORGANIZATIONS))
            g.add((ORGANIZATIONS, SKOS.narrower, uri))

        # progress output; the parenthesised form prints the same thing as
        # the print statement for a single argument
        print(uri)

# Register the 'skos' prefix for the SKOS namespace before writing the graph.
g.bind('skos', SKOS)

# Write the graph to timestopics.rdf. open() replaces the legacy file()
# builtin, and the handle is closed explicitly so the output is flushed to
# disk even if serialization raises.
output = open('timestopics.rdf', 'w')
try:
    g.serialize(output)
finally:
    output.close()
