#!/usr/bin/env python
"""Convert LCSH MARCXML authority records into SKOS triples.

The script runs in up to two passes over the MARCXML files given on the
command line:

* ``--create`` adds one ``skos:Concept`` per record (labels, notes, dates,
  classification) to an rdflib ``ConjunctiveGraph`` backed by a Sleepycat
  store.
* ``--link`` looks up the concepts named by each record's 5XX tracings via
  their ``skos:prefLabel`` and adds broader/narrower/related links.

When neither option is given both passes are run.  Records that fail are
written to ``create_errors.xml`` / ``link_errors.xml``.

This is Python 2 code (it relies on ``unicode``).  The helper functions use
the module-level globals ``G``, ``create_errors`` and ``link_errors`` that
are set up in the ``__main__`` block.
"""

from sys import exit
from datetime import datetime, date
from unicodedata import normalize
from optparse import OptionParser
from traceback import print_exc

from rdflib import ConjunctiveGraph, Namespace, RDF, Literal, URIRef, BNode
from pymarc.marcxml import map_xml, record_to_xml

from lcsh.namespaces import SKOS, LCSH, LCSH_CONCEPT_SCHEME, DCTERMS, OWL


class ConversionException(Exception):
    """Raised when a record cannot be converted or linked."""

    def __init__(self, message):
        # Initialise the base class too so str(e) and tracebacks carry the
        # message instead of being empty.
        Exception.__init__(self, message)
        self.message = message

    def __unicode__(self):
        return unicode(self.message)


def lccn(r):
    """Return the record's 001 control number with spaces removed.

    Raises ConversionException when the record has no 001 field.
    """
    field = r['001']
    if field:
        return field.data.replace(' ', '')
    # This used to *return* the exception object, which then blew up in
    # concept_uri() with a TypeError that link_concept() does not catch.
    raise ConversionException('missing LCCN')


def concept_uri(lccn):
    """Return the concept URI for an LCCN: ``LCSH[lccn + '#concept']``."""
    return LCSH[lccn + '#concept']


def format_field(f, separator=' ', include=()):
    """Join the subfield values of field *f* into one NFC-normalized string.

    f         -- a MARC field (iterated as (code, value) pairs) or None
    separator -- string placed between subfield values
    include   -- subfield codes to keep; when empty every subfield is kept

    Returns None when *f* is None.
    """
    if f is None:
        return None
    parts = []
    has_include = len(include) > 0
    for subfield in f:
        if has_include and subfield[0] not in include:
            continue
        parts.append(subfield[1])
    return normalize('NFC', separator.join(parts))


# Per-field-type formatters.  Each joins a fixed set of subfield codes with
# '--'; the code lists differ per field type.

def format_personal_field(f):
    return format_field(f, '--', ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h',
                                  'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p',
                                  'q', 'r', 's', 't', 'v', 'x', 'y', 'z'])


def format_corporate_field(f):
    return format_field(f, '--', ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h',
                                  'i', 'j', 'l', 'm', 'n', 'o', 'p', 'r',
                                  's', 't', 'v', 'x', 'y', 'z'])


def format_meeting_field(f):
    return format_field(f, '--', ['a', 'c', 'd', 'e', 'f', 'g', 'h', 'i',
                                  'j', 'k', 'l', 'n', 'p', 'q', 's', 't',
                                  'v', 'x', 'y', 'z'])


def format_title_field(f):
    return format_field(f, '--', ['a', 'd', 'f', 'g', 'h', 'i', 'k', 'l',
                                  'm', 'n', 'o', 'p', 'r', 's', 't', 'v',
                                  'x', 'y', 'z'])


def format_chronological_field(f):
    return format_field(f, '--', ['a', 'i', 'v', 'x', 'y', 'z'])


def format_topical_field(f):
    return format_field(f, '--', ['a', 'b', 'v', 'x', 'y', 'z'])


def format_geographic_field(f):
    return format_field(f, '--', ['a', 'v', 'x', 'y', 'z'])


def format_genre_field(f):
    return format_field(f, '--', ['a', 'v', 'x', 'y', 'z'])


def format_general_subd_field(f):
    return format_field(f, '--', ['i', 'v', 'x', 'y', 'z'])


def format_geographic_subd_field(f):
    return format_field(f, '--', ['i', 'v', 'x', 'y', 'z'])


def format_chronological_subd_field(f):
    return format_field(f, '--', ['i', 'v', 'x', 'y', 'z'])


def format_form_subd_field(f):
    return format_field(f, '--', ['i', 'v', 'x', 'y', 'z'])


# Heading accessors: each returns the formatted heading field of one type,
# or None when the record has no such field.

def corporate_heading(r):
    return format_corporate_field(r['110'])


def topical_heading(r):
    return format_topical_field(r['150'])


def geographic_heading(r):
    return format_geographic_field(r['151'])


def genre_heading(r):
    return format_genre_field(r['155'])


def pref_label(r):
    """Return the preferred label: the first non-empty heading among
    150, 151, 155 and 110 (in that order).

    Raises ConversionException when none of them yields a label.
    """
    # TODO: need to get the subdivisions out too
    label = (topical_heading(r) or geographic_heading(r)
             or genre_heading(r) or corporate_heading(r))
    if not label:
        raise ConversionException("no prefLabel found")
    return label


# (tag, formatter) pairs for the 4XX fields, in the order their labels are
# returned by alt_labels().
_ALT_LABEL_FORMATTERS = (
    ('400', format_personal_field),
    ('410', format_corporate_field),
    ('411', format_meeting_field),
    ('430', format_title_field),
    ('448', format_chronological_field),
    ('450', format_topical_field),
    ('451', format_geographic_field),
    ('455', format_genre_field),
    ('480', format_general_subd_field),
    ('481', format_geographic_subd_field),
    ('482', format_chronological_subd_field),
    ('485', format_form_subd_field),
)

# (tag, formatter) pairs for the 5XX fields, in the order used by
# broader_terms() and related_terms().
_TRACING_FORMATTERS = (
    ('550', format_topical_field),
    ('551', format_geographic_field),
    ('555', format_genre_field),
    ('510', format_corporate_field),
)


def alt_labels(r):
    """Return a list of formatted labels, one per 4XX field in *r*."""
    labels = []
    for tag, formatter in _ALT_LABEL_FORMATTERS:
        labels.extend([formatter(f) for f in r.get_fields(tag)])
    return labels


def _tracing_labels(r, wanted):
    """Format every 5XX field of *r* for which ``wanted(field)`` is true."""
    labels = []
    for tag, formatter in _TRACING_FORMATTERS:
        labels.extend([formatter(f) for f in r.get_fields(tag) if wanted(f)])
    return labels


def is_broader(f):
    """True when the field's $w subfield is exactly 'g'."""
    return f['w'] == 'g'


def broader_terms(r):
    """Return the formatted 5XX headings whose $w is 'g'."""
    # TODO: add subdivisions too
    return _tracing_labels(r, is_broader)


def is_related(f):
    """True when the field has no $w, or a $w other than 'g' or 'h'."""
    return f['w'] not in ('g', 'h')


def related_terms(r):
    """Return the formatted 5XX headings accepted by is_related()."""
    return _tracing_labels(r, is_related)


# Note extractors: each returns a list with one string per matching field.

def source_data_found_notes(r):
    return [format_field(f, include=['a', 'b', 'u'])
            for f in r.get_fields('670')]


def source_data_not_found_notes(r):
    return [format_field(f, include=['a']) for f in r.get_fields('675')]


def historical_notes(r):
    return [format_field(f, include=['a', 'b', 'u'])
            for f in r.get_fields('678')]


def nonpublic_general_note(r):
    # general_notes() called this name but it was never defined.  Field 667
    # is the MARC 21 authority "Nonpublic General Note"; $a holds the text.
    return [format_field(f, include=['a']) for f in r.get_fields('667')]


def scope_notes(r):
    return [format_field(f, include=['a', 'i']) for f in r.get_fields('680')]


def subject_example_tracing_note(r):
    return [format_field(f, include=['a', 'i']) for f in r.get_fields('681')]


def deleted_heading_notes(r):
    return [format_field(f, include=['a', 'i']) for f in r.get_fields('682')]


def application_history_notes(r):
    return [format_field(f, include=['a']) for f in r.get_fields('688')]


# SKOS-facing aliases: map each SKOS note property onto the MARC note field
# that feeds it.

def editorial_notes(r):
    return source_data_not_found_notes(r)


def definition_notes(r):
    return historical_notes(r)


def example_notes(r):
    return subject_example_tracing_note(r)


def change_notes(r):
    return deleted_heading_notes(r)


def history_notes(r):
    return application_history_notes(r)


def source_notes(r):
    return source_data_found_notes(r)


def general_notes(r):
    return nonpublic_general_note(r)


def created(r):
    """Return a date parsed from the first six characters (yymmdd) of 008."""
    d = datetime.strptime(r['008'].data[0:6], '%y%m%d')
    return date(d.year, d.month, d.day)


def modified(r):
    """Return a datetime parsed from field 005."""
    s = r['005'].data[0:-2]  # remove the second fraction
    return datetime.strptime(s, '%Y%m%d%H%M%S')


def lcc(r):
    """Return subfield $a of field 053, or None when there is no 053."""
    return format_field(r['053'], include=['a'])


def concept_uri_with_pref_label(label):
    """Return the first subject in G whose skos:prefLabel is *label*@en.

    Raises ConversionException when no such concept exists.
    """
    subjects = list(G.subjects(SKOS['prefLabel'], Literal(label, 'en')))
    if not subjects:
        raise ConversionException("no concept with skos:prefLabel %s" % label)
    return subjects[0]


def create_concept(r):
    """Add the SKOS concept for record *r* to G.

    Any exception is reported on stdout and the record is appended to the
    create_errors file; nothing is re-raised.
    """
    try:
        s = concept_uri(lccn(r))
        label = pref_label(r)
        G.add((s, RDF.type, SKOS['Concept']))
        G.add((s, DCTERMS['created'], Literal(created(r))))
        G.add((s, DCTERMS['modified'], Literal(modified(r))))
        G.add((s, SKOS['prefLabel'], Literal(label, 'en')))

        # The loop variable used to be called `label` as well, which made
        # the "created" message below report the last altLabel.
        for alt_label in alt_labels(r):
            G.add((s, SKOS['altLabel'], Literal(alt_label, 'en')))

        for note in editorial_notes(r):
            G.add((s, SKOS['editorialNote'], Literal(note, 'en')))

        for note in scope_notes(r):
            G.add((s, SKOS['scopeNote'], Literal(note, 'en')))

        for note in change_notes(r):
            G.add((s, SKOS['changeNote'], Literal(note, 'en')))

        for note in history_notes(r):
            G.add((s, SKOS['historyNote'], Literal(note, 'en')))

        for note in definition_notes(r):
            G.add((s, SKOS['definition'], Literal(note, 'en')))

        for note in example_notes(r):
            G.add((s, SKOS['example'], Literal(note, 'en')))

        for note in source_notes(r):
            G.add((s, DCTERMS['source'], Literal(note, 'en')))

        lc_classification = lcc(r)
        if lc_classification:
            # some day it would be nice to link to LCC as a URI
            # but for now a blank node will have to do
            b = BNode()
            G.add((s, SKOS['relatedMatch'], b))
            G.add((b, SKOS['notation'], Literal(lc_classification)))

        G.add((s, SKOS['inScheme'], LCSH_CONCEPT_SCHEME))
        print(("created %s <%s>" % (label, s)).encode('utf-8'))
    except Exception as e:
        print("error: " + unicode(e).encode('utf-8'))
        create_errors.write(record_to_xml(r).encode('utf-8') + "\n")


def link_concept(r):
    """Add broader/narrower/related links for record *r* to G.

    A ConversionException (missing prefLabel, missing LCCN, or a tracing
    whose target concept is not in G) is reported on stdout and the record
    is appended to the link_errors file.  The first failing tracing stops
    processing of the remaining tracings of that record.
    """
    try:
        # Called only for its check: raises when the record has no heading.
        pref_label(r)
        src = concept_uri(lccn(r))

        for bt in broader_terms(r):
            target = concept_uri_with_pref_label(bt)
            print("linking nt %s to bt %s" % (src, target))
            G.add((src, SKOS['broader'], target))
            G.add((target, SKOS['narrower'], src))

        for rt in related_terms(r):
            target = concept_uri_with_pref_label(rt)
            print("linking %s to rt %s" % (src, target))
            G.add((src, SKOS['related'], target))
    except ConversionException as e:
        print("error: " + unicode(e).encode('utf-8'))
        link_errors.write(record_to_xml(r).encode('utf-8') + "\n")


if __name__ == '__main__':
    opt_parser = OptionParser()
    opt_parser.set_usage("usage: lcsh2skos [options] file1 [file2]")
    opt_parser.add_option('--create', dest='create', help='create concepts',
                          action='store_true', default=False)
    opt_parser.add_option('--link', dest='link', help='link concepts',
                          action='store_true', default=False)
    opt_parser.add_option('--store', dest='store', default='store',
                          help="the directory to use for the Sleepycat store")
    opts, files = opt_parser.parse_args()

    # need to have some marcxml files.  Checked before the store is opened
    # so a usage error does not create or touch it.  The original had a bare
    # `exit` (never called), so the script carried on regardless.
    if len(files) == 0:
        print(opt_parser.usage)
        exit(1)

    # when action is unspecified do both creation and linking
    if not opts.create and not opts.link:
        opts.create = True
        opts.link = True

    # open up triple store
    G = ConjunctiveGraph('Sleepycat')
    G.open(opts.store, create=True)

    create_errors = None
    link_errors = None

    try:
        G.bind('skos', SKOS)
        G.bind('lcsh', LCSH)
        G.bind('owl', OWL)

        # define the concept scheme
        G.add((LCSH_CONCEPT_SCHEME, RDF.type, SKOS['ConceptScheme']))
        G.add((LCSH_CONCEPT_SCHEME, DCTERMS['title'],
               Literal('Library of Congress Subject Headings', 'en')))
        G.add((LCSH_CONCEPT_SCHEME, DCTERMS['creator'],
               URIRef('http://inkdroid.org/ehs')))
        # Wrapped in Literal like the other dates in this file; a bare
        # datetime is not an RDF term.
        G.add((LCSH_CONCEPT_SCHEME, DCTERMS['modified'],
               Literal(datetime.now())))
        G.add((LCSH_CONCEPT_SCHEME, DCTERMS['publisher'],
               URIRef('http://dbpedia.org/resource/Library_of_Congress')))
        G.add((LCSH_CONCEPT_SCHEME, OWL['sameAs'],
               URIRef('http://dbpedia.org/resource/Library_of_Congress_Subject_Headings')))

        # NOTE(review): the header write below emits only a newline and the
        # footer write an empty string, so the error files contain bare
        # concatenated records with no enclosing root element.  Confirm
        # whether wrapper tags were intended here.
        if opts.create:
            create_errors = open('create_errors.xml', 'w')
            create_errors.write("\n")
            map_xml(create_concept, *files)

        if opts.link:
            link_errors = open('link_errors.xml', 'w')
            link_errors.write("\n")
            map_xml(link_concept, *files)
    finally:
        if create_errors:
            create_errors.write("")
            create_errors.close()
        if link_errors:
            link_errors.write("")
            link_errors.close()
        # Close the store even when a pass fails.
        G.close()