#!/usr/bin/env python from rdflib.Graph import ConjunctiveGraph from rdflib.URIRef import URIRef from rdflib import Namespace, RDF, RDFS, Literal from pymarc.marcxml import map_xml, record_to_xml from sys import exit from optparse import OptionParser class ConversionException(Exception): def __init__(self, message): self.message = message def __unicode__(self): return unicode(self.message) def lccn(r): if r['001']: return r['001'].data.replace(' ', '') return ConversionException('missing LCCN') def format_field(f, separator=' ', include=[]): if f == None: return None parts = [] has_include = len(include) > 0 for subfield in f: if has_include and subfield[0] not in include: continue parts.append(subfield[1]) return separator.join(parts) def format_topical_field(f): return format_field(f, '--', ['a', 'b', 'v', 'x', 'y', 'z']) def format_geographic_field(f): return format_field(f, '--', ['a', 'v', 'x', 'y', 'z']) def topical_heading(r): return format_topical_field(r['150']) def geographic_heading(r): return format_geographic_field(r['151']) def pref_label(r): label = topical_heading(r) or geographic_heading(r) if not label: raise ConversionException("no prefLabel found") return label def alt_labels(r): return map(format_topical_field, r.getFields('450')) + \ map(format_geographic_field, r.getFields('451')) def is_broader(f): if f['w'] and f['w'] == 'g': return True return False def broader_terms(r): return map(format_topical_field, filter(is_broader, r.getFields('550'))) + \ map(format_geographic_field, filter(is_broader, r.getFields('551'))) def is_related(f): return f['w'] == None def related_terms(r): return map(format_topical_field, filter(is_related, r.getFields('550','551'))) def source_data_found_notes(r): return [format_field(f, include=['a', 'b', 'u']) for f in r.getFields('670')] def source_data_not_found_notes(r): return [format_field(f, include=['a']) for f in r.getFields('675')] def historical_notes(r): return [format_field(f, include=['a', 'b', 'u']) for f in r.getFields('678')] def editorial_notes(r): return source_data_found_notes(r) + source_data_not_found_notes(r) + \ historical_notes(r) def scope_notes(r): return [format_field(f, include=['a', 'i']) for f in r.getFields('680')] def deleted_heading_notes(r): return [format_field(f, include=['a', 'i']) for f in r.getFields('682')] def application_history_notes(r): return [format_field(f, include=['a']) for f in r.getFields('688')] def change_notes(r): return deleted_heading_notes(r) + application_history_notes(r) def concept_for_label(label): triples = list(G.query("SELECT ?a WHERE {?a skos:prefLabel ?b .}", initBindings={'?b':Literal(label, 'en')}, initNs={'skos':SKOS})) if len(triples) == 0: raise ConversionException("no concept with skos:prefLabel %s" % label) else: return triples[0][0] def create_concept(r): try: s = LCSH[lccn(r)] label = pref_label(r) G.add((s, RDF.type, SKOS['Concept'])) G.add((s, SKOS['prefLabel'], Literal(label, 'en'))) for label in alt_labels(r): G.add((s, SKOS['altLabel'], Literal(label, 'en'))) for note in editorial_notes(r): G.add((s, SKOS['editorialNote'], Literal(note, 'en'))) for note in scope_notes(r): G.add((s, SKOS['scopeNote'], Literal(note, 'en'))) for note in change_notes(r): G.add((s, SKOS['changeNote'], Literal(note, 'en'))) G.add((s, OWL['sameAs'], LCCN[lccn(r)])) print ("created %s <%s>" % (label, s)).encode('utf-8') except Exception, e: print "error: " + unicode(e).encode('utf-8') create_errors.write(record_to_xml(r).encode('utf-8')+ "\n") def link_concept(r): try: label = pref_label(r) src = LCSH[lccn(r)] for bt in broader_terms(r): target = concept_for_label(bt) print "linking nt %s to bt %s" % (src, target) G.add((src, SKOS['broader'], target)) G.add((target, SKOS['narrower'], src)) for rt in related_terms(r): target = concept_for_label(rt) print "linking %s to rt %s" % (src, target) G.add((src, SKOS['related'], target)) except ConversionException, e: print "error: " + unicode(e).encode('utf-8') link_errors.write(record_to_xml(r).encode('utf-8') + "\n") if __name__ == '__main__': opt_parser = OptionParser() opt_parser.set_usage("usage: lcsh2skos [options] file1 [file2]") opt_parser.add_option('--create', dest='create', help='create concepts', action='store_true', default=False) opt_parser.add_option('--link', dest='link', help='link concepts', action='store_true', default=False) opt_parser.add_option('--store', dest='store', default='store', help="the directory to use for the Sleepycat store") opts, files = opt_parser.parse_args() # open up triple store G = ConjunctiveGraph('Sleepycat') G.open(opts.store, create=True) SKOS = Namespace('http://www.w3.org/2004/02/skos/core#') LCSH = Namespace('http://loc.gov/lcsh/') LCCN = Namespace('info:lccn/') OWL = Namespace('http://www.w3.org/2002/07/owl#') G.bind('skos', SKOS) G.bind('lcsh', LCSH) G.bind('lccn', LCCN) G.bind('owl', OWL) # need to have some marcxml files if len(files) == 0: print opt_parser.usage exit # when action is unspecified do both creation and linking if (opts.create == False and opts.link == False): opts.create = True opts.link = True create_errors = None link_errors = None try: if opts.create: create_errors = file('create_errors.xml', 'w') create_errors.write("\n") map_xml(create_concept, *files) if opts.link: link_errors = file('link_errors.xml', 'w') link_errors.write("\n") map_xml(link_concept, *files) finally: if create_errors: create_errors.write("") create_errors.close() if link_errors: link_errors.write("") link_errors.close() G.close()