#!/usr/bin/env python """ vdiff utility compares two published rdf vocabularies, and looks for changes in the names used, and their definitions, examining rdf, rdfs and owl assertions. The idea is that you can get an idea of the changes a versioned vocabulary has gone through. For example: vdiff 'http://www.w3.org/2004/02/skos/core#' 'http://www.w3.org/2008/05/skos#' Or if you want to ignore some version specific stuff like comments, and version information: vdiff --ignore-property 'http://www.w3.org/2000/01/rdf-schema#isDefinedBy' \ --ignore-property 'http://www.w3.org/2000/01/rdf-schema#comment' \ --ignore-property 'http://www.w3.org/2002/07/owl#versionInfo' \ 'http://www.w3.org/2004/02/skos/core#' 'http://www.w3.org/2008/05/skos#' """ import re import urllib2 import optparse import rdflib RDF = rdflib.Namespace('http://www.w3.org/1999/02/22-rdf-syntax-ns#') RDFS = rdflib.Namespace('http://www.w3.org/2000/01/rdf-schema#') OWL = rdflib.Namespace('http://www.w3.org/2002/07/owl#') def diff(n1, n2, ignore_predicates=[]): """compare two vocabulary namespaces """ g1 = rdflib.ConjunctiveGraph() g1.load(_urlopen(n1)) g2 = rdflib.ConjunctiveGraph() g2.load(_urlopen(n2)) not_found = rdflib.ConjunctiveGraph() _compare(n1, g1, n2, g2, not_found, ignore_predicates=ignore_predicates) _compare(n2, g2, n1, g1, not_found, ignore_predicates=ignore_predicates) return not_found def output_diff(g): subjects = _uniq(list(g.subjects())) subjects.sort() for s in subjects: prefix = '<' if s.startswith(n1) else '>' for p, o in g.predicate_objects(s): print "%s %s %s %s ." % (prefix, s, p, o) print def _urlopen(url): """ A simplified urlopen since some vocab URIs may not understand q-values like SKOS Core for example. """ headers = {'Accept': 'application/rdf+xml'} return urllib2.urlopen(urllib2.Request(url, None, headers)) def _compare(n1, g1, n2, g2, not_found, ignore_predicates=[]): """ Looks at triples in g1 and looks for similarly named source URI in g2 and if the assertion is the same. If the assertion isn't found it is added to the not_found graph that is passed in. """ for s1, p1, o1 in g1: if p1 in ignore_predicates: continue # only want subjects defined in the namespace if not s1.startswith(n1): continue # blank nodes can't really be compared if isinstance(o1, rdflib.BNode): continue # only compare structural parts of the vocabulary if not(p1.startswith(RDF) or p1.startswith(RDFS) or p1.startswith(OWL)): continue # if the triple exists in the other vocabulary no need to compare if (s1, p1, o1) in g2: continue # rewrite the subject uri to use the new namesapce and see if that # triple exists if (_rewrite(s1, n2), p1, o1) in g2: continue # rewrite the subject and the object to appear in the new namesapce # and see if that triple exists if o1.startswith(n1) and (_rewrite(s1, n2), p1, _rewrite(o1, n2)) in g2: continue # oh well, done guessing not_found.add((s1, p1, o1)) def _rewrite(uri, namespace): last_part = re.split(r'[#/]', uri)[-1] if last_part: return namespace[last_part] return None def _uniq(l): s = set() map(lambda e: s.add(e), l) return list(s) if __name__ == '__main__': parser = optparse.OptionParser(usage="usage: %prog [options] v1 v2") parser.add_option('--ignore-property', action='append', dest='ignore_predicates', help='the URI of a property to ignore in the diff') options, args = parser.parse_args() if len(args) != 2: parser.error("need to supply two vocabulary uris") n1 = rdflib.Namespace(args[0]) n2 = rdflib.Namespace(args[1]) ignore_predicates = [rdflib.URIRef(p) for p in options.ignore_predicates] g = diff(n1, n2, ignore_predicates=ignore_predicates) output_diff(g)