updates

ehanson8 · ehanson8 · commit 331b3cf77f6a · 2018-08-10T15:04:51.000-04:00
diff --git a/addTriplesToRdfFile.py b/addTriplesToRdfFile.py
@@ -8,6 +8,7 @@
 from rdflib.namespace import RDF, SKOS, DC
 from rdflib import URIRef, BNode, Literal
 from rdflib.plugins.sparql import prepareQuery
+import os
 
 parser = argparse.ArgumentParser()
 parser.add_argument('-r', '--rdfFileName', help='the RDF file to which triples will be added (include the extension). optional - if not provided, the script will ask for input')
@@ -28,16 +29,18 @@
 else:
     directory = ''
 
+os.chdir(directory or '.')  # directory defaults to '' and os.chdir('') raises OSError; stay in the cwd instead
 startTime = time.time()
-date = datetime.datetime.now().strftime('%Y-%m-%d %H.%M.%S')
+date = datetime.datetime.today().strftime('%Y-%m-%d')
+timeStamp = datetime.datetime.now().strftime('%Y-%m-%d %H.%M.%S')
 
 #import rdf file into graph
 g = Graph()
 g.parse(rdfFileName, format='n3')
 originalTripleCount = len(g)
 
 #create backup of rdf file before updates
-g.serialize(format='n3', destination=open(directory+rdfFileName[:rdfFileName.index('.')]+'Backup'+date+'.n3','wb'))
+g.serialize(format='n3', destination=open(rdfFileName[:rdfFileName.index('.')]+'Backup'+timeStamp+'.n3','wb'))
 
 #creating dict of existing labels for comparison
 q = prepareQuery('SELECT ?s ?o WHERE { ?s skos:prefLabel ?o }', initNs = {'skos': SKOS})
@@ -52,11 +55,11 @@
 uriNum = int(max(uriNums))
 
 #create log files
-f=csv.writer(open(directory+rdfFileName[:rdfFileName.index('.')]+'TriplesAdded'+str(date)+'.csv','wb'))
+f=csv.writer(open(os.path.join('triplesAdded', rdfFileName[:rdfFileName.index('.')]+'TriplesAdded'+timeStamp+'.csv'),'wb'))
 f.writerow(['label']+['rdfLabel']+['uri']+['date'])
 
 #parse csv data and add triples to graph
-with open(directory+fileName) as csvfile:
+with open(fileName) as csvfile:
     reader = csv.DictReader(csvfile)
     for row in reader:
         altLabel = row['originalLabel']
@@ -81,28 +84,28 @@
             f.writerow([])
 
 #create rdf file
-g.serialize(format='n3', destination=open(directory+rdfFileName,'wb'))
+g.serialize(format='n3', destination=open(rdfFileName,'wb'))
 print 'Original triples count: ', originalTripleCount
 print 'Updated triples count: ', len(g)
 
 #extract altLabels and prefLabels to csv for find and replace operations
-f=csv.writer(open(directory+rdfFileName[:rdfFileName.index('.')]+'FindAndReplace'+str(date)+'.csv','wb'))
+f=csv.writer(open(os.path.join('findAndReplace', rdfFileName[:rdfFileName.index('.')]+'FindAndReplace'+timeStamp+'.csv'),'wb'))
 f.writerow(['replacedValue']+['replacementValue'])
 q = prepareQuery('SELECT ?altLabel ?prefLabel WHERE { ?s skos:prefLabel ?prefLabel. ?s skos:altLabel ?altLabel }', initNs = {'skos': SKOS})
 results = g.query(q)
 for row in results:
     f.writerow([row[0].encode('utf-8')]+[row[1].encode('utf-8')])
 
 #extract prefLabels to csv
-f=csv.writer(open(directory+'prefLabels'+str(date)+'.csv','wb'))
+f=csv.writer(open(os.path.join('prefLabels','prefLabels'+timeStamp+'.csv'),'wb'))
 f.writerow(['prefLabel'])
 q = prepareQuery('SELECT ?prefLabel WHERE { ?s skos:prefLabel ?prefLabel }', initNs = {'skos': SKOS})
 results = g.query(q)
 for row in results:
     f.writerow([row[0].encode('utf-8')])
 
 #extract all triples to csv
-f=csv.writer(open(directory+'allTriples'+str(date)+'.csv','wb'))
+f=csv.writer(open(os.path.join('allTriples','allTriples'+timeStamp+'.csv'),'wb'))
 f.writerow(['subject']+['predicate']+['object'])
 for s, p, o in g:
     f.writerow([s.encode('utf-8')]+[p.encode('utf-8')]+[o.encode('utf-8')])
diff --git a/buildRdfFile.py b/buildRdfFile.py
@@ -8,16 +8,23 @@
 from rdflib.namespace import RDF, DC, SKOS
 from rdflib import URIRef, BNode, Literal
 from rdflib.plugins.sparql import prepareQuery
+import os
 
 parser = argparse.ArgumentParser()
 parser.add_argument('-f', '--fileName', help='the CSV file of source data. optional - if not provided, the script will ask for input')
+parser.add_argument('-d', '--directory', help='the directory for the input and output files. optional - if not provided, the script will assume null')
 args = parser.parse_args()
 
 if args.fileName:
     fileName = args.fileName
 else:
     fileName = raw_input('Enter the file name of the CSV of source data (including \'.csv\'): ')
+if args.directory:
+    directory = args.directory
+else:
+    directory = ''
 
+os.chdir(directory or '.')  # directory defaults to '' and os.chdir('') raises OSError; stay in the cwd instead
 startTime = time.time()
 date = datetime.datetime.today().strftime('%Y-%m-%d')
 nameUriDict = {}
@@ -54,6 +61,12 @@
 g.serialize(format='n3', destination=open(fileName[:fileName.index('.')]+'.n3','wb'))
 print g.serialize(format='n3')
 
+#extract all triples to csv
+f=csv.writer(open('allTriples'+str(date)+'.csv','wb'))
+f.writerow(['subject']+['predicate']+['object'])
+for s, p, o in g:
+    f.writerow([s.encode('utf-8')]+[p.encode('utf-8')]+[o.encode('utf-8')])
+
 elapsedTime = time.time() - startTime
 m, s = divmod(elapsedTime, 60)
 h, m = divmod(m, 60)
diff --git a/rdfFileReconciliation.py b/rdfFileReconciliation.py
@@ -8,6 +8,7 @@
 from rdflib import URIRef, BNode, Literal
 from rdflib.plugins.sparql import prepareQuery
 import argparse
+import os
 
 parser = argparse.ArgumentParser()
 parser.add_argument('-r', '--rdfFileName', help='the RDF file to be reconciled against (include the extension). optional - if not provided, the script will ask for input')
@@ -43,6 +44,7 @@ def retrievePrefLabel(uri):
     global match
     match = [label, str(prefLabel), uri, date]
 
+os.chdir(directory or '.')  # os.chdir('') raises OSError; fall back to the cwd if no directory was given
 startTime = time.time()
 date = datetime.datetime.now().strftime('%Y-%m-%d %H.%M.%S')
 
@@ -61,9 +63,9 @@ def retrievePrefLabel(uri):
 #create lists and csv files
 completeNearMatches = []
 completeExactMatches = []
-f=csv.writer(open(directory+'rdfExactMatches'+date+'.csv','wb'))
+f=csv.writer(open(os.path.join('reconciliationResults','rdfExactMatches'+date+'.csv'),'wb'))
 f.writerow(['originalLabel']+['standardizedLabel']+['uri']+['date'])
-f2=csv.writer(open(directory+'rdfNearAndNonMatches'+date+'.csv','wb'))
+f2=csv.writer(open(os.path.join('reconciliationResults','rdfNearAndNonMatches'+date+'.csv'),'wb'))
 f2.writerow(['originalLabel']+['standardizedLabel']+['uri']+['date'])
 
 #create counters
@@ -73,11 +75,15 @@ def retrievePrefLabel(uri):
 nonmatchedNewHeadings = 0
 
 #parse CSV data and compares against existingLabels dict for exact and near matches
-with open(directory+fileName) as csvfile:
+with open(fileName) as csvfile:
+    reader = csv.DictReader(csvfile)
+    rowCount = len(list(reader))
+with open(fileName) as csvfile:
     reader = csv.DictReader(csvfile)
     for row in reader:
         label = row['name']
-        print label
+        rowCount -= 1
+        print 'Rows remaining: ', rowCount
         newHeadingsCount += 1
         preCount = len(completeNearMatches)
         for label2, uri in existingLabels.items():
@@ -87,7 +93,6 @@ def retrievePrefLabel(uri):
                 retrievePrefLabel(uri)
                 f.writerow([match[0]]+[match[1]]+[match[2]]+[match[3]])
         if label not in completeExactMatches:
-            print '2nd pass', label
             for label2, uri in existingLabels.items():
                 ratio = fuzz.ratio(label, label2)
                 partialRatio = fuzz.partial_ratio(label, label2)