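updates: add --directory option, timestamped output file names, RDF backup before overwrite, and all-triples CSV export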

ehanson8 · ehanson8 · commit f08dca55623c · 2018-08-07T16:08:24.000-04:00
diff --git a/addTriplesToRdfFile.py b/addTriplesToRdfFile.py
@@ -12,6 +12,7 @@
 parser = argparse.ArgumentParser()
 parser.add_argument('-r', '--rdfFileName', help='the RDF file to which triples will be added (include the extension). optional - if not provided, the script will ask for input')
 parser.add_argument('-f', '--fileName', help='the CSV file of new triples (including \'.csv\'). optional - if not provided, the script will ask for input')
+parser.add_argument('-d', '--directory', help='the directory for the input and output files. optional - if not provided, the script will assume null')
 args = parser.parse_args()
 
 if args.rdfFileName:
@@ -22,6 +23,10 @@
     fileName = args.fileName
 else:
     fileName = raw_input('Enter the CSV file of headings to reconcile (including \'.csv\'): ')
+if args.directory:
+    directory = args.directory
+else:
+    directory = ''
 
 startTime = time.time()
 date = datetime.datetime.now().strftime('%Y-%m-%d %H.%M.%S')
@@ -31,6 +36,9 @@
 g.parse(rdfFileName, format='n3')
 originalTripleCount = len(g)
 
+#create timestamped backup of the rdf file before it is overwritten with the updated graph
+g.serialize(format='n3', destination=open(directory+rdfFileName[:rdfFileName.index('.')]+'Backup'+date+'.n3','wb'))
+
 #creating dict of existing labels for comparison
 q = prepareQuery('SELECT ?s ?o WHERE { ?s skos:prefLabel ?o }', initNs = {'skos': SKOS})
 existingLabels = {}
@@ -44,11 +52,11 @@
 uriNum = int(max(uriNums))
 
 #create log files
-f=csv.writer(open(rdfFileName[:rdfFileName.index('.')]+'TriplesAdded'+str(date)+'.csv','wb'))
+f=csv.writer(open(directory+rdfFileName[:rdfFileName.index('.')]+'TriplesAdded'+str(date)+'.csv','wb'))
 f.writerow(['label']+['rdfLabel']+['uri']+['date'])
 
 #parse csv data and add triples to graph
-with open(fileName) as csvfile:
+with open(directory+fileName) as csvfile:
     reader = csv.DictReader(csvfile)
     for row in reader:
         altLabel = row['originalLabel']
@@ -73,26 +81,32 @@
             f.writerow([])
 
 #create rdf file
-g.serialize(format='n3', destination=open(rdfFileName[:rdfFileName.index('.')]+date+'.n3','wb'))
-print g.serialize(format='n3')
+g.serialize(format='n3', destination=open(directory+rdfFileName,'wb'))
 print 'Original triples count: ', originalTripleCount
 print 'Updated triples count: ', len(g)
 
-#extract altLabels and prefLabels to csv
-f=csv.writer(open(rdfFileName[:rdfFileName.index('.')]+'FindAndReplace.csv','wb'))
+#extract altLabels and prefLabels to csv for find and replace operations
+f=csv.writer(open(directory+rdfFileName[:rdfFileName.index('.')]+'FindAndReplace'+str(date)+'.csv','wb'))
 f.writerow(['replacedValue']+['replacementValue'])
 q = prepareQuery('SELECT ?altLabel ?prefLabel WHERE { ?s skos:prefLabel ?prefLabel. ?s skos:altLabel ?altLabel }', initNs = {'skos': SKOS})
 results = g.query(q)
 for row in results:
     f.writerow([row[0].encode('utf-8')]+[row[1].encode('utf-8')])
 
-f=csv.writer(open('prefLabels.csv','wb'))
+#extract prefLabels to csv
+f=csv.writer(open(directory+'prefLabels'+str(date)+'.csv','wb'))
 f.writerow(['prefLabel'])
 q = prepareQuery('SELECT ?prefLabel WHERE { ?s skos:prefLabel ?prefLabel }', initNs = {'skos': SKOS})
 results = g.query(q)
 for row in results:
     f.writerow([row[0].encode('utf-8')])
 
+#extract all triples to csv
+f=csv.writer(open(directory+'allTriples'+str(date)+'.csv','wb'))
+f.writerow(['subject']+['predicate']+['object'])
+for s, p, o in g:
+    f.writerow([s.encode('utf-8')]+[p.encode('utf-8')]+[o.encode('utf-8')])
+
 elapsedTime = time.time() - startTime
 m, s = divmod(elapsedTime, 60)
 h, m = divmod(m, 60)
diff --git a/rdfFileReconciliation.py b/rdfFileReconciliation.py
@@ -1,6 +1,7 @@
 import csv
 from fuzzywuzzy import fuzz
 import time
+import datetime
 import rdflib
 from rdflib import Graph
 from rdflib.namespace import RDF, SKOS, DC
@@ -11,6 +12,7 @@
 parser = argparse.ArgumentParser()
 parser.add_argument('-r', '--rdfFileName', help='the RDF file to be reconciled against (include the extension). optional - if not provided, the script will ask for input')
 parser.add_argument('-f', '--fileName', help='the CSV file of headings to reconcile (including \'.csv\'). optional - if not provided, the script will ask for input')
+parser.add_argument('-d', '--directory', help='the directory for the input and output files. optional - if not provided, the script will assume null')
 parser.add_argument('-t', '--threshold', help='the threshold (e.g. \'90\' means the strings are 90% similar and 10% different ). optional - if not provided, the script will default to 70')
 args = parser.parse_args()
 
@@ -26,6 +28,10 @@
     threshold = int(args.threshold)
 else:
     threshold = 70
+if args.directory:
+    directory = args.directory
+else:
+    directory = ''
 
 #define function for finding the prefLabel of a subject
 def retrievePrefLabel(uri):
@@ -38,6 +44,7 @@ def retrievePrefLabel(uri):
     match = [label, str(prefLabel), uri, date]
 
 startTime = time.time()
+date = datetime.datetime.now().strftime('%Y-%m-%d %H.%M.%S')
 
 #import rdf file into graph
 g = Graph()
@@ -54,9 +61,9 @@ def retrievePrefLabel(uri):
 #create lists and csv files
 completeNearMatches = []
 completeExactMatches = []
-f=csv.writer(open('rdfExactMatches.csv','wb'))
+f=csv.writer(open(directory+'rdfExactMatches'+date+'.csv','wb'))
 f.writerow(['originalLabel']+['standardizedLabel']+['uri']+['date'])
-f2=csv.writer(open('rdfNearAndNonMatches.csv','wb'))
+f2=csv.writer(open(directory+'rdfNearAndNonMatches'+date+'.csv','wb'))
 f2.writerow(['originalLabel']+['standardizedLabel']+['uri']+['date'])
 
 #create counters
@@ -66,7 +73,7 @@ def retrievePrefLabel(uri):
 nonmatchedNewHeadings = 0
 
 #parse CSV data and compares against existingLabels dict for exact and near matches
-with open(fileName) as csvfile:
+with open(directory+fileName) as csvfile:
     reader = csv.DictReader(csvfile)
     for row in reader:
         label = row['name']