Skip to content

Commit 331b3cf

Browse files
committed
updates
1 parent e58b11b commit 331b3cf

3 files changed

Lines changed: 34 additions & 13 deletions

File tree

addTriplesToRdfFile.py

Lines changed: 11 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
from rdflib.namespace import RDF, SKOS, DC
99
from rdflib import URIRef, BNode, Literal
1010
from rdflib.plugins.sparql import prepareQuery
11+
import os
1112

1213
parser = argparse.ArgumentParser()
1314
parser.add_argument('-r', '--rdfFileName', help='the RDF file to which triples will be added (include the extension). optional - if not provided, the script will ask for input')
@@ -28,16 +29,18 @@
2829
else:
2930
directory = ''
3031

32+
#only change the working directory when one was supplied; directory defaults
#to '' above and os.chdir('') raises OSError (no such file or directory)
if directory:
    os.chdir(directory)
3133
startTime = time.time()
32-
date = datetime.datetime.now().strftime('%Y-%m-%d %H.%M.%S')
34+
date = datetime.datetime.today().strftime('%Y-%m-%d')
35+
timeStamp = datetime.datetime.now().strftime('%Y-%m-%d %H.%M.%S')
3336

3437
#import rdf file into graph
3538
g = Graph()
3639
g.parse(rdfFileName, format='n3')
3740
originalTripleCount = len(g)
3841

3942
#create backup of rdf file before updates
40-
g.serialize(format='n3', destination=open(directory+rdfFileName[:rdfFileName.index('.')]+'Backup'+date+'.n3','wb'))
43+
g.serialize(format='n3', destination=open(rdfFileName[:rdfFileName.index('.')]+'Backup'+timeStamp+'.n3','wb'))
4144

4245
#creating dict of existing labels for comparison
4346
q = prepareQuery('SELECT ?s ?o WHERE { ?s skos:prefLabel ?o }', initNs = {'skos': SKOS})
@@ -52,11 +55,11 @@
5255
uriNum = int(max(uriNums))
5356

5457
#create log files
55-
f=csv.writer(open(directory+rdfFileName[:rdfFileName.index('.')]+'TriplesAdded'+str(date)+'.csv','wb'))
58+
f=csv.writer(open(os.path.join('triplesAdded', rdfFileName[:rdfFileName.index('.')]+'TriplesAdded'+timeStamp+'.csv'),'wb'))
5659
f.writerow(['label']+['rdfLabel']+['uri']+['date'])
5760

5861
#parse csv data and add triples to graph
59-
with open(directory+fileName) as csvfile:
62+
with open(fileName) as csvfile:
6063
reader = csv.DictReader(csvfile)
6164
for row in reader:
6265
altLabel = row['originalLabel']
@@ -81,28 +84,28 @@
8184
f.writerow([])
8285

8386
#create rdf file
84-
g.serialize(format='n3', destination=open(directory+rdfFileName,'wb'))
87+
g.serialize(format='n3', destination=open(rdfFileName,'wb'))
8588
print 'Original triples count: ', originalTripleCount
8689
print 'Updated triples count: ', len(g)
8790

8891
#extract altLabels and prefLabels to csv for find and replace operations
89-
f=csv.writer(open(directory+rdfFileName[:rdfFileName.index('.')]+'FindAndReplace'+str(date)+'.csv','wb'))
92+
f=csv.writer(open(os.path.join('findAndReplace', rdfFileName[:rdfFileName.index('.')]+'FindAndReplace'+timeStamp+'.csv'),'wb'))
9093
f.writerow(['replacedValue']+['replacementValue'])
9194
q = prepareQuery('SELECT ?altLabel ?prefLabel WHERE { ?s skos:prefLabel ?prefLabel. ?s skos:altLabel ?altLabel }', initNs = {'skos': SKOS})
9295
results = g.query(q)
9396
for row in results:
9497
f.writerow([row[0].encode('utf-8')]+[row[1].encode('utf-8')])
9598

9699
#extract prefLabels to csv
97-
f=csv.writer(open(directory+'prefLabels'+str(date)+'.csv','wb'))
100+
f=csv.writer(open(os.path.join('prefLabels','prefLabels'+timeStamp+'.csv'),'wb'))
98101
f.writerow(['prefLabel'])
99102
q = prepareQuery('SELECT ?prefLabel WHERE { ?s skos:prefLabel ?prefLabel }', initNs = {'skos': SKOS})
100103
results = g.query(q)
101104
for row in results:
102105
f.writerow([row[0].encode('utf-8')])
103106

104107
#extract all triples to csv
105-
f=csv.writer(open(directory+'allTriples'+str(date)+'.csv','wb'))
108+
f=csv.writer(open(os.path.join('allTriples','allTriples'+timeStamp+'.csv'),'wb'))
106109
f.writerow(['subject']+['predicate']+['object'])
107110
for s, p, o in g:
108111
f.writerow([s.encode('utf-8')]+[p.encode('utf-8')]+[o.encode('utf-8')])

buildRdfFile.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,16 +8,23 @@
88
from rdflib.namespace import RDF, DC, SKOS
99
from rdflib import URIRef, BNode, Literal
1010
from rdflib.plugins.sparql import prepareQuery
11+
import os
1112

1213
parser = argparse.ArgumentParser()
1314
parser.add_argument('-f', '--fileName', help='the CSV file of source data. optional - if not provided, the script will ask for input')
15+
parser.add_argument('-d', '--directory', help='the directory for the input and output files. optional - if not provided, the script will assume null')
1416
args = parser.parse_args()
1517

1618
if args.fileName:
1719
fileName = args.fileName
1820
else:
1921
fileName = raw_input('Enter the file name of the CSV of source data (including \'.csv\'): ')
22+
if args.directory:
23+
directory = args.directory
24+
else:
25+
directory = ''
2026

27+
#only change the working directory when one was supplied; directory falls
#back to '' when -d is omitted and os.chdir('') raises OSError
if directory:
    os.chdir(directory)
2128
startTime = time.time()
2229
date = datetime.datetime.today().strftime('%Y-%m-%d')
2330
nameUriDict = {}
@@ -54,6 +61,12 @@
5461
g.serialize(format='n3', destination=open(fileName[:fileName.index('.')]+'.n3','wb'))
5562
print g.serialize(format='n3')
5663

64+
#extract all triples to csv
65+
f=csv.writer(open('allTriples'+str(date)+'.csv','wb'))
66+
f.writerow(['subject']+['predicate']+['object'])
67+
for s, p, o in g:
68+
f.writerow([s.encode('utf-8')]+[p.encode('utf-8')]+[o.encode('utf-8')])
69+
5770
elapsedTime = time.time() - startTime
5871
m, s = divmod(elapsedTime, 60)
5972
h, m = divmod(m, 60)

rdfFileReconciliation.py

Lines changed: 10 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
from rdflib import URIRef, BNode, Literal
99
from rdflib.plugins.sparql import prepareQuery
1010
import argparse
11+
import os
1112

1213
parser = argparse.ArgumentParser()
1314
parser.add_argument('-r', '--rdfFileName', help='the RDF file to be reconciled against (include the extension). optional - if not provided, the script will ask for input')
@@ -43,6 +44,7 @@ def retrievePrefLabel(uri):
4344
global match
4445
match = [label, str(prefLabel), uri, date]
4546

47+
os.chdir(directory)
4648
startTime = time.time()
4749
date = datetime.datetime.now().strftime('%Y-%m-%d %H.%M.%S')
4850

@@ -61,9 +63,9 @@ def retrievePrefLabel(uri):
6163
#create lists and csv files
6264
completeNearMatches = []
6365
completeExactMatches = []
64-
f=csv.writer(open(directory+'rdfExactMatches'+date+'.csv','wb'))
66+
f=csv.writer(open(os.path.join('reconciliationResults','rdfExactMatches'+date+'.csv'),'wb'))
6567
f.writerow(['originalLabel']+['standardizedLabel']+['uri']+['date'])
66-
f2=csv.writer(open(directory+'rdfNearAndNonMatches'+date+'.csv','wb'))
68+
f2=csv.writer(open(os.path.join('reconciliationResults','rdfNearAndNonMatches'+date+'.csv'),'wb'))
6769
f2.writerow(['originalLabel']+['standardizedLabel']+['uri']+['date'])
6870

6971
#create counters
@@ -73,11 +75,15 @@ def retrievePrefLabel(uri):
7375
nonmatchedNewHeadings = 0
7476

7577
#parse CSV data and compare against existingLabels dict for exact and near matches
76-
with open(directory+fileName) as csvfile:
78+
with open(fileName) as csvfile:
79+
reader = csv.DictReader(csvfile)
80+
rowCount = len(list(reader))
81+
with open(fileName) as csvfile:
7782
reader = csv.DictReader(csvfile)
7883
for row in reader:
7984
label = row['name']
80-
print label
85+
rowCount -= 1
86+
print 'Rows remaining: ', rowCount
8187
newHeadingsCount += 1
8288
preCount = len(completeNearMatches)
8389
for label2, uri in existingLabels.items():
@@ -87,7 +93,6 @@ def retrievePrefLabel(uri):
8793
retrievePrefLabel(uri)
8894
f.writerow([match[0]]+[match[1]]+[match[2]]+[match[3]])
8995
if label not in completeExactMatches:
90-
print '2nd pass', label
9196
for label2, uri in existingLabels.items():
9297
ratio = fuzz.ratio(label, label2)
9398
partialRatio = fuzz.partial_ratio(label, label2)

0 commit comments

Comments
 (0)