forked from EnnoMeijers/ner-not-pipeline
-
Notifications
You must be signed in to change notification settings - Fork 0
/
ner-not.py
141 lines (116 loc) · 3.84 KB
/
ner-not.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
import csv
import json
import sys

import spacy
from python_graphql_client import GraphqlClient
from spacy.matcher import Matcher
# --- Script setup: CLI argument, configuration, input text, API client ---

# The input text file must be given as the first CLI argument.
if len(sys.argv) == 1:
    print("Please give filename of textfile to process...")
    sys.exit()
filename = sys.argv[1]

# Load the source-selection configuration (maps NER types to term sources).
# Fixed: the original opened config.json without ever closing the handle;
# a `with` block guarantees the file is closed.
with open('config.json') as configFile:
    config = json.load(configFile)

# Read the full text to analyse.
with open(filename) as f:
    text = f.read()

# Specify the Network-of-Terms GraphQL API
client = GraphqlClient(endpoint="https://termennetwerk-api.netwerkdigitaalerfgoed.nl/graphql")
def queryTN(sources, searchTerm):
    """Search the Network of Terms for *searchTerm* in the given sources.

    sources: list of source IDs to query.
    searchTerm: the literal term to look up.
    Returns the raw GraphQL response as a dict (synchronous request).
    """
    gql = """
    query tn($sources: [ID]!, $searchTerm: String!) {
      terms( sources: $sources, query: $searchTerm ) {
        result {
          __typename
          ... on Terms {
            terms { uri prefLabel altLabel hiddenLabel scopeNote seeAlso }
          }
          ... on Error {
            message
          }
        }
      }
    }
    """
    variables = {"sources": sources, "searchTerm": searchTerm}
    # Perform a synchronous request for simplicity.
    return client.execute(query=gql, variables=variables)
def matchLabel(labels, searchLabel):
    """Return the first label in *labels* equal to *searchLabel*,
    comparing case-insensitively and ignoring surrounding whitespace.

    Returns the matching label as stored in *labels* (original casing),
    or False when no label matches (False kept for caller compatibility).
    """
    # Fixed: the original normalized only the candidate labels, so a
    # search term containing any uppercase character could never match.
    # Normalize the search term once, outside the loop.
    target = searchLabel.strip().lower()
    for label in labels:
        if label.strip().lower() == target:
            return label
    return False
def Refine(ner, nerType):
    """Look up a Network-of-Terms record for *ner*.

    nerType selects the source list from the config file; returns the
    first term whose prefLabel or altLabel matches *ner*, or False when
    the type is not configured or nothing matches.
    """
    # Only process NER types that are defined in the config file.
    if nerType not in config:
        return False

    # Use the source selection from config.json and query the API.
    sources = config[nerType]
    response = queryTN(sources, ner)

    # Walk the result list per source; skip Error results.
    for sourceResult in response['data']['terms']:
        result = sourceResult['result']
        if result['__typename'] != "Terms":
            continue
        for term in result['terms']:
            # prefLabel is checked first, then altLabel, as before.
            if matchLabel(term['prefLabel'], ner) or matchLabel(term['altLabel'], ner):
                return term
    return False
def processKeywords():
    """Match every noun in the parsed document against CONCEPT sources.

    Reads the module globals `doc` (parsed spaCy document) and mutates
    `termList`, adding found term records keyed by the token text.
    Prints one progress dot per lookup.
    """
    # Removed the unused local `token_details` from the original.
    print("Processing keywords: ", end="")
    for token in doc:
        # Only nouns are treated as candidate keywords.
        if token.pos_ != "NOUN":
            continue
        if token.text not in termList:
            print(".", end="", flush=True)
            termFound = Refine(token.text, "CONCEPT")
            if termFound:
                termList[token.text] = termFound
    # Fixed typo in the progress message ("finshed" -> "finished"),
    # consistent with processNERs.
    print("\nKeywords processing finished!")
def processNERs():
    """Match named entities of the document against configured sources.

    Deduplicates the document's entities, then looks each one up
    (stripped and lower-cased) and stores matches in the global
    termList. Prints one progress dot per lookup.
    """
    # Collect unique (text, label, explanation) rows for all entities.
    unique_entities = []
    for ent in doc.ents:
        entry = (ent.text, ent.label_, spacy.explain(ent.label_))
        if entry not in unique_entities:
            unique_entities.append(entry)

    print("Processing named entities: ", end="")
    for entry in unique_entities:
        ner = entry[0].strip().lower()
        nerType = entry[1]
        if ner in termList:
            continue
        print(".", end="", flush=True)
        termFound = Refine(ner, nerType)
        if termFound:
            termList[ner] = termFound
    print("\nNER processing finished!")
def writeCSV():
    """Write the collected terms to <inputname>.csv, ';'-separated.

    Columns: searchTerm;URI;prefLabel;altLabel;scopeNote. Label lists
    are joined with ', '. Reads the module globals `filename` and
    `termList`.

    Fixed: the original concatenated fields with ';' by hand, producing
    a broken file whenever a label or scope note contained the delimiter,
    a quote, or a newline. The stdlib csv module escapes these correctly.
    """
    outFile = filename.rsplit('.', 1)[0] + '.csv'
    # newline='' is required by the csv module so it controls line endings.
    with open(outFile, "w", newline='') as fileHandle:
        writer = csv.writer(fileHandle, delimiter=';')
        writer.writerow(['searchTerm', 'URI', 'prefLabel', 'altLabel', 'scopeNote'])
        for term in termList:
            record = termList[term]
            writer.writerow([
                term,
                record['uri'],
                ', '.join(record['prefLabel']),
                ', '.join(record['altLabel']),
                ', '.join(record['scopeNote']),
            ])
    print("Results written to", outFile)
# --- Main script flow (this file is a plain script; runs top to bottom) ---
# Load the large Dutch spaCy language model (must be installed separately).
nlp = spacy.load("nl_core_news_lg")
# Parse the text read from the specified input file.
doc = nlp(text)
# Global result map: matched search term -> Network-of-Terms term record,
# filled by processKeywords() and processNERs() below.
termList = {}
# Find relevant concept URIs based on the nouns in the text.
processKeywords()
# Find relevant URIs for locations and persons based
# on the named entities in the text.
processNERs()
# Write the result list out as a semicolon-separated CSV file.
writeCSV()