Merge pull request #22 from BaiBlanc/master
Pipeline created + basic paraphrase function
BaiBlanc authored Jun 14, 2020
2 parents de8de83 + 1b6a50e commit 06f20a1
Showing 18 changed files with 9,942 additions and 177 deletions.
8 changes: 6 additions & 2 deletions gsoc/anand/pipeline_3/.pipeline_3/generate_url.py
@@ -3,14 +3,18 @@
 import json
 import sys
 import urllib
+from urllib2 import urlopen
 import argparse
 from bs4 import BeautifulSoup


 def get_url(url):
-    """Fuction to extract the http://mappings.dbpedia.org/server/ontology/classes/<some entity>
+    """Function to extract the http://mappings.dbpedia.org/server/ontology/classes/<some entity>
     page link for the given http://mappings.dbpedia.org/index.php/OntologyClass:<some entity>"""
-    page = urllib.request.urlopen(url)
+    try:  # python3
+        page = urllib.request.urlopen(url)
+    except:  # python2
+        page = urlopen(url)
     soup = BeautifulSoup(page, "html.parser")
     link = soup.findAll('a', attrs={"rel": "nofollow"})[0]['href']
     return link
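A caveat on the compatibility pattern this commit introduces: from urllib2 import urlopen runs unconditionally at import time, so under Python 3, where urllib2 no longer exists, the module raises ImportError before the call-site try/except can ever help. A minimal sketch of a guarded import that works on both interpreters (not what the commit does, just one conventional way to write it):

    # Resolve urlopen once, at import time, for whichever interpreter is running.
    try:  # python3
        from urllib.request import urlopen
    except ImportError:  # python2
        from urllib2 import urlopen

    # After this, call sites can use urlopen(url) directly, with no try/except.
    page = urlopen("http://mappings.dbpedia.org/index.php/OntologyClass:Person")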
6 changes: 5 additions & 1 deletion gsoc/anand/pipeline_3/.pipeline_3/get_properties.py
@@ -1,4 +1,5 @@
 import urllib
+from urllib2 import urlopen
 import json
 import sys
 import csv
@@ -17,7 +18,10 @@ class related information and data types as field values in each row.
     - This function also returns a 2D list of the information mentioned above to the calling
     function
     """
-    page = urllib.request.urlopen(url)
+    try:  # python3
+        page = urllib.request.urlopen(url)
+    except:  # python2
+        page = urlopen(url)
     soup = BeautifulSoup(page, "html.parser")
     if(not os.path.isdir(project_name)):
         os.makedirs(project_name)
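For orientation, get_properties scrapes the property table on a mappings-server class page into <project_name>/<output_file> and also returns the rows to the caller. A hypothetical call, mirroring how sentence_and_template_generator.py below invokes it (the class URL and project name are made up for illustration):

    # Fetch the property table for an ontology class and walk its CSV rows.
    rows = get_properties(url="http://mappings.dbpedia.org/server/ontology/classes/Person",
                          project_name="demo_project",
                          output_file="Person.csv")
    for row in rows:
        fields = row.split(',')  # field 1 is the label and field 3 the range/datatype,
                                 # judging by how the generator indexes prop[1] and prop[3]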
183 changes: 99 additions & 84 deletions gsoc/anand/pipeline_3/.pipeline_3/sentence_and_template_generator.py
@@ -1,149 +1,164 @@
 import argparse
-from generate_url import generate_url_spec , generate_url
+from generate_url import generate_url_spec, generate_url
 from get_properties import get_properties
 import urllib
+from urllib2 import urlopen
+import urllib.parse
 from bs4 import BeautifulSoup
 import os
 from tqdm import tqdm

-def rank_check(query,diction,count,original_count):
+
+def rank_check(query, diction, count, original_count):
     query_original = query
-    count = original_count-count
+    count = original_count - count
     ques = " "
     for value in range(count):
-        if(value == 0):
-            ques = ques+"?x "
+        if (value == 0):
+            ques = ques + "?x "
         else:
-            ques = ques+"?x"+str(value+1)+" "
-    query = query.replace("(?a)","(?a)"+ ques) + " order by RAND() limit 100"
-    #print(query)
-    query = urllib.parse.quote_plus(query)
-    url = "https://dbpedia.org/sparql?default-graph-uri=http%3A%2F%2Fdbpedia.org&query="+query+"&format=text%2Fhtml&CXML_redir_for_subjs=121&CXML_redir_for_hrefs=&timeout=30000&debug=on&run=+Run+Query+"
-    #print(url)
+            ques = ques + "?x" + str(value + 1) + " "
+    query = query.replace("(?a)", "(?a)" + ques) + " order by RAND() limit 100"
+    # print(query)
+    try:  # python3
+        query = urllib.parse.quote_plus(query)
+    except:  # python2
+        query = urllib.quote_plus(query)
+    url = "https://dbpedia.org/sparql?default-graph-uri=http%3A%2F%2Fdbpedia.org&query=" + query + "&format=text%2Fhtml&CXML_redir_for_subjs=121&CXML_redir_for_hrefs=&timeout=30000&debug=on&run=+Run+Query+"
+    # print(url)
     page = urllib.request.urlopen(url)
     soup = BeautifulSoup(page, "html.parser")
     total = len(soup.find_all("tr"))
     accum = 0
     for rows in (soup.find_all("tr")):
         for td in rows.find_all("a"):
-            damp=0.85
-            denom = 0
+            damp = 0.85
+            denom = 0
             interaccum = 0
             for a in td:
-                if(a in diction.keys()):
-                    denom+=1
-                    damp*=damp
-                    interaccum+=damp*float(diction[a])
+                if (a in diction.keys()):
+                    denom += 1
+                    damp *= damp
+                    interaccum += damp * float(diction[a])
                 """ print (a.get_text())
                 if(a.get_text() in diction.keys()):
                     print(diction(a.get_text())) """
-            if(denom):
-                interaccum = interaccum/denom
-                accum+=interaccum
-    return float(accum/total)
+            if (denom):
+                interaccum = interaccum / denom
+                accum += interaccum
+    return float(accum / total)
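Taken together, rank_check widens the answer query with a chain of ?x variables, samples up to 100 result rows from the endpoint (order by RAND() limit 100), and scores each anchor cell with a damping scheme reminiscent of PageRank: every dictionary hit squares the damping factor (damp *= damp, so 0.85, 0.7225, 0.5220...), while denom averages the contributions per cell. (Note that the page fetch here still uses urllib.request.urlopen unguarded, so the Python 2 fallback in this function covers only the quoting step.) Replaying the scoring arithmetic by hand on a fabricated score dictionary:

    # Illustrative only: two tokens of one result cell, both present in diction.
    diction = {"dbr:Berlin": "0.9", "dbr:Germany": "0.4"}   # fabricated scores
    damp, denom, interaccum = 0.85, 0, 0
    for token in ["dbr:Berlin", "dbr:Germany"]:
        if token in diction:
            denom += 1
            damp *= damp                           # 0.85 -> 0.7225 -> 0.5220...
            interaccum += damp * float(diction[token])
    cell_score = interaccum / denom                # ~0.4295 for this cell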


-def check_query(log,query):
+def check_query(log, query):
     query_original = query
-    query = urllib.parse.quote(query)
-    url = "https://dbpedia.org/sparql?default-graph-uri=http%3A%2F%2Fdbpedia.org&query="+query+"&format=text%2Fhtml&CXML_redir_for_subjs=121&CXML_redir_for_hrefs=&timeout=30000&debug=on&run=+Run+Query+"
-    #print(url)
-    page = urllib.request.urlopen(url)
+    try:  # python3
+        query = urllib.parse.quote_plus(query)
+    except:  # python2
+        query = urllib.quote_plus(query)
+    url = "https://dbpedia.org/sparql?default-graph-uri=http%3A%2F%2Fdbpedia.org&query=" + query + "&format=text%2Fhtml&CXML_redir_for_subjs=121&CXML_redir_for_hrefs=&timeout=30000&debug=on&run=+Run+Query+"
+    # print(url)
+    try:  # python3
+        page = urllib.request.urlopen(url)
+    except:  # python2
+        page = urlopen(url)
     soup = BeautifulSoup(page, "html.parser")
-    #print((soup.text))
-    if(soup.text=="false"):
+    # print((soup.text))
+    if (soup.text == "false"):
         log.error(url)
-        log.error(query_original )
+        log.error(query_original)
         return False
-    elif(soup.text=="false"):
-        #print(query_original)
+    elif (soup.text == "true"):  # the endpoint answers "true" when the ASK pattern exists
+        # print(query_original)
         return True
     else:
         log.error("Broken Link")
         log.error(url)
-        log.error(query_original )
+        log.error(query_original)
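check_query relies on the literal body Virtuoso returns when a SPARQL ASK query is rendered as text, which is why it compares soup.text against "false" and "true". A sketch of the same check done with the endpoint's JSON results format, which avoids HTML parsing entirely; this is a plausible Python 3 alternative, not what the commit ships:

    import json
    import urllib.parse
    import urllib.request

    def ask_dbpedia(query):
        """Return the boolean answer of a SPARQL ASK query against DBpedia."""
        url = ("https://dbpedia.org/sparql?query=" + urllib.parse.quote_plus(query)
               + "&format=application%2Fsparql-results%2Bjson")
        with urllib.request.urlopen(url) as page:
            return json.load(page)["boolean"]  # ASK responses carry one top-level boolean

    # e.g. ask_dbpedia("ask where { ?a dbo:birthPlace [] }") -> True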



-def sentence_and_template_generator(log,mother_ontology,vessel,prop,project_name,output_file,diction,original_count=0,count=0, suffix = " of <A> ?", query_suffix = ""):
-    if(type(prop)==str):
+def sentence_and_template_generator(log, mother_ontology, vessel, prop, project_name, output_file, diction,
+                                    original_count=0, count=0, suffix=" of <A> ?", query_suffix=""):
+    if (type(prop) == str):
         prop = prop.split(',')
     original_count = count
     natural_language_question = []
     sparql_query = []
-    question_form = open("../utility/question_form.csv",'r').readlines()
-    question_starts_with =question_form[0].split(',')
+    question_form = open("../utility/question_form.csv", 'r').readlines()
+    question_starts_with = question_form[0].split(',')
     query_starts_with = question_form[1].split(',')
     query_ends_with = question_form[2].split(',')
-    question_number=[2]
-    if(prop[3]=="owl:Thing" or prop[3]=="xsd:string"):
-        question_number=[2,4]
-    elif(prop[3]=="Place"):
-        question_number=[3,4]
-    elif(prop[3]=="Person"):
-        question_number=[1,4]
-    elif(prop[3]=="xsd:date" or "date" in prop[3] or "year" in prop[3].lower() or "date" in prop[3].lower() or "time" in prop[3].lower() ):
-        question_number=[0,4,5]
-    elif(prop[3]=="xsd:nonNegativeInteger" or "negative" in prop[3].lower() ):
-        question_number=[2,6]
-    elif(prop[3]=="xsd:integer" or "integer" in prop[3].lower() ):
-        question_number=[2,6]
+    question_number = [2]
+    if (prop[3] == "owl:Thing" or prop[3] == "xsd:string"):
+        question_number = [2, 4]
+    elif (prop[3] == "Place"):
+        question_number = [3, 4]
+    elif (prop[3] == "Person"):
+        question_number = [1, 4]
+    elif (prop[3] == "xsd:date" or "date" in prop[3] or "year" in prop[3].lower() or "date" in prop[
+            3].lower() or "time" in prop[3].lower()):
+        question_number = [0, 4, 5]
+    elif (prop[3] == "xsd:nonNegativeInteger" or "negative" in prop[3].lower()):
+        question_number = [2, 6]
+    elif (prop[3] == "xsd:integer" or "integer" in prop[3].lower()):
+        question_number = [2, 6]
     else:
-        question_number=[2]
+        question_number = [2]

     val = (generate_url_spec(prop[0]))
     prop_link = val[0]
-    if(prop_link=="None" or prop_link== None):
+    if (prop_link == "None" or prop_link == None):
         return
     derived = val[1]
-    prop_link = "dbo:"+prop_link.strip().split('http://dbpedia.org/ontology/')[-1]
+    prop_link = "dbo:" + prop_link.strip().split('http://dbpedia.org/ontology/')[-1]

     for number in question_number:
-        natural_language_question.append(question_starts_with[number]+prop[1]+ suffix)
-        sparql_query.append(query_starts_with[number]+"where { <A> "+ query_suffix + prop_link +" ?x "+ query_ends_with[number])
+        natural_language_question.append(question_starts_with[number] + prop[1] + suffix)
+        sparql_query.append(
+            query_starts_with[number] + "where { <A> " + query_suffix + prop_link + " ?x " + query_ends_with[number])

-    if(query_suffix==""):
-        query_answer = ("select distinct(?a) where { ?a "+prop_link+" [] } ")
-    else :
-        query_answer = ("select distinct(?a) where { ?a "+query_suffix.split(" ")[0]+" [] . ?a "+query_suffix +" "+ prop_link +" ?x } ")
+    if (query_suffix == ""):
+        query_answer = ("select distinct(?a) where { ?a " + prop_link + " [] } ")
+    else:
+        query_answer = ("select distinct(?a) where { ?a " + query_suffix.split(" ")[
+            0] + " [] . ?a " + query_suffix + " " + prop_link + " ?x } ")

-    if(query_suffix==""):
-        flag = (check_query(log=log,query =query_answer.replace("select distinct(?a)","ask")))
-    else :
-        flag = (check_query(log=log,query= query_answer.replace("select distinct(?a)","ask")))
-    if(not flag):
+    if (query_suffix == ""):
+        flag = (check_query(log=log, query=query_answer.replace("select distinct(?a)", "ask")))
+    else:
+        flag = (check_query(log=log, query=query_answer.replace("select distinct(?a)", "ask")))
+    if (not flag):
         return
-    rank = rank_check(diction=diction,count=count, query=query_answer,original_count=original_count)
+
+    rank = rank_check(diction=diction, count=count, query=query_answer, original_count=original_count)

     count = count - 1
-    if(count == 0):
+    if (count == 0):
         variable = "?x"
     else:
-        variable = "?x"+ str(count)
-    query_suffix = prop_link + " "+variable+" . "+variable+" "
+        variable = "?x" + str(count)
+    query_suffix = prop_link + " " + variable + " . " + variable + " "

     for number in range(len(natural_language_question)):
-        vessel.append([mother_ontology,"","",natural_language_question[number],sparql_query[number],query_answer])
-        output_file.write((';'.join(vessel[-1])+";"+str(rank)+"\n").replace("  "," "))
-        log.info(';'.join(vessel[-1])+str(rank)+"\n")
-        #print(str(natural_language_question)+"\n"+str(sparql_query)+"\n"+query_answer+"\n*************")
-    suffix = " of "+ prop[1] +" of <A> ?"
-    if(count>0):
+        vessel.append([mother_ontology, "", "", natural_language_question[number], sparql_query[number], query_answer])
+        output_file.write((';'.join(vessel[-1]) + ";" + str(rank) + "\n").replace("  ", " "))
+        log.info(';'.join(vessel[-1]) + str(rank) + "\n")
+        # print(str(natural_language_question)+"\n"+str(sparql_query)+"\n"+query_answer+"\n*************")
+
+    suffix = " of " + prop[1] + " of <A> ?"
+
+    if (count > 0):
         print(prop[3].split(":")[-1])
         val = generate_url(prop[3].split(":")[-1])
         url = val[0]
-        if(not url.startswith("http://mappings.dbpedia.org")):
+        if (not url.startswith("http://mappings.dbpedia.org")):
             return
-        list_of_property_information = get_properties(url=url,project_name=project_name,output_file =prop[1]+".csv" )
+        list_of_property_information = get_properties(url=url, project_name=project_name, output_file=prop[1] + ".csv")
         for property_line in tqdm(list_of_property_information):
             prop_inside = property_line.split(',')
-            sentence_and_template_generator(log=log,original_count=original_count,diction=diction,output_file=output_file, mother_ontology=mother_ontology,vessel=vessel,prop=prop_inside, suffix = suffix,count = count, project_name=project_name, query_suffix = query_suffix )
+            sentence_and_template_generator(log=log, original_count=original_count, diction=diction,
+                                            output_file=output_file, mother_ontology=mother_ontology, vessel=vessel,
+                                            prop=prop_inside, suffix=suffix, count=count, project_name=project_name,
+                                            query_suffix=query_suffix)
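The recursion bottoms out through count: each level appends question/template pairs for the current property, then, while count > 0, looks up the property's range class on the mappings server and recurses over that class's properties with the extended query_suffix, producing chained "of <B> of <A>" questions. A sketch of a top-level call, with every concrete value fabricated for illustration (the real pipeline wires in its own logger, ontology class, and score dictionary):

    import logging

    log = logging.getLogger("pipeline_3")
    vessel = []   # accumulates [ontology, "", "", question, query, answer_query] rows
    diction = {}  # entity -> score map consumed by rank_check
    with open("person_templates.csv", "w") as output_file:
        sentence_and_template_generator(
            log=log,
            mother_ontology="dbo:Person",
            vessel=vessel,
            prop="birthPlace,birth place,,Place",  # fields 0-3 as the code indexes them
            project_name="demo_project",
            output_file=output_file,
            diction=diction,
            count=2)                               # depth 2 => up to two-hop questions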


if __name__ == "__main__":
@@ -154,7 +169,7 @@ def sentence_and_template_generator(log,mother_ontology,vessel,prop,project_name
     requiredNamed = parser.add_argument_group('Required Arguments')

     requiredNamed.add_argument('--prop', dest='prop', metavar='prop',
-                        help='prop: person, place etc.', required=True)
+                               help='prop: person, place etc.', required=True)
     args = parser.parse_args()
     prop = args.prop
     sentence_and_template_generator(prop=prop)
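One thing the visible fragment leaves unresolved: the __main__ block still calls sentence_and_template_generator(prop=prop) alone, while the updated signature also requires log, mother_ontology, vessel, output_file, and diction, so this CLI path would raise a TypeError as shown (the collapsed lines above may hide further setup). The intended invocation, per the argparse wiring that is visible:

    python sentence_and_template_generator.py --prop person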
@@ -8,7 +8,7 @@


 def get_url(url):
-    """Fuction to extract the http://mappings.dbpedia.org/server/ontology/classes/<some entity>
+    """Function to extract the http://mappings.dbpedia.org/server/ontology/classes/<some entity>
     page link for the given http://mappings.dbpedia.org/index.php/OntologyClass:<some entity>"""
     page = urllib.request.urlopen(url)
     soup = BeautifulSoup(page, "html.parser")
