building the complete pipeline for reranking
cristinae committed Aug 28, 2018
1 parent cc8fba0 commit 223d24e
Showing 5 changed files with 129 additions and 53 deletions.
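Taken together, the changes wire feature extraction (features.py), feature preprocessing, and the trained XGBoost model (tradQueriesEmbeddings.py) into one reranking path: for every BPE subunit to translate, the n-best candidates from the English embedding space are turned into one csv feature row each, scaled like the training data, scored by the classifier, and the highest-scoring candidate wins. A condensed sketch of that flow using the names introduced below; rerankTranslation itself is a hypothetical wrapper, not a function in this commit:

import features
# preprocessingRead and predictBestTrad are defined in tradQueriesEmbeddings.py below

def rerankTranslation(proc, subunit, vector, isSubWord, bothBPE):
    # n-best translation candidates for the subunit, as (word, similarity) pairs
    enSubunits = proc.embeddingEn.similar_by_vector(vector, topn=1000)
    allFeats = features.getHeader()              # shared csv header
    for cand, _sim in enSubunits:                # one feature row per candidate
        allFeats = allFeats \
            + features.basicFeatures(subunit, 'xx', cand, 'en', isSubWord, bothBPE) \
            + features.extractSemFeatures(subunit, cand, 'en', 40000, proc) \
            + features.extractLexFeatures(subunit, cand) + '\n'
    df = preprocessingRead(allFeats)             # csv string -> scaled dataframe
    return enSubunits[predictBestTrad(df)][0]    # reranker picks the best candidate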
34 changes: 24 additions & 10 deletions scripts/features.py
@@ -18,17 +18,19 @@
import phonetics

bpeMark = '@@'
+header = 'Gold,w1,L1,w2,L2,srcSubUnit,bothBPEmark,WEsim,rankW2,simRankt1,simRankWnext,simRankt10,simRankt100,l1,l2,l1/l2,lev,cosSimN2,cosSimN3,cosSimN4,levM2\n'


+def getHeader():
+    return header

def basicFeatures(w1, l1, w2, l2, isSubWord, bothBPE):
'''
Creates a string in csv format with the basic features
'''
return w1+","+l1+","+ w2+","+l2+","+isSubWord+","+bothBPE+","



-def extractSemFeatures(w1, w2, l2, proc):
+def extractSemFeatures(w1, w2, l2, nexplore, proc):
'''
Extracts the set of semantic features related to word embeddings for a pair of words (w1, w2)
Returns a string in csv format with the features
Expand Down Expand Up @@ -61,14 +63,10 @@ def extractSemFeatures(w1, w2, l2, proc):
newSpace.add(w1,vector,replace=True)
if w2 in newSpace.vocab:
w2Rank = newSpace.rank(w1,w2)
-toprank = newSpace.similar_by_vector(vector,topn=explore)
sim = newSpace.similarity(w1,w2)
-simRankt1 = toprank[w2Rank-1][1] - toprank[0][1] # how far in similarity is w2 to top1
-simRanktnext = toprank[w2Rank-1][1] - toprank[w2Rank][1] # how far in similarity is w2 to the next word
-simRankt10 = toprank[w2Rank-1][1] - toprank[9][1] # how far in similarity is w2 to top10
-simRankt100 = toprank[w2Rank-1][1] - toprank[99][1] # how far in similarity is w2 to top100
-simsRankW2 = str(sim)+','+str(w2Rank)+','+str(simRankt1)+','+str(simRanktnext)+','+str(simRankt10)+','+str(simRankt100)+','

+toprank = newSpace.similar_by_vector(vector,topn=nexplore)
+simsRankW2 = extractSimDiffFeats(w2Rank, toprank)
+simsRankW2 = str(sim)+','+simsRankW2
else:
return noSimsRank

@@ -78,6 +76,22 @@ def extractSemFeatures(w1, w2, l2, proc):
return simsRankW2


+def extractSimDiffFeats(rankDif, toprank):
+    """ Extracts the subset of semantic features related to differences in similarities between
+    words and translations
+    """
+
+    #toprank = mlweSpace.similar_by_vector(vector,topn=nexplore)
+    simRankt1 = toprank[rankDif-1][1] - toprank[0][1] # how far in similarity is w2 to top1
+    simRanktnext = toprank[rankDif-1][1] - toprank[rankDif][1] # how far in similarity is w2 to the next word
+    simRankt10 = toprank[rankDif-1][1] - toprank[9][1] # how far in similarity is w2 to top10
+    simRankt100 = toprank[rankDif-1][1] - toprank[99][1] # how far in similarity is w2 to top100
+
+    simsRank = str(rankDif)+','+str(simRankt1)+','+str(simRanktnext)+','+str(simRankt10)+','+str(simRankt100)+','
+
+    return simsRank


def extractLexFeatures(w1, w2):
'''
Extracts the set of features for a pair of words (w1, w2)
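extractSimDiffFeats expects toprank in the shape gensim's similar_by_vector returns, a list of (word, cosine similarity) pairs sorted by decreasing similarity, and implicitly needs topn >= 100 and rankDif below the list length for the indexing to stay in bounds. A toy check on synthetic data, not project data:

import features

# synthetic ranking: similarity decays linearly with the position
toprank = [('w%d' % i, 1.0 - 0.001*i) for i in range(40000)]
print(features.extractSimDiffFeats(5, toprank))
# -> '5,-0.004,0.001,0.005,0.095,' up to floating-point noise:
#    the rank plus the similarity gaps to top1, the next word, top10 and top100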
2 changes: 1 addition & 1 deletion scripts/load.py
@@ -47,7 +47,7 @@ def __init__(self, modelPath):
print("..Multilingual lexicon loaded. Using "+str(mem)+" MB of memory")
self.ctDict = ctDict

-# Load BPE model
+# Load BPE model
self.bpe = easyBPE.BPE(self.BPEcodes)
mem=resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
print("..BPE model loaded. Using "+str(mem)+" MB of memory")
19 changes: 7 additions & 12 deletions scripts/reranker/extractFeaturesLexicon.py
@@ -30,7 +30,6 @@

bpeMark = '@@'
emptyMark = 'EMPTY'
-header = 'Gold,w1,L1,w2,L2,srcSubUnit,bothBPEmark,WEsim,rankW2,simRankt1,simRankWnext,simRankt10,simRankt100,l1,l2,l1/l2,lev,cosSimN2,cosSimN3,cosSimN4,levM2\n'

# for debugging
countUp = 0
@@ -116,20 +115,16 @@ def findSimsNonTrad(w1, w2, l2, proc):
print("kk")

# since all calculations are ready here we use them to estimate WE-related features
toprank = newSpace.similar_by_vector(vector,topn=explore)
# for the negative example
sim = newSpace.similarity(w1,nonTrad)
-simRankt1 = toprank[rank-1][1] - toprank[0][1] # how far in similarity is noTrad to top1
-simRanktnext = toprank[rank-1][1] - toprank[rank][1] # how far in similarity is noTrad to the next word
-simRankt10 = toprank[rank-1][1] - toprank[9][1] # how far in similarity is noTrad to top10
-simRankt100 = toprank[rank-1][1] - toprank[99][1] # how far in similarity is noTrad to top100
-simsRankNoTrad = str(sim)+','+str(rank)+','+str(simRankt1)+','+str(simRanktnext)+','+str(simRankt10)+','+str(simRankt100)+','
+simsRankNoTrad = extractSimDiffFeats(rank, toprank)
+simsRankNoTrad = str(sim)+','+simsRankNoTrad

# for the positive example
sim = newSpace.similarity(w1,w2)
-simRankt1 = toprank[w2Rank-1][1] - toprank[0][1] # how far in similarity is w2 to top1
-simRanktnext = toprank[w2Rank-1][1] - toprank[w2Rank][1] # how far in similarity is w2 to the next word
-simRankt10 = toprank[w2Rank-1][1] - toprank[9][1] # how far in similarity is w2 to top10
-simRankt100 = toprank[w2Rank-1][1] - toprank[99][1] # how far in similarity is w2 to top100
-simsRankW2 = str(sim)+','+str(w2Rank)+','+str(simRankt1)+','+str(simRanktnext)+','+str(simRankt10)+','+str(simRankt100)+','
+simsRankW2 = extractSimDiffFeats(w2Rank, toprank)
+simsRankW2 = str(sim)+','+simsRankW2

else:
return noSimsRank,emptyMark,noSimsRank
@@ -173,7 +168,7 @@ def main(inF):

outF = inF+'.feat'
fOUT = open(outF, 'w')
-fOUT.write(header)
+fOUT.write(features.getHeader())
# Read the quad-lexicon
with open(inF) as f:
for line in f:
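With the csv header now defined once in features.py, the .feat files written here stay in sync with whatever the feature extractor produces. A quick round-trip sanity check; lexicon.feat stands for any file produced by this script:

import pandas as pd
import features

df = pd.read_csv('lexicon.feat')   # first line is features.getHeader()
assert list(df.columns) == features.getHeader().rstrip('\n').split(',')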
21 changes: 12 additions & 9 deletions scripts/reranker/trainReranker.py
@@ -90,7 +90,7 @@ def main(inF, path):
X = df4ML.loc[:, feature_cols]
y = df4ML.Gold
clf = XGBClassifier(max_depth=6, n_estimators=300, learning_rate=0.05)
-# TODO: fit parameters
+# TODO: fit parameters of XGB
#clf = MLPClassifier(activation='relu', alpha=1e-05, batch_size='auto',
# beta_1=0.9, beta_2=0.999, early_stopping=False, epsilon=1e-08, hidden_layer_sizes=(8, 2), learning_rate='constant',
# learning_rate_init=0.001, max_iter=200, momentum=0.9, nesterovs_momentum=True, power_t=0.5, random_state=1, shuffle=True,
@@ -110,14 +110,6 @@ def main(inF, path):
plot_importance(clf)
plt.show()

-#thresholds = np.sort(clf.feature_importances_)
-#for thresh in thresholds:
-# select features using threshold
-# selection = SelectFromModel(clf, threshold=thresh, prefit=True)
-# select_X = selection.transform(X)
-# scores = cross_validate(clf, select_X, y, scoring=scoring, cv=5, return_train_score=False)
-# print("Thresh=%.3f, n=%d, Accuracy: %0.4f (+/- %0.4f)" % (thresh, select_X.shape[1], scores['test_accuracy'].mean(), scores['test_accuracy'].std()*2))



if __name__ == "__main__":
@@ -129,3 +121,14 @@ def main(inF, path):
main(sys.argv[1], scriptPath)




+#thresholds = np.sort(clf.feature_importances_)
+#for thresh in thresholds:
+# select features using threshold
+# selection = SelectFromModel(clf, threshold=thresh, prefit=True)
+# select_X = selection.transform(X)
+# scores = cross_validate(clf, select_X, y, scoring=scoring, cv=5, return_train_score=False)
+# print("Thresh=%.3f, n=%d, Accuracy: %0.4f (+/- %0.4f)" % (thresh, select_X.shape[1], scores['test_accuracy'].mean(), scores['test_accuracy'].std()*2))


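The commented-out block above sketches feature selection: prune features whose learned importance falls below a rising threshold and cross-validate the classifier on each reduced set. A self-contained version of that idea under stated assumptions — make_classification stands in for the real feature table, and a smaller n_estimators keeps the sweep quick:

import numpy as np
from sklearn.datasets import make_classification
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import cross_validate
from xgboost import XGBClassifier

X, y = make_classification(n_samples=500, n_features=20, random_state=1)
clf = XGBClassifier(max_depth=6, n_estimators=50, learning_rate=0.05).fit(X, y)

for thresh in np.unique(clf.feature_importances_):
    selection = SelectFromModel(clf, threshold=thresh, prefit=True)  # keep features >= thresh
    select_X = selection.transform(X)
    scores = cross_validate(clf, select_X, y, scoring=['accuracy'], cv=5)
    print("Thresh=%.3f, n=%d, Accuracy: %0.4f (+/- %0.4f)" % (thresh, select_X.shape[1],
          scores['test_accuracy'].mean(), scores['test_accuracy'].std()*2))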
106 changes: 85 additions & 21 deletions scripts/tradQueriesEmbeddings.py
@@ -12,14 +12,66 @@
import sys
import os.path
import unicodedata
+from io import StringIO

import load
import easyBPE
import features
from gensim.models import KeyedVectors

+import numpy as np
+import pandas as pd
+import xgboost as xgb   # predictBestTrad() below uses xgb.Booster and xgb.DMatrix

+trainingSet = 'untradDEallkeys.solr.all-languages'
bpeMark = '@@'

+def preprocessingRead(data):
+    """
+    Basic preprocessing for most ML algorithms: binarisation, normalisation, scaling
+    Reads the scaler saved at training time and only applies it (test usage)
+    """
+
+    global modelPath
+
+    # extract the data into a dataframe
+    df = pd.read_csv(StringIO(data))
+
+    # convert the categorical column into four binary columns, one per language
+    df4ML = df.join(pd.get_dummies(df['L2'],prefix='L2'))
+
+    # scale columns
+    # rankW2 has huge values over a wide interval; we cut at 1000 and/or could move to a log scale
+    df4ML['rankW2'] = df4ML['rankW2'].apply(lambda x: 1000 if x>1000 else x)
+    #df4ML['rankW2'] = df4ML['rankW2'].apply(lambda x: 0 if x<= 0 else math.log10(x))
+
+    #columns2scale = ['rankW2','WEsim','l1','l2','l1/l2','lev','cosSimN2','cosSimN3','cosSimN4','levM2']
+    columns2scale = ['rankW2','l1','l2','l1/l2','lev','levM2','simRankt1','simRankWnext','simRankt10','simRankt100']
+    scaler = joblib.load(modelPath+'reranker/'+trainingSet+'.scaler.pkl')
+    # transform, not fit_transform: the scaler was fitted on the training set
+    df4ML[columns2scale] = scaler.transform(df4ML[columns2scale])
+
+    return df4ML


+def predictBestTrad(df):
+    """
+    Scores the n-best candidates with the trained reranker and returns
+    the index of the most probable translation
+    """
+
+    feature_cols = ['L2_de','L2_en','L2_es','L2_fr','srcSubUnit','bothBPEmark','WEsim','rankW2','simRankt1','simRankWnext','simRankt10','simRankt100','l1','l2','l1/l2','lev','cosSimN2','cosSimN3','cosSimN4','levM2']
+    nbest = df.loc[:, feature_cols]
+
+    clf = xgb.Booster() # init model
+
+    # Load previously trained model
+    clf.load_model(modelPath+'reranker/'+trainingSet+'.model')
+
+    # make prediction, a probability by default with XGB; Booster.predict needs a DMatrix
+    nbestProbs = clf.predict(xgb.DMatrix(nbest))
+    indexTrad = np.argmax(nbestProbs)
+
+    return indexTrad


def rreplace(s, old, new, occurrence):
""" Replace last occurrence of a substring in a string
https://stackoverflow.com/questions/2556108/rreplace-how-to-replace-the-last-occurrence-of-an-expression-in-a-string
Expand All @@ -35,10 +87,9 @@ def percentage2d(part, whole):
return "0"

def remove_diacritic(input):
-'''
-Accept a unicode string, and return a normal string (bytes in Python 3)
+""" Accept a unicode string, and return a normal string (bytes in Python 3)
without any diacritical marks.
-'''
+"""
return unicodedata.normalize('NFKD', input).encode('ASCII', 'ignore')

def cleanEndString(toClean):
@@ -124,43 +175,55 @@ def translate(string, proc):
stringTrad = stringTrad + extractTradFromDict(toTrad, capitalized, stringTrad, ctDict)
else:
# if not, we look for the closest translation(s) in the embeddings space
-isSubWord = 0
+isSubWord = '0'
bped = easyBPE.applyBPE(proc.bpe, word)
if len(bped) >1:
-isSubWord = 1
+isSubWord = '1'
for subunit in bped:
print(subunit)
vector = proc.embeddingL1[subunit]
#allSubunit = proc.embeddingL1.similar_by_vector(vector,topn=5)

enSubunits = proc.embeddingEn.similar_by_vector(vector,topn=1000)
+allFeats = features.getHeader()
for subunitTrad in enSubunits:
# populate for a dataframe with the n-best list
w2 = subunitTrad[0]  # the candidate word from the (word, similarity) tuple
bothBPE = '0'
if bpeMark in subunit and bpeMark in w2:
bothBPE = '1'
basicFeats = features.basicFeatures(subunit,'xx', w2, 'en', isSubWord, bothBPE)
-semFeats = features.extractSemFeatures(subunit, w2, 'en', proc)
+semFeats = features.extractSemFeatures(subunit, w2, 'en', 40000, proc)
lexFeats = features.extractLexFeatures(subunit, w2)
#featureString = TODO: the same cleaning as in training must be applied here

-esSubunit = proc.embeddingEs.similar_by_vector(vector,topn=2)
-deSubunit = proc.embeddingDe.similar_by_vector(vector,topn=2)
-frSubunit = proc.embeddingFr.similar_by_vector(vector,topn=2)
-print(enSubunits)
-print(esSubunit)
-print(deSubunit)
-print(frSubunit)
+allFeats = allFeats + basicFeats+semFeats+lexFeats+'\n'
+# create preprocessed data frame
+df = preprocessingRead(allFeats)
+indexTrad = predictBestTrad(df)
+trad = enSubunits[indexTrad][0]
+print(trad)




+#esSubunit = proc.embeddingEs.similar_by_vector(vector,topn=2)
+#deSubunit = proc.embeddingDe.similar_by_vector(vector,topn=2)
+#frSubunit = proc.embeddingFr.similar_by_vector(vector,topn=2)
+#print(enSubunits)
+#print(esSubunit)
+#print(deSubunit)
+#print(frSubunit)
# we need to reconstruct BPE

return stringTrad


-def main(inF, outF):
+def main(inF, scriptPath):

modelPath = "../models/"
global modelPath
modelPath = scriptPath+"../models/"
# Initialise a new process for translation, loading the models
proc = load.QueryTrad(modelPath)

+outF = inF+'trad'
# Read the queries from file
fOUT = open(outF, 'w')
with open(inF) as f:
@@ -190,11 +253,12 @@ def main(inF, outF):

if __name__ == "__main__":

-if len(sys.argv) is not 3:
-sys.stderr.write('Usage: python3 %s inputFile outputFile\n' % sys.argv[0])
+if len(sys.argv) != 2: # compare by value; 'is not' tests identity and is unreliable for ints
+sys.stderr.write('Usage: python3 %s inputFile\n' % sys.argv[0])
sys.exit(1)
print("WARNING: This software needs python >3.6 to run properly\n")
-main(sys.argv[1], sys.argv[2])
+scriptPath = os.path.dirname(os.path.abspath( __file__ ))
+main(sys.argv[1], scriptPath+'/')

# CHECK: source==target doesn't mean untranslated
#numTermTrad = numTerms-numTermsUntrad

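For reference, the Booster prediction path that predictBestTrad relies on: save_model/load_model plus a DMatrix wrapper around the feature table, with argmax picking the winning candidate row. A toy run with throwaway data and a throwaway 'toy.model' path, not the project's model:

import numpy as np
import pandas as pd
import xgboost as xgb

X = pd.DataFrame(np.random.rand(50, 4), columns=['f0', 'f1', 'f2', 'f3'])
y = np.random.randint(0, 2, size=50)
booster = xgb.train({'objective': 'binary:logistic'}, xgb.DMatrix(X, label=y), num_boost_round=10)
booster.save_model('toy.model')

clf = xgb.Booster()
clf.load_model('toy.model')
nbestProbs = clf.predict(xgb.DMatrix(X))   # one probability per candidate row
print(int(np.argmax(nbestProbs)))          # index of the best-scored candidate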