From 223d24e869abd0eac6b154349875f7c52cc2ad2a Mon Sep 17 00:00:00 2001
From: Cristina
Date: Tue, 28 Aug 2018 18:42:31 +0200
Subject: [PATCH] building the complete pipeline for reranking

---
 scripts/features.py                        |  34 +++++--
 scripts/load.py                            |   2 +-
 scripts/reranker/extractFeaturesLexicon.py |  19 ++--
 scripts/reranker/trainReranker.py          |  21 ++--
 scripts/tradQueriesEmbeddings.py           | 106 +++++++++++++++++----
 5 files changed, 129 insertions(+), 53 deletions(-)

diff --git a/scripts/features.py b/scripts/features.py
index 299c6cd..b23fb96 100644
--- a/scripts/features.py
+++ b/scripts/features.py
@@ -18,17 +18,19 @@ import phonetics
 
 bpeMark = '@@'
+header = 'Gold,w1,L1,w2,L2,srcSubUnit,bothBPEmark,WEsim,rankW2,simRankt1,simRankWnext,simRankt10,simRankt100,l1,l2,l1/l2,lev,cosSimN2,cosSimN3,cosSimN4,levM2\n'
+def getHeader():
+    return header
+
 def basicFeatures(w1, l1, w2, l2, isSubWord, bothBPE):
     ''' Creates a string with a cvs format for the basic features
     '''
     return w1+","+l1+","+ w2+","+l2+","+isSubWord+","+bothBPE+","
-
-
-def extractSemFeatures(w1, w2, l2, proc):
+def extractSemFeatures(w1, w2, l2, nexplore, proc):
     ''' Extracts the set of semantic features related to word embeddings for a pair of word (w1, w2)
         Returns a string with a cvs format for the features
@@ -61,14 +63,10 @@ def extractSemFeatures(w1, w2, l2, proc):
         newSpace.add(w1,vector,replace=True)
         if w2 in newSpace.vocab:
             w2Rank = newSpace.rank(w1,w2)
-            toprank = newSpace.similar_by_vector(vector,topn=explore)
             sim = newSpace.similarity(w1,w2)
-            simRankt1 = toprank[w2Rank-1][1] - toprank[0][1] # how far in similarity is w2 to top1
-            simRanktnext = toprank[w2Rank-1][1] - toprank[w2Rank][1] # how far in similarity is w2 to the next word
-            simRankt10 = toprank[w2Rank-1][1] - toprank[9][1] # how far in similarity is w2 to top10
-            simRankt100 = toprank[w2Rank-1][1] - toprank[99][1] # how far in similarity is w2 to top100
-            simsRankW2 = str(sim)+','+str(w2Rank)+','+str(simRankt1)+','+str(simRanktnext)+','+str(simRankt10)+','+str(simRankt100)+','
-
+            toprank = newSpace.similar_by_vector(vector,topn=nexplore)
+            simsRankW2 = extractSimDiffFeats(w2Rank, toprank)
+            simsRankW2 = str(sim)+','+simsRankW2
         else:
             return noSimsRank
@@ -78,6 +76,22 @@ def extractSemFeatures(w1, w2, l2, proc):
     return simsRankW2
 
 
+def extractSimDiffFeats(rankDif, toprank):
+    """ Extracts the subset of semantic features related to differences in similarities between
+        words and translations
+    """
+
+    #toprank = mlweSpace.similar_by_vector(vector,topn=nexplore)
+    simRankt1 = toprank[rankDif-1][1] - toprank[0][1] # how far in similarity is w2 to top1
+    simRanktnext = toprank[rankDif-1][1] - toprank[rankDif][1] # how far in similarity is w2 to the next word
+    simRankt10 = toprank[rankDif-1][1] - toprank[9][1] # how far in similarity is w2 to top10
+    simRankt100 = toprank[rankDif-1][1] - toprank[99][1] # how far in similarity is w2 to top100
+
+    simsRank = str(rankDif)+','+str(simRankt1)+','+str(simRanktnext)+','+str(simRankt10)+','+str(simRankt100)+','
+
+    return simsRank
+
+
 def extractLexFeatures(w1, w2):
     ''' Extracts the set of features for a pair of word (w1, w2)
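
A note on the refactored features (illustration, not part of the patch): extractSimDiffFeats measures how far the candidate's cosine similarity is from fixed reference points in the n-best list. A self-contained toy run, with invented (word, similarity) pairs standing in for gensim's similar_by_vector() output:

    # toy n-best list: (word, cosine similarity) pairs in decreasing similarity,
    # standing in for gensim's similar_by_vector() output (values invented)
    toprank = [('wort%d' % i, 0.95 - 0.003 * i) for i in range(150)]
    rankDif = 2   # the candidate translation sits at rank 2 (1-based)

    simRankt1 = toprank[rankDif-1][1] - toprank[0][1]           # gap to top1 (<= 0)
    simRanktnext = toprank[rankDif-1][1] - toprank[rankDif][1]  # gap to the next word (>= 0)
    simRankt10 = toprank[rankDif-1][1] - toprank[9][1]          # gap to top10
    simRankt100 = toprank[rankDif-1][1] - toprank[99][1]        # gap to top100

    print(simRankt1, simRanktnext, simRankt10, simRankt100)
    # approx. -0.003 0.003 0.024 0.294: small gaps to the top mean the candidate
    # is nearly indistinguishable from its neighbours in the embedding space

Because the function indexes toprank[99], callers must request topn >= 100; the patch uses topn=1000 in translate() and nexplore=40000 for extractSemFeatures.
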
Using "+str(mem)+" MB of memory") self.ctDict = ctDict - # Load BPE model + # Load BPE model self.bpe = easyBPE.BPE(self.BPEcodes) mem=resource.getrusage(resource.RUSAGE_SELF).ru_maxrss print("..BPE model loaded. Using "+str(mem)+" MB of memory") diff --git a/scripts/reranker/extractFeaturesLexicon.py b/scripts/reranker/extractFeaturesLexicon.py index a7474d3..2a07ac0 100644 --- a/scripts/reranker/extractFeaturesLexicon.py +++ b/scripts/reranker/extractFeaturesLexicon.py @@ -30,7 +30,6 @@ bpeMark = '@@' emptyMark = 'EMPTY' -header = 'Gold,w1,L1,w2,L2,srcSubUnit,bothBPEmark,WEsim,rankW2,simRankt1,simRankWnext,simRankt10,simRankt100,l1,l2,l1/l2,lev,cosSimN2,cosSimN3,cosSimN4,levM2\n' # for debugging countUp = 0 @@ -116,20 +115,16 @@ def findSimsNonTrad(w1, w2, l2, proc): print("kk") # since all calculations are ready here we use them to estimate WE-related features + toprank = newSpace.similar_by_vector(vector,topn=explore) # for the negative example sim = newSpace.similarity(w1,nonTrad) - simRankt1 = toprank[rank-1][1] - toprank[0][1] # how far in similarity is noTrad to top1 - simRanktnext = toprank[rank-1][1] - toprank[rank][1] # how far in similarity is noTrad to the next word - simRankt10 = toprank[rank-1][1] - toprank[9][1] # how far in similarity is noTrad to top10 - simRankt100 = toprank[rank-1][1] - toprank[99][1] # how far in similarity is noTrad to top100 - simsRankNoTrad = str(sim)+','+str(rank)+','+str(simRankt1)+','+str(simRanktnext)+','+str(simRankt10)+','+str(simRankt100)+',' + simsRankNoTrad = extractSimDiffFeats(rank, toprank) + simsRankNoTrad = str(sim)+','+simsRankNoTrad + # for the positive example sim = newSpace.similarity(w1,w2) - simRankt1 = toprank[w2Rank-1][1] - toprank[0][1] # how far in similarity is w2 to top1 - simRanktnext = toprank[w2Rank-1][1] - toprank[w2Rank][1] # how far in similarity is w2 to the next word - simRankt10 = toprank[w2Rank-1][1] - toprank[9][1] # how far in similarity is w2 to top10 - simRankt100 = toprank[w2Rank-1][1] - toprank[99][1] # how far in similarity is w2 to top100 - simsRankW2 = str(sim)+','+str(w2Rank)+','+str(simRankt1)+','+str(simRanktnext)+','+str(simRankt10)+','+str(simRankt100)+',' + simsRankW2 = extractSimDiffFeats(w2Rank, toprank) + simsRankW2 = str(sim)+','+simsRankW2 else: return noSimsRank,emptyMark,noSimsRank @@ -173,7 +168,7 @@ def main(inF): outF = inF+'.feat' fOUT = open(outF, 'w') - fOUT.write(header) + fOUT.write(features.getHeader()) # Read the quad-lexicon with open(inF) as f: for line in f: diff --git a/scripts/reranker/trainReranker.py b/scripts/reranker/trainReranker.py index 95116b3..5f3fe73 100644 --- a/scripts/reranker/trainReranker.py +++ b/scripts/reranker/trainReranker.py @@ -90,7 +90,7 @@ def main(inF, path): X = df4ML.loc[:, feature_cols] y = df4ML.Gold clf = XGBClassifier(max_depth=6, n_estimators=300, learning_rate=0.05) - # TODO: fit parameters + # TODO: fit parameters of XGB #clf = MLPClassifier(activation='relu', alpha=1e-05, batch_size='auto', # beta_1=0.9, beta_2=0.999, early_stopping=False, epsilon=1e-08, hidden_layer_sizes=(8, 2), learning_rate='constant', # learning_rate_init=0.001, max_iter=200, momentum=0.9, nesterovs_momentum=True, power_t=0.5, random_state=1, shuffle=True, @@ -110,14 +110,6 @@ def main(inF, path): plot_importance(clf) plt.show() - #thresholds = np.sort(clf.feature_importances_) - #for thresh in thresholds: - # select features using threshold - # selection = SelectFromModel(clf, threshold=thresh, prefit=True) - # select_X = selection.transform(X) - # scores = 
diff --git a/scripts/reranker/trainReranker.py b/scripts/reranker/trainReranker.py
index 95116b3..5f3fe73 100644
--- a/scripts/reranker/trainReranker.py
+++ b/scripts/reranker/trainReranker.py
@@ -90,7 +90,7 @@ def main(inF, path):
     X = df4ML.loc[:, feature_cols]
     y = df4ML.Gold
     clf = XGBClassifier(max_depth=6, n_estimators=300, learning_rate=0.05)
-    # TODO: fit parameters
+    # TODO: tune the XGB hyper-parameters
     #clf = MLPClassifier(activation='relu', alpha=1e-05, batch_size='auto',
     #      beta_1=0.9, beta_2=0.999, early_stopping=False, epsilon=1e-08, hidden_layer_sizes=(8, 2), learning_rate='constant',
     #      learning_rate_init=0.001, max_iter=200, momentum=0.9, nesterovs_momentum=True, power_t=0.5, random_state=1, shuffle=True,
@@ -110,14 +110,6 @@ def main(inF, path):
     plot_importance(clf)
     plt.show()
 
-    #thresholds = np.sort(clf.feature_importances_)
-    #for thresh in thresholds:
-        # select features using threshold
-    #    selection = SelectFromModel(clf, threshold=thresh, prefit=True)
-    #    select_X = selection.transform(X)
-    #    scores = cross_validate(clf, select_X, y, scoring=scoring, cv=5, return_train_score=False)
-    #    print("Thresh=%.3f, n=%d, Accuracy: %0.4f (+/- %0.4f)" % (thresh, select_X.shape[1], scores['test_accuracy'].mean(), scores['test_accuracy'].std()*2))
-
 
 if __name__ == "__main__":
@@ -129,3 +121,14 @@ def main(inF, path):
 
     main(sys.argv[1], scriptPath)
+
+
+    #thresholds = np.sort(clf.feature_importances_)
+    #for thresh in thresholds:
+        # select features using threshold
+    #    selection = SelectFromModel(clf, threshold=thresh, prefit=True)
+    #    select_X = selection.transform(X)
+    #    scores = cross_validate(clf, select_X, y, scoring=scoring, cv=5, return_train_score=False)
+    #    print("Thresh=%.3f, n=%d, Accuracy: %0.4f (+/- %0.4f)" % (thresh, select_X.shape[1], scores['test_accuracy'].mean(), scores['test_accuracy'].std()*2))
+
+
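
For reference, a runnable version of the feature-selection sweep that stays commented out above. make_classification is a stand-in for the real .feat matrix (20 features, mirroring the header), and the XGB settings are copied from main():

    import numpy as np
    from sklearn.datasets import make_classification
    from sklearn.feature_selection import SelectFromModel
    from sklearn.model_selection import cross_validate
    from xgboost import XGBClassifier

    X, y = make_classification(n_samples=500, n_features=20, random_state=1)
    clf = XGBClassifier(max_depth=6, n_estimators=300, learning_rate=0.05)
    clf.fit(X, y)

    # sweep every observed importance value as a selection threshold and
    # cross-validate a fresh model on the surviving features
    for thresh in np.sort(clf.feature_importances_):
        selection = SelectFromModel(clf, threshold=thresh, prefit=True)
        select_X = selection.transform(X)
        fresh = XGBClassifier(max_depth=6, n_estimators=300, learning_rate=0.05)
        scores = cross_validate(fresh, select_X, y, scoring=['accuracy'], cv=5)
        print("Thresh=%.3f, n=%d, Accuracy: %0.4f (+/- %0.4f)"
              % (thresh, select_X.shape[1], scores['test_accuracy'].mean(),
                 scores['test_accuracy'].std() * 2))
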
diff --git a/scripts/tradQueriesEmbeddings.py b/scripts/tradQueriesEmbeddings.py
index d2cd79a..5ba1b52 100644
--- a/scripts/tradQueriesEmbeddings.py
+++ b/scripts/tradQueriesEmbeddings.py
@@ -12,14 +12,67 @@
 import sys
 import os.path
 import unicodedata
+from io import StringIO
 
 import load
 import easyBPE
 import features
 
 from gensim.models import KeyedVectors
+import joblib
+import numpy as np
+import pandas as pd
+import xgboost as xgb
+
+trainingSet = 'untradDEallkeys.solr.all-languages'
 
 bpeMark = '@@'
 
+def preprocessingRead(data):
+    """
+    Basic preprocessing for most ML algorithms: binarisation, normalisation, scaling.
+    Loads the scaler fitted on the training set so that test data gets the same transformation.
+    """
+
+    global modelPath
+
+    # extract the data into a dataframe
+    df = pd.read_csv(StringIO(data))
+
+    # convert the categorical column into four binary columns, one per language
+    df4ML = df.join(pd.get_dummies(df['L2'],prefix='L2'))
+
+    # scale columns
+    # rankW2 has huge numbers in a wide interval, we should cut and/or move to a log scale
+    df4ML['rankW2'] = df4ML['rankW2'].apply(lambda x: 1000 if x>1000 else x)
+    #df4ML['rankW2'] = df4ML['rankW2'].apply(lambda x: 0 if x<= 0 else math.log10(x))
+
+    #columns2scale = ['rankW2','WEsim','l1','l2','l1/l2','lev','cosSimN2','cosSimN3','cosSimN4','levM2']
+    columns2scale = ['rankW2','l1','l2','l1/l2','lev','levM2','simRankt1','simRankWnext','simRankt10','simRankt100']
+    scaler = joblib.load(modelPath+'reranker/'+trainingSet+'.scaler.pkl')
+    df4ML[columns2scale] = scaler.transform(df4ML[columns2scale])
+
+    return df4ML
+
+
+def predictBestTrad(df):
+    """
+    Scores the n-best translation candidates with the trained reranker and
+    returns the index of the most probable one.
+    """
+
+    feature_cols = ['L2_de','L2_en','L2_es','L2_fr','srcSubUnit','bothBPEmark','WEsim','rankW2','simRankt1','simRankWnext','simRankt10','simRankt100','l1','l2','l1/l2','lev','cosSimN2','cosSimN3','cosSimN4','levM2']
+    nbest = df.loc[:, feature_cols]
+
+    clf = xgb.Booster()  # init model
+
+    # Load the previously trained model
+    clf.load_model(modelPath+'reranker/'+trainingSet+'.model')
+
+    # make a prediction, a probability by default with XGB
+    nbestProbs = clf.predict(xgb.DMatrix(nbest))
+    indexTrad = np.argmax(nbestProbs)
+
+    return indexTrad
+
 
 def rreplace(s, old, new, occurrence):
     """ Replace last occurrence of a substring in a string
         https://stackoverflow.com/questions/2556108/rreplace-how-to-replace-the-last-occurrence-of-an-expression-in-a-string
     """
@@ -35,10 +87,9 @@ def percentage2d(part, whole):
     return "0"
 
 def remove_diacritic(input):
-    '''
-    Accept a unicode string, and return a normal string (bytes in Python 3)
+    """ Accept a unicode string, and return a normal string (bytes in Python 3)
     without any diacritical marks.
-    '''
+    """
     return unicodedata.normalize('NFKD', input).encode('ASCII', 'ignore')
 
 def cleanEndString(toClean):
@@ -124,43 +175,55 @@ def translate(string, proc):
                 stringTrad = stringTrad + extractTradFromDict(toTrad, capitalized, stringTrad, ctDict)
             else:
                 # if not, we look for the closest translation(s) in the embeddings space
-                isSubWord = 0
+                isSubWord = '0'
                 bped = easyBPE.applyBPE(proc.bpe, word)
                 if len(bped) >1:
-                    isSubWord = 1
+                    isSubWord = '1'
                 for subunit in bped:
                     print(subunit)
                     vector = proc.embeddingL1[subunit]
-                    #allSubunit = proc.embeddingL1.similar_by_vector(vector,topn=5)
+                    enSubunits = proc.embeddingEn.similar_by_vector(vector,topn=1000)
+                    allFeats = features.getHeader()
                     for subunitTrad in enSubunits:
+                        # populate a dataframe row for each candidate in the n-best list
                         w2 = subunit[0]
                         bothBPE = '0'
                         if bpeMark in subunit and bpeMark in w2:
                             bothBPE = '1'
                         basicFeats = features.basicFeatures(subunit,'xx', w2, 'en', isSubWord, bothBPE)
-                        semFeats = features.extractSemFeatures(subunit, w2, 'en', proc)
+                        semFeats = features.extractSemFeatures(subunit, w2, 'en', 40000, proc)
                         lexFeats = features.extractLexFeatures(subunit, w2)
-                        #featureString = S'HA DE FER EL MATEIX CLEANING Q AL TRAINING
-
-                    esSubunit = proc.embeddingEs.similar_by_vector(vector,topn=2)
-                    deSubunit = proc.embeddingDe.similar_by_vector(vector,topn=2)
-                    frSubunit = proc.embeddingFr.similar_by_vector(vector,topn=2)
-                    print(enSubunits)
-                    print(esSubunit)
-                    print(deSubunit)
-                    print(frSubunit)
+                        allFeats = allFeats + basicFeats+semFeats+lexFeats+'\n'
+                    # create the preprocessed data frame and pick the best-scored candidate
+                    df = preprocessingRead(allFeats)
+                    indexTrad = predictBestTrad(df)
+                    trad = enSubunits[indexTrad][0]
+                    print(trad)
+
+
+
+
+                    #esSubunit = proc.embeddingEs.similar_by_vector(vector,topn=2)
+                    #deSubunit = proc.embeddingDe.similar_by_vector(vector,topn=2)
+                    #frSubunit = proc.embeddingFr.similar_by_vector(vector,topn=2)
+                    #print(enSubunits)
+                    #print(esSubunit)
+                    #print(deSubunit)
+                    #print(frSubunit)
 
         # we need to reconstruct BPE
     return stringTrad
 
-def main(inF, outF):
+def main(inF, scriptPath):
 
-    modelPath = "../models/"
+    global modelPath
+    modelPath = scriptPath+"../models/"
 
     # Initialise a new process for translation, loading the models
     proc = load.QueryTrad(modelPath)
 
+    outF = inF+'trad'
     # Read the queries from file
     fOUT = open(outF, 'w')
     with open(inF) as f:
@@ -190,11 +253,12 @@ def main(inF, scriptPath):
 
 if __name__ == "__main__":
 
-    if len(sys.argv) is not 3:
-        sys.stderr.write('Usage: python3 %s inputFile outputFile\n' % sys.argv[0])
+    if len(sys.argv) != 2:
+        sys.stderr.write('Usage: python3 %s inputFile\n' % sys.argv[0])
         sys.exit(1)
 
     print("WARNING: This software needs python >3.6 to run properly\n")
-    main(sys.argv[1], sys.argv[2])
+    scriptPath = os.path.dirname(os.path.abspath(__file__))
+    main(sys.argv[1], scriptPath+'/')
 
 # CHECK: source==target doesn't mean untranslated
 #numTermTrad = numTerms-numTermsUntrad
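
A toy walk-through of the preprocessing that preprocessingRead applies before scaling (values invented; the saved scaler and reranker model are left out since they require the model files on disk; dtype=int is added only to keep the dummies as 0/1 on recent pandas):

    import pandas as pd

    # invented n-best rows
    df = pd.DataFrame({'L2': ['en', 'de', 'es', 'fr'],
                       'rankW2': [3, 40000, 12, 999]})

    # one binary column per language, as the reranker was trained with
    df4ML = df.join(pd.get_dummies(df['L2'], prefix='L2', dtype=int))

    # clip the long tail of rankW2 so a huge rank cannot dominate the scaler
    df4ML['rankW2'] = df4ML['rankW2'].apply(lambda x: 1000 if x > 1000 else x)

    print(df4ML)
    #    L2  rankW2  L2_de  L2_en  L2_es  L2_fr
    # 0  en       3      0      1      0      0
    # 1  de    1000      1      0      0      0
    # 2  es      12      0      0      1      0
    # 3  fr     999      0      0      0      1

One caveat this exposes: pd.get_dummies only creates columns for languages present in the batch, and the n-best rows built in translate() are all labelled 'en', so the L2_de/L2_es/L2_fr columns that predictBestTrad's feature_cols expects would be missing; reindexing the dummies against the training-time column list would avoid the lookup error.
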