building the complete pipeline for reranking
cristinae committed Aug 28, 2018
1 parent cc8fba0 commit 223d24e
Showing 5 changed files with 129 additions and 53 deletions.
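Taken together, the changes wire feature extraction (features.py), feature preprocessing, and the trained XGBoost model (tradQueriesEmbeddings.py) into one reranking path: for every BPE subunit to translate, the n-best candidates from the English embedding space are turned into one csv feature row each, scaled like the training data, scored by the classifier, and the highest-scoring candidate wins. A condensed sketch of that flow using the names introduced below; rerankTranslation itself is a hypothetical wrapper, not a function in this commit:

import features
# preprocessingRead and predictBestTrad are defined in tradQueriesEmbeddings.py below

def rerankTranslation(proc, subunit, vector, isSubWord, bothBPE):
    # n-best translation candidates for the subunit, as (word, similarity) pairs
    enSubunits = proc.embeddingEn.similar_by_vector(vector, topn=1000)
    allFeats = features.getHeader()              # shared csv header
    for cand, _sim in enSubunits:                # one feature row per candidate
        allFeats = allFeats \
            + features.basicFeatures(subunit, 'xx', cand, 'en', isSubWord, bothBPE) \
            + features.extractSemFeatures(subunit, cand, 'en', 40000, proc) \
            + features.extractLexFeatures(subunit, cand) + '\n'
    df = preprocessingRead(allFeats)             # csv string -> scaled dataframe
    return enSubunits[predictBestTrad(df)][0]    # reranker picks the best candidate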
34 changes: 24 additions & 10 deletions scripts/features.py
@@ -18,17 +18,19 @@
import phonetics

bpeMark = '@@'
+header = 'Gold,w1,L1,w2,L2,srcSubUnit,bothBPEmark,WEsim,rankW2,simRankt1,simRankWnext,simRankt10,simRankt100,l1,l2,l1/l2,lev,cosSimN2,cosSimN3,cosSimN4,levM2\n'


+def getHeader():
+    return header

def basicFeatures(w1, l1, w2, l2, isSubWord, bothBPE):
'''
Creates a string in csv format with the basic features
'''
return w1+","+l1+","+ w2+","+l2+","+isSubWord+","+bothBPE+","



-def extractSemFeatures(w1, w2, l2, proc):
+def extractSemFeatures(w1, w2, l2, nexplore, proc):
'''
Extracts the set of semantic features related to word embeddings for a pair of words (w1, w2)
Returns a string in csv format with the features
Expand Down Expand Up @@ -61,14 +63,10 @@ def extractSemFeatures(w1, w2, l2, proc):
newSpace.add(w1,vector,replace=True)
if w2 in newSpace.vocab:
w2Rank = newSpace.rank(w1,w2)
-toprank = newSpace.similar_by_vector(vector,topn=explore)
sim = newSpace.similarity(w1,w2)
-simRankt1 = toprank[w2Rank-1][1] - toprank[0][1] # how far in similarity is w2 to top1
-simRanktnext = toprank[w2Rank-1][1] - toprank[w2Rank][1] # how far in similarity is w2 to the next word
-simRankt10 = toprank[w2Rank-1][1] - toprank[9][1] # how far in similarity is w2 to top10
-simRankt100 = toprank[w2Rank-1][1] - toprank[99][1] # how far in similarity is w2 to top100
-simsRankW2 = str(sim)+','+str(w2Rank)+','+str(simRankt1)+','+str(simRanktnext)+','+str(simRankt10)+','+str(simRankt100)+','

+toprank = newSpace.similar_by_vector(vector,topn=nexplore)
+simsRankW2 = extractSimDiffFeats(w2Rank, toprank)
+simsRankW2 = str(sim)+','+simsRankW2
else:
return noSimsRank

@@ -78,6 +76,22 @@ def extractSemFeatures(w1, w2, l2, proc):
return simsRankW2


+def extractSimDiffFeats(rankDif, toprank):
+    """ Extracts the subset of semantic features related to differences in similarities between
+    words and translations
+    """
+
+    #toprank = mlweSpace.similar_by_vector(vector,topn=nexplore)
+    simRankt1 = toprank[rankDif-1][1] - toprank[0][1] # how far in similarity is w2 to top1
+    simRanktnext = toprank[rankDif-1][1] - toprank[rankDif][1] # how far in similarity is w2 to the next word
+    simRankt10 = toprank[rankDif-1][1] - toprank[9][1] # how far in similarity is w2 to top10
+    simRankt100 = toprank[rankDif-1][1] - toprank[99][1] # how far in similarity is w2 to top100
+
+    simsRank = str(rankDif)+','+str(simRankt1)+','+str(simRanktnext)+','+str(simRankt10)+','+str(simRankt100)+','
+
+    return simsRank


def extractLexFeatures(w1, w2):
'''
Extracts the set of features for a pair of words (w1, w2)
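extractSimDiffFeats expects toprank in the shape gensim's similar_by_vector returns, a list of (word, cosine similarity) pairs sorted by decreasing similarity, and implicitly needs topn >= 100 and rankDif below the list length for the indexing to stay in bounds. A toy check on synthetic data, not project data:

import features

# synthetic ranking: similarity decays linearly with the position
toprank = [('w%d' % i, 1.0 - 0.001*i) for i in range(40000)]
print(features.extractSimDiffFeats(5, toprank))
# -> '5,-0.004,0.001,0.005,0.095,' up to floating-point noise:
#    the rank plus the similarity gaps to top1, the next word, top10 and top100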
2 changes: 1 addition & 1 deletion scripts/load.py
@@ -47,7 +47,7 @@ def __init__(self, modelPath):
print("..Multilingual lexicon loaded. Using "+str(mem)+" MB of memory")
self.ctDict = ctDict

-# Load BPE model
+# Load BPE model
self.bpe = easyBPE.BPE(self.BPEcodes)
mem=resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
print("..BPE model loaded. Using "+str(mem)+" MB of memory")
19 changes: 7 additions & 12 deletions scripts/reranker/extractFeaturesLexicon.py
@@ -30,7 +30,6 @@

bpeMark = '@@'
emptyMark = 'EMPTY'
-header = 'Gold,w1,L1,w2,L2,srcSubUnit,bothBPEmark,WEsim,rankW2,simRankt1,simRankWnext,simRankt10,simRankt100,l1,l2,l1/l2,lev,cosSimN2,cosSimN3,cosSimN4,levM2\n'

# for debugging
countUp = 0
@@ -116,20 +115,16 @@ def findSimsNonTrad(w1, w2, l2, proc):
print("kk")

# since all calculations are ready here we use them to estimate WE-related features
toprank = newSpace.similar_by_vector(vector,topn=explore)
# for the negative example
sim = newSpace.similarity(w1,nonTrad)
-simRankt1 = toprank[rank-1][1] - toprank[0][1] # how far in similarity is noTrad to top1
-simRanktnext = toprank[rank-1][1] - toprank[rank][1] # how far in similarity is noTrad to the next word
-simRankt10 = toprank[rank-1][1] - toprank[9][1] # how far in similarity is noTrad to top10
-simRankt100 = toprank[rank-1][1] - toprank[99][1] # how far in similarity is noTrad to top100
-simsRankNoTrad = str(sim)+','+str(rank)+','+str(simRankt1)+','+str(simRanktnext)+','+str(simRankt10)+','+str(simRankt100)+','
+simsRankNoTrad = extractSimDiffFeats(rank, toprank)
+simsRankNoTrad = str(sim)+','+simsRankNoTrad

# for the positive example
sim = newSpace.similarity(w1,w2)
-simRankt1 = toprank[w2Rank-1][1] - toprank[0][1] # how far in similarity is w2 to top1
-simRanktnext = toprank[w2Rank-1][1] - toprank[w2Rank][1] # how far in similarity is w2 to the next word
-simRankt10 = toprank[w2Rank-1][1] - toprank[9][1] # how far in similarity is w2 to top10
-simRankt100 = toprank[w2Rank-1][1] - toprank[99][1] # how far in similarity is w2 to top100
-simsRankW2 = str(sim)+','+str(w2Rank)+','+str(simRankt1)+','+str(simRanktnext)+','+str(simRankt10)+','+str(simRankt100)+','
+simsRankW2 = extractSimDiffFeats(w2Rank, toprank)
+simsRankW2 = str(sim)+','+simsRankW2

else:
return noSimsRank,emptyMark,noSimsRank
@@ -173,7 +168,7 @@ def main(inF):

outF = inF+'.feat'
fOUT = open(outF, 'w')
-fOUT.write(header)
+fOUT.write(features.getHeader())
# Read the quad-lexicon
with open(inF) as f:
for line in f:
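With the csv header now defined once in features.py, the .feat files written here stay in sync with whatever the feature extractor produces. A quick round-trip sanity check; lexicon.feat stands for any file produced by this script:

import pandas as pd
import features

df = pd.read_csv('lexicon.feat')   # first line is features.getHeader()
assert list(df.columns) == features.getHeader().rstrip('\n').split(',')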
21 changes: 12 additions & 9 deletions scripts/reranker/trainReranker.py
@@ -90,7 +90,7 @@ def main(inF, path):
X = df4ML.loc[:, feature_cols]
y = df4ML.Gold
clf = XGBClassifier(max_depth=6, n_estimators=300, learning_rate=0.05)
-# TODO: fit parameters
+# TODO: fit parameters of XGB
#clf = MLPClassifier(activation='relu', alpha=1e-05, batch_size='auto',
# beta_1=0.9, beta_2=0.999, early_stopping=False, epsilon=1e-08, hidden_layer_sizes=(8, 2), learning_rate='constant',
# learning_rate_init=0.001, max_iter=200, momentum=0.9, nesterovs_momentum=True, power_t=0.5, random_state=1, shuffle=True,
@@ -110,14 +110,6 @@ def main(inF, path):
plot_importance(clf)
plt.show()

-#thresholds = np.sort(clf.feature_importances_)
-#for thresh in thresholds:
-# select features using threshold
-# selection = SelectFromModel(clf, threshold=thresh, prefit=True)
-# select_X = selection.transform(X)
-# scores = cross_validate(clf, select_X, y, scoring=scoring, cv=5, return_train_score=False)
-# print("Thresh=%.3f, n=%d, Accuracy: %0.4f (+/- %0.4f)" % (thresh, select_X.shape[1], scores['test_accuracy'].mean(), scores['test_accuracy'].std()*2))



if __name__ == "__main__":
@@ -129,3 +121,14 @@ def main(inF, path):
main(sys.argv[1], scriptPath)




+#thresholds = np.sort(clf.feature_importances_)
+#for thresh in thresholds:
+# select features using threshold
+# selection = SelectFromModel(clf, threshold=thresh, prefit=True)
+# select_X = selection.transform(X)
+# scores = cross_validate(clf, select_X, y, scoring=scoring, cv=5, return_train_score=False)
+# print("Thresh=%.3f, n=%d, Accuracy: %0.4f (+/- %0.4f)" % (thresh, select_X.shape[1], scores['test_accuracy'].mean(), scores['test_accuracy'].std()*2))


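The commented-out block above sketches feature selection: prune features whose learned importance falls below a rising threshold and cross-validate the classifier on each reduced set. A self-contained version of that idea under stated assumptions — make_classification stands in for the real feature table, and a smaller n_estimators keeps the sweep quick:

import numpy as np
from sklearn.datasets import make_classification
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import cross_validate
from xgboost import XGBClassifier

X, y = make_classification(n_samples=500, n_features=20, random_state=1)
clf = XGBClassifier(max_depth=6, n_estimators=50, learning_rate=0.05).fit(X, y)

for thresh in np.unique(clf.feature_importances_):
    selection = SelectFromModel(clf, threshold=thresh, prefit=True)  # keep features >= thresh
    select_X = selection.transform(X)
    scores = cross_validate(clf, select_X, y, scoring=['accuracy'], cv=5)
    print("Thresh=%.3f, n=%d, Accuracy: %0.4f (+/- %0.4f)" % (thresh, select_X.shape[1],
          scores['test_accuracy'].mean(), scores['test_accuracy'].std()*2))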
106 changes: 85 additions & 21 deletions scripts/tradQueriesEmbeddings.py
@@ -12,14 +12,66 @@
import sys
import os.path
import unicodedata
+from io import StringIO

import load
import easyBPE
import features
from gensim.models import KeyedVectors

+import numpy as np
+import pandas as pd
+import xgboost as xgb   # predictBestTrad() below uses xgb.Booster and xgb.DMatrix

+trainingSet = 'untradDEallkeys.solr.all-languages'
bpeMark = '@@'

+def preprocessingRead(data):
+    """
+    Basic preprocessing for most ML algorithms: binarisation, normalisation, scaling
+    Reads the scaler saved at training time and only applies it (test usage)
+    """
+
+    global modelPath
+
+    # extract the data into a dataframe
+    df = pd.read_csv(StringIO(data))
+
+    # convert the categorical column into four binary columns, one per language
+    df4ML = df.join(pd.get_dummies(df['L2'],prefix='L2'))
+
+    # scale columns
+    # rankW2 has huge values over a wide interval; we cut at 1000 and/or could move to a log scale
+    df4ML['rankW2'] = df4ML['rankW2'].apply(lambda x: 1000 if x>1000 else x)
+    #df4ML['rankW2'] = df4ML['rankW2'].apply(lambda x: 0 if x<= 0 else math.log10(x))
+
+    #columns2scale = ['rankW2','WEsim','l1','l2','l1/l2','lev','cosSimN2','cosSimN3','cosSimN4','levM2']
+    columns2scale = ['rankW2','l1','l2','l1/l2','lev','levM2','simRankt1','simRankWnext','simRankt10','simRankt100']
+    scaler = joblib.load(modelPath+'reranker/'+trainingSet+'.scaler.pkl')
+    # transform, not fit_transform: the scaler was fitted on the training set
+    df4ML[columns2scale] = scaler.transform(df4ML[columns2scale])
+
+    return df4ML


+def predictBestTrad(df):
+    """
+    Scores the n-best candidates with the trained reranker and returns
+    the index of the most probable translation
+    """
+
+    feature_cols = ['L2_de','L2_en','L2_es','L2_fr','srcSubUnit','bothBPEmark','WEsim','rankW2','simRankt1','simRankWnext','simRankt10','simRankt100','l1','l2','l1/l2','lev','cosSimN2','cosSimN3','cosSimN4','levM2']
+    nbest = df.loc[:, feature_cols]
+
+    clf = xgb.Booster() # init model
+
+    # Load previously trained model
+    clf.load_model(modelPath+'reranker/'+trainingSet+'.model')
+
+    # make prediction, a probability by default with XGB; Booster.predict needs a DMatrix
+    nbestProbs = clf.predict(xgb.DMatrix(nbest))
+    indexTrad = np.argmax(nbestProbs)
+
+    return indexTrad


def rreplace(s, old, new, occurrence):
""" Replace last occurrence of a substring in a string
https://stackoverflow.com/questions/2556108/rreplace-how-to-replace-the-last-occurrence-of-an-expression-in-a-string
Expand All @@ -35,10 +87,9 @@ def percentage2d(part, whole):
return "0"

def remove_diacritic(input):
-'''
-Accept a unicode string, and return a normal string (bytes in Python 3)
+""" Accept a unicode string, and return a normal string (bytes in Python 3)
without any diacritical marks.
-'''
+"""
return unicodedata.normalize('NFKD', input).encode('ASCII', 'ignore')

def cleanEndString(toClean):
@@ -124,43 +175,55 @@ def translate(string, proc):
stringTrad = stringTrad + extractTradFromDict(toTrad, capitalized, stringTrad, ctDict)
else:
# if not, we look for the closest translation(s) in the embeddings space
-isSubWord = 0
+isSubWord = '0'
bped = easyBPE.applyBPE(proc.bpe, word)
if len(bped) >1:
-isSubWord = 1
+isSubWord = '1'
for subunit in bped:
print(subunit)
vector = proc.embeddingL1[subunit]
#allSubunit = proc.embeddingL1.similar_by_vector(vector,topn=5)

enSubunits = proc.embeddingEn.similar_by_vector(vector,topn=1000)
+allFeats = features.getHeader()
for subunitTrad in enSubunits:
# populate for a dataframe with the n-best list
w2 = subunitTrad[0]  # the candidate word from the (word, similarity) tuple
bothBPE = '0'
if bpeMark in subunit and bpeMark in w2:
bothBPE = '1'
basicFeats = features.basicFeatures(subunit,'xx', w2, 'en', isSubWord, bothBPE)
-semFeats = features.extractSemFeatures(subunit, w2, 'en', proc)
+semFeats = features.extractSemFeatures(subunit, w2, 'en', 40000, proc)
lexFeats = features.extractLexFeatures(subunit, w2)
#featureString = TODO: the same cleaning as in training must be applied here

-esSubunit = proc.embeddingEs.similar_by_vector(vector,topn=2)
-deSubunit = proc.embeddingDe.similar_by_vector(vector,topn=2)
-frSubunit = proc.embeddingFr.similar_by_vector(vector,topn=2)
-print(enSubunits)
-print(esSubunit)
-print(deSubunit)
-print(frSubunit)
+allFeats = allFeats + basicFeats+semFeats+lexFeats+'\n'
+# create preprocessed data frame
+df = preprocessingRead(allFeats)
+indexTrad = predictBestTrad(df)
+trad = enSubunits[indexTrad][0]
+print(trad)




+#esSubunit = proc.embeddingEs.similar_by_vector(vector,topn=2)
+#deSubunit = proc.embeddingDe.similar_by_vector(vector,topn=2)
+#frSubunit = proc.embeddingFr.similar_by_vector(vector,topn=2)
+#print(enSubunits)
+#print(esSubunit)
+#print(deSubunit)
+#print(frSubunit)
# we need to reconstruct BPE

return stringTrad


-def main(inF, outF):
+def main(inF, scriptPath):

modelPath = "../models/"
global modelPath
modelPath = scriptPath+"../models/"
# Initialise a new process for translation, loading the models
proc = load.QueryTrad(modelPath)

+outF = inF+'trad'
# Read the queries from file
fOUT = open(outF, 'w')
with open(inF) as f:
@@ -190,11 +253,12 @@ def main(inF, outF):

if __name__ == "__main__":

-if len(sys.argv) is not 3:
-sys.stderr.write('Usage: python3 %s inputFile outputFile\n' % sys.argv[0])
+if len(sys.argv) != 2: # compare by value; 'is not' tests identity and is unreliable for ints
+sys.stderr.write('Usage: python3 %s inputFile\n' % sys.argv[0])
sys.exit(1)
print("WARNING: This software needs python >3.6 to run properly\n")
-main(sys.argv[1], sys.argv[2])
+scriptPath = os.path.dirname(os.path.abspath( __file__ ))
+main(sys.argv[1], scriptPath+'/')

# CHECK: source==target doesn't mean untranslated
#numTermTrad = numTerms-numTermsUntrad

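For reference, the Booster prediction path that predictBestTrad relies on: save_model/load_model plus a DMatrix wrapper around the feature table, with argmax picking the winning candidate row. A toy run with throwaway data and a throwaway 'toy.model' path, not the project's model:

import numpy as np
import pandas as pd
import xgboost as xgb

X = pd.DataFrame(np.random.rand(50, 4), columns=['f0', 'f1', 'f2', 'f3'])
y = np.random.randint(0, 2, size=50)
booster = xgb.train({'objective': 'binary:logistic'}, xgb.DMatrix(X, label=y), num_boost_round=10)
booster.save_model('toy.model')

clf = xgb.Booster()
clf.load_model('toy.model')
nbestProbs = clf.predict(xgb.DMatrix(X))   # one probability per candidate row
print(int(np.argmax(nbestProbs)))          # index of the best-scored candidate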