making the training a bit nicer
cristinae committed Aug 28, 2018
1 parent eccd609 commit cc8fba0
Showing 1 changed file with 54 additions and 79 deletions.
133 changes: 54 additions & 79 deletions scripts/reranker/trainReranker.py
@@ -1,9 +1,10 @@
#!/usr/bin/env python3 -W ignore::DeprecationWarning
#!/usr/bin/env python3
# -*- coding: utf-8 -*-


"""
Training a reranker for word embeddings given an input file with a set of features
generated by extractFeaturesLexicon.py
Date: 23.08.2018
Author: cristinae
"""
@@ -15,7 +16,6 @@
import pandas as pd
import matplotlib.pyplot as plt


from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
@@ -27,28 +27,24 @@
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_selection import RFECV
from sklearn.feature_selection import SelectFromModel
from sklearn.externals import joblib

def main(inF):
if not sys.warnoptions:
import warnings
warnings.simplefilter("ignore")

outF = inF+'.model'
# read original training file
df = pd.read_csv(inF)
print (df.head())

# shuffle all examples to mix all types
df4ML = df.sample(frac=1)
# convert categorical column in four binary columns, one per language
df4ML = df4ML.join(pd.get_dummies(df['L2'],prefix='L2'))

# Original features
#feature_cols = ['w1','L1','w2','L2','srcSubUnit','bothBPEmark','WEsim','rankW2','simRankt1','simRankWnext','simRankt10','simRankt100','l1','l2','l1/l2','lev','cosSimN2','cosSimN3','cosSimN4','levM2']
# Features to use
feature_cols = ['L2_de','L2_en','L2_es','L2_fr','srcSubUnit','bothBPEmark','WEsim','rankW2','simRankt1','simRankWnext','simRankt10','simRankt100','l1','l2','l1/l2','lev','cosSimN2','cosSimN3','cosSimN4','levM2']
#feature_cols = ['L2_de','L2_en','L2_es','L2_fr','srcSubUnit','bothBPEmark','WEsim','rankW2','l1','l2','l1/l2','lev','cosSimN2','cosSimN3','cosSimN4','levM2']

def preprocessingWrite (df, modelPath):
'''
Basic preprocessing for most of ML algorithms: binarisation, normalisation, scaling
Saves the scaling model for test usage
'''

# convert categorical column into four binary columns, one per language
df4ML = df.join(pd.get_dummies(df['L2'],prefix='L2'))

# scale columns
# is there a need for xfb?
# rankW2 has huge numbers in a wide interval, we should cut and/or move to a log scale
df4ML['rankW2'] = df4ML['rankW2'].apply(lambda x: 1000 if x>1000 else x)
#df4ML['rankW2'] = df4ML['rankW2'].apply(lambda x: 0 if x<= 0 else math.log10(x))
@@ -58,10 +54,43 @@ def main(inF):
scaler = MinMaxScaler()
df4ML[colums2scale] = scaler.fit_transform(df4ML[colums2scale])

# save final model
joblib.dump(scaler, modelPath+'.scaler.pkl')

return df4ML
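
# Hypothetical test-time counterpart of preprocessingWrite (a sketch, not called below):
# it repeats the binarisation and rankW2 clipping above but reloads the saved scaler
# instead of fitting a new one; the column list is passed in here as an assumption.
def preprocessingLoad(df, modelPath, colums2scale):
    # same binary language columns (assumes all four L2 values appear in the test data)
    df4ML = df.join(pd.get_dummies(df['L2'],prefix='L2'))
    # same clipping as at training time
    df4ML['rankW2'] = df4ML['rankW2'].apply(lambda x: 1000 if x>1000 else x)
    # reuse the scaler fitted on the training data
    scaler = joblib.load(modelPath+'.scaler.pkl')
    df4ML[colums2scale] = scaler.transform(df4ML[colums2scale])
    return df4ML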



def main(inF, path):

modelPath = path + '/../../models/reranker/'
fileName = os.path.basename(inF)
fileName = os.path.splitext(fileName)[0] # yes, twice, it has 2 extensions
baseName = modelPath + '' + os.path.splitext(fileName)[0]
outModel = baseName +'.model'

# read original training file
df = pd.read_csv(inF)
print (df.head())

# shuffle all examples to mix all types
df4ML = df.sample(frac=1)

# Apply the preprocessing pipeline
df4ML = preprocessingWrite(df4ML, baseName)


# Original features
#feature_cols = ['w1','L1','w2','L2','srcSubUnit','bothBPEmark','WEsim','rankW2','simRankt1','simRankWnext','simRankt10','simRankt100','l1','l2','l1/l2','lev','cosSimN2','cosSimN3','cosSimN4','levM2']
# Features to use
feature_cols = ['L2_de','L2_en','L2_es','L2_fr','srcSubUnit','bothBPEmark','WEsim','rankW2','simRankt1','simRankWnext','simRankt10','simRankt100','l1','l2','l1/l2','lev','cosSimN2','cosSimN3','cosSimN4','levM2']
#feature_cols = ['L2_de','L2_en','L2_es','L2_fr','srcSubUnit','bothBPEmark','WEsim','rankW2','l1','l2','l1/l2','lev','cosSimN2','cosSimN3','cosSimN4','levM2']

scoring = ['accuracy', 'precision_macro', 'recall_macro']
X = df4ML.loc[:, feature_cols]
y = df4ML.Gold
clf = XGBClassifier(max_depth=6, n_estimators=300, learning_rate=0.05)
# TODO: fit parameters
#clf = MLPClassifier(activation='relu', alpha=1e-05, batch_size='auto',
# beta_1=0.9, beta_2=0.999, early_stopping=False, epsilon=1e-08, hidden_layer_sizes=(8, 2), learning_rate='constant',
# learning_rate_init=0.001, max_iter=200, momentum=0.9, nesterovs_momentum=True, power_t=0.5, random_state=1, shuffle=True,
@@ -75,9 +104,8 @@ def main(inF):
#print(scores['test_accuracy'])

print("Accuracy: %0.3f (+/- %0.3f)" % (scores['test_accuracy'].mean(), scores['test_accuracy'].std()*2))
#print("Precision: %0.3f (+/- %0.3f)" % (scores['test_precision_macro'].mean(), scores['test_precision_macro'].std()*2))
clf.fit(X, y)
clf.save_model('prova.model')
clf.save_model(outModel)
##plt.bar(range(len(clf.feature_importances_)), clf.feature_importances_)
plot_importance(clf)
plt.show()
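
# Hypothetical sketch of the test-time side, kept commented out like the notes above;
# it assumes load_model on XGBClassifier mirrors the save_model call, and that the test
# dataframe went through the same preprocessing and feature_cols selection:
#clf = XGBClassifier()
#clf.load_model(outModel)
#probs = clf.predict_proba(testDf.loc[:, feature_cols])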
@@ -97,60 +125,7 @@ def main(inF):
if len(sys.argv) != 2:
sys.stderr.write('Usage: python3 %s trainingFile\n' % sys.argv[0])
sys.exit(1)
main(sys.argv[1])



# feature importance for SVMlinear
#rfecv = RFECV(estimator=clf, step=1, cv=StratifiedKFold(2),
# scoring='accuracy')
#rfecv.fit(X, y)
#print("Optimal number of features : %d" % rfecv.n_features_)
#print(rfecv.ranking_)
# Plot number of features VS. cross-validation scores
#plt.figure()
#plt.xlabel("Number of features selected")
#plt.ylabel("Cross validation score (nb of correct classifications)")
#plt.plot(range(1, len(rfecv.grid_scores_) + 1), rfecv.grid_scores_)
#plt.show()

'''
Thresh=0.003, n=20, Accuracy: 0.8712 (+/- 0.0104)
Thresh=0.003, n=20, Precision: 0.8699 (+/- 0.0113)
Thresh=0.004, n=19, Accuracy: 0.8714 (+/- 0.0068)
Thresh=0.004, n=19, Precision: 0.8701 (+/- 0.0073)
Thresh=0.006, n=18, Accuracy: 0.8721 (+/- 0.0070)
Thresh=0.006, n=18, Precision: 0.8708 (+/- 0.0075)
Thresh=0.009, n=16, Accuracy: 0.8692 (+/- 0.0088)
Thresh=0.009, n=16, Precision: 0.8679 (+/- 0.0095)
Thresh=0.011, n=15, Accuracy: 0.8712 (+/- 0.0077)
Thresh=0.011, n=15, Precision: 0.8700 (+/- 0.0080)
Thresh=0.012, n=14, Accuracy: 0.8706 (+/- 0.0068)
Thresh=0.012, n=14, Precision: 0.8693 (+/- 0.0074)
Thresh=0.019, n=13, Accuracy: 0.8696 (+/- 0.0050)
Thresh=0.019, n=13, Precision: 0.8685 (+/- 0.0060)
Thresh=0.023, n=12, Accuracy: 0.8709 (+/- 0.0054)
Thresh=0.023, n=12, Precision: 0.8698 (+/- 0.0061)
Thresh=0.027, n=11, Accuracy: 0.8696 (+/- 0.0055)
Thresh=0.027, n=11, Precision: 0.8684 (+/- 0.0067)
Thresh=0.034, n=10, Accuracy: 0.8687 (+/- 0.0060)
Thresh=0.034, n=10, Precision: 0.8674 (+/- 0.0064)
Thresh=0.043, n=9, Accuracy: 0.8687 (+/- 0.0077)
Thresh=0.043, n=9, Precision: 0.8673 (+/- 0.0083)
Thresh=0.053, n=8, Accuracy: 0.8674 (+/- 0.0109)
Thresh=0.053, n=8, Precision: 0.8658 (+/- 0.0115)
Thresh=0.062, n=7, Accuracy: 0.8682 (+/- 0.0072)
Thresh=0.062, n=7, Precision: 0.8666 (+/- 0.0071)
Thresh=0.068, n=6, Accuracy: 0.8642 (+/- 0.0083)
Thresh=0.068, n=6, Precision: 0.8625 (+/- 0.0085)
Thresh=0.073, n=5, Accuracy: 0.8622 (+/- 0.0067)
Thresh=0.073, n=5, Precision: 0.8607 (+/- 0.0069)
Thresh=0.085, n=4, Accuracy: 0.8450 (+/- 0.0108)
Thresh=0.085, n=4, Precision: 0.8442 (+/- 0.0108)
Thresh=0.099, n=3, Accuracy: 0.8085 (+/- 0.0114)
Thresh=0.099, n=3, Precision: 0.8116 (+/- 0.0118)
Thresh=0.156, n=2, Accuracy: 0.7579 (+/- 0.0113)
Thresh=0.156, n=2, Precision: 0.7605 (+/- 0.0121)
Thresh=0.208, n=1, Accuracy: 0.6871 (+/- 0.0248)
Thresh=0.208, n=1, Precision: 0.6845 (+/- 0.0248)
'''
scriptPath = os.path.dirname(os.path.abspath( __file__ ))
main(sys.argv[1], scriptPath)
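
# Example invocation (hypothetical file name):
#   python3 scripts/reranker/trainReranker.py features.en-de.csv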

