This repository has been archived by the owner on Oct 15, 2021. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 88
/
models.py
60 lines (48 loc) · 2.39 KB
/
models.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.pipeline import Pipeline
from features import TextFeatureTransformer, BadWordCounter, FeatureStacker
from sklearn.feature_selection import SelectPercentile, chi2
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MinMaxScaler
def build_stacked_model():
    """Return an unfitted pipeline over stacked bad-word, char and word features.

    Three feature extractors are handed to ``FeatureStacker``:

    * ``BadWordCounter`` with its default settings,
    * a character-level TF-IDF vectorizer over 1- to 5-grams,
    * a word-level TF-IDF vectorizer over 1- to 3-grams that drops terms
      appearing in fewer than 3 documents.

    The stacked output goes through chi-squared feature selection keeping
    the top 16 percent of features, and then into an L2-penalized logistic
    regression (``C=4``).

    Returns:
        A ``Pipeline`` with the steps ``'vect'``, ``'select'`` and ``'logr'``.
    """
    char_tfidf = TfidfVectorizer(
        analyzer="char", ngram_range=(1, 5), binary=False)
    word_tfidf = TfidfVectorizer(
        analyzer="word", ngram_range=(1, 3), min_df=3, binary=False)

    # presumably FeatureStacker concatenates the outputs of its members
    # column-wise -- confirm against features.py
    stacked_features = FeatureStacker([
        ("badwords", BadWordCounter()),
        ("chars", char_tfidf),
        ("words", word_tfidf),
    ])

    steps = [
        ('vect', stacked_features),
        ('select', SelectPercentile(score_func=chi2, percentile=16)),
        ('logr', LogisticRegression(tol=1e-8, penalty='l2', C=4)),
    ]
    return Pipeline(steps)
def build_elasticnet_model():
    """Return an unfitted pipeline ending in an elastic-net SGD classifier.

    The feature stage mirrors the stacked layout (bad-word counter,
    character 1- to 5-gram TF-IDF, word 1- to 3-gram TF-IDF with
    ``min_df=3``), except that the bad-word counter is wrapped in its own
    small pipeline so its output is rescaled by ``MinMaxScaler`` before
    being stacked with the TF-IDF features.

    After chi-squared selection of the top 16 percent of features, the data
    is fed to an ``SGDClassifier`` using logistic loss and an elastic-net
    penalty (``l1_ratio=0.95``, ``alpha=0.0001``).

    Returns:
        A ``Pipeline`` with the steps ``'vect'``, ``'select'`` and ``'logr'``.
    """
    # Bad-word features get min-max scaled; the TF-IDF blocks are used as-is.
    scaled_badwords = Pipeline([
        ('bad', BadWordCounter()),
        ('scaler', MinMaxScaler()),
    ])
    char_tfidf = TfidfVectorizer(
        analyzer="char", ngram_range=(1, 5), binary=False)
    word_tfidf = TfidfVectorizer(
        analyzer="word", ngram_range=(1, 3), min_df=3, binary=False)
    stacked_features = FeatureStacker([
        ("badwords", scaled_badwords),
        ("chars", char_tfidf),
        ("words", word_tfidf),
    ])

    # NOTE(review): ``n_iter`` and ``loss='log'`` follow the scikit-learn API
    # this module was written against; newer releases spell these
    # ``max_iter`` and ``loss='log_loss'`` -- confirm the pinned version
    # before upgrading.
    sgd = SGDClassifier(
        loss='log',
        penalty="elasticnet",
        alpha=0.0001,
        l1_ratio=0.95,
        n_iter=20,
        shuffle=True,
    )

    steps = [
        ('vect', stacked_features),
        ('select', SelectPercentile(score_func=chi2, percentile=16)),
        ('logr', sgd),
    ]
    return Pipeline(steps)
def build_base_model():
    """Return an unfitted baseline pipeline using bad-word and char features.

    Only two extractors are stacked here: ``BadWordCounter`` and a
    character-level TF-IDF vectorizer over 1- to 5-grams (no word n-grams).
    Chi-squared selection keeps the top 18 percent of features, which are
    then classified by an L2-penalized logistic regression (``C=7``).

    Returns:
        A ``Pipeline`` with the steps ``'vect'``, ``'select'`` and ``'logr'``.
    """
    char_tfidf = TfidfVectorizer(
        analyzer="char", ngram_range=(1, 5), binary=False)
    stacked_features = FeatureStacker([
        ("badwords", BadWordCounter()),
        ("chars", char_tfidf),
    ])

    steps = [
        ('vect', stacked_features),
        ('select', SelectPercentile(score_func=chi2, percentile=18)),
        ('logr', LogisticRegression(tol=1e-8, penalty='l2', C=7)),
    ]
    return Pipeline(steps)
def build_nltk_model():
    """Return an unfitted pipeline built on ``TextFeatureTransformer``.

    The project's ``TextFeatureTransformer`` (default settings) is the sole
    feature stage. Chi-squared selection keeps the top 36 percent of its
    features, and an L2-penalized logistic regression (``C=2``) does the
    classification.

    Returns:
        A ``Pipeline`` with the steps ``'vect'``, ``'select'`` and ``'logr'``.
    """
    # presumably TextFeatureTransformer relies on NLTK, given this
    # function's name -- confirm against features.py
    steps = [
        ('vect', TextFeatureTransformer()),
        ('select', SelectPercentile(score_func=chi2, percentile=36)),
        ('logr', LogisticRegression(tol=1e-8, penalty='l2', C=2)),
    ]
    return Pipeline(steps)