Skip to content

Commit

Permalink
Merge pull request dssg#1 from dssg/triage_transfer
Browse files Browse the repository at this point in the history
Import code from triage
  • Loading branch information
thcrock authored Jul 17, 2017
2 parents 765faa3 + efd6e61 commit cacc2df
Show file tree
Hide file tree
Showing 23 changed files with 3,671 additions and 0 deletions.
24 changes: 24 additions & 0 deletions catwalk/db.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
import yaml
from sqlalchemy import create_engine
from sqlalchemy.engine.url import URL
from sqlalchemy.pool import QueuePool

from results_schema import *


def ensure_db(engine):
    """Create all tables declared on the results_schema ``Base`` metadata.

    Idempotent: ``create_all`` only creates tables that do not already
    exist in the target database.

    Args:
        engine: a SQLAlchemy engine connected to the target database
    """
    Base.metadata.create_all(engine)


def connect(poolclass=QueuePool):
    """Create a SQLAlchemy engine from credentials in ``./database.yaml``.

    The YAML file (resolved relative to the current working directory)
    must contain the keys ``host``, ``user``, ``db``, ``pass`` and
    ``port``.

    Args:
        poolclass: SQLAlchemy connection-pool class (default QueuePool)

    Returns:
        A SQLAlchemy Engine bound to the configured PostgreSQL database.

    Raises:
        IOError: if database.yaml cannot be opened
        KeyError: if a required key is missing from the profile
    """
    with open('database.yaml') as f:
        # safe_load only constructs plain YAML types; plain yaml.load can
        # instantiate arbitrary Python objects from tagged config input.
        profile = yaml.safe_load(f)
    dbconfig = {
        'host': profile['host'],
        'username': profile['user'],
        'database': profile['db'],
        'password': profile['pass'],
        'port': profile['port'],
    }
    # 'postgresql' is the canonical dialect name; 'postgres' is a
    # deprecated alias that newer SQLAlchemy releases reject.
    dburl = URL('postgresql', **dbconfig)
    return create_engine(dburl, poolclass=poolclass)
Empty file added catwalk/estimators/__init__.py
Empty file.
78 changes: 78 additions & 0 deletions catwalk/estimators/classifiers.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
# coding: utf-8

from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression

from catwalk.estimators.transformers import CutOff

class ScaledLogisticRegression(BaseEstimator, ClassifierMixin):
    """
    A drop-in replacement for scikit-learn's LogisticRegression that
    scales features to [0, 1] with MinMaxScaler and then applies CutOff
    before fitting the logistic regression, via an internal Pipeline.

    The constructor accepts the same hyperparameters as
    sklearn.linear_model.LogisticRegression and forwards them unchanged.

    NOTE(review): sklearn convention is to only store constructor args in
    __init__ and build sub-estimators inside fit; instantiating the
    Pipeline here may interact oddly with clone()/get_params() — confirm
    against the sklearn estimator-development guidelines.
    """
    def __init__(self, penalty='l2', dual=False, tol=1e-4, C=1.0,
                 fit_intercept=True, intercept_scaling=1, class_weight=None,
                 random_state=None, solver='liblinear', max_iter=100,
                 multi_class='ovr', verbose=0, warm_start=False, n_jobs=1):

        # Store every hyperparameter under its own name (required by the
        # sklearn BaseEstimator get_params/set_params protocol).
        self.penalty = penalty
        self.dual = dual
        self.tol = tol
        self.C = C
        self.fit_intercept = fit_intercept
        self.intercept_scaling = intercept_scaling
        self.class_weight = class_weight
        self.random_state = random_state
        self.solver = solver
        self.max_iter = max_iter
        self.multi_class = multi_class
        self.verbose = verbose
        self.warm_start = warm_start
        self.n_jobs = n_jobs

        # Sub-estimators: scale to [0, 1], clip out-of-range values
        # (CutOff is the project-local transformer imported above), then
        # fit the logistic regression on the prepared features.
        self.minmax_scaler = MinMaxScaler()
        self.dsapp_cutoff = CutOff()
        self.lr = LogisticRegression(penalty=penalty, dual=dual, tol=tol, C=C,
                                     fit_intercept=fit_intercept, intercept_scaling=intercept_scaling, class_weight=class_weight,
                                     random_state=random_state, solver=solver, max_iter=max_iter,
                                     multi_class=multi_class, verbose=verbose, warm_start=warm_start, n_jobs=n_jobs)

        self.pipeline =Pipeline([
            ('minmax_scaler', self.minmax_scaler),
            ('dsapp_cutoff', self.dsapp_cutoff),
            ('lr', self.lr)
        ])


    def fit(self, X, y = None):
        """Fit the scaler/cutoff/regression pipeline on X, y.

        After fitting, the fitted attributes of the internal scaler and
        regression are copied onto self — presumably so this object
        exposes the same trailing-underscore attributes as a plain
        LogisticRegression (verify against callers).

        Returns:
            self
        """
        self.pipeline.fit(X, y)

        # Fitted MinMaxScaler attributes.
        self.min_ = self.pipeline.named_steps['minmax_scaler'].min_
        self.scale_ = self.pipeline.named_steps['minmax_scaler'].scale_
        self.data_min_ = self.pipeline.named_steps['minmax_scaler'].data_min_
        self.data_max_ = self.pipeline.named_steps['minmax_scaler'].data_max_
        self.data_range_ = self.pipeline.named_steps['minmax_scaler'].data_range_

        # Fitted LogisticRegression attributes.
        self.coef_ = self.pipeline.named_steps['lr'].coef_
        self.intercept_ = self.pipeline.named_steps['lr'].intercept_

        self.classes_ = self.pipeline.named_steps['lr'].classes_

        return self

    def predict_proba(self, X):
        """Return class-probability estimates from the fitted pipeline."""
        return self.pipeline.predict_proba(X)

    def predict_log_proba(self, X):
        """Return log of class-probability estimates from the fitted pipeline."""
        return self.pipeline.predict_log_proba(X)

    def predict(self, X):
        """Return predicted class labels from the fitted pipeline."""
        return self.pipeline.predict(X)

    def score(self, X, y):
        """Return the pipeline's mean accuracy on X against labels y."""
        return self.pipeline.score(X,y)
Loading

0 comments on commit cacc2df

Please sign in to comment.