From 48d19f2a4540fd29c44c7832ff2ba55ce3dfd547 Mon Sep 17 00:00:00 2001
From: Fanwang Meng <fwmeng88@gmail.com>
Date: Mon, 9 Oct 2023 12:11:02 -0400
Subject: [PATCH] Update predict_permeability to allow direct usage of
 molecular features

---
 b3clf/b3clf.py | 71 ++++++++++++++++++++++++++++++--------------------
 b3clf/utils.py | 23 +++++++++-------
 2 files changed, 57 insertions(+), 37 deletions(-)

diff --git a/b3clf/b3clf.py b/b3clf/b3clf.py
index 2694e6f..19fa331 100644
--- a/b3clf/b3clf.py
+++ b/b3clf/b3clf.py
@@ -31,26 +31,31 @@
 import numpy as np
 from .descriptor_padel import compute_descriptors
 from .geometry_opt import geometry_optimize
-from .utils import (get_descriptors, predict_permeability,
-                    scale_descriptors, select_descriptors)
+from .utils import (
+    get_descriptors,
+    predict_permeability,
+    scale_descriptors,
+    select_descriptors,
+)
 
 __all__ = [
     "b3clf",
 ]
 
 
-def b3clf(mol_in,
-          sep="\s+|\t+",
-          clf="xgb",
-          sampling="classic_ADASYN",
-          output="B3clf_output.xlsx",
-          verbose=1,
-          random_seed=42,
-          time_per_mol=-1,
-          keep_features="no",
-          keep_sdf="no",
-          threshold="none",
-          ):
+def b3clf(
+    mol_in,
+    sep="\\s+|\t+",
+    clf="xgb",
+    sampling="classic_ADASYN",
+    output="B3clf_output.xlsx",
+    verbose=1,
+    random_seed=42,
+    time_per_mol=-1,
+    keep_features="no",
+    keep_sdf="no",
+    threshold="none",
+):
     """Use B3clf for BBB classifications with resampling strategies.
 
     Parameters
@@ -110,12 +115,13 @@ def b3clf(mol_in,
 
     geometry_optimize(input_fname=mol_in, output_sdf=internal_sdf, sep=sep)
 
-    _ = compute_descriptors(sdf_file=internal_sdf,
-                            excel_out=features_out,
-                            output_csv=None,
-                            timeout=None,
-                            time_per_molecule=time_per_mol,
-                            )
+    _ = compute_descriptors(
+        sdf_file=internal_sdf,
+        excel_out=features_out,
+        output_csv=None,
+        timeout=None,
+        time_per_molecule=time_per_mol,
+    )
 
     # Get computed descriptors
     X_features, info_df = get_descriptors(df=features_out)
@@ -131,16 +137,25 @@ def b3clf(mol_in,
     # clf = get_clf(clf_str=clf, sampling_str=sampling)
 
     # Get classifier
-    result_df = predict_permeability(clf_str=clf,
-                                     sampling_str=sampling,
-                                     features_df=X_features,
-                                     info_df=info_df,
-                                     threshold=threshold)
+    result_df = predict_permeability(
+        clf_str=clf,
+        sampling_str=sampling,
+        mol_features=X_features,
+        info_df=info_df,
+        threshold=threshold,
+    )
 
     # Get classifier
-    display_cols = ["ID", "SMILES", "B3clf_predicted_probability", "B3clf_predicted_label"]
-
-    result_df = result_df[[col for col in result_df.columns.to_list() if col in display_cols]]
+    display_cols = [
+        "ID",
+        "SMILES",
+        "B3clf_predicted_probability",
+        "B3clf_predicted_label",
+    ]
+
+    result_df = result_df[
+        [col for col in result_df.columns.to_list() if col in display_cols]
+    ]
     if verbose != 0:
         print(result_df)
 
diff --git a/b3clf/utils.py b/b3clf/utils.py
index 8d90387..cfe869c 100644
--- a/b3clf/utils.py
+++ b/b3clf/utils.py
@@ -89,9 +89,9 @@ def scale_descriptors(df):
     dirname = os.path.dirname(__file__)
     filename = os.path.join(dirname, "pre_trained", "b3clf_scaler.joblib")
     b3db_scaler = load(filename)
-    df.iloc[:, :] = b3db_scaler.transform(df)
+    df_new = b3db_scaler.transform(df)
 
-    return df
+    return df_new
 
 
 def get_clf(clf_str, sampling_str):
@@ -125,7 +125,9 @@ def get_clf(clf_str, sampling_str):
     return clf
 
 
-def predict_permeability(clf_str, sampling_str, features_df, info_df, threshold="none"):
+def predict_permeability(
+    clf_str, sampling_str, mol_features, info_df, threshold="none"
+):
     """Compute and store BBB predicted label and predicted probability to results dataframe."""
 
     # load the threshold data
@@ -133,18 +135,21 @@ def predict_permeability(clf_str, sampling_str, features_df, info_df, threshold=
     fpath_thres = os.path.join(dirname, "data", "B3clf_thresholds.xlsx")
     df_thres = pd.read_excel(fpath_thres, index_col=0, engine="openpyxl")
     # default threshold is 0.5
-    label_pool = np.zeros(features_df.shape[0], dtype=int)
+    label_pool = np.zeros(mol_features.shape[0], dtype=int)
 
     # get the classifier
     clf = get_clf(clf_str=clf_str, sampling_str=sampling_str)
 
-    if features_df.index.tolist() != info_df.index.tolist():
-        raise ValueError(
-            "Features_df and Info_df do not have the same index. Internal processing error"
-        )
+    if isinstance(mol_features, pd.DataFrame):
+        if mol_features.index.tolist() != info_df.index.tolist():
+            raise ValueError(
+                "Features_df and Info_df do not have the same index. Internal processing error"
+            )
 
     # get predicted probabilities
-    info_df.loc[:, "B3clf_predicted_probability"] = clf.predict_proba(features_df)[:, 1]
+    info_df.loc[:, "B3clf_predicted_probability"] = clf.predict_proba(mol_features)[
+        :, 1
+    ]
     # get predicted label from probability using the threshold
     mask = np.greater_equal(
         info_df["B3clf_predicted_probability"].to_numpy(),