From 48d19f2a4540fd29c44c7832ff2ba55ce3dfd547 Mon Sep 17 00:00:00 2001 From: Fanwang Meng Date: Mon, 9 Oct 2023 12:11:02 -0400 Subject: [PATCH] Update the function to allow direct usage of molecular features --- b3clf/b3clf.py | 71 ++++++++++++++++++++++++++++++-------------------- b3clf/utils.py | 23 +++++++++------- 2 files changed, 57 insertions(+), 37 deletions(-) diff --git a/b3clf/b3clf.py b/b3clf/b3clf.py index 2694e6f..19fa331 100644 --- a/b3clf/b3clf.py +++ b/b3clf/b3clf.py @@ -31,26 +31,31 @@ import numpy as np from .descriptor_padel import compute_descriptors from .geometry_opt import geometry_optimize -from .utils import (get_descriptors, predict_permeability, - scale_descriptors, select_descriptors) +from .utils import ( + get_descriptors, + predict_permeability, + scale_descriptors, + select_descriptors, +) __all__ = [ "b3clf", ] -def b3clf(mol_in, - sep="\s+|\t+", - clf="xgb", - sampling="classic_ADASYN", - output="B3clf_output.xlsx", - verbose=1, - random_seed=42, - time_per_mol=-1, - keep_features="no", - keep_sdf="no", - threshold="none", - ): +def b3clf( + mol_in, + sep="\s+|\t+", + clf="xgb", + sampling="classic_ADASYN", + output="B3clf_output.xlsx", + verbose=1, + random_seed=42, + time_per_mol=-1, + keep_features="no", + keep_sdf="no", + threshold="none", +): """Use B3clf for BBB classifications with resampling strategies. Parameters @@ -110,12 +115,13 @@ def b3clf(mol_in, geometry_optimize(input_fname=mol_in, output_sdf=internal_sdf, sep=sep) - _ = compute_descriptors(sdf_file=internal_sdf, - excel_out=features_out, - output_csv=None, - timeout=None, - time_per_molecule=time_per_mol, - ) + _ = compute_descriptors( + sdf_file=internal_sdf, + excel_out=features_out, + output_csv=None, + timeout=None, + time_per_molecule=time_per_mol, + ) # Get computed descriptors X_features, info_df = get_descriptors(df=features_out) @@ -131,16 +137,25 @@ def b3clf(mol_in, # clf = get_clf(clf_str=clf, sampling_str=sampling) # Get classifier - result_df = predict_permeability(clf_str=clf, - sampling_str=sampling, - features_df=X_features, - info_df=info_df, - threshold=threshold) + result_df = predict_permeability( + clf_str=clf, + sampling_str=sampling, + mol_features=X_features, + info_df=info_df, + threshold=threshold, + ) # Get classifier - display_cols = ["ID", "SMILES", "B3clf_predicted_probability", "B3clf_predicted_label"] - - result_df = result_df[[col for col in result_df.columns.to_list() if col in display_cols]] + display_cols = [ + "ID", + "SMILES", + "B3clf_predicted_probability", + "B3clf_predicted_label", + ] + + result_df = result_df[ + [col for col in result_df.columns.to_list() if col in display_cols] + ] if verbose != 0: print(result_df) diff --git a/b3clf/utils.py b/b3clf/utils.py index 8d90387..cfe869c 100644 --- a/b3clf/utils.py +++ b/b3clf/utils.py @@ -89,9 +89,9 @@ def scale_descriptors(df): dirname = os.path.dirname(__file__) filename = os.path.join(dirname, "pre_trained", "b3clf_scaler.joblib") b3db_scaler = load(filename) - df.iloc[:, :] = b3db_scaler.transform(df) + df_new = b3db_scaler.transform(df) - return df + return df_new def get_clf(clf_str, sampling_str): @@ -125,7 +125,9 @@ def get_clf(clf_str, sampling_str): return clf -def predict_permeability(clf_str, sampling_str, features_df, info_df, threshold="none"): +def predict_permeability( + clf_str, sampling_str, mol_features, info_df, threshold="none" +): """Compute and store BBB predicted label and predicted probability to results dataframe.""" # load the threshold data @@ -133,18 +135,21 @@ def predict_permeability(clf_str, sampling_str, features_df, info_df, threshold= fpath_thres = os.path.join(dirname, "data", "B3clf_thresholds.xlsx") df_thres = pd.read_excel(fpath_thres, index_col=0, engine="openpyxl") # default threshold is 0.5 - label_pool = np.zeros(features_df.shape[0], dtype=int) + label_pool = np.zeros(mol_features.shape[0], dtype=int) # get the classifier clf = get_clf(clf_str=clf_str, sampling_str=sampling_str) - if features_df.index.tolist() != info_df.index.tolist(): - raise ValueError( - "Features_df and Info_df do not have the same index. Internal processing error" - ) + if type(mol_features) == pd.DataFrame: + if mol_features.index.tolist() != info_df.index.tolist(): + raise ValueError( + "Features_df and Info_df do not have the same index. Internal processing error" + ) # get predicted probabilities - info_df.loc[:, "B3clf_predicted_probability"] = clf.predict_proba(features_df)[:, 1] + info_df.loc[:, "B3clf_predicted_probability"] = clf.predict_proba(mol_features)[ + :, 1 + ] # get predicted label from probability using the threshold mask = np.greater_equal( info_df["B3clf_predicted_probability"].to_numpy(),