Literature evaluation updates #127

Open
wants to merge 20 commits into base: main
5 changes: 4 additions & 1 deletion .gitignore
@@ -2,7 +2,10 @@
__pycache__/

wikipathways_graphs/annotated_diagram
wikipathways_graphs/literature_comparison
wikipathways_graphs/literature_comparison/*
!wikipathways_graphs/literature_comparison/Evaluation_Files/
wikipathways_graphs/literature_comparison/Evaluation_Files/*
!wikipathways_graphs/literature_comparison/Evaluation_Files/concept_idf_annotated.csv
wikipathways_graphs/pkl
wikipathways_graphs/WP*
!wikipathways_graphs/PFOCR_url_list.txt
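Because git cannot re-include a file when its parent directory is itself excluded, the hunk above ignores directory contents rather than the directories: it ignores everything under literature_comparison/, re-includes the Evaluation_Files/ directory, ignores that directory's contents, and finally re-includes the single concept_idf_annotated.csv. A minimal Python sketch to sanity-check the ladder (assuming it runs from the repository root; the first two paths are hypothetical):

import subprocess

paths = [
    "wikipathways_graphs/literature_comparison/scratch.csv",                                 # hypothetical: expect ignored
    "wikipathways_graphs/literature_comparison/Evaluation_Files/scratch.csv",                # hypothetical: expect ignored
    "wikipathways_graphs/literature_comparison/Evaluation_Files/concept_idf_annotated.csv",  # expect not ignored (stays tracked)
]
for path in paths:
    # `git check-ignore -q` exits 0 when a path is ignored and 1 when it is not
    ignored = subprocess.run(["git", "check-ignore", "-q", path]).returncode == 0
    print(path, "->", "ignored" if ignored else "not ignored")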
70 changes: 70 additions & 0 deletions BioBERT.py
@@ -0,0 +1,70 @@
# Load model directly
from transformers import AutoTokenizer, AutoModel
import torch
from torch.nn.functional import cosine_similarity, pairwise_distance

# Function to get embeddings
def get_embeddings(model, tokenizer, sentence):
    encoded_input = tokenizer(sentence, return_tensors='pt')
    with torch.no_grad():
        output = model(**encoded_input)
    # Mean-pool the last hidden states across tokens into a single vector
    return output.last_hidden_state.mean(dim=1)




tokenizer = AutoTokenizer.from_pretrained("dmis-lab/biobert-v1.1")
model = AutoModel.from_pretrained("dmis-lab/biobert-v1.1")

# # Define sentences
# sentence1 = "SKI"
# sentence2 = "SMAD1"
# sentence3 = "PYROXD2"

# # Define sentences
# sentence1 = "NCBI:6497"
# sentence2 = "NCBI:4086"
# sentence3 = "NCBI:84795"

# Define sentences
sentence1 = "STAT1"
sentence2 = "IFNG"
sentence3 = "PYROXD2"

# Get embeddings
embeddings1 = get_embeddings(model, tokenizer, sentence1)
embeddings2 = get_embeddings(model, tokenizer, sentence2)
embeddings3 = get_embeddings(model, tokenizer, sentence3)

# Calculate distances
print("Cosine Distance between", sentence1, "and", sentence2, ":", 1 - cosine_similarity(embeddings1, embeddings2).item())
print("Cosine Distance between", sentence1, "and", sentence3, ":", 1 - cosine_similarity(embeddings1, embeddings3).item())


###########################################################

# power transformation function for range increase


# def modified_power_scale(x, xmin, p):
# """ Scale the distance using a modified power function to amplify differences. """
# return (x - xmin) ** p

# # Constants
# xmin = 0.001 # Slightly less than our minimum expected value
# p = 0.25 # High power to significantly amplify differences

# # Values
# distance1 = 0.007
# distance2 = 0.008

# # Apply scaling
# scaled_distance1 = modified_power_scale(distance1, xmin, p)
# scaled_distance2 = modified_power_scale(distance2, xmin, p)

# print("Scaled Distance 1:", scaled_distance1)
# print("Scaled Distance 2:", scaled_distance2)




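Not part of the diff: a minimal sketch of how the same mean-pooling idea could embed several terms in one batched forward pass, using the attention mask so padding tokens do not dilute the average (the helper name get_batch_embeddings is illustrative, not from the PR):

from transformers import AutoTokenizer, AutoModel
import torch

tokenizer = AutoTokenizer.from_pretrained("dmis-lab/biobert-v1.1")
model = AutoModel.from_pretrained("dmis-lab/biobert-v1.1")

def get_batch_embeddings(terms):
    # Tokenize all terms together; shorter terms are padded to the longest
    enc = tokenizer(terms, return_tensors="pt", padding=True)
    with torch.no_grad():
        out = model(**enc)
    mask = enc["attention_mask"].unsqueeze(-1)           # (batch, seq, 1)
    summed = (out.last_hidden_state * mask).sum(dim=1)   # zero out padding positions
    counts = mask.sum(dim=1).clamp(min=1)                # number of real tokens per term
    return summed / counts                               # (batch, hidden)

emb = get_batch_embeddings(["STAT1", "IFNG", "PYROXD2"])
normed = torch.nn.functional.normalize(emb, dim=1)
print(1 - normed @ normed.T)                             # pairwise cosine distance matrix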
70 changes: 70 additions & 0 deletions RoBERTa.py
@@ -0,0 +1,70 @@
from transformers import RobertaTokenizer, RobertaModel
import torch
from torch.nn.functional import cosine_similarity, pairwise_distance

# Function to get embeddings
def get_embeddings(model, tokenizer, sentence):
    encoded_input = tokenizer(sentence, return_tensors='pt')
    with torch.no_grad():
        output = model(**encoded_input)
    # Mean-pool the last hidden states across tokens into a single vector
    return output.last_hidden_state.mean(dim=1)

# Initialize tokenizer and model
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
model = RobertaModel.from_pretrained('roberta-base')


# # Define sentences
# sentence1 = "SKI"
# sentence2 = "SMAD1"
# sentence3 = "PYROXD2"

# # Define sentences
# sentence1 = "NCBI:6497"
# sentence2 = "NCBI:4086"
# sentence3 = "NCBI:84795"


# Define sentences
sentence1 = "STAT1"
sentence2 = "IFNG"
sentence3 = "PYROXD2"

# Get embeddings
embeddings1 = get_embeddings(model, tokenizer, sentence1)
embeddings2 = get_embeddings(model, tokenizer, sentence2)
embeddings3 = get_embeddings(model, tokenizer, sentence3)

# Calculate distances
print("Cosine Distance between", sentence1, "and", sentence2, ":", 1 - cosine_similarity(embeddings1, embeddings2).item())
print("Cosine Distance between", sentence1, "and", sentence3, ":", 1 - cosine_similarity(embeddings1, embeddings3).item())



# ###########################################################

# # power transformation function for range increase


# def modified_power_scale(x, xmin, p):
# """ Scale the distance using a modified power function to amplify differences. """
# return (x - xmin) ** p

# # Constants
# xmin = 0.001 # Slightly less than our minimum expected value
# p = 0.25 # High power to significantly amplify differences

# # Values
# distance1 = 0.007
# distance2 = 0.008

# # Apply scaling
# scaled_distance1 = modified_power_scale(distance1, xmin, p)
# scaled_distance2 = modified_power_scale(distance2, xmin, p)

# print("Scaled Distance 1:", scaled_distance1)
# print("Scaled Distance 2:", scaled_distance2)




47 changes: 47 additions & 0 deletions biomed_RoBERTa.py
@@ -0,0 +1,47 @@
# from transformers import RobertaTokenizer, RobertaModel
import torch
from torch.nn.functional import cosine_similarity, pairwise_distance

# Function to get embeddings
def get_embeddings(model, tokenizer, sentence):
    encoded_input = tokenizer(sentence, return_tensors='pt')
    with torch.no_grad():
        output = model(**encoded_input)
    # Mean-pool the last hidden states across tokens into a single vector
    return output.last_hidden_state.mean(dim=1)

# function to find cosine similarity
def get_cosine_similarity(embedding1, embedding2):
    return cosine_similarity(embedding1, embedding2).item()

# # Initialize tokenizer and model
# tokenizer = RobertaTokenizer.from_pretrained('allenai/biomed_roberta_base')
# model = RobertaModel.from_pretrained('allenai/biomed_roberta_base')




# # Define sentences
# sentence1 = "SKI"
# sentence2 = "SMAD1"
# sentence3 = "PYROXD2"

# # Define sentences
# sentence1 = "NCBI:6497"
# sentence2 = "NCBI:4086"
# sentence3 = "NCBI:84795"


# # Define sentences
# sentence1 = "STAT1"
# sentence2 = "IFNG"
# sentence3 = "PYROXD2"

# # Get embeddings
# embeddings1 = get_embeddings(model, tokenizer, sentence1)
# embeddings2 = get_embeddings(model, tokenizer, sentence2)
# embeddings3 = get_embeddings(model, tokenizer, sentence3)

# # Calculate distances
# print("Cosine Distance between", sentence1, "and", sentence2, ":", 1 - cosine_similarity(embeddings1, embeddings2).item())
# print("Cosine Distance between", sentence1, "and", sentence3, ":", 1 - cosine_similarity(embeddings1, embeddings3).item())

2 changes: 2 additions & 0 deletions environment.yml
Original file line number Diff line number Diff line change
@@ -34,3 +34,5 @@ dependencies:
- networkx==3.2.1
- requests==2.31.0
- oaklib==0.5.33
- torch==2.4.0
- transformers==4.44.0
6 changes: 3 additions & 3 deletions evaluation.py
@@ -300,7 +300,7 @@ def output_num_paths_pairs(output_dir,num_paths_df,subgraph_algorithm):
num_paths_df.to_csv(output_folder+'/num_paths_'+subgraph_algorithm+'.csv',sep=',',index=False)
logging.info('Create number of paths file: %s',output_folder+'/num_paths_'+subgraph_algorithm+'.csv')

def output_literature_comparison_df(output_dir,all_subgraphs_cosine_sim):
def output_literature_comparison_df(output_dir,all_subgraphs_cosine_sim,search_type):

output_folder = output_dir+'/Evaluation_Files'
#Check for existence of output directory
@@ -309,8 +309,8 @@ def output_literature_comparison_df(output_dir,all_subgraphs_cosine_sim):

all_subgraphs_cosine_sim_df = pd.DataFrame.from_dict(all_subgraphs_cosine_sim, orient='columns')

all_subgraphs_cosine_sim_df.to_csv(output_folder+'/literature_comparison_evaluation.csv',sep=',',index=False)
logging.info('Create literature comparison evaluation file: %s',output_folder+'/literature_comparison_evaluation.csv')
all_subgraphs_cosine_sim_df.to_csv(output_folder+'/literature_comparison_evaluation_' + search_type + '.csv',sep=',',index=False)
logging.info('Create literature comparison evaluation file: %s',output_folder+'/literature_comparison_evaluation_' + search_type + '.csv')

return all_subgraphs_cosine_sim_df

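A hypothetical call showing the effect of the new argument (the directory and search_type values here are invented for illustration): because search_type is appended to the CSV name, evaluations run with different literature search types no longer overwrite one another.

all_subgraphs_cosine_sim_df = output_literature_comparison_df(
    output_dir="wikipathways_graphs/WP4564_output",       # hypothetical output directory
    all_subgraphs_cosine_sim=all_subgraphs_cosine_sim,    # dict of cosine similarities per subgraph
    search_type="abstract",                               # assumed search-type label
)
# -> .../Evaluation_Files/literature_comparison_evaluation_abstract.csv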
15 changes: 7 additions & 8 deletions evaluation_plots_all.py
@@ -148,15 +148,15 @@ def visualize_literature_comparison_heatmap(term_averages_cosine_sim_df,all_wiki


#Generates boxplot of each
def visualize_literature_comparison_boxplot_all_pathways(all_subgraphs_zscore_df,all_wikipathways_dir):
def visualize_literature_comparison_boxplot_all_pathways(all_subgraphs_zscore_df,all_wikipathways_dir,search_type):

output_folder = all_wikipathways_dir+'/literature_comparison/Evaluation_Files'

all_subgraphs_other_pathways = all_subgraphs_zscore_df.loc[all_subgraphs_zscore_df.Compared_Pathway != "Same_Pathway"]

all_subgraphs_same_pathway = all_subgraphs_zscore_df.loc[all_subgraphs_zscore_df.Compared_Pathway == "Same_Pathway"]

plt_file = output_folder + '/Literature_Comparison_all_pathways_boxplot.png'
plt_file = output_folder + '/Literature_Comparison_all_pathways_boxplot_' + search_type + '.png'
sns.swarmplot(data=all_subgraphs_same_pathway, x="Pathway_ID", y="avg_zscore_per_pathway",hue='Algorithm',palette="flare",dodge=True, legend=False, marker="x", linewidth=1,size=10)
sns.swarmplot(data=all_subgraphs_other_pathways, x="Pathway_ID", y="avg_zscore_per_pathway",hue="Algorithm", dodge=True, legend=False)
sns.boxplot(data=all_subgraphs_other_pathways, x='Pathway_ID', y = 'avg_zscore_per_pathway',hue='Algorithm').set_title("Z-Score of Cosine Similarity to All Pathway Abstracts")
@@ -165,14 +165,14 @@ def visualize_literature_comparison_boxplot_all_pathways(all_subgraphs_zscore_df
plt.close()
logging.info('Created png: %s',plt_file)

def visualize_literature_comparison_scatterplot_all_pathways(all_subgraphs_zscore_df,all_wikipathways_dir):
def visualize_literature_comparison_scatterplot_all_pathways(all_subgraphs_zscore_df,all_wikipathways_dir,search_type):

pathways = all_subgraphs_zscore_df.Pathway_ID.unique()

for pathway in pathways:

df = all_subgraphs_zscore_df.loc[all_subgraphs_zscore_df['Pathway_ID'] == pathway]
plt_file = all_wikipathways_dir + '/' + pathway + '_output/Evaluation_Files/Literature_Comparison_all_pathways_scatterplot.png'
plt_file = all_wikipathways_dir + '/' + pathway + '_output/Evaluation_Files/Literature_Comparison_all_pathways_scatterplot_' + search_type + '.png'
sns_plot = sns.swarmplot(data=df, x='Algorithm', y = 'avg_zscore_per_pathway',hue='Compared_Pathway')
sns.lineplot(x="Algorithm", dashes=False, y="avg_zscore_per_pathway", hue="Compared_Pathway", style="Compared_Pathway", data=df,legend=False).set_title("Z-Score of Cosine Similarity to All Pathway Abstracts for " + pathway)
sns.move_legend(sns_plot,"upper left", bbox_to_anchor=(1, 1))
@@ -181,14 +181,13 @@ def visualize_literature_comparison_scatterplot_all_pathways(all_subgraphs_zscor
plt.close()
logging.info('Created png: %s',plt_file)

def visualize_literature_comparison_heatmap_all_pathways(all_subgraphs_zscore_df,all_wikipathways_dir):
def visualize_literature_comparison_heatmap_all_pathways(all_subgraphs_zscore_df,all_wikipathways_dir,search_type):

output_folder = all_wikipathways_dir+'/literature_comparison/Evaluation_Files'

plt_file = output_folder + '/Literature_Comparison_all_pathways_heatmap.png'
plt_file = output_folder + '/Literature_Comparison_all_pathways_heatmap_' + search_type + '.png'
df_matrix = all_subgraphs_zscore_df.pivot_table(index='Pathway_ID',columns='Algorithm',values='avg_zscore_per_pathway')
sns.heatmap(df_matrix, fmt="g", cmap='viridis').set_title("Z-Score of Subgraphs to All Other Pathways")
plt.savefig(plt_file,bbox_inches="tight")
plt.close()
logging.info('Created png: %s',plt_file)

logging.info('Created png: %s',plt_file)