Commit: update_embeddings_source
bakhatar committed Feb 20, 2024
1 parent 0d6471f commit 0578eea
Showing 9 changed files with 57,117 additions and 2,530 deletions.
43 changes: 20 additions & 23 deletions actions/actions.py
@@ -37,9 +37,11 @@ def name(self) -> str:
##new
def run(self, dispatcher: CollectingDispatcher, tracker: Tracker, domain) -> list:
user_message_all = tracker.latest_message.get('text')
input_weight=tracker.latest_message.get('embedding')
# print("######tracker.latest_message",input_weight)
# Generate recommendations
try:
df_recommendations = provide_recommendations(user_message_all, THRESH=0.3, n=1000, unique_values_dict=unique_values_dict, BERT_weights=BERT_weights,n_module=n_module)
df_recommendations = provide_recommendations(user_message_all, input_weight,THRESH=0.3, n=1000, unique_values_dict=unique_values_dict, BERT_weights=BERT_weights,n_module=n_module)
dataframe_json = df_recommendations.to_json(orient='split')
except Exception as e:
print(e)
@@ -51,14 +53,12 @@ def run(self, dispatcher: CollectingDispatcher, tracker: Tracker, domain) -> list:
return [SlotSet("user_question", user_message_all)]
# Generate and send module buttons
module_ids, module_names = module_recommendations(df_recommendations, n=n_module)
print(module_ids,module_names)
button_list = [{"title": name, "payload": f'/inform_module{{"module_id":"{str(module_id)}"}}'} for module_id, name in zip(module_ids, module_names)]
print("button_list",button_list)
        dispatcher.utter_message(text="اختر الوحدة المتعلقة بسؤالك", buttons=button_list)  # "Choose the module related to your question"

# Set the user_question value in a slot for future use
return [SlotSet("user_question", user_message_all) , SlotSet("my_dataframe_slot", dataframe_json)]
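
The updated `run` no longer encodes the user text itself: it expects a precomputed vector under `tracker.latest_message.get('embedding')` and forwards it to `provide_recommendations` as `input_weight`. For that key to exist, something upstream in the NLU pipeline has to attach it to the parsed message. Below is a minimal sketch of such a component, assuming Rasa 3.x graph components; the class name, config key, and model choice are illustrative, not taken from this repository. The button payloads, by contrast, use Rasa's standard intent-trigger syntax (`/inform_module{"module_id": "..."}`), which fires the `inform_module` intent with a `module_id` entity when the user taps a button.

from typing import Any, Dict, List

from rasa.engine.graph import ExecutionContext, GraphComponent
from rasa.engine.recipes.default_recipe import DefaultV1Recipe
from rasa.engine.storage.resource import Resource
from rasa.engine.storage.storage import ModelStorage
from rasa.shared.nlu.training_data.message import Message
from sentence_transformers import SentenceTransformer


@DefaultV1Recipe.register(
    [DefaultV1Recipe.ComponentType.MESSAGE_FEATURIZER], is_trainable=False
)
class SentenceEmbeddingAnnotator(GraphComponent):
    """Attaches a sentence-transformers embedding to each parsed message."""

    @classmethod
    def create(
        cls,
        config: Dict[str, Any],
        model_storage: ModelStorage,
        resource: Resource,
        execution_context: ExecutionContext,
    ) -> "SentenceEmbeddingAnnotator":
        return cls(config)

    def __init__(self, config: Dict[str, Any]) -> None:
        # Assumption: this must be the same model that produced the stored
        # BERT_weights in functions.py; the reshape to 768 dims there suggests
        # a 768-dimensional model such as LaBSE.
        self._model = SentenceTransformer(
            config.get("model", "sentence-transformers/LaBSE")
        )

    def process(self, messages: List[Message]) -> List[Message]:
        for message in messages:
            vector = self._model.encode(message.get("text") or "")
            # Stored under "embedding" so the action server can read it via
            # tracker.latest_message.get('embedding').
            message.set("embedding", vector.tolist(), add_to_output=True)
        return messages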


class ActionReselectModule(Action):
def name(self) -> str:
return "action_reselect_module"
@@ -110,29 +110,28 @@ def run(self, dispatcher, tracker, domain):
try:
my_dataframe_slot = tracker.get_slot('my_dataframe_slot')
df_rslt = pd.read_json(my_dataframe_slot, orient='split')

try:
module_number = tracker.get_slot('module_id')
situation_ids,situation_names=situation_recommendations(df_rslt,int(module_number),n=n_situation)
if situation_ids==[]:
                    dispatcher.utter_message("لا يوجد السياق متاح في هذه الوحدة")  # "No situation is available in this module"
else:

                        button_list = [{"title": situation_names[i], "payload": f'/inform_situation{{"situation_id":"{str(situation_ids[i])}"}}' } for i in range(len(situation_ids))]
                        button_list.append({"title": "انقر هنا لإعادة إختيار الوحدة", "payload": '/rechoisir_module'})  # "Click here to re-select the module"
                        dispatcher.utter_message(text="اختر السياق الأقرب إلى سؤالك", buttons=button_list)  # "Choose the situation closest to your question"
return []
except Exception as e:
print(e)
                dispatcher.utter_message("!! الرجاء المحاولة مرة أخرى")  # "Please try again!"
return []
except Exception as e:
print(e)
            dispatcher.utter_message(" !! خلل في تحميل البيانات")  # "Error while loading the data!"
return []
try:
module_number = tracker.get_slot('module_id')
print("module_number",module_number)
situation_ids,situation_names=situation_recommendations(df_rslt,int(module_number),n=n_situation)

except Exception as e:
print(e)
dispatcher.utter_message("!! الرجاء المحاولة مرة أخرى")
return []


if situation_ids==[]:
dispatcher.utter_message("لا يوجد السياق متاح في هذه الوحدة")
else:

button_list = [{"title": situation_names[i], "payload": f'/inform_situation{{"situation_id":"{str(situation_ids[i])}"}}' } for i in range(len(situation_ids))]
button_list.append({"title": "انقر هنا لإعادة إختيار الوحدة", "payload": '/rechoisir_module'})
dispatcher.utter_message(text= "اختر السياق الأقرب إلى سؤالك",buttons=button_list)
return []

class ActionGetsituationId(Action):
def name(self):
@@ -167,7 +166,6 @@ def run(self, dispatcher, tracker, domain):
return []
try:
situation_number = tracker.get_slot('situation_id')
print("situation_number",situation_number)
question_ids,question_names,reste,reste_question=question_recommendations(df_rslt,int(situation_number),n=n_question)

except Exception as e:
@@ -207,7 +205,6 @@ def run(self, dispatcher, tracker, domain):
# Access the ID from the slot
try:
question_number = tracker.get_slot('question_id')
print("question_number",question_number)
response=get_responses(int(question_number))

except Exception as e:
77 changes: 14 additions & 63 deletions actions/functions.py
@@ -1,13 +1,11 @@
import pickle
import json, os, math
from sentence_transformers import SentenceTransformer
from sentence_transformers.util import pytorch_cos_sim
import numpy as np
import pandas as pd
print("############################# IMPORTING UTILS")
BERTmodel_names=['paraphrase-multilingual-MiniLM-L12-v2','medmediani/Arabic-KW-Mdel','Ezzaldin-97/STS-Arabert','distiluse-base-multilingual-cased-v1','sentence-transformers/LaBSE']

# data_path="mydata/Base_RGPH24V6+_all_ID"
# data_path="mydata/Base_RGPH24V7_all_ID"
data_path="/app/actions/mydata/Base_RGPH24V6+_all_ID"

print('################################## actions run using', data_path )
@@ -38,14 +36,8 @@
with open(dict_path, 'w', encoding='utf-8') as file:
json.dump(unique_values_dict, file, ensure_ascii=False, indent=2)


BERTmodel_name=BERTmodel_names[0]
situations_list = list(unique_values_dict.keys())
# BERTmodel_name=BERTmodel_names[-1]
BERT_model=SentenceTransformer(BERTmodel_name )
pkl_path=data_path+BERTmodel_name.split('/')[0]+'situations_embeddings.pkl'

print('################# using MODEL:', BERTmodel_name)
pkl_path=data_path+'rasa_situations_embeddings.pkl'
print('################# using embeddings MODEL:', pkl_path)
### initialize weights

##new
@@ -54,17 +46,15 @@
# Load sentences & embeddings from disk
with open(pkl_path, "rb") as fIn:
stored_data = pickle.load(fIn)
situations = stored_data['situations']
    situations_list = list(stored_data['situations'].keys())
BERT_weights = stored_data['BERT_weights']
print("BERT model found")
corpus_size=len(BERT_weights)
BERT_weights = BERT_weights.reshape((corpus_size, 768))

print("Embeddings found")
else:
# Encode using BERT model and save to disk
BERT_weights = BERT_model.encode(situations_list, convert_to_tensor=True, show_progress_bar=False)
print("BERT model fine-tuned")
with open(pkl_path, "wb") as fOut:
pickle.dump({'situations': unique_values_dict, 'BERT_weights': BERT_weights},
fOut, protocol=pickle.HIGHEST_PROTOCOL)
print("BERT model saved")
print("Embeddings not found")

print("#################################First load script ended #####################")
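
The load block above now reads precomputed vectors from `rasa_situations_embeddings.pkl` instead of encoding `situations_list` at startup, which is what the commit name, update_embeddings_source, points to. The pickle layout is inferred from the loading code and the old save branch: a dict with a `situations` mapping and a `BERT_weights` array reshaped to `(corpus_size, 768)`. A minimal offline sketch that would produce such a file follows; the model choice is an assumption, picked only because it emits 768-dimensional vectors (the default `paraphrase-multilingual-MiniLM-L12-v2` emits 384, which would not match the reshape). `unique_values_dict` and `data_path` are the module-level names defined earlier in this file.

import pickle

from sentence_transformers import SentenceTransformer

model = SentenceTransformer('sentence-transformers/LaBSE')  # 768-dim output (assumption)

# unique_values_dict maps each situation string to its metadata; the keys are
# the texts to embed, mirroring situations_list in the loading code above.
situations_list = list(unique_values_dict.keys())
BERT_weights = model.encode(situations_list, show_progress_bar=True)

with open(data_path + 'rasa_situations_embeddings.pkl', 'wb') as fOut:
    pickle.dump({'situations': unique_values_dict, 'BERT_weights': BERT_weights},
                fOut, protocol=pickle.HIGHEST_PROTOCOL)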

# Function to load data from the file
@@ -74,8 +64,7 @@ def load_data_from_file(file_path):
return data['definitions_dict'], data['module_titles'],data['choose_qst_variations'],data['definitions_dict_old'],data["other_qst_variations"]

## new
def provide_recommendations(user_input,THRESH, n, unique_values_dict,BERT_weights,n_module=n_module):
input_weight=BERT_model.encode(user_input, show_progress_bar = True,convert_to_tensor=True)
def provide_recommendations(user_input,input_weight,THRESH, n, unique_values_dict,BERT_weights,n_module=n_module):
cosine_scores = pytorch_cos_sim(input_weight, BERT_weights)
cosine_scores = cosine_scores.cpu().numpy()
# Assuming cosine_scores is a 2D numpy array
@@ -103,9 +92,7 @@ def provide_recommendations(user_input,THRESH, n, unique_values_dict,BERT_weights,n_module=n_module):
df_rec=df_rec[df_rec.module_ID.isin(module_ids)]
df_resp=add_resp_ids(df,df_rec)
df_with_qst=add_qst_ids(df,df_resp)
# df_with_qst.to_excel(path,index=False)
# pd.DataFrame(ordred_situations_IDs[:n]).to_excel(path,index=False)##TODO : change to json , and try to use trackers
# return(pd.DataFrame(ordred_situations_IDs[:n]))
# df_with_qst.to_excel('records/rslt'+user_input+".xlsx",index=False)
return(df_with_qst)
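
With the encoder gone from the action server, `provide_recommendations` is now a pure scoring function: it takes the precomputed `input_weight`, compares it against `BERT_weights` with `pytorch_cos_sim`, and keeps matches above `THRESH`. A condensed sketch of that scoring core, under the shapes implied above (one query vector against a corpus_size × dim matrix; the helper name is illustrative):

import numpy as np
from sentence_transformers.util import pytorch_cos_sim

def rank_situations(input_weight, BERT_weights, situations_list, thresh=0.3):
    # pytorch_cos_sim returns a (1, corpus_size) tensor of similarities.
    scores = pytorch_cos_sim(input_weight, BERT_weights).cpu().numpy()[0]
    order = np.argsort(-scores)                       # best match first
    keep = [i for i in order if scores[i] >= thresh]  # apply the THRESH cutoff
    return [(situations_list[i], float(scores[i])) for i in keep]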

##new
@@ -139,48 +126,16 @@ def add_resp_ids(df, df_rslt):
df_rslt['ralated_responses']=ids
return df_rslt

##new !!!!!!!!!!!!! not working
# def add_resp_ids(df, df_rslt):
# # Pre-filter DataFrames or Series for each category
# question_ids = df.set_index('Question_ID')['response_ID']
# tags_ids = df.set_index('Tags_ID')['response_ID']
# tags_sit_ids = df.set_index('tags_sit_ID')['response_ID']
# situation_ids = df.set_index('Situation_ID')['response_ID']
# # section_ids = df.set_index('Section_ID')['response_ID']
# # Function to get related response IDs based on category
# def get_related_responses(row):
# if row['category'] == 'question':
# return question_ids.get(row["element_ID"], []).unique()
# if row['category'] == 'tags':
# return tags_ids.get(row["element_ID"], []).unique()
# if row['category'] == 'situation Tags':
# return tags_sit_ids.get(row["element_ID"], []).unique()
# if row['category'] == 'situation':
# return situation_ids.get(row["element_ID"], []).unique()
# # if row['category'] == 'section':
# # return section_ids.get(row["element_ID"], []).unique()
# return []
# # Apply the function to each row of df_rslt
# df_rslt['ralated_responses'] = df_rslt.apply(get_related_responses, axis=1)
# return df_rslt

def module_recommendations(df_rslt,n=n_module):
module_ids = df_rslt['module_ID'].unique()[:n].tolist()
module_names=[df[df.Module_ID==module_id].module.unique().tolist()[0] for module_id in module_ids]
    # df[df.Module_ID.isin(module_ids)].module.unique().tolist()  # no-op: result was unused
return module_ids,module_names

##new
# def module_recommendations(df_rslt, n=n_module):
# # Get the first n unique module IDs
# module_ids = df_rslt['module_ID'].dropna().unique()[:n]
# # Retrieve module names for these IDs
# module_names = df[df.Module_ID.isin(module_ids)]['module'].drop_duplicates().tolist()
# return module_ids.tolist(), module_names


##new
def situation_recommendations(df_rslt, module_id, n=n_situation, nan_id=5):
def situation_recommendations(df_rslt, module_id, n=n_situation, nan_id=4):
# Filter the DataFrame and get unique situation IDs, excluding nan_id
filtered_situation_ids = df_rslt[(df_rslt['module_ID'] == int(module_id)) & (df_rslt['situation_ID'] != nan_id)]['situation_ID'].unique()
# Limit the number of situation IDs to n
@@ -221,8 +176,4 @@ def question_recommendations(df_rslt_with_qst, situation_ID, n=n_question):
def get_responses(question_id):
response=df[df.Question_ID==question_id]['Réponse Quasi-finale'].unique().tolist()
return(response)
# ##new
# def get_responses(question_id):
# # Use loc for efficient row selection and drop_duplicates for unique responses
# response = df.loc[df.Question_ID == question_id, 'Réponse Quasi-finale'].drop_duplicates().tolist()
# return response
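
Taken together, the helpers in this file implement the drill-down that the actions in actions.py walk through: modules, then situations, then questions, then the final response. A usage sketch, assuming `df_recommendations` was produced by `provide_recommendations` and the user picks the first button at each step:

module_ids, module_names = module_recommendations(df_recommendations, n=n_module)
situation_ids, situation_names = situation_recommendations(
    df_recommendations, module_ids[0], n=n_situation)
question_ids, question_names, reste, reste_question = question_recommendations(
    df_recommendations, situation_ids[0], n=n_question)
responses = get_responses(question_ids[0])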
