Commit: update_embeddings_source
bakhatar committed Feb 20, 2024
1 parent 0d6471f commit 0578eea
Showing 9 changed files with 57,117 additions and 2,530 deletions.
43 changes: 20 additions & 23 deletions actions/actions.py
@@ -37,9 +37,11 @@ def name(self) -> str:
##new
def run(self, dispatcher: CollectingDispatcher, tracker: Tracker, domain) -> list:
user_message_all = tracker.latest_message.get('text')
input_weight=tracker.latest_message.get('embedding')
# print("######tracker.latest_message",input_weight)
# Generate recommendations
try:
df_recommendations = provide_recommendations(user_message_all, THRESH=0.3, n=1000, unique_values_dict=unique_values_dict, BERT_weights=BERT_weights,n_module=n_module)
df_recommendations = provide_recommendations(user_message_all, input_weight,THRESH=0.3, n=1000, unique_values_dict=unique_values_dict, BERT_weights=BERT_weights,n_module=n_module)
dataframe_json = df_recommendations.to_json(orient='split')
except Exception as e:
print(e)
@@ -51,14 +53,12 @@ def run(self, dispatcher: CollectingDispatcher, tracker: Tracker, domain) -> list:
return [SlotSet("user_question", user_message_all)]
# Generate and send module buttons
module_ids, module_names = module_recommendations(df_recommendations, n=n_module)
print(module_ids,module_names)
button_list = [{"title": name, "payload": f'/inform_module{{"module_id":"{str(module_id)}"}}'} for module_id, name in zip(module_ids, module_names)]
print("button_list",button_list)
        dispatcher.utter_message(text="اختر الوحدة المتعلقة بسؤالك", buttons=button_list)  # "Choose the module related to your question"

# Set the user_question value in a slot for future use
return [SlotSet("user_question", user_message_all) , SlotSet("my_dataframe_slot", dataframe_json)]
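
The updated `run` no longer encodes the user text itself: it expects a precomputed vector under `tracker.latest_message.get('embedding')` and forwards it to `provide_recommendations` as `input_weight`. For that key to exist, something upstream in the NLU pipeline has to attach it to the parsed message. Below is a minimal sketch of such a component, assuming Rasa 3.x graph components; the class name, config key, and model choice are illustrative, not taken from this repository. The button payloads, by contrast, use Rasa's standard intent-trigger syntax (`/inform_module{"module_id": "..."}`), which fires the `inform_module` intent with a `module_id` entity when the user taps a button.

from typing import Any, Dict, List

from rasa.engine.graph import ExecutionContext, GraphComponent
from rasa.engine.recipes.default_recipe import DefaultV1Recipe
from rasa.engine.storage.resource import Resource
from rasa.engine.storage.storage import ModelStorage
from rasa.shared.nlu.training_data.message import Message
from sentence_transformers import SentenceTransformer


@DefaultV1Recipe.register(
    [DefaultV1Recipe.ComponentType.MESSAGE_FEATURIZER], is_trainable=False
)
class SentenceEmbeddingAnnotator(GraphComponent):
    """Attaches a sentence-transformers embedding to each parsed message."""

    @classmethod
    def create(
        cls,
        config: Dict[str, Any],
        model_storage: ModelStorage,
        resource: Resource,
        execution_context: ExecutionContext,
    ) -> "SentenceEmbeddingAnnotator":
        return cls(config)

    def __init__(self, config: Dict[str, Any]) -> None:
        # Assumption: this must be the same model that produced the stored
        # BERT_weights in functions.py; the reshape to 768 dims there suggests
        # a 768-dimensional model such as LaBSE.
        self._model = SentenceTransformer(
            config.get("model", "sentence-transformers/LaBSE")
        )

    def process(self, messages: List[Message]) -> List[Message]:
        for message in messages:
            vector = self._model.encode(message.get("text") or "")
            # Stored under "embedding" so the action server can read it via
            # tracker.latest_message.get('embedding').
            message.set("embedding", vector.tolist(), add_to_output=True)
        return messages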


class ActionReselectModule(Action):
def name(self) -> str:
return "action_reselect_module"
@@ -110,29 +110,28 @@ def run(self, dispatcher, tracker, domain):
try:
my_dataframe_slot = tracker.get_slot('my_dataframe_slot')
df_rslt = pd.read_json(my_dataframe_slot, orient='split')

try:
module_number = tracker.get_slot('module_id')
situation_ids,situation_names=situation_recommendations(df_rslt,int(module_number),n=n_situation)
if situation_ids==[]:
                    dispatcher.utter_message("لا يوجد السياق متاح في هذه الوحدة")  # "No situation is available in this module"
else:

                        button_list = [{"title": situation_names[i], "payload": f'/inform_situation{{"situation_id":"{str(situation_ids[i])}"}}' } for i in range(len(situation_ids))]
                        button_list.append({"title": "انقر هنا لإعادة إختيار الوحدة", "payload": '/rechoisir_module'})  # "Click here to re-select the module"
                        dispatcher.utter_message(text="اختر السياق الأقرب إلى سؤالك", buttons=button_list)  # "Choose the situation closest to your question"
return []
except Exception as e:
print(e)
                dispatcher.utter_message("!! الرجاء المحاولة مرة أخرى")  # "Please try again!"
return []
except Exception as e:
print(e)
            dispatcher.utter_message(" !! خلل في تحميل البيانات")  # "Error while loading the data!"
return []
try:
module_number = tracker.get_slot('module_id')
print("module_number",module_number)
situation_ids,situation_names=situation_recommendations(df_rslt,int(module_number),n=n_situation)

except Exception as e:
print(e)
dispatcher.utter_message("!! الرجاء المحاولة مرة أخرى")
return []


if situation_ids==[]:
dispatcher.utter_message("لا يوجد السياق متاح في هذه الوحدة")
else:

button_list = [{"title": situation_names[i], "payload": f'/inform_situation{{"situation_id":"{str(situation_ids[i])}"}}' } for i in range(len(situation_ids))]
button_list.append({"title": "انقر هنا لإعادة إختيار الوحدة", "payload": '/rechoisir_module'})
dispatcher.utter_message(text= "اختر السياق الأقرب إلى سؤالك",buttons=button_list)
return []

class ActionGetsituationId(Action):
def name(self):
@@ -167,7 +166,6 @@ def run(self, dispatcher, tracker, domain):
return []
try:
situation_number = tracker.get_slot('situation_id')
print("situation_number",situation_number)
question_ids,question_names,reste,reste_question=question_recommendations(df_rslt,int(situation_number),n=n_question)

except Exception as e:
@@ -207,7 +205,6 @@ def run(self, dispatcher, tracker, domain):
# Access the ID from the slot
try:
question_number = tracker.get_slot('question_id')
print("question_number",question_number)
response=get_responses(int(question_number))

except Exception as e:
77 changes: 14 additions & 63 deletions actions/functions.py
@@ -1,13 +1,11 @@
import pickle
import json, os, math
from sentence_transformers import SentenceTransformer
from sentence_transformers.util import pytorch_cos_sim
import numpy as np
import pandas as pd
print("############################# IMPORTING UTILS")
BERTmodel_names=['paraphrase-multilingual-MiniLM-L12-v2','medmediani/Arabic-KW-Mdel','Ezzaldin-97/STS-Arabert','distiluse-base-multilingual-cased-v1','sentence-transformers/LaBSE']

# data_path="mydata/Base_RGPH24V6+_all_ID"
# data_path="mydata/Base_RGPH24V7_all_ID"
data_path="/app/actions/mydata/Base_RGPH24V6+_all_ID"

print('################################## actions run using', data_path )
@@ -38,14 +36,8 @@
with open(dict_path, 'w', encoding='utf-8') as file:
json.dump(unique_values_dict, file, ensure_ascii=False, indent=2)


BERTmodel_name=BERTmodel_names[0]
situations_list = list(unique_values_dict.keys())
# BERTmodel_name=BERTmodel_names[-1]
BERT_model=SentenceTransformer(BERTmodel_name )
pkl_path=data_path+BERTmodel_name.split('/')[0]+'situations_embeddings.pkl'

print('################# using MODEL:', BERTmodel_name)
pkl_path=data_path+'rasa_situations_embeddings.pkl'
print('################# using embeddings MODEL:', pkl_path)
### initialize weights

##new
@@ -54,17 +46,15 @@
# Load sentences & embeddings from disk
with open(pkl_path, "rb") as fIn:
stored_data = pickle.load(fIn)
situations = stored_data['situations']
    situations_list = list(stored_data['situations'].keys())
BERT_weights = stored_data['BERT_weights']
print("BERT model found")
corpus_size=len(BERT_weights)
BERT_weights = BERT_weights.reshape((corpus_size, 768))

print("Embeddings found")
else:
# Encode using BERT model and save to disk
BERT_weights = BERT_model.encode(situations_list, convert_to_tensor=True, show_progress_bar=False)
print("BERT model fine-tuned")
with open(pkl_path, "wb") as fOut:
pickle.dump({'situations': unique_values_dict, 'BERT_weights': BERT_weights},
fOut, protocol=pickle.HIGHEST_PROTOCOL)
print("BERT model saved")
print("Embeddings not found")

print("#################################First load script ended #####################")
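
The load block above now reads precomputed vectors from `rasa_situations_embeddings.pkl` instead of encoding `situations_list` at startup, which is what the commit name, update_embeddings_source, points to. The pickle layout is inferred from the loading code and the old save branch: a dict with a `situations` mapping and a `BERT_weights` array reshaped to `(corpus_size, 768)`. A minimal offline sketch that would produce such a file follows; the model choice is an assumption, picked only because it emits 768-dimensional vectors (the default `paraphrase-multilingual-MiniLM-L12-v2` emits 384, which would not match the reshape). `unique_values_dict` and `data_path` are the module-level names defined earlier in this file.

import pickle

from sentence_transformers import SentenceTransformer

model = SentenceTransformer('sentence-transformers/LaBSE')  # 768-dim output (assumption)

# unique_values_dict maps each situation string to its metadata; the keys are
# the texts to embed, mirroring situations_list in the loading code above.
situations_list = list(unique_values_dict.keys())
BERT_weights = model.encode(situations_list, show_progress_bar=True)

with open(data_path + 'rasa_situations_embeddings.pkl', 'wb') as fOut:
    pickle.dump({'situations': unique_values_dict, 'BERT_weights': BERT_weights},
                fOut, protocol=pickle.HIGHEST_PROTOCOL)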

# Function to load data from the file
@@ -74,8 +64,7 @@ def load_data_from_file(file_path):
return data['definitions_dict'], data['module_titles'],data['choose_qst_variations'],data['definitions_dict_old'],data["other_qst_variations"]

## new
def provide_recommendations(user_input,THRESH, n, unique_values_dict,BERT_weights,n_module=n_module):
input_weight=BERT_model.encode(user_input, show_progress_bar = True,convert_to_tensor=True)
def provide_recommendations(user_input,input_weight,THRESH, n, unique_values_dict,BERT_weights,n_module=n_module):
cosine_scores = pytorch_cos_sim(input_weight, BERT_weights)
cosine_scores = cosine_scores.cpu().numpy()
# Assuming cosine_scores is a 2D numpy array
@@ -103,9 +92,7 @@ def provide_recommendations(user_input,THRESH, n, unique_values_dict,BERT_weights,n_module=n_module):
df_rec=df_rec[df_rec.module_ID.isin(module_ids)]
df_resp=add_resp_ids(df,df_rec)
df_with_qst=add_qst_ids(df,df_resp)
# df_with_qst.to_excel(path,index=False)
# pd.DataFrame(ordred_situations_IDs[:n]).to_excel(path,index=False)##TODO : change to json , and try to use trackers
# return(pd.DataFrame(ordred_situations_IDs[:n]))
# df_with_qst.to_excel('records/rslt'+user_input+".xlsx",index=False)
return(df_with_qst)
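
With the encoder gone from the action server, `provide_recommendations` is now a pure scoring function: it takes the precomputed `input_weight`, compares it against `BERT_weights` with `pytorch_cos_sim`, and keeps matches above `THRESH`. A condensed sketch of that scoring core, under the shapes implied above (one query vector against a corpus_size × dim matrix; the helper name is illustrative):

import numpy as np
from sentence_transformers.util import pytorch_cos_sim

def rank_situations(input_weight, BERT_weights, situations_list, thresh=0.3):
    # pytorch_cos_sim returns a (1, corpus_size) tensor of similarities.
    scores = pytorch_cos_sim(input_weight, BERT_weights).cpu().numpy()[0]
    order = np.argsort(-scores)                       # best match first
    keep = [i for i in order if scores[i] >= thresh]  # apply the THRESH cutoff
    return [(situations_list[i], float(scores[i])) for i in keep]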

##new
@@ -139,48 +126,16 @@ def add_resp_ids(df, df_rslt):
df_rslt['ralated_responses']=ids
return df_rslt

##new !!!!!!!!!!!!! not working
# def add_resp_ids(df, df_rslt):
# # Pre-filter DataFrames or Series for each category
# question_ids = df.set_index('Question_ID')['response_ID']
# tags_ids = df.set_index('Tags_ID')['response_ID']
# tags_sit_ids = df.set_index('tags_sit_ID')['response_ID']
# situation_ids = df.set_index('Situation_ID')['response_ID']
# # section_ids = df.set_index('Section_ID')['response_ID']
# # Function to get related response IDs based on category
# def get_related_responses(row):
# if row['category'] == 'question':
# return question_ids.get(row["element_ID"], []).unique()
# if row['category'] == 'tags':
# return tags_ids.get(row["element_ID"], []).unique()
# if row['category'] == 'situation Tags':
# return tags_sit_ids.get(row["element_ID"], []).unique()
# if row['category'] == 'situation':
# return situation_ids.get(row["element_ID"], []).unique()
# # if row['category'] == 'section':
# # return section_ids.get(row["element_ID"], []).unique()
# return []
# # Apply the function to each row of df_rslt
# df_rslt['ralated_responses'] = df_rslt.apply(get_related_responses, axis=1)
# return df_rslt

def module_recommendations(df_rslt,n=n_module):
module_ids = df_rslt['module_ID'].unique()[:n].tolist()
module_names=[df[df.Module_ID==module_id].module.unique().tolist()[0] for module_id in module_ids]
    # df[df.Module_ID.isin(module_ids)].module.unique().tolist()  # no-op: result was unused
return module_ids,module_names

##new
# def module_recommendations(df_rslt, n=n_module):
# # Get the first n unique module IDs
# module_ids = df_rslt['module_ID'].dropna().unique()[:n]
# # Retrieve module names for these IDs
# module_names = df[df.Module_ID.isin(module_ids)]['module'].drop_duplicates().tolist()
# return module_ids.tolist(), module_names


##new
def situation_recommendations(df_rslt, module_id, n=n_situation, nan_id=5):
def situation_recommendations(df_rslt, module_id, n=n_situation, nan_id=4):
# Filter the DataFrame and get unique situation IDs, excluding nan_id
filtered_situation_ids = df_rslt[(df_rslt['module_ID'] == int(module_id)) & (df_rslt['situation_ID'] != nan_id)]['situation_ID'].unique()
# Limit the number of situation IDs to n
@@ -221,8 +176,4 @@ def question_recommendations(df_rslt_with_qst, situation_ID, n=n_question):
def get_responses(question_id):
response=df[df.Question_ID==question_id]['Réponse Quasi-finale'].unique().tolist()
return(response)
# ##new
# def get_responses(question_id):
# # Use loc for efficient row selection and drop_duplicates for unique responses
# response = df.loc[df.Question_ID == question_id, 'Réponse Quasi-finale'].drop_duplicates().tolist()
# return response
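
Taken together, the helpers in this file implement the drill-down that the actions in actions.py walk through: modules, then situations, then questions, then the final response. A usage sketch, assuming `df_recommendations` was produced by `provide_recommendations` and the user picks the first button at each step:

module_ids, module_names = module_recommendations(df_recommendations, n=n_module)
situation_ids, situation_names = situation_recommendations(
    df_recommendations, module_ids[0], n=n_situation)
question_ids, question_names, reste, reste_question = question_recommendations(
    df_recommendations, situation_ids[0], n=n_question)
responses = get_responses(question_ids[0])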
