From 5bafd145640d1274b774d3b32df282d6ddb2f79f Mon Sep 17 00:00:00 2001 From: lineUCB <106828511+lineUCB@users.noreply.github.com> Date: Thu, 25 Jul 2024 12:51:11 -0700 Subject: [PATCH] Update embedding_create.py --- .../embedding_create.py | 75 +++++++++---------- 1 file changed, 34 insertions(+), 41 deletions(-) diff --git a/rag/file_conversion_router/embedding_create.py b/rag/file_conversion_router/embedding_create.py index e8093138..9550a6b5 100644 --- a/rag/file_conversion_router/embedding_create.py +++ b/rag/file_conversion_router/embedding_create.py @@ -23,6 +23,37 @@ def string_subtraction(main_string, sub_string): return main_string.replace(sub_string, '', 1) # The '1' ensures only the first occurrence is removed +def traverse_files(path, start_folder_name, url_list, id_list, doc_list): + results = [] + # Check if the provided path exists + if not os.path.exists(path): + raise ValueError(f"The provided path '{path}' does not exist.") + folder_tree = f"{start_folder_name} (h1)\n" + for root, dir, files in os.walk(path): + for file in files: + if file.endswith('.pkl'): + path_list = [start_folder_name] + string_subtraction(root, path).split('/')[1:] + line = ((len(path_list) - 1) * "--" + path_list[-1] + f" (L{len(path_list)})") + folder_tree += f"{line}\n" + + for root, dir, files in os.walk(path): + for file in files: + if file.endswith('.pkl'): + # file path + file_path = os.path.join(root, file) + path_list = [start_folder_name] + string_subtraction(root, path).split('/')[1:] + with open(file_path, 'rb') as pkl_file: + print(file_path) + chunks = pickle.load(pkl_file) + for chunk in chunks: + folder_path = ' > '.join(f"{item} (Level{i + 1})" for i, item in enumerate(path_list)) + page_path = chunk.titles + id = folder_path + ' > ' + page_path + id_list.append(id) + doc_list.append(chunk.content) + print(chunk.chunk_url) + url = "URLs:\n" + "\n".join(chunk.chunk_url) + url_list.append(url) ''' Traverse through files ''' @@ -55,44 +86,9 @@ def string_subtraction(main_string, sub_string): def embedding_create(markdown_path,name, embedding_name, folder_name, model): - def string_subtraction(main_string, sub_string): - return main_string.replace(sub_string, '', 1) # The '1' ensures only the first occurrence is removed - ''' Traverse through files ''' - - def traverse_files(path, start_folder_name): - results = [] - # Check if the provided path exists - if not os.path.exists(path): - raise ValueError(f"The provided path '{path}' does not exist.") - folder_tree = f"{start_folder_name} (h1)\n" - for root, dir, files in os.walk(path): - for file in files: - if file.endswith('.pkl'): - path_list = [start_folder_name] + string_subtraction(root, path).split('/')[1:] - line = ((len(path_list) - 1) * "--" + path_list[-1] + f" (L{len(path_list)})") - folder_tree += f"{line}\n" - - for root, dir, files in os.walk(path): - for file in files: - if file.endswith('.pkl'): - # file path - file_path = os.path.join(root, file) - path_list = [start_folder_name] + string_subtraction(root, path).split('/')[1:] - with open(file_path, 'rb') as pkl_file: - print(file_path) - chunks = pickle.load(pkl_file) - for chunk in chunks: - folder_path = ' > '.join(f"{item} (Level{i + 1})" for i, item in enumerate(path_list)) - page_path = chunk.titles - id = folder_path + ' > ' + page_path - id_list.append(id) - doc_list.append(chunk.content) - print(chunk.chunk_url) - url = "URLs:\n" + "\n".join(chunk.chunk_url) - url_list.append(url) id_list = [] doc_list = [] embedding_list = [] @@ -102,9 +98,9 @@ def traverse_files(path, start_folder_name): start = time.time() # Process each page # TODO PROCESS DOCUMENTS - docs = traverse_files(markdown_path,name) + docs = traverse_files(markdown_path, name, url_list, id_list, doc_list) - if model=='local' or model=='zephyr': + if model == 'local' or model == 'zephyr': openai.api_key = "empty" openai.api_base = "http://localhost:8000/v1" @@ -205,7 +201,6 @@ def last_token_pool(last_hidden_states: Tensor, print(f"Embedding error: {e}") fail.append(id_list[i]) # count += 1 - # # id_list.extend(ids) # embedding_list.extend(embedding) id_list=np.array(id_list) @@ -223,8 +218,6 @@ def last_token_pool(last_hidden_states: Tensor, 'url_list': url_list, 'time_list': time_list } - - # Create the folder if it does not exist if not os.path.exists(folder_name): os.makedirs(folder_name) @@ -235,7 +228,7 @@ def last_token_pool(last_hidden_states: Tensor, pickle.dump(data_to_store, f) for i in fail: - print("Failed Embeddings: ",i) + print("Failed Embeddings: ", i) if __name__ == "__main__":