From 2d59886f9eec0f7601a5aa0002d342bc90f3e25d Mon Sep 17 00:00:00 2001
From: Ozan Gokdemir
Date: Mon, 26 Feb 2024 18:46:30 +0000
Subject: [PATCH 1/4] Initial running code to be refined.

---
 CITATION.cff                  |   2 +-
 ragamp/process_json_result.py |  13 +++
 ragamp/pubmed_rag.py          | 148 ++++++++++++++++++++++++++++++++++
 3 files changed, 162 insertions(+), 1 deletion(-)
 create mode 100644 ragamp/process_json_result.py
 create mode 100644 ragamp/pubmed_rag.py

diff --git a/CITATION.cff b/CITATION.cff
index d67648c..b2f6422 100644
--- a/CITATION.cff
+++ b/CITATION.cff
@@ -6,5 +6,5 @@ authors:
     orcid: https://orcid.org/0000-0001-5299-1983
 license: MIT
 repository-code: https://github.com/ogkdmr/ragamp
-title: RagAmp
+title: RagAmpq
 url: https://ogkdmr.github.io/ragamp/
diff --git a/ragamp/process_json_result.py b/ragamp/process_json_result.py
new file mode 100644
index 0000000..7a6ee78
--- /dev/null
+++ b/ragamp/process_json_result.py
@@ -0,0 +1,13 @@
+"""Initial code for reading the content of the json formatted RAG response."""
+
+from __future__ import annotations
+
+import json
+
+with open('data/query_responses_strains.json') as f:
+    q2r = json.load(f)
+    for k, v in q2r.items():
+        print('Query AMP: ', k)
+        print()
+        print(' Response: ', v)
+        print()
diff --git a/ragamp/pubmed_rag.py b/ragamp/pubmed_rag.py
new file mode 100644
index 0000000..7b43de7
--- /dev/null
+++ b/ragamp/pubmed_rag.py
@@ -0,0 +1,148 @@
+"""Code for Building and querying a RAG vector store index using an LLM.
+
+This module contains code for querying a vector store index using a
+language model and generating responses. It uses the HuggingFace library for
+language model and tokenizer, and the llama_index library for vector store
+index operations. The module also includes code for creating and loading the
+index from storage, as well as saving the query responses to a JSON file.
+"""
+
+from __future__ import annotations
+
+import json
+import logging
+import os
+import os.path as osp
+import sys
+
+import torch
+from langchain.embeddings.huggingface import HuggingFaceBgeEmbeddings
+from llama_index import load_index_from_storage
+from llama_index import ServiceContext
+from llama_index import set_global_service_context
+from llama_index import SimpleDirectoryReader
+from llama_index import StorageContext
+from llama_index import VectorStoreIndex
+from llama_index.llms import HuggingFaceLLM
+from llama_index.prompts import PromptTemplate
+from tqdm import tqdm
+from transformers import BitsAndBytesConfig
+
+os.environ['HF_HOME'] = '/lambda_stor/data/ogokdemir/transformers_cache'
+logging.basicConfig(stream=sys.stdout, level=logging.INFO)
+logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))
+
+
+quantization_config = BitsAndBytesConfig(
+    load_in_4bit=True,
+    bnb_4bit_compute_dtype=torch.float16,
+    bnb_4bit_quant_type='nf4',
+    bnb_4bit_use_double_quant=True,
+)
+
+llm = HuggingFaceLLM(
+    model_name='mistralai/Mistral-7B-Instruct-v0.1',
+    tokenizer_name='mistralai/Mistral-7B-Instruct-v0.1',
+    query_wrapper_prompt=PromptTemplate(
+        '[INST] {query_str} [/INST] \n',
+    ),
+    context_window=3900,
+    max_new_tokens=256,
+    model_kwargs={'quantization_config': quantization_config},
+    # tokenizer_kwargs={},
+    generate_kwargs={
+        'temperature': 0.2,
+        'top_k': 5,
+        'top_p': 0.95,
+        'do_sample': True,
+    },
+    device_map='auto',
+)
+
+embed_model = HuggingFaceBgeEmbeddings(
+    model_name='dmis-lab/biobert-base-cased-v1.1',
+)
+
+service_context = ServiceContext.from_defaults(
+    llm=llm,
+    embed_model=embed_model,
+)
+
+set_global_service_context(service_context)
+
+PERSIST_DIR = 'data/vectorstores'
+
+if not osp.exists(PERSIST_DIR):
+    logging.info('Creating index from scratch')
+    documents = SimpleDirectoryReader('data/pmc').load_data()
+    index = VectorStoreIndex.from_documents(documents, show_progress=True)
+    index.storage_context.persist(PERSIST_DIR)
+else:
+    logging.info('Loading index from storage')
+    storage_context = StorageContext.from_defaults(persist_dir=PERSIST_DIR)
+    index = load_index_from_storage(storage_context)
+
+query_engine = index.as_query_engine()
+logging.info('Query engine ready, running inference')
+
+amps = [
+    'Amoebapore A',
+    'BACTENECIN 5',
+    'CCL20',
+    'DEFB118',
+    'Drosomycin',
+    'Eotaxin2',
+    'Gm cecropin A',
+    'Human alphasynuclein',
+    'Human granulysin',
+    'Microcin B',
+    'Microcin S',
+    'NLP31',
+    'Amoebapore B',
+    'BACTENECIN 7',
+    'CXCL2',
+    'DEFB24',
+    'Drosomycin2',
+    'Eotaxin3',
+    'Gm cecropin B',
+    'Human beta defensin 2',
+    'Human histatin 9',
+    'Microcin C7',
+    'Microcin V',
+    'Peptide 2',
+    'Amoebapore C',
+    'CAP18',
+    'CXCL3',
+    'Defensin 1',
+    'Drosophila cecropin B',
+    'EP2',
+    'Gm cecropin C',
+    'Human beta defensin 3',
+    'Human TC2',
+    'Microcin L',
+    'NLP27',
+    'Peptide 5',
+    'Bactenecin',
+    'Cathepsin G',
+    'CXCL6',
+    'Dermcidin',
+    'Elafin',
+    'FGG',
+    'Gm defensinlike peptide',
+    'Human beta defensin 4',
+    'LL23',
+    'Microcin M',
+    'NLP29',
+]
+
+q2r = {}
+for amp in tqdm(amps, desc='Querying', total=len(amps)):
+    query = f'What bacterial strains does {amp} act on?.'
+    response = query_engine.query(query)
+    q2r[amp] = str(response)
+
+with open('data/query_responses_strains.json', 'w') as f:
+    json.dump(q2r, f)
+
+# TODO: Find a way to customize the number of documents returned by
+# the query engine. Currently, it returns 10 documents by default.
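
Note on the TODO that closes this patch: llama_index exposes the retrieval
fan-out through the similarity_top_k keyword, so a minimal sketch of the fix
(assuming the index object built in this patch, and a llama_index version
that accepts this keyword) is:

    # Retrieve only the 3 highest-scoring chunks per query
    # instead of the engine's default.
    query_engine = index.as_query_engine(similarity_top_k=3)

    # Equivalent two-step form, useful when the raw retrieved
    # chunks are needed as well.
    retriever = index.as_retriever(similarity_top_k=3)
    nodes = retriever.retrieve('What bacterial strains does Dermcidin act on?')
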
From e37ae6d8f1483bd11253564473a41bc253a3e3b0 Mon Sep 17 00:00:00 2001
From: Ozan Gokdemir
Date: Tue, 27 Feb 2024 20:53:53 +0000
Subject: [PATCH 2/4] Updates for the newer version of llama index.

---
 pyproject.toml       |  8 +++++++-
 ragamp/pubmed_rag.py | 16 ++++++++--------
 2 files changed, 15 insertions(+), 9 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index b4057cc..af09f35 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -19,8 +19,14 @@ classifiers = [
     "Programming Language :: Python :: 3 :: Only",
     "Programming Language :: Python :: Implementation :: CPython",
 ]
+
 dependencies = [
-    "requests",
+    "requests >= 2.31.0",
+    "transformers >= 4.37.1",
+    "tokenizers >= 0.15.1",
+    "llama-index >= 0.9.36",
+    "langchain >= 0.1.3",
+    "llama-index-llms-huggingface"
 ]

 [project.urls]
diff --git a/ragamp/pubmed_rag.py b/ragamp/pubmed_rag.py
index 7b43de7..647e395 100644
--- a/ragamp/pubmed_rag.py
+++ b/ragamp/pubmed_rag.py
@@ -17,14 +17,14 @@
 import torch
 from langchain.embeddings.huggingface import HuggingFaceBgeEmbeddings
-from llama_index import load_index_from_storage
-from llama_index import ServiceContext
-from llama_index import set_global_service_context
-from llama_index import SimpleDirectoryReader
-from llama_index import StorageContext
-from llama_index import VectorStoreIndex
-from llama_index.llms import HuggingFaceLLM
-from llama_index.prompts import PromptTemplate
+from llama_index.core import set_global_service_context
+from llama_index.core import SimpleDirectoryReader
+from llama_index.core import VectorStoreIndex
+from llama_index.core.indices.loading import load_index_from_storage
+from llama_index.core.indices.service_context import ServiceContext
+from llama_index.core.prompts.base import PromptTemplate
+from llama_index.core.storage.storage_context import StorageContext
+from llama_index.llms.huggingface import HuggingFaceLLM
 from tqdm import tqdm
 from transformers import BitsAndBytesConfig

From c3a2e4fece2aadefe873bf572bf587ab18e5cec6 Mon Sep 17 00:00:00 2001
From: Ozan Gokdemir
Date: Fri, 1 Mar 2024 06:00:50 +0000
Subject: [PATCH 3/4] Now builds the index from the documents.
---
 examples/antimicrobial_peptides.txt | 47 ++++++++++++++++
 pyproject.toml                      |  6 +-
 ragamp/pubmed_rag.py                | 87 +++++++----------------------
 3 files changed, 72 insertions(+), 68 deletions(-)
 create mode 100644 examples/antimicrobial_peptides.txt

diff --git a/examples/antimicrobial_peptides.txt b/examples/antimicrobial_peptides.txt
new file mode 100644
index 0000000..bba6e77
--- /dev/null
+++ b/examples/antimicrobial_peptides.txt
@@ -0,0 +1,47 @@
+Amoebapore A
+BACTENECIN 5
+CCL20
+DEFB118
+Drosomycin
+Eotaxin2
+Gm cecropin A
+Human alphasynuclein
+Human granulysin
+Microcin B
+Microcin S
+NLP31
+Amoebapore B
+BACTENECIN 7
+CXCL2
+DEFB24
+Drosomycin2
+Eotaxin3
+Gm cecropin B
+Human beta defensin 2
+Human histatin 9
+Microcin C7
+Microcin V
+Peptide 2
+Amoebapore C
+CAP18
+CXCL3
+Defensin 1
+Drosophila cecropin B
+EP2
+Gm cecropin C
+Human beta defensin 3
+Human TC2
+Microcin L
+NLP27
+Peptide 5
+Bactenecin
+Cathepsin G
+CXCL6
+Dermcidin
+Elafin
+FGG
+Gm defensinlike peptide
+Human beta defensin 4
+LL23
+Microcin M
+NLP29
diff --git a/pyproject.toml b/pyproject.toml
index af09f35..0fe81c3 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -26,7 +26,11 @@ dependencies = [
     "tokenizers >= 0.15.1",
     "llama-index >= 0.9.36",
     "langchain >= 0.1.3",
-    "llama-index-llms-huggingface"
+    "llama-index-llms-huggingface",
+    "llama-index-readers-file",
+    "bitsandbytes >= 0.42.0",
+    "sentence_transformers >= 2.2.2",
+    "llama-index-embeddings-langchain"
 ]

 [project.urls]
diff --git a/ragamp/pubmed_rag.py b/ragamp/pubmed_rag.py
index 647e395..39033e4 100644
--- a/ragamp/pubmed_rag.py
+++ b/ragamp/pubmed_rag.py
@@ -17,18 +17,17 @@
 import torch
 from langchain.embeddings.huggingface import HuggingFaceBgeEmbeddings
-from llama_index.core import set_global_service_context
 from llama_index.core import SimpleDirectoryReader
 from llama_index.core import VectorStoreIndex
 from llama_index.core.indices.loading import load_index_from_storage
-from llama_index.core.indices.service_context import ServiceContext
 from llama_index.core.prompts.base import PromptTemplate
 from llama_index.core.storage.storage_context import StorageContext
 from llama_index.llms.huggingface import HuggingFaceLLM
 from tqdm import tqdm
-from transformers import BitsAndBytesConfig

-os.environ['HF_HOME'] = '/lambda_stor/data/ogokdemir/transformers_cache'
+os.environ['HF_HOME'] = '/lus/eagle/projects/LUCID/ogokdemir/hf_cache'
+from transformers import BitsAndBytesConfig  # noqa
+
 logging.basicConfig(stream=sys.stdout, level=logging.INFO)
 logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))

@@ -59,81 +58,35 @@
     device_map='auto',
 )

-embed_model = HuggingFaceBgeEmbeddings(
-    model_name='dmis-lab/biobert-base-cased-v1.1',
-)
+# TODO: pritamdeka/S-PubMedBert-MS-MARCO, look into this encoder alternative.
-
-service_context = ServiceContext.from_defaults(
-    llm=llm,
-    embed_model=embed_model,
+encoder = HuggingFaceBgeEmbeddings(
+    model_name='pritamdeka/S-PubMedBert-MS-MARCO',
 )

-set_global_service_context(service_context)
-
-PERSIST_DIR = 'data/vectorstores'
+PERSIST_DIR = '/lus/eagle/projects/LUCID/ogokdemir/ragamp/indexes/amp_index/'
+AMP_PAPERS_DIR = '/lus/eagle/projects/candle_aesp/ogokdemir/pdfwf_runs/AmpParsedDocs/md_outs/'  # noqa
+QUERY_AMPS_DIR = '/home/ogokdemir/ragamp/examples/antimicrobial_peptides.txt'

 if not osp.exists(PERSIST_DIR):
     logging.info('Creating index from scratch')
-    documents = SimpleDirectoryReader('data/pmc').load_data()
-    index = VectorStoreIndex.from_documents(documents, show_progress=True)
+    documents = SimpleDirectoryReader(AMP_PAPERS_DIR).load_data()
+    index = VectorStoreIndex.from_documents(
+        documents,
+        embed_model=encoder,
+        show_progress=True,
+    )
     index.storage_context.persist(PERSIST_DIR)
 else:
     logging.info('Loading index from storage')
     storage_context = StorageContext.from_defaults(persist_dir=PERSIST_DIR)
     index = load_index_from_storage(storage_context)

-query_engine = index.as_query_engine()
+query_engine = index.as_query_engine(llm=llm)
 logging.info('Query engine ready, running inference')

-amps = [
-    'Amoebapore A',
-    'BACTENECIN 5',
-    'CCL20',
-    'DEFB118',
-    'Drosomycin',
-    'Eotaxin2',
-    'Gm cecropin A',
-    'Human alphasynuclein',
-    'Human granulysin',
-    'Microcin B',
-    'Microcin S',
-    'NLP31',
-    'Amoebapore B',
-    'BACTENECIN 7',
-    'CXCL2',
-    'DEFB24',
-    'Drosomycin2',
-    'Eotaxin3',
-    'Gm cecropin B',
-    'Human beta defensin 2',
-    'Human histatin 9',
-    'Microcin C7',
-    'Microcin V',
-    'Peptide 2',
-    'Amoebapore C',
-    'CAP18',
-    'CXCL3',
-    'Defensin 1',
-    'Drosophila cecropin B',
-    'EP2',
-    'Gm cecropin C',
-    'Human beta defensin 3',
-    'Human TC2',
-    'Microcin L',
-    'NLP27',
-    'Peptide 5',
-    'Bactenecin',
-    'Cathepsin G',
-    'CXCL6',
-    'Dermcidin',
-    'Elafin',
-    'FGG',
-    'Gm defensinlike peptide',
-    'Human beta defensin 4',
-    'LL23',
-    'Microcin M',
-    'NLP29',
-]
+with open(QUERY_AMPS_DIR) as f:
+    amps = f.read().splitlines()

 q2r = {}
 for amp in tqdm(amps, desc='Querying', total=len(amps)):
@@ -144,5 +97,5 @@
 with open('data/query_responses_strains.json', 'w') as f:
     json.dump(q2r, f)

-# TODO: Find a way to customize the number of documents returned by
-# the query engine. Currently, it returns 10 documents by default.
+# TODO: Move dataloading and encoding to functions and parallelize them.
+# TODO: Once that is done, build the index directly from the embeddings.

From a450129f51e7e9261e194314cd93ad098448acbd Mon Sep 17 00:00:00 2001
From: Ozan Gokdemir
Date: Wed, 6 Mar 2024 06:53:36 +0000
Subject: [PATCH 4/4] Working draft of the RAG code. Reads, encodes, queries.
--- ragamp/pubmed_rag.py | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/ragamp/pubmed_rag.py b/ragamp/pubmed_rag.py index 39033e4..6e9b675 100644 --- a/ragamp/pubmed_rag.py +++ b/ragamp/pubmed_rag.py @@ -25,8 +25,8 @@ from llama_index.llms.huggingface import HuggingFaceLLM from tqdm import tqdm -os.environ['HF_HOME'] = '/lus/eagle/projects/LUCID/ogokdemir/hf_cache' -from transformers import BitsAndBytesConfig # noqa +# os.environ["HF_HOME"] = "/lus/eagle/projects/LUCID/ogokdemir/hf_cache" +from transformers import BitsAndBytesConfig logging.basicConfig(stream=sys.stdout, level=logging.INFO) logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout)) @@ -58,8 +58,6 @@ device_map='auto', ) -# TODO: pritamdeka/S-PubMedBert-MS-MARCO, look into this encoder alternative. - encoder = HuggingFaceBgeEmbeddings( model_name='pritamdeka/S-PubMedBert-MS-MARCO', ) @@ -67,6 +65,7 @@ PERSIST_DIR = '/lus/eagle/projects/LUCID/ogokdemir/ragamp/indexes/amp_index/' AMP_PAPERS_DIR = '/lus/eagle/projects/candle_aesp/ogokdemir/pdfwf_runs/AmpParsedDocs/md_outs/' # noqa QUERY_AMPS_DIR = '/home/ogokdemir/ragamp/examples/antimicrobial_peptides.txt' +OUTPUT_DIR = '/lus/eagle/projects/LUCID/ogokdemir/ragamp/outputs/amp_output/template_4.json' # noqa if not osp.exists(PERSIST_DIR): logging.info('Creating index from scratch') @@ -74,13 +73,14 @@ index = VectorStoreIndex.from_documents( documents, embed_model=encoder, + insert_batch_size=16384, show_progress=True, ) index.storage_context.persist(PERSIST_DIR) else: logging.info('Loading index from storage') storage_context = StorageContext.from_defaults(persist_dir=PERSIST_DIR) - index = load_index_from_storage(storage_context) + index = load_index_from_storage(storage_context, embed_model=encoder) query_engine = index.as_query_engine(llm=llm) logging.info('Query engine ready, running inference') @@ -90,11 +90,14 @@ q2r = {} for amp in tqdm(amps, desc='Querying', total=len(amps)): - query = f'What bacterial strains does {amp} act on?.' + query = f'What cellular processes does {amp} disrupt?' response = query_engine.query(query) q2r[amp] = str(response) -with open('data/query_responses_strains.json', 'w') as f: + +os.makedirs(osp.dirname(OUTPUT_DIR), exist_ok=True) + +with open(OUTPUT_DIR, 'w') as f: json.dump(q2r, f) # TODO: Move dataloading and encoding to functions and parallelize them.
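
Note on the two closing TODOs: one way to sketch them is to fan the file
reading out across worker processes, batch-encode the texts, and build the
index from nodes that already carry their embeddings. This is a sketch, not
the author's implementation: DOC_DIR and read_markdown are hypothetical
names, and the assumption that llama_index skips re-encoding nodes whose
embedding field is already set should be verified against the installed
version.

    from concurrent.futures import ProcessPoolExecutor
    from pathlib import Path

    from langchain.embeddings.huggingface import HuggingFaceBgeEmbeddings
    from llama_index.core import VectorStoreIndex
    from llama_index.core.schema import TextNode

    DOC_DIR = Path('/path/to/parsed_markdown')  # hypothetical input dir


    def read_markdown(path: Path) -> str:
        """Read one parsed paper; runs in a worker process."""
        return path.read_text(errors='ignore')


    if __name__ == '__main__':
        # Parallelize the I/O-heavy reading step across processes.
        paths = sorted(DOC_DIR.glob('*.md'))
        with ProcessPoolExecutor() as pool:
            texts = list(pool.map(read_markdown, paths))

        # Batch-encode with the same langchain encoder the script builds.
        encoder = HuggingFaceBgeEmbeddings(
            model_name='pritamdeka/S-PubMedBert-MS-MARCO',
        )
        vectors = encoder.embed_documents(texts)

        # Attach the precomputed vectors so ingestion does not re-encode.
        nodes = [
            TextNode(text=t, embedding=v)
            for t, v in zip(texts, vectors)
        ]
        index = VectorStoreIndex(nodes=nodes, embed_model=encoder)

Separating encoding from index construction this way also makes the encoding
step easy to shard across workers, which is the direction the TODOs point at.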