From 2d59886f9eec0f7601a5aa0002d342bc90f3e25d Mon Sep 17 00:00:00 2001
From: Ozan Gokdemir
Date: Mon, 26 Feb 2024 18:46:30 +0000
Subject: [PATCH 1/4] Initial running code to be refined.

---
 CITATION.cff                  |   2 +-
 ragamp/process_json_result.py |  13 +++
 ragamp/pubmed_rag.py          | 148 ++++++++++++++++++++++++++++++++++
 3 files changed, 162 insertions(+), 1 deletion(-)
 create mode 100644 ragamp/process_json_result.py
 create mode 100644 ragamp/pubmed_rag.py

diff --git a/CITATION.cff b/CITATION.cff
index d67648c..b2f6422 100644
--- a/CITATION.cff
+++ b/CITATION.cff
@@ -6,5 +6,5 @@ authors:
     orcid: https://orcid.org/0000-0001-5299-1983
 license: MIT
 repository-code: https://github.com/ogkdmr/ragamp
-title: RagAmp
+title: RagAmpq
 url: https://ogkdmr.github.io/ragamp/
diff --git a/ragamp/process_json_result.py b/ragamp/process_json_result.py
new file mode 100644
index 0000000..7a6ee78
--- /dev/null
+++ b/ragamp/process_json_result.py
@@ -0,0 +1,13 @@
+"""Initial code for reading the content of the json formatted RAG response."""
+
+from __future__ import annotations
+
+import json
+
+with open('data/query_responses_strains.json') as f:
+    q2r = json.load(f)
+    for k, v in q2r.items():
+        print('Query AMP: ', k)
+        print()
+        print(' Response: ', v)
+        print()
diff --git a/ragamp/pubmed_rag.py b/ragamp/pubmed_rag.py
new file mode 100644
index 0000000..7b43de7
--- /dev/null
+++ b/ragamp/pubmed_rag.py
@@ -0,0 +1,148 @@
+"""Code for Building and querying a RAG vector store index using an LLM.
+
+This module contains code for querying a vector store index using a
+language model and generating responses. It uses the HuggingFace library for
+language model and tokenizer, and the llama_index library for vector store
+index operations. The module also includes code for creating and loading the
+index from storage, as well as saving the query responses to a JSON file.
+"""
+
+from __future__ import annotations
+
+import json
+import logging
+import os
+import os.path as osp
+import sys
+
+import torch
+from langchain.embeddings.huggingface import HuggingFaceBgeEmbeddings
+from llama_index import load_index_from_storage
+from llama_index import ServiceContext
+from llama_index import set_global_service_context
+from llama_index import SimpleDirectoryReader
+from llama_index import StorageContext
+from llama_index import VectorStoreIndex
+from llama_index.llms import HuggingFaceLLM
+from llama_index.prompts import PromptTemplate
+from tqdm import tqdm
+from transformers import BitsAndBytesConfig
+
+os.environ['HF_HOME'] = '/lambda_stor/data/ogokdemir/transformers_cache'
+logging.basicConfig(stream=sys.stdout, level=logging.INFO)
+logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))
+
+
+quantization_config = BitsAndBytesConfig(
+    load_in_4bit=True,
+    bnb_4bit_compute_dtype=torch.float16,
+    bnb_4bit_quant_type='nf4',
+    bnb_4bit_use_double_quant=True,
+)
+
+llm = HuggingFaceLLM(
+    model_name='mistralai/Mistral-7B-Instruct-v0.1',
+    tokenizer_name='mistralai/Mistral-7B-Instruct-v0.1',
+    query_wrapper_prompt=PromptTemplate(
+        '[INST] {query_str} [/INST] \n',
+    ),
+    context_window=3900,
+    max_new_tokens=256,
+    model_kwargs={'quantization_config': quantization_config},
+    # tokenizer_kwargs={},
+    generate_kwargs={
+        'temperature': 0.2,
+        'top_k': 5,
+        'top_p': 0.95,
+        'do_sample': True,
+    },
+    device_map='auto',
+)
+
+embed_model = HuggingFaceBgeEmbeddings(
+    model_name='dmis-lab/biobert-base-cased-v1.1',
+)
+
+service_context = ServiceContext.from_defaults(
+    llm=llm,
+    embed_model=embed_model,
+)
+
+set_global_service_context(service_context)
+
+PERSIST_DIR = 'data/vectorstores'
+
+if not osp.exists(PERSIST_DIR):
+    logging.info('Creating index from scratch')
+    documents = SimpleDirectoryReader('data/pmc').load_data()
+    index = VectorStoreIndex.from_documents(documents, show_progress=True)
+    index.storage_context.persist(PERSIST_DIR)
+else:
+    logging.info('Loading index from storage')
+    storage_context = StorageContext.from_defaults(persist_dir=PERSIST_DIR)
+    index = load_index_from_storage(storage_context)
+
+query_engine = index.as_query_engine()
+logging.info('Query engine ready, running inference')
+
+amps = [
+    'Amoebapore A',
+    'BACTENECIN 5',
+    'CCL20',
+    'DEFB118',
+    'Drosomycin',
+    'Eotaxin2',
+    'Gm cecropin A',
+    'Human alphasynuclein',
+    'Human granulysin',
+    'Microcin B',
+    'Microcin S',
+    'NLP31',
+    'Amoebapore B',
+    'BACTENECIN 7',
+    'CXCL2',
+    'DEFB24',
+    'Drosomycin2',
+    'Eotaxin3',
+    'Gm cecropin B',
+    'Human beta defensin 2',
+    'Human histatin 9',
+    'Microcin C7',
+    'Microcin V',
+    'Peptide 2',
+    'Amoebapore C',
+    'CAP18',
+    'CXCL3',
+    'Defensin 1',
+    'Drosophila cecropin B',
+    'EP2',
+    'Gm cecropin C',
+    'Human beta defensin 3',
+    'Human TC2',
+    'Microcin L',
+    'NLP27',
+    'Peptide 5',
+    'Bactenecin',
+    'Cathepsin G',
+    'CXCL6',
+    'Dermcidin',
+    'Elafin',
+    'FGG',
+    'Gm defensinlike peptide',
+    'Human beta defensin 4',
+    'LL23',
+    'Microcin M',
+    'NLP29',
+]
+
+q2r = {}
+for amp in tqdm(amps, desc='Querying', total=len(amps)):
+    query = f'What bacterial strains does {amp} act on?.'
+    response = query_engine.query(query)
+    q2r[amp] = str(response)
+
+with open('data/query_responses_strains.json', 'w') as f:
+    json.dump(q2r, f)
+
+# TODO: Find a way to customize the number of documents returned by
+# the query engine. Currently, it returns 10 documents by default.
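
Note on the TODO that closes this patch: llama_index exposes the retrieval
fan-out through the similarity_top_k keyword, so a minimal sketch of the fix
(assuming the index object built in this patch, and a llama_index version
that accepts this keyword) is:

    # Retrieve only the 3 highest-scoring chunks per query
    # instead of the engine's default.
    query_engine = index.as_query_engine(similarity_top_k=3)

    # Equivalent two-step form, useful when the raw retrieved
    # chunks are needed as well.
    retriever = index.as_retriever(similarity_top_k=3)
    nodes = retriever.retrieve('What bacterial strains does Dermcidin act on?')
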
From e37ae6d8f1483bd11253564473a41bc253a3e3b0 Mon Sep 17 00:00:00 2001
From: Ozan Gokdemir
Date: Tue, 27 Feb 2024 20:53:53 +0000
Subject: [PATCH 2/4] Updates for the newer version of llama index.

---
 pyproject.toml       |  8 +++++++-
 ragamp/pubmed_rag.py | 16 ++++++++--------
 2 files changed, 15 insertions(+), 9 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index b4057cc..af09f35 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -19,8 +19,14 @@ classifiers = [
     "Programming Language :: Python :: 3 :: Only",
     "Programming Language :: Python :: Implementation :: CPython",
 ]
+
 dependencies = [
-    "requests",
+    "requests >= 2.31.0",
+    "transformers >= 4.37.1",
+    "tokenizers >= 0.15.1",
+    "llama-index >= 0.9.36",
+    "langchain >= 0.1.3",
+    "llama-index-llms-huggingface"
 ]

 [project.urls]
diff --git a/ragamp/pubmed_rag.py b/ragamp/pubmed_rag.py
index 7b43de7..647e395 100644
--- a/ragamp/pubmed_rag.py
+++ b/ragamp/pubmed_rag.py
@@ -17,14 +17,14 @@
 import torch
 from langchain.embeddings.huggingface import HuggingFaceBgeEmbeddings
-from llama_index import load_index_from_storage
-from llama_index import ServiceContext
-from llama_index import set_global_service_context
-from llama_index import SimpleDirectoryReader
-from llama_index import StorageContext
-from llama_index import VectorStoreIndex
-from llama_index.llms import HuggingFaceLLM
-from llama_index.prompts import PromptTemplate
+from llama_index.core import set_global_service_context
+from llama_index.core import SimpleDirectoryReader
+from llama_index.core import VectorStoreIndex
+from llama_index.core.indices.loading import load_index_from_storage
+from llama_index.core.indices.service_context import ServiceContext
+from llama_index.core.prompts.base import PromptTemplate
+from llama_index.core.storage.storage_context import StorageContext
+from llama_index.llms.huggingface import HuggingFaceLLM
 from tqdm import tqdm
 from transformers import BitsAndBytesConfig

From c3a2e4fece2aadefe873bf572bf587ab18e5cec6 Mon Sep 17 00:00:00 2001
From: Ozan Gokdemir
Date: Fri, 1 Mar 2024 06:00:50 +0000
Subject: [PATCH 3/4] Now builds the index from the documents.
---
 examples/antimicrobial_peptides.txt | 47 ++++++++++++++++
 pyproject.toml                      |  6 +-
 ragamp/pubmed_rag.py                | 87 +++++++----------------------
 3 files changed, 72 insertions(+), 68 deletions(-)
 create mode 100644 examples/antimicrobial_peptides.txt

diff --git a/examples/antimicrobial_peptides.txt b/examples/antimicrobial_peptides.txt
new file mode 100644
index 0000000..bba6e77
--- /dev/null
+++ b/examples/antimicrobial_peptides.txt
@@ -0,0 +1,47 @@
+Amoebapore A
+BACTENECIN 5
+CCL20
+DEFB118
+Drosomycin
+Eotaxin2
+Gm cecropin A
+Human alphasynuclein
+Human granulysin
+Microcin B
+Microcin S
+NLP31
+Amoebapore B
+BACTENECIN 7
+CXCL2
+DEFB24
+Drosomycin2
+Eotaxin3
+Gm cecropin B
+Human beta defensin 2
+Human histatin 9
+Microcin C7
+Microcin V
+Peptide 2
+Amoebapore C
+CAP18
+CXCL3
+Defensin 1
+Drosophila cecropin B
+EP2
+Gm cecropin C
+Human beta defensin 3
+Human TC2
+Microcin L
+NLP27
+Peptide 5
+Bactenecin
+Cathepsin G
+CXCL6
+Dermcidin
+Elafin
+FGG
+Gm defensinlike peptide
+Human beta defensin 4
+LL23
+Microcin M
+NLP29
diff --git a/pyproject.toml b/pyproject.toml
index af09f35..0fe81c3 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -26,7 +26,11 @@ dependencies = [
     "tokenizers >= 0.15.1",
     "llama-index >= 0.9.36",
     "langchain >= 0.1.3",
-    "llama-index-llms-huggingface"
+    "llama-index-llms-huggingface",
+    "llama-index-readers-file",
+    "bitsandbytes >= 0.42.0",
+    "sentence_transformers >= 2.2.2",
+    "llama-index-embeddings-langchain"
 ]

 [project.urls]
diff --git a/ragamp/pubmed_rag.py b/ragamp/pubmed_rag.py
index 647e395..39033e4 100644
--- a/ragamp/pubmed_rag.py
+++ b/ragamp/pubmed_rag.py
@@ -17,18 +17,17 @@
 import torch
 from langchain.embeddings.huggingface import HuggingFaceBgeEmbeddings
-from llama_index.core import set_global_service_context
 from llama_index.core import SimpleDirectoryReader
 from llama_index.core import VectorStoreIndex
 from llama_index.core.indices.loading import load_index_from_storage
-from llama_index.core.indices.service_context import ServiceContext
 from llama_index.core.prompts.base import PromptTemplate
 from llama_index.core.storage.storage_context import StorageContext
 from llama_index.llms.huggingface import HuggingFaceLLM
 from tqdm import tqdm
-from transformers import BitsAndBytesConfig

-os.environ['HF_HOME'] = '/lambda_stor/data/ogokdemir/transformers_cache'
+os.environ['HF_HOME'] = '/lus/eagle/projects/LUCID/ogokdemir/hf_cache'
+from transformers import BitsAndBytesConfig  # noqa
+
 logging.basicConfig(stream=sys.stdout, level=logging.INFO)
 logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))

@@ -59,81 +58,35 @@
     device_map='auto',
 )

-embed_model = HuggingFaceBgeEmbeddings(
-    model_name='dmis-lab/biobert-base-cased-v1.1',
-)
+# TODO: pritamdeka/S-PubMedBert-MS-MARCO, look into this encoder alternative.
-
-service_context = ServiceContext.from_defaults(
-    llm=llm,
-    embed_model=embed_model,
+encoder = HuggingFaceBgeEmbeddings(
+    model_name='pritamdeka/S-PubMedBert-MS-MARCO',
 )

-set_global_service_context(service_context)
-
-PERSIST_DIR = 'data/vectorstores'
+PERSIST_DIR = '/lus/eagle/projects/LUCID/ogokdemir/ragamp/indexes/amp_index/'
+AMP_PAPERS_DIR = '/lus/eagle/projects/candle_aesp/ogokdemir/pdfwf_runs/AmpParsedDocs/md_outs/'  # noqa
+QUERY_AMPS_DIR = '/home/ogokdemir/ragamp/examples/antimicrobial_peptides.txt'

 if not osp.exists(PERSIST_DIR):
     logging.info('Creating index from scratch')
-    documents = SimpleDirectoryReader('data/pmc').load_data()
-    index = VectorStoreIndex.from_documents(documents, show_progress=True)
+    documents = SimpleDirectoryReader(AMP_PAPERS_DIR).load_data()
+    index = VectorStoreIndex.from_documents(
+        documents,
+        embed_model=encoder,
+        show_progress=True,
+    )
     index.storage_context.persist(PERSIST_DIR)
 else:
     logging.info('Loading index from storage')
     storage_context = StorageContext.from_defaults(persist_dir=PERSIST_DIR)
     index = load_index_from_storage(storage_context)

-query_engine = index.as_query_engine()
+query_engine = index.as_query_engine(llm=llm)
 logging.info('Query engine ready, running inference')

-amps = [
-    'Amoebapore A',
-    'BACTENECIN 5',
-    'CCL20',
-    'DEFB118',
-    'Drosomycin',
-    'Eotaxin2',
-    'Gm cecropin A',
-    'Human alphasynuclein',
-    'Human granulysin',
-    'Microcin B',
-    'Microcin S',
-    'NLP31',
-    'Amoebapore B',
-    'BACTENECIN 7',
-    'CXCL2',
-    'DEFB24',
-    'Drosomycin2',
-    'Eotaxin3',
-    'Gm cecropin B',
-    'Human beta defensin 2',
-    'Human histatin 9',
-    'Microcin C7',
-    'Microcin V',
-    'Peptide 2',
-    'Amoebapore C',
-    'CAP18',
-    'CXCL3',
-    'Defensin 1',
-    'Drosophila cecropin B',
-    'EP2',
-    'Gm cecropin C',
-    'Human beta defensin 3',
-    'Human TC2',
-    'Microcin L',
-    'NLP27',
-    'Peptide 5',
-    'Bactenecin',
-    'Cathepsin G',
-    'CXCL6',
-    'Dermcidin',
-    'Elafin',
-    'FGG',
-    'Gm defensinlike peptide',
-    'Human beta defensin 4',
-    'LL23',
-    'Microcin M',
-    'NLP29',
-]
+with open(QUERY_AMPS_DIR) as f:
+    amps = f.read().splitlines()

 q2r = {}
 for amp in tqdm(amps, desc='Querying', total=len(amps)):
@@ -144,5 +97,5 @@
 with open('data/query_responses_strains.json', 'w') as f:
     json.dump(q2r, f)

-# TODO: Find a way to customize the number of documents returned by
-# the query engine. Currently, it returns 10 documents by default.
+# TODO: Move dataloading and encoding to functions and parallelize them.
+# TODO: Once that is done, build the index directly from the embeddings.

From a450129f51e7e9261e194314cd93ad098448acbd Mon Sep 17 00:00:00 2001
From: Ozan Gokdemir
Date: Wed, 6 Mar 2024 06:53:36 +0000
Subject: [PATCH 4/4] Working draft of the RAG code. Reads, encodes, queries.
--- ragamp/pubmed_rag.py | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/ragamp/pubmed_rag.py b/ragamp/pubmed_rag.py index 39033e4..6e9b675 100644 --- a/ragamp/pubmed_rag.py +++ b/ragamp/pubmed_rag.py @@ -25,8 +25,8 @@ from llama_index.llms.huggingface import HuggingFaceLLM from tqdm import tqdm -os.environ['HF_HOME'] = '/lus/eagle/projects/LUCID/ogokdemir/hf_cache' -from transformers import BitsAndBytesConfig # noqa +# os.environ["HF_HOME"] = "/lus/eagle/projects/LUCID/ogokdemir/hf_cache" +from transformers import BitsAndBytesConfig logging.basicConfig(stream=sys.stdout, level=logging.INFO) logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout)) @@ -58,8 +58,6 @@ device_map='auto', ) -# TODO: pritamdeka/S-PubMedBert-MS-MARCO, look into this encoder alternative. - encoder = HuggingFaceBgeEmbeddings( model_name='pritamdeka/S-PubMedBert-MS-MARCO', ) @@ -67,6 +65,7 @@ PERSIST_DIR = '/lus/eagle/projects/LUCID/ogokdemir/ragamp/indexes/amp_index/' AMP_PAPERS_DIR = '/lus/eagle/projects/candle_aesp/ogokdemir/pdfwf_runs/AmpParsedDocs/md_outs/' # noqa QUERY_AMPS_DIR = '/home/ogokdemir/ragamp/examples/antimicrobial_peptides.txt' +OUTPUT_DIR = '/lus/eagle/projects/LUCID/ogokdemir/ragamp/outputs/amp_output/template_4.json' # noqa if not osp.exists(PERSIST_DIR): logging.info('Creating index from scratch') @@ -74,13 +73,14 @@ index = VectorStoreIndex.from_documents( documents, embed_model=encoder, + insert_batch_size=16384, show_progress=True, ) index.storage_context.persist(PERSIST_DIR) else: logging.info('Loading index from storage') storage_context = StorageContext.from_defaults(persist_dir=PERSIST_DIR) - index = load_index_from_storage(storage_context) + index = load_index_from_storage(storage_context, embed_model=encoder) query_engine = index.as_query_engine(llm=llm) logging.info('Query engine ready, running inference') @@ -90,11 +90,14 @@ q2r = {} for amp in tqdm(amps, desc='Querying', total=len(amps)): - query = f'What bacterial strains does {amp} act on?.' + query = f'What cellular processes does {amp} disrupt?' response = query_engine.query(query) q2r[amp] = str(response) -with open('data/query_responses_strains.json', 'w') as f: + +os.makedirs(osp.dirname(OUTPUT_DIR), exist_ok=True) + +with open(OUTPUT_DIR, 'w') as f: json.dump(q2r, f) # TODO: Move dataloading and encoding to functions and parallelize them.
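
Note on the two closing TODOs: one way to sketch them is to fan the file
reading out across worker processes, batch-encode the texts, and build the
index from nodes that already carry their embeddings. This is a sketch, not
the author's implementation: DOC_DIR and read_markdown are hypothetical
names, and the assumption that llama_index skips re-encoding nodes whose
embedding field is already set should be verified against the installed
version.

    from concurrent.futures import ProcessPoolExecutor
    from pathlib import Path

    from langchain.embeddings.huggingface import HuggingFaceBgeEmbeddings
    from llama_index.core import VectorStoreIndex
    from llama_index.core.schema import TextNode

    DOC_DIR = Path('/path/to/parsed_markdown')  # hypothetical input dir


    def read_markdown(path: Path) -> str:
        """Read one parsed paper; runs in a worker process."""
        return path.read_text(errors='ignore')


    if __name__ == '__main__':
        # Parallelize the I/O-heavy reading step across processes.
        paths = sorted(DOC_DIR.glob('*.md'))
        with ProcessPoolExecutor() as pool:
            texts = list(pool.map(read_markdown, paths))

        # Batch-encode with the same langchain encoder the script builds.
        encoder = HuggingFaceBgeEmbeddings(
            model_name='pritamdeka/S-PubMedBert-MS-MARCO',
        )
        vectors = encoder.embed_documents(texts)

        # Attach the precomputed vectors so ingestion does not re-encode.
        nodes = [
            TextNode(text=t, embedding=v)
            for t, v in zip(texts, vectors)
        ]
        index = VectorStoreIndex(nodes=nodes, embed_model=encoder)

Separating encoding from index construction this way also makes the encoding
step easy to shard across workers, which is the direction the TODOs point at.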