Skip to content

Commit

Permalink
Added the test cases for pinecone and implemented review comments
Browse files Browse the repository at this point in the history
Signed-off-by: pallavi jaini <[email protected]>
  • Loading branch information
pallavijaini0525 committed Aug 8, 2024
1 parent 68250f6 commit ae7a93c
Show file tree
Hide file tree
Showing 7 changed files with 177 additions and 112 deletions.
2 changes: 1 addition & 1 deletion comps/dataprep/pinecone/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
import os

# Embedding model
EMBED_MODEL = os.getenv("EMBED_MODEL", "sentence-transformers/all-MiniLM-L6-v2")
EMBED_MODEL = os.getenv("EMBED_MODEL", "BAAI/bge-base-en-v1.5")

# Pinecone configuration
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY", "xxx_xxx")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ services:
ports:
- "6007:6007"
- "6008:6008"
- "6009:6009"
ipc: host
environment:
no_proxy: ${no_proxy}
Expand Down
56 changes: 45 additions & 11 deletions comps/dataprep/pinecone/prepare_doc_pinecone.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@
upload_folder = "./uploaded_files/"


def check_index_existance(client):
def check_index_existance():
print(f"[ check index existence ] checking {PINECONE_INDEX_NAME}")
pc = Pinecone(api_key=PINECONE_API_KEY)
existing_indexes = [index_info["name"] for index_info in pc.list_indexes()]
Expand All @@ -44,7 +44,6 @@ def check_index_existance(client):
else:
return True


def create_index(client):
print(f"[ create index ] creating index {PINECONE_INDEX_NAME}")
try:
Expand All @@ -60,20 +59,17 @@ def create_index(client):
return False
return True


def store_by_id(client, key, value):
print(f"[ store by id ] storing ids of {key}")
def drop_index(index_name):
print(f"[ drop index ] dropping index {index_name}")
pc = Pinecone(api_key=PINECONE_API_KEY)
try:
index = client.Index(PINECONE_INDEX_NAME)
index.upsert(vectors=[{"id": "file:" + key, "values": value}], namespace="ns1")

print(f"[ store by id ] store document success. id: file:{key}")
pc.delete_index(index_name)
print(f"[ drop index ] index {index_name} deleted")
except Exception as e:
print(f"[ store by id ] fail to store document file:{key}: {e}")
print(f"[ drop index ] index {index_name} delete failed: {e}")
return False
return True


def ingest_data_to_pinecone(doc_path: DocPath):
"""Ingest document to Pinecone."""
path = doc_path.path
Expand Down Expand Up @@ -109,6 +105,15 @@ def ingest_data_to_pinecone(doc_path: DocPath):
# create embeddings using local embedding model
embedder = HuggingFaceBgeEmbeddings(model_name=EMBED_MODEL)

pc = Pinecone(api_key=PINECONE_API_KEY)

# Check whether the index already exists
if (not check_index_existance()):
# Creating the index
create_index(pc)
print("Successfully created the index", PINECONE_INDEX_NAME)


# Batch size
batch_size = 32
num_chunks = len(chunks)
Expand Down Expand Up @@ -140,6 +145,12 @@ async def ingest_link_to_pinecone(link_list: List[str]):

pc = Pinecone(api_key=PINECONE_API_KEY)

# Check whether the index already exists
if (not check_index_existance()):
# Creating the index
create_index(pc)
print("Successfully created the index", PINECONE_INDEX_NAME)

# save link contents and doc_ids one by one
for link in link_list:
content = parse_html([link])[0][0]
Expand Down Expand Up @@ -222,7 +233,30 @@ async def rag_get_file_structure():
return file_content


@register_microservice(
    name="opea_service@prepare_doc_pinecone_del", endpoint="/v1/dataprep/delete_file", host="0.0.0.0", port=6009
)
@traceable(run_type="tool")
async def delete_all(file_path: str = Body(..., embed=True)):
    """Delete uploaded files and the Pinecone index according to `file_path`.

    `file_path`:
        - "all": delete all files uploaded and drop the index named by
          `PINECONE_INDEX_NAME`

    Returns:
        `{"status": True}` once the upload folder has been emptied and the
        index has been dropped.

    Raises:
        HTTPException: 404 for any `file_path` other than "all" (single file
            deletion is not implemented); 500 when `drop_index` reports
            failure.
    """
    # Guard clause: only the bulk delete is supported.
    if file_path != "all":
        raise HTTPException(status_code=404, detail="Single file deletion is not implemented yet")

    print("[dataprep - del] delete all files")
    remove_folder_with_ignore(upload_folder)
    index_dropped = drop_index(index_name=PINECONE_INDEX_NAME)
    # Recreate the upload folder whether or not the index drop succeeded, so a
    # failed delete does not leave the service without its upload directory.
    create_upload_folder(upload_folder)

    # Report the failure explicitly instead of using `assert`: assertions are
    # stripped under `python -O` and would let a failed drop be reported as
    # success.
    if not index_dropped:
        raise HTTPException(status_code=500, detail=f"Failed to drop index {PINECONE_INDEX_NAME}")

    print("[dataprep - del] successfully delete all files.")
    return {"status": True}


if __name__ == "__main__":
    # Make sure the upload directory exists before any endpoint is served.
    create_upload_folder(upload_folder)
    # Start the registered dataprep services in their original order.
    for service_name in (
        "opea_service@prepare_doc_pinecone",
        "opea_service@prepare_doc_pinecone_file",
        "opea_service@prepare_doc_pinecone_del",
    ):
        opea_microservices[service_name].start()
Binary file not shown.
100 changes: 0 additions & 100 deletions comps/retrievers/langchain/pinecone/ingest.py

This file was deleted.

59 changes: 59 additions & 0 deletions tests/test_dataprep_pinecone.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
#!/bin/bash
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

# Trace every command (xtrace) and abort on the first failing one (errexit).
set -o xtrace -o errexit

# Parent of the directory the script is launched from.
WORKPATH="$(dirname "$PWD")"
# First address reported by `hostname -I`; used to reach the published ports.
ip_address="$(hostname -I | awk '{ print $1 }')"
# Build the dataprep-pinecone image, using $WORKPATH as the build context.
function build_docker_images() {
    # Quote every expansion so a checkout path or proxy value containing
    # whitespace is not split into several arguments.
    cd "$WORKPATH"

    # build dataprep image for pinecone
    docker build -t opea/dataprep-pinecone:latest \
        --build-arg https_proxy="$https_proxy" \
        --build-arg http_proxy="$http_proxy" \
        -f "$WORKPATH/comps/dataprep/pinecone/docker/Dockerfile" .
}

# Start the dataprep-pinecone container with ports 6007-6009 published.
function start_service() {
    # Keep credentials out of the xtrace log while they are being set.
    { set +x; } 2>/dev/null
    # NOTE(review): a literal Pinecone API key is committed below. It is kept
    # only as a fallback so existing CI keeps working; it should be rotated and
    # supplied through the environment instead. An already exported
    # PINECONE_API_KEY takes precedence.
    export PINECONE_API_KEY="${PINECONE_API_KEY:-f98c36ea-20fa-4d02-9ddc-d331739c4923}"
    export HUGGINGFACEHUB_API_TOKEN="$HF_TOKEN"
    set -x

    # The index name stays fixed: the delete endpoint drops this index, so it
    # must never be redirected to another index through the environment.
    export PINECONE_INDEX_NAME="test-index"

    # `-e PINECONE_API_KEY` (no value) makes docker copy the exported variable,
    # so the key does not appear on the traced command line.
    docker run -d --name="dataprep-pinecone" -p 6007:6007 -p 6008:6008 -p 6009:6009 --ipc=host \
        -e http_proxy="$http_proxy" -e https_proxy="$https_proxy" -e no_proxy="$no_proxy" \
        -e PINECONE_API_KEY -e PINECONE_INDEX_NAME="$PINECONE_INDEX_NAME" \
        opea/dataprep-pinecone:latest

    # Give the service time to come up before it is exercised.
    sleep 1m
}

# Upload a small text file to the dataprep endpoint, then call the delete
# endpoint with "all". Exits with status 1 (after dumping the container logs)
# if either request does not return a 2xx HTTP status.
function validate_microservice() {
    local http_status

    URL="http://$ip_address:6007/v1/dataprep"
    echo 'The OPEA platform includes: Detailed framework of composable building blocks for state-of-the-art generative AI systems including LLMs, data stores, and prompt engines' > ./dataprep_file.txt
    # curl exits 0 even on HTTP 4xx/5xx, so capture the status code and check
    # it. `|| true` keeps errexit from aborting before the check when the
    # connection itself fails (the reported code is then 000).
    http_status=$(curl --noproxy "$ip_address" --location --request POST \
        --silent --show-error --output /dev/null --write-out '%{http_code}' \
        --form 'files=@./dataprep_file.txt' "$URL") || true
    if [[ "$http_status" != 2* ]]; then
        echo "[ dataprep - upload ] expected a 2xx HTTP status, got $http_status"
        docker logs dataprep-pinecone || true
        exit 1
    fi

    DELETE_URL="http://$ip_address:6009/v1/dataprep/delete_file"
    http_status=$(curl --noproxy "$ip_address" --location --request POST \
        --silent --show-error --output /dev/null --write-out '%{http_code}' \
        -d '{"file_path": "all"}' -H 'Content-Type: application/json' "$DELETE_URL") || true
    if [[ "$http_status" != 2* ]]; then
        echo "[ dataprep - del ] expected a 2xx HTTP status, got $http_status"
        docker logs dataprep-pinecone || true
        exit 1
    fi
}

# Stop and remove any containers whose name matches the pinecone test services.
function stop_docker() {
    local name_filter cid

    for name_filter in "name=vectorstore-pinecone*" "name=dataprep-pinecone*"; do
        cid=$(docker ps -aq --filter "$name_filter")
        # $cid is left unquoted on purpose: it may hold several container ids.
        if [[ -n "$cid" ]]; then docker stop $cid && docker rm $cid && sleep 1s; fi
    done
}

# Entry point: clean up, build, start, validate, then clean up again.
main() {
    # Remove containers left over from an earlier run.
    stop_docker

    build_docker_images
    start_service
    validate_microservice

    # Tear the test container down and reclaim docker resources.
    stop_docker
    echo y | docker system prune
}

main
71 changes: 71 additions & 0 deletions tests/test_retrievers_langchain_pinecone.sh

Large diffs are not rendered by default.

0 comments on commit ae7a93c

Please sign in to comment.