Skip to content

Commit

Permalink
[pre-commit.ci] auto fixes from pre-commit.com hooks
Browse files Browse the repository at this point in the history
for more information, see https://pre-commit.ci

Signed-off-by: pallavi jaini <[email protected]>
  • Loading branch information
pre-commit-ci[bot] authored and pallavijaini0525 committed Aug 7, 2024
1 parent 557c460 commit 3a31803
Show file tree
Hide file tree
Showing 5 changed files with 24 additions and 28 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -39,4 +39,3 @@ services:
networks:
default:
driver: bridge

29 changes: 14 additions & 15 deletions comps/dataprep/pinecone/prepare_doc_pinecone.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,48 +33,47 @@
tei_embedding_endpoint = os.getenv("TEI_EMBEDDING_ENDPOINT")
upload_folder = "./uploaded_files/"


def check_index_existance(client):
    """Report whether the index named by PINECONE_INDEX_NAME already exists.

    Args:
        client: Object exposing ``list_indexes()``, whose items can be
            subscripted with ``"name"`` (presumably a ``pinecone.Pinecone``
            client -- confirm against the caller).

    Returns:
        ``True`` when PINECONE_INDEX_NAME is among the listed index names,
        otherwise ``None`` (kept instead of ``False`` so existing callers that
        test the result see the same values as before).
    """
    print(f"[ check index existence ] checking {PINECONE_INDEX_NAME}")

    # Use the client that was passed in; the previous body referenced `pc`,
    # a name that is not defined anywhere in this function.
    existing_indexes = [index_info["name"] for index_info in client.list_indexes()]
    if PINECONE_INDEX_NAME in existing_indexes:
        return True

    print("[ check index existence ] index does not exist")
    return None


def create_index(client, dimension=768, metric="cosine", cloud="aws", region="us-east-1"):
    """Create the serverless index named by PINECONE_INDEX_NAME.

    The defaults reproduce the values that were previously hard-coded, so
    existing ``create_index(client)`` calls behave as before.

    Args:
        client: Object exposing ``create_index(name=, dimension=, metric=, spec=)``.
        dimension: Vector dimension of the index; should match the size of the
            embeddings that will be stored in it.
        metric: Similarity metric passed through to the client.
        cloud: Cloud provider handed to ``ServerlessSpec``.
        region: Cloud region handed to ``ServerlessSpec``.

    Returns:
        ``True`` if the create call completed without raising, ``False`` otherwise.
    """
    print(f"[ create index ] creating index {PINECONE_INDEX_NAME}")
    try:
        client.create_index(
            name=PINECONE_INDEX_NAME,
            dimension=dimension,
            metric=metric,
            spec=ServerlessSpec(cloud=cloud, region=region),
        )
    except Exception as e:
        # Deliberate best-effort: any failure is reported and signalled via the
        # return value instead of propagating to the caller.
        print(f"[ create index ] fail to create index {PINECONE_INDEX_NAME}: {e}")
        return False
    print(f"[ create index ] index {PINECONE_INDEX_NAME} successfully created")
    return True


def store_by_id(client, key, value):
    """Upsert a single vector under the id ``"file:" + key``.

    Args:
        client: Object exposing ``Index(name)``, whose result provides
            ``upsert(vectors=, namespace=)``.
        key: Identifier appended to the ``"file:"`` prefix to form the vector id.
        value: Payload stored as the vector's ``"values"`` entry.

    Returns:
        ``True`` if the upsert completed without raising, ``False`` otherwise.
    """
    print(f"[ store by id ] storing ids of {key}")
    try:
        index = client.Index(PINECONE_INDEX_NAME)
        # One upsert only; the record is written into the "ns1" namespace.
        index.upsert(vectors=[{"id": "file:" + key, "values": value}], namespace="ns1")
    except Exception as e:
        # Deliberate best-effort: report the failure and signal it through the
        # return value instead of raising.
        print(f"[ store by id ] fail to store document file:{key}: {e}")
        return False
    print(f"[ store by id ] store document success. id: file:{key}")
    return True



def ingest_data_to_pinecone(doc_path: DocPath):
"""Ingest document to Pinecone."""
path = doc_path.path
Expand Down Expand Up @@ -125,7 +124,7 @@ def ingest_data_to_pinecone(doc_path: DocPath):
index_name=PINECONE_INDEX_NAME,
)
print(f"Processed batch {i//batch_size + 1}/{(num_chunks-1)//batch_size + 1}")

# store file_ids into index file-keys
pc = Pinecone(api_key=PINECONE_API_KEY)

Expand Down
8 changes: 4 additions & 4 deletions comps/dataprep/pinecone/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,9 @@ fastapi
huggingface_hub
langchain
langchain-community
langchain-text-splitters
langchain-pinecone
langchain-openai
langchain-pinecone
langchain-text-splitters
langsmith
markdown
numpy
Expand All @@ -18,11 +18,11 @@ opentelemetry-exporter-otlp
opentelemetry-sdk
pandas
Pillow
prometheus-fastapi-instrumentator
pinecone-client
prometheus-fastapi-instrumentator
pymupdf
pyspark
python-bidi==0.4.2
pymupdf
python-docx
python-pptx
sentence_transformers
Expand Down
2 changes: 1 addition & 1 deletion comps/retrievers/langchain/pinecone/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -14,9 +14,9 @@ opentelemetry-sdk
pandas
Pillow
pinecone-client
prometheus_fastapi_instrumentator
pymupdf
python-docx
sentence_transformers
shortuuid
prometheus_fastapi_instrumentator
uvicorn
12 changes: 5 additions & 7 deletions comps/retrievers/langchain/pinecone/retriever_pinecone.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,10 @@
import os
import time

from config import EMBED_MODEL, PINECONE_API_KEY, PINECONE_INDEX_NAME
from langchain_community.embeddings import HuggingFaceBgeEmbeddings, HuggingFaceHubEmbeddings
from langchain_pinecone import PineconeVectorStore
from langsmith import traceable
from config import EMBED_MODEL, PINECONE_API_KEY, PINECONE_INDEX_NAME
from pinecone import Pinecone, ServerlessSpec

from comps import (
Expand Down Expand Up @@ -36,19 +36,17 @@
def retrieve(input: EmbedDoc) -> SearchedDoc:
start = time.time()

pc = Pinecone(api_key=PINECONE_API_KEY)
pc = Pinecone(api_key=PINECONE_API_KEY)

index = pc.Index(PINECONE_INDEX_NAME)
print(index.describe_index_stats()['total_vector_count'])
print(index.describe_index_stats()["total_vector_count"])
# check if the Pinecone index has data
if index.describe_index_stats()['total_vector_count'] == 0:
if index.describe_index_stats()["total_vector_count"] == 0:
result = SearchedDoc(retrieved_docs=[], initial_query=input.text)
statistics_dict["opea_service@retriever_pinecone"].append_latency(time.time() - start, None)
return result

search_res = vector_db.max_marginal_relevance_search(
query=input.text, k=input.k, fetch_k=input.fetch_k
)
search_res = vector_db.max_marginal_relevance_search(query=input.text, k=input.k, fetch_k=input.fetch_k)
# if the Pinecone index has data, perform the search
if input.search_type == "similarity":
docs_and_similarities = vector_db.similarity_search_by_vector_with_score(embedding=input.embedding, k=input.k)
Expand Down

0 comments on commit 3a31803

Please sign in to comment.