-
Notifications
You must be signed in to change notification settings - Fork 2k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Perf optimizations in map_query_to_entities() (#1276)
* Address perf issue in map_query_to_entities() * Add semver --------- Co-authored-by: Matthieu Maitre <[email protected]> Co-authored-by: Alonso Guevara <[email protected]>
- Loading branch information
1 parent
1f70d42
commit 6aae386
Showing
9 changed files
with
388 additions
and
13 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
{ | ||
"type": "patch", | ||
"description": "Perf optimizations in map_query_to_entities()" | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
# Copyright (c) 2024 Microsoft Corporation. | ||
# Licensed under the MIT License |
182 changes: 182 additions & 0 deletions
182
tests/unit/query/context_builder/test_entity_extraction.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,182 @@ | ||
# Copyright (c) 2024 Microsoft Corporation. | ||
# Licensed under the MIT License | ||
|
||
from typing import Any | ||
|
||
from graphrag.model import Entity | ||
from graphrag.model.types import TextEmbedder | ||
from graphrag.query.context_builder.entity_extraction import ( | ||
EntityVectorStoreKey, | ||
map_query_to_entities, | ||
) | ||
from graphrag.query.llm.base import BaseTextEmbedding | ||
from graphrag.vector_stores import ( | ||
BaseVectorStore, | ||
VectorStoreDocument, | ||
VectorStoreSearchResult, | ||
) | ||
|
||
|
||
class MockBaseVectorStore(BaseVectorStore):
    """Test double for a vector store that serves a fixed, in-memory document list."""

    def __init__(self, documents: list[VectorStoreDocument]) -> None:
        """Keep ``documents`` as the full content of the store."""
        super().__init__("mock")
        self.documents = documents

    def connect(self, **kwargs: Any) -> None:
        """Not supported by this mock; always raises ``NotImplementedError``."""
        raise NotImplementedError

    def load_documents(
        self, documents: list[VectorStoreDocument], overwrite: bool = True
    ) -> None:
        """Not supported by this mock; always raises ``NotImplementedError``."""
        raise NotImplementedError

    def similarity_search_by_vector(
        self, query_embedding: list[float], k: int = 10, **kwargs: Any
    ) -> list[VectorStoreSearchResult]:
        """Return the first ``k`` stored documents, each with a score of 1.

        The query embedding is ignored; results follow storage order.
        """
        results = []
        for doc in self.documents[:k]:
            results.append(VectorStoreSearchResult(document=doc, score=1))
        return results

    def similarity_search_by_text(
        self, text: str, text_embedder: TextEmbedder, k: int = 10, **kwargs: Any
    ) -> list[VectorStoreSearchResult]:
        """Return the ``k`` documents whose text length is closest to ``len(text)``.

        The score is the absolute length difference (missing text counts as
        empty), and results are ordered by ascending score. The embedder is
        not used.
        """
        query_length = len(text)
        scored = [
            VectorStoreSearchResult(
                document=doc, score=abs(query_length - len(doc.text or ""))
            )
            for doc in self.documents
        ]
        # list.sort is stable, so ties keep storage order.
        scored.sort(key=lambda result: result.score)
        return scored[:k]

    def filter_by_id(self, include_ids: list[str] | list[int]) -> Any:
        """Return the stored documents whose id appears in ``include_ids``."""
        matching = []
        for doc in self.documents:
            if doc.id in include_ids:
                matching.append(doc)
        return matching
|
||
|
||
class MockBaseTextEmbedding(BaseTextEmbedding):
    """Test double for a text embedder: a text embeds to ``[len(text)]``."""

    def embed(self, text: str, **kwargs: Any) -> list[float]:
        """Return a one-element vector holding the length of ``text``."""
        text_length = len(text)
        return [text_length]

    async def aembed(self, text: str, **kwargs: Any) -> list[float]:
        """Async counterpart of ``embed``; returns the same one-element vector."""
        text_length = len(text)
        return [text_length]
|
||
|
||
def test_map_query_to_entities():
    """Check map_query_to_entities() for text queries and for an empty query.

    With the query "t22" and k=1, the entity titled "t22" is expected, both
    when the vector store is keyed by entity id and when it is keyed by title.
    With an empty query and k=2, the two entities with the highest rank are
    expected, highest first, for both key types.
    """
    # (id, short_id, title, rank) for every entity known to the test.
    specs = [
        ("2da37c7a-50a8-44d4-aa2c-fd401e19976c", "sid1", "t1", 2),
        ("c4f93564-4507-4ee4-b102-98add401a965", "sid2", "t22", 4),
        ("7c6f2bc9-47c9-4453-93a3-d2e174a02cd9", "sid3", "t333", 1),
        ("8fd6d72a-8e9d-4183-8a97-c38bcc971c83", "sid4", "t4444", 3),
    ]

    def make_entity(spec):
        entity_id, short_id, title, rank = spec
        return Entity(id=entity_id, short_id=short_id, title=title, rank=rank)

    entities = [make_entity(spec) for spec in specs]

    # Each scenario: query, entity attribute used as the vector-store document
    # id, key type passed to the function, extra keyword arguments, and the
    # indices into ``specs`` of the expected entities (in expected order).
    scenarios = [
        ("t22", "id", EntityVectorStoreKey.ID, {"k": 1, "oversample_scaler": 1}, [1]),
        (
            "t22",
            "title",
            EntityVectorStoreKey.TITLE,
            {"k": 1, "oversample_scaler": 1},
            [1],
        ),
        ("", "id", EntityVectorStoreKey.ID, {"k": 2}, [1, 3]),
        ("", "id", EntityVectorStoreKey.TITLE, {"k": 2}, [1, 3]),
    ]

    for query, id_attr, key_type, extra_kwargs, expected_indices in scenarios:
        store = MockBaseVectorStore([
            VectorStoreDocument(
                id=getattr(entity, id_attr), text=entity.title, vector=None
            )
            for entity in entities
        ])
        found = map_query_to_entities(
            query=query,
            text_embedding_vectorstore=store,
            text_embedder=MockBaseTextEmbedding(),
            all_entities_dict={entity.id: entity for entity in entities},
            embedding_vectorstore_key=key_type,
            **extra_kwargs,
        )
        # Expected entities are rebuilt so the comparison is by value.
        assert found == [make_entity(specs[index]) for index in expected_indices]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
# Copyright (c) 2024 Microsoft Corporation. | ||
# Licensed under the MIT License |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
# Copyright (c) 2024 Microsoft Corporation. | ||
# Licensed under the MIT License |
Oops, something went wrong.