From c0970da77562bb7c30e5fb5010f031f2f9f9903e Mon Sep 17 00:00:00 2001 From: Bhargav Suryadevara Date: Sun, 5 Nov 2023 22:40:53 -0600 Subject: [PATCH 1/3] Added docs and tests --- morpheus/llm/nodes/retriever_node.py | 33 +++++ tests/llm/nodes/test_llm_retriever_node.py | 50 +++++++ .../llm/nodes/test_llm_retriever_node_pipe.py | 124 ++++++++++++++++++ 3 files changed, 207 insertions(+) create mode 100644 tests/llm/nodes/test_llm_retriever_node.py create mode 100644 tests/llm/nodes/test_llm_retriever_node_pipe.py diff --git a/morpheus/llm/nodes/retriever_node.py b/morpheus/llm/nodes/retriever_node.py index 545ef9ddf9..5a9e563500 100644 --- a/morpheus/llm/nodes/retriever_node.py +++ b/morpheus/llm/nodes/retriever_node.py @@ -23,6 +23,18 @@ class RetrieverNode(LLMNodeBase): + """ + Node for retrieving data from a vector database based on embeddings. + + Parameters + ---------- + embedding : typing.Callable[[list[str]], typing.Coroutine[typing.Any, typing.Any, list[list[float]]]] | None + Callable function for generating vector embeddings. Default is None. + service : VectorDBResourceService + Vector database resource service for executing similarity searches. + similarity_search_kwargs : dict + Additional keyword arguments for the similarity search. + """ def __init__( self, @@ -38,12 +50,33 @@ def __init__( self._similarity_search_kwargs = similarity_search_kwargs def get_input_names(self) -> list[str]: + """ + Get the input names for the RetrieverNode. + + Returns + ------- + list[str] + List of input names for the RetrieverNode. + """ if (self._embedding is None): return ["embedding"] return ["query"] async def execute(self, context: LLMContext): + """ + Execute the retrieval process based on the provided context. + + Parameters + ---------- + context : LLMContext + Context object containing necessary information for execution. + + Returns + ------- + LLMContext + Updated context object after the execution. + """ if (self._embedding is not None): # Get the keys from the task diff --git a/tests/llm/nodes/test_llm_retriever_node.py b/tests/llm/nodes/test_llm_retriever_node.py new file mode 100644 index 0000000000..5988dbb0a5 --- /dev/null +++ b/tests/llm/nodes/test_llm_retriever_node.py @@ -0,0 +1,50 @@ +# SPDX-FileCopyrightText: Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import typing +from unittest import mock + +import pytest + +from _utils.llm import execute_node +from morpheus.llm import LLMNodeBase +from morpheus.llm.nodes.retriever_node import RetrieverNode + + +@pytest.mark.parametrize("embedding", [None, mock.AsyncMock()]) +def test_constructor(embedding: typing.Callable | None): + mock_vdb_service = mock.MagicMock() + node = RetrieverNode(embedding=embedding, service=mock_vdb_service) + assert isinstance(node, LLMNodeBase) + + +@pytest.mark.parametrize("embedding, expected", [(None, ["embedding"]), (mock.AsyncMock(), ["query"])]) +def test_get_input_names(embedding: typing.Callable | None, + expected: list[str]): + mock_vdb_service = mock.MagicMock() + node = RetrieverNode(embedding=embedding, service=mock_vdb_service) + assert node.get_input_names() == expected + + +@pytest.mark.parametrize("embedding", [None, mock.AsyncMock(return_value=[[1.2, 2.3, 3.4], [4.5, 5.6, 6.7]])]) +def test_execute(embedding: mock.AsyncMock | None): + mock_vdb_service = mock.MagicMock() + mock_vdb_service.similarity_search = mock.AsyncMock(return_value=[[1, 2, 3], [4, 5, 6]]) + + node = RetrieverNode(embedding=embedding, service=mock_vdb_service) + + expected_output = [[1, 2, 3], [4, 5, 6]] + + assert execute_node(node, query=["query"]) == expected_output diff --git a/tests/llm/nodes/test_llm_retriever_node_pipe.py b/tests/llm/nodes/test_llm_retriever_node_pipe.py new file mode 100644 index 0000000000..e99fec0851 --- /dev/null +++ b/tests/llm/nodes/test_llm_retriever_node_pipe.py @@ -0,0 +1,124 @@ +# SPDX-FileCopyrightText: Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from unittest import mock + +import pytest + +import cudf +from morpheus.config import Config +from morpheus.llm import LLMEngine +from morpheus.llm.nodes.extracter_node import ExtracterNode +from morpheus.llm.nodes.retriever_node import RetrieverNode +from morpheus.llm.task_handlers.simple_task_handler import SimpleTaskHandler +from morpheus.messages import ControlMessage +from morpheus.pipeline.linear_pipeline import LinearPipeline +from morpheus.service.vdb.milvus_vector_db_service import MilvusVectorDBResourceService +from morpheus.service.vdb.milvus_vector_db_service import MilvusVectorDBService +from morpheus.stages.input.in_memory_source_stage import InMemorySourceStage +from morpheus.stages.llm.llm_engine_stage import LLMEngineStage +from morpheus.stages.output.in_memory_sink_stage import InMemorySinkStage +from morpheus.stages.preprocess.deserialize_stage import DeserializeStage + + +@pytest.fixture(scope="module", name="milvus_service") +def milvus_service_fixture(milvus_server_uri: str): + service = MilvusVectorDBService(uri=milvus_server_uri) + yield service + + +def _build_engine(vdb_service, **similarity_search_kwargs) -> LLMEngine: + mock_embedding = mock.AsyncMock(return_value=[[1.2, 2.3, 3.4], [4.5, 5.6, 6.7]]) + engine = LLMEngine() + engine.add_node("extracter", node=ExtracterNode()) + engine.add_node("retriever", + inputs=["/extracter"], + node=RetrieverNode(service=vdb_service, + embedding=mock_embedding, **similarity_search_kwargs)) + engine.add_task_handler(inputs=["/retriever"], handler=SimpleTaskHandler()) + + return engine + +@pytest.mark.use_python +def test_pipeline(config: Config): + expected_output = [[1, 2, 3], [4, 5, 6]] + + values = {'prompt': ["prompt1", "prompt2"]} + input_df = cudf.DataFrame(values) + expected_df = input_df.copy(deep=True) + expected_df["response"] = expected_output + + task_payload = {"task_type": "llm_engine", "task_dict": {"input_keys": sorted(values.keys())}} + + mock_vdb_service = mock.MagicMock() + mock_vdb_service.similarity_search = mock.AsyncMock(return_value=[[1, 2, 3], [4, 5, 6]]) + + pipe = LinearPipeline(config) + pipe.set_source(InMemorySourceStage(config, dataframes=[input_df])) + pipe.add_stage( + DeserializeStage(config, message_type=ControlMessage, task_type="llm_engine", task_payload=task_payload)) + pipe.add_stage(LLMEngineStage(config, engine=_build_engine(vdb_service=mock_vdb_service))) + sink = pipe.add_stage(InMemorySinkStage(config)) + + pipe.run() + + message = sink.get_messages()[0] + assert isinstance(message, ControlMessage) + actual_df = message.payload().df + + # Using equals, as CompareDataFrameStage fails to compare. + assert actual_df.to_pandas().equals(expected_df.to_pandas()) + + +@pytest.mark.use_python +def test_pipeline_with_milvus(config: Config, + milvus_service: MilvusVectorDBService, + idx_part_collection_config: dict, + milvus_data: list[dict]): + + collection_name = "test_search_with_data_collection" + # Make sure to drop any existing collection from previous runs. + milvus_service.drop(collection_name) + # Create a collection. + milvus_service.create(collection_name, **idx_part_collection_config) + resource_service: MilvusVectorDBResourceService = milvus_service.load_resource(name=collection_name) + # Insert data into collection + resource_service.insert(milvus_data) + + # Define a similarity_search filter. + expr = "age==26 or age==27" + + values = {'prompt': ["prompt1", "prompt2"]} + input_df = cudf.DataFrame(values) + expected_df = input_df.copy(deep=True) + expected_df["response"] = [[{'0': 27, '1': 2}, {'0': 26, '1': 1}], [{'0': 27, '1': 2}, {'0': 26, '1': 1}]] + + task_payload = {"task_type": "llm_engine", "task_dict": {"input_keys": sorted(values.keys())}} + + pipe = LinearPipeline(config) + pipe.set_source(InMemorySourceStage(config, dataframes=[input_df])) + pipe.add_stage( + DeserializeStage(config, message_type=ControlMessage, task_type="llm_engine", task_payload=task_payload)) + pipe.add_stage(LLMEngineStage(config, engine=_build_engine(vdb_service=resource_service, expr=expr))) + sink = pipe.add_stage(InMemorySinkStage(config)) + + pipe.run() + + message = sink.get_messages()[0] + assert isinstance(message, ControlMessage) + actual_df = message.payload().df + + # Using equals, as CompareDataFrameStage fails to compare. + assert actual_df.to_pandas().equals(expected_df.to_pandas()) From 4d838ef77f97d8c6447d32fb41c18f3381f370f1 Mon Sep 17 00:00:00 2001 From: Bhargav Suryadevara Date: Mon, 6 Nov 2023 09:14:19 -0600 Subject: [PATCH 2/3] Updated tests --- tests/llm/nodes/test_llm_retriever_node.py | 3 +-- tests/llm/nodes/test_llm_retriever_node_pipe.py | 5 +++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/llm/nodes/test_llm_retriever_node.py b/tests/llm/nodes/test_llm_retriever_node.py index 5988dbb0a5..6834532a91 100644 --- a/tests/llm/nodes/test_llm_retriever_node.py +++ b/tests/llm/nodes/test_llm_retriever_node.py @@ -31,8 +31,7 @@ def test_constructor(embedding: typing.Callable | None): @pytest.mark.parametrize("embedding, expected", [(None, ["embedding"]), (mock.AsyncMock(), ["query"])]) -def test_get_input_names(embedding: typing.Callable | None, - expected: list[str]): +def test_get_input_names(embedding: typing.Callable | None, expected: list[str]): mock_vdb_service = mock.MagicMock() node = RetrieverNode(embedding=embedding, service=mock_vdb_service) assert node.get_input_names() == expected diff --git a/tests/llm/nodes/test_llm_retriever_node_pipe.py b/tests/llm/nodes/test_llm_retriever_node_pipe.py index e99fec0851..55abae5a20 100644 --- a/tests/llm/nodes/test_llm_retriever_node_pipe.py +++ b/tests/llm/nodes/test_llm_retriever_node_pipe.py @@ -18,6 +18,7 @@ import pytest import cudf + from morpheus.config import Config from morpheus.llm import LLMEngine from morpheus.llm.nodes.extracter_node import ExtracterNode @@ -45,12 +46,12 @@ def _build_engine(vdb_service, **similarity_search_kwargs) -> LLMEngine: engine.add_node("extracter", node=ExtracterNode()) engine.add_node("retriever", inputs=["/extracter"], - node=RetrieverNode(service=vdb_service, - embedding=mock_embedding, **similarity_search_kwargs)) + node=RetrieverNode(service=vdb_service, embedding=mock_embedding, **similarity_search_kwargs)) engine.add_task_handler(inputs=["/retriever"], handler=SimpleTaskHandler()) return engine + @pytest.mark.use_python def test_pipeline(config: Config): expected_output = [[1, 2, 3], [4, 5, 6]] From a34602ddace5092a135976465027accb984a5f77 Mon Sep 17 00:00:00 2001 From: Bhargav Suryadevara Date: Mon, 6 Nov 2023 14:22:58 -0600 Subject: [PATCH 3/3] Updated collection name in the test --- tests/llm/nodes/test_llm_retriever_node_pipe.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/llm/nodes/test_llm_retriever_node_pipe.py b/tests/llm/nodes/test_llm_retriever_node_pipe.py index 55abae5a20..0128cd7a27 100644 --- a/tests/llm/nodes/test_llm_retriever_node_pipe.py +++ b/tests/llm/nodes/test_llm_retriever_node_pipe.py @@ -84,12 +84,13 @@ def test_pipeline(config: Config): @pytest.mark.use_python +@pytest.mark.milvus def test_pipeline_with_milvus(config: Config, milvus_service: MilvusVectorDBService, idx_part_collection_config: dict, milvus_data: list[dict]): - collection_name = "test_search_with_data_collection" + collection_name = "test_retriever_node_collection" # Make sure to drop any existing collection from previous runs. milvus_service.drop(collection_name) # Create a collection.