diff --git a/python/huggingfaceserver/huggingfaceserver/model.py b/python/huggingfaceserver/huggingfaceserver/model.py
index 0d2ef725145..7575ee96064 100644
--- a/python/huggingfaceserver/huggingfaceserver/model.py
+++ b/python/huggingfaceserver/huggingfaceserver/model.py
@@ -196,6 +196,7 @@ async def generate(self, generate_request: GenerateRequest, headers: Dict[str, s
     async def predict(self, input_batch: Union[BatchEncoding, InferRequest], context: Dict[str, Any] = None) \
             -> Union[Tensor, InferResponse]:
+        response_headers = {}
         if self.predictor_host:
             # when predictor_host is provided, serialize the tensor and send to optimized model serving runtime
             # like NVIDIA triton inference server
@@ -208,7 +209,7 @@ async def predict(self, input_batch: Union[BatchEncoding, InferRequest], context
                     outputs = self.model.generate(**input_batch)
                 else:
                     outputs = self.model(**input_batch).logits
-                return outputs
+                return outputs, response_headers
             except Exception as e:
                 raise InferenceError(str(e))
diff --git a/python/huggingfaceserver/huggingfaceserver/test_model.py b/python/huggingfaceserver/huggingfaceserver/test_model.py
index 0cf957d0696..4db5bebefce 100644
--- a/python/huggingfaceserver/huggingfaceserver/test_model.py
+++ b/python/huggingfaceserver/huggingfaceserver/test_model.py
@@ -27,7 +27,7 @@ def test_t5():
     model.load()

     request = "translate this to germany"
-    response = asyncio.run(model({"instances": [request, request]}, headers={}))
+    response, response_headers = asyncio.run(model({"instances": [request, request]}, headers={}))
     assert response == {"predictions": ['Das ist für Deutschland', 'Das ist für Deutschland']}


@@ -35,7 +35,7 @@ def test_bert():
     model = HuggingfaceModel("bert-base-uncased", {"model_id": "bert-base-uncased", "do_lower_case": True})
     model.load()

-    response = asyncio.run(model({"instances": ["The capital of France is [MASK].",
+    response, response_headers = asyncio.run(model({"instances": ["The capital of France is [MASK].",
                                                 "The capital of [MASK] is paris."]}, headers={}))
     assert response == {"predictions": ["paris", "france"]}

@@ -51,7 +51,7 @@ def test_bert_predictor_host(httpx_mock: HTTPXMock):
                                         predictor_host="localhost:8081",
                                         predictor_protocol="v2"))
     model.load()

-    response = asyncio.run(model({"instances": ["The capital of France is [MASK]."]}, headers={}))
+    response, response_headers = asyncio.run(model({"instances": ["The capital of France is [MASK]."]}, headers={}))
     assert response == {"predictions": ["[PAD]"]}

@@ -62,7 +62,7 @@ def test_bert_sequence_classification():
     model.load()

     request = "Hello, my dog is cute."
-    response = asyncio.run(model({"instances": [request, request]}, headers={}))
+    response, response_headers = asyncio.run(model({"instances": [request, request]}, headers={}))
     assert response == {"predictions": [1, 1]}


@@ -73,7 +73,7 @@ def test_bert_token_classification():
     model.load()

     request = "HuggingFace is a company based in Paris and New York"
-    response = asyncio.run(model({"instances": [request, request]}, headers={}))
+    response, response_headers = asyncio.run(model({"instances": [request, request]}, headers={}))
     assert response == {"predictions": [[[0, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]],
                                         [[0, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]]}
diff --git a/python/kserve/test/test_server.py b/python/kserve/test/test_server.py
index 65e4bc81eb4..97e12905337 100644
--- a/python/kserve/test/test_server.py
+++ b/python/kserve/test/test_server.py
@@ -333,6 +333,7 @@ def test_infer_parameters_v2(self, http_server_client):
         input_data = json.dumps(req.to_rest()).encode('utf-8')

         with patch.object(DummyModel, 'predict', new_callable=mock.Mock) as mock_predict:
+            response_headers = {}
             mock_predict.return_value = InferResponse(model_name="TestModel", response_id="123",
                                                       parameters={
                                                           "test-str": "dummy",
@@ -347,7 +348,7 @@ def test_infer_parameters_v2(self, http_server_client):
                                                               "test-str": "dummy",
                                                               "test-bool": True,
                                                               "test-int": 100
-                                                          })])
+                                                          })]), response_headers
             resp = http_server_client.post('/v2/models/TestModel/infer', content=input_data)
             mock_predict.assert_called_with(req, mock.ANY)
diff --git a/python/lgbserver/lgbserver/test_model.py b/python/lgbserver/lgbserver/test_model.py
index 6e1c6b98fad..0368a043ef1 100644
--- a/python/lgbserver/lgbserver/test_model.py
+++ b/python/lgbserver/lgbserver/test_model.py
@@ -47,39 +47,37 @@ def test_model():
                'petal_width_(cm)': {0: 0.2}, 'sepal_length_(cm)': {0: 5.1}}

     response, response_headers = model.predict({"inputs": [request, request]})
-    assert numpy.argmax(response["predictions"][0]) == 2
+    assert numpy.argmax(response["predictions"][0]) == 0

-    response, response_headers = model.predict(
-        {"instances": [request, request]})
-    assert numpy.argmax(response["predictions"][0]) == 2
+    response, response_headers = model.predict({"instances": [request, request]})
+    assert numpy.argmax(response["predictions"][0]) == 0

     request = [
         {'sepal_width_(cm)': 3.5}, {'petal_length_(cm)': 1.4},
         {'petal_width_(cm)': 0.2}, {'sepal_length_(cm)': 5.1}
     ]
-    response = model.predict({"inputs": [request, request]})
+    response, response_headers = model.predict({"inputs": [request, request]})
     assert numpy.argmax(response["predictions"][0]) == 0

-    response = model.predict({"instances": [request, request]})
+    response, response_headers = model.predict({"instances": [request, request]})
     assert numpy.argmax(response["predictions"][0]) == 0

     request = [
         {'sepal_width_(cm)': 3.5}, {'petal_length_(cm)': 1.4},
         {'petal_width_(cm)': 0.2}
     ]
-    response = model.predict({"inputs": [request, request]})
+    response, response_headers = model.predict({"inputs": [request, request]})
     assert numpy.argmax(response["predictions"][0]) == 0

-    response = model.predict({"instances": [request, request]})
+    response, response_headers = model.predict({"instances": [request, request]})
     assert numpy.argmax(response["predictions"][0]) == 0

     # test v2 handler
     infer_input = InferInput(name="input-0", shape=[2, 4], datatype="FP32",
                              data=[[6.8, 2.8, 4.8, 1.6], [6.0, 3.4, 4.5, 1.6]])
-    infer_request = InferRequest(
-        model_name="model", infer_inputs=[infer_input])
+    infer_request = InferRequest(model_name="model",
+                                 infer_inputs=[infer_input])
     infer_response, response_headers = model.predict(infer_request)
     assert infer_response.to_rest()["outputs"] == \
-        [{'name': 'output-0', 'shape': [2, 3], 'datatype': 'FP64',
-         'data': [3.7899802486733807e-06, 0.9996982074114203, 0.00029800260833088297,
-                  5.2172911836629736e-05, 0.99973341723876, 0.000214409849403366]}]
+           [{'name': 'output-0', 'shape': [2, 3], 'datatype': 'FP64',
+             'data': [3.7899802486733807e-06, 0.9996982074114203, 0.00029800260833088297,
+                      5.2172911836629736e-05, 0.99973341723876, 0.000214409849403366]}]
diff --git a/python/test_resources/graph/success_200_isvc/model.py b/python/test_resources/graph/success_200_isvc/model.py
index bf2a4d6fe7f..0ec88cf15ce 100644
--- a/python/test_resources/graph/success_200_isvc/model.py
+++ b/python/test_resources/graph/success_200_isvc/model.py
@@ -32,7 +32,8 @@ def load(self):
         self.ready = True

     def predict(self, payload: Union[Dict, InferRequest, ModelInferRequest], headers) -> Dict:
-        return {"message": "SUCCESS"}
+        response_headers = {}
+        return {"message": "SUCCESS"}, response_headers


 parser = argparse.ArgumentParser(parents=[kserve.model_server.parser])
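
Taken together, these hunks change the predict contract so that a model returns a (payload, response_headers) tuple instead of a bare payload, and every caller in the tests now unpacks both values. The sketch below shows a minimal custom model written against that contract; the class name EchoModel, the model name "custom-model", and the echoed x-request-id header are illustrative assumptions, not part of this diff.

    # Minimal sketch, assuming the (payload, response_headers) return contract shown above.
    from typing import Dict

    import kserve


    class EchoModel(kserve.Model):
        def __init__(self, name: str):
            super().__init__(name)
            self.ready = False

        def load(self):
            # Mark the model as ready to serve traffic.
            self.ready = True

        def predict(self, payload: Dict, headers: Dict[str, str] = None):
            # The second element of the returned tuple is a dict of headers
            # attached to the HTTP response (x-request-id here is only an example).
            response_headers = {"x-request-id": (headers or {}).get("x-request-id", "")}
            return {"predictions": payload.get("instances", [])}, response_headers


    if __name__ == "__main__":
        kserve.ModelServer().start([EchoModel("custom-model")])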