From 0174d94f8a2ed1ea68345c6c8059b7ae03a51b10 Mon Sep 17 00:00:00 2001
From: xchen
Date: Fri, 20 Dec 2024 14:29:20 +0800
Subject: [PATCH] multimodal: update doc and model path in launcher (#48)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* update multimodal doc and requirements

* update model path

---------

Co-authored-by: Xiaotong Chen <cxt459847@alibaba-inc.com>
---
 docs/sphinx/vlm/vlm_offline_inference_en.rst  | 18 +++++++++++-------
 multimodal/dashinfer_vlm/api_server/server.py |  3 ++-
 multimodal/requirements.txt                   |  3 ++-
 3 files changed, 15 insertions(+), 9 deletions(-)

diff --git a/docs/sphinx/vlm/vlm_offline_inference_en.rst b/docs/sphinx/vlm/vlm_offline_inference_en.rst
index 10ecdb4c..69a0d491 100644
--- a/docs/sphinx/vlm/vlm_offline_inference_en.rst
+++ b/docs/sphinx/vlm/vlm_offline_inference_en.rst
@@ -97,11 +97,18 @@ You can also use OpenAI's Python client library:
                 },
             ],
         }],
-        stream=False,
+        stream=True,
         max_completion_tokens=1024,
         temperature=0.1,
     )
 
+    full_response = ""
+    for chunk in response:
+        full_response += chunk.choices[0].delta.content or ""
+        print(".", end="")
+
+    print(f"\nFull Response: \n{full_response}")
+
 Launching with CLI
 -------------------------
 You can also opt to install dashinfer-vlm locally and use command line to launch server.
@@ -109,14 +116,11 @@ You can also opt to install dashinfer-vlm locally and use command line to launch
 1. Pull dashinfer docker image (see :ref:`docker-label`)
 2. Install TensorRT Python package, and download TensorRT GA build from NVIDIA Developer Zone.
 
-Example: TensorRT 10.6.0.26 for CUDA 12.6, Linux x86_64
-
 .. code-block:: bash
 
-    pip install tensorrt
-    wget https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.6.0/tars/TensorRT-10.6.0.26.Linux.x86_64-gnu.cuda-12.6.tar.gz
-    tar -xvzf TensorRT-10.6.0.26.Linux.x86_64-gnu.cuda-12.6.tar.gz
-    export LD_LIBRARY_PATH=`pwd`/TensorRT-10.6.0.26/lib
+    wget https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.5.0/tars/TensorRT-10.5.0.18.Linux.x86_64-gnu.cuda-12.6.tar.gz
+    tar -xvzf TensorRT-10.5.0.18.Linux.x86_64-gnu.cuda-12.6.tar.gz
+    export LD_LIBRARY_PATH=`pwd`/TensorRT-10.5.0.18/lib
 
 3. Install dashinfer Python Package from `release `_
 4. Install dashinfer-vlm: ``pip install dashinfer-vlm``.
diff --git a/multimodal/dashinfer_vlm/api_server/server.py b/multimodal/dashinfer_vlm/api_server/server.py
index 921cd7d9..89221067 100644
--- a/multimodal/dashinfer_vlm/api_server/server.py
+++ b/multimodal/dashinfer_vlm/api_server/server.py
@@ -76,7 +76,8 @@ def init():
     context.set("chat_format", chat_format)
 
     # -----------------------Convert Model------------------------
-    output_dir = "/root/.cache/as_model/" + model.split("/")[-1]
+    home_dir = os.environ.get("HOME") or "/root"
+    output_dir = os.path.join(home_dir, ".cache/as_model/", model.split("/")[-1])
     model_name = "model"
     data_type = "bfloat16"
 
diff --git a/multimodal/requirements.txt b/multimodal/requirements.txt
index 135cf9d9..b5c95a89 100644
--- a/multimodal/requirements.txt
+++ b/multimodal/requirements.txt
@@ -1,3 +1,4 @@
+tensorrt==10.5.0
 av
 numpy==1.24.3
 requests==2.32.3
@@ -6,7 +7,7 @@ transformers>=4.45.0
 cachetools>=5.4.0
 six
 tiktoken
-openai==1.52.2
+openai>=1.56.2
 shortuuid
 fastapi
 pydantic_settings
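
Note on the streaming change in the doc hunk above: with ``stream=True``, the OpenAI Python client yields chunks whose ``delta.content`` can be ``None`` (for example, a final chunk carrying only ``finish_reason``), which is why the accumulation guards with ``or ""``. A minimal self-contained sketch of the same consumption loop; the base URL, API key, and model name here are illustrative assumptions, not values taken from the patch:

.. code-block:: python

    from openai import OpenAI

    # Hypothetical endpoint and model id for illustration only;
    # substitute the values your dashinfer-vlm server actually uses.
    client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

    response = client.chat.completions.create(
        model="qwen-vl-chat",  # assumed model name
        messages=[{"role": "user", "content": "Describe the image in one sentence."}],
        stream=True,
        max_completion_tokens=1024,
    )

    full_response = ""
    for chunk in response:
        delta = chunk.choices[0].delta.content
        if delta:  # final chunk may carry only finish_reason, with content=None
            full_response += delta
    print(full_response)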
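
Note on the cache-path change in ``server.py``: resolving ``$HOME`` with a ``/root`` fallback keeps the converted-model cache under the current user's home directory instead of hard-coding ``/root``. ``os.path.expanduser`` is a stdlib equivalent of the same resolution; a small sketch (the ``model`` value is a placeholder, not from the patch):

.. code-block:: python

    import os

    model = "qwen/Qwen2-VL-7B-Instruct"  # placeholder model id

    # expanduser("~") reads $HOME and, on POSIX, falls back to the pwd
    # database when HOME is unset, covering the same cases as the patch.
    home_dir = os.path.expanduser("~")
    output_dir = os.path.join(home_dir, ".cache", "as_model", model.split("/")[-1])
    print(output_dir)  # e.g. /root/.cache/as_model/Qwen2-VL-7B-Instruct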