From 0174d94f8a2ed1ea68345c6c8059b7ae03a51b10 Mon Sep 17 00:00:00 2001
From: xchen
Date: Fri, 20 Dec 2024 14:29:20 +0800
Subject: [PATCH] multimodal: update doc and model path in launcher (#48)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* update multimodal doc and requirements

* update model path

---------

Co-authored-by: Xiaotong Chen <cxt459847@alibaba-inc.com>
---
 docs/sphinx/vlm/vlm_offline_inference_en.rst  | 18 +++++++++++-------
 multimodal/dashinfer_vlm/api_server/server.py |  3 ++-
 multimodal/requirements.txt                   |  3 ++-
 3 files changed, 15 insertions(+), 9 deletions(-)

diff --git a/docs/sphinx/vlm/vlm_offline_inference_en.rst b/docs/sphinx/vlm/vlm_offline_inference_en.rst
index 10ecdb4c..69a0d491 100644
--- a/docs/sphinx/vlm/vlm_offline_inference_en.rst
+++ b/docs/sphinx/vlm/vlm_offline_inference_en.rst
@@ -97,11 +97,18 @@ You can also use OpenAI's Python client library:
                 },
             ],
         }],
-        stream=False,
+        stream=True,
         max_completion_tokens=1024,
         temperature=0.1,
     )
 
+    full_response = ""
+    for chunk in response:
+        full_response += chunk.choices[0].delta.content or ""
+        print(".", end="")
+
+    print(f"\nFull Response: \n{full_response}")
+
 Launching with CLI
 -------------------------
 You can also opt to install dashinfer-vlm locally and use command line to launch server.
@@ -109,14 +116,11 @@ You can also opt to install dashinfer-vlm locally and use command line to launch
 1. Pull dashinfer docker image (see :ref:`docker-label`)
 2. Install TensorRT Python package, and download TensorRT GA build from NVIDIA Developer Zone.
 
-Example: TensorRT 10.6.0.26 for CUDA 12.6, Linux x86_64
-
 .. code-block:: bash
 
-    pip install tensorrt
-    wget https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.6.0/tars/TensorRT-10.6.0.26.Linux.x86_64-gnu.cuda-12.6.tar.gz
-    tar -xvzf TensorRT-10.6.0.26.Linux.x86_64-gnu.cuda-12.6.tar.gz
-    export LD_LIBRARY_PATH=`pwd`/TensorRT-10.6.0.26/lib
+    wget https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.5.0/tars/TensorRT-10.5.0.18.Linux.x86_64-gnu.cuda-12.6.tar.gz
+    tar -xvzf TensorRT-10.5.0.18.Linux.x86_64-gnu.cuda-12.6.tar.gz
+    export LD_LIBRARY_PATH=`pwd`/TensorRT-10.5.0.18/lib
 
 3. Install dashinfer Python Package from `release `_
 4. Install dashinfer-vlm: ``pip install dashinfer-vlm``.
diff --git a/multimodal/dashinfer_vlm/api_server/server.py b/multimodal/dashinfer_vlm/api_server/server.py
index 921cd7d9..89221067 100644
--- a/multimodal/dashinfer_vlm/api_server/server.py
+++ b/multimodal/dashinfer_vlm/api_server/server.py
@@ -76,7 +76,8 @@ def init():
     context.set("chat_format", chat_format)
 
     # -----------------------Convert Model------------------------
-    output_dir = "/root/.cache/as_model/" + model.split("/")[-1]
+    home_dir = os.environ.get("HOME") or "/root"
+    output_dir = os.path.join(home_dir, ".cache/as_model/", model.split("/")[-1])
     model_name = "model"
     data_type = "bfloat16"
 
diff --git a/multimodal/requirements.txt b/multimodal/requirements.txt
index 135cf9d9..b5c95a89 100644
--- a/multimodal/requirements.txt
+++ b/multimodal/requirements.txt
@@ -1,3 +1,4 @@
+tensorrt==10.5.0
 av
 numpy==1.24.3
 requests==2.32.3
@@ -6,7 +7,7 @@ transformers>=4.45.0
 cachetools>=5.4.0
 six
 tiktoken
-openai==1.52.2
+openai>=1.56.2
 shortuuid
 fastapi
 pydantic_settings
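
Note on the streaming change in the doc hunk above: with ``stream=True``, the OpenAI Python client yields chunks whose ``delta.content`` can be ``None`` (for example, a final chunk carrying only ``finish_reason``), which is why the accumulation guards with ``or ""``. A minimal self-contained sketch of the same consumption loop; the base URL, API key, and model name here are illustrative assumptions, not values taken from the patch:

.. code-block:: python

    from openai import OpenAI

    # Hypothetical endpoint and model id for illustration only;
    # substitute the values your dashinfer-vlm server actually uses.
    client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

    response = client.chat.completions.create(
        model="qwen-vl-chat",  # assumed model name
        messages=[{"role": "user", "content": "Describe the image in one sentence."}],
        stream=True,
        max_completion_tokens=1024,
    )

    full_response = ""
    for chunk in response:
        delta = chunk.choices[0].delta.content
        if delta:  # final chunk may carry only finish_reason, with content=None
            full_response += delta
    print(full_response)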
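
Note on the cache-path change in ``server.py``: resolving ``$HOME`` with a ``/root`` fallback keeps the converted-model cache under the current user's home directory instead of hard-coding ``/root``. ``os.path.expanduser`` is a stdlib equivalent of the same resolution; a small sketch (the ``model`` value is a placeholder, not from the patch):

.. code-block:: python

    import os

    model = "qwen/Qwen2-VL-7B-Instruct"  # placeholder model id

    # expanduser("~") reads $HOME and, on POSIX, falls back to the pwd
    # database when HOME is unset, covering the same cases as the patch.
    home_dir = os.path.expanduser("~")
    output_dir = os.path.join(home_dir, ".cache", "as_model", model.split("/")[-1])
    print(output_dir)  # e.g. /root/.cache/as_model/Qwen2-VL-7B-Instruct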