diff --git a/comps/__init__.py b/comps/__init__.py
index 873a7697a3..a5d00f9e07 100644
--- a/comps/__init__.py
+++ b/comps/__init__.py
@@ -18,6 +18,7 @@
     RAGASScores,
     GraphDoc,
     LVMDoc,
+    LVMVideoDoc,
 )
 
 # Constants
diff --git a/comps/cores/proto/docarray.py b/comps/cores/proto/docarray.py
index 9760d7d3ec..0d397dcb7b 100644
--- a/comps/cores/proto/docarray.py
+++ b/comps/cores/proto/docarray.py
@@ -5,7 +5,7 @@
 import numpy as np
 from docarray import BaseDoc, DocList
-from docarray.documents import AudioDoc
+from docarray.documents import AudioDoc, VideoDoc
 from docarray.typing import AudioUrl
 from pydantic import Field, conint, conlist, field_validator
@@ -170,3 +170,11 @@ class LVMDoc(BaseDoc):
     temperature: float = 0.01
     repetition_penalty: float = 1.03
     streaming: bool = False
+
+
+class LVMVideoDoc(BaseDoc):
+    video_url: str
+    chunk_start: float
+    chunk_duration: float
+    prompt: str
+    max_new_tokens: conint(ge=0, le=1024) = 512
diff --git a/comps/lvms/video-llama/Dockerfile b/comps/lvms/video-llama/Dockerfile
new file mode 100644
index 0000000000..b172a217a4
--- /dev/null
+++ b/comps/lvms/video-llama/Dockerfile
@@ -0,0 +1,18 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+FROM python:3.11-slim
+
+# Set environment variables
+ENV LANG=en_US.UTF-8
+
+COPY comps /home/comps
+
+RUN pip install --no-cache-dir --upgrade pip && \
+    pip install --no-cache-dir -r /home/comps/lvms/video-llama/requirements.txt
+
+ENV PYTHONPATH=$PYTHONPATH:/home
+
+WORKDIR /home/comps/lvms/video-llama
+
+ENTRYPOINT ["python", "lvm.py"]
\ No newline at end of file
diff --git a/comps/lvms/video-llama/README.md b/comps/lvms/video-llama/README.md
new file mode 100644
index 0000000000..43ec0bd188
--- /dev/null
+++ b/comps/lvms/video-llama/README.md
@@ -0,0 +1,70 @@
+# LVM Microservice
+
+This is a Docker-based microservice that runs Video-LLaMA as a Large Vision Model (LVM). It uses Llama-2-7b-chat-hf for conversation based on video dialogues and supports Intel Xeon CPUs.
+
+# 🚀1. Start Microservice with Docker
+
+## 1.1 Build Images
+
+```bash
+cd GenAIComps
+# Video-Llama Server Image
+docker build --no-cache -t opea/video-llama-lvm-server:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/lvms/video-llama/server/docker/Dockerfile .
+# LVM Service Image
+docker build --no-cache -t opea/lvm-video-llama:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/lvms/video-llama/Dockerfile .
+```
+
+## 1.2 Start Video-Llama and LVM Services
+
+For the very first run, please follow the steps below:
+
+```bash
+# prepare environment variables
+export ip_address=$(hostname -I | awk '{print $1}')
+export no_proxy=$no_proxy,${ip_address}
+export LVM_ENDPOINT=http://${ip_address}:9009
+# Start service
+docker compose -f comps/lvms/video-llama/docker_compose.yaml up -d
+# it should take about 1.5 hours for the model to download in the video-llama server, assuming a maximum download speed of 100 Mbps
+until docker logs video-llama-lvm-server 2>&1 | grep -q "Uvicorn running on"; do
+    sleep 5m
+done
+```
+
+If you've run the microservice before, you can keep the downloaded model so it isn't re-downloaded on each run. To do so, modify the following configuration:
+
+```yaml
+# comps/lvms/video-llama/docker_compose.yaml
+services:
+  lvm-video-llama:
+    ...
+    environment:
+      llm_download: "False" # avoid download
+```
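+
+As an alternative to polling the container logs, you can poll the Video-Llama server's `/health` endpoint (defined in `server/server.py` and exposed on port 9009) until it returns HTTP 200. A minimal sketch:
+
+```bash
+# poll the health endpoint until the Video-Llama server is ready
+until [ "$(curl -s -o /dev/null -w '%{http_code}' http://${ip_address}:9009/health)" = "200" ]; do
+    sleep 1m
+done
+```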
+
+# ✅ 2. Test
+
+```bash
+# use curl
+export ip_address=$(hostname -I | awk '{print $1}')
+## check video-llama
+http_proxy="" curl -X POST "http://${ip_address}:9009/generate?video_url=https%3A%2F%2Fgithub.com%2FDAMO-NLP-SG%2FVideo-LLaMA%2Fraw%2Fmain%2Fexamples%2Fsilence_girl.mp4&start=0.0&duration=9&prompt=What%20is%20the%20person%20doing%3F&max_new_tokens=150" -H "accept: */*" -d ''
+
+## check lvm
+http_proxy="" curl -X POST http://${ip_address}:9000/v1/lvm -d '{"video_url":"https://github.com/DAMO-NLP-SG/Video-LLaMA/raw/main/examples/silence_girl.mp4","chunk_start": 0,"chunk_duration": 9,"prompt":"What is the person doing?","max_new_tokens": 150}' -H 'Content-Type: application/json'
+
+# or use python
+export ip_address=$(hostname -I | awk '{print $1}')
+python comps/lvms/video-llama/check_lvm.py
+```
+
+# ♻️ 3. Clean
+
+```bash
+# remove the container
+cid=$(docker ps -aq --filter "name=video-llama")
+if [[ ! -z "$cid" ]]; then docker stop $cid && docker rm $cid && sleep 1s; fi
+# remove the model volume (suggested: keep it so the model is not re-downloaded on the next run)
+if docker volume ls | grep -q video-llama-model; then docker volume rm video-llama_video-llama-model; fi
+
+```
diff --git a/comps/lvms/video-llama/check_lvm.py b/comps/lvms/video-llama/check_lvm.py
new file mode 100644
index 0000000000..fcf6f6aeea
--- /dev/null
+++ b/comps/lvms/video-llama/check_lvm.py
@@ -0,0 +1,50 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+import datetime
+import json
+import os
+
+import requests
+
+ip_address = os.getenv("ip_address")
+####### video-llama request ########
+print("video-llama request")
+api_url = f"http://{ip_address}:9009/generate"
+content = {
+    "video_url": "https://github.com/DAMO-NLP-SG/Video-LLaMA/raw/main/examples/silence_girl.mp4",
+    "start": 0.0,
+    "duration": 9,
+    "prompt": "What is the person doing?",
+    "max_new_tokens": 150,
+}
+
+start = datetime.datetime.now()
+with requests.post(api_url, params=content, stream=True) as response:
+    for chunk in response.iter_content(chunk_size=8192):
+        if chunk:
+            print(chunk.decode("utf-8"), end="", flush=True)  # Flush to ensure immediate output
+
+end = datetime.datetime.now()
+print(f"\nTotal time: {end - start}")
+
+####### lvm request ########
+print("lvm request")
+api_url = f"http://{ip_address}:9000/v1/lvm"
+headers = {"Content-Type": "application/json"}
+data = {
+    "video_url": "https://github.com/DAMO-NLP-SG/Video-LLaMA/raw/main/examples/silence_girl.mp4",
+    "chunk_start": 0,
+    "chunk_duration": 9,
+    "prompt": "what is the person doing",
+    "max_new_tokens": 150,
+}
+
+start = datetime.datetime.now()
+with requests.post(api_url, headers=headers, data=json.dumps(data), stream=True) as response:
+    for chunk in response.iter_content(chunk_size=8192):
+        if chunk:
+            print(chunk.decode("utf-8"), end="", flush=True)  # Flush to ensure immediate output
+
+end = datetime.datetime.now()
+print(f"\nTotal time: {end - start}")
diff --git a/comps/lvms/video-llama/docker_compose.yaml b/comps/lvms/video-llama/docker_compose.yaml
new file mode 100644
index 0000000000..54aace84e7
--- /dev/null
+++ b/comps/lvms/video-llama/docker_compose.yaml
@@ -0,0 +1,40 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+version: "3"
+services:
+  lvm-video-llama:
+    image: opea/video-llama-lvm-server:latest
+    container_name: video-llama-lvm-server
+    ports:
+      - "9009:9009"
+    ipc: host
+    environment:
+      http_proxy: ${http_proxy}
+      https_proxy: ${https_proxy}
+      no_proxy: ${no_proxy}
+      llm_download: "True"
volumes: + - "/home/$USER/.cache:/home/user/.cache" # RECOMMENDED: use local cache to avoid download + - video-llama-model:/home/user/model + restart: unless-stopped + + lvm: + image: opea/lvm-video-llama:latest + container_name: lvm-video-llama + ports: + - "9000:9000" + ipc: host + environment: + http_proxy: ${http_proxy} + https_proxy: ${https_proxy} + no_proxy: ${no_proxy} + LVM_ENDPOINT: ${LVM_ENDPOINT} + restart: unless-stopped + depends_on: + - lvm-video-llama +networks: + default: + driver: bridge +volumes: + video-llama-model: diff --git a/comps/lvms/video-llama/lvm.py b/comps/lvms/video-llama/lvm.py new file mode 100644 index 0000000000..1cbfcd5e1b --- /dev/null +++ b/comps/lvms/video-llama/lvm.py @@ -0,0 +1,80 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + + +# import json +import logging +import os + +import requests +from fastapi import HTTPException +from fastapi.responses import StreamingResponse + +from comps import LVMVideoDoc, ServiceType, opea_microservices, register_microservice, register_statistics + +# import time + + +logging.basicConfig(level=logging.INFO) + + +@register_microservice( + name="opea_service@lvm", + service_type=ServiceType.LVM, + endpoint="/v1/lvm", + host="0.0.0.0", + port=9000, + input_datatype=LVMVideoDoc, + output_datatype=StreamingResponse, +) +@register_statistics(names=["opea_service@lvm"]) +async def lvm(input: LVMVideoDoc): + """This function handles the LVM microservice, which generates text based on a video URL, start time, duration, prompt, and maximum new tokens. + + Parameters: + input (LVMVideoDoc): The input containing the video URL, start time, duration, prompt, and maximum new tokens. + + Returns: + StreamingResponse: A streaming response containing the generated text in text/event-stream format, or a JSON error response if the upstream API responds with an error. 
+ """ + logging.info("[lvm] Received input") + + video_url = input.video_url + chunk_start = input.chunk_start + chunk_duration = input.chunk_duration + prompt = input.prompt + max_new_tokens = input.max_new_tokens + + params = { + "video_url": video_url, + "start": chunk_start, + "duration": chunk_duration, + "prompt": prompt, + "max_new_tokens": max_new_tokens, + } + logging.info(f"[lvm] Params: {params}") + + response = requests.post(url=f"{lvm_endpoint}/generate", params=params, proxies={"http": None}, stream=True) + logging.info(f"[lvm] Response status code: {response.status_code}") + if response.status_code == 200: + + def streamer(): + yield f"{{'video_url': '{video_url}', 'chunk_start': {chunk_start}, 'chunk_duration': {chunk_duration}}}\n".encode( + "utf-8" + ) + for chunk in response.iter_content(chunk_size=8192): + if chunk: + yield chunk + logging.info(f"[llm - chat_stream] Streaming: {chunk}") + logging.info("[llm - chat_stream] stream response finished") + + return StreamingResponse(streamer(), media_type="text/event-stream") + else: + logging.error(f"[lvm] Error: {response.text}") + raise HTTPException(status_code=500, detail="The upstream API responded with an error.") + + +if __name__ == "__main__": + lvm_endpoint = os.getenv("LVM_ENDPOINT") + + opea_microservices["opea_service@lvm"].start() diff --git a/comps/lvms/video-llama/requirements.txt b/comps/lvms/video-llama/requirements.txt new file mode 100644 index 0000000000..c7cc250eba --- /dev/null +++ b/comps/lvms/video-llama/requirements.txt @@ -0,0 +1,11 @@ +datasets +docarray +fastapi +opentelemetry-api +opentelemetry-exporter-otlp +opentelemetry-sdk +Pillow +prometheus-fastapi-instrumentator +pydub +shortuuid +uvicorn diff --git a/comps/lvms/video-llama/server/data/silence_girl.mp4 b/comps/lvms/video-llama/server/data/silence_girl.mp4 new file mode 100644 index 0000000000..ad98e90f6c Binary files /dev/null and b/comps/lvms/video-llama/server/data/silence_girl.mp4 differ diff --git a/comps/lvms/video-llama/server/docker/Dockerfile b/comps/lvms/video-llama/server/docker/Dockerfile new file mode 100644 index 0000000000..1152aa84c8 --- /dev/null +++ b/comps/lvms/video-llama/server/docker/Dockerfile @@ -0,0 +1,38 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +FROM python:3.9-slim + +ENV LANG=C.UTF-8 + +RUN apt-get update -y && apt-get install -y --no-install-recommends --fix-missing \ + git git-lfs && \ + git lfs install + +RUN useradd -m -s /bin/bash user && \ + mkdir -p /home/user && \ + chown -R user:user /home/user/ +RUN mkdir /home/user/model && chown user:user -R /home/user/model + +USER user + +COPY --chown=user:user comps /home/user/comps +WORKDIR /home/user/comps/lvms/video-llama/server + +RUN pip install --no-cache-dir --upgrade pip && \ + pip install --no-cache-dir -r /home/user/comps/lvms/video-llama/server/requirements.txt + +ARG VIDEO_LLAMA_REPO=https://github.com/DAMO-NLP-SG/Video-LLaMA.git +ARG VIDEO_LLAMA_COMMIT=0adb19e +RUN tar -xvf video-llama.patch.tar && \ + git clone ${VIDEO_LLAMA_REPO} Video-LLaMA && \ + cd Video-LLaMA && git checkout ${VIDEO_LLAMA_COMMIT} && \ + git apply --whitespace=fix ../video-llama.patch && \ + mv video_llama ../ && \ + cd ../ && rm -rf Video-LLaMA + + +ENV PYTHONPATH=/home/user + + +ENTRYPOINT ["bash", "start.sh"] \ No newline at end of file diff --git a/comps/lvms/video-llama/server/docker/docker_compose_vllama.yaml b/comps/lvms/video-llama/server/docker/docker_compose_vllama.yaml new file mode 100644 index 0000000000..17d38e076c --- 
/dev/null +++ b/comps/lvms/video-llama/server/docker/docker_compose_vllama.yaml @@ -0,0 +1,25 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +version: "3" +services: + lvm-video-llama: + image: opea/video-llama-lvm-server:latest + container_name: video-llama-lvm-server + ports: + - "9009:9009" + ipc: host + environment: + http_proxy: ${http_proxy} + https_proxy: ${https_proxy} + no_proxy: ${no_proxy} + llm_download: "True" + volumes: + - "/home/$USER/.cache:/home/user/.cache" # RECOMMENDED: use cache to avoid download + - video-llama-model:/home/user/model + restart: unless-stopped +networks: + default: + driver: bridge +volumes: + video-llama-model: diff --git a/comps/lvms/video-llama/server/extract_vl_embedding.py b/comps/lvms/video-llama/server/extract_vl_embedding.py new file mode 100644 index 0000000000..304b5472fc --- /dev/null +++ b/comps/lvms/video-llama/server/extract_vl_embedding.py @@ -0,0 +1,41 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import argparse +import random + +import numpy as np +import torch +import torch.backends.cudnn as cudnn +from video_llama.common.config import Config +from video_llama.common.dist_utils import get_rank +from video_llama.common.registry import registry + + +class VLEmbeddingExtractor(object): + """Docstring for VLEmbeddingExtractor.""" + + def __init__(self, cfg_path, model_type): + super(VLEmbeddingExtractor, self).__init__() + args = argparse.Namespace(**{"cfg_path": cfg_path, "model_type": model_type, "options": []}) + self.cfg = Config(args) + self.setup_seeds() + model_config = self.cfg.model_cfg + print("vis_processor vit_precision:", model_config.get("vit_precision", "fp16")) + if model_config.get("vit_precision", "fp16") == "fp16": + print("WARNING! FP16 not currently supported. 
Switching to FP32") + model_config["vit_precision"] = "fp32" + model_cls = registry.get_model_class(model_config.arch) + self.model = model_cls.from_config(model_config).to("cpu") + self.model.eval() + + def setup_seeds(self): + seed = self.cfg.run_cfg.seed + get_rank() + + print("Seed: ", seed) + random.seed(seed) + np.random.seed(seed) + torch.manual_seed(seed) + + cudnn.benchmark = False + cudnn.deterministic = True diff --git a/comps/lvms/video-llama/server/requirements.txt b/comps/lvms/video-llama/server/requirements.txt new file mode 100644 index 0000000000..41dacfbd21 --- /dev/null +++ b/comps/lvms/video-llama/server/requirements.txt @@ -0,0 +1,34 @@ +# OPEA +beautifulsoup4 + +# microservice +decord +docarray +einops +faiss-cpu +fastapi +ftfy +iopath +langchain==0.2.9 +langchain-community==0.2.1 +langchain-core==0.2.21 +numpy +omegaconf +opencv-python-headless +opentelemetry-api +opentelemetry-exporter-otlp +opentelemetry-sdk +pandas +Pillow +prometheus_fastapi_instrumentator +pytorchvideo +sentence-transformers==3.0.1 +sentencepiece +shortuuid +timm +torch==1.13.1 --index-url https://download.pytorch.org/whl/cpu +torchaudio==0.13.1 --index-url https://download.pytorch.org/whl/cpu +torchvision==0.14.1 --index-url https://download.pytorch.org/whl/cpu +transformers +uvicorn +webdataset diff --git a/comps/lvms/video-llama/server/server.py b/comps/lvms/video-llama/server/server.py new file mode 100644 index 0000000000..f54cdc65e4 --- /dev/null +++ b/comps/lvms/video-llama/server/server.py @@ -0,0 +1,238 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +"""Stand-alone video llama FastAPI Server.""" + +import argparse +import logging +import os +from threading import Thread +from urllib.parse import urlparse + +import decord +import requests +import uvicorn +from extract_vl_embedding import VLEmbeddingExtractor as VL +from fastapi import FastAPI, Query +from fastapi.middleware.cors import CORSMiddleware +from fastapi.responses import JSONResponse, Response, StreamingResponse +from langchain_community.embeddings import HuggingFaceEmbeddings +from langchain_community.vectorstores import FAISS +from pydantic import BaseModel, Field +from transformers import TextIteratorStreamer, set_seed +from video_llama.common.registry import registry +from video_llama.conversation.conversation_video import Chat + +# Initialize decord bridge and seed +decord.bridge.set_bridge("torch") +set_seed(22) + +# Setup logging +logging.basicConfig(level=logging.INFO) + +# Define global variables +context_db = None +streamer = None +chat = None +VIDEO_DIR = "/home/user/videos" +CFG_PATH = "video_llama_config/video_llama_eval_only_vl.yaml" +MODEL_TYPE = "llama_v2" + +os.makedirs(VIDEO_DIR, exist_ok=True) + +# Initialize FastAPI app +app = FastAPI() + +app.add_middleware( + CORSMiddleware, + allow_origins=["*"], + allow_credentials=True, + allow_methods=["*"], + allow_headers=["*"], +) + + +# Pydantic models for request validation +class videoInfo(BaseModel): + video_path: str = Field(..., description="URL of the video to be processed, support remote") + start_time: float = Field(..., descrciption="video clip start time in seconds", example=0.0) + duration: float = Field(..., description="video clip duration in seconds", example=10.0) + + +class GenerateRequest(BaseModel): + start_time: float = Field(..., descrciption="video clip start time in seconds", example=0.0) + duration: float = Field(..., description="video clip duration in seconds", example=10.0) + prompt: str = Field(..., 
description="Query for Video-LLama", example="What is the man doing?") + max_new_tokens: int = Field(default=512, description="Maximum number of tokens to generate", example=512) # + + +# Function to construct instructions context +def construct_instructions(): + instructions = [ + """ Identify the person [with specific features / seen at a specific location / performing a specific action] in the provided data based on the video content. + Describe in detail the relevant actions of the individuals mentioned in the question. + Provide full details of their actions being performed and roles. Focus on the individual and the actions being performed. + Exclude information about their age and items on the shelf that are not directly observable. + Do not mention items on the shelf that are not visible. \ + Exclude information about the background and surrounding details. + Ensure all information is distinct, accurate, and directly observable. + Do not repeat actions of individuals and do not mention anything about other persons not visible in the video. + Mention actions and roles once only. + """, + """Analyze the provided data to recognize and describe the activities performed by individuals. + Specify the type of activity and any relevant contextual details, + Do not give repetitions, always give distinct and accurate information only.""", + """Determine the interactions between individuals and items in the provided data. + Describe the nature of the interaction between individuals and the items involved. + Provide full details of their relevant actions and roles. Focus on the individuals and the action being performed by them. + Exclude information about their age and items on the shelf that are not directly observable. + Exclude information about the background and surrounding details. + Ensure all information is distinct, accurate, and directly observable. + Do not repeat actions of individuals and do not mention anything about other persons not visible in the video. + Do not mention items on the shelf that are not observable. \ + """, + """Analyze the provided data to answer queries based on specific time intervals. + Provide detailed information corresponding to the specified time frames, + Do not give repetitions, always give distinct and accurate information only.""", + """Identify individuals based on their appearance as described in the provided data. + Provide details about their identity and actions, + Do not give repetitions, always give distinct and accurate information only.""", + """Answer questions related to events and activities that occurred on a specific day. 
+ Provide a detailed account of the events, + Do not give repetitions, always give distinct and accurate information only.""", + ] + HFembeddings = HuggingFaceEmbeddings(model_kwargs={"device": "cpu"}) + context = FAISS.from_texts(instructions, HFembeddings) + return context + + +# Helper functions for chat and inference +def get_context(query, context): + context = context.similarity_search(query) + return [i.page_content for i in context] + + +def chat_reset(chat_state, img_list): + logging.info("-" * 30) + logging.info("resetting chatState") + if chat_state is not None: + chat_state.messages = [] + if img_list is not None: + img_list = [] + return chat_state, img_list + + +def inference(chat: Chat, streamer, video: videoInfo, instruction: str, max_new_tokens: int): + logging.info("Video-Llama generation begin.") + video_path = video.video_path + start_time = video.start_time + duration = video.duration + + chat.upload_video_without_audio(video_path, start_time, duration) + chat.ask("" + instruction) + chat.answer( + max_new_tokens=max_new_tokens, + num_beams=1, + min_length=1, + top_p=0.9, + repetition_penalty=1.0, + length_penalty=1, + temperature=0.02, + max_length=2000, + keep_conv_hist=True, + streamer=streamer, + ) + if "similar video" not in instruction: + logging.info("Resetting the chat history") + chat.clear() + logging.info("Video-Llama generation done, remove video.") + os.remove(video_path) + + +def stream_res(video, instruction, max_new_tokens): + logging.debug("Start to stream...") + thread = Thread(target=inference, args=(chat, streamer, video, instruction, max_new_tokens)) + thread.start() + for text in streamer: + yield text + + +def is_local_file(url): + """Returns True if url is a local file, False otherwise.""" + return not url.startswith("http://") and not url.startswith("https://") + + +@app.get("/health") +async def health() -> Response: + """Health check.""" + return Response(status_code=200) + + +@app.post("/generate", response_class=StreamingResponse) +async def generate( + video_url: str = Query(..., description="remote URL of the video to be processed"), + start: float = Query(..., description="video clip start time in seconds", examples=0.0), + duration: float = Query(..., description="video clip duration in seconds", examples=10.0), + prompt: str = Query(..., description="Query for Video-LLama", examples="What is the man doing?"), + max_new_tokens: int = Query(150, description="Maximum number of tokens to generate", examples=150), +) -> StreamingResponse: + if not is_local_file(video_url): + parsed_url = urlparse(video_url) + video_name = os.path.basename(parsed_url.path) + else: + video_name = os.path.basename(video_url) + + if video_name.lower().endswith(".mp4"): + logging.info(f"Format check passed, the file '{video_name}' is an MP4 file.") + else: + logging.info(f"Format check failed, the file '{video_name}' is not an MP4 file.") + return JSONResponse(status_code=400, content={"message": "Invalid file type. 
Only mp4 videos are allowed."})
+
+    if not is_local_file(video_url):
+        try:
+            video_path = os.path.join(VIDEO_DIR, video_name)
+            response = requests.get(video_url, stream=True)
+
+            if response.status_code == 200:
+                with open(video_path, "wb") as file:
+                    for chunk in response.iter_content(chunk_size=1024):
+                        if chunk:  # filter out keep-alive new chunks
+                            file.write(chunk)
+                logging.info(f"File downloaded: {video_path}")
+            else:
+                logging.error(f"Error downloading file: {response.status_code}")
+                return JSONResponse(status_code=500, content={"message": "Error downloading file."})
+        except Exception as e:
+            logging.error(f"Error downloading file: {e}")
+            return JSONResponse(status_code=500, content={"message": "Error downloading file."})
+    else:
+        # check if the video exists
+        video_path = video_url
+        if not os.path.exists(video_path):
+            logging.info(f"File not found: {video_path}")
+            return JSONResponse(status_code=404, content={"message": "File not found."})
+    video_info = videoInfo(start_time=start, duration=duration, video_path=video_path)
+
+    # format context and instruction
+    instruction = f"{get_context(prompt, context_db)[0]}: {prompt}"
+    # logging.info("instruction:", instruction)
+
+    return StreamingResponse(stream_res(video_info, instruction, max_new_tokens))
+
+
+# Main entry point
+parser = argparse.ArgumentParser()
+parser.add_argument("--host", type=str, default="0.0.0.0")
+parser.add_argument("--port", type=int, default=9009)
+args = parser.parse_args()
+
+context_db = construct_instructions()
+video_llama = VL(cfg_path=CFG_PATH, model_type=MODEL_TYPE)
+tokenizer = video_llama.model.llama_tokenizer
+streamer = TextIteratorStreamer(tokenizer, skip_prompt=True)
+
+vis_processor_cfg = video_llama.cfg.datasets_cfg.webvid.vis_processor.train
+vis_processor = registry.get_processor_class(vis_processor_cfg.name).from_config(vis_processor_cfg)
+
+chat = Chat(video_llama.model, vis_processor, device="cpu")
+
+uvicorn.run(app, host=args.host, port=args.port)
diff --git a/comps/lvms/video-llama/server/start.sh b/comps/lvms/video-llama/server/start.sh
new file mode 100644
index 0000000000..f016ad1a88
--- /dev/null
+++ b/comps/lvms/video-llama/server/start.sh
@@ -0,0 +1,23 @@
+#!/bin/bash
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+# Download models
+MODEL_REPO=https://huggingface.co/DAMO-NLP-SG/Video-LLaMA-2-7B-Finetuned
+llm_download=${llm_download}
+echo "llm_download: ${llm_download}"
+if [ "$llm_download" = "True" ]; then
+    # clean if exists
+    rm -rf /home/user/model/Video-LLaMA-2-7B-Finetuned
+
+    echo "Please wait for model download..."
+    git lfs install && git clone ${MODEL_REPO} /home/user/model/Video-LLaMA-2-7B-Finetuned
+    # rm Video-LLaMA-2-7B-Finetuned/AL*.pth Video-LLaMA-2-7B-Finetuned/imagebind_huge.pth
+elif [ "$llm_download" = "False" ]; then
+    echo "No model download"
+else
+    echo "llm_download should be True or False"
+    exit 1
+fi
+
+python server.py
diff --git a/comps/lvms/video-llama/server/video-llama.patch.tar b/comps/lvms/video-llama/server/video-llama.patch.tar
new file mode 100644
index 0000000000..7e9826f113
Binary files /dev/null and b/comps/lvms/video-llama/server/video-llama.patch.tar differ
diff --git a/comps/lvms/video-llama/server/video_llama_config/video_llama_eval_only_vl.yaml b/comps/lvms/video-llama/server/video_llama_config/video_llama_eval_only_vl.yaml
new file mode 100644
index 0000000000..3b239fe0f9
--- /dev/null
+++ b/comps/lvms/video-llama/server/video_llama_config/video_llama_eval_only_vl.yaml
@@ -0,0 +1,39 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+model:
+  arch: video_llama
+  model_type: pretrain_llama_v2 # pretrain_llama_v2 pretrain_vicuna
+  freeze_vit: True
+  freeze_qformer: True
+  max_txt_len: 256 #512
+  end_sym: "###"
+  low_resource: False
+
+  frozen_llama_proj: True
+
+  llama_model: "/home/user/model/Video-LLaMA-2-7B-Finetuned/llama-2-7b-chat-hf"
+  ckpt: "/home/user/model/Video-LLaMA-2-7B-Finetuned/VL_LLaMA_2_7B_Finetuned.pth"
+
+  equip_audio_branch: False # whether equips the audio branch
+  fusion_head_layers: 2
+  max_frame_pos: 32 #cannot be changed - frozen with training model
+  fusion_header_type: "seqTransf"
+
+datasets:
+  webvid:
+    vis_processor:
+      train:
+        name: "alpro_video_eval"
+        n_frms: 32 #8
+        image_size: 224
+    text_processor:
+      train:
+        name: "blip_caption"
+
+run:
+  task: video_text_pretrain
+  seed: 10
+  input_video_dir: "data/testset-raw"
+  input_questions_json: "data/testset-raw/testset_small.json"
+  output_dir: "output/origFT_videollama_testset_small_results_only_vl"
diff --git a/tests/test_lvms_video-llama.sh b/tests/test_lvms_video-llama.sh
new file mode 100755
index 0000000000..1e94982fb3
--- /dev/null
+++ b/tests/test_lvms_video-llama.sh
@@ -0,0 +1,95 @@
+#!/bin/bash
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+set -xe
+
+WORKPATH=$(dirname "$PWD")
+ip_address=$(hostname -I | awk '{print $1}')
+
+function build_docker_images() {
+    cd $WORKPATH
+    echo $(pwd)
+    docker build --no-cache -t opea/video-llama-lvm-server:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/lvms/video-llama/server/docker/Dockerfile .
+    if [ $? -ne 0 ]; then
+        echo "opea/video-llama-lvm-server build failed"
+        exit 1
+    else
+        echo "opea/video-llama-lvm-server built successfully"
+    fi
+    docker build --no-cache -t opea/lvm-video-llama:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/lvms/video-llama/Dockerfile .
+    if [ $? -ne 0 ]; then
+        echo "opea/lvm-video-llama build failed"
+        exit 1
+    else
+        echo "opea/lvm-video-llama built successfully"
+    fi
+
+}
+
+function start_service() {
+    cd $WORKPATH
+    unset http_proxy
+    export LVM_ENDPOINT=http://$ip_address:5030
+
+    docker run -d --name="test-comps-lvm-video-llama" -p 5030:9009 \
+        --ipc=host \
+        -e http_proxy=$http_proxy \
+        -e https_proxy=$https_proxy \
+        -e no_proxy=$no_proxy \
+        -e llm_download="True" \
+        -v "/home/$USER/.cache:/home/user/.cache" \
+        -v video-llama-model:/home/user/model \
+        opea/video-llama-lvm-server:latest
+
+    docker run -d --name="test-comps-lvm" -p 5031:9000 \
+        --ipc=host \
+        -e http_proxy=$http_proxy \
+        -e https_proxy=$https_proxy \
+        -e no_proxy=$no_proxy \
+        -e LVM_ENDPOINT=$LVM_ENDPOINT \
+        opea/lvm-video-llama:latest
+
+    echo "Waiting for the LVM service to start"
+    until docker logs test-comps-lvm 2>&1 | grep -q "Uvicorn running on"; do
+        sleep 5
+    done
+
+    echo "Waiting for the Video-Llama service to start, downloading model..."
+    until docker logs test-comps-lvm-video-llama 2>&1 | grep -q "Uvicorn running on"; do
+        sleep 5m
+    done
+}
+
+function validate_microservice() {
+    result=$(http_proxy="" curl http://localhost:5031/v1/lvm -X POST -d '{"video_url":"./data/silence_girl.mp4","chunk_start": 0,"chunk_duration": 7,"prompt":"What is the person doing?","max_new_tokens": 50}' -H 'Content-Type: application/json')
+    if [[ $result == *"silence"* ]]; then
+        echo "Result correct."
+    else
+        echo "Result wrong."
+        exit 1
+    fi
+}
+
+function stop_docker() {
+    cid=$(docker ps -aq --filter "name=test-comps-lvm*")
+    if [[ ! -z "$cid" ]]; then docker stop $cid && docker rm $cid && sleep 1s; fi
+    if docker volume ls | grep -q video-llama-model; then docker volume rm video-llama-model; fi
+
+}
+
+function main() {
+
+    stop_docker
+
+    build_docker_images
+    start_service
+
+    validate_microservice
+
+    stop_docker
+    echo y | docker system prune
+
+}
+
+main