Skip to content
This repository has been archived by the owner on May 10, 2024. It is now read-only.

Commit

Permalink
pin fastchat version
Browse files Browse the repository at this point in the history
fix serving mounts for finetuned models

fix tests for fine tune serving

fix comma
  • Loading branch information
asaiacai committed Oct 27, 2023
1 parent 37a765a commit 4d94979
Show file tree
Hide file tree
Showing 6 changed files with 43 additions and 23 deletions.
2 changes: 2 additions & 0 deletions .github/workflows/pytest-smoke.yml
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ jobs:
python-version: ["3.10"]
test-path:
- tests/test_cli.py::test_hf_serve
- tests/test_cli.py::test_llmatc_serve
- tests/test_launch.py
- tests/test_runtracker.py
- tests/test_serve.py
Expand Down Expand Up @@ -52,4 +53,5 @@ jobs:
name: error-log
path: |
/tmp/serve_huggingface-*.log
/tmp/serve_llmatc-*.log
/home/runner/sky_logs/sky-*/*.log
17 changes: 13 additions & 4 deletions docs/source/quickstart/serving.rst
Original file line number Diff line number Diff line change
Expand Up @@ -8,12 +8,12 @@ Deployment
----------

Model deployments are referenced by their HuggingFace modelhub name. Finetuned models trained through LLM-ATC are referenced
by using the :code:`llm-atc/` prefix.
by passing :code:`--name llm-atc`.

.. code-block:: console
# serve an llm-atc finetuned model, requires `llm-atc/` prefix and grabs model checkpoint from object store
$ llm-atc serve --name llm-atc/myvicuna --source s3://my-bucket/my_vicuna/ --accelerator A100:1 -c servecluster --cloud gcp --region asia-southeast1 --envs "HF_TOKEN=<HuggingFace_token>"
# serve an llm-atc fine-tuned model: requires :code:`--name llm-atc` and a :code:`--source` object store path from which the model checkpoint is fetched
$ llm-atc serve --name llm-atc --source s3://my-bucket/my_vicuna/ --accelerator A100:1 -c servecluster --cloud gcp --region asia-southeast1 --envs "HF_TOKEN=<HuggingFace_token>"
# serve a HuggingFace model, e.g. `lmsys/vicuna-13b-v1.3`
$ llm-atc serve --name lmsys/vicuna-13b-v1.3 --accelerator A100:1 -c servecluster --cloud gcp --region asia-southeast1 --envs "HF_TOKEN=<HuggingFace_token>"
Expand All @@ -33,11 +33,20 @@ from your laptop.
.. code-block:: console
# get the ip address of the OpenAI API endpoint
$ ip=$(grep -A1 "Host servecluster" ~/.ssh/config | grep "HostName" | awk '{print $2}')
$ ip=$(sky status --ip servecluster)
# test which models are available
$ curl http://$ip:8000/v1/models
# chat completion
$ curl http://$ip:8000/v1/chat/completions \
-H "Content-Type: application/json" \
-d '{
"model": "my-model",
"messages": [{"role": "user", "content": "Hello! What is your name?"}]
}'
# shutdown when done
$ sky stop servecluster
Expand Down
12 changes: 0 additions & 12 deletions llm_atc/config/serve/serve.yml
Original file line number Diff line number Diff line change
Expand Up @@ -27,18 +27,6 @@ setup: |
pip install git+https://github.com/huggingface/transformers.git
sudo apt update
sudo apt install -y rclone
# copy files from object store onto disk
if [[ $MODEL_NAME == llm-atc/* ]];
then
CHECKPOINT="/$MODEL_NAME/"
LOCAL_CHKPT="./$MODEL_NAME/"
mkdir -p $LOCAL_CHKPT
rclone sync --progress --exclude "train*" $CHECKPOINT $LOCAL_CHKPT
fi
run: |
master_addr=`echo "$SKYPILOT_NODE_IPS" | head -n1`
Expand Down
1 change: 1 addition & 0 deletions llm_atc/config/train/vicuna.yml
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ setup: |
pip install torch==2.0.1 --extra-index-url https://download.pytorch.org/whl/cu116
git clone https://github.com/lm-sys/FastChat.git
cd FastChat
git checkout cbf285360e8e809a316c88a8377c1bb0f0c770bc
pip install -e .
if [ $USE_FLASH_ATTN -eq 1 ]; then
pip install packaging
Expand Down
10 changes: 4 additions & 6 deletions llm_atc/serve.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,10 +21,6 @@ def serve_route(model_name: str, source: Optional[str] = None, **serve_kwargs):
raise ValueError(
"Attempting to use a finetuned model without a corresponding object store location"
)
elif not source is None and not model_name.startswith("llm-atc/"):
logging.warning(
"Specified object store mount but model is not an llm-atc model. Skipping mounting."
)
return Serve(model_name, source, **serve_kwargs).serve()


Expand Down Expand Up @@ -69,6 +65,10 @@ def default_serve_task(self) -> sky.Task:
def serve(self) -> sky.Task:
"""Deploy fastchat.serve.openai_api_server with vllm_worker"""
serve_task = self.default_serve_task
if self.source and self.names == "llm-atc":
logging.info(f"Using a fine tuned model at {self.source}")
serve_task.update_file_mounts({"/llm-atc": self.source})
self.names = "/llm-atc"
self.envs["MODEL_NAME"] = self.names
if "HF_TOKEN" not in self.envs:
logging.warning(
Expand All @@ -80,6 +80,4 @@ def serve(self) -> sky.Task:
resource._cloud = sky.clouds.CLOUD_REGISTRY.from_str(self.cloud)
resource._set_region_zone(self.region, self.zone)
serve_task.set_resources(resource)
if self.source and self.names.startswith("llm-atc/"):
serve_task.update_file_mounts({"/" + self.names: self.source})
return serve_task
24 changes: 23 additions & 1 deletion tests/test_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -105,7 +105,29 @@ def test_hf_serve():
+ """awk '{print $2}'); echo $ip; curl http://"$ip":8000/v1/models | grep vicuna""",
],
f"sky stop -y {name} ; sleep 300 ; sky down --purge -y {name}",
timeout=45 * 60,
timeout=30 * 60,
)
run_one_test(test)


@pytest.mark.cli
def test_llmatc_serve():
    """Smoke-test serving an llm-atc fine-tuned model end to end.

    Launches ``llm-atc serve`` with ``--name llm-atc`` and an S3 ``--source``
    checkpoint, waits for the cluster to provision, then probes the
    ``/v1/models`` endpoint on port 8000 and greps the response for the
    served model name. The teardown command stops the cluster and then
    force-downs it (``--purge``) regardless of test outcome.
    """

    # Cluster name used for launch, IP lookup, and teardown below.
    name = "test_fine_tune"
    ssh_config = os.path.expanduser("~/.ssh/config")
    test = Test(
        "serve_llmatc",
        [
            # --name llm-atc marks this as a fine-tuned model; the checkpoint
            # is mounted from the --source object store path.
            f"llm-atc serve --detach_run --name llm-atc --source s3://my-trainy-bucket/mymistral --accelerator V100:1 -c {name} --cloud aws --region us-east-2",
            # Give the cluster time to provision and start the server.
            "sleep 300",
            # Resolve the head-node IP from ~/.ssh/config, then query the
            # models endpoint and require the fine-tuned name in the reply.
            f"""ip=$(grep -A1 "Host {name}" {ssh_config} | grep "HostName" | """
            + """awk '{print $2}'); echo $ip; curl http://"$ip":8000/v1/models | grep llm-atc""",
        ],
        # Teardown: stop, wait, then purge-down so billing resources are freed
        # even if stop partially fails.
        f"sky stop -y {name} ; sleep 300 ; sky down --purge -y {name}",
        timeout=30 * 60,
    )
    run_one_test(test)

Expand Down

0 comments on commit 4d94979

Please sign in to comment.