Pipeline generator utils (#31)

* p Signed-off-by: kevin <[email protected]> * p Signed-off-by: kevin <[email protected]> * p Signed-off-by: kevin <[email protected]> * p Signed-off-by: kevin <[email protected]> * p Signed-off-by: kevin <[email protected]> * p Signed-off-by: kevin <[email protected]> * add req Signed-off-by: kevin <[email protected]> * p Signed-off-by: kevin <[email protected]> --------- Signed-off-by: kevin <[email protected]>
vllm-project · Sep 20, 2024 · 8272857 · 8272857
1 parent 99ea02d
commit 8272857
Show file tree

Hide file tree

Showing 8 changed files with 139 additions and 1 deletion.
diff --git a/.gitignore b/.gitignore
@@ -35,4 +35,12 @@ terraform.rc
 
 .env
 
-.vscode/
+.vscode/
+
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+.cache
+*.log
diff --git a/requirements.txt b/requirements.txt
@@ -0,0 +1,2 @@
+click==8.1.7
+pydantic==2.9.2
diff --git a/scripts/__init__.py b/scripts/__init__.py
diff --git a/scripts/pipeline_generator/__init__.py b/scripts/pipeline_generator/__init__.py
diff --git a/scripts/pipeline_generator/utils.py b/scripts/pipeline_generator/utils.py
@@ -0,0 +1,62 @@
+import enum
+from typing import Optional, List
+
+# Constants
+# NOTE(review): presumably the Hugging Face cache directory inside the test container — confirm.
+HF_HOME = "/root/.cache/huggingface"
+# Fallback directory used by the command builders below when no working directory is given.
+DEFAULT_WORKING_DIR = "/vllm-workspace/tests"
+VLLM_ECR_URL = "public.ecr.aws/q9t5s3a7"
+# Repository path derived from VLLM_ECR_URL.
+VLLM_ECR_REPO = f"{VLLM_ECR_URL}/vllm-ci-test-repo"
+AMD_REPO = "rocm/vllm-ci"
+# gpu_type value that get_agent_queue maps to the A100 queue.
+A100_GPU = "a100"
+
+# File paths
+TEST_PATH = ".buildkite/test-pipeline.yaml"
+EXTERNAL_HARDWARE_TEST_PATH = ".buildkite/external-tests.yaml"
+PIPELINE_FILE_PATH = ".buildkite/pipeline.yaml"
+# Script placed first in the command built by get_multi_node_test_command.
+MULTI_NODE_TEST_SCRIPT = ".buildkite/run-multi-node-test.sh"
+
+# NOTE(review): empty and not referenced in this module; presumably the names of steps to gate — confirm against callers.
+STEPS_TO_BLOCK = []
+
+
+class AgentQueue(str, enum.Enum):
+    """Names of the agent queues; get_agent_queue returns members of this enum.
+
+    Members are also ``str`` instances, so each compares equal to its raw
+    string value.
+    """
+    # NOTE(review): member names suggest AWS CPU / L4 GPU, A100 and AMD agents — confirm against the CI setup.
+    AWS_CPU = "cpu_queue"
+    AWS_SMALL_CPU = "small_cpu_queue"
+    AWS_1xL4 = "gpu_1_queue"
+    AWS_4xL4 = "gpu_4_queue"
+    A100 = "a100-queue"
+    AMD_GPU = "amd"
+    AMD_CPU = "amd-cpu"
+
+
+def get_agent_queue(no_gpu: Optional[bool], gpu_type: Optional[str], num_gpus: Optional[int]) -> AgentQueue:
+    """Pick the agent queue for a test step.
+
+    Precedence: ``no_gpu`` wins, then an A100 ``gpu_type``; otherwise the
+    queue is chosen from ``num_gpus``.
+
+    Args:
+        no_gpu: Truthy when the step does not need a GPU.
+        gpu_type: Requested GPU type; only ``A100_GPU`` is special-cased.
+        num_gpus: Number of GPUs requested; ``None`` is treated as one GPU.
+
+    Returns:
+        The queue the step should run on.
+    """
+    if no_gpu:
+        return AgentQueue.AWS_SMALL_CPU
+    if gpu_type == A100_GPU:
+        return AgentQueue.A100
+    # An unspecified GPU count means a single-GPU step. Without the explicit
+    # None check, ``None == 1`` is False and the step would be routed to the
+    # 4-GPU queue.
+    if num_gpus is None or num_gpus == 1:
+        return AgentQueue.AWS_1xL4
+    return AgentQueue.AWS_4xL4
+
+
+def get_full_test_command(test_commands: List[str], step_working_dir: str) -> str:
+    """Build a single shell command string that enters the working directory
+    and then runs every test command.
+
+    The ``cd`` and each command are separated by ``";\\n"``. A falsy
+    ``step_working_dir`` falls back to ``DEFAULT_WORKING_DIR``.
+    """
+    target_dir = step_working_dir if step_working_dir else DEFAULT_WORKING_DIR
+    separator = ";\n"
+    return f"cd {target_dir}" + separator + separator.join(test_commands)
+
+
+def get_multi_node_test_command(
+        test_commands: List[str],
+        working_dir: str,
+        num_nodes: int,
+        num_gpus: int,
+        docker_image_path: str
+        ) -> str:
+    """Build the one-line invocation of the multi-node test script.
+
+    The result is ``MULTI_NODE_TEST_SCRIPT`` followed by the working
+    directory, node count, GPU count, docker image path and each test command
+    as its own single-quoted argument, all separated by spaces.
+
+    Args:
+        test_commands: Commands to pass to the script, one argument each.
+        working_dir: Directory argument; a falsy value falls back to
+            ``DEFAULT_WORKING_DIR``.
+        num_nodes: Number of nodes, rendered with ``str()``.
+        num_gpus: Number of GPUs, rendered with ``str()``.
+        docker_image_path: Docker image argument.
+
+    Returns:
+        The space-joined command line.
+    """
+    # Inside single quotes the shell takes every character literally, so an
+    # embedded single quote has to close the string, emit an escaped quote and
+    # reopen it ('\''). Otherwise it would terminate the argument early and
+    # split the command. Commands without quotes are rendered as before.
+    quoted_commands = [
+        "'" + command.replace("'", "'\\''") + "'" for command in test_commands
+    ]
+    multi_node_command = [
+        MULTI_NODE_TEST_SCRIPT,
+        working_dir or DEFAULT_WORKING_DIR,
+        str(num_nodes),
+        str(num_gpus),
+        docker_image_path,
+        *quoted_commands
+    ]
+    return " ".join(map(str, multi_node_command))
diff --git a/scripts/tests/__init__.py b/scripts/tests/__init__.py
diff --git a/scripts/tests/pipeline_generator/__init__.py b/scripts/tests/pipeline_generator/__init__.py
diff --git a/scripts/tests/pipeline_generator/test_utils.py b/scripts/tests/pipeline_generator/test_utils.py
@@ -0,0 +1,66 @@
+import pytest
+import sys
+from typing import List
+
+from scripts.pipeline_generator.utils import (
+    get_agent_queue,
+    get_full_test_command,
+    get_multi_node_test_command,
+    AgentQueue,
+    MULTI_NODE_TEST_SCRIPT,
+)
+
+
+@pytest.mark.parametrize(
+    "no_gpu,gpu_type,num_gpus,expected_result",
+    [
+        (True, None, None, AgentQueue.AWS_SMALL_CPU),
+        (False, "a100", None, AgentQueue.A100),
+        (False, None, 1, AgentQueue.AWS_1xL4),
+        (False, None, 4, AgentQueue.AWS_4xL4),
+    ],
+)
+def test_get_agent_queue(no_gpu: bool, gpu_type: str, num_gpus: int, expected_result: AgentQueue):
+    """Each (no_gpu, gpu_type, num_gpus) combination maps to the expected queue."""
+    selected_queue = get_agent_queue(no_gpu, gpu_type, num_gpus)
+    assert selected_queue == expected_result
+
+
+@pytest.mark.parametrize(
+    "test_commands,step_working_dir,expected_result",
+    [
+        (["echo 'hello'"], None, "cd /vllm-workspace/tests;\necho 'hello'"),
+        (["echo 'hello'"], "/vllm-workspace/tests", "cd /vllm-workspace/tests;\necho 'hello'"),
+        (["echo 'hello1'", "echo 'hello2'"], None, "cd /vllm-workspace/tests;\necho 'hello1';\necho 'hello2'"),
+    ],
+)
+def test_get_full_test_command(test_commands: List[str], step_working_dir: str, expected_result: str):
+    """Commands are prefixed with a ``cd`` into the (default) working directory."""
+    full_command = get_full_test_command(test_commands, step_working_dir)
+    assert full_command == expected_result
+
+
+def test_get_multi_node_test_command():
+    """Script path, working dir, node/GPU counts, image and quoted commands are space-joined in order."""
+    first_command = (
+        "distributed/test_same_node.py;"
+        "pytest -v -s distributed/test_multi_node_assignment.py;"
+        "pytest -v -s distributed/test_pipeline_parallel.py"
+    )
+    second_command = "distributed/test_same_node.py"
+    working_dir = "/vllm-workspace/tests"
+    docker_image_path = "ecr-path/vllm-ci-test-repo:latest"
+    # Two nodes with four GPUs each, rendered as plain decimal strings.
+    expected_result = (
+        f"{MULTI_NODE_TEST_SCRIPT} {working_dir} 2 4 {docker_image_path} "
+        f"'{first_command}' '{second_command}'"
+    )
+    actual_result = get_multi_node_test_command(
+        [first_command, second_command], working_dir, 2, 4, docker_image_path
+    )
+    assert actual_result == expected_result
+
+
+# Allow running this test module directly; the process exit code is pytest's result.
+if __name__ == "__main__":
+    sys.exit(pytest.main(["-v", __file__]))