diff --git a/.gitignore b/.gitignore
index 3da3291..797af49 100644
--- a/.gitignore
+++ b/.gitignore
@@ -35,4 +35,12 @@ terraform.rc
 
 .env
 
-.vscode/
\ No newline at end of file
+.vscode/
+
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+.cache
+*.log
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..946ac76
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,2 @@
+click==8.1.7
+pydantic==2.9.2
diff --git a/scripts/__init__.py b/scripts/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/scripts/pipeline_generator/__init__.py b/scripts/pipeline_generator/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/scripts/pipeline_generator/utils.py b/scripts/pipeline_generator/utils.py
new file mode 100644
index 0000000..51dd6eb
--- /dev/null
+++ b/scripts/pipeline_generator/utils.py
@@ -0,0 +1,83 @@
+import enum
+from typing import Optional, List
+
+# Constants
+HF_HOME = "/root/.cache/huggingface"
+DEFAULT_WORKING_DIR = "/vllm-workspace/tests"
+VLLM_ECR_URL = "public.ecr.aws/q9t5s3a7"
+VLLM_ECR_REPO = f"{VLLM_ECR_URL}/vllm-ci-test-repo"
+AMD_REPO = "rocm/vllm-ci"
+A100_GPU = "a100"
+
+# File paths
+TEST_PATH = ".buildkite/test-pipeline.yaml"
+EXTERNAL_HARDWARE_TEST_PATH = ".buildkite/external-tests.yaml"
+PIPELINE_FILE_PATH = ".buildkite/pipeline.yaml"
+MULTI_NODE_TEST_SCRIPT = ".buildkite/run-multi-node-test.sh"
+
+STEPS_TO_BLOCK = []
+
+
+class AgentQueue(str, enum.Enum):
+    """Names of the agent queues a step can be assigned to."""
+
+    AWS_CPU = "cpu_queue"
+    AWS_SMALL_CPU = "small_cpu_queue"
+    AWS_1xL4 = "gpu_1_queue"
+    AWS_4xL4 = "gpu_4_queue"
+    A100 = "a100-queue"
+    AMD_GPU = "amd"
+    AMD_CPU = "amd-cpu"
+
+
+def get_agent_queue(no_gpu: Optional[bool], gpu_type: Optional[str], num_gpus: Optional[int]) -> AgentQueue:
+    """Select the agent queue matching a step's GPU requirements.
+
+    Steps flagged ``no_gpu`` go to AWS_SMALL_CPU and steps whose ``gpu_type``
+    is A100_GPU go to A100. Otherwise a step asking for one GPU, or giving no
+    GPU count at all, goes to AWS_1xL4 and every other count to AWS_4xL4.
+    """
+    if no_gpu:
+        return AgentQueue.AWS_SMALL_CPU
+    if gpu_type == A100_GPU:
+        return AgentQueue.A100
+    # An unset (or zero) GPU count must not fall through to the 4-GPU queue.
+    if not num_gpus or num_gpus == 1:
+        return AgentQueue.AWS_1xL4
+    return AgentQueue.AWS_4xL4
+
+
+def get_full_test_command(test_commands: List[str], step_working_dir: Optional[str]) -> str:
+    """Build one script that enters the working directory, then runs each command.
+
+    Falls back to DEFAULT_WORKING_DIR when ``step_working_dir`` is None or empty.
+    Commands are separated by a semicolon followed by a newline.
+    """
+    working_dir = step_working_dir or DEFAULT_WORKING_DIR
+    test_commands_str = ";\n".join(test_commands)
+    return f"cd {working_dir};\n{test_commands_str}"
+
+
+def get_multi_node_test_command(
+    test_commands: List[str],
+    working_dir: Optional[str],
+    num_nodes: int,
+    num_gpus: int,
+    docker_image_path: str
+) -> str:
+    """Build the command line that invokes MULTI_NODE_TEST_SCRIPT.
+
+    Arguments are emitted in order: working directory (DEFAULT_WORKING_DIR when
+    None or empty), node count, GPU count, docker image path, then each test
+    command as one single-quoted argument.
+    """
+    # Escape embedded single quotes ('\'' closes the quote, emits an escaped
+    # quote, and reopens it) so a command containing quotes stays one argument.
+    quoted_commands = ["'" + command.replace("'", "'\\''") + "'" for command in test_commands]
+    multi_node_command = [
+        MULTI_NODE_TEST_SCRIPT,
+        working_dir or DEFAULT_WORKING_DIR,
+        str(num_nodes),
+        str(num_gpus),
+        docker_image_path,
+        *quoted_commands
+    ]
+    return " ".join(map(str, multi_node_command))
diff --git a/scripts/tests/__init__.py b/scripts/tests/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/scripts/tests/pipeline_generator/__init__.py b/scripts/tests/pipeline_generator/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/scripts/tests/pipeline_generator/test_utils.py b/scripts/tests/pipeline_generator/test_utils.py
new file mode 100644
index 0000000..5281a93
--- /dev/null
+++ b/scripts/tests/pipeline_generator/test_utils.py
@@ -0,0 +1,72 @@
+import pytest
+import sys
+from typing import List
+
+from scripts.pipeline_generator.utils import (
+    get_agent_queue,
+    get_full_test_command,
+    get_multi_node_test_command,
+    AgentQueue,
+    MULTI_NODE_TEST_SCRIPT,
+)
+
+
+@pytest.mark.parametrize(
+    ("no_gpu", "gpu_type", "num_gpus", "expected_result"),
+    [
+        (True, None, None, AgentQueue.AWS_SMALL_CPU),
+        (False, "a100", None, AgentQueue.A100),
+        (False, None, None, AgentQueue.AWS_1xL4),
+        (False, None, 1, AgentQueue.AWS_1xL4),
+        (False, None, 4, AgentQueue.AWS_4xL4),
+    ],
+)
+def test_get_agent_queue(no_gpu: bool, gpu_type: str, num_gpus: int, expected_result: AgentQueue):
+    assert get_agent_queue(no_gpu, gpu_type, num_gpus) == expected_result
+
+
+@pytest.mark.parametrize(
+    ("test_commands", "step_working_dir", "expected_result"),
+    [
+        (["echo 'hello'"], None, "cd /vllm-workspace/tests;\necho 'hello'"),
+        (["echo 'hello'"], "/vllm-workspace/tests", "cd /vllm-workspace/tests;\necho 'hello'"),
+        (["echo 'hello1'", "echo 'hello2'"], None, "cd /vllm-workspace/tests;\necho 'hello1';\necho 'hello2'"),
+    ],
+)
+def test_get_full_test_command(test_commands: List[str], step_working_dir: str, expected_result: str):
+    assert get_full_test_command(test_commands, step_working_dir) == expected_result
+
+
+def test_get_multi_node_test_command():
+    test_commands = [
+        (
+            "distributed/test_same_node.py;"
+            "pytest -v -s distributed/test_multi_node_assignment.py;"
+            "pytest -v -s distributed/test_pipeline_parallel.py"
+        ),
+        "distributed/test_same_node.py",
+    ]
+    working_dir = "/vllm-workspace/tests"
+    num_nodes = 2
+    num_gpus = 4
+    docker_image_path = "ecr-path/vllm-ci-test-repo:latest"
+    expected_multi_node_command = [
+        MULTI_NODE_TEST_SCRIPT,
+        working_dir,
+        num_nodes,
+        num_gpus,
+        docker_image_path,
+        f"'{test_commands[0]}'",
+        f"'{test_commands[1]}'",
+    ]
+    expected_result = " ".join(map(str, expected_multi_node_command))
+    assert get_multi_node_test_command(test_commands, working_dir, num_nodes, num_gpus, docker_image_path) == expected_result
+
+
+def test_get_multi_node_test_command_escapes_single_quotes():
+    command = get_multi_node_test_command(["echo 'hi'"], None, 2, 4, "image")
+    assert command == f"{MULTI_NODE_TEST_SCRIPT} /vllm-workspace/tests 2 4 image 'echo '\\''hi'\\'''"
+
+
+if __name__ == "__main__":
+    sys.exit(pytest.main(["-v", __file__]))