Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Pipeline generator utils #31

Merged
merged 8 commits into from
Sep 20, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 9 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -35,4 +35,12 @@ terraform.rc

.env

.vscode/
.vscode/

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

.cache
*.log
2 changes: 2 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
click==8.1.7
pydantic==2.9.2
Empty file added scripts/__init__.py
Empty file.
Empty file.
62 changes: 62 additions & 0 deletions scripts/pipeline_generator/utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
import enum
from typing import Optional, List

# Constants
# NOTE(review): presumably the Hugging Face cache directory inside the test container — confirm.
HF_HOME = "/root/.cache/huggingface"
# Fallback directory used by the command builders below when a step gives no working directory.
DEFAULT_WORKING_DIR = "/vllm-workspace/tests"
VLLM_ECR_URL = "public.ecr.aws/q9t5s3a7"
# Repository path built on top of VLLM_ECR_URL.
VLLM_ECR_REPO = f"{VLLM_ECR_URL}/vllm-ci-test-repo"
AMD_REPO = "rocm/vllm-ci"
# gpu_type value that get_agent_queue() compares against to select the A100 queue.
A100_GPU = "a100"

# File paths
# NOTE(review): these look relative to the repository root — confirm against the callers.
TEST_PATH = ".buildkite/test-pipeline.yaml"
EXTERNAL_HARDWARE_TEST_PATH = ".buildkite/external-tests.yaml"
PIPELINE_FILE_PATH = ".buildkite/pipeline.yaml"
# Script placed first in the command built by get_multi_node_test_command().
MULTI_NODE_TEST_SCRIPT = ".buildkite/run-multi-node-test.sh"

# NOTE(review): empty and not read anywhere in this module; presumably filled in or
# consumed by the pipeline generator — confirm the expected element type.
STEPS_TO_BLOCK = []


class AgentQueue(str, enum.Enum):
AWS_CPU = "cpu_queue"
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why do some have the _queue suffix and some do not?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The ones with the _queue suffix are the queues we named ourselves; the ones without it come from AMD.

AWS_SMALL_CPU = "small_cpu_queue"
AWS_1xL4 = "gpu_1_queue"
AWS_4xL4 = "gpu_4_queue"
A100 = "a100-queue"
AMD_GPU = "amd"
AMD_CPU = "amd-cpu"


def get_agent_queue(no_gpu: Optional[bool], gpu_type: Optional[str], num_gpus: Optional[int]) -> AgentQueue:
    """Select the agent queue for a test step.

    Rules are checked in order and the first match wins:

    1. ``no_gpu`` is truthy -> ``AgentQueue.AWS_SMALL_CPU``
    2. ``gpu_type`` equals ``A100_GPU`` -> ``AgentQueue.A100``
    3. ``num_gpus`` equals 1 -> ``AgentQueue.AWS_1xL4``
    4. anything else (including ``num_gpus`` of ``None``) -> ``AgentQueue.AWS_4xL4``
    """
    if no_gpu:
        queue = AgentQueue.AWS_SMALL_CPU
    elif gpu_type == A100_GPU:
        queue = AgentQueue.A100
    elif num_gpus == 1:
        queue = AgentQueue.AWS_1xL4
    else:
        # Every remaining GPU request lands on the 4xL4 queue.
        queue = AgentQueue.AWS_4xL4
    return queue


def get_full_test_command(test_commands: List[str], step_working_dir: Optional[str]) -> str:
    """Build the command string that runs a step's test commands.

    The result starts with a ``cd`` into the working directory, followed by
    the test commands. Every part is separated by a semicolon and a newline,
    so the returned string spans several lines.

    Args:
        test_commands: Commands to run, in order.
        step_working_dir: Directory to ``cd`` into first. When it is ``None``
            or empty, ``DEFAULT_WORKING_DIR`` is used instead.

    Returns:
        The combined command string.
    """
    working_dir = step_working_dir or DEFAULT_WORKING_DIR
    test_commands_str = ";\n".join(test_commands)
    return f"cd {working_dir};\n{test_commands_str}"
Comment on lines +42 to +43
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If you added the \n, do you still need the ; as well?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

probably not but let's just leave it there for now... I can test again & take it out later when it's up and running



def get_multi_node_test_command(
    test_commands: List[str],
    working_dir: Optional[str],
    num_nodes: int,
    num_gpus: int,
    docker_image_path: str
) -> str:
    """Build the command line that invokes the multi-node test script.

    The result is ``MULTI_NODE_TEST_SCRIPT`` followed by the working
    directory, the node count, the GPU count, the docker image path and then
    one single-quoted argument per entry of ``test_commands``, all separated
    by single spaces.

    Args:
        test_commands: Commands to pass to the script, one argument each.
        working_dir: Working directory argument. When it is ``None`` or
            empty, ``DEFAULT_WORKING_DIR`` is used instead.
        num_nodes: Number of nodes, rendered with ``str()``.
        num_gpus: Number of GPUs, rendered with ``str()``.
        docker_image_path: Docker image argument.

    Returns:
        The space-separated command line.
    """
    # Each command is wrapped in single quotes so it stays one argument.
    # A single quote inside the command would end the quoting early, so it is
    # rewritten as: close quote, double-quoted single quote, reopen quote.
    # Commands without single quotes come out exactly as '<command>'.
    quoted_commands = [
        "'" + command.replace("'", "'\"'\"'") + "'" for command in test_commands
    ]
    multi_node_command = [
        MULTI_NODE_TEST_SCRIPT,
        working_dir or DEFAULT_WORKING_DIR,
        str(num_nodes),
        str(num_gpus),
        docker_image_path,
        *quoted_commands
    ]
    return " ".join(map(str, multi_node_command))
Empty file added scripts/tests/__init__.py
Empty file.
Empty file.
66 changes: 66 additions & 0 deletions scripts/tests/pipeline_generator/test_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
import pytest
import sys
from typing import List

from scripts.pipeline_generator.utils import (
get_agent_queue,
get_full_test_command,
get_multi_node_test_command,
AgentQueue,
MULTI_NODE_TEST_SCRIPT,
)


@pytest.mark.parametrize(
    "no_gpu, gpu_type, num_gpus, expected_result",
    [
        (True, None, None, AgentQueue.AWS_SMALL_CPU),
        (False, "a100", None, AgentQueue.A100),
        (False, None, 1, AgentQueue.AWS_1xL4),
        (False, None, 4, AgentQueue.AWS_4xL4),
    ],
)
def test_get_agent_queue(no_gpu: bool, gpu_type: str, num_gpus: int, expected_result: AgentQueue):
    """Check queue selection for the CPU, A100, single-GPU and four-GPU cases."""
    selected_queue = get_agent_queue(no_gpu, gpu_type, num_gpus)
    assert selected_queue == expected_result


@pytest.mark.parametrize(
    "test_commands, step_working_dir, expected_result",
    [
        # No working directory given: the default directory is used.
        (["echo 'hello'"], None, "cd /vllm-workspace/tests;\necho 'hello'"),
        # Explicit working directory.
        (["echo 'hello'"], "/vllm-workspace/tests", "cd /vllm-workspace/tests;\necho 'hello'"),
        # Several commands are joined with a semicolon and a newline.
        (["echo 'hello1'", "echo 'hello2'"], None, "cd /vllm-workspace/tests;\necho 'hello1';\necho 'hello2'"),
    ],
)
def test_get_full_test_command(test_commands: List[str], step_working_dir: str, expected_result: str):
    """Check the combined command string for one and for several commands."""
    full_command = get_full_test_command(test_commands, step_working_dir)
    assert full_command == expected_result


def test_get_multi_node_test_command():
    """Check that the script, its positional arguments and the quoted commands are space-joined."""
    # First entry packs three commands into one ';'-separated string.
    chained_command = ";".join(
        [
            "distributed/test_same_node.py",
            "pytest -v -s distributed/test_multi_node_assignment.py",
            "pytest -v -s distributed/test_pipeline_parallel.py",
        ]
    )
    test_commands = [chained_command, "distributed/test_same_node.py"]
    working_dir = "/vllm-workspace/tests"
    num_nodes = 2
    num_gpus = 4
    docker_image_path = "ecr-path/vllm-ci-test-repo:latest"

    expected_result = (
        f"{MULTI_NODE_TEST_SCRIPT} {working_dir} {num_nodes} {num_gpus} {docker_image_path}"
        f" '{test_commands[0]}' '{test_commands[1]}'"
    )

    actual_result = get_multi_node_test_command(
        test_commands, working_dir, num_nodes, num_gpus, docker_image_path
    )
    assert actual_result == expected_result


if __name__ == "__main__":
    # Run only this file's tests in verbose mode and use pytest's result as the exit status.
    exit_code = pytest.main(["-v", __file__])
    sys.exit(exit_code)
Loading