-
Notifications
You must be signed in to change notification settings - Fork 16
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Pipeline generator plugins #33
Changes from 8 commits
717b163
e3aac9c
49043e5
72f1fd2
3cdd2b3
ab323d9
ec9e184
3615f75
f1a2975
a315f66
9e52954
9d973dc
a040279
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
click==8.1.7 | ||
pydantic==2.9.2 |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,90 @@ | ||
from pydantic import BaseModel, Field | ||
from typing import List, Dict, Any, Optional | ||
|
||
from .utils import HF_HOME | ||
|
||
# Buildkite plugin identifiers. The docker plugin is version-pinned so CI
# behavior does not change when the plugin releases a new version.
DOCKER_PLUGIN_NAME = "docker#v5.2.0"
KUBERNETES_PLUGIN_NAME = "kubernetes"
|
||
class DockerPluginConfig(BaseModel):
    """Configuration serialized under the Buildkite docker plugin key.

    Field aliases map snake_case attribute names to the plugin's kebab-case
    YAML option names; dump with ``by_alias=True`` to emit them.
    """
    image: str = ""
    always_pull: bool = Field(default=True, alias="always-pull")
    # Forward the agent's environment variables into the container.
    propagate_environment: bool = Field(default=True, alias="propagate-environment")
    # "all" exposes every host GPU; set to None (and dump with exclude_none=True)
    # so the key disappears for CPU-only runs.
    gpus: Optional[str] = "all"
    mount_buildkite_agent: Optional[bool] = Field(default=False, alias="mount-buildkite-agent")
    command: List[str] = Field(default_factory=list)
    # Bare names (HF_TOKEN, BUILDKITE_ANALYTICS_TOKEN) are passed through from
    # the host environment; NAME=value entries are set explicitly.
    environment: List[str] = [
        f"HF_HOME={HF_HOME}",
        "VLLM_USAGE_SOURCE=ci-test",
        "HF_TOKEN",
        "BUILDKITE_ANALYTICS_TOKEN"
    ]
    # host:container bind mounts — shared memory plus the HF model cache.
    volumes: List[str] = [
        "/dev/shm:/dev/shm",
        f"{HF_HOME}:{HF_HOME}"
    ]
|
||
class KubernetesPodContainerConfig(BaseModel):
    """Container spec for a test pod launched via the Buildkite kubernetes plugin.

    Aliases produce the camelCase keys expected by the Kubernetes API.
    NOTE(review): per PR discussion, this path is used for the A100 node on EKS.
    """
    image: str
    command: List[str]
    # e.g. {"limits": {"nvidia.com/gpu": 1}}
    resources: Dict[str, Dict[str, int]]
    volume_mounts: List[Dict[str, str]] = Field(
        alias="volumeMounts",
        default=[
            {"name": "devshm", "mountPath": "/dev/shm"},
            {"name": "hf-cache", "mountPath": HF_HOME}
        ]
    )
    env: List[Dict[str, str]] = Field(
        default=[
            {"name": "HF_HOME", "value": HF_HOME},
            {"name": "VLLM_USAGE_SOURCE", "value": "ci-test"},
            # HF_TOKEN is injected from a cluster secret rather than inlined.
            {
                "name": "HF_TOKEN",
                "valueFrom": {
                    "secretKeyRef": {
                        "name": "hf-token-secret",
                        "key": "token"
                    }
                }
            },
        ],
    )
|
||
class KubernetesPodSpec(BaseModel):
    """Pod spec wrapper: containers plus scheduling and volume defaults."""
    containers: List[KubernetesPodContainerConfig]
    # "ci" priority class — assumed to exist in the target cluster; TODO confirm.
    priority_class_name: str = Field(default="ci", alias="priorityClassName")
    # Pins test pods to A100-80GB nodes. NOTE(review): label value should be
    # verified against the actual node labels on the EKS cluster.
    node_selector: Dict[str, Any] = Field(
        default={"nvidia.com/gpu.product": "NVIDIA-A100-SXM4-80GB"},
        alias="nodeSelector"
    )
    volumes: List[Dict[str, Any]] = Field(
        default=[
            # Memory-backed /dev/shm shared-memory volume.
            {"name": "devshm", "emptyDir": {"medium": "Memory"}},
            # Host HF cache directory mounted into the pod.
            {"name": "hf-cache", "hostPath": {"path": HF_HOME, "type": "Directory"}}
        ]
    )
|
||
class KubernetesPluginConfig(BaseModel):
    """Top-level kubernetes plugin payload: one pod spec under the `podSpec` key."""
    pod_spec: KubernetesPodSpec = Field(alias="podSpec")
|
||
def get_kubernetes_plugin_config(docker_image_path: str, test_bash_command: List[str], num_gpus: int) -> Dict:
    """Build the kubernetes plugin section of a Buildkite step.

    Args:
        docker_image_path: Container image reference to run the test in
            (any OCI image reference; not necessarily built with Docker).
        test_bash_command: Command tokens; joined into a single string so the
            pod executes them as one command.
        num_gpus: GPU limit requested for the container.

    Returns:
        Mapping of the kubernetes plugin name to its serialized pod spec.
    """
    pod_spec = KubernetesPodSpec(
        containers=[
            KubernetesPodContainerConfig(
                image=docker_image_path,
                command=[" ".join(test_bash_command)],
                resources={"limits": {"nvidia.com/gpu": num_gpus}}
            )
        ]
    )
    # model_dump() replaces BaseModel.dict(), which is deprecated in pydantic v2
    # (requirements pin pydantic==2.9.2).
    return {KUBERNETES_PLUGIN_NAME: KubernetesPluginConfig(podSpec=pod_spec).model_dump(by_alias=True)}
|
||
def get_docker_plugin_config(docker_image_path: str, test_bash_command: List[str], no_gpu: bool) -> Dict:
    """Build the docker plugin section of a Buildkite step.

    Args:
        docker_image_path: Image reference the test container runs from.
        test_bash_command: argv-style command executed inside the container.
        no_gpu: When True, drop the `gpus` option so the container runs CPU-only.

    Returns:
        Mapping of the docker plugin name to its serialized configuration.
    """
    docker_plugin_config = DockerPluginConfig(
        image=docker_image_path,
        command=test_bash_command
    )
    if no_gpu:
        # exclude_none=True below removes the key entirely for CPU-only runs.
        docker_plugin_config.gpus = None
    # model_dump() replaces BaseModel.dict(), which is deprecated in pydantic v2
    # (requirements pin pydantic==2.9.2).
    return {DOCKER_PLUGIN_NAME: docker_plugin_config.model_dump(exclude_none=True, by_alias=True)}
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,62 @@ | ||
import enum | ||
from typing import Optional, List | ||
|
||
# Constants
# Hugging Face cache location, shared between the host and test containers.
HF_HOME = "/root/.cache/huggingface"
DEFAULT_WORKING_DIR = "/vllm-workspace/tests"
# Public ECR registry hosting the vLLM CI test images.
VLLM_ECR_URL = "public.ecr.aws/q9t5s3a7"
VLLM_ECR_REPO = f"{VLLM_ECR_URL}/vllm-ci-test-repo"
AMD_REPO = "rocm/vllm-ci"
# gpu_type value that routes a step to the A100 queue (see get_agent_queue).
A100_GPU = "a100"

# File paths
TEST_PATH = ".buildkite/test-pipeline.yaml"
EXTERNAL_HARDWARE_TEST_PATH = ".buildkite/external-tests.yaml"
PIPELINE_FILE_PATH = ".buildkite/pipeline.yaml"
MULTI_NODE_TEST_SCRIPT = ".buildkite/run-multi-node-test.sh"

# Steps listed here are blocked in the generated pipeline — presumably by step
# name; TODO confirm semantics against the pipeline generator that consumes it.
STEPS_TO_BLOCK = []
|
||
|
||
class AgentQueue(str, enum.Enum):
    """Buildkite agent queue names, keyed by hardware target.

    Inherits from str so members serialize directly into pipeline YAML.
    """
    AWS_CPU = "cpu_queue"
    AWS_SMALL_CPU = "small_cpu_queue"
    AWS_1xL4 = "gpu_1_queue"
    AWS_4xL4 = "gpu_4_queue"
    A100 = "a100-queue"
    AMD_GPU = "amd"
    AMD_CPU = "amd-cpu"
|
||
|
||
def get_agent_queue(no_gpu: Optional[bool], gpu_type: Optional[str], num_gpus: Optional[int]) -> AgentQueue:
    """Choose the Buildkite agent queue for a test step.

    CPU-only steps go to the small CPU queue, A100 steps to the dedicated
    A100 queue, and everything else to an L4 queue sized by GPU count
    (single-GPU queue for exactly one GPU, the 4xL4 queue otherwise).
    """
    if no_gpu:
        return AgentQueue.AWS_SMALL_CPU
    if gpu_type == A100_GPU:
        return AgentQueue.A100
    if num_gpus == 1:
        return AgentQueue.AWS_1xL4
    return AgentQueue.AWS_4xL4
|
||
|
||
def get_full_test_command(test_commands: List[str], step_working_dir: str) -> str:
    """Join test commands into a one-line command run from the right directory.

    Falls back to DEFAULT_WORKING_DIR when no working directory is given.
    """
    target_dir = step_working_dir if step_working_dir else DEFAULT_WORKING_DIR
    joined_commands = "; ".join(test_commands)
    return f"cd {target_dir}; {joined_commands}"
|
||
|
||
def get_multi_node_test_command(
    test_commands: List[str],
    working_dir: str,
    num_nodes: int,
    num_gpus: int,
    docker_image_path: str
) -> str:
    """Compose the invocation of the multi-node test script.

    The script receives the working directory (DEFAULT_WORKING_DIR when none is
    given), node count, GPU count, and image, followed by one single-quoted
    command string per node.
    """
    args = [
        MULTI_NODE_TEST_SCRIPT,
        working_dir or DEFAULT_WORKING_DIR,
        str(num_nodes),
        str(num_gpus),
        docker_image_path,
    ]
    args.extend(f"'{command}'" for command in test_commands)
    return " ".join(args)
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,116 @@ | ||
import pytest | ||
import sys | ||
|
||
from unittest.mock import patch | ||
from scripts.pipeline_generator.plugin import ( | ||
get_kubernetes_plugin_config, | ||
get_docker_plugin_config, | ||
DOCKER_PLUGIN_NAME, | ||
KUBERNETES_PLUGIN_NAME, | ||
) | ||
|
||
def test_get_kubernetes_plugin_config():
    """Full-payload check: the kubernetes plugin config nests container,
    scheduling, and volume defaults under the camelCase `podSpec` key."""
    docker_image_path = "test_image:latest"
    test_bash_command = ["echo", "Hello, Kubernetes!"]
    num_gpus = 1

    expected_config = {
        KUBERNETES_PLUGIN_NAME: {
            "podSpec": {
                "containers": [
                    {
                        "image": docker_image_path,
                        # The command list is collapsed into a single string.
                        "command": [" ".join(test_bash_command)],
                        "resources": {"limits": {"nvidia.com/gpu": num_gpus}},
                        "volumeMounts": [
                            {"name": "devshm", "mountPath": "/dev/shm"},
                            {"name": "hf-cache", "mountPath": "/root/.cache/huggingface"}
                        ],
                        "env": [
                            {"name": "HF_HOME", "value": "/root/.cache/huggingface"},
                            {"name": "VLLM_USAGE_SOURCE", "value": "ci-test"},
                            {
                                "name": "HF_TOKEN",
                                "valueFrom": {
                                    "secretKeyRef": {
                                        "name": "hf-token-secret",
                                        "key": "token"
                                    }
                                }
                            },
                        ],
                    }
                ],
                "priorityClassName": "ci",
                "nodeSelector": {"nvidia.com/gpu.product": "NVIDIA-A100-SXM4-80GB"},
                "volumes": [
                    {"name": "devshm", "emptyDir": {"medium": "Memory"}},
                    {"name": "hf-cache", "hostPath": {"path": "/root/.cache/huggingface", "type": "Directory"}}
                ]
            }
        }
    }

    assert get_kubernetes_plugin_config(docker_image_path, test_bash_command, num_gpus) == expected_config
|
||
|
||
@pytest.mark.parametrize(
    "docker_image_path, test_bash_command, no_gpu, expected_config",
    [
        # GPU case: the `gpus` key is present with "all".
        (
            "test_image:latest",
            # NOTE(review): this command is synthetic — "echo A" is the -c arg
            # and the trailing pytest string is not executed by bash -c.
            ["bash", "-c", "echo A", "pytest -v -s a.py"],
            False,
            {
                DOCKER_PLUGIN_NAME: {
                    "image": "test_image:latest",
                    "always-pull": True,
                    "propagate-environment": True,
                    "gpus": "all",
                    "command": ["bash", "-c", "echo A", "pytest -v -s a.py"],
                    "environment": [
                        "HF_HOME=/root/.cache/huggingface",
                        "VLLM_USAGE_SOURCE=ci-test",
                        "HF_TOKEN",
                        "BUILDKITE_ANALYTICS_TOKEN"
                    ],
                    "mount-buildkite-agent": False,
                    "volumes": [
                        "/dev/shm:/dev/shm",
                        "/root/.cache/huggingface:/root/.cache/huggingface"
                    ]
                }
            }
        ),
        # CPU-only case: no_gpu=True removes the `gpus` key entirely.
        (
            "cpu_image:latest",
            ["bash", "-c", "echo B", "pytest -v -s b.py"],
            True,
            {
                DOCKER_PLUGIN_NAME: {
                    "image": "cpu_image:latest",
                    "always-pull": True,
                    "propagate-environment": True,
                    "command": ["bash", "-c", "echo B", "pytest -v -s b.py"],
                    "environment": [
                        "HF_HOME=/root/.cache/huggingface",
                        "VLLM_USAGE_SOURCE=ci-test",
                        "HF_TOKEN",
                        "BUILDKITE_ANALYTICS_TOKEN"
                    ],
                    "mount-buildkite-agent": False,
                    "volumes": [
                        "/dev/shm:/dev/shm",
                        "/root/.cache/huggingface:/root/.cache/huggingface"
                    ]
                }
            }
        ),
    ]
)
def test_get_docker_plugin_config(docker_image_path, test_bash_command, no_gpu, expected_config):
    """Docker plugin config uses kebab-case aliases and drops `gpus` when no_gpu."""
    assert get_docker_plugin_config(docker_image_path, test_bash_command, no_gpu) == expected_config
|
||
|
||
# Allow running this test module directly: `python <file>` invokes pytest on it.
if __name__ == "__main__":
    sys.exit(pytest.main(["-v", __file__]))
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,66 @@ | ||
import pytest | ||
import sys | ||
from typing import List | ||
|
||
from scripts.pipeline_generator.utils import ( | ||
get_agent_queue, | ||
get_full_test_command, | ||
get_multi_node_test_command, | ||
AgentQueue, | ||
MULTI_NODE_TEST_SCRIPT, | ||
) | ||
|
||
|
||
@pytest.mark.parametrize(
    ("no_gpu", "gpu_type", "num_gpus", "expected_result"),
    [
        (True, None, None, AgentQueue.AWS_SMALL_CPU),   # CPU-only step
        (False, "a100", None, AgentQueue.A100),          # A100 queue by gpu_type
        (False, None, 1, AgentQueue.AWS_1xL4),           # single L4
        (False, None, 4, AgentQueue.AWS_4xL4),           # multi-GPU L4
    ],
)
def test_get_agent_queue(no_gpu: bool, gpu_type: str, num_gpus: int, expected_result: AgentQueue):
    """Queue selection prioritizes no_gpu, then gpu_type, then GPU count."""
    assert get_agent_queue(no_gpu, gpu_type, num_gpus) == expected_result
|
||
|
||
@pytest.mark.parametrize(
    ("test_commands", "step_working_dir", "expected_result"),
    [
        # None falls back to the default working directory.
        (["echo 'hello'"], None, "cd /vllm-workspace/tests; echo 'hello'"),
        (["echo 'hello'"], "/vllm-workspace/tests", "cd /vllm-workspace/tests; echo 'hello'"),
        # Multiple commands are joined with "; ".
        (["echo 'hello1'", "echo 'hello2'"], None, "cd /vllm-workspace/tests; echo 'hello1'; echo 'hello2'"),
    ],
)
def test_get_full_test_command(test_commands: List[str], step_working_dir: str, expected_result: str):
    """The full test command is a cd into the working dir plus joined commands."""
    assert get_full_test_command(test_commands, step_working_dir) == expected_result
|
||
|
||
def test_get_multi_node_test_command():
    """Multi-node command = script, dir, counts, image, then quoted commands."""
    test_commands = [
        (
            "distributed/test_same_node.py;"
            "pytest -v -s distributed/test_multi_node_assignment.py;"
            "pytest -v -s distributed/test_pipeline_parallel.py"
        ),
        "distributed/test_same_node.py",
    ]
    working_dir = "/vllm-workspace/tests"
    num_nodes = 2
    num_gpus = 4
    docker_image_path = "ecr-path/vllm-ci-test-repo:latest"

    # Each per-node command must appear single-quoted after the fixed args.
    expected_result = " ".join([
        MULTI_NODE_TEST_SCRIPT,
        working_dir,
        str(num_nodes),
        str(num_gpus),
        docker_image_path,
        f"'{test_commands[0]}'",
        f"'{test_commands[1]}'",
    ])

    actual = get_multi_node_test_command(
        test_commands, working_dir, num_nodes, num_gpus, docker_image_path
    )
    assert actual == expected_result
|
||
|
||
# Allow running this test module directly: `python <file>` invokes pytest on it.
if __name__ == "__main__":
    sys.exit(pytest.main(["-v", __file__]))
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Could you add documentation/comments for everything?
For example, link pointers to where the golden spec of these structs is defined?