Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Pipeline generator plugins #33

Merged
merged 13 commits into from
Sep 25, 2024
Merged
Show file tree
Hide file tree
Changes from 8 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 9 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -35,4 +35,12 @@ terraform.rc

.env

.vscode/
.vscode/

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

.cache
*.log
2 changes: 2 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
click==8.1.7
pydantic==2.9.2
Empty file added scripts/__init__.py
Empty file.
Empty file.
90 changes: 90 additions & 0 deletions scripts/pipeline_generator/plugin.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
from pydantic import BaseModel, Field
from typing import List, Dict, Any, Optional

from .utils import HF_HOME

DOCKER_PLUGIN_NAME = "docker#v5.2.0"
KUBERNETES_PLUGIN_NAME = "kubernetes"

class DockerPluginConfig(BaseModel):
    """Configuration for the Buildkite Docker plugin (docker#v5.2.0).

    Field aliases use the plugin's kebab-case option names; serialize with
    ``by_alias=True`` to produce a valid plugin stanza.
    Golden spec: https://github.com/buildkite-plugins/docker-buildkite-plugin
    """
    # Container image the step runs in; filled in by the caller.
    image: str = ""
    always_pull: bool = Field(default=True, alias="always-pull")
    # Forward the Buildkite job environment into the container.
    propagate_environment: bool = Field(default=True, alias="propagate-environment")
    # "all" exposes every host GPU; set to None (and dump with
    # exclude_none=True) for CPU-only steps.
    gpus: Optional[str] = "all"
    mount_buildkite_agent: Optional[bool] = Field(default=False, alias="mount-buildkite-agent")
    command: List[str] = Field(default_factory=list)
    # Entries without "=" (HF_TOKEN, BUILDKITE_ANALYTICS_TOKEN) are passed
    # through from the host environment by the plugin.
    # default_factory keeps list defaults consistent with `command` above.
    environment: List[str] = Field(default_factory=lambda: [
        f"HF_HOME={HF_HOME}",
        "VLLM_USAGE_SOURCE=ci-test",
        "HF_TOKEN",
        "BUILDKITE_ANALYTICS_TOKEN"
    ])
    volumes: List[str] = Field(default_factory=lambda: [
        "/dev/shm:/dev/shm",
        f"{HF_HOME}:{HF_HOME}"
    ])

class KubernetesPodContainerConfig(BaseModel):
    """One container entry of a Kubernetes pod spec.

    Used for steps that run on the EKS A100 node instead of the plain
    Docker agents. Aliases follow the Kubernetes camelCase API fields.
    """
    image: str
    command: List[str]
    # Shape: {"limits": {"nvidia.com/gpu": <count>}}
    resources: Dict[str, Dict[str, int]]
    volume_mounts: List[Dict[str, str]] = Field(
        alias="volumeMounts",
        # default_factory avoids a shared mutable default.
        default_factory=lambda: [
            {"name": "devshm", "mountPath": "/dev/shm"},
            {"name": "hf-cache", "mountPath": HF_HOME}
        ]
    )
    # Dict[str, Any]: the HF_TOKEN entry carries a nested "valueFrom" dict,
    # which the previous Dict[str, str] annotation did not admit.
    env: List[Dict[str, Any]] = Field(
        default_factory=lambda: [
            {"name": "HF_HOME", "value": HF_HOME},
            {"name": "VLLM_USAGE_SOURCE", "value": "ci-test"},
            {
                "name": "HF_TOKEN",
                # Pulled from the hf-token-secret Kubernetes secret.
                "valueFrom": {
                    "secretKeyRef": {
                        "name": "hf-token-secret",
                        "key": "token"
                    }
                }
            },
        ],
    )

class KubernetesPodSpec(BaseModel):
    """Kubernetes podSpec consumed by the Buildkite kubernetes plugin."""
    containers: List[KubernetesPodContainerConfig]
    priority_class_name: str = Field(default="ci", alias="priorityClassName")
    # NOTE(review): hard-coded GPU product label pins jobs to A100-SXM4-80GB
    # nodes — confirm it matches the labels on the EKS A100 node group.
    node_selector: Dict[str, Any] = Field(
        # default_factory avoids a shared mutable default.
        default_factory=lambda: {"nvidia.com/gpu.product": "NVIDIA-A100-SXM4-80GB"},
        alias="nodeSelector"
    )
    volumes: List[Dict[str, Any]] = Field(
        default_factory=lambda: [
            {"name": "devshm", "emptyDir": {"medium": "Memory"}},
            {"name": "hf-cache", "hostPath": {"path": HF_HOME, "type": "Directory"}}
        ]
    )

class KubernetesPluginConfig(BaseModel):
    """Top-level payload for the kubernetes Buildkite plugin; dump with by_alias=True to emit "podSpec"."""
    pod_spec: KubernetesPodSpec = Field(alias="podSpec")

def get_kubernetes_plugin_config(docker_image_path: str, test_bash_command: List[str], num_gpus: int) -> Dict:
    """Build the kubernetes plugin stanza for a test step.

    Args:
        docker_image_path: Container image the pod runs (any OCI image,
            not necessarily built with Docker).
        test_bash_command: Command tokens; joined into one shell string.
        num_gpus: GPU count requested via the nvidia.com/gpu resource limit.

    Returns:
        One-entry dict mapping the plugin name to its camelCase-keyed config.
    """
    pod_spec = KubernetesPodSpec(
        containers=[
            KubernetesPodContainerConfig(
                image=docker_image_path,
                command=[" ".join(test_bash_command)],
                resources={"limits": {"nvidia.com/gpu": num_gpus}}
            )
        ]
    )
    # model_dump replaces BaseModel.dict(), which is deprecated in pydantic v2
    # (requirements.txt pins pydantic==2.9.2).
    return {KUBERNETES_PLUGIN_NAME: KubernetesPluginConfig(podSpec=pod_spec).model_dump(by_alias=True)}

def get_docker_plugin_config(docker_image_path: str, test_bash_command: List[str], no_gpu: bool) -> Dict:
    """Build the docker plugin stanza for a test step.

    Args:
        docker_image_path: Image to run the step in.
        test_bash_command: Command list passed straight to the container.
        no_gpu: When True, drop the ``gpus`` option (CPU-only queues).

    Returns:
        One-entry dict mapping the docker plugin name to its kebab-case config.
    """
    docker_plugin_config = DockerPluginConfig(
        image=docker_image_path,
        command=test_bash_command
    )
    if no_gpu:
        # exclude_none=True below removes the key from the rendered config.
        docker_plugin_config.gpus = None
    # model_dump replaces BaseModel.dict(), which is deprecated in pydantic v2
    # (requirements.txt pins pydantic==2.9.2).
    return {DOCKER_PLUGIN_NAME: docker_plugin_config.model_dump(exclude_none=True, by_alias=True)}
62 changes: 62 additions & 0 deletions scripts/pipeline_generator/utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
import enum
from typing import Optional, List

# Constants
# Hugging Face cache path, shared between the host and test containers.
HF_HOME = "/root/.cache/huggingface"
# Default directory test commands run from when a step gives none.
DEFAULT_WORKING_DIR = "/vllm-workspace/tests"
# Public ECR registry / repo hosting the CI test images.
VLLM_ECR_URL = "public.ecr.aws/q9t5s3a7"
VLLM_ECR_REPO = f"{VLLM_ECR_URL}/vllm-ci-test-repo"
AMD_REPO = "rocm/vllm-ci"
# gpu_type value that routes a step to the A100 queue (see get_agent_queue).
A100_GPU = "a100"

# File paths
TEST_PATH = ".buildkite/test-pipeline.yaml"
EXTERNAL_HARDWARE_TEST_PATH = ".buildkite/external-tests.yaml"
PIPELINE_FILE_PATH = ".buildkite/pipeline.yaml"
MULTI_NODE_TEST_SCRIPT = ".buildkite/run-multi-node-test.sh"

# NOTE(review): currently empty; presumably names of steps to gate/block in
# the generated pipeline — confirm against the pipeline generator's usage.
STEPS_TO_BLOCK: List[str] = []


class AgentQueue(str, enum.Enum):
    """Buildkite agent queue names, one per hardware flavor."""
    AWS_CPU = "cpu_queue"
    AWS_SMALL_CPU = "small_cpu_queue"
    AWS_1xL4 = "gpu_1_queue"
    AWS_4xL4 = "gpu_4_queue"
    A100 = "a100-queue"
    AMD_GPU = "amd"
    AMD_CPU = "amd-cpu"


def get_agent_queue(no_gpu: Optional[bool], gpu_type: Optional[str], num_gpus: Optional[int]) -> AgentQueue:
    """Pick the Buildkite agent queue matching a step's hardware needs.

    CPU-only steps go to the small CPU queue, A100 steps to the A100
    queue, and everything else to a 1x or 4x L4 GPU queue by GPU count.
    """
    if no_gpu:
        return AgentQueue.AWS_SMALL_CPU
    if gpu_type == A100_GPU:
        return AgentQueue.A100
    if num_gpus == 1:
        return AgentQueue.AWS_1xL4
    return AgentQueue.AWS_4xL4


def get_full_test_command(test_commands: List[str], step_working_dir: str) -> str:
    """Join the step's test commands into one shell line, prefixed with a
    `cd` into the step's working directory (or the default one)."""
    target_dir = step_working_dir if step_working_dir else DEFAULT_WORKING_DIR
    joined_commands = "; ".join(test_commands)
    return f"cd {target_dir}; {joined_commands}"


def get_multi_node_test_command(
    test_commands: List[str],
    working_dir: str,
    num_nodes: int,
    num_gpus: int,
    docker_image_path: str
) -> str:
    """Build the invocation of the multi-node test script.

    Produces: "<script> <working_dir> <num_nodes> <num_gpus> <image>
    '<cmd1>' '<cmd2>' ..." — one single-quoted argument per test command.
    """
    parts: List[str] = [
        MULTI_NODE_TEST_SCRIPT,
        working_dir or DEFAULT_WORKING_DIR,
        str(num_nodes),
        str(num_gpus),
        docker_image_path,
    ]
    parts.extend(f"'{command}'" for command in test_commands)
    return " ".join(parts)
Empty file added scripts/tests/__init__.py
Empty file.
Empty file.
116 changes: 116 additions & 0 deletions scripts/tests/pipeline_generator/test_plugin.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,116 @@
import pytest
import sys

from unittest.mock import patch
from scripts.pipeline_generator.plugin import (
get_kubernetes_plugin_config,
get_docker_plugin_config,
DOCKER_PLUGIN_NAME,
KUBERNETES_PLUGIN_NAME,
)

def test_get_kubernetes_plugin_config():
    """The rendered config must carry the full camelCase podSpec, including
    the model defaults (volumeMounts, env, nodeSelector, volumes)."""
    docker_image_path = "test_image:latest"
    test_bash_command = ["echo", "Hello, Kubernetes!"]
    num_gpus = 1

    expected_config = {
        KUBERNETES_PLUGIN_NAME: {
            "podSpec": {
                "containers": [
                    {
                        "image": docker_image_path,
                        # Command tokens are joined into a single shell string.
                        "command": [" ".join(test_bash_command)],
                        "resources": {"limits": {"nvidia.com/gpu": num_gpus}},
                        "volumeMounts": [
                            {"name": "devshm", "mountPath": "/dev/shm"},
                            {"name": "hf-cache", "mountPath": "/root/.cache/huggingface"}
                        ],
                        "env": [
                            {"name": "HF_HOME", "value": "/root/.cache/huggingface"},
                            {"name": "VLLM_USAGE_SOURCE", "value": "ci-test"},
                            {
                                "name": "HF_TOKEN",
                                "valueFrom": {
                                    "secretKeyRef": {
                                        "name": "hf-token-secret",
                                        "key": "token"
                                    }
                                }
                            },
                        ],
                    }
                ],
                "priorityClassName": "ci",
                "nodeSelector": {"nvidia.com/gpu.product": "NVIDIA-A100-SXM4-80GB"},
                "volumes": [
                    {"name": "devshm", "emptyDir": {"medium": "Memory"}},
                    {"name": "hf-cache", "hostPath": {"path": "/root/.cache/huggingface", "type": "Directory"}}
                ]
            }
        }
    }

    assert get_kubernetes_plugin_config(docker_image_path, test_bash_command, num_gpus) == expected_config


@pytest.mark.parametrize(
    "docker_image_path, test_bash_command, no_gpu, expected_config",
    [
        # GPU case: "gpus": "all" appears in the rendered config.
        (
            "test_image:latest",
            ["bash", "-c", "echo A", "pytest -v -s a.py"],
            False,
            {
                DOCKER_PLUGIN_NAME: {
                    "image": "test_image:latest",
                    "always-pull": True,
                    "propagate-environment": True,
                    "gpus": "all",
                    "command": ["bash", "-c", "echo A", "pytest -v -s a.py"],
                    "environment": [
                        "HF_HOME=/root/.cache/huggingface",
                        "VLLM_USAGE_SOURCE=ci-test",
                        "HF_TOKEN",
                        "BUILDKITE_ANALYTICS_TOKEN"
                    ],
                    "mount-buildkite-agent": False,
                    "volumes": [
                        "/dev/shm:/dev/shm",
                        "/root/.cache/huggingface:/root/.cache/huggingface"
                    ]
                }
            }
        ),
        # CPU case (no_gpu=True): the "gpus" key is dropped entirely,
        # since it is set to None and dumped with exclude_none=True.
        (
            "cpu_image:latest",
            ["bash", "-c", "echo B", "pytest -v -s b.py"],
            True,
            {
                DOCKER_PLUGIN_NAME: {
                    "image": "cpu_image:latest",
                    "always-pull": True,
                    "propagate-environment": True,
                    "command": ["bash", "-c", "echo B", "pytest -v -s b.py"],
                    "environment": [
                        "HF_HOME=/root/.cache/huggingface",
                        "VLLM_USAGE_SOURCE=ci-test",
                        "HF_TOKEN",
                        "BUILDKITE_ANALYTICS_TOKEN"
                    ],
                    "mount-buildkite-agent": False,
                    "volumes": [
                        "/dev/shm:/dev/shm",
                        "/root/.cache/huggingface:/root/.cache/huggingface"
                    ]
                }
            }
        ),
    ]
)
def test_get_docker_plugin_config(docker_image_path, test_bash_command, no_gpu, expected_config):
    """Rendered docker plugin config must use kebab-case keys and omit 'gpus' when no_gpu is set."""
    assert get_docker_plugin_config(docker_image_path, test_bash_command, no_gpu) == expected_config


if __name__ == "__main__":
    # Allow running this file directly: python test_plugin.py
    sys.exit(pytest.main(["-v", __file__]))
66 changes: 66 additions & 0 deletions scripts/tests/pipeline_generator/test_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
import pytest
import sys
from typing import List

from scripts.pipeline_generator.utils import (
get_agent_queue,
get_full_test_command,
get_multi_node_test_command,
AgentQueue,
MULTI_NODE_TEST_SCRIPT,
)


@pytest.mark.parametrize(
    ("no_gpu", "gpu_type", "num_gpus", "expected_result"),
    [
        # no_gpu wins over everything else.
        (True, None, None, AgentQueue.AWS_SMALL_CPU),
        # A100 routing is keyed on gpu_type, not num_gpus.
        (False, "a100", None, AgentQueue.A100),
        (False, None, 1, AgentQueue.AWS_1xL4),
        (False, None, 4, AgentQueue.AWS_4xL4),
    ],
)
def test_get_agent_queue(no_gpu: bool, gpu_type: str, num_gpus: int, expected_result: AgentQueue):
    """Each hardware combination maps to its dedicated agent queue."""
    assert get_agent_queue(no_gpu, gpu_type, num_gpus) == expected_result


@pytest.mark.parametrize(
    ("test_commands", "step_working_dir", "expected_result"),
    [
        # None working dir falls back to the default working dir.
        (["echo 'hello'"], None, "cd /vllm-workspace/tests; echo 'hello'"),
        (["echo 'hello'"], "/vllm-workspace/tests", "cd /vllm-workspace/tests; echo 'hello'"),
        # Multiple commands are ;-joined after the cd.
        (["echo 'hello1'", "echo 'hello2'"], None, "cd /vllm-workspace/tests; echo 'hello1'; echo 'hello2'"),
    ],
)
def test_get_full_test_command(test_commands: List[str], step_working_dir: str, expected_result: str):
    """Commands are collapsed into one shell line prefixed with a cd."""
    assert get_full_test_command(test_commands, step_working_dir) == expected_result


def test_get_multi_node_test_command():
    """The multi-node command is the script path, dir, counts, image, then
    one single-quoted argument per test command, space-joined."""
    test_commands = [
        (
            "distributed/test_same_node.py;"
            "pytest -v -s distributed/test_multi_node_assignment.py;"
            "pytest -v -s distributed/test_pipeline_parallel.py"
        ),
        "distributed/test_same_node.py",
    ]
    working_dir = "/vllm-workspace/tests"
    num_nodes = 2
    num_gpus = 4
    docker_image_path = "ecr-path/vllm-ci-test-repo:latest"
    expected_multi_node_command = [
        MULTI_NODE_TEST_SCRIPT,
        working_dir,
        num_nodes,
        num_gpus,
        docker_image_path,
        # Each command is wrapped in single quotes for the shell script.
        f"'{test_commands[0]}'",
        f"'{test_commands[1]}'",
    ]
    expected_result = " ".join(map(str, expected_multi_node_command))
    assert get_multi_node_test_command(test_commands, working_dir, num_nodes, num_gpus, docker_image_path) == expected_result


if __name__ == "__main__":
    # Allow running this file directly: python test_utils.py
    sys.exit(pytest.main(["-v", __file__]))