diff --git a/scripts/test-template-aws.j2 b/scripts/test-template-aws.j2 index 718e31f..ef98bc7 100644 --- a/scripts/test-template-aws.j2 +++ b/scripts/test-template-aws.j2 @@ -1,4 +1,7 @@ {% set docker_image = "public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT" %} +{% if branch == "main" %} +{% set docker_image = "public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:$BUILDKITE_COMMIT" %} +{% endif %} {% set docker_image_amd = "rocm/vllm-ci:$BUILDKITE_COMMIT" %} {% set default_working_dir = "/vllm-workspace/tests" %} {% set hf_home = "/root/.cache/huggingface" %} @@ -185,6 +188,8 @@ steps: limit: 2 - exit_status: -10 # Agent was lost limit: 2 + - exit_status: 1 # Machine occasionally fails + limit: 2 agents: queue: amd-cpu @@ -204,6 +209,15 @@ steps: {% else %} soft_fail: false {% endif %} + retry: + automatic: + - exit_status: -1 # Agent was lost + limit: 2 + - exit_status: -10 # Agent was lost + limit: 2 + - exit_status: 1 # Machine occasionally fails + limit: 2 + {% endif %} {% endfor %} diff --git a/scripts/test-template-fastcheck.j2 b/scripts/test-template-fastcheck.j2 index a72af1d..c163e1c 100644 --- a/scripts/test-template-fastcheck.j2 +++ b/scripts/test-template-fastcheck.j2 @@ -7,7 +7,7 @@ steps: - label: ":docker: build image" key: image-build agents: - queue: cpu_queue + queue: cpu_queue_premerge commands: - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7" - "docker build --build-arg max_jobs=16 --build-arg buildkite_commit=$BUILDKITE_COMMIT --build-arg USE_SCCACHE=1 --tag {{ docker_image }} --target test --progress plain ." 
@@ -38,9 +38,9 @@ steps: depends_on: image-build agents: {% if step.label == "Documentation Build" %} - queue: small_cpu_queue + queue: small_cpu_queue_premerge {% elif step.no_gpu %} - queue: cpu_queue + queue: cpu_queue_premerge {% elif step.num_gpus == 2 or step.num_gpus == 4 %} queue: gpu_4_queue {% else %} @@ -91,9 +91,9 @@ steps: depends_on: block-{{ step.label | replace(" ", "-") | lower | replace("(", "") | replace(")", "") | replace("%", "") | replace(",", "-") }} agents: {% if step.label == "Documentation Build" %} - queue: small_cpu_queue + queue: small_cpu_queue_premerge {% elif step.no_gpu %} - queue: cpu_queue + queue: cpu_queue_premerge {% elif step.num_gpus == 2 or step.num_gpus == 4 %} queue: gpu_4_queue {% else %} @@ -157,7 +157,7 @@ steps: - label: "{{ step.label }}" priority: 10000 agents: - queue: a100-queue + queue: a100_queue soft_fail: {{ step.soft_fail or false }} {% if step.parallelism %} parallelism: {{ step.parallelism }} @@ -212,7 +212,7 @@ steps: - label: "TPU Test" depends_on: ~ agents: - queue: tpu + queue: tpu_queue commands: - if [[ -f ".buildkite/run-tpu-test.sh" ]]; then bash .buildkite/run-tpu-test.sh; fi - yes | docker system prune -a