diff --git a/build_tools/rocm/run_xla_multi_gpu.sh b/build_tools/rocm/run_xla_multi_gpu.sh
new file mode 100755
index 00000000000000..d030b3e3f20b04
--- /dev/null
+++ b/build_tools/rocm/run_xla_multi_gpu.sh
@@ -0,0 +1,79 @@
+#!/usr/bin/env bash
+# Copyright 2024 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# ==============================================================================
+
+set -e
+set -x
+
+N_BUILD_JOBS=$(grep -c ^processor /proc/cpuinfo)
+# Count GPUs with rocm-smi. Probing inside the `if` condition keeps `set -e` from
+# aborting when rocm-smi is missing or fails, so the one-GPU fallback still applies.
+if rocm-smi -i; then
+    TF_GPU_COUNT=$(rocm-smi -i | grep 'Device ID' | grep 'GPU' | wc -l)
+else
+    TF_GPU_COUNT=1
+fi
+if [[ $TF_GPU_COUNT -lt 4 ]]; then
+    echo "Found only ${TF_GPU_COUNT} gpus, multi-gpu tests need atleast 4 gpus."
+    exit
+fi
+
+# Concurrent test jobs = detected GPU count x TF_TESTS_PER_GPU.
+TF_TESTS_PER_GPU=1
+N_TEST_JOBS=$(( TF_GPU_COUNT * TF_TESTS_PER_GPU ))
+echo ""
+echo "Bazel will use ${N_BUILD_JOBS} concurrent build job(s) and ${N_TEST_JOBS} concurrent test job(s)."
+echo ""
+
+# Resolve the ROCm install directory, in order of precedence:
+#   1. the first positional argument, if non-empty;
+#   2. the ROCM_PATH environment variable, if non-empty;
+#   3. the default /opt/rocm-6.0.2.
+ROCM_INSTALL_DIR=/opt/rocm-6.0.2
+if [[ -n $1 ]]; then
+    ROCM_INSTALL_DIR=$1
+elif [[ -n "${ROCM_PATH}" ]]; then
+    ROCM_INSTALL_DIR=$ROCM_PATH
+fi
+
+export PYTHON_BIN_PATH="$(command -v python3)"
+export TF_NEED_ROCM=1
+export ROCM_PATH="$ROCM_INSTALL_DIR"
+# Exclude oss_excluded/oss_serial targets and every requires-gpu-smNN[-only] tag.
+UNSUPPORTED_GPU_TAGS="$(echo -requires-gpu-sm{60,70,80,86,89,90}{,-only})"
+TAGS_FILTER="-oss_excluded,-oss_serial,${UNSUPPORTED_GPU_TAGS// /,}"
+
+# Bazel honours only the last --action_env given for a variable, so both XLA
+# options are passed as a single space-separated XLA_FLAGS value.
+bazel test \
+    --config=rocm \
+    --build_tag_filters="${TAGS_FILTER}" \
+    --test_tag_filters="${TAGS_FILTER}" \
+    --test_timeout=920,2400,7200,9600 \
+    --test_sharding_strategy=disabled \
+    --test_output=errors \
+    --flaky_test_attempts=3 \
+    --keep_going \
+    --local_test_jobs="${N_TEST_JOBS}" \
+    --test_env=TF_TESTS_PER_GPU="${TF_TESTS_PER_GPU}" \
+    --test_env=TF_GPU_COUNT="${TF_GPU_COUNT}" \
+    --action_env=XLA_FLAGS="--xla_gpu_force_compilation_parallelism=16 --xla_gpu_enable_llvm_module_compilation_parallelism=true" \
+    -- //xla/tests:collective_ops_test_e2e_gpu_amd_any \
+       //xla/tests:collective_ops_test_gpu_amd_any \
+       //xla/tests:replicated_io_feed_test_gpu_amd_any \
+       //xla/tools/multihost_hlo_runner:functional_hlo_runner_test_gpu_amd_any \
+       //xla/pjrt/distributed:topology_util_test \
+       //xla/pjrt/distributed:client_server_test
diff --git a/xla/service/gpu/BUILD b/xla/service/gpu/BUILD
index c1164551aaa2e1..8997fc9e44cc99 100644
--- a/xla/service/gpu/BUILD
+++ b/xla/service/gpu/BUILD
@@ -752,7 +752,6 @@ xla_test(
     backends = [
         "gpu_a100",
         "gpu_h100",
-        "gpu_amd_any",
     ],
     shard_count = 10,
     tags = ["nomac"],
@@ -1255,7 +1254,6 @@ xla_test(
     backends = [
         "gpu_a100",
         "gpu_h100",
-        "gpu_amd_any",
     ],
     deps = [
         ":gpu_device_info_for_tests",
@@ -1263,7 +1261,7 @@ xla_test(
         ":triton_fusion_analysis",
         ":triton_support",
         ":triton_test_utils",
-        "//third_party/protobuf",
+        "//third_party/protobuf",
         "//xla:xla_data_proto_cc",
         "//xla:xla_proto_cc",
         "//xla/hlo/ir:hlo",
@@ -1284,7 +1282,6 @@ xla_test(
     backends = [
         "gpu_a100",
         "gpu_h100",
-        "gpu_amd_any",
     ],
     tags = ["nomac"],
     deps = [
@@ -6165,7 +6162,7 @@ xla_test(
     backend_tags = {"gpu": [
         "requires-gpu-sm80",
     ]},
-    backends = ["gpu"],
+    backends = ["gpu_a100", "gpu_h100"],
     deps = [
         ":autotuner_compile_util",
         ":autotuner_util",
diff --git a/xla/service/gpu/tests/BUILD b/xla/service/gpu/tests/BUILD
index fdf73310efd166..8f8317f88114bc 100644
--- a/xla/service/gpu/tests/BUILD
+++ b/xla/service/gpu/tests/BUILD
@@ -469,7 +469,6 @@ xla_test(
     backends = [
         "gpu_a100",
         "gpu_v100",
-        "gpu_amd_any",
     ],
     deps = [
         ":gpu_codegen_test",