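[XLA:GPU] Always use GpuAsyncTracker.

Remove the `xla_gpu_lhs_enable_gpu_async_tracker` debug option and its command-line flag, so the latency-hiding scheduler pipeline always constructs a `GpuAsyncTracker` instead of choosing between it and `GpuAsyncTrackerBase`. Proto field 204 is marked reserved, and the tests that toggled the option are updated accordingly.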

PiperOrigin-RevId: 705095500
openxla · Dec 13, 2024 · a22c0c7
1 parent f5d511b
commit a22c0c7
Show file tree

Hide file tree

Showing 5 changed files with 4 additions and 34 deletions.
diff --git a/xla/debug_options_flags.cc b/xla/debug_options_flags.cc
@@ -166,7 +166,6 @@ DebugOptions DefaultDebugOptionsIgnoringFlags() {
   opts.set_xla_gpu_shape_checks(DebugOptions::RUNTIME);
   opts.set_xla_dump_latency_hiding_schedule(false);
   opts.set_xla_gpu_enable_latency_hiding_scheduler(false);
-  opts.set_xla_gpu_lhs_enable_gpu_async_tracker(true);
   opts.set_xla_gpu_enable_analytical_latency_estimator(false);
   opts.set_xla_gpu_pgle_profile_file_or_directory_path("");
   opts.set_xla_gpu_memory_limit_slop_factor(95);
@@ -1566,11 +1565,6 @@ void MakeDebugOptionsFlags(std::vector<tsl::Flag>* flag_list,
           &DebugOptions::set_xla_gpu_pgle_profile_file_or_directory_path),
       debug_options->xla_gpu_pgle_profile_file_or_directory_path(),
       "Directory or file for PGLE profiles in XLA:GPU"));
-  flag_list->push_back(tsl::Flag(
-      "xla_gpu_lhs_enable_gpu_async_tracker",
-      bool_setter_for(&DebugOptions::set_xla_gpu_lhs_enable_gpu_async_tracker),
-      debug_options->xla_gpu_lhs_enable_gpu_async_tracker(),
-      "Enable GPU async tracker for latency-hiding scheduler in XLA:GPU"));
   flag_list->push_back(tsl::Flag(
       "xla_gpu_memory_limit_slop_factor",
       int32_setter_for(&DebugOptions::set_xla_gpu_memory_limit_slop_factor),

diff --git a/xla/service/gpu/gpu_hlo_schedule.cc b/xla/service/gpu/gpu_hlo_schedule.cc
@@ -514,12 +514,7 @@ absl::Status RunLatencyHidingSchedulerPasses(
     return GetSizeOfShape(shape, pointer_size);
   };
 
-  const DebugOptions& options = module->config().debug_options();
-  auto async_tracker = [&]() -> std::unique_ptr<AsyncTracker> {
-    return options.xla_gpu_lhs_enable_gpu_async_tracker()
-               ? std::make_unique<GpuAsyncTracker>(config)
-               : std::make_unique<GpuAsyncTrackerBase>(config);
-  }();
+  auto async_tracker = std::make_unique<GpuAsyncTracker>(config);
 
   HloPassPipeline pipeline("latency-hiding-scheduler");
   std::unique_ptr<LatencyEstimator> latency_estimator = GetLatencyEstimator(

diff --git a/xla/service/gpu/gpu_hlo_schedule_test.cc b/xla/service/gpu/gpu_hlo_schedule_test.cc
@@ -78,7 +78,6 @@ class GpuHloScheduleTest : public HloTestBase {
 
   struct TestConfig {
     bool enable_latency_hiding_scheduler = false;
-    bool enable_gpu_async_tracker = false;
     bool enable_pipelined_p2p = false;
     std::string fdo_profile = "";
   };
@@ -88,8 +87,6 @@ class GpuHloScheduleTest : public HloTestBase {
     DebugOptions debug_options = GetDebugOptionsForTest();
     debug_options.set_xla_gpu_enable_latency_hiding_scheduler(
         test_config.enable_latency_hiding_scheduler);
-    debug_options.set_xla_gpu_lhs_enable_gpu_async_tracker(
-        test_config.enable_gpu_async_tracker);
     debug_options.set_xla_gpu_enable_pipelined_p2p(
         test_config.enable_pipelined_p2p);
     config.set_debug_options(debug_options);
@@ -510,7 +507,6 @@ TEST_F(GpuHloScheduleTest, ProfileGuidedCostModel) {
   for (const SubTest& subtest : subtests) {
     TestConfig test_config;
     test_config.enable_latency_hiding_scheduler = true;
-    test_config.enable_gpu_async_tracker = true;
     test_config.fdo_profile = subtest.profile;
     TF_ASSERT_OK_AND_ASSIGN(
         auto module,
@@ -573,7 +569,6 @@ TEST_F(GpuHloScheduleTest, ProfileGuidedCostModelFailsWithIncompleteProfile) {
 
   TestConfig test_config;
   test_config.enable_latency_hiding_scheduler = true;
-  test_config.enable_gpu_async_tracker = true;
   test_config.fdo_profile = kProfile;
   TF_ASSERT_OK_AND_ASSIGN(
       auto module,
@@ -634,7 +629,6 @@ TEST_F(
 
   TestConfig test_config;
   test_config.enable_latency_hiding_scheduler = true;
-  test_config.enable_gpu_async_tracker = true;
   test_config.fdo_profile = kProfile;
   TF_ASSERT_OK_AND_ASSIGN(
       auto module,
@@ -692,7 +686,6 @@ TEST_F(GpuHloScheduleTest, ProfileGuidedCostModelWithRematData) {
   )pb";
   TestConfig test_config;
   test_config.enable_latency_hiding_scheduler = true;
-  test_config.enable_gpu_async_tracker = true;
   test_config.fdo_profile = ar_long_latency_proto_text;
   TF_ASSERT_OK_AND_ASSIGN(
       auto module,
@@ -876,7 +869,6 @@ TEST_F(GpuHloScheduleTest, LHSSendRecvPairs2) {
 
   TestConfig test_config;
   test_config.enable_latency_hiding_scheduler = true;
-  test_config.enable_gpu_async_tracker = true;
   test_config.enable_pipelined_p2p = true;
   TF_ASSERT_OK_AND_ASSIGN(
       auto module,
@@ -973,7 +965,6 @@ TEST_F(GpuHloScheduleTest, LHSSendRecvAllReduce) {
 
   TestConfig test_config;
   test_config.enable_latency_hiding_scheduler = true;
-  test_config.enable_gpu_async_tracker = true;
   test_config.enable_pipelined_p2p = true;
   TF_ASSERT_OK_AND_ASSIGN(
       auto module,
@@ -1095,7 +1086,6 @@ TEST_F(GpuHloScheduleTest, LHSSendRecvPipelined1) {
 
   TestConfig test_config;
   test_config.enable_latency_hiding_scheduler = true;
-  test_config.enable_gpu_async_tracker = true;
   test_config.enable_pipelined_p2p = true;
   TF_ASSERT_OK_AND_ASSIGN(
       auto module,
@@ -1291,7 +1281,6 @@ TEST_F(GpuHloScheduleTest, LHSSendRecvPipelined2) {
 
   TestConfig test_config;
   test_config.enable_latency_hiding_scheduler = true;
-  test_config.enable_gpu_async_tracker = true;
   test_config.enable_pipelined_p2p = true;
   TF_ASSERT_OK_AND_ASSIGN(
       auto module,
@@ -1520,7 +1509,7 @@ TEST_P(GpuHloScheduleParameterizedTest, AsyncAllReduce) {
   EXPECT_TRUE(HasValidFingerprint(module.get()));
 }
 
-TEST_P(GpuHloScheduleParameterizedTest, LHSResourceModel) {
+TEST_F(GpuHloScheduleTest, LHSResourceModel) {
   const char* hlo_text = R"(
   HloModule AsyncModule
   apply_op {
@@ -1559,19 +1548,13 @@ TEST_P(GpuHloScheduleParameterizedTest, LHSResourceModel) {
     ROOT t = (f32[32], f32[64], f32[32,32]) tuple(ar-done, %ag-done, add5)
   })";
 
-  const bool enable_gpu_async_tracker = GetParam();
   TestConfig test_config;
   test_config.enable_latency_hiding_scheduler = true;
-  test_config.enable_gpu_async_tracker = GetParam();
   TF_ASSERT_OK_AND_ASSIGN(
       auto module,
       ParseAndReturnVerifiedModule(hlo_text, GetModuleConfig(test_config)));
   SequentialHloOrdering order = BuildHloOrdering(module.get());
 
-  // Count the number of collectives in flight. Without gpu async tracker, we
-  // will incorrectly have 2 in-flight (as base async tracker assumes each
-  // collective can be scheduled independently as they use different resource
-  // types), but with gpu async tracker we will have 1.
   uint32_t in_flight = 0;
   uint32_t max_in_flight = 0;
   for (const HloInstruction* inst :
@@ -1584,8 +1567,7 @@ TEST_P(GpuHloScheduleParameterizedTest, LHSResourceModel) {
     }
   }
 
-  const uint32_t expected_max_in_flight = enable_gpu_async_tracker ? 1 : 2;
-  EXPECT_EQ(expected_max_in_flight, max_in_flight);
+  EXPECT_EQ(max_in_flight, 1);
   EXPECT_TRUE(HasValidFingerprint(module.get()));
 }
 

diff --git a/xla/service/gpu/gpu_latency_hiding_scheduler_test.cc b/xla/service/gpu/gpu_latency_hiding_scheduler_test.cc
@@ -84,7 +84,6 @@ class GpuLatencyHidingSchedulerBaseTest : public HloTestBase {
     HloModuleConfig config;
     DebugOptions debug_options = GetDebugOptionsForTest();
     debug_options.set_xla_gpu_enable_latency_hiding_scheduler(true);
-    debug_options.set_xla_gpu_lhs_enable_gpu_async_tracker(true);
     debug_options.set_xla_gpu_enable_experimental_pipeline_parallelism_opt(
         enable_experimental_pipeline_parallelism_opt);
     config.set_debug_options(debug_options);

diff --git a/xla/xla.proto b/xla/xla.proto
@@ -681,7 +681,7 @@ message DebugOptions {
   bool xla_gpu_enable_highest_priority_async_stream = 216;
   bool xla_gpu_enable_analytical_latency_estimator = 255;
 
-  bool xla_gpu_lhs_enable_gpu_async_tracker = 204;
+  reserved 204;  // Was xla_gpu_lhs_enable_gpu_async_tracker.
   string xla_gpu_pgle_profile_file_or_directory_path = 210;
   int32 xla_gpu_memory_limit_slop_factor = 260;