Skip to content

Commit

Permalink
[XLA:GPU] Readability and performance nits.
Browse files Browse the repository at this point in the history
PiperOrigin-RevId: 705498812
  • Loading branch information
allanrenucci authored and Google-ML-Automation committed Dec 13, 2024
1 parent e650c3d commit c4bc2b2
Show file tree
Hide file tree
Showing 2 changed files with 87 additions and 100 deletions.
1 change: 0 additions & 1 deletion xla/service/gpu/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -2113,7 +2113,6 @@ cc_library(
"//xla/hlo/utils:hlo_query",
"//xla/service:buffer_value",
"//xla/service:collective_ops_utils",
"//xla/service:collective_utils",
"//xla/service:latency_hiding_scheduler",
"//xla/service:p2p_schedule_preparation",
"//xla/service:profile_guided_latency_estimator",
Expand Down
186 changes: 87 additions & 99 deletions xla/service/gpu/gpu_hlo_schedule.cc
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ limitations under the License.
#include "absl/status/statusor.h"
#include "absl/strings/match.h"
#include "absl/strings/numbers.h"
#include "absl/strings/str_cat.h"
#include "absl/strings/str_split.h"
#include "absl/strings/string_view.h"
#include "xla/hlo/ir/hlo_computation.h"
Expand Down Expand Up @@ -278,16 +279,15 @@ SchedulerConfig GetSchedulerConfig(int64_t memory_limit,

tensorflow::profiler::ProfiledInstructionsProto GetProfileForFingerprint(
tensorflow::profiler::ProfiledInstructionsProto& profile,
const std::string& fingerprint) {
absl::string_view fingerprint) {
tensorflow::profiler::ProfiledInstructionsProto result;
bool merge_remat_clones = false;
for (const auto& cost : profile.costs()) {
absl::string_view cost_name = cost.name();
std::string new_cost_name = cost.name();
absl::string_view cost_sep = "::";
if (absl::StrContains(cost_name, cost_sep)) {
std::vector<std::string> split_names =
absl::StrSplit(cost_name, cost_sep);
if (absl::StrContains(cost.name(), cost_sep)) {
std::vector<absl::string_view> split_names =
absl::StrSplit(cost.name(), cost_sep);
if (split_names.size() != 2 || split_names[0] != fingerprint) {
continue;
}
Expand Down Expand Up @@ -325,30 +325,33 @@ tensorflow::profiler::ProfiledInstructionsProto GetProfileForFingerprint(
return name;
};

// Map from stripped name -> pair<accumulated cost, count>
absl::flat_hash_map<absl::string_view, std::pair<double, int64_t>> costs;
struct Data {
double accumulated_cost = 0.0;
int64_t count = 0;
};
absl::flat_hash_map<absl::string_view, Data> costs;
for (const auto& cost : result.costs()) {
std::pair<double, int64_t>& data = costs[strip_remat_suffix(cost.name())];
data.first += cost.cost_us();
data.second++;
Data& data = costs[strip_remat_suffix(cost.name())];
data.accumulated_cost += cost.cost_us();
data.count++;
}

tensorflow::profiler::ProfiledInstructionsProto merged_result;
for (const auto& cost : costs) {
for (const auto& [name, data] : costs) {
auto* new_cost = merged_result.add_costs();
double average = cost.second.first / cost.second.second;
double average = data.accumulated_cost / data.count;
new_cost->set_cost_us(average);
new_cost->set_name(std::string(cost.first));
new_cost->set_name(std::string(name));
}

return merged_result;
}

std::optional<tensorflow::profiler::ProfiledInstructionsProto> ReadPGLEProfile(
const HloModule* module, const std::string& fingerprint) {
const HloModule& module, absl::string_view fingerprint) {
tensorflow::profiler::ProfiledInstructionsProto profile;

absl::string_view fdo_profile = module->config().fdo_profile();
absl::string_view fdo_profile = module.config().fdo_profile();
// First attempt to read the profile from `fdo_profile` in ModuleConfig
if (!fdo_profile.empty()) {
// Attempt to parse it as a binary proto.
Expand All @@ -369,14 +372,14 @@ std::optional<tensorflow::profiler::ProfiledInstructionsProto> ReadPGLEProfile(
}

const std::string& pgle_profile_file_or_dir_path =
module->config()
module.config()
.debug_options()
.xla_gpu_pgle_profile_file_or_directory_path();
if (pgle_profile_file_or_dir_path.empty()) {
return std::nullopt;
}
tsl::Env* env = tsl::Env::Default();
auto read_text_or_binary_profile = [&profile, env, &fingerprint](
auto read_text_or_binary_profile = [&profile, env, fingerprint](
const std::string& text_path,
const std::string& binary_path)
-> std::optional<tensorflow::profiler::ProfiledInstructionsProto> {
Expand Down Expand Up @@ -409,7 +412,7 @@ std::optional<tensorflow::profiler::ProfiledInstructionsProto> ReadPGLEProfile(
// specific module.
if (env->IsDirectory(pgle_profile_file_or_dir_path).ok()) {
std::string pgle_profile_path_prefix =
pgle_profile_file_or_dir_path + "/" + fingerprint;
absl::StrCat(pgle_profile_file_or_dir_path, "/", fingerprint);
return read_text_or_binary_profile(pgle_profile_path_prefix + ".pbtxt",
pgle_profile_path_prefix + ".pb");
}
Expand Down Expand Up @@ -446,7 +449,7 @@ std::string TagWithFingerprint(HloModule* module) {
HloPrintOptions::Canonical().set_print_backend_config(true));
FrontendAttributes attributes;
(*attributes.mutable_map())[std::string(kFingerprintBeforeLHS)] = fingerprint;
module->add_frontend_attributes(attributes);
module->add_frontend_attributes(std::move(attributes));
VLOG(1) << "Fingerprint before LHS for module " << module->name() << "("
<< module->unique_id() << ") = " << fingerprint;
return fingerprint;
Expand All @@ -457,16 +460,16 @@ std::string TagWithFingerprint(HloModule* module) {
// additionally add fail-fast/warn checks to the pipeline which act in the
// absence of instruction in the profile. See `PGLEAccuracyChecker` for details.
std::unique_ptr<LatencyEstimator> GetLatencyEstimator(
HloModule* module, int pointer_size,
const HloModule& module, int pointer_size,
const se::DeviceDescription& gpu_device_info, absl::string_view fingerprint,
const SchedulerConfig& config, HloPassPipeline& pipeline) {
const DebugOptions& options = module->config().debug_options();
const DebugOptions& options = module.config().debug_options();

auto gpu_latency_estimator =
std::make_unique<GpuLatencyEstimator>(pointer_size);

std::optional<tensorflow::profiler::ProfiledInstructionsProto> profile =
ReadPGLEProfile(module, std::string(fingerprint));
ReadPGLEProfile(module, fingerprint);

if (profile.has_value()) {
auto aggregator = std::make_unique<GPUProfileStatisticsAggregator>();
Expand All @@ -491,7 +494,7 @@ std::unique_ptr<LatencyEstimator> GetLatencyEstimator(
[input_pointer_size = pointer_size](const Shape& shape) {
return GetSizeOfShape(shape, input_pointer_size);
},
module->entry_computation());
module.entry_computation());
}
return gpu_latency_estimator;
}
Expand Down Expand Up @@ -520,7 +523,7 @@ absl::Status RunLatencyHidingSchedulerPasses(

HloPassPipeline pipeline("latency-hiding-scheduler");
std::unique_ptr<LatencyEstimator> latency_estimator = GetLatencyEstimator(
module, pointer_size, gpu_device_info, fingerprint, config, pipeline);
*module, pointer_size, gpu_device_info, fingerprint, config, pipeline);

auto scheduler_core = std::make_unique<DefaultSchedulerCore>(
shape_size_in_bytes, async_tracker.get(), latency_estimator.get(), config,
Expand All @@ -538,11 +541,56 @@ absl::Status RunLatencyHidingSchedulerPasses(
return pipeline.Run(module).status();
}

} // end namespace
// Compute the device memory limit to be used by passes like scheduler and
// HLO rematerialization.
int64_t GetSchedulerMemoryLimit(const HloModule& module,
const se::DeviceDescription& gpu_device_info,
int pointer_size) {
// There is a "base" value which is either specified in HloModuleConfig (this
// value should take into account the fact that we need to leave some memory
// free for allocations that happen outside of XLA's allocator) or
// obtained from GPU device info (we scale down this value to leave some space
// for these outside XLA's allocator allocation).
//
// From that base value, subtract any input and output sizes (assuming they
// are live throughout the execution) and then apply a slop factor.
const int64_t base_limit =
module.config().device_memory_size() != 0
? module.config().device_memory_size()
: gpu_device_info.device_memory_size() * 80 / 100;

static int64_t GetSchedulerMemoryLimit(
const HloModule* module, const se::DeviceDescription& gpu_device_info,
int pointer_size);
// Find the total size of inputs and outputs.
int64_t total_io_size = 0;
for (HloInstruction* param :
module.entry_computation()->parameter_instructions()) {
ShapeUtil::ForEachSubshape(
param->shape(),
[&](const Shape& subshape, const ShapeIndex& /*index*/) {
total_io_size += GetSizeOfShape(subshape, pointer_size);
});
}
ShapeUtil::ForEachSubshape(
module.result_shape(),
[&](const Shape& subshape, const ShapeIndex& /*index*/) {
total_io_size += GetSizeOfShape(subshape, pointer_size);
});

// If any inputs and outputs are aliased, do not double count them.
module.input_output_alias_config().ForEachAlias(
[&](const ShapeIndex& output_index,
const HloInputOutputAliasConfig::Alias&) {
const Shape& subshape =
ShapeUtil::GetSubshape(module.result_shape(), output_index);
total_io_size -= GetSizeOfShape(subshape, pointer_size);
});

int64_t limit =
(base_limit - total_io_size) *
module.config().debug_options().xla_gpu_memory_limit_slop_factor() / 100;
return limit;
}

} // end namespace

absl::StatusOr<ScheduleMetadata> ScheduleGpuModule(
HloModule* module, int64_t pointer_size,
Expand All @@ -553,45 +601,34 @@ absl::StatusOr<ScheduleMetadata> ScheduleGpuModule(
// instruction name with ids.
std::string fingerprint = TagWithFingerprint(module);
int64_t memory_limit =
GetSchedulerMemoryLimit(module, gpu_device_info, pointer_size);
GetSchedulerMemoryLimit(*module, gpu_device_info, pointer_size);

// Case 1: Module has a schedule.
//
// Return already existing schedule.
// Module already has a schedule, do nothing.
if (module->has_schedule()) {
return ScheduleMetadata{memory_limit};
}

// Case 2: Module does not have a schedule.
//
// Running default scheduler.
// Run the scheduler which minimizes peak memory usage.
// We need to run it anyway because LHS relies on it track buffers. See
// `xla::BufferInfoTracker::BufferInfoTracker()`.
TF_RETURN_IF_ERROR(RunP2PSchedulePreparation(module));
TF_ASSIGN_OR_RETURN(
HloSchedule schedule,
ScheduleGpuModuleWithMemoryScheduler(module, pointer_size));
TF_RETURN_IF_ERROR(module->set_schedule(std::move(schedule)));

bool enable_latency_hiding_scheduler =
module->config()
.debug_options()
.xla_gpu_enable_latency_hiding_scheduler() ||
IsPassEnabledAtOptimizationEffort<LatencyHidingScheduler>(*module);

// Default behaviour. Run the scheduler which minimizes peak memory usage.
TF_ASSIGN_OR_RETURN(
HloSchedule schedule,
ScheduleGpuModuleWithMemoryScheduler(module, pointer_size));
TF_RETURN_IF_ERROR(module->set_schedule(std::move(schedule)));

// LHS disabled, we return a default schedule.
if (!enable_latency_hiding_scheduler) {
return ScheduleMetadata{memory_limit};
}

// Case 3: LHS enabled.
//
// Run Latency Hiding Scheduler (LHS). It maximizes the compute-communication
// overlap, potentially at the cost of memory usage.
TF_RETURN_IF_ERROR(RunLatencyHidingSchedulerPasses(
module, pointer_size, fingerprint, memory_limit, gpu_device_info));
if (enable_latency_hiding_scheduler) {
TF_RETURN_IF_ERROR(RunLatencyHidingSchedulerPasses(
module, pointer_size, fingerprint, memory_limit, gpu_device_info));
}

return ScheduleMetadata{memory_limit};
}
Expand All @@ -614,54 +651,5 @@ HloInstructionSequence PostProcessSchedule(
return PostprocessorToScheduleAsEarlyOrLateAsPossible(result);
}

// Compute the device memory limit to be used by passes like scheduler and
// HLO rematerialization.
static int64_t GetSchedulerMemoryLimit(
const HloModule* module, const se::DeviceDescription& gpu_device_info,
int pointer_size) {
// There is a "base" value which is either specified in HloModuleConfig (this
// value should take into account the fact that we need to leave some memory
// free for allocations that happen outside of XLA's allocator) or
// obtained from GPU device info (we scale down this value to leave some space
// for these outside XLA's allocator allocation).
//
// From that base value, subtract any input and output sizes (assuming they
// are live throughout the execution) and then apply a slop factor.
const int64_t base_limit =
module->config().device_memory_size() != 0
? module->config().device_memory_size()
: gpu_device_info.device_memory_size() * 80 / 100;

// Find the total size of inputs and outputs.
int64_t total_io_size = 0;
for (HloInstruction* param :
module->entry_computation()->parameter_instructions()) {
ShapeUtil::ForEachSubshape(
param->shape(),
[&](const Shape& subshape, const ShapeIndex& /*index*/) {
total_io_size += GetSizeOfShape(subshape, pointer_size);
});
}
ShapeUtil::ForEachSubshape(
module->result_shape(),
[&](const Shape& subshape, const ShapeIndex& /*index*/) {
total_io_size += GetSizeOfShape(subshape, pointer_size);
});

// If any inputs and outputs are aliased, do not double count them.
module->input_output_alias_config().ForEachAlias(
[&](const ShapeIndex& output_index,
const HloInputOutputAliasConfig::Alias&) {
const Shape& subshape =
ShapeUtil::GetSubshape(module->result_shape(), output_index);
total_io_size -= GetSizeOfShape(subshape, pointer_size);
});

int64_t limit =
(base_limit - total_io_size) *
module->config().debug_options().xla_gpu_memory_limit_slop_factor() / 100;
return limit;
}

} // namespace gpu
} // namespace xla

0 comments on commit c4bc2b2

Please sign in to comment.