Skip to content

Commit

Permalink
Merge branch 'ci-computex-demo-kingsleyl' into 'main'
Browse files Browse the repository at this point in the history
Extend the 147GB HPS_TRT benchmark CI to cover 1 FC, 3 FC and DLRM models

See merge request dl/hugectr/hugectr!1452
  • Loading branch information
minseokl committed Sep 1, 2023
2 parents ef4f0e2 + c9882e3 commit c9ace48
Show file tree
Hide file tree
Showing 7 changed files with 74 additions and 52 deletions.
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
#!/bin/bash

srun --container-image="${CONT}" --container-mounts="${MOUNTS}" bash -cx "bash /workdir/ci/benchmark/3fc_147gb_model_benchmark/test.sh"
srun --container-image="${CONT}" --container-mounts="${MOUNTS}" bash -cx "bash /workdir/ci/benchmark/147gb_model_benchmark/test.sh"
42 changes: 42 additions & 0 deletions ci/benchmark/147gb_model_benchmark/test.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
#!/usr/bin/env bash
#
# CI benchmark for the 147GB HPS-TRT models (1 FC, 3 FC and DLRM).
#
# Steps:
#   1. Stage light.json, dynamic_build.py, the ONNX files and the
#      dynamic*trt model directories from /model_repo into a scratch
#      directory.
#   2. Run dynamic_build.py (expected to emit one <model>.trt engine per
#      model in the current directory) and move each engine to path "1"
#      inside its model directory.
#   3. Launch tritonserver with the HPS plugin preloaded and wait until its
#      readiness endpoint answers successfully.
#   4. Run perf_analyzer for every (batch size, model) combination.
#
# Requirements: /model_repo must be mounted and contain the files listed
# above plus perf_data/<batch_size>.json for every benchmarked batch size.

# Abort on the first failing step so a broken setup is reported immediately
# instead of benchmarking a half-prepared model repository.
set -euo pipefail

readonly WORK_DIR=/147gb_model_benchmark
readonly SRC_REPO=/model_repo
readonly SERVER_URL=localhost:8000
readonly HPS_PLUGIN=/usr/local/hps_trt/lib/libhps_plugin.so

readonly -a batch_sizes=(256 1024 4096 16384)
readonly -a model_names=(
  "dynamic_1fc_lite_hps_trt"
  "dynamic_3fc_lite_hps_trt"
  "dynamic_dlrm_hps_trt"
)

mkdir -p "${WORK_DIR}/model_repo"
cd "${WORK_DIR}"

cp "${SRC_REPO}/light.json" ./
cp "${SRC_REPO}/dynamic_build.py" ./
cp "${SRC_REPO}"/*.onnx ./
cp -r "${SRC_REPO}"/dynamic*trt ./model_repo

python3 dynamic_build.py

# Each engine file is named after its model without the "_hps_trt" suffix,
# e.g. dynamic_1fc_lite_hps_trt -> dynamic_1fc_lite.trt.
load_args=()
for m in "${model_names[@]}"; do
  mv "${m%_hps_trt}.trt" "model_repo/${m}/1"
  load_args+=("--load-model=${m}")
done

LD_PRELOAD="${HPS_PLUGIN}" tritonserver \
  --model-repository=model_repo \
  "${load_args[@]}" \
  --model-control-mode=explicit &
server_pid=$!

# Stop the background server on every exit path (success or failure).
stop_server() {
  if kill -0 "${server_pid}" 2>/dev/null; then
    kill "${server_pid}" 2>/dev/null || true
    wait "${server_pid}" 2>/dev/null || true
  fi
}
trap stop_server EXIT

# Poll the readiness endpoint; --fail makes curl return non-zero for HTTP
# error statuses, so the loop ends only on a successful response. If the
# server process has died (e.g. a model failed to load) give up right away
# rather than polling until the job's wall-time limit.
until curl --silent --fail --output /dev/null "${SERVER_URL}/v2/health/ready"; do
  if ! kill -0 "${server_pid}" 2>/dev/null; then
    echo "tritonserver exited before becoming ready" >&2
    exit 1
  fi
  sleep 10
done

echo "Successfully launching the Triton server for all models"

# Keep batch size as the outer loop and model as the inner loop: the order
# of the emitted results is what downstream log parsing sees.
for b in "${batch_sizes[@]}"; do
  for m in "${model_names[@]}"; do
    echo "${b} ${m}"
    perf_analyzer -m "${m}" -u "${SERVER_URL}" \
      --input-data "${SRC_REPO}/perf_data/${b}.json" \
      --shape "categorical_features:${b},26" \
      --shape "numerical_features:${b},13"
  done
done
28 changes: 0 additions & 28 deletions ci/benchmark/3fc_147gb_model_benchmark/test.sh

This file was deleted.

Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
#!/bin/bash

srun --ntasks=1 --container-image="${CONT}" --container-mounts="${MOUNTS}" bash -cx " \
python3 /workdir/ci/post_test/check_performance.py --job_name 3fc_147gb_model_benchmark --log_path /logs"
python3 /workdir/ci/post_test/check_performance.py --job_name 147gb_model_benchmark --log_path /logs"
28 changes: 17 additions & 11 deletions ci/post_test/check_performance.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@
"cmd_log": r"compute infer",
"result_log": r"compute infer (\d+\.?\d*) usec",
},
"3fc_147gb_model_benchmark": {
"147gb_model_benchmark": {
"cmd_log": r"compute infer",
"result_log": r"compute infer (\d+\.?\d*) usec",
},
Expand Down Expand Up @@ -101,7 +101,7 @@ def extract_result_from_log(job_name, log_path):
if (
job_name == "hps_plugin_benchmark"
or job_name == "hps_tf_fuse_table_benchmark"
or job_name == "3fc_147gb_model_benchmark"
or job_name == "147gb_model_benchmark"
):
return results
return sum(results) / len(results) if len(results) > 0 else float("inf")
Expand Down Expand Up @@ -348,25 +348,31 @@ def check_perf_result(perf_result, expected_result):
expected = expected_result[model_name][batch_size]
check_perf_result(perf, expected)
idx += 1
elif args.job_name == "3fc_147gb_model_benchmark":
elif args.job_name == "147gb_model_benchmark":
perf_result = extract_result_from_log(args.job_name, args.log_path)
idx = 0
batch_sizes = ["256", "1024", "4096", "16384"]
print("3FC 147GB Model Inference Latency (usec)")
print("-" * 137)
print("batch_size\tdynamic_3fc_lite_hps_trt")
print("-" * 137)
for i in range(len(perf_result)):
print("147GB Model Inference Latency (usec)")
print("-" * 100)
print(
"batch_size\tdynamic_1fc_lite_hps_trt\tdynamic_3fc_lite_hps_trt\tdynamic_dlrm_hps_trt"
)
print("-" * 100)
for i in range(len(perf_result) // 3):
print(
"{}\t\t{}".format(
"{}\t\t{}\t\t\t\t{}\t\t\t\t{}".format(
batch_sizes[i],
perf_result[i],
perf_result[i * 3],
perf_result[i * 3 + 1],
perf_result[i * 3 + 2],
)
)
print("-" * 137)
print("-" * 100)
for batch_size in batch_sizes:
for model_name in [
"dynamic_1fc_lite_hps_trt",
"dynamic_3fc_lite_hps_trt",
"dynamic_dlrm_hps_trt",
]:
perf = perf_result[idx]
expected = expected_result[model_name][batch_size]
Expand Down
6 changes: 4 additions & 2 deletions ci/post_test/perf_benchmark.json
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,10 @@
"8_dynamic_table_unfused": {"256":2200, "1024":5200, "4096":9500, "16384":13000},
"8_dynamic_table_autofused": {"256":1100, "1024":1500, "4096":3000, "16384":7000}
},
"3fc_147gb_model_benchmark": {
"dynamic_3fc_lite_hps_trt": {"256":300, "1024":600, "4096":2000, "16384":6000}
"147gb_model_benchmark": {
"dynamic_1fc_lite_hps_trt": {"256":250, "1024":550, "4096":1900, "16384":5800},
"dynamic_3fc_lite_hps_trt": {"256":300, "1024":600, "4096":2000, "16384":6000},
"dynamic_dlrm_hps_trt": {"256":500, "1024":800, "4096":2500, "16384":7000}
},
"hps_backend_benchmark": {
"avg_latency": {"256":600, "1024":1700, "2048":9000, "8192":36000, "131072":520000}
Expand Down
18 changes: 9 additions & 9 deletions ci/selene/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -497,25 +497,25 @@ hps_tf_fuse_table_benchmark_check:
WALLTIME: "00:15:00"
TEST_CMD: ./ci/post_test/check_hps_tf_fuse_table_benchmark.sub

3fc_147gb_model_benchmark:
147gb_model_benchmark:
extends: .selene_test_job
needs:
- pipeline: $PARENT_PIPELINE_ID
job: build_tf_hps_trt_plugin
variables:
GPFSFOLDER: $LOGDIR/3fc_147gb_model_benchmark
GPFSFOLDER: $LOGDIR/147gb_model_benchmark
CONT: $TF_TRT_IMAGE_VERSIONED
MOUNTS: /lustre/fsw/devtech/hpc-hugectr/hps_tf_benchmark/3fc_147gb_ci_model_repo:/model_repo
MOUNTS: /lustre/fsw/devtech/hpc-hugectr/hps_tf_benchmark/147gb_ci_model_repo:/model_repo
WALLTIME: "00:45:00"
TEST_CMD: ./ci/benchmark/3fc_147gb_model_benchmark/run.sub
TEST_CMD: ./ci/benchmark/147gb_model_benchmark/run.sub

3fc_147gb_model_benchmark_check:
147gb_model_benchmark_check:
extends: .selene_post_test_job
needs:
- 3fc_147gb_model_benchmark
- 147gb_model_benchmark
variables:
GPFSFOLDER: $LOGDIR/3fc_147gb_model_benchmark_check
GPFSFOLDER: $LOGDIR/147gb_model_benchmark_check
CONT: $TF_TRT_IMAGE_VERSIONED
MOUNTS: $LOGDIR/3fc_147gb_model_benchmark:/logs
MOUNTS: $LOGDIR/147gb_model_benchmark:/logs
WALLTIME: "00:15:00"
TEST_CMD: ./ci/post_test/check_3fc_147gb_model_benchmark.sub
TEST_CMD: ./ci/post_test/check_147gb_model_benchmark.sub

0 comments on commit c9ace48

Please sign in to comment.