Merge branch 'fea-hps-torch-plugin-kingsleyl' into 'main'
Add torch extension for hps

Closes #833

See merge request dl/hugectr/hugectr!1434
minseokl committed Sep 21, 2023
2 parents ffb18fd + 72a4e65 commit 7c6c4d4
Showing 38 changed files with 1,631 additions and 25 deletions.
4 changes: 4 additions & 0 deletions .gitignore
@@ -58,6 +58,10 @@ docs/source/release_notes.md

docs/source/hps_tf/notebooks
docs/source/hps_trt/notebooks/
docs/source/hps_torch/notebooks/
hps_tf/_skbuild
hps_tf/dist/
hps_tf/merlin_hps.egg-info/
hps_torch/_skbuild
hps_torch/dist/
hps_torch/merlin_hps.egg-info/
1 change: 1 addition & 0 deletions .gitlab-ci.yml
@@ -208,6 +208,7 @@ build_pytorch_hps_trt_plugin:
variables:
FROM_IMAGE: ${IMAGE_PYTORCH}
DST_IMAGE: $PYTORCH_TRT_IMAGE_VERSIONED
BUILD_TORCH_PLUGIN: 1
BUILD_TRT_PLUGIN: 1
TRT_CMAKE_OPTION: "-DSM=\"70;75;80;90\""
#BUILD_HPS_BACKEND: 1
1 change: 1 addition & 0 deletions HugeCTR/include/hps/plugin/lookup_manager.hpp
@@ -26,6 +26,7 @@ using namespace HugeCTR;
typedef enum {
TENSORFLOW = 0,
TENSORRT = 1,
TORCH = 2,
} pluginType_t;

class LookupManager final {
15 changes: 15 additions & 0 deletions HugeCTR/src/hps/lookup_session.cpp
@@ -218,6 +218,21 @@ void LookupSession::lookup_with_table_fusion_impl(const void* keys, float* d_vec
}
}
}

  // Wait for the outputs of each original table to be ready
  {
    std::unique_lock lock(mutex_);
    if (!cv_.wait_for(lock, wait_duration_, [this, fused_table_id] {
          return counter_for_each_fused_table_[fused_table_id] ==
                 num_original_tables_in_each_fused_table_[fused_table_id];
        })) {
      HCTR_LOG_S(ERROR, WORLD) << "Timed out. The table fusion feature of HPS requires CPU "
                                  "multithreading for embedding lookup."
                               << std::endl;
      return;
    }
  }
}

void LookupSession::lookup_from_device_impl(const void* d_keys, float* d_vectors, size_t num_keys,
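The wait above is one half of a wait/notify handshake: the worker threads that perform the per-table lookups are expected to increment `counter_for_each_fused_table_[fused_table_id]` and notify `cv_` once their output is ready. A minimal Python sketch of the same pattern, with hypothetical names and purely for illustration (it is not the HugeCTR implementation):

```python
import threading


class FusedTableBarrier:
    """Toy wait/notify barrier mirroring the condition-variable pattern above."""

    def __init__(self, num_original_tables, timeout_s=1.0):
        self.cv = threading.Condition()
        self.num_original_tables = num_original_tables
        self.counter = 0
        self.timeout_s = timeout_s

    def table_done(self):
        # Called by each worker thread after its per-table lookup completes.
        with self.cv:
            self.counter += 1
            self.cv.notify_all()

    def wait_all(self):
        # Called by the thread that issued the fused lookup; mirrors cv_.wait_for().
        with self.cv:
            ok = self.cv.wait_for(
                lambda: self.counter == self.num_original_tables,
                timeout=self.timeout_s,
            )
            if not ok:
                raise TimeoutError("fused-table lookup timed out")


if __name__ == "__main__":
    barrier = FusedTableBarrier(num_original_tables=8)
    workers = [threading.Thread(target=barrier.table_done) for _ in range(8)]
    for w in workers:
        w.start()
    barrier.wait_all()  # returns once all 8 per-table lookups have reported completion
    for w in workers:
        w.join()
```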
4 changes: 4 additions & 0 deletions HugeCTR/src/hps/plugin/lookup_manager.cpp
@@ -143,6 +143,10 @@ bool LookupManager::init_check(parameter_server_config& ps_config, int32_t globa
}
break;
}
case TORCH: {
// Currently no check is needed for the HPS Torch plugin
break;
}
default: {
assert(!"Error: no such layer && should never get here!");
}
3 changes: 3 additions & 0 deletions ci/benchmark/hps_torch_fuse_table_benchmark/run.sub
@@ -0,0 +1,3 @@
#!/bin/bash

srun --container-image="${CONT}" --container-mounts="${MOUNTS}" bash -cx "bash /workdir/ci/benchmark/hps_torch_fuse_table_benchmark/test.sh"
32 changes: 32 additions & 0 deletions ci/benchmark/hps_torch_fuse_table_benchmark/test.sh
@@ -0,0 +1,32 @@
#!/usr/bin/env bash

mkdir -p /hps_torch_fuse_table_benchmark

cd /hps_torch_fuse_table_benchmark

cp -r /model_repo ./

cp -r /model_repo/8_table.json ./

cp -r /model_repo/embeddings ./

LD_PRELOAD=/usr/local/lib/python${PYTHON_VERSION}/dist-packages/merlin_hps-0.0.0-py${PYTHON_VERSION}-linux-x86_64.egg/hps_torch/lib/libhps_torch.so tritonserver --model-repository=model_repo --load-model=8_static_table_autofused --load-model=8_static_table_unfused --load-model=8_dynamic_table_autofused --load-model=8_dynamic_table_unfused --model-control-mode=explicit &

while [[ $(curl -v localhost:8000/v2/health/ready 2>&1 | grep "OK" | wc -l) -eq 0 ]]; do
sleep 10;
done

echo "Successfully launching the Triton server for all models"

batch_size=(256 1024 4096 16384)

model_name=("8_static_table_unfused" "8_static_table_autofused" "8_dynamic_table_unfused" "8_dynamic_table_autofused")

for b in ${batch_size[*]};
do
for m in ${model_name[*]};
do
echo $b $m
perf_analyzer -m ${m} -u localhost:8000 --input-data /perf_data/${b}.json --shape input_1:8,${b},10
done
done
18 changes: 15 additions & 3 deletions ci/dracorno/ci.yml
@@ -180,16 +180,28 @@ ebc_multi_node:
TEST_CMD: ./ci/integration_test/ebc/ebc.sub

### Stage: test
hierarchical_parameter_server:
hps_tf_plugin:
extends: .dracorno_test_job
needs:
- pipeline: $PARENT_PIPELINE_ID
job: build_tf_hps_trt_plugin
variables:
GPFSFOLDER: $DRACO_LOGDIR/hierarchical_parameter_server
GPFSFOLDER: $DRACO_LOGDIR/hps_tf
CONT: $TF_TRT_IMAGE_VERSIONED
MOUNTS: ${DRACO_DATASET}:${DATASET_MOUNT}
TEST_CMD: ./ci/integration_test/hps/hps.sub
TEST_CMD: ./ci/integration_test/hps/hps_tf.sub

# hps_torch_plugin
hps_torch_plugin:
extends: .dracorno_test_job
needs:
- pipeline: $PARENT_PIPELINE_ID
job: build_pytorch_hps_trt_plugin
variables:
GPFSFOLDER: $DRACO_LOGDIR/hps_torch
CONT: $PYTORCH_TRT_IMAGE_VERSIONED
MOUNTS: ${DRACO_DATASET}:${DATASET_MOUNT}
TEST_CMD: ./ci/integration_test/hps/hps_torch.sub

s3_backend_test:
extends: .dracorno_test_job
File renamed without changes.
25 changes: 25 additions & 0 deletions ci/integration_test/hps/hps_torch.sub
@@ -0,0 +1,25 @@
#!/bin/bash

srun --ntasks=1 --container-image="${CONT}" --container-mounts="${MOUNTS}" bash -cx " \
cd /workdir/hps_torch/test/unit && \
pytest -s && \
cd /workdir/hps_torch/test/integration && \
pytest test_hps_table_fusion.py -s"

# Workaround for a known TensorFlow issue
sleep 10
EXITCODE=`sacct -j "${SLURM_JOBID}" -n --format=exitcode | sort -r -u | head -1 | cut -f 1 -d":" | sed 's/ //g'`
echo "Job exit code: ${EXITCODE}"

if [ ${EXITCODE} -eq 6 ] && [ "${CI_JOB_NAME}" == "hps_tf" ]; then
echo "Rerun the job if the exit code is 6 and the job name is hps_tf."
srun --ntasks=1 --container-image="${CONT}" --container-mounts="${MOUNTS}" bash -cx " \
cd /workdir/hps_torch/test/unit && \
pytest -s && \
cd /workdir/hps_torch/test/integration && \
pytest test_hps_table_fusion.py -s"
# Get the last job step exit code as job exit code.
sed -i 's/sort -r -u | head -1/tail -1/g' ${JOBSCRIPTSDIR}/mlperf-ci/jobexitcode.sh
echo "Rerun job finished!";
fi

4 changes: 4 additions & 0 deletions ci/post_test/check_hps_torch_fuse_table_benchmark.sub
@@ -0,0 +1,4 @@
#!/bin/bash

srun --ntasks=1 --container-image="${CONT}" --container-mounts="${MOUNTS}" bash -cx " \
python3 /workdir/ci/post_test/check_performance.py --job_name hps_torch_fuse_table_benchmark --log_path /logs"
13 changes: 11 additions & 2 deletions ci/post_test/check_performance.py
@@ -61,6 +61,10 @@
"cmd_log": r"compute infer",
"result_log": r"compute infer (\d+\.?\d*) usec",
},
"hps_torch_fuse_table_benchmark": {
"cmd_log": r"compute infer",
"result_log": r"compute infer (\d+\.?\d*) usec",
},
"hps_tf_fuse_table_benchmark": {
"cmd_log": r"compute infer",
"result_log": r"compute infer (\d+\.?\d*) usec",
@@ -100,6 +104,7 @@ def extract_result_from_log(job_name, log_path):
results.append(result)
if (
job_name == "hps_plugin_benchmark"
or job_name == "hps_torch_fuse_table_benchmark"
or job_name == "hps_tf_fuse_table_benchmark"
or job_name == "147gb_model_benchmark"
):
@@ -316,11 +321,15 @@ def check_perf_result(perf_result, expected_result):
expected = expected_result[model_name][batch_size]
check_perf_result(perf, expected)
idx += 1
elif args.job_name == "hps_tf_fuse_table_benchmark":
elif (
args.job_name == "hps_tf_fuse_table_benchmark"
or args.job_name == "hps_torch_fuse_table_benchmark"
):
perf_result = extract_result_from_log(args.job_name, args.log_path)
idx = 0
batch_sizes = ["256", "1024", "4096", "16384"]
print("HPS Fuse Table TF Model Inference Latency (usec)")
print(f"Job Name: {args.job_name}")
print("HPS Fuse Table Model Inference Latency (usec)")
print("-" * 137)
print(
"batch_size\t8_static_table_unfused\t\t8_static_table_autofused\t8_dynamic_table_unfused\t\t8_dynamic_table_autofused"
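The `result_log` pattern registered above extracts the compute-infer latency from the perf_analyzer output before it is compared against the ceilings in `perf_benchmark.json`. A standalone sketch of that extraction and a threshold check in the spirit of `check_perf_result` (the sample log text and ceiling below are made up for illustration):

```python
import re

# Matches lines such as "  compute infer 812.4 usec" in perf_analyzer output.
RESULT_LOG = re.compile(r"compute infer (\d+\.?\d*) usec")

sample_log = """
  Avg request latency: 1500 usec
  compute input 20.1 usec
  compute infer 812.4 usec
  compute output 15.3 usec
"""

latencies = [float(m) for m in RESULT_LOG.findall(sample_log)]
expected_ceiling_usec = 1000.0  # e.g. a per-batch-size value from perf_benchmark.json

for latency in latencies:
    assert latency <= expected_ceiling_usec, (
        f"measured {latency} usec exceeds the expected ceiling of {expected_ceiling_usec} usec"
    )
print("latencies:", latencies)
```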
6 changes: 6 additions & 0 deletions ci/post_test/perf_benchmark.json
@@ -12,6 +12,12 @@
"fp32_trt_with_hps": {"32":600, "1024":1000, "16384":5000},
"fp16_trt_with_hps": {"32":500, "1024":800, "16384":4000}
},
"hps_torch_fuse_table_benchmark": {
"8_static_table_unfused": {"256":900, "1024":1000, "4096":1300, "16384":3000},
"8_static_table_autofused": {"256":700, "1024":800, "4096":1300, "16384":3600},
"8_dynamic_table_unfused": {"256":2200, "1024":5400, "4096":9500, "16384":13000},
"8_dynamic_table_autofused": {"256":1000, "1024":1300, "4096":2600, "16384":6000}
},
"hps_tf_fuse_table_benchmark": {
"8_static_table_unfused": {"256":1000, "1024":1200, "4096":1600, "16384":3500},
"8_static_table_autofused": {"256":800, "1024":1000, "4096":1800, "16384":4500},
39 changes: 37 additions & 2 deletions ci/selene/ci.yml
@@ -303,7 +303,7 @@ hugectr2onnx:
TEST_CMD: ./ci/integration_test/hugectr2onnx/hugectr2onnx.sub

# hps_tf_plugin
hierarchical_parameter_server:
hps_tf_plugin:
extends: .selene_test_job
needs:
- pipeline: $PARENT_PIPELINE_ID
@@ -312,7 +312,19 @@ hierarchical_parameter_server:
GPFSFOLDER: $LOGDIR/hps_tf
CONT: $TF_TRT_IMAGE_VERSIONED
MOUNTS: ${DATASET}:${DATASET_MOUNT}
TEST_CMD: ./ci/integration_test/hps/hps.sub
TEST_CMD: ./ci/integration_test/hps/hps_tf.sub

# hps_torch_plugin
hps_torch_plugin:
extends: .selene_test_job
needs:
- pipeline: $PARENT_PIPELINE_ID
job: build_pytorch_hps_trt_plugin
variables:
GPFSFOLDER: $LOGDIR/hps_torch
CONT: $PYTORCH_TRT_IMAGE_VERSIONED
MOUNTS: ${DATASET}:${DATASET_MOUNT}
TEST_CMD: ./ci/integration_test/hps/hps_torch.sub

# embedding_plugin
sparse_operation_kit_ut-TF2:
@@ -404,6 +416,18 @@ gcs_backend_test:
CONT: $TRAIN_IMAGE_VERSIONED_WITH_GCS
TEST_CMD: ./ci/integration_test/gcs/gcs_backend_test.sub

hps_torch_fuse_table_benchmark:
extends: .selene_test_job
needs:
- pipeline: $PARENT_PIPELINE_ID
job: build_pytorch_hps_trt_plugin
variables:
GPFSFOLDER: $LOGDIR/hps_torch_fuse_table_benchmark
CONT: $PYTORCH_TRT_IMAGE_VERSIONED
MOUNTS: /lustre/fsw/devtech/hpc-hugectr/hps_torch_fuse_table_benchmark/ci_model_repo:/model_repo,/lustre/fsw/devtech/hpc-hugectr/hps_torch_fuse_table_benchmark/perf_data:/perf_data
WALLTIME: "00:45:00"
TEST_CMD: ./ci/benchmark/hps_torch_fuse_table_benchmark/run.sub

hps_tf_fuse_table_benchmark:
extends: .selene_test_job
needs:
@@ -451,6 +475,17 @@ dlrm_dcnv2_1node_check:
WALLTIME: "00:15:00"
TEST_CMD: ./ci/post_test/check_dcnv2_dlrm_1node.sub

hps_torch_fuse_table_benchmark_check:
extends: .selene_post_test_job
needs:
- hps_torch_fuse_table_benchmark
variables:
GPFSFOLDER: $LOGDIR/hps_torch_fuse_table_benchmark_check
CONT: $PYTORCH_TRT_IMAGE_VERSIONED
MOUNTS: $LOGDIR/hps_torch_fuse_table_benchmark:/logs
WALLTIME: "00:15:00"
TEST_CMD: ./ci/post_test/check_hps_torch_fuse_table_benchmark.sub

hps_tf_fuse_table_benchmark_check:
extends: .selene_post_test_job
needs:
5 changes: 5 additions & 0 deletions ci/template.yml
@@ -146,6 +146,7 @@ stages:
- echo "BUILD_HUGECTR2ONNX=${BUILD_HUGECTR2ONNX}"
- echo "BUILD_SOK=${BUILD_SOK}"
- echo "BUILD_TF_PLUGIN=${BUILD_TF_PLUGIN}"
- echo "BUILD_TORCH_PLUGIN=${BUILD_TORCH_PLUGIN}"
#- git submodule update --init --recursive
- if [[ "$TEST_NEW_IMAGE" == "1" ]]; then
echo "FROM ${FROM_IMAGE}.new_image" > ${JOB_DOCKERFILE};
@@ -181,6 +182,10 @@
echo "RUN pip install tf2onnx" >> ${JOB_DOCKERFILE};
echo "RUN cd /workdir/hps_tf/ && python setup.py install" >> ${JOB_DOCKERFILE};
fi
- if [[ "$BUILD_TORCH_PLUGIN" == 1 ]]; then
echo "RUN pip install ninja" >> ${JOB_DOCKERFILE};
echo "RUN cd /workdir/hps_torch/ && TORCH_CUDA_ARCH_LIST=\"7.0 7.5 8.0 9.0\" python setup.py install" >> ${JOB_DOCKERFILE};
fi
- if [[ "$BUILD_TRT_PLUGIN" == 1 ]]; then
echo "RUN pip install tf2onnx" >> ${JOB_DOCKERFILE};
echo "RUN mkdir /workdir/hps_trt/build && cd /workdir/hps_trt/build && cmake ${TRT_CMAKE_OPTION} .. && make -j\$(nproc) && make install" >> ${JOB_DOCKERFILE};
3 changes: 2 additions & 1 deletion docs/source/conf.py
@@ -184,7 +184,8 @@
"../../notebooks",
"../../release_notes.md",
"../../hps_tf/notebooks",
"../../hps_trt/notebooks",
"../../hps_trt/notebooks",
"../../hps_torch/notebooks",
]
copydirs_file_rename = {
"README.md": "index.md",
@@ -117,7 +117,7 @@ Set the `supportlonglong` field to `True` when you need to use a 64-bit integer
You must set this field to `true` if you specify `True` for the `i64_input_key` parameter.
The default value is `True`.

Set the `fuse_embedding_table` field to `True` when you want to fuse embedding tables. Tables with the same embedding vector size are fused in storage during HPS initialization. At each iteration, the original lookup queries are packed into one via CPU multi-thread synchronization, and the packed query is forwarded to the fused embedding table. To use this feature, please ensure that key values in different tables have no overlap and that the embedding lookup layers have no dependency on each other in the model graph. This is only valid for [HPS Plugin for TensorFlow](hps_tf_user_guide.md) and [HPS Backend for Triton Inference Server](https://github.com/triton-inference-server/hugectr_backend/tree/main/hps_backend). The default value is `False`.
Set the `fuse_embedding_table` field to `True` when you want to fuse embedding tables. Tables with the same embedding vector size are fused in storage during HPS initialization. At each iteration, the original lookup queries are packed into one via CPU multi-thread synchronization, and the packed query is forwarded to the fused embedding table. To use this feature, please ensure that key values in different tables have no overlap and that the embedding lookup layers have no dependency on each other in the model graph. This is valid for [HPS Plugin for TensorFlow](hps_tf_user_guide.md), [HPS Plugin for Torch](hps_torch_user_guide.md) and [HPS Backend for Triton Inference Server](https://github.com/triton-inference-server/hugectr_backend/tree/main/hps_backend). The default value is `False`.

The following sections describe the configuration parameters.
Generally speaking, each node in your HugeCTR cluster should deploy the same configuration.
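As a concrete illustration of the `fuse_embedding_table` field described above, a minimal parameter-server configuration fragment could look like the following sketch. The per-model entry is schematic and omits required fields, so treat everything except `supportlonglong` and `fuse_embedding_table` as placeholders and consult the HPS configuration reference for the full schema:

```python
import json

# Schematic fragment only: the "models" entry is a placeholder, not a complete HPS config.
ps_config = {
    "supportlonglong": True,
    "fuse_embedding_table": True,  # fuse tables that share the same embedding vector size
    "models": [
        {
            "model": "demo_model",                     # placeholder model name
            "sparse_files": ["table0", "table1"],      # placeholder embedding table files
            "embedding_vecsize_per_table": [16, 16],   # equal sizes make the tables fusable
            "deployed_device_list": [0],
            "max_batch_size": 1024,
        }
    ],
}

with open("hps_demo.json", "w") as f:
    json.dump(ps_config, f, indent=2)
```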
@@ -74,6 +74,7 @@ We support the following compute capabilities:
| 7.0 | NVIDIA V100 (Volta) | 70 |
| 7.5 | NVIDIA T4 (Turing) | 75 |
| 8.0 | NVIDIA A100 (Ampere) | 80 |
| 9.0 | NVIDIA H100 (Hopper) | 90 |

### Installing HPS Using NGC Containers

@@ -0,0 +1,7 @@
HPS Plugin for Torch API
===========================

.. toctree::
:maxdepth: 2

LookupLayer <lookup_layer>
@@ -0,0 +1,35 @@
# HPS Plugin for Torch

```{contents}
---
depth: 2
local: true
backlinks: none
---
```

#### LookupLayer class

This is a wrapper class for the HPS lookup layer, which performs the same function as ``torch.nn.Embedding`` but retrieves the embedding vectors from HPS. It inherits from ``torch.nn.Module``.

```python
hps_torch.LookupLayer.__init__
```
**Arguments**
* `ps_config_file`: String. The JSON configuration file for HPS initialization.

* `model_name`: String. The name of the model that has embedding tables.

* `table_id`: Integer. The index of the embedding table for the model specified by `model_name`.

* `emb_vec_size`: Integer. The embedding vector size for the embedding table specified by `model_name` and `table_id`.


```python
hps_torch.LookupLayer.forward
```
**Arguments**
* `keys`: Tensor of ``torch.int32`` or ``torch.int64``.

**Returns**
* `vectors`: Tensor of `torch.float32`.
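
A minimal usage sketch of the two calls documented above; the configuration path, model name, table index, and key shape are placeholders that must match an actual HPS JSON configuration:

```python
import torch
import hps_torch

# Placeholder arguments: they must correspond to an embedding table described in
# the HPS JSON configuration file.
lookup = hps_torch.LookupLayer(
    ps_config_file="hps_demo.json",
    model_name="demo_model",
    table_id=0,
    emb_vec_size=16,
)

# Keys may be torch.int32 or torch.int64, as documented above; they are placed on the GPU here.
keys = torch.randint(0, 100_000, (1024, 10), dtype=torch.int64, device="cuda")
vectors = lookup(keys)  # torch.float32; expected shape (1024, 10, 16)
print(vectors.shape)
```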