From dde1d13e0bb13d0026c81531d797359d81f24b11 Mon Sep 17 00:00:00 2001 From: Eli Fajardo Date: Wed, 17 Apr 2024 15:51:20 -0400 Subject: [PATCH 01/38] Use conda env create --yes instead of --force (#1636) conda dropped support for the --force flag to conda env create. This changes that flag name to --yes. See https://github.com/conda/conda/blob/main/CHANGELOG.md#2430-2024-03-12 and https://github.com/rapidsai/miniforge-cuda/pull/63 for more info. ## By Submitting this PR I confirm: - I am familiar with the [Contributing Guidelines](https://github.com/nv-morpheus/Morpheus/blob/main/docs/source/developer_guide/contributing.md). - When the PR is ready for review, new or existing tests cover these changes. - When the PR is ready for review, the documentation is up to date with these changes. Authors: - Eli Fajardo (https://github.com/efajardo-nv) Approvers: - David Gardner (https://github.com/dagardner-nv) URL: https://github.com/nv-morpheus/Morpheus/pull/1636 --- ci/check_style.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/check_style.sh b/ci/check_style.sh index 9205625726..beb561bb4f 100755 --- a/ci/check_style.sh +++ b/ci/check_style.sh @@ -16,7 +16,7 @@ rapids-dependency-file-generator \ --file_key checks \ --matrix "cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch);py=${RAPIDS_PY_VERSION}" | tee env.yaml -rapids-mamba-retry env create --force -f env.yaml -n checks +rapids-mamba-retry env create --yes -f env.yaml -n checks conda activate checks # Run pre-commit checks From 6b9cb71b0fd2b9ffa47202302cd40b340bb366c4 Mon Sep 17 00:00:00 2001 From: David Gardner <96306125+dagardner-nv@users.noreply.github.com> Date: Thu, 18 Apr 2024 07:47:22 -0700 Subject: [PATCH 02/38] Misc CI improvements (#1618) * Fetch git tags when performing documentation builds. Allowing for the version number to appear properly in the generated documentation. This should allow for us to publish the documentation build from CI when performing a release. * Allow overriding the GIT_URL, useful when performing CI against a commit/branch/tag that exists in a remote other than origin * Replace list of CUDA architectures with RAPIDS place-holder (we were building for 60 even though we no longer support it) * Construct the `CMAKE_BUILD_ALL_FEATURES` var in a more readable way * Allow overriding the build dir, useful for local builds using `USE_HOST_GIT=1` to avoid conflicting with a potentially existing build directory * Move generated env.yaml to `$WORKSPACE_TMP`, prevents the file from being written to the root of the git repo. * Rather than init submodules in each stage by hand, use the submodules flag in the checkout action ## By Submitting this PR I confirm: - I am familiar with the [Contributing Guidelines](https://github.com/nv-morpheus/Morpheus/blob/main/docs/source/developer_guide/contributing.md). - When the PR is ready for review, new or existing tests cover these changes. - When the PR is ready for review, the documentation is up to date with these changes. 
Authors: - David Gardner (https://github.com/dagardner-nv) Approvers: - Michael Demoret (https://github.com/mdemoret-nv) URL: https://github.com/nv-morpheus/Morpheus/pull/1618 --- .github/workflows/ci_pipe.yml | 7 +++++ ci/scripts/bootstrap_local_ci.sh | 2 ++ ci/scripts/common.sh | 2 +- ci/scripts/github/build.sh | 24 ++++++----------- ci/scripts/github/checks.sh | 24 ++++++----------- ci/scripts/github/common.sh | 35 ++++++++++++++++++++++--- ci/scripts/github/conda.sh | 2 -- ci/scripts/github/docs.sh | 16 +++++------- ci/scripts/github/test.sh | 20 +++++--------- ci/scripts/run_ci_local.sh | 45 ++++++++++++++++++++------------ 10 files changed, 99 insertions(+), 78 deletions(-) diff --git a/.github/workflows/ci_pipe.yml b/.github/workflows/ci_pipe.yml index 0ffd718b3d..34dfbd38cd 100644 --- a/.github/workflows/ci_pipe.yml +++ b/.github/workflows/ci_pipe.yml @@ -85,6 +85,7 @@ jobs: lfs: false path: 'morpheus' fetch-depth: 0 + submodules: 'recursive' - name: Get AWS credentials using OIDC uses: aws-actions/configure-aws-credentials@v1-node16 @@ -115,6 +116,7 @@ jobs: with: lfs: false path: 'morpheus' + submodules: 'recursive' - name: Get AWS credentials using OIDC uses: aws-actions/configure-aws-credentials@v1-node16 @@ -149,6 +151,7 @@ jobs: with: lfs: false path: 'morpheus' + submodules: 'recursive' - name: Get AWS credentials using OIDC uses: aws-actions/configure-aws-credentials@v1-node16 @@ -180,6 +183,9 @@ jobs: with: lfs: false path: 'morpheus' + # Fetch tags so that documentation builds for releases will report the version number correctly + fetch-tags: true + submodules: 'recursive' - name: Get AWS credentials using OIDC uses: aws-actions/configure-aws-credentials@v1-node16 @@ -213,6 +219,7 @@ jobs: lfs: false path: 'morpheus' fetch-depth: 0 + submodules: 'recursive' - name: Get AWS credentials using OIDC uses: aws-actions/configure-aws-credentials@v1-node16 diff --git a/ci/scripts/bootstrap_local_ci.sh b/ci/scripts/bootstrap_local_ci.sh index 3051b13af1..45c68b3ae4 100755 --- a/ci/scripts/bootstrap_local_ci.sh +++ b/ci/scripts/bootstrap_local_ci.sh @@ -25,6 +25,8 @@ else git checkout ${GIT_BRANCH} git pull git checkout ${GIT_COMMIT} + git fetch --tags + git submodule update --init --recursive fi export MORPHEUS_ROOT=$(pwd) diff --git a/ci/scripts/common.sh b/ci/scripts/common.sh index 3bfa4e0870..75e83a0c7c 100644 --- a/ci/scripts/common.sh +++ b/ci/scripts/common.sh @@ -73,7 +73,7 @@ function get_modified_files() { local GIT_DIFF_BASE=${GIT_DIFF_BASE:-$(get_merge_base)} # If invoked by a git-commit-hook, this will be populated - local result=( $(git diff ${GIT_DIFF_ARGS} $(get_merge_base) | grep -P ${1:-'.*'}) ) + local result=( $(git diff ${GIT_DIFF_ARGS} ${GIT_DIFF_BASE} | grep -P ${1:-'.*'}) ) local files=() diff --git a/ci/scripts/github/build.sh b/ci/scripts/github/build.sh index b75107f637..5941ca03bd 100755 --- a/ci/scripts/github/build.sh +++ b/ci/scripts/github/build.sh @@ -21,41 +21,33 @@ source ${WORKSPACE}/ci/scripts/github/common.sh rapids-dependency-file-generator \ --output conda \ --file_key build \ - --matrix "cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch);py=${RAPIDS_PY_VERSION}" | tee env.yaml + --matrix "cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch);py=${RAPIDS_PY_VERSION}" | tee "${WORKSPACE_TMP}/env.yaml" -update_conda_env env.yaml +update_conda_env "${WORKSPACE_TMP}/env.yaml" log_toolchain -git submodule update --init --recursive - CMAKE_FLAGS="${CMAKE_BUILD_ALL_FEATURES}" CMAKE_FLAGS="${CMAKE_FLAGS} -DMORPHEUS_PYTHON_BUILD_WHEEL=ON" CMAKE_FLAGS="${CMAKE_FLAGS} 
-DMORPHEUS_PYTHON_BUILD_STUBS=OFF" CMAKE_FLAGS="${CMAKE_FLAGS} -DCMAKE_BUILD_RPATH_USE_ORIGIN=ON" -if [[ "${LOCAL_CI}" == "" ]]; then - CMAKE_FLAGS="${CMAKE_FLAGS} -DCCACHE_PROGRAM_PATH=$(which sccache)" -fi rapids-logger "Configuring cmake for Morpheus with ${CMAKE_FLAGS}" -cmake -B build -G Ninja ${CMAKE_FLAGS} . +cmake ${CMAKE_FLAGS} . rapids-logger "Building Morpheus" -cmake --build build --parallel ${PARALLEL_LEVEL} +cmake --build ${BUILD_DIR} --parallel ${PARALLEL_LEVEL} -if [[ "${LOCAL_CI}" == "" ]]; then - rapids-logger "sccache usage for morpheus build:" - sccache --show-stats -fi +log_sccache_stats rapids-logger "Archiving results" -tar cfj "${WORKSPACE_TMP}/wheel.tar.bz" build/dist +tar cfj "${WORKSPACE_TMP}/wheel.tar.bz" ${BUILD_DIR}/dist -MORPHEUS_LIBS=($(find ${MORPHEUS_ROOT}/build/morpheus/_lib -name "*.so" -exec realpath --relative-to ${MORPHEUS_ROOT} {} \;) \ +MORPHEUS_LIBS=($(find ${MORPHEUS_ROOT}/${BUILD_DIR}/morpheus/_lib -name "*.so" -exec realpath --relative-to ${MORPHEUS_ROOT} {} \;) \ $(find ${MORPHEUS_ROOT}/examples -name "*.so" -exec realpath --relative-to ${MORPHEUS_ROOT} {} \;)) tar cfj "${WORKSPACE_TMP}/morhpeus_libs.tar.bz" "${MORPHEUS_LIBS[@]}" -CPP_TESTS=($(find ${MORPHEUS_ROOT}/build/morpheus/_lib/tests -name "*.x" -exec realpath --relative-to ${MORPHEUS_ROOT} {} \;)) +CPP_TESTS=($(find ${MORPHEUS_ROOT}/${BUILD_DIR}/morpheus/_lib/tests -name "*.x" -exec realpath --relative-to ${MORPHEUS_ROOT} {} \;)) tar cfj "${WORKSPACE_TMP}/cpp_tests.tar.bz" "${CPP_TESTS[@]}" rapids-logger "Pushing results to ${DISPLAY_ARTIFACT_URL}" diff --git a/ci/scripts/github/checks.sh b/ci/scripts/github/checks.sh index 487e053c7a..22f06f1557 100755 --- a/ci/scripts/github/checks.sh +++ b/ci/scripts/github/checks.sh @@ -21,42 +21,34 @@ source ${WORKSPACE}/ci/scripts/github/common.sh rapids-dependency-file-generator \ --output conda \ --file_key build \ - --matrix "cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch);py=${RAPIDS_PY_VERSION}" | tee env.yaml + --matrix "cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch);py=${RAPIDS_PY_VERSION}" | tee "${WORKSPACE_TMP}/env.yaml" -update_conda_env env.yaml +update_conda_env "${WORKSPACE_TMP}/env.yaml" log_toolchain cd ${MORPHEUS_ROOT} +# Fetching the base branch will try methods that might fail, then fallback to one that does, set +e for this section +set +e fetch_base_branch - -git submodule update --init --recursive +set -e rapids-logger "Configuring cmake for Morpheus" CMAKE_FLAGS="${CMAKE_BUILD_ALL_FEATURES}" CMAKE_FLAGS="${CMAKE_FLAGS} -DMORPHEUS_PYTHON_BUILD_STUBS=OFF" export CMAKE_FLAGS="${CMAKE_FLAGS} -DMORPHEUS_PYTHON_INPLACE_BUILD=ON" -if [[ "${LOCAL_CI}" == "" ]]; then - CMAKE_FLAGS="${CMAKE_FLAGS} -DCCACHE_PROGRAM_PATH=$(which sccache)" -fi -cmake -B build -G Ninja ${CMAKE_FLAGS} . +cmake ${CMAKE_FLAGS} . 
rapids-logger "Building Morpheus" -cmake --build build --parallel ${PARALLEL_LEVEL} +cmake --build ${BUILD_DIR} --parallel ${PARALLEL_LEVEL} -if [[ "${LOCAL_CI}" == "" ]]; then - rapids-logger "sccache usage for source build:" - sccache --show-stats -fi +log_sccache_stats rapids-logger "Installing Morpheus" pip install ./ -# Setting this prevents loading of cudf since we don't have a GPU -export MORPHEUS_IN_SPHINX_BUILD=1 - rapids-logger "Checking copyright headers" python ${MORPHEUS_ROOT}/ci/scripts/copyright.py --verify-apache-v2 --git-diff-commits ${CHANGE_TARGET} ${GIT_COMMIT} diff --git a/ci/scripts/github/common.sh b/ci/scripts/github/common.sh index 3aa6c4c69e..a4269828c2 100644 --- a/ci/scripts/github/common.sh +++ b/ci/scripts/github/common.sh @@ -61,7 +61,26 @@ export SCCACHE_REGION="us-east-2" export SCCACHE_IDLE_TIMEOUT=32768 #export SCCACHE_LOG=debug -export CMAKE_BUILD_ALL_FEATURES="-DCMAKE_MESSAGE_CONTEXT_SHOW=ON -DMORPHEUS_CUDA_ARCHITECTURES=60;70;75;80 -DMORPHEUS_BUILD_BENCHMARKS=ON -DMORPHEUS_BUILD_EXAMPLES=ON -DMORPHEUS_BUILD_TESTS=ON -DMORPHEUS_USE_CONDA=ON -DMORPHEUS_PYTHON_INPLACE_BUILD=OFF -DMORPHEUS_PYTHON_BUILD_STUBS=ON -DMORPHEUS_USE_CCACHE=ON" +# Set the build flags +export BUILD_DIR=${BUILD_DIR:-build} + +_FLAGS=() +_FLAGS+=("-B" "${BUILD_DIR}") +_FLAGS+=("-G" "Ninja") +_FLAGS+=("-DCMAKE_MESSAGE_CONTEXT_SHOW=ON") +_FLAGS+=("-DMORPHEUS_CUDA_ARCHITECTURES=RAPIDS") +_FLAGS+=("-DMORPHEUS_USE_CONDA=ON") +_FLAGS+=("-DMORPHEUS_USE_CCACHE=ON") +_FLAGS+=("-DMORPHEUS_PYTHON_INPLACE_BUILD=OFF") +_FLAGS+=("-DMORPHEUS_PYTHON_BUILD_STUBS=ON") +_FLAGS+=("-DMORPHEUS_BUILD_BENCHMARKS=ON") +_FLAGS+=("-DMORPHEUS_BUILD_EXAMPLES=ON") +_FLAGS+=("-DMORPHEUS_BUILD_TESTS=ON") +if [[ "${LOCAL_CI}" == "" ]]; then + _FLAGS+=("-DCCACHE_PROGRAM_PATH=$(which sccache)") +fi +export CMAKE_BUILD_ALL_FEATURES="${_FLAGS[@]}" +unset _FLAGS export FETCH_STATUS=0 @@ -112,8 +131,11 @@ function fetch_base_branch_gh_api() { function fetch_base_branch_local() { rapids-logger "Retrieving base branch from git" - git remote add upstream ${GIT_UPSTREAM_URL} - git fetch upstream --tags + if [[ "${USE_HOST_GIT}" == "0" ]]; then + git remote add upstream ${GIT_UPSTREAM_URL} + git fetch upstream --tags + fi + source ${MORPHEUS_ROOT}/ci/scripts/common.sh export BASE_BRANCH=$(get_base_branch) export CHANGE_TARGET="upstream/${BASE_BRANCH}" @@ -147,6 +169,13 @@ function log_toolchain() { sccache --version } +function log_sccache_stats() { + if [[ "${LOCAL_CI}" == "" ]]; then + rapids-logger "sccache usage for morpheus build:" + sccache --show-stats + fi +} + function upload_artifact() { FILE_NAME=$1 BASE_NAME=$(basename "${FILE_NAME}") diff --git a/ci/scripts/github/conda.sh b/ci/scripts/github/conda.sh index f92374f222..4114bd9ab9 100755 --- a/ci/scripts/github/conda.sh +++ b/ci/scripts/github/conda.sh @@ -23,8 +23,6 @@ cd ${MORPHEUS_ROOT} fetch_base_branch -git submodule update --init --recursive - # Its important that we are in the base environment for the build rapids-logger "Activating Base Conda Environment" diff --git a/ci/scripts/github/docs.sh b/ci/scripts/github/docs.sh index f928d02a38..f4a33b91b3 100755 --- a/ci/scripts/github/docs.sh +++ b/ci/scripts/github/docs.sh @@ -21,15 +21,15 @@ source ${WORKSPACE}/ci/scripts/github/common.sh rapids-dependency-file-generator \ --output conda \ --file_key docs \ - --matrix "cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch);py=${RAPIDS_PY_VERSION}" | tee env.yaml + --matrix "cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch);py=${RAPIDS_PY_VERSION}" | tee 
"${WORKSPACE_TMP}/env.yaml" -update_conda_env env.yaml +update_conda_env "${WORKSPACE_TMP}/env.yaml" download_artifact "wheel.tar.bz" tar xf "${WORKSPACE_TMP}/wheel.tar.bz" -pip install ${MORPHEUS_ROOT}/build/dist/*.whl +pip install ${MORPHEUS_ROOT}/${BUILD_DIR}/dist/*.whl rapids-logger "Pulling LFS assets" cd ${MORPHEUS_ROOT} @@ -37,17 +37,15 @@ cd ${MORPHEUS_ROOT} git lfs install ${MORPHEUS_ROOT}/scripts/fetch_data.py fetch docs examples -git submodule update --init --recursive - rapids-logger "Configuring for docs" -cmake -B build -G Ninja ${CMAKE_BUILD_ALL_FEATURES} -DCMAKE_INSTALL_PREFIX=${CONDA_PREFIX} -DMORPHEUS_PYTHON_BUILD_STUBS=OFF -DMORPHEUS_BUILD_DOCS=ON . +cmake ${CMAKE_BUILD_ALL_FEATURES} -DCMAKE_INSTALL_PREFIX=${CONDA_PREFIX} -DMORPHEUS_PYTHON_BUILD_STUBS=OFF -DMORPHEUS_BUILD_DOCS=ON . rapids-logger "Building docs" -cmake --build build --parallel ${PARALLEL_LEVEL} --target install -cmake --build build --parallel ${PARALLEL_LEVEL} --target morpheus_docs +cmake --build ${BUILD_DIR} --parallel ${PARALLEL_LEVEL} --target install +cmake --build ${BUILD_DIR} --parallel ${PARALLEL_LEVEL} --target morpheus_docs rapids-logger "Archiving the docs" -tar cfj "${WORKSPACE_TMP}/docs.tar.bz" build/docs/html +tar cfj "${WORKSPACE_TMP}/docs.tar.bz" ${BUILD_DIR}/docs/html rapids-logger "Pushing results to ${DISPLAY_ARTIFACT_URL}" set_job_summary_preamble diff --git a/ci/scripts/github/test.sh b/ci/scripts/github/test.sh index fe4fe23813..e050895083 100755 --- a/ci/scripts/github/test.sh +++ b/ci/scripts/github/test.sh @@ -22,34 +22,26 @@ source ${WORKSPACE}/ci/scripts/github/common.sh rapids-dependency-file-generator \ --output conda \ --file_key test \ - --matrix "cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch);py=${RAPIDS_PY_VERSION}" | tee env.yaml + --matrix "cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch);py=${RAPIDS_PY_VERSION}" | tee "${WORKSPACE_TMP}/env.yaml" -update_conda_env env.yaml +update_conda_env "${WORKSPACE_TMP}/env.yaml" log_toolchain -git submodule update --init --recursive - CMAKE_FLAGS="${CMAKE_BUILD_ALL_FEATURES}" CMAKE_FLAGS="${CMAKE_FLAGS} -DCMAKE_BUILD_RPATH_USE_ORIGIN=ON" CMAKE_FLAGS="${CMAKE_FLAGS} -DMORPHEUS_PYTHON_BUILD_STUBS=ON" CMAKE_FLAGS="${CMAKE_FLAGS} -DMORPHEUS_PYTHON_BUILD_WHEEL=OFF" CMAKE_FLAGS="${CMAKE_FLAGS} -DMORPHEUS_PYTHON_PERFORM_INSTALL=ON" CMAKE_FLAGS="${CMAKE_FLAGS} -DCMAKE_INSTALL_PREFIX=${CONDA_PREFIX}" -if [[ "${LOCAL_CI}" == "" ]]; then - CMAKE_FLAGS="${CMAKE_FLAGS} -DCCACHE_PROGRAM_PATH=$(which sccache)" -fi rapids-logger "Configuring cmake for Morpheus with ${CMAKE_FLAGS}" -cmake -B build -G Ninja ${CMAKE_FLAGS} . +cmake ${CMAKE_FLAGS} . 
rapids-logger "Building Morpheus" -cmake --build build --parallel ${PARALLEL_LEVEL} --target install +cmake --build ${BUILD_DIR} --parallel ${PARALLEL_LEVEL} --target install -if [[ "${LOCAL_CI}" == "" ]]; then - rapids-logger "sccache usage for morpheus build:" - sccache --show-stats -fi +log_sccache_stats rapids-logger "Checking Python stub files" @@ -62,7 +54,7 @@ if [[ $(git status --short --untracked | grep .pyi) != "" ]]; then exit 1 fi -CPP_TESTS=($(find ${MORPHEUS_ROOT}/build -name "*.x")) +CPP_TESTS=($(find ${MORPHEUS_ROOT}/${BUILD_DIR} -name "*.x")) rapids-logger "Pulling LFS assets" diff --git a/ci/scripts/run_ci_local.sh b/ci/scripts/run_ci_local.sh index fb29fdf139..979fd07e23 100755 --- a/ci/scripts/run_ci_local.sh +++ b/ci/scripts/run_ci_local.sh @@ -45,7 +45,10 @@ MORPHEUS_ROOT=${MORPHEUS_ROOT:-$(git rev-parse --show-toplevel)} # match CI, the default) USE_HOST_GIT=${USE_HOST_GIT:-0} -GIT_URL=$(git remote get-url origin) +# Useful when using a host git repo to avoid conflicting with a potentially existing 'build' directory +BUILD_DIR=${BUILD_DIR:-build-ci} + +GIT_URL=${GIT_URL:-$(git remote get-url origin)} GIT_URL=$(git_ssh_to_https ${GIT_URL}) GIT_UPSTREAM_URL=$(git remote get-url upstream) @@ -62,33 +65,41 @@ DOCKER_EXTRA_ARGS=${DOCKER_EXTRA_ARGS:-""} BUILD_CONTAINER="nvcr.io/ea-nvidia-morpheus/morpheus:morpheus-ci-build-${CONTAINER_VER}" TEST_CONTAINER="nvcr.io/ea-nvidia-morpheus/morpheus:morpheus-ci-test-${CONTAINER_VER}" -ENV_LIST="--env LOCAL_CI_TMP=/ci_tmp" -ENV_LIST="${ENV_LIST} --env GIT_URL=${GIT_URL}" -ENV_LIST="${ENV_LIST} --env GIT_UPSTREAM_URL=${GIT_UPSTREAM_URL}" -ENV_LIST="${ENV_LIST} --env GIT_BRANCH=${GIT_BRANCH}" -ENV_LIST="${ENV_LIST} --env GIT_COMMIT=${GIT_COMMIT}" -ENV_LIST="${ENV_LIST} --env PARALLEL_LEVEL=$(nproc)" -ENV_LIST="${ENV_LIST} --env CUDA_VER=${CUDA_VER}" -ENV_LIST="${ENV_LIST} --env SKIP_CONDA_ENV_UPDATE=${SKIP_CONDA_ENV_UPDATE}" -ENV_LIST="${ENV_LIST} --env USE_HOST_GIT=${USE_HOST_GIT}" +ENV_LIST=() +ENV_LIST+=("--env" "LOCAL_CI_TMP=/ci_tmp") +ENV_LIST+=("--env" "GIT_URL=${GIT_URL}") +ENV_LIST+=("--env" "GIT_UPSTREAM_URL=${GIT_UPSTREAM_URL}") +ENV_LIST+=("--env" "GIT_BRANCH=${GIT_BRANCH}") +ENV_LIST+=("--env" "GIT_COMMIT=${GIT_COMMIT}") +ENV_LIST+=("--env" "PARALLEL_LEVEL=$(nproc)") +ENV_LIST+=("--env" "CUDA_VER=${CUDA_VER}") +ENV_LIST+=("--env" "SKIP_CONDA_ENV_UPDATE=${SKIP_CONDA_ENV_UPDATE}") +ENV_LIST+=("--env" "USE_HOST_GIT=${USE_HOST_GIT}") +ENV_LIST+=("--env" "BUILD_DIR=${BUILD_DIR}") mkdir -p ${LOCAL_CI_TMP} cp ${MORPHEUS_ROOT}/ci/scripts/bootstrap_local_ci.sh ${LOCAL_CI_TMP} for STAGE in "${STAGES[@]}"; do - DOCKER_RUN_ARGS="--rm -ti --net=host -v "${LOCAL_CI_TMP}":/ci_tmp ${ENV_LIST} --env STAGE=${STAGE}" + DOCKER_RUN_ARGS=() + DOCKER_RUN_ARGS+=("--rm") + DOCKER_RUN_ARGS+=("-ti") + DOCKER_RUN_ARGS+=("--net=host") + DOCKER_RUN_ARGS+=("-v" "${LOCAL_CI_TMP}:/ci_tmp") + DOCKER_RUN_ARGS+=("${ENV_LIST[@]}") + DOCKER_RUN_ARGS+=("--env STAGE=${STAGE}") if [[ "${STAGE}" == "test" || "${USE_GPU}" == "1" ]]; then CONTAINER="${TEST_CONTAINER}" - DOCKER_RUN_ARGS="${DOCKER_RUN_ARGS} --runtime=nvidia" - DOCKER_RUN_ARGS="${DOCKER_RUN_ARGS} --gpus all" - DOCKER_RUN_ARGS="${DOCKER_RUN_ARGS} --cap-add=sys_nice" + DOCKER_RUN_ARGS+=("--runtime=nvidia") + DOCKER_RUN_ARGS+=("--gpus all") + DOCKER_RUN_ARGS+=("--cap-add=sys_nice") else CONTAINER="${BUILD_CONTAINER}" - DOCKER_RUN_ARGS="${DOCKER_RUN_ARGS} --runtime=runc" + DOCKER_RUN_ARGS+=("--runtime=runc") fi if [[ "${USE_HOST_GIT}" == "1" ]]; then - DOCKER_RUN_ARGS="${DOCKER_RUN_ARGS} -v 
${MORPHEUS_ROOT}:/Morpheus" + DOCKER_RUN_ARGS+=("-v" "${MORPHEUS_ROOT}:/Morpheus") fi if [[ "${STAGE}" == "bash" ]]; then @@ -99,7 +110,7 @@ for STAGE in "${STAGES[@]}"; do echo "Running ${STAGE} stage in ${CONTAINER}" set -x - docker run ${DOCKER_RUN_ARGS} ${DOCKER_EXTRA_ARGS} ${CONTAINER} ${DOCKER_RUN_CMD} + docker run ${DOCKER_RUN_ARGS[@]} ${DOCKER_EXTRA_ARGS} ${CONTAINER} ${DOCKER_RUN_CMD} set +x STATUS=$? From 82ce14cc9a439c6310cf734e4fa55928f643c433 Mon Sep 17 00:00:00 2001 From: Christopher Harris Date: Fri, 19 Apr 2024 11:01:19 -0500 Subject: [PATCH 03/38] Fix a typo in the devcontainer base image (#1638) Closes https://github.com/nv-morpheus/Morpheus/issues/1624, where the devcontainer fails to build due to a mis-typed base container. ## By Submitting this PR I confirm: - I am familiar with the [Contributing Guidelines](https://github.com/nv-morpheus/Morpheus/blob/main/docs/source/developer_guide/contributing.md). - When the PR is ready for review, new or existing tests cover these changes. - When the PR is ready for review, the documentation is up to date with these changes. Authors: - Christopher Harris (https://github.com/cwharris) Approvers: - Michael Demoret (https://github.com/mdemoret-nv) URL: https://github.com/nv-morpheus/Morpheus/pull/1638 --- .devcontainer/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.devcontainer/Dockerfile b/.devcontainer/Dockerfile index 9195d475d4..c102b78a8a 100644 --- a/.devcontainer/Dockerfile +++ b/.devcontainer/Dockerfile @@ -13,6 +13,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -FROM rapidsai/devcontainers:23.12-cpp-cuda12.1-mambaforge-ubuntu22.04 AS base +FROM rapidsai/devcontainers:23.12-cpp-mambaforge-ubuntu22.04 AS base ENV PATH="${PATH}:/workspaces/morpheus/.devcontainer/bin" From 883b804572b8a65aea947427f53bed97cfbff791 Mon Sep 17 00:00:00 2001 From: David Gardner <96306125+dagardner-nv@users.noreply.github.com> Date: Fri, 19 Apr 2024 13:43:27 -0700 Subject: [PATCH 04/38] Don't set pe_count for the C++ impl of the TritonInferenceStage (#1640) * Ensure that both `pe_count` & `engines_per_pe` are both set to `1` for the C++ impl of the `TritonInferenceStage` * Remove hard-coded `--num_threads=1` from validation scripts * Disable hammah validation script until #1641 can be resolved * Back-port of #1636 Closes #1639 ## By Submitting this PR I confirm: - I am familiar with the [Contributing Guidelines](https://github.com/nv-morpheus/Morpheus/blob/main/docs/source/developer_guide/contributing.md). - When the PR is ready for review, new or existing tests cover these changes. - When the PR is ready for review, the documentation is up to date with these changes. 
Authors: - David Gardner (https://github.com/dagardner-nv) - Eli Fajardo (https://github.com/efajardo-nv) Approvers: - Michael Demoret (https://github.com/mdemoret-nv) URL: https://github.com/nv-morpheus/Morpheus/pull/1640 --- ci/check_style.sh | 2 +- morpheus/stages/inference/triton_inference_stage.py | 10 ++++++++++ scripts/validation/val-run-all.sh | 10 ++++++++-- scripts/validation/val-run-pipeline.sh | 12 ++++++------ 4 files changed, 25 insertions(+), 9 deletions(-) diff --git a/ci/check_style.sh b/ci/check_style.sh index 9205625726..beb561bb4f 100755 --- a/ci/check_style.sh +++ b/ci/check_style.sh @@ -16,7 +16,7 @@ rapids-dependency-file-generator \ --file_key checks \ --matrix "cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch);py=${RAPIDS_PY_VERSION}" | tee env.yaml -rapids-mamba-retry env create --force -f env.yaml -n checks +rapids-mamba-retry env create --yes -f env.yaml -n checks conda activate checks # Run pre-commit checks diff --git a/morpheus/stages/inference/triton_inference_stage.py b/morpheus/stages/inference/triton_inference_stage.py index e5901363f9..e6c5c0fbb7 100644 --- a/morpheus/stages/inference/triton_inference_stage.py +++ b/morpheus/stages/inference/triton_inference_stage.py @@ -781,3 +781,13 @@ def _get_cpp_inference_node(self, builder: mrc.Builder) -> mrc.SegmentObject: self._needs_logits, self._input_mapping, self._output_mapping) + + def _build_single(self, builder: mrc.Builder, input_node: mrc.SegmentObject) -> mrc.SegmentObject: + node = super()._build_single(builder, input_node) + + # ensure that the C++ impl only uses a single progress engine + if (self._build_cpp_node()): + node.launch_options.pe_count = 1 + node.launch_options.engines_per_pe = 1 + + return node diff --git a/scripts/validation/val-run-all.sh b/scripts/validation/val-run-all.sh index 905ee7f7e5..c85711cdbf 100755 --- a/scripts/validation/val-run-all.sh +++ b/scripts/validation/val-run-all.sh @@ -31,7 +31,10 @@ ensure_triton_running export USE_CPP=0 ${SCRIPT_DIR}/abp/val-abp-all.sh -${SCRIPT_DIR}/hammah/val-hammah-all.sh + +# Disabled per #1641 +# ${SCRIPT_DIR}/hammah/val-hammah-all.sh + ${SCRIPT_DIR}/phishing/val-phishing-all.sh ${SCRIPT_DIR}/sid/val-sid-all.sh @@ -39,6 +42,9 @@ ${SCRIPT_DIR}/sid/val-sid-all.sh export USE_CPP=1 ${SCRIPT_DIR}/abp/val-abp-all.sh -${SCRIPT_DIR}/hammah/val-hammah-all.sh + +# Disabled per #1641 +# ${SCRIPT_DIR}/hammah/val-hammah-all.sh + ${SCRIPT_DIR}/phishing/val-phishing-all.sh ${SCRIPT_DIR}/sid/val-sid-all.sh diff --git a/scripts/validation/val-run-pipeline.sh b/scripts/validation/val-run-pipeline.sh index ee8b00075c..65641a1370 100755 --- a/scripts/validation/val-run-pipeline.sh +++ b/scripts/validation/val-run-pipeline.sh @@ -37,7 +37,7 @@ function run_pipeline_sid_minibert(){ VAL_FILE=$4 VAL_OUTPUT=$5 - morpheus --log_level=DEBUG run --num_threads=1 --pipeline_batch_size=1024 --model_max_batch_size=32 --use_cpp=${USE_CPP} \ + morpheus --log_level=DEBUG run --num_threads=$(nproc) --pipeline_batch_size=1024 --model_max_batch_size=32 --use_cpp=${USE_CPP} \ pipeline-nlp --model_seq_length=256 \ from-file --filename=${INPUT_FILE} \ deserialize \ @@ -58,7 +58,7 @@ function run_pipeline_sid_bert(){ VAL_FILE=$4 VAL_OUTPUT=$5 - morpheus --log_level=DEBUG run --num_threads=1 --pipeline_batch_size=1024 --model_max_batch_size=32 --use_cpp=${USE_CPP} \ + morpheus --log_level=DEBUG run --num_threads=$(nproc) --pipeline_batch_size=1024 --model_max_batch_size=32 --use_cpp=${USE_CPP} \ pipeline-nlp --model_seq_length=256 \ from-file --filename=${INPUT_FILE} \ deserialize \ @@ 
-79,7 +79,7 @@ function run_pipeline_abp_nvsmi(){ VAL_FILE=$4 VAL_OUTPUT=$5 - morpheus --log_level=DEBUG run --num_threads=1 --pipeline_batch_size=1024 --model_max_batch_size=1024 --use_cpp=${USE_CPP} \ + morpheus --log_level=DEBUG run --num_threads=$(nproc) --pipeline_batch_size=1024 --model_max_batch_size=1024 --use_cpp=${USE_CPP} \ pipeline-fil --columns_file=${MORPHEUS_ROOT}/morpheus/data/columns_fil.txt \ from-file --filename=${INPUT_FILE} \ deserialize \ @@ -100,7 +100,7 @@ function run_pipeline_phishing_email(){ VAL_FILE=$4 VAL_OUTPUT=$5 - morpheus --log_level=DEBUG run --num_threads=1 --pipeline_batch_size=1024 --model_max_batch_size=32 --use_cpp=${USE_CPP} \ + morpheus --log_level=DEBUG run --num_threads=$(nproc) --pipeline_batch_size=1024 --model_max_batch_size=32 --use_cpp=${USE_CPP} \ pipeline-nlp --model_seq_length=128 --labels_file=${MORPHEUS_ROOT}/morpheus/data/labels_phishing.txt \ from-file --filename=${INPUT_FILE} \ deserialize \ @@ -121,7 +121,7 @@ function run_pipeline_hammah_user123(){ VAL_FILE=$4 VAL_OUTPUT=$5 - morpheus --log_level=DEBUG run --num_threads=1 --pipeline_batch_size=1024 --model_max_batch_size=1024 --use_cpp=${USE_CPP} \ + morpheus --log_level=DEBUG run --num_threads=$(nproc) --pipeline_batch_size=1024 --model_max_batch_size=1024 --use_cpp=${USE_CPP} \ pipeline-ae --columns_file="${MORPHEUS_ROOT}/morpheus/data/columns_ae_cloudtrail.txt" --userid_filter="user123" --userid_column_name="userIdentitysessionContextsessionIssueruserName" --timestamp_column_name="event_dt" \ from-cloudtrail --input_glob="${MORPHEUS_ROOT}/models/datasets/validation-data/dfp-cloudtrail-*-input.csv" \ train-ae --train_data_glob="${MORPHEUS_ROOT}/models/datasets/training-data/dfp-cloudtrail-*.csv" --source_stage_class=morpheus.stages.input.cloud_trail_source_stage.CloudTrailSourceStage --seed 42 \ @@ -143,7 +143,7 @@ function run_pipeline_hammah_role-g(){ VAL_FILE=$4 VAL_OUTPUT=$5 - morpheus --log_level=DEBUG run --num_threads=1 --pipeline_batch_size=1024 --model_max_batch_size=1024 --use_cpp=${USE_CPP} \ + morpheus --log_level=DEBUG run --num_threads=$(nproc) --pipeline_batch_size=1024 --model_max_batch_size=1024 --use_cpp=${USE_CPP} \ pipeline-ae --columns_file="${MORPHEUS_ROOT}/morpheus/data/columns_ae_cloudtrail.txt" --userid_filter="role-g" --userid_column_name="userIdentitysessionContextsessionIssueruserName" --timestamp_column_name="event_dt" \ from-cloudtrail --input_glob="${MORPHEUS_ROOT}/models/datasets/validation-data/dfp-cloudtrail-*-input.csv" \ train-ae --train_data_glob="${MORPHEUS_ROOT}/models/datasets/training-data/dfp-cloudtrail-*.csv" --source_stage_class=morpheus.stages.input.cloud_trail_source_stage.CloudTrailSourceStage --seed 42 \ From 31d963a357557c9e91b8ad608fca23cbf736bd93 Mon Sep 17 00:00:00 2001 From: David Gardner <96306125+dagardner-nv@users.noreply.github.com> Date: Mon, 22 Apr 2024 11:55:40 -0700 Subject: [PATCH 05/38] Fix vdb_upload runtime error (#1643) * Add `ControlMessage` to the `accepted_types` for `InferenceStage` when in Python mode * fix import of `CppTensorMemory` * Set default value of `['rss']` for `--source_type` avoids issue where command line flag values are ignored. * Fix bug in overrides of `config` fixture which prevented parameterization on the `use_cpp` fixture. 
* fix typo in config value: `stop_after_rec`, not `stop_after_sec` * Ensure a default int value for `stop_after_rec` to avoid schema validation error * Revert the default value for `--vector_db_resource_name` back to 'RSS', allowing the output of running this example to be used as the input for the RAG pipeline Closes #1642 Closes #1645 ## By Submitting this PR I confirm: - I am familiar with the [Contributing Guidelines](https://github.com/nv-morpheus/Morpheus/blob/main/docs/source/developer_guide/contributing.md). - When the PR is ready for review, new or existing tests cover these changes. - When the PR is ready for review, the documentation is up to date with these changes. Authors: - David Gardner (https://github.com/dagardner-nv) - Eli Fajardo (https://github.com/efajardo-nv) Approvers: - Yuchen Zhang (https://github.com/yuchenz427) - Michael Demoret (https://github.com/mdemoret-nv) URL: https://github.com/nv-morpheus/Morpheus/pull/1643 --- examples/llm/rag/README.md | 9 ++++----- examples/llm/vdb_upload/module/rss_source_pipe.py | 4 ++-- examples/llm/vdb_upload/run.py | 4 ++-- examples/llm/vdb_upload/vdb_config.yaml | 4 ++-- examples/llm/vdb_upload/vdb_utils.py | 4 ++-- morpheus/modules/input/rss_source.py | 2 +- morpheus/modules/schemas/rss_source_schema.py | 2 +- morpheus/stages/inference/inference_stage.py | 5 ++++- morpheus/stages/input/rss_source_stage.py | 2 +- .../stages/preprocess/preprocess_fil_stage.py | 5 +++-- .../stages/preprocess/preprocess_nlp_stage.py | 15 ++++++++------- .../gnn_fraud_detection_pipeline/conftest.py | 2 +- tests/examples/log_parsing/conftest.py | 2 +- tests/examples/ransomware_detection/conftest.py | 2 +- tests/stages/test_preprocess_fil_stage.py | 2 +- tests/stages/test_preprocess_nlp_stage.py | 2 +- tests/test_add_classifications_stage.py | 2 +- tests/test_add_scores_stage.py | 2 +- 18 files changed, 37 insertions(+), 33 deletions(-) diff --git a/examples/llm/rag/README.md b/examples/llm/rag/README.md index 3868fab377..7c1579040e 100644 --- a/examples/llm/rag/README.md +++ b/examples/llm/rag/README.md @@ -214,14 +214,14 @@ pipeline option of `rag`: ```bash export NGC_API_KEY=[YOUR_KEY_HERE] -NGC_API_KEY=${NGC_API_KEY} python examples/llm/main.py rag pipeline +python examples/llm/main.py rag pipeline ``` **Using OpenAI LLM models** ```bash export OPENAI_API_KEY=[YOUR_KEY_HERE] -OPENAI_API_KEY=${OPENAI_API_KEY} python examples/llm/main.py rag pipeline +python examples/llm/main.py rag pipeline --llm_service=OpenAI --model_name=gpt-3.5-turbo ``` ### Run example (Persistent Pipeline): @@ -232,14 +232,14 @@ OPENAI_API_KEY=${OPENAI_API_KEY} python examples/llm/main.py rag pipeline ```bash export NGC_API_KEY=[YOUR_KEY_HERE] -python examples/llm/main.py rag persistent +python examples/llm/main.py rag persistent ``` **Using OpenAI LLM models** ```bash export OPENAI_API_KEY=[YOUR_KEY_HERE] -python examples/llm/main.py rag persistent +python examples/llm/main.py rag persistent ``` ### Options: @@ -273,4 +273,3 @@ The `rag` command has its own set of options and commands: - `persistant` - `pipeline` - diff --git a/examples/llm/vdb_upload/module/rss_source_pipe.py b/examples/llm/vdb_upload/module/rss_source_pipe.py index c424e03dbc..ff61940b8c 100644 --- a/examples/llm/vdb_upload/module/rss_source_pipe.py +++ b/examples/llm/vdb_upload/module/rss_source_pipe.py @@ -48,7 +48,7 @@ class RSSSourcePipeSchema(BaseModel): output_batch_size: int = 2048 request_timeout_sec: float = 2.0 run_indefinitely: bool = True - stop_after_sec: int = 0 
vdb_resource_name: str web_scraper_config: Optional[Dict[Any, Any]] = None @@ -130,7 +130,7 @@ def _rss_source_pipe(builder: mrc.Builder): "cooldown_interval_sec": validated_config.cooldown_interval_sec, "request_timeout_sec": validated_config.request_timeout_sec, "interval_sec": validated_config.interval_sec, - "stop_after_sec": validated_config.stop_after_sec, + "stop_after_rec": validated_config.stop_after_rec, } rss_source_loader = RSSSourceLoaderFactory.get_instance("rss_source", {"rss_source": rss_source_config}) diff --git a/examples/llm/vdb_upload/run.py b/examples/llm/vdb_upload/run.py index 04627f8359..974e5ec213 100644 --- a/examples/llm/vdb_upload/run.py +++ b/examples/llm/vdb_upload/run.py @@ -104,7 +104,7 @@ def run(): @click.option("--source_type", multiple=True, type=click.Choice(['rss', 'filesystem'], case_sensitive=False), - default=[], + default=['rss'], show_default=True, help="The type of source to use. Can specify multiple times for different source types.") @click.option( @@ -128,7 +128,7 @@ def run(): @click.option( "--vector_db_resource_name", type=str, - default="VDBUploadExample", + default="RSS", help="The identifier of the resource on which operations are to be performed in the vector database.", ) @click.option( diff --git a/examples/llm/vdb_upload/vdb_config.yaml b/examples/llm/vdb_upload/vdb_config.yaml index 0c1af37d22..ac93a47615 100644 --- a/examples/llm/vdb_upload/vdb_config.yaml +++ b/examples/llm/vdb_upload/vdb_config.yaml @@ -75,7 +75,7 @@ vdb_pipeline: output_batch_size: 2048 # Number of chunked documents per output batch request_timeout_sec: 2.0 run_indefinitely: true - stop_after_sec: 0 + stop_after_rec: 0 web_scraper_config: chunk_overlap: 51 chunk_size: 512 @@ -300,4 +300,4 @@ vdb_pipeline: dtype: FLOAT_VECTOR description: Embedding vectors representing the data entry dim: 384 # Size of the embeddings to store in the vector database - description: Collection schema for diverse data sources \ No newline at end of file + description: Collection schema for diverse data sources diff --git a/examples/llm/vdb_upload/vdb_utils.py b/examples/llm/vdb_upload/vdb_utils.py index 2b399fcd21..d3aed615d7 100644 --- a/examples/llm/vdb_upload/vdb_utils.py +++ b/examples/llm/vdb_upload/vdb_utils.py @@ -135,7 +135,7 @@ def _build_default_rss_source(enable_cache, "output_batch_size": 2048, "cache_dir": "./.cache/http", "cooldown_interval_sec": interval_secs, - "stop_after_sec": stop_after, + "stop_after_rec": stop_after or 0, "enable_cache": enable_cache, "enable_monitor": enable_monitors, "feed_input": feed_inputs if feed_inputs else build_rss_urls(), @@ -448,7 +448,7 @@ def build_final_config(vdb_conf_path, interval_secs=60, run_indefinitely=True, stop_after=None, - vector_db_resource_name="VDBUploadExample", + vector_db_resource_name="RSS", content_chunking_size=128, rss_request_timeout_sec=30, feed_inputs=build_rss_urls())) diff --git a/morpheus/modules/input/rss_source.py b/morpheus/modules/input/rss_source.py index 6133e3d673..9f5dd6c316 100644 --- a/morpheus/modules/input/rss_source.py +++ b/morpheus/modules/input/rss_source.py @@ -101,7 +101,7 @@ def fetch_feeds() -> MessageMeta: records_emitted += df_size - if (0 < validated_config.stop_after_sec <= records_emitted): + if (0 < validated_config.stop_after_rec <= records_emitted): stop_requested = True logger.info("Stop limit reached... 
preparing to halt the source.") break diff --git a/morpheus/modules/schemas/rss_source_schema.py b/morpheus/modules/schemas/rss_source_schema.py index b0468b1ace..53c0928391 100644 --- a/morpheus/modules/schemas/rss_source_schema.py +++ b/morpheus/modules/schemas/rss_source_schema.py @@ -30,7 +30,7 @@ class RSSSourceSchema(BaseModel): cooldown_interval_sec: int = 600 request_timeout_sec: float = 2.0 interval_sec: int = 600 - stop_after_sec: int = 0 + stop_after_rec: int = 0 class Config: extra = "forbid" diff --git a/morpheus/stages/inference/inference_stage.py b/morpheus/stages/inference/inference_stage.py index e4111926e9..579ddccd53 100644 --- a/morpheus/stages/inference/inference_stage.py +++ b/morpheus/stages/inference/inference_stage.py @@ -192,7 +192,10 @@ def accepted_types(self) -> typing.Tuple: typing.Tuple Tuple of input types. """ - return (MultiInferenceMessage, ) + if (self._build_cpp_node()): + return (MultiInferenceMessage, ) + + return (MultiInferenceMessage, ControlMessage) def compute_schema(self, schema: StageSchema): schema.output_schema.set_type(MultiResponseMessage) diff --git a/morpheus/stages/input/rss_source_stage.py b/morpheus/stages/input/rss_source_stage.py index 31e408c290..d56a443542 100644 --- a/morpheus/stages/input/rss_source_stage.py +++ b/morpheus/stages/input/rss_source_stage.py @@ -81,7 +81,7 @@ def __init__(self, "rss_source": { "feed_input": feed_input, "interval_sec": interval_secs, - "stop_after_sec": stop_after, + "stop_after_rec": stop_after, "run_indefinitely": run_indefinitely, "batch_size": batch_size, "enable_cache": enable_cache, diff --git a/morpheus/stages/preprocess/preprocess_fil_stage.py b/morpheus/stages/preprocess/preprocess_fil_stage.py index 45b1640d72..cbfc6a581f 100644 --- a/morpheus/stages/preprocess/preprocess_fil_stage.py +++ b/morpheus/stages/preprocess/preprocess_fil_stage.py @@ -23,6 +23,7 @@ import cudf +import morpheus._lib.messages as _messages import morpheus._lib.stages as _stages from morpheus.cli.register_stage import register_stage from morpheus.config import Config @@ -32,7 +33,6 @@ from morpheus.messages import MultiInferenceFILMessage from morpheus.messages import MultiInferenceMessage from morpheus.messages import MultiMessage -from morpheus.messages import TensorMemory as CppTensorMemory from morpheus.stages.preprocess.preprocess_base_stage import PreprocessBaseStage logger = logging.getLogger(__name__) @@ -123,7 +123,8 @@ def process_control_message(x: ControlMessage, fea_len: int, fea_cols: typing.Li seg_ids[:, 0] = cp.arange(0, count, dtype=cp.uint32) seg_ids[:, 2] = fea_len - 1 - x.tensors(CppTensorMemory(count=count, tensors={"input__0": data, "seq_ids": seg_ids})) + # We need the C++ impl of TensorMemory until #1646 is resolved + x.tensors(_messages.TensorMemory(count=count, tensors={"input__0": data, "seq_ids": seg_ids})) return x @staticmethod diff --git a/morpheus/stages/preprocess/preprocess_nlp_stage.py b/morpheus/stages/preprocess/preprocess_nlp_stage.py index feace923dc..de610ab52c 100644 --- a/morpheus/stages/preprocess/preprocess_nlp_stage.py +++ b/morpheus/stages/preprocess/preprocess_nlp_stage.py @@ -24,6 +24,7 @@ import cudf +import morpheus._lib.messages as _messages import morpheus._lib.stages as _stages from morpheus.cli.register_stage import register_stage from morpheus.cli.utils import MorpheusRelativePath @@ -35,7 +36,6 @@ from morpheus.messages import MultiInferenceMessage from morpheus.messages import MultiInferenceNLPMessage from morpheus.messages import MultiMessage -from 
morpheus.messages import TensorMemory as CppTensorMemory from morpheus.stages.preprocess.preprocess_base_stage import PreprocessBaseStage from morpheus.utils.cudf_subword_helper import tokenize_text_series @@ -204,13 +204,14 @@ def process_control_message(message: ControlMessage, del text_series + # We need the C++ impl of TensorMemory until #1646 is resolved message.tensors( - CppTensorMemory(count=tokenized.input_ids.shape[0], - tensors={ - "input_ids": tokenized.input_ids, - "input_mask": tokenized.input_mask, - "seq_ids": tokenized.segment_ids - })) + _messages.TensorMemory(count=tokenized.input_ids.shape[0], + tensors={ + "input_ids": tokenized.input_ids, + "input_mask": tokenized.input_mask, + "seq_ids": tokenized.segment_ids + })) message.set_metadata("inference_memory_params", {"inference_type": "nlp"}) return message diff --git a/tests/examples/gnn_fraud_detection_pipeline/conftest.py b/tests/examples/gnn_fraud_detection_pipeline/conftest.py index a625d51862..30176f71e4 100644 --- a/tests/examples/gnn_fraud_detection_pipeline/conftest.py +++ b/tests/examples/gnn_fraud_detection_pipeline/conftest.py @@ -44,7 +44,7 @@ def cuml_fixture(fail_missing: bool): @pytest.fixture(name="config") -def config_fixture(config): +def config_fixture(config, use_cpp: bool): # pylint: disable=unused-argument """ The GNN fraud detection pipeline utilizes the "other" pipeline mode. """ diff --git a/tests/examples/log_parsing/conftest.py b/tests/examples/log_parsing/conftest.py index d31891873a..f927c3fcc1 100644 --- a/tests/examples/log_parsing/conftest.py +++ b/tests/examples/log_parsing/conftest.py @@ -17,7 +17,7 @@ @pytest.fixture(name="config") -def config_fixture(config): +def config_fixture(config, use_cpp: bool): # pylint: disable=unused-argument """ The log_parsing pipelie requires NLP mode. Set this here so all the tests don't need to set it themselves. """ diff --git a/tests/examples/ransomware_detection/conftest.py b/tests/examples/ransomware_detection/conftest.py index e1c5e2541d..a92786555a 100644 --- a/tests/examples/ransomware_detection/conftest.py +++ b/tests/examples/ransomware_detection/conftest.py @@ -39,7 +39,7 @@ def dask_distributed(fail_missing: bool): @pytest.fixture(name="config") -def config_fixture(config): +def config_fixture(config, use_cpp: bool): # pylint: disable=unused-argument """ The ransomware detection pipeline utilizes the FIL pipeline mode. 
""" diff --git a/tests/stages/test_preprocess_fil_stage.py b/tests/stages/test_preprocess_fil_stage.py index eb6dc8b620..638fcaa994 100644 --- a/tests/stages/test_preprocess_fil_stage.py +++ b/tests/stages/test_preprocess_fil_stage.py @@ -27,7 +27,7 @@ @pytest.fixture(name='config') -def fixture_config(config: Config): +def fixture_config(config: Config, use_cpp: bool): # pylint: disable=unused-argument config.feature_length = 1 config.fil = ConfigFIL() config.fil.feature_columns = ["data"] diff --git a/tests/stages/test_preprocess_nlp_stage.py b/tests/stages/test_preprocess_nlp_stage.py index 9c2b5d4e39..22fc99e04a 100644 --- a/tests/stages/test_preprocess_nlp_stage.py +++ b/tests/stages/test_preprocess_nlp_stage.py @@ -29,7 +29,7 @@ @pytest.fixture(name='config') -def fixture_config(config: Config): +def fixture_config(config: Config, use_cpp: bool): # pylint: disable=unused-argument config.class_labels = [ "address", "bank_acct", diff --git a/tests/test_add_classifications_stage.py b/tests/test_add_classifications_stage.py index 279963ba9a..80091f3dc5 100755 --- a/tests/test_add_classifications_stage.py +++ b/tests/test_add_classifications_stage.py @@ -31,7 +31,7 @@ @pytest.fixture(name="config") -def config_fixture(config: Config): +def config_fixture(config: Config, use_cpp: bool): # pylint: disable=unused-argument config.class_labels = ['frogs', 'lizards', 'toads'] yield config diff --git a/tests/test_add_scores_stage.py b/tests/test_add_scores_stage.py index ad67709959..e454a0e35f 100755 --- a/tests/test_add_scores_stage.py +++ b/tests/test_add_scores_stage.py @@ -31,7 +31,7 @@ @pytest.fixture(name='config') -def fixture_config(config: Config): +def fixture_config(config: Config, use_cpp: bool): # pylint: disable=unused-argument config.class_labels = ['frogs', 'lizards', 'toads'] config.feature_length = 12 yield config From 0a0a20d41c96dee8bf5f2f30165895efbde6641e Mon Sep 17 00:00:00 2001 From: David Gardner <96306125+dagardner-nv@users.noreply.github.com> Date: Mon, 22 Apr 2024 16:09:14 -0700 Subject: [PATCH 06/38] Document current known issues in 24.03.02 (#1656) ## By Submitting this PR I confirm: - I am familiar with the [Contributing Guidelines](https://github.com/nv-morpheus/Morpheus/blob/main/docs/source/developer_guide/contributing.md). - When the PR is ready for review, new or existing tests cover these changes. - When the PR is ready for review, the documentation is up to date with these changes. Authors: - David Gardner (https://github.com/dagardner-nv) Approvers: - Michael Demoret (https://github.com/mdemoret-nv) URL: https://github.com/nv-morpheus/Morpheus/pull/1656 --- docs/source/extra_info/known_issues.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/docs/source/extra_info/known_issues.md b/docs/source/extra_info/known_issues.md index 2ade48011e..014fac3471 100644 --- a/docs/source/extra_info/known_issues.md +++ b/docs/source/extra_info/known_issues.md @@ -17,4 +17,9 @@ limitations under the License. 
# Known Issues +- TrainAEStage fails with a Segmentation fault ([#1641](https://github.com/nv-morpheus/Morpheus/pull/1641)) +- vdb_upload example pipeline triggers an internal error in Triton ([#1649](https://github.com/nv-morpheus/Morpheus/pull/1649)) +- vdb_upload example pipeline error on inserting large strings ([#1650](https://github.com/nv-morpheus/Morpheus/pull/1650)) +- vdb_upload example pipeline only works with C++ mode disabled ([#1651](https://github.com/nv-morpheus/Morpheus/pull/1651)) + Refer to [open issues in the Morpheus project](https://github.com/nv-morpheus/Morpheus/issues) From eb0bc254aaa26bbc0f64a4bc66f97ce2642d9c35 Mon Sep 17 00:00:00 2001 From: David Gardner Date: Mon, 22 Apr 2024 16:21:46 -0700 Subject: [PATCH 07/38] Updating CHANGELOG --- CHANGELOG.md | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 8f2994d2c0..d79781bdc6 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -14,6 +14,17 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. --> +# Morpheus 24.03.02 (22 Apr 2024) + +## 🐛 Bug Fixes + +- Don't set pe_count for the C++ impl of the TritonInferenceStage ([#1640](https://github.com/nv-morpheus/Morpheus/pull/1640)) [@dagardner-nv](https://github.com/dagardner-nv) +- Fix vdb_upload runtime error ([#1643](https://github.com/nv-morpheus/Morpheus/pull/1643)) [@dagardner-nv](https://github.com/dagardner-nv) + +## 📖 Documentation + +- Document current known issues in 24.03.02 ([#1656](https://github.com/nv-morpheus/Morpheus/pull/1656)) [@dagardner-nv](https://github.com/dagardner-nv) + # Morpheus 24.03.01 (10 Apr 2024) ## 🚨 Breaking Changes From cbfea7d6708ad9f45ff41997b19ccd149773395b Mon Sep 17 00:00:00 2001 From: Eli Fajardo Date: Tue, 23 Apr 2024 10:12:39 -0400 Subject: [PATCH 08/38] Fix `cupy_to_tensor` to also infer `uint8` and `int8` dtypes (#1621) - Update `DType::from_numpy` to handle strings that identify `uint8` and `int8` dtypes - Add unit tests for DType - Update to throw invalid argument exceptions on invalid numpy typestrs. Closes #1619 ## By Submitting this PR I confirm: - I am familiar with the [Contributing Guidelines](https://github.com/nv-morpheus/Morpheus/blob/main/docs/source/developer_guide/contributing.md). - When the PR is ready for review, new or existing tests cover these changes. - When the PR is ready for review, the documentation is up to date with these changes. 
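For context, typestrs for one-byte dtypes carry a `|` ("not applicable") byte-order character rather than `<` or `>`, which is what `DType::from_numpy` previously failed to parse. A quick illustration (assumes a CUDA-capable environment with `cupy` installed; not part of this change):

```python
import cupy as cp

# One-byte dtypes advertise '|' as the byte-order character in their
# __cuda_array_interface__ typestr:
for dt in (cp.uint8, cp.int8, cp.bool_):
    arr = cp.zeros(4, dtype=dt)
    print(arr.__cuda_array_interface__["typestr"])  # '|u1', '|i1', '|b1'
```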
Authors: - Eli Fajardo (https://github.com/efajardo-nv) Approvers: - Michael Demoret (https://github.com/mdemoret-nv) URL: https://github.com/nv-morpheus/Morpheus/pull/1621 --- .../_lib/include/morpheus/objects/dtype.hpp | 1 + morpheus/_lib/src/objects/dtype.cpp | 60 ++-- morpheus/_lib/tests/CMakeLists.txt | 6 + morpheus/_lib/tests/objects/test_dtype.cpp | 286 ++++++++++++++++++ 4 files changed, 336 insertions(+), 17 deletions(-) create mode 100644 morpheus/_lib/tests/objects/test_dtype.cpp diff --git a/morpheus/_lib/include/morpheus/objects/dtype.hpp b/morpheus/_lib/include/morpheus/objects/dtype.hpp index aa8c42e92f..63dbd1594a 100644 --- a/morpheus/_lib/include/morpheus/objects/dtype.hpp +++ b/morpheus/_lib/include/morpheus/objects/dtype.hpp @@ -173,6 +173,7 @@ struct DType } private: + char byte_order_char() const; char type_char() const; TypeId m_type_id; diff --git a/morpheus/_lib/src/objects/dtype.cpp b/morpheus/_lib/src/objects/dtype.cpp index 912a945b3a..870cdb8059 100644 --- a/morpheus/_lib/src/objects/dtype.cpp +++ b/morpheus/_lib/src/objects/dtype.cpp @@ -20,7 +20,6 @@ #include "morpheus/utilities/string_util.hpp" // for MORPHEUS_CONCAT_STR #include -#include // for CHECK #include #include // Needed by MORPHEUS_CONCAT_STR @@ -30,7 +29,7 @@ namespace { const std::map> StrToTypeId = { - {'?', {{1, morpheus::TypeId::BOOL8}}}, + {'b', {{1, morpheus::TypeId::BOOL8}}}, {'i', {{1, morpheus::TypeId::INT8}, @@ -100,14 +99,7 @@ std::string DType::name() const std::string DType::type_str() const { - if (m_type_id != TypeId::BOOL8 && m_type_id != TypeId::STRING) - { - return MORPHEUS_CONCAT_STR("<" << this->type_char() << this->item_size()); - } - else - { - return std::string{this->type_char()}; - } + return MORPHEUS_CONCAT_STR(this->byte_order_char() << this->type_char() << this->item_size()); } // Cudf representation @@ -214,19 +206,22 @@ DType DType::from_cudf(cudf::type_id tid) case cudf::type_id::EMPTY: case cudf::type_id::NUM_TYPE_IDS: default: - throw std::runtime_error("Not supported"); + throw std::invalid_argument("Not supported"); } } DType DType::from_numpy(const std::string& numpy_str) { - CHECK(!numpy_str.empty()) << "Cannot create DataType from empty string"; + if (numpy_str.empty()) + { + throw std::invalid_argument("Cannot create DataType from empty string"); + } char type_char = numpy_str[0]; size_t size_start = 1; - // Can start with < or > or none - if (numpy_str[0] == '<' || numpy_str[0] == '>') + // Can start with <, >, | or none + if (numpy_str[0] == '<' || numpy_str[0] == '>' || numpy_str[0] == '|') { type_char = numpy_str[1]; size_start = 2; @@ -241,11 +236,17 @@ DType DType::from_numpy(const std::string& numpy_str) // Now lookup in the map auto found_type = StrToTypeId.find(type_char); - CHECK(found_type != StrToTypeId.end()) << "Type char '" << type_char << "' not supported"; + if (found_type == StrToTypeId.end()) + { + throw std::invalid_argument(MORPHEUS_CONCAT_STR("Type char '" << type_char << "' not supported")); + } auto found_enum = found_type->second.find(dtype_size); - CHECK(found_enum != found_type->second.end()) << "Type str '" << type_char << dtype_size << "' not supported"; + if (found_enum == found_type->second.end()) + { + throw std::invalid_argument(MORPHEUS_CONCAT_STR("Type str '" << type_char << dtype_size << "' not supported")); + } return {found_enum->second}; } @@ -299,6 +300,31 @@ DType DType::from_triton(const std::string& type_str) } else { + throw std::invalid_argument("Not supported"); + } +} + +char DType::byte_order_char() const +{ + 
switch (m_type_id) + { + case TypeId::BOOL8: + case TypeId::INT8: + case TypeId::UINT8: + return '|'; + case TypeId::INT16: + case TypeId::UINT16: + case TypeId::INT32: + case TypeId::UINT32: + case TypeId::INT64: + case TypeId::UINT64: + case TypeId::FLOAT32: + case TypeId::FLOAT64: + return '<'; + case TypeId::EMPTY: + case TypeId::NUM_TYPE_IDS: + case TypeId::STRING: + default: throw std::runtime_error("Not supported"); } } @@ -318,7 +344,7 @@ char DType::type_char() const case TypeId::UINT64: return 'u'; case TypeId::BOOL8: - return '?'; + return 'b'; case TypeId::FLOAT32: case TypeId::FLOAT64: return 'f'; diff --git a/morpheus/_lib/tests/CMakeLists.txt b/morpheus/_lib/tests/CMakeLists.txt index 7e71bd2eb1..a17a297aca 100644 --- a/morpheus/_lib/tests/CMakeLists.txt +++ b/morpheus/_lib/tests/CMakeLists.txt @@ -113,6 +113,12 @@ add_morpheus_test( modules/test_data_loader_module.cpp ) +add_morpheus_test( + NAME objects + FILES + objects/test_dtype.cpp +) + add_morpheus_test( NAME deserializers FILES diff --git a/morpheus/_lib/tests/objects/test_dtype.cpp b/morpheus/_lib/tests/objects/test_dtype.cpp new file mode 100644 index 0000000000..230d68dcd6 --- /dev/null +++ b/morpheus/_lib/tests/objects/test_dtype.cpp @@ -0,0 +1,286 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "../test_utils/common.hpp" // IWYU pragma: associated + +#include "morpheus/objects/dtype.hpp" // for DType + +#include +#include + +#include + +using namespace morpheus; +using namespace morpheus::test; + +TEST_CLASS(DType); + +TEST_F(TestDType, FromNumpyValidStr) +{ + DType dtype = DType::from_numpy("|i1"); + ASSERT_EQ(dtype.type_id(), TypeId::INT8); + ASSERT_EQ(dtype.item_size(), 1); + ASSERT_EQ(dtype.type_str(), "|i1"); + + dtype = DType::from_numpy(" Date: Tue, 23 Apr 2024 11:08:52 -0700 Subject: [PATCH 09/38] Fix documentation for building examples (#1659) * The Morpheus python package needs to be built and installed prior to building the examples, updated documentation to ensure `-DMORPHEUS_PYTHON_PERFORM_INSTALL=ON` is added ## By Submitting this PR I confirm: - I am familiar with the [Contributing Guidelines](https://github.com/nv-morpheus/Morpheus/blob/main/docs/source/developer_guide/contributing.md). - When the PR is ready for review, new or existing tests cover these changes. - When the PR is ready for review, the documentation is up to date with these changes. 
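As a usage note, the documented command for building the examples along with Morpheus now reads (taken verbatim from the updated docs below):

```bash
# Build the examples along with Morpheus; MORPHEUS_PYTHON_PERFORM_INSTALL
# ensures the Morpheus python package is built and installed first
CMAKE_CONFIGURE_EXTRA_ARGS="-DMORPHEUS_BUILD_EXAMPLES=ON -DMORPHEUS_PYTHON_PERFORM_INSTALL=ON" ./scripts/compile.sh
```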
Authors: - David Gardner (https://github.com/dagardner-nv) Approvers: - Michael Demoret (https://github.com/mdemoret-nv) URL: https://github.com/nv-morpheus/Morpheus/pull/1659 --- docs/source/developer_guide/guides.md | 4 ++-- docs/source/developer_guide/guides/3_simple_cpp_stage.md | 4 ++-- docs/source/developer_guide/guides/4_source_cpp_stage.md | 4 ++-- examples/developer_guide/4_rabbitmq_cpp_stage/README.md | 4 ++-- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/docs/source/developer_guide/guides.md b/docs/source/developer_guide/guides.md index 9e4fba5ff7..2d141e96db 100644 --- a/docs/source/developer_guide/guides.md +++ b/docs/source/developer_guide/guides.md @@ -33,9 +33,9 @@ in both Python and C++. - [Simple C++ Stage](./guides/3_simple_cpp_stage.md) - [Creating a C++ Source Stage](./guides/4_source_cpp_stage.md) -> **Note**: The code for the above guides can be found in the `examples/developer_guide` directory of the Morpheus repository. To build the C++ examples, pass `-DMORPHEUS_BUILD_EXAMPLES=ON` to CMake when building Morpheus. Users building Morpheus with the provided `scripts/compile.sh` script can do do by setting the `CMAKE_CONFIGURE_EXTRA_ARGS` environment variable: +> **Note**: The code for the above guides can be found in the `examples/developer_guide` directory of the Morpheus repository. To build the C++ examples, pass `-DMORPHEUS_BUILD_EXAMPLES=ON -DMORPHEUS_PYTHON_PERFORM_INSTALL=ON` to CMake when building Morpheus. Users building Morpheus with the provided `scripts/compile.sh` script can do do by setting the `CMAKE_CONFIGURE_EXTRA_ARGS` environment variable: > ```bash -> CMAKE_CONFIGURE_EXTRA_ARGS="-DMORPHEUS_BUILD_EXAMPLES=ON" ./scripts/compile.sh +> CMAKE_CONFIGURE_EXTRA_ARGS="-DMORPHEUS_BUILD_EXAMPLES=ON -DMORPHEUS_PYTHON_PERFORM_INSTALL=ON" ./scripts/compile.sh > ``` ## Morpheus Modules diff --git a/docs/source/developer_guide/guides/3_simple_cpp_stage.md b/docs/source/developer_guide/guides/3_simple_cpp_stage.md index 2b203ef42b..3b0982d21e 100644 --- a/docs/source/developer_guide/guides/3_simple_cpp_stage.md +++ b/docs/source/developer_guide/guides/3_simple_cpp_stage.md @@ -17,9 +17,9 @@ limitations under the License. # Simple C++ Stage ## Building the Example -The code for this guide can be found in the `examples/developer_guide/3_simple_cpp_stage` directory of the Morpheus repository. There are two ways to build the example. The first is to build the examples along with Morpheus by passing the `-DMORPHEUS_BUILD_EXAMPLES=ON` flag to cmake, for users using the `scripts/compile.sh` at the root of the Morpheus repo can do this by setting the `CMAKE_CONFIGURE_EXTRA_ARGS` environment variable: +The code for this guide can be found in the `examples/developer_guide/3_simple_cpp_stage` directory of the Morpheus repository. There are two ways to build the example. The first is to build the examples along with Morpheus by passing the `-DMORPHEUS_BUILD_EXAMPLES=ON` and `-DMORPHEUS_PYTHON_PERFORM_INSTALL=ON` flags to cmake, for users using the `scripts/compile.sh` at the root of the Morpheus repo can do this by setting the `CMAKE_CONFIGURE_EXTRA_ARGS` environment variable: ```bash -CMAKE_CONFIGURE_EXTRA_ARGS="-DMORPHEUS_BUILD_EXAMPLES=ON" ./scripts/compile.sh +CMAKE_CONFIGURE_EXTRA_ARGS="-DMORPHEUS_BUILD_EXAMPLES=ON -DMORPHEUS_PYTHON_PERFORM_INSTALL=ON" ./scripts/compile.sh ``` The second method is to build the example as a standalone project. 
From the root of the Morpheus repo execute: diff --git a/docs/source/developer_guide/guides/4_source_cpp_stage.md b/docs/source/developer_guide/guides/4_source_cpp_stage.md index 8bc17f1347..476d0f661b 100644 --- a/docs/source/developer_guide/guides/4_source_cpp_stage.md +++ b/docs/source/developer_guide/guides/4_source_cpp_stage.md @@ -17,9 +17,9 @@ limitations under the License. # Creating a C++ Source Stage ## Building the Example -The code for this guide can be found in the `examples/developer_guide/4_rabbitmq_cpp_stage` directory of the Morpheus repository. There are two ways to build the example. The first is to build the examples along with Morpheus by passing the `-DMORPHEUS_BUILD_EXAMPLES=ON` flag to cmake, for users using the `scripts/compile.sh` at the root of the Morpheus repo can do this by setting the `CMAKE_CONFIGURE_EXTRA_ARGS` environment variable: +The code for this guide can be found in the `examples/developer_guide/4_rabbitmq_cpp_stage` directory of the Morpheus repository. There are two ways to build the example. The first is to build the examples along with Morpheus by passing the `-DMORPHEUS_BUILD_EXAMPLES=ON` and `-DMORPHEUS_PYTHON_PERFORM_INSTALL=ON` flags to cmake, for users using the `scripts/compile.sh` at the root of the Morpheus repo can do this by setting the `CMAKE_CONFIGURE_EXTRA_ARGS` environment variable: ```bash -CMAKE_CONFIGURE_EXTRA_ARGS="-DMORPHEUS_BUILD_EXAMPLES=ON" ./scripts/compile.sh +CMAKE_CONFIGURE_EXTRA_ARGS="-DMORPHEUS_BUILD_EXAMPLES=ON -DMORPHEUS_PYTHON_PERFORM_INSTALL=ON" ./scripts/compile.sh ``` The second method is to build the example as a standalone project. From the root of the Morpheus repo execute: diff --git a/examples/developer_guide/4_rabbitmq_cpp_stage/README.md b/examples/developer_guide/4_rabbitmq_cpp_stage/README.md index 2e3319b65e..ed55204f85 100644 --- a/examples/developer_guide/4_rabbitmq_cpp_stage/README.md +++ b/examples/developer_guide/4_rabbitmq_cpp_stage/README.md @@ -27,9 +27,9 @@ pip install -r examples/developer_guide/4_rabbitmq_cpp_stage/requirements.txt ``` ## Building the Example -There are two ways to build the example. The first is to build the examples along with Morpheus by passing the `-DMORPHEUS_BUILD_EXAMPLES=ON` flag to cmake, for users using the `scripts/compile.sh` at the root of the Morpheus repo can do this by setting the `CMAKE_CONFIGURE_EXTRA_ARGS` environment variable: +There are two ways to build the example. The first is to build the examples along with Morpheus by passing the `-DMORPHEUS_BUILD_EXAMPLES=ON` and `-DMORPHEUS_PYTHON_PERFORM_INSTALL=ON` flags to cmake, for users using the `scripts/compile.sh` at the root of the Morpheus repo can do this by setting the `CMAKE_CONFIGURE_EXTRA_ARGS` environment variable: ```bash -CMAKE_CONFIGURE_EXTRA_ARGS="-DMORPHEUS_BUILD_EXAMPLES=ON" ./scripts/compile.sh +CMAKE_CONFIGURE_EXTRA_ARGS="-DMORPHEUS_BUILD_EXAMPLES=ON -DMORPHEUS_PYTHON_PERFORM_INSTALL=ON -DMORPHEUS_PYTHON_PERFORM_INSTALL=ON" ./scripts/compile.sh ``` The second is to build the example as a standalone project. 
From the root of the Morpheus repo execute: From ce7ab99b221927d3fbb5f317f87279be62d2109d Mon Sep 17 00:00:00 2001 From: David Gardner Date: Wed, 24 Apr 2024 08:11:00 -0700 Subject: [PATCH 10/38] Updating CHANGELOG --- CHANGELOG.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index d79781bdc6..bbd2ef563e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -14,7 +14,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. --> -# Morpheus 24.03.02 (22 Apr 2024) +# Morpheus 24.03.02 (24 Apr 2024) ## 🐛 Bug Fixes @@ -24,6 +24,7 @@ limitations under the License. ## 📖 Documentation - Document current known issues in 24.03.02 ([#1656](https://github.com/nv-morpheus/Morpheus/pull/1656)) [@dagardner-nv](https://github.com/dagardner-nv) +- Fix documentation for building examples ([#1659](https://github.com/nv-morpheus/Morpheus/pull/1659)) [@dagardner-nv](https://github.com/dagardner-nv) # Morpheus 24.03.01 (10 Apr 2024) From e2942e6b49ac923b2ff7b22341bfb719546c00f3 Mon Sep 17 00:00:00 2001 From: David Gardner <96306125+dagardner-nv@users.noreply.github.com> Date: Wed, 24 Apr 2024 09:42:09 -0700 Subject: [PATCH 11/38] Fix type-o in documentation (#1662) ## By Submitting this PR I confirm: - I am familiar with the [Contributing Guidelines](https://github.com/nv-morpheus/Morpheus/blob/main/docs/source/developer_guide/contributing.md). - When the PR is ready for review, new or existing tests cover these changes. - When the PR is ready for review, the documentation is up to date with these changes. Authors: - David Gardner (https://github.com/dagardner-nv) Approvers: - Michael Demoret (https://github.com/mdemoret-nv) URL: https://github.com/nv-morpheus/Morpheus/pull/1662 --- examples/developer_guide/4_rabbitmq_cpp_stage/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/developer_guide/4_rabbitmq_cpp_stage/README.md b/examples/developer_guide/4_rabbitmq_cpp_stage/README.md index ed55204f85..c0710524e4 100644 --- a/examples/developer_guide/4_rabbitmq_cpp_stage/README.md +++ b/examples/developer_guide/4_rabbitmq_cpp_stage/README.md @@ -29,7 +29,7 @@ pip install -r examples/developer_guide/4_rabbitmq_cpp_stage/requirements.txt ## Building the Example There are two ways to build the example. The first is to build the examples along with Morpheus by passing the `-DMORPHEUS_BUILD_EXAMPLES=ON` and `-DMORPHEUS_PYTHON_PERFORM_INSTALL=ON` flags to cmake, for users using the `scripts/compile.sh` at the root of the Morpheus repo can do this by setting the `CMAKE_CONFIGURE_EXTRA_ARGS` environment variable: ```bash -CMAKE_CONFIGURE_EXTRA_ARGS="-DMORPHEUS_BUILD_EXAMPLES=ON -DMORPHEUS_PYTHON_PERFORM_INSTALL=ON -DMORPHEUS_PYTHON_PERFORM_INSTALL=ON" ./scripts/compile.sh +CMAKE_CONFIGURE_EXTRA_ARGS="-DMORPHEUS_BUILD_EXAMPLES=ON -DMORPHEUS_PYTHON_PERFORM_INSTALL=ON" ./scripts/compile.sh ``` The second is to build the example as a standalone project. From the root of the Morpheus repo execute: From 88d5211487b9ce389f1a10f98e9bcafb59019a5f Mon Sep 17 00:00:00 2001 From: David Gardner Date: Wed, 24 Apr 2024 09:44:22 -0700 Subject: [PATCH 12/38] Updating CHANGELOG --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index bbd2ef563e..8386e6ac8e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -25,6 +25,7 @@ limitations under the License. 
- Document current known issues in 24.03.02 ([#1656](https://github.com/nv-morpheus/Morpheus/pull/1656)) [@dagardner-nv](https://github.com/dagardner-nv) - Fix documentation for building examples ([#1659](https://github.com/nv-morpheus/Morpheus/pull/1659)) [@dagardner-nv](https://github.com/dagardner-nv) +- Fix type-o in documentation ([#1662](https://github.com/nv-morpheus/Morpheus/pull/1662)) [@dagardner-nv](https://github.com/dagardner-nv) # Morpheus 24.03.01 (10 Apr 2024) From ec183006194025ab8ff1408fe780a71a699c29d1 Mon Sep 17 00:00:00 2001 From: David Gardner <96306125+dagardner-nv@users.noreply.github.com> Date: Wed, 24 Apr 2024 12:38:40 -0700 Subject: [PATCH 13/38] Fix tests to detect issue #1626 (#1629) * PR #659 inadvertently excluded the monitor stage from several of the end-to-end pipeline tests. * Adds an environment variable `MORPHEUS_MONITOR_ALWAYS_ENABLED` which when set, will force the monitor stage to always be enabled. * Adds an auto-use fixture `monitor_stage_always_enabled` which ensures the environment variable is set & present. Requires nv-morpheus/MRC#473 to be merged first ## By Submitting this PR I confirm: - I am familiar with the [Contributing Guidelines](https://github.com/nv-morpheus/Morpheus/blob/main/docs/source/developer_guide/contributing.md). - When the PR is ready for review, new or existing tests cover these changes. - When the PR is ready for review, the documentation is up to date with these changes. Authors: - David Gardner (https://github.com/dagardner-nv) Approvers: - Christopher Harris (https://github.com/cwharris) - Michael Demoret (https://github.com/mdemoret-nv) URL: https://github.com/nv-morpheus/Morpheus/pull/1629 --- tests/benchmarks/test_bench_e2e_pipelines.py | 6 +- tests/benchmarks/test_bench_monitor_stage.py | 9 +-- tests/conftest.py | 9 +++ .../developer_guide/test_python_modules.py | 4 +- .../test_dfp_inference_stage.py | 7 +-- .../test_dfp_postprocessing_stage.py | 7 +-- .../test_dfp_preprocessing_stage.py | 7 +-- tests/test_abp.py | 18 ++++-- tests/test_abp_kafka.py | 12 ++-- tests/test_dfp.py | 16 +++-- tests/test_dfp_kafka.py | 12 ++-- tests/test_monitor_stage.py | 19 +++--- tests/test_phishing.py | 11 ++-- tests/test_phishing_kafka.py | 12 ++-- tests/test_sid.py | 62 ++++++++++++------- tests/test_sid_kafka.py | 12 ++-- 16 files changed, 137 insertions(+), 86 deletions(-) diff --git a/tests/benchmarks/test_bench_e2e_pipelines.py b/tests/benchmarks/test_bench_e2e_pipelines.py index 14283cf154..e99e7bbc07 100644 --- a/tests/benchmarks/test_bench_e2e_pipelines.py +++ b/tests/benchmarks/test_bench_e2e_pipelines.py @@ -67,7 +67,7 @@ def nlp_pipeline(config: Config, input_file, repeat, vocab_hash_file, output_fil server_url=E2E_TEST_CONFIGS["triton_server_url"], force_convert_inputs=True)) pipeline.add_stage(AddClassificationsStage(config, threshold=0.5, prefix="")) - pipeline.add_stage(MonitorStage(config)) + pipeline.add_stage(MonitorStage(config, log_level=logging.INFO)) pipeline.add_stage(SerializeStage(config)) pipeline.add_stage(WriteToFileStage(config, filename=output_file, overwrite=True)) @@ -89,7 +89,7 @@ def fil_pipeline(config: Config, input_file, repeat, output_file, model_name): server_url=E2E_TEST_CONFIGS["triton_server_url"], force_convert_inputs=True)) pipeline.add_stage(AddClassificationsStage(config, threshold=0.5, prefix="")) - pipeline.add_stage(MonitorStage(config)) + pipeline.add_stage(MonitorStage(config, log_level=logging.INFO)) pipeline.add_stage(SerializeStage(config)) 
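# The auto-use fixture described in this commit's message guarantees the
# MonitorStage cannot silently drop out of these end-to-end tests. A minimal
# sketch of such a fixture (the fixture and variable names come from the
# message above; the body is an assumption, not the patch's literal code):
#
#     @pytest.fixture(autouse=True)
#     def monitor_stage_always_enabled(monkeypatch: pytest.MonkeyPatch):
#         monkeypatch.setenv("MORPHEUS_MONITOR_ALWAYS_ENABLED", "1")
#         yield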
pipeline.add_stage(WriteToFileStage(config, filename=output_file, overwrite=True)) @@ -111,7 +111,7 @@ def ae_pipeline(config: Config, input_glob, repeat, train_data_glob, output_file pipeline.add_stage(PreprocessAEStage(config)) pipeline.add_stage(AutoEncoderInferenceStage(config)) pipeline.add_stage(AddScoresStage(config)) - pipeline.add_stage(MonitorStage(config)) + pipeline.add_stage(MonitorStage(config, log_level=logging.INFO)) pipeline.add_stage(SerializeStage(config)) pipeline.add_stage(WriteToFileStage(config, filename=output_file, overwrite=True)) diff --git a/tests/benchmarks/test_bench_monitor_stage.py b/tests/benchmarks/test_bench_monitor_stage.py index 7af2406acc..5ddbdef42d 100644 --- a/tests/benchmarks/test_bench_monitor_stage.py +++ b/tests/benchmarks/test_bench_monitor_stage.py @@ -14,6 +14,7 @@ # limitations under the License. import logging +import typing import pytest from static_message_source import StaticMessageSource @@ -29,7 +30,7 @@ from morpheus.utils.logger import configure_logging -def build_and_run_pipeline(config: Config, df: cudf.DataFrame): +def build_and_run_pipeline(*, config: Config, df: cudf.DataFrame, morpheus_log_level: int): # Pipeline pipeline = LinearPipeline(config) @@ -39,7 +40,7 @@ def build_and_run_pipeline(config: Config, df: cudf.DataFrame): pipeline.add_stage(DeserializeStage(config)) # Stage we want to benchmark - pipeline.add_stage(MonitorStage(config)) + pipeline.add_stage(MonitorStage(config, log_level=morpheus_log_level)) pipeline.build() pipeline.run() @@ -47,7 +48,7 @@ def build_and_run_pipeline(config: Config, df: cudf.DataFrame): @pytest.mark.benchmark @pytest.mark.parametrize("num_messages", [1, 100, 10000, 1000000]) -def test_monitor_stage(benchmark, num_messages): +def test_monitor_stage(benchmark: typing.Callable, num_messages: int, morpheus_log_level: int): # Test Data @@ -70,4 +71,4 @@ def test_monitor_stage(benchmark, num_messages): config.edge_buffer_size = 4 # would prefer to benchmark just pipeline.run, but it asserts when called multiple times - benchmark(build_and_run_pipeline, config, df) + benchmark(build_and_run_pipeline, config=config, df=df, morpheus_log_level=morpheus_log_level) diff --git a/tests/conftest.py b/tests/conftest.py index 0a33fa7891..1f8f0ef425 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -861,6 +861,15 @@ def loglevel_fatal(): _wrap_set_log_level(logging.FATAL) +@pytest.fixture(scope="function") +def morpheus_log_level(): + """ + Returns the log level of the morpheus logger + """ + logger = logging.getLogger("morpheus") + yield logger.getEffectiveLevel() + + # ==== DataFrame Fixtures ==== @pytest.fixture(scope="function") def dataset(df_type: typing.Literal['cudf', 'pandas']): diff --git a/tests/examples/developer_guide/test_python_modules.py b/tests/examples/developer_guide/test_python_modules.py index 1c433d6f78..aad7333ce7 100644 --- a/tests/examples/developer_guide/test_python_modules.py +++ b/tests/examples/developer_guide/test_python_modules.py @@ -38,7 +38,7 @@ os.path.join(EXAMPLES_DIR, "my_compound_module_consumer_stage.py"), os.path.join(EXAMPLES_DIR, "my_test_module_consumer_stage.py") ]) -def test_pipeline(config: Config, import_mod: list[types.ModuleType]): +def test_pipeline(config: Config, import_mod: list[types.ModuleType], morpheus_log_level: int): my_compound_module_consumer_stage = import_mod[-2] my_test_module_consumer_stage = import_mod[-1] @@ -72,7 +72,7 @@ def test_pipeline(config: Config, import_mod: list[types.ModuleType]): 
pipeline.add_stage(my_test_module_consumer_stage.MyPassthroughModuleWrapper(config)) pipeline.add_stage(my_compound_module_consumer_stage.MyCompoundOpModuleWrapper(config)) - pipeline.add_stage(MonitorStage(config)) + pipeline.add_stage(MonitorStage(config, log_level=morpheus_log_level)) comp_stage = pipeline.add_stage(CompareDataFrameStage(config, expected_df)) pipeline.run() diff --git a/tests/examples/digital_fingerprinting/test_dfp_inference_stage.py b/tests/examples/digital_fingerprinting/test_dfp_inference_stage.py index f4dda7c815..46defbbbee 100644 --- a/tests/examples/digital_fingerprinting/test_dfp_inference_stage.py +++ b/tests/examples/digital_fingerprinting/test_dfp_inference_stage.py @@ -71,19 +71,18 @@ def test_get_model(config: Config, mock_mlflow_client: mock.MagicMock, mock_mode @pytest.mark.usefixtures("reset_loglevel") -@pytest.mark.parametrize('morpheus_log_level', - [logging.CRITICAL, logging.ERROR, logging.WARNING, logging.INFO, logging.DEBUG]) +@pytest.mark.parametrize('log_level', [logging.CRITICAL, logging.ERROR, logging.WARNING, logging.INFO, logging.DEBUG]) def test_on_data( config: Config, mock_mlflow_client: mock.MagicMock, # pylint: disable=unused-argument mock_model_manager: mock.MagicMock, dfp_multi_message: "MultiDFPMessage", # noqa: F821 - morpheus_log_level: int, + log_level: int, dataset_pandas: DatasetManager): from dfp.messages.multi_dfp_message import MultiDFPMessage from dfp.stages.dfp_inference_stage import DFPInferenceStage - set_log_level(morpheus_log_level) + set_log_level(log_level) expected_results = list(range(1000, dfp_multi_message.mess_count + 1000)) diff --git a/tests/examples/digital_fingerprinting/test_dfp_postprocessing_stage.py b/tests/examples/digital_fingerprinting/test_dfp_postprocessing_stage.py index 4b13bacde5..6eed4c0d9e 100644 --- a/tests/examples/digital_fingerprinting/test_dfp_postprocessing_stage.py +++ b/tests/examples/digital_fingerprinting/test_dfp_postprocessing_stage.py @@ -35,14 +35,13 @@ def test_constructor(config: Config): @pytest.mark.usefixtures("reset_loglevel") @pytest.mark.parametrize('use_on_data', [True, False]) -@pytest.mark.parametrize('morpheus_log_level', - [logging.CRITICAL, logging.ERROR, logging.WARNING, logging.INFO, logging.DEBUG]) +@pytest.mark.parametrize('log_level', [logging.CRITICAL, logging.ERROR, logging.WARNING, logging.INFO, logging.DEBUG]) @mock.patch('dfp.stages.dfp_postprocessing_stage.datetime') def test_process_events_on_data(mock_datetime: mock.MagicMock, config: Config, dfp_multi_ae_message: MultiAEMessage, use_on_data: bool, - morpheus_log_level: int): + log_level: int): from dfp.stages.dfp_postprocessing_stage import DFPPostprocessingStage mock_dt_obj = mock.MagicMock() @@ -54,7 +53,7 @@ def test_process_events_on_data(mock_datetime: mock.MagicMock, df.loc[10, 'v2'] = np.nan df['event_time'] = '' - set_log_level(morpheus_log_level) + set_log_level(log_level) stage = DFPPostprocessingStage(config) # on_data is a thin wrapper around process_events, tests should be the same for non-empty messages diff --git a/tests/examples/digital_fingerprinting/test_dfp_preprocessing_stage.py b/tests/examples/digital_fingerprinting/test_dfp_preprocessing_stage.py index bf82381879..c7859cd90c 100644 --- a/tests/examples/digital_fingerprinting/test_dfp_preprocessing_stage.py +++ b/tests/examples/digital_fingerprinting/test_dfp_preprocessing_stage.py @@ -36,17 +36,16 @@ def test_constructor(config: Config): @pytest.mark.usefixtures("reset_loglevel") -@pytest.mark.parametrize('morpheus_log_level', - 
[logging.CRITICAL, logging.ERROR, logging.WARNING, logging.INFO, logging.DEBUG]) +@pytest.mark.parametrize('log_level', [logging.CRITICAL, logging.ERROR, logging.WARNING, logging.INFO, logging.DEBUG]) def test_process_features( config: Config, dfp_multi_message: "MultiDFPMessage", # noqa: F821 dataset_pandas: DatasetManager, - morpheus_log_level: int): + log_level: int): from dfp.messages.multi_dfp_message import MultiDFPMessage from dfp.stages.dfp_preprocessing_stage import DFPPreprocessingStage - set_log_level(morpheus_log_level) + set_log_level(log_level) expected_df = dfp_multi_message.get_meta_dataframe().copy(deep=True) expected_df['v210'] = expected_df['v2'] + 10 diff --git a/tests/test_abp.py b/tests/test_abp.py index 86778bfdb6..a3248deb7e 100755 --- a/tests/test_abp.py +++ b/tests/test_abp.py @@ -52,7 +52,7 @@ @pytest.mark.slow @pytest.mark.use_python @mock.patch('tritonclient.grpc.InferenceServerClient') -def test_abp_no_cpp(mock_triton_client, config: Config, tmp_path): +def test_abp_no_cpp(mock_triton_client: mock.MagicMock, config: Config, tmp_path: str, morpheus_log_level: int): mock_metadata = { "inputs": [{ 'name': 'input__0', 'datatype': 'FP32', "shape": [-1, FEATURE_LENGTH] @@ -98,7 +98,8 @@ def test_abp_no_cpp(mock_triton_client, config: Config, tmp_path): pipe.add_stage(PreprocessFILStage(config)) pipe.add_stage( TritonInferenceStage(config, model_name='abp-nvsmi-xgb', server_url='test:0000', force_convert_inputs=True)) - pipe.add_stage(MonitorStage(config, description="Inference Rate", smoothing=0.001, unit="inf")) + pipe.add_stage( + MonitorStage(config, description="Inference Rate", smoothing=0.001, unit="inf", log_level=morpheus_log_level)) pipe.add_stage(AddClassificationsStage(config)) pipe.add_stage(AddScoresStage(config, prefix="score_")) pipe.add_stage( @@ -115,7 +116,7 @@ def test_abp_no_cpp(mock_triton_client, config: Config, tmp_path): @pytest.mark.slow @pytest.mark.use_cpp @pytest.mark.usefixtures("launch_mock_triton") -def test_abp_cpp(config, tmp_path): +def test_abp_cpp(config: Config, tmp_path: str, morpheus_log_level: int): config.mode = PipelineModes.FIL config.class_labels = ["mining"] config.model_max_batch_size = MODEL_MAX_BATCH_SIZE @@ -141,7 +142,8 @@ def test_abp_cpp(config, tmp_path): pipe.add_stage( TritonInferenceStage(config, model_name='abp-nvsmi-xgb', server_url='localhost:8001', force_convert_inputs=True)) - pipe.add_stage(MonitorStage(config, description="Inference Rate", smoothing=0.001, unit="inf")) + pipe.add_stage( + MonitorStage(config, description="Inference Rate", smoothing=0.001, unit="inf", log_level=morpheus_log_level)) pipe.add_stage(AddClassificationsStage(config)) pipe.add_stage(AddScoresStage(config, prefix="score_")) pipe.add_stage( @@ -158,7 +160,10 @@ def test_abp_cpp(config, tmp_path): @pytest.mark.slow @pytest.mark.use_python @mock.patch('tritonclient.grpc.InferenceServerClient') -def test_abp_multi_segment_no_cpp(mock_triton_client, config: Config, tmp_path): +def test_abp_multi_segment_no_cpp(mock_triton_client: mock.MagicMock, + config: Config, + tmp_path: str, + morpheus_log_level: int): mock_metadata = { "inputs": [{ 'name': 'input__0', 'datatype': 'FP32', "shape": [-1, FEATURE_LENGTH] @@ -213,7 +218,8 @@ def test_abp_multi_segment_no_cpp(mock_triton_client, config: Config, tmp_path): pipe.add_segment_boundary(MultiResponseMessage) # Boundary 3 - pipe.add_stage(MonitorStage(config, description="Inference Rate", smoothing=0.001, unit="inf")) + pipe.add_stage( + MonitorStage(config, description="Inference Rate", 
smoothing=0.001, unit="inf", log_level=morpheus_log_level)) pipe.add_stage(AddClassificationsStage(config)) pipe.add_segment_boundary(MultiResponseMessage) # Boundary 4 diff --git a/tests/test_abp_kafka.py b/tests/test_abp_kafka.py index 0e1f040612..46306ff29c 100755 --- a/tests/test_abp_kafka.py +++ b/tests/test_abp_kafka.py @@ -61,7 +61,8 @@ def test_abp_no_cpp(mock_triton_client: mock.MagicMock, config: Config, kafka_bootstrap_servers: str, kafka_topics: KafkaTopics, - kafka_consumer: "KafkaConsumer"): + kafka_consumer: "KafkaConsumer", + morpheus_log_level: int): mock_metadata = { "inputs": [{ 'name': 'input__0', 'datatype': 'FP32', "shape": [-1, FEATURE_LENGTH] @@ -115,7 +116,8 @@ def test_abp_no_cpp(mock_triton_client: mock.MagicMock, pipe.add_stage(PreprocessFILStage(config)) pipe.add_stage( TritonInferenceStage(config, model_name='abp-nvsmi-xgb', server_url='test:0000', force_convert_inputs=True)) - pipe.add_stage(MonitorStage(config, description="Inference Rate", smoothing=0.001, unit="inf")) + pipe.add_stage( + MonitorStage(config, description="Inference Rate", smoothing=0.001, unit="inf", log_level=morpheus_log_level)) pipe.add_stage(AddClassificationsStage(config)) pipe.add_stage(SerializeStage(config)) pipe.add_stage( @@ -151,7 +153,8 @@ def test_abp_cpp(config: Config, dataset_pandas: DatasetManager, kafka_bootstrap_servers: str, kafka_topics: KafkaTopics, - kafka_consumer: "KafkaConsumer"): + kafka_consumer: "KafkaConsumer", + morpheus_log_level: int): config.mode = PipelineModes.FIL config.class_labels = ["mining"] config.model_max_batch_size = MODEL_MAX_BATCH_SIZE @@ -183,7 +186,8 @@ def test_abp_cpp(config: Config, pipe.add_stage( TritonInferenceStage(config, model_name='abp-nvsmi-xgb', server_url='localhost:8001', force_convert_inputs=True)) - pipe.add_stage(MonitorStage(config, description="Inference Rate", smoothing=0.001, unit="inf")) + pipe.add_stage( + MonitorStage(config, description="Inference Rate", smoothing=0.001, unit="inf", log_level=morpheus_log_level)) pipe.add_stage(AddClassificationsStage(config)) pipe.add_stage(SerializeStage(config)) pipe.add_stage( diff --git a/tests/test_dfp.py b/tests/test_dfp.py index d32ad3c1e8..2f3bacbdae 100755 --- a/tests/test_dfp.py +++ b/tests/test_dfp.py @@ -23,6 +23,7 @@ from _utils import TEST_DIRS from _utils import calc_error_val +from morpheus.config import Config from morpheus.config import ConfigAutoEncoder from morpheus.config import PipelineModes from morpheus.messages.message_meta import MessageMeta @@ -50,7 +51,7 @@ @pytest.mark.reload_modules([preprocess_ae_stage, train_ae_stage]) @pytest.mark.usefixtures("reload_modules") @mock.patch('morpheus.stages.preprocess.train_ae_stage.AutoEncoder') -def test_dfp_roleg(mock_ae, config, tmp_path): +def test_dfp_roleg(mock_ae: mock.MagicMock, config: Config, tmp_path: str, morpheus_log_level: int): tensor_data = np.loadtxt(os.path.join(TEST_DIRS.tests_data_dir, 'dfp_roleg_tensor.csv'), delimiter=',') anomaly_score = np.loadtxt(os.path.join(TEST_DIRS.tests_data_dir, 'dfp_roleg_anomaly_score.csv'), delimiter=',') exp_results = pd.read_csv(os.path.join(TEST_DIRS.tests_data_dir, 'dfp_roleg_exp_results.csv')) @@ -107,7 +108,8 @@ def test_dfp_roleg(mock_ae, config, tmp_path): cold_end=False, filter_percent=90.0, zscore_threshold=8.0)) - pipe.add_stage(MonitorStage(config, description="Inference Rate", smoothing=0.001, unit="inf")) + pipe.add_stage( + MonitorStage(config, description="Inference Rate", smoothing=0.001, unit="inf", log_level=morpheus_log_level)) pipe.add_stage( 
ValidationStage(config, val_file_name=val_file_name, @@ -135,7 +137,7 @@ def test_dfp_roleg(mock_ae, config, tmp_path): @pytest.mark.reload_modules([preprocess_ae_stage, train_ae_stage]) @pytest.mark.usefixtures("reload_modules") @mock.patch('morpheus.stages.preprocess.train_ae_stage.AutoEncoder') -def test_dfp_user123(mock_ae, config, tmp_path): +def test_dfp_user123(mock_ae: mock.MagicMock, config: Config, tmp_path: str, morpheus_log_level: int): tensor_data = np.loadtxt(os.path.join(TEST_DIRS.tests_data_dir, 'dfp_user123_tensor.csv'), delimiter=',') anomaly_score = np.loadtxt(os.path.join(TEST_DIRS.tests_data_dir, 'dfp_user123_anomaly_score.csv'), delimiter=',') exp_results = pd.read_csv(os.path.join(TEST_DIRS.tests_data_dir, 'dfp_user123_exp_results.csv')) @@ -190,7 +192,8 @@ def test_dfp_user123(mock_ae, config, tmp_path): cold_end=False, filter_percent=90.0, zscore_threshold=8.0)) - pipe.add_stage(MonitorStage(config, description="Inference Rate", smoothing=0.001, unit="inf")) + pipe.add_stage( + MonitorStage(config, description="Inference Rate", smoothing=0.001, unit="inf", log_level=morpheus_log_level)) pipe.add_stage( ValidationStage(config, val_file_name=val_file_name, @@ -217,7 +220,7 @@ def test_dfp_user123(mock_ae, config, tmp_path): @pytest.mark.reload_modules([preprocess_ae_stage, train_ae_stage]) @pytest.mark.usefixtures("reload_modules") @mock.patch('morpheus.stages.preprocess.train_ae_stage.AutoEncoder') -def test_dfp_user123_multi_segment(mock_ae, config, tmp_path): +def test_dfp_user123_multi_segment(mock_ae: mock.MagicMock, config: Config, tmp_path: str, morpheus_log_level: int): tensor_data = np.loadtxt(os.path.join(TEST_DIRS.tests_data_dir, 'dfp_user123_tensor.csv'), delimiter=',') anomaly_score = np.loadtxt(os.path.join(TEST_DIRS.tests_data_dir, 'dfp_user123_anomaly_score.csv'), delimiter=',') exp_results = pd.read_csv(os.path.join(TEST_DIRS.tests_data_dir, 'dfp_user123_exp_results.csv')) @@ -278,7 +281,8 @@ def test_dfp_user123_multi_segment(mock_ae, config, tmp_path): filter_percent=90.0, zscore_threshold=8.0)) pipe.add_segment_boundary(MultiResponseMessage) # Boundary 6 - pipe.add_stage(MonitorStage(config, description="Inference Rate", smoothing=0.001, unit="inf")) + pipe.add_stage( + MonitorStage(config, description="Inference Rate", smoothing=0.001, unit="inf", log_level=morpheus_log_level)) pipe.add_stage( ValidationStage(config, val_file_name=val_file_name, diff --git a/tests/test_dfp_kafka.py b/tests/test_dfp_kafka.py index 5b28ae6f7c..8bd4900b96 100755 --- a/tests/test_dfp_kafka.py +++ b/tests/test_dfp_kafka.py @@ -64,7 +64,8 @@ def test_dfp_roleg(mock_ae: mock.MagicMock, config: Config, kafka_bootstrap_servers: str, kafka_topics: KafkaTopics, - kafka_consumer: "KafkaConsumer"): + kafka_consumer: "KafkaConsumer", + morpheus_log_level: int): tensor_data = np.loadtxt(os.path.join(TEST_DIRS.tests_data_dir, 'dfp_roleg_tensor.csv'), delimiter=',') anomaly_score = np.loadtxt(os.path.join(TEST_DIRS.tests_data_dir, 'dfp_roleg_anomaly_score.csv'), delimiter=',') exp_results = pd.read_csv(os.path.join(TEST_DIRS.tests_data_dir, 'dfp_roleg_exp_results.csv')) @@ -116,7 +117,8 @@ def test_dfp_roleg(mock_ae: mock.MagicMock, cold_end=False, filter_percent=90.0, zscore_threshold=8.0)) - pipe.add_stage(MonitorStage(config, description="Inference Rate", smoothing=0.001, unit="inf")) + pipe.add_stage( + MonitorStage(config, description="Inference Rate", smoothing=0.001, unit="inf", log_level=morpheus_log_level)) pipe.add_stage(SerializeStage(config, include=[])) 
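# Note on the `log_level=morpheus_log_level` argument threaded through these
# tests: MonitorStage only adds itself to the MRC pipeline when its configured
# log_level is at or above the morpheus logger's effective level (see
# test_log_level in test_monitor_stage.py below). Passing the logger's current
# effective level, captured by the new `morpheus_log_level` fixture, therefore
# keeps the stage in the pipeline under test at any logging verbosity.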
pipe.add_stage( WriteToKafkaStage(config, bootstrap_servers=kafka_bootstrap_servers, output_topic=kafka_topics.output_topic)) @@ -166,7 +168,8 @@ def test_dfp_user123(mock_ae: mock.MagicMock, config: Config, kafka_bootstrap_servers: str, kafka_topics: KafkaTopics, - kafka_consumer: "KafkaConsumer"): + kafka_consumer: "KafkaConsumer", + morpheus_log_level: int): tensor_data = np.loadtxt(os.path.join(TEST_DIRS.tests_data_dir, 'dfp_user123_tensor.csv'), delimiter=',') anomaly_score = np.loadtxt(os.path.join(TEST_DIRS.tests_data_dir, 'dfp_user123_anomaly_score.csv'), delimiter=',') exp_results = pd.read_csv(os.path.join(TEST_DIRS.tests_data_dir, 'dfp_user123_exp_results.csv')) @@ -217,7 +220,8 @@ def test_dfp_user123(mock_ae: mock.MagicMock, cold_end=False, filter_percent=90.0, zscore_threshold=8.0)) - pipe.add_stage(MonitorStage(config, description="Inference Rate", smoothing=0.001, unit="inf")) + pipe.add_stage( + MonitorStage(config, description="Inference Rate", smoothing=0.001, unit="inf", log_level=morpheus_log_level)) pipe.add_stage(SerializeStage(config, include=[])) pipe.add_stage( WriteToKafkaStage(config, bootstrap_servers=kafka_bootstrap_servers, output_topic=kafka_topics.output_topic)) diff --git a/tests/test_monitor_stage.py b/tests/test_monitor_stage.py index e023f159b3..68b1b35ca7 100755 --- a/tests/test_monitor_stage.py +++ b/tests/test_monitor_stage.py @@ -151,23 +151,22 @@ def test_progress_sink(mock_morph_tqdm: mock.MagicMock, config: Config): @pytest.mark.usefixtures("reset_loglevel") -@pytest.mark.parametrize('morpheus_log_level', - [logging.CRITICAL, logging.ERROR, logging.WARNING, logging.INFO, logging.DEBUG]) +@pytest.mark.parametrize('log_level', [logging.CRITICAL, logging.ERROR, logging.WARNING, logging.INFO, logging.DEBUG]) @mock.patch('morpheus.stages.general.monitor_stage.MonitorController.sink_on_completed', autospec=True) @mock.patch('morpheus.stages.general.monitor_stage.MonitorController.progress_sink', autospec=True) def test_log_level(mock_progress_sink: mock.MagicMock, mock_sink_on_completed: mock.MagicMock, config: Config, - morpheus_log_level: int): + log_level: int): """ Test ensures the monitor stage doesn't add itself to the MRC pipeline if not configured for the current log-level """ input_file = os.path.join(TEST_DIRS.tests_data_dir, "filter_probs.csv") - set_log_level(morpheus_log_level) + set_log_level(log_level) monitor_stage_level = logging.INFO - should_be_included = (morpheus_log_level <= monitor_stage_level) + should_be_included = (log_level <= monitor_stage_level) pipe = LinearPipeline(config) pipe.set_source(FileSourceStage(config, filename=input_file)) @@ -179,16 +178,13 @@ def test_log_level(mock_progress_sink: mock.MagicMock, assert mock_sink_on_completed.call_count == expected_call_count -@pytest.mark.usefixtures("reset_loglevel") @pytest.mark.use_python -def test_thread(config: Config): +def test_thread(config: Config, morpheus_log_level: int): """ - Test ensures the monitor stage doesn't add itself to the MRC pipeline if not configured for the current log-level + Test ensures the monitor stage executes on the same thread as the parent stage """ input_file = os.path.join(TEST_DIRS.tests_data_dir, "filter_probs.csv") - set_log_level(log_level=logging.INFO) - monitor_thread_id = None # Create a dummy count function where we can save the thread id from the monitor stage @@ -202,8 +198,9 @@ def fake_determine_count_fn(x): pipe = LinearPipeline(config) pipe.set_source(FileSourceStage(config, filename=input_file)) dummy_stage = 
pipe.add_stage(RecordThreadIdStage(config)) - pipe.add_stage(MonitorStage(config, determine_count_fn=fake_determine_count_fn)) + pipe.add_stage(MonitorStage(config, determine_count_fn=fake_determine_count_fn, log_level=morpheus_log_level)) pipe.run() # Check that the thread ids are the same + assert monitor_thread_id is not None assert dummy_stage.thread_id == monitor_thread_id diff --git a/tests/test_phishing.py b/tests/test_phishing.py index 4f434e993e..77e752ef3f 100755 --- a/tests/test_phishing.py +++ b/tests/test_phishing.py @@ -23,6 +23,7 @@ from _utils import TEST_DIRS from _utils import calc_error_val from _utils import mk_async_infer +from morpheus.config import Config from morpheus.config import PipelineModes from morpheus.pipeline import LinearPipeline from morpheus.stages.general.monitor_stage import MonitorStage @@ -44,7 +45,7 @@ @pytest.mark.slow @pytest.mark.use_python @mock.patch('tritonclient.grpc.InferenceServerClient') -def test_email_no_cpp(mock_triton_client, config, tmp_path): +def test_email_no_cpp(mock_triton_client: mock.MagicMock, config: Config, tmp_path: str, morpheus_log_level: int): mock_metadata = { "inputs": [{ "name": "input_ids", "datatype": "INT64", "shape": [-1, FEATURE_LENGTH] @@ -96,7 +97,8 @@ def test_email_no_cpp(mock_triton_client, config, tmp_path): pipe.add_stage( TritonInferenceStage(config, model_name='phishing-bert-onnx', server_url='test:0000', force_convert_inputs=True)) - pipe.add_stage(MonitorStage(config, description="Inference Rate", smoothing=0.001, unit="inf")) + pipe.add_stage( + MonitorStage(config, description="Inference Rate", smoothing=0.001, unit="inf", log_level=morpheus_log_level)) pipe.add_stage(AddClassificationsStage(config, labels=["is_phishing"], threshold=0.7)) pipe.add_stage( ValidationStage(config, val_file_name=val_file_name, results_file_name=results_file_name, rel_tol=0.05)) @@ -111,7 +113,7 @@ def test_email_no_cpp(mock_triton_client, config, tmp_path): @pytest.mark.slow @pytest.mark.use_cpp @pytest.mark.usefixtures("launch_mock_triton") -def test_email_cpp(config, tmp_path): +def test_email_cpp(config: Config, tmp_path: str, morpheus_log_level: int): config.mode = PipelineModes.NLP config.class_labels = load_labels_file(os.path.join(TEST_DIRS.data_dir, "labels_phishing.txt")) config.model_max_batch_size = MODEL_MAX_BATCH_SIZE @@ -139,7 +141,8 @@ def test_email_cpp(config, tmp_path): model_name='phishing-bert-onnx', server_url='localhost:8001', force_convert_inputs=True)) - pipe.add_stage(MonitorStage(config, description="Inference Rate", smoothing=0.001, unit="inf")) + pipe.add_stage( + MonitorStage(config, description="Inference Rate", smoothing=0.001, unit="inf", log_level=morpheus_log_level)) pipe.add_stage(AddClassificationsStage(config, labels=["is_phishing"], threshold=0.7)) pipe.add_stage( ValidationStage(config, val_file_name=val_file_name, results_file_name=results_file_name, rel_tol=0.05)) diff --git a/tests/test_phishing_kafka.py b/tests/test_phishing_kafka.py index ba8fa1a14f..1a04061cc9 100755 --- a/tests/test_phishing_kafka.py +++ b/tests/test_phishing_kafka.py @@ -60,7 +60,8 @@ def test_email_no_cpp(mock_triton_client: mock.MagicMock, config: Config, kafka_bootstrap_servers: str, kafka_topics: KafkaTopics, - kafka_consumer: "KafkaConsumer"): + kafka_consumer: "KafkaConsumer", + morpheus_log_level: int): mock_metadata = { "inputs": [{ "name": "input_ids", "datatype": "INT64", "shape": [-1, FEATURE_LENGTH] @@ -120,7 +121,8 @@ def test_email_no_cpp(mock_triton_client: mock.MagicMock, pipe.add_stage( 
TritonInferenceStage(config, model_name='phishing-bert-onnx', server_url='test:0000', force_convert_inputs=True)) - pipe.add_stage(MonitorStage(config, description="Inference Rate", smoothing=0.001, unit="inf")) + pipe.add_stage( + MonitorStage(config, description="Inference Rate", smoothing=0.001, unit="inf", log_level=morpheus_log_level)) pipe.add_stage(AddClassificationsStage(config, labels=["is_phishing"], threshold=0.7)) pipe.add_stage(SerializeStage(config)) pipe.add_stage( @@ -153,7 +155,8 @@ def test_email_cpp(dataset_pandas: DatasetManager, config: Config, kafka_bootstrap_servers: str, kafka_topics: KafkaTopics, - kafka_consumer: "KafkaConsumer"): + kafka_consumer: "KafkaConsumer", + morpheus_log_level: int): config.mode = PipelineModes.NLP config.class_labels = load_labels_file(os.path.join(TEST_DIRS.data_dir, "labels_phishing.txt")) config.model_max_batch_size = MODEL_MAX_BATCH_SIZE @@ -187,7 +190,8 @@ def test_email_cpp(dataset_pandas: DatasetManager, model_name='phishing-bert-onnx', server_url='localhost:8001', force_convert_inputs=True)) - pipe.add_stage(MonitorStage(config, description="Inference Rate", smoothing=0.001, unit="inf")) + pipe.add_stage( + MonitorStage(config, description="Inference Rate", smoothing=0.001, unit="inf", log_level=morpheus_log_level)) pipe.add_stage(AddClassificationsStage(config, labels=["is_phishing"], threshold=0.7)) pipe.add_stage(SerializeStage(config)) pipe.add_stage( diff --git a/tests/test_sid.py b/tests/test_sid.py index 67ca36161c..2221abe930 100755 --- a/tests/test_sid.py +++ b/tests/test_sid.py @@ -25,6 +25,7 @@ from _utils import calc_error_val from _utils import compare_class_to_scores from _utils import mk_async_infer +from morpheus.config import Config from morpheus.config import CppConfig from morpheus.config import PipelineModes from morpheus.pipeline import LinearPipeline @@ -44,7 +45,15 @@ MODEL_MAX_BATCH_SIZE = 32 -def _run_minibert_pipeline(config, tmp_path, model_name, truncated, data_col_name: str = "data"): +def _run_minibert_pipeline( + *, + config: Config, + tmp_path: str, + model_name: str, + truncated: bool, + morpheus_log_level: int, + data_col_name: str = "data", +): """ Runs just the Minibert Pipeline """ @@ -100,7 +109,8 @@ def _run_minibert_pipeline(config, tmp_path, model_name, truncated, data_col_nam column=data_col_name)) pipe.add_stage( TritonInferenceStage(config, model_name=model_name, server_url='localhost:8001', force_convert_inputs=True)) - pipe.add_stage(MonitorStage(config, description="Inference Rate", smoothing=0.001, unit="inf")) + pipe.add_stage( + MonitorStage(config, description="Inference Rate", smoothing=0.001, unit="inf", log_level=morpheus_log_level)) pipe.add_stage(AddClassificationsStage(config, threshold=0.5, prefix="si_")) pipe.add_stage(AddScoresStage(config, prefix="score_")) pipe.add_stage( @@ -113,7 +123,13 @@ def _run_minibert_pipeline(config, tmp_path, model_name, truncated, data_col_nam return calc_error_val(results_file_name) -def _run_minibert(config, tmp_path, model_name, truncated, data_col_name: str = "data"): +def _run_minibert(*, + config: Config, + tmp_path: str, + model_name: str, + truncated: bool, + morpheus_log_level: int, + data_col_name: str = "data"): """ Runs the minibert pipeline and mocks the Triton Python interface """ @@ -145,15 +161,24 @@ def _run_minibert(config, tmp_path, model_name, truncated, data_col_name: str = async_infer = mk_async_infer(inf_results) mock_triton_client.async_infer.side_effect = async_infer - return _run_minibert_pipeline(config, tmp_path, 
model_name, truncated, data_col_name) + return _run_minibert_pipeline(config=config, + tmp_path=tmp_path, + model_name=model_name, + truncated=truncated, + data_col_name=data_col_name, + morpheus_log_level=morpheus_log_level) @pytest.mark.slow @pytest.mark.use_cpp @pytest.mark.usefixtures("launch_mock_triton") -def test_minibert_no_trunc(config, tmp_path): +def test_minibert_no_trunc(config: Config, tmp_path: str, morpheus_log_level: int): - results = _run_minibert(config, tmp_path, "sid-minibert-onnx-no-trunc", False) + results = _run_minibert(config=config, + tmp_path=tmp_path, + model_name="sid-minibert-onnx-no-trunc", + truncated=False, + morpheus_log_level=morpheus_log_level) # Not sure why these are different if (CppConfig.get_should_use_cpp()): @@ -164,22 +189,15 @@ def test_minibert_no_trunc(config, tmp_path): @pytest.mark.slow @pytest.mark.usefixtures("launch_mock_triton") -def test_minibert_truncated(config, tmp_path): - - results = _run_minibert(config, tmp_path, 'sid-minibert-onnx', True) - - # Not sure why these are different - if (CppConfig.get_should_use_cpp()): - assert results.diff_rows == 1204 - else: - assert results.diff_rows == 1333 - - -@pytest.mark.slow -@pytest.mark.usefixtures("launch_mock_triton") -def test_minibert_data_col_name(config, tmp_path): - - results = _run_minibert(config, tmp_path, 'sid-minibert-onnx', True, "definitely_not_data") +@pytest.mark.parametrize("data_col_name", ["data", "definitely_not_data"]) +def test_minibert_truncated(config: Config, tmp_path: str, morpheus_log_level: int, data_col_name: str): + + results = _run_minibert(config=config, + tmp_path=tmp_path, + model_name='sid-minibert-onnx', + truncated=True, + data_col_name=data_col_name, + morpheus_log_level=morpheus_log_level) # Not sure why these are different if (CppConfig.get_should_use_cpp()): diff --git a/tests/test_sid_kafka.py b/tests/test_sid_kafka.py index ecc87de4b3..a50544c9c9 100755 --- a/tests/test_sid_kafka.py +++ b/tests/test_sid_kafka.py @@ -58,7 +58,8 @@ def test_minibert_no_cpp(mock_triton_client: mock.MagicMock, config: Config, kafka_bootstrap_servers: str, kafka_topics: KafkaTopics, - kafka_consumer: "KafkaConsumer"): + kafka_consumer: "KafkaConsumer", + morpheus_log_level: int): mock_metadata = { "inputs": [{ "name": "input_ids", "datatype": "INT32", "shape": [-1, FEATURE_LENGTH] @@ -117,7 +118,8 @@ def test_minibert_no_cpp(mock_triton_client: mock.MagicMock, add_special_tokens=False)) pipe.add_stage( TritonInferenceStage(config, model_name='sid-minibert-onnx', server_url='fake:001', force_convert_inputs=True)) - pipe.add_stage(MonitorStage(config, description="Inference Rate", smoothing=0.001, unit="inf")) + pipe.add_stage( + MonitorStage(config, description="Inference Rate", smoothing=0.001, unit="inf", log_level=morpheus_log_level)) pipe.add_stage(AddClassificationsStage(config, threshold=0.5, prefix="si_")) pipe.add_stage(SerializeStage(config)) pipe.add_stage( @@ -150,7 +152,8 @@ def test_minibert_cpp(dataset_pandas: DatasetManager, config: Config, kafka_bootstrap_servers: str, kafka_topics: KafkaTopics, - kafka_consumer: "KafkaConsumer"): + kafka_consumer: "KafkaConsumer", + morpheus_log_level: int): config.mode = PipelineModes.NLP config.class_labels = [ "address", @@ -187,7 +190,8 @@ def test_minibert_cpp(dataset_pandas: DatasetManager, model_name='sid-minibert-onnx', server_url='localhost:8001', force_convert_inputs=True)) - pipe.add_stage(MonitorStage(config, description="Inference Rate", smoothing=0.001, unit="inf")) + pipe.add_stage( + 
MonitorStage(config, description="Inference Rate", smoothing=0.001, unit="inf", log_level=morpheus_log_level)) pipe.add_stage(AddClassificationsStage(config, threshold=0.5, prefix="si_")) pipe.add_stage(SerializeStage(config)) pipe.add_stage( From 69b8f193495c90438a4f338d643bcb7894b6f63d Mon Sep 17 00:00:00 2001 From: David Gardner <96306125+dagardner-nv@users.noreply.github.com> Date: Wed, 24 Apr 2024 12:42:30 -0700 Subject: [PATCH 14/38] Fix mis-spelling of examples (#1664) ## By Submitting this PR I confirm: - I am familiar with the [Contributing Guidelines](https://github.com/nv-morpheus/Morpheus/blob/main/docs/source/developer_guide/contributing.md). - When the PR is ready for review, new or existing tests cover these changes. - When the PR is ready for review, the documentation is up to date with these changes. Authors: - David Gardner (https://github.com/dagardner-nv) Approvers: - Michael Demoret (https://github.com/mdemoret-nv) URL: https://github.com/nv-morpheus/Morpheus/pull/1664 --- examples/llm/agents/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/llm/agents/README.md b/examples/llm/agents/README.md index 00c84baee9..f336fac245 100644 --- a/examples/llm/agents/README.md +++ b/examples/llm/agents/README.md @@ -118,7 +118,7 @@ This example demonstrates the basic implementation of Morpheus pipeline, showcas ```bash -python exmaples/llm/main.py agents simple [OPTIONS] +python examples/llm/main.py agents simple [OPTIONS] ``` ### Options: @@ -170,7 +170,7 @@ kafka-topics.sh --bootstrap-server ${BOOTSTRAP_SERVER} --alter --topic input --p Now Kafka example can be run using the following command with the below listed options: ```bash -python exmaples/llm/main.py agents kafka [OPTIONS] +python examples/llm/main.py agents kafka [OPTIONS] ``` ### Options: From 1e7db1fc4bdfb35f3f133a6ccfd41318ebc581b5 Mon Sep 17 00:00:00 2001 From: David Gardner Date: Wed, 24 Apr 2024 12:55:10 -0700 Subject: [PATCH 15/38] Updating CHANGELOG --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 8386e6ac8e..ef1d9eff16 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -26,6 +26,7 @@ limitations under the License. - Document current known issues in 24.03.02 ([#1656](https://github.com/nv-morpheus/Morpheus/pull/1656)) [@dagardner-nv](https://github.com/dagardner-nv) - Fix documentation for building examples ([#1659](https://github.com/nv-morpheus/Morpheus/pull/1659)) [@dagardner-nv](https://github.com/dagardner-nv) - Fix type-o in documentation ([#1662](https://github.com/nv-morpheus/Morpheus/pull/1662)) [@dagardner-nv](https://github.com/dagardner-nv) +- Fix mis-spelling of examples ([#1664](https://github.com/nv-morpheus/Morpheus/pull/1664)) [@dagardner-nv](https://github.com/dagardner-nv) # Morpheus 24.03.01 (10 Apr 2024) From 31b610191f4c520360e2b18635a7486d4d6ef540 Mon Sep 17 00:00:00 2001 From: Christopher Harris Date: Wed, 24 Apr 2024 15:43:38 -0500 Subject: [PATCH 16/38] `ControlMessage` support in `TritonInferenceStage` and `PreallocatorMixin` (#1610) Introduces `ControlMessage` support for `TritonInferenceStage`, and updates some of the abp tests accordingly. Also updates `PreallocatorMixin` for `ControlMessage` support. ## By Submitting this PR I confirm: - I am familiar with the [Contributing Guidelines](https://github.com/nv-morpheus/Morpheus/blob/main/docs/source/developer_guide/contributing.md). - When the PR is ready for review, new or existing tests cover these changes. 
- When the PR is ready for review, the documentation is up to date with these changes. Authors: - Christopher Harris (https://github.com/cwharris) Approvers: - Michael Demoret (https://github.com/mdemoret-nv) URL: https://github.com/nv-morpheus/Morpheus/pull/1610 --- .../stages/inference_client_stage.hpp | 46 +++- .../include/morpheus/stages/preallocate.hpp | 8 + .../src/stages/inference_client_stage.cpp | 231 ++++++++++++++---- morpheus/_lib/src/stages/triton_inference.cpp | 9 +- morpheus/_lib/stages/__init__.pyi | 12 +- morpheus/_lib/stages/module.cpp | 33 ++- .../stages/test_triton_inference_stage.cpp | 3 +- morpheus/pipeline/preallocator_mixin.py | 15 +- morpheus/pipeline/stage_base.py | 2 + morpheus/stages/inference/inference_stage.py | 61 +---- .../inference/triton_inference_stage.py | 24 +- .../stages/postprocess/validation_stage.py | 3 +- .../preprocess/preprocess_base_stage.py | 14 +- tests/test_abp.py | 36 ++- tests/test_inference_stage.py | 62 ----- tests/test_sid.py | 19 +- 16 files changed, 365 insertions(+), 213 deletions(-) diff --git a/morpheus/_lib/include/morpheus/stages/inference_client_stage.hpp b/morpheus/_lib/include/morpheus/stages/inference_client_stage.hpp index 24d142184d..fd115de5af 100644 --- a/morpheus/_lib/include/morpheus/stages/inference_client_stage.hpp +++ b/morpheus/_lib/include/morpheus/stages/inference_client_stage.hpp @@ -18,10 +18,12 @@ #pragma once #include "morpheus/export.h" +#include "morpheus/messages/control.hpp" #include "morpheus/messages/multi_inference.hpp" #include "morpheus/messages/multi_response.hpp" #include "morpheus/types.hpp" +#include #include #include #include @@ -29,6 +31,7 @@ #include #include #include +#include #include #include @@ -93,12 +96,13 @@ class MORPHEUS_EXPORT IInferenceClient * @brief Perform inference with Triton Inference Server. * This class specifies which inference implementation category (Ex: NLP/FIL) is needed for inferencing. */ +template class MORPHEUS_EXPORT InferenceClientStage - : public mrc::pymrc::AsyncioRunnable, std::shared_ptr> + : public mrc::pymrc::AsyncioRunnable, std::shared_ptr> { public: - using sink_type_t = std::shared_ptr; - using source_type_t = std::shared_ptr; + using sink_type_t = std::shared_ptr; + using source_type_t = std::shared_ptr; /** * @brief Construct a new Inference Client Stage object @@ -117,11 +121,11 @@ class MORPHEUS_EXPORT InferenceClientStage std::vector output_mapping); /** - * Process a single MultiInferenceMessage by running the constructor-provided inference client against it's Tensor, - * and yields the result as a MultiResponseMessage + * Process a single InputT by running the constructor-provided inference client against it's Tensor, + * and yields the result as a OutputT */ - mrc::coroutines::AsyncGenerator> on_data( - std::shared_ptr&& data, std::shared_ptr on) override; + mrc::coroutines::AsyncGenerator> on_data( + std::shared_ptr&& data, std::shared_ptr on) override; private: std::string m_model_name; @@ -142,7 +146,7 @@ class MORPHEUS_EXPORT InferenceClientStage struct MORPHEUS_EXPORT InferenceClientStageInterfaceProxy { /** - * @brief Create and initialize a InferenceClientStage, and return the result + * @brief Create and initialize a MultiMessage-based InferenceClientStage, and return the result * * @param builder : Pipeline context object reference * @param name : Name of a stage reference @@ -152,9 +156,31 @@ struct MORPHEUS_EXPORT InferenceClientStageInterfaceProxy * @param needs_logits : Determines if logits are required. 
* @param inout_mapping : Dictionary used to map pipeline input/output names to Triton input/output names. Use this * if the Morpheus names do not match the model. - * @return std::shared_ptr> + * @return std::shared_ptr>> */ - static std::shared_ptr> init( + static std::shared_ptr>> + init_mm(mrc::segment::Builder& builder, + const std::string& name, + std::string model_name, + std::string server_url, + bool needs_logits, + std::map input_mapping, + std::map output_mapping); + + /** + * @brief Create and initialize a ControlMessage-based InferenceClientStage, and return the result + * + * @param builder : Pipeline context object reference + * @param name : Name of a stage reference + * @param model_name : Name of the model specifies which model can handle the inference requests that are sent to + * Triton inference + * @param server_url : Triton server URL. + * @param needs_logits : Determines if logits are required. + * @param inout_mapping : Dictionary used to map pipeline input/output names to Triton input/output names. Use this + * if the Morpheus names do not match the model. + * @return std::shared_ptr>> + */ + static std::shared_ptr>> init_cm( mrc::segment::Builder& builder, const std::string& name, std::string model_name, diff --git a/morpheus/_lib/include/morpheus/stages/preallocate.hpp b/morpheus/_lib/include/morpheus/stages/preallocate.hpp index 30b6b186c6..ab1cabdde0 100644 --- a/morpheus/_lib/include/morpheus/stages/preallocate.hpp +++ b/morpheus/_lib/include/morpheus/stages/preallocate.hpp @@ -17,6 +17,7 @@ #pragma once +#include "morpheus/messages/control.hpp" #include "morpheus/messages/meta.hpp" #include "morpheus/messages/multi.hpp" #include "morpheus/objects/dtype.hpp" // for TypeId @@ -51,11 +52,18 @@ void preallocate(std::shared_ptr msg, table.insert_missing_columns(columns); } +void preallocate(std::shared_ptr msg, + const std::vector>& columns) +{ + preallocate(msg->payload(), columns); +} + void preallocate(std::shared_ptr msg, const std::vector>& columns) { preallocate(msg->meta, columns); } + } // namespace /****** Component public implementations *******************/ diff --git a/morpheus/_lib/src/stages/inference_client_stage.cpp b/morpheus/_lib/src/stages/inference_client_stage.cpp index 069ccd557e..26428aa159 100644 --- a/morpheus/_lib/src/stages/inference_client_stage.cpp +++ b/morpheus/_lib/src/stages/inference_client_stage.cpp @@ -17,8 +17,13 @@ #include "morpheus/stages/inference_client_stage.hpp" +#include "morpheus/messages/control.hpp" #include "morpheus/messages/memory/response_memory.hpp" #include "morpheus/messages/memory/tensor_memory.hpp" +#include "morpheus/messages/meta.hpp" +#include "morpheus/messages/multi_inference.hpp" +#include "morpheus/messages/multi_response.hpp" +#include "morpheus/objects/data_table.hpp" #include "morpheus/objects/dev_mem_info.hpp" #include "morpheus/objects/dtype.hpp" #include "morpheus/objects/tensor.hpp" @@ -26,22 +31,26 @@ #include "morpheus/stages/triton_inference.hpp" #include "morpheus/utilities/matx_util.hpp" -#include #include #include #include +#include #include #include #include +#include #include #include #include +#include #include namespace { -static morpheus::ShapeType get_seq_ids(const morpheus::InferenceClientStage::sink_type_t& message) +using namespace morpheus; + +static ShapeType get_seq_ids(const std::shared_ptr& message) { // Take a copy of the sequence Ids allowing us to map rows in the response to rows in the dataframe // The output tensors we store in `reponse_memory` will all be of the same 
length as the the @@ -49,7 +58,7 @@ static morpheus::ShapeType get_seq_ids(const morpheus::InferenceClientStage::sin auto seq_ids = message->get_input("seq_ids"); const auto item_size = seq_ids.dtype().item_size(); - morpheus::ShapeType host_seq_ids(message->count); + ShapeType host_seq_ids(message->count); MRC_CHECK_CUDA(cudaMemcpy2D(host_seq_ids.data(), item_size, seq_ids.data(), @@ -61,35 +70,109 @@ static morpheus::ShapeType get_seq_ids(const morpheus::InferenceClientStage::sin return host_seq_ids; } -static void reduce_outputs(const morpheus::InferenceClientStage::sink_type_t& x, morpheus::TensorMap& output_tensors) +static ShapeType get_seq_ids(const std::shared_ptr& message) +{ + // Take a copy of the sequence Ids allowing us to map rows in the response to rows in the dataframe + // The output tensors we store in `reponse_memory` will all be of the same length as the the + // dataframe. seq_ids has three columns, but we are only interested in the first column. + auto seq_ids = message->tensors()->get_tensor("seq_ids"); + const auto item_size = seq_ids.dtype().item_size(); + + ShapeType host_seq_ids(message->tensors()->count); + MRC_CHECK_CUDA(cudaMemcpy2D(host_seq_ids.data(), + item_size, + seq_ids.data(), + seq_ids.stride(0) * item_size, + item_size, + host_seq_ids.size(), + cudaMemcpyDeviceToHost)); + + return host_seq_ids; +} + +static bool has_tensor(std::shared_ptr message, std::string const& tensor_name) +{ + return message->memory->has_tensor(tensor_name); +} + +static bool has_tensor(std::shared_ptr message, std::string const& tensor_name) +{ + return message->tensors()->has_tensor(tensor_name); +} + +static TensorObject get_tensor(std::shared_ptr message, std::string const& tensor_name) +{ + return message->get_input(tensor_name); +} + +static TensorObject get_tensor(std::shared_ptr message, std::string const& tensor_name) { + return message->tensors()->get_tensor(tensor_name); +} + +static void reduce_outputs(std::shared_ptr const& message, TensorMap& output_tensors) +{ + if (message->mess_count == message->count) + { + return; + } + // When our tensor lengths are longer than our dataframe we will need to use the seq_ids array to // lookup how the values should map back into the dataframe. 
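    // Concretely: when one dataframe row fans out into several tensor rows
    // (for example, a long text tokenized into multiple fixed-length
    // sequences), seq_ids[i] records the dataframe row that tensor row i
    // belongs to, and the reduce_max call below collapses the per-tensor-row
    // outputs back down to a single result per dataframe row.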
- auto host_seq_ids = get_seq_ids(x); + auto host_seq_ids = get_seq_ids(message); for (auto& mapping : output_tensors) { auto& output_tensor = mapping.second; - morpheus::ShapeType shape = output_tensor.get_shape(); - morpheus::ShapeType stride = output_tensor.get_stride(); + ShapeType shape = output_tensor.get_shape(); + ShapeType stride = output_tensor.get_stride(); - morpheus::ShapeType reduced_shape{shape}; - reduced_shape[0] = x->mess_count; + ShapeType reduced_shape{shape}; + reduced_shape[0] = message->mess_count; - auto reduced_buffer = morpheus::MatxUtil::reduce_max( - morpheus::DevMemInfo{ - output_tensor.data(), output_tensor.dtype(), output_tensor.get_memory(), shape, stride}, + auto reduced_buffer = MatxUtil::reduce_max( + DevMemInfo{output_tensor.data(), output_tensor.dtype(), output_tensor.get_memory(), shape, stride}, host_seq_ids, 0, reduced_shape); - output_tensor.swap( - morpheus::Tensor::create(std::move(reduced_buffer), output_tensor.dtype(), reduced_shape, stride, 0)); + output_tensor.swap(Tensor::create(std::move(reduced_buffer), output_tensor.dtype(), reduced_shape, stride, 0)); } } -static void apply_logits(morpheus::TensorMap& output_tensors) +static void reduce_outputs(std::shared_ptr const& message, TensorMap& output_tensors) +{ + if (message->payload()->count() == message->tensors()->count) + { + return; + } + + // When our tensor lengths are longer than our dataframe we will need to use the seq_ids array to + // lookup how the values should map back into the dataframe. + auto host_seq_ids = get_seq_ids(message); + + for (auto& mapping : output_tensors) + { + auto& output_tensor = mapping.second; + + ShapeType shape = output_tensor.get_shape(); + ShapeType stride = output_tensor.get_stride(); + + ShapeType reduced_shape{shape}; + reduced_shape[0] = message->payload()->count(); + + auto reduced_buffer = MatxUtil::reduce_max( + DevMemInfo{output_tensor.data(), output_tensor.dtype(), output_tensor.get_memory(), shape, stride}, + host_seq_ids, + 0, + reduced_shape); + + output_tensor.swap(Tensor::create(std::move(reduced_buffer), output_tensor.dtype(), reduced_shape, stride, 0)); + } +} + +static void apply_logits(TensorMap& output_tensors) { for (auto& mapping : output_tensors) { @@ -110,11 +193,12 @@ static void apply_logits(morpheus::TensorMap& output_tensors) namespace morpheus { -InferenceClientStage::InferenceClientStage(std::unique_ptr&& client, - std::string model_name, - bool needs_logits, - std::vector input_mapping, - std::vector output_mapping) : +template +InferenceClientStage::InferenceClientStage(std::unique_ptr&& client, + std::string model_name, + bool needs_logits, + std::vector input_mapping, + std::vector output_mapping) : m_model_name(std::move(model_name)), m_client(std::move(client)), m_needs_logits(needs_logits), @@ -149,8 +233,26 @@ struct ExponentialBackoff } }; -mrc::coroutines::AsyncGenerator> InferenceClientStage::on_data( - std::shared_ptr&& x, std::shared_ptr on) +static std::shared_ptr make_response(std::shared_ptr message, + TensorMap&& output_tensor_map) +{ + // Final output of all mini-batches + auto response_mem = std::make_shared(message->mess_count, std::move(output_tensor_map)); + + return std::make_shared( + message->meta, message->mess_offset, message->mess_count, std::move(response_mem), 0, response_mem->count); +} + +static std::shared_ptr make_response(std::shared_ptr message, + TensorMap&& output_tensor_map) +{ + message->tensors(std::make_shared(message->payload()->count(), std::move(output_tensor_map))); + return 
message; +} + +template +mrc::coroutines::AsyncGenerator> InferenceClientStage::on_data( + std::shared_ptr&& message, std::shared_ptr on) { int32_t retry_count = 0; @@ -192,9 +294,9 @@ mrc::coroutines::AsyncGenerator> Inference for (auto mapping : message_session->get_input_mappings(m_input_mapping)) { - if (x->memory->has_tensor(mapping.tensor_field_name)) + if (has_tensor(message, mapping.tensor_field_name)) { - model_input_tensors[mapping.model_field_name].swap(x->get_input(mapping.tensor_field_name)); + model_input_tensors[mapping.model_field_name].swap(get_tensor(message, mapping.tensor_field_name)); } } @@ -202,10 +304,7 @@ mrc::coroutines::AsyncGenerator> Inference co_await on->yield(); - if (x->mess_count != x->count) - { - reduce_outputs(x, model_output_tensors); - } + reduce_outputs(message, model_output_tensors); // If we need to do logits, do that here if (m_needs_logits) @@ -228,16 +327,28 @@ mrc::coroutines::AsyncGenerator> Inference } } - // Final output of all mini-batches - auto response_mem = std::make_shared(x->mess_count, std::move(output_tensor_map)); - - auto response = std::make_shared( - x->meta, x->mess_offset, x->mess_count, std::move(response_mem), 0, response_mem->count); + auto result = make_response(message, std::move(output_tensor_map)); - co_yield std::move(response); + co_yield result; co_return; + } catch (std::runtime_error ex) + { + auto lock = std::unique_lock(m_session_mutex); + + if (m_session == message_session) + { + m_session.reset(); + } + + if (m_retry_max >= 0 and ++retry_count > m_retry_max) + { + throw; + } + + LOG(WARNING) << "Exception while processing message for InferenceClientStage, attempting retry. ex.what(): " + << ex.what(); } catch (...) { auto lock = std::unique_lock(m_session_mutex); @@ -260,14 +371,45 @@ mrc::coroutines::AsyncGenerator> Inference } // ************ InferenceClientStageInterfaceProxy********* // -std::shared_ptr> InferenceClientStageInterfaceProxy::init( - mrc::segment::Builder& builder, - const std::string& name, - std::string server_url, - std::string model_name, - bool needs_logits, - std::map input_mappings, - std::map output_mappings) +std::shared_ptr>> +InferenceClientStageInterfaceProxy::init_mm(mrc::segment::Builder& builder, + const std::string& name, + std::string server_url, + std::string model_name, + bool needs_logits, + std::map input_mappings, + std::map output_mappings) +{ + std::vector input_mappings_{}; + std::vector output_mappings_{}; + + for (auto& mapping : input_mappings) + { + input_mappings_.emplace_back(TensorModelMapping{mapping.first, mapping.second}); + } + + for (auto& mapping : output_mappings) + { + output_mappings_.emplace_back(TensorModelMapping{mapping.first, mapping.second}); + } + + auto triton_client = std::make_unique(server_url); + auto triton_inference_client = std::make_unique(std::move(triton_client), model_name); + auto stage = builder.construct_object>( + name, std::move(triton_inference_client), model_name, needs_logits, input_mappings_, output_mappings_); + + return stage; +} + +// ************ InferenceClientStageInterfaceProxy********* // +std::shared_ptr>> +InferenceClientStageInterfaceProxy::init_cm(mrc::segment::Builder& builder, + const std::string& name, + std::string server_url, + std::string model_name, + bool needs_logits, + std::map input_mappings, + std::map output_mappings) { std::vector input_mappings_{}; std::vector output_mappings_{}; @@ -284,10 +426,13 @@ std::shared_ptr> InferenceClientStage auto triton_client = std::make_unique(server_url); auto 
triton_inference_client = std::make_unique(std::move(triton_client), model_name); - auto stage = builder.construct_object( + auto stage = builder.construct_object>( name, std::move(triton_inference_client), model_name, needs_logits, input_mappings_, output_mappings_); return stage; } +template class InferenceClientStage; +template class InferenceClientStage; + } // namespace morpheus diff --git a/morpheus/_lib/src/stages/triton_inference.cpp b/morpheus/_lib/src/stages/triton_inference.cpp index 6464c3be5d..30f100e7ea 100644 --- a/morpheus/_lib/src/stages/triton_inference.cpp +++ b/morpheus/_lib/src/stages/triton_inference.cpp @@ -476,12 +476,13 @@ mrc::coroutines::Task TritonInferenceClientSession::infer(TensorMap&& const uint8_t* output_ptr = nullptr; size_t output_ptr_size = 0; + CHECK_TRITON(results->RawData(model_output.name, &output_ptr, &output_ptr_size)); - DCHECK_EQ(stop - start, output_shape[0]); - DCHECK_EQ(output_tensor.bytes(), output_ptr_size); - DCHECK_NOTNULL(output_ptr); // NOLINT - DCHECK_NOTNULL(output_tensor.data()); // NOLINT + // DCHECK_EQ(stop - start, output_shape[0]); + // DCHECK_EQ(output_tensor.bytes(), output_ptr_size); + // DCHECK_NOTNULL(output_ptr); // NOLINT + // DCHECK_NOTNULL(output_tensor.data()); // NOLINT MRC_CHECK_CUDA(cudaMemcpy(output_tensor.data(), output_ptr, output_ptr_size, cudaMemcpyHostToDevice)); } diff --git a/morpheus/_lib/stages/__init__.pyi b/morpheus/_lib/stages/__init__.pyi index 515bab0c12..85767bdcef 100644 --- a/morpheus/_lib/stages/__init__.pyi +++ b/morpheus/_lib/stages/__init__.pyi @@ -24,8 +24,10 @@ __all__ = [ "FilterDetectionsStage", "FilterSource", "HttpServerSourceStage", - "InferenceClientStage", + "InferenceClientStageCM", + "InferenceClientStageMM", "KafkaSourceStage", + "PreallocateControlMessageStage", "PreallocateMessageMetaStage", "PreallocateMultiMessageStage", "PreprocessFILControlMessageStage", @@ -68,7 +70,10 @@ class FilterDetectionsStage(mrc.core.segment.SegmentObject): class HttpServerSourceStage(mrc.core.segment.SegmentObject): def __init__(self, builder: mrc.core.segment.Builder, name: str, bind_address: str = '127.0.0.1', port: int = 8080, endpoint: str = '/message', method: str = 'POST', accept_status: int = 201, sleep_time: float = 0.10000000149011612, queue_timeout: int = 5, max_queue_size: int = 1024, num_server_threads: int = 1, max_payload_size: int = 10485760, request_timeout: int = 30, lines: bool = False, stop_after: int = 0) -> None: ... pass -class InferenceClientStage(mrc.core.segment.SegmentObject): +class InferenceClientStageCM(mrc.core.segment.SegmentObject): + def __init__(self, builder: mrc.core.segment.Builder, name: str, server_url: str, model_name: str, needs_logits: bool, input_mapping: typing.Dict[str, str] = {}, output_mapping: typing.Dict[str, str] = {}) -> None: ... + pass +class InferenceClientStageMM(mrc.core.segment.SegmentObject): def __init__(self, builder: mrc.core.segment.Builder, name: str, server_url: str, model_name: str, needs_logits: bool, input_mapping: typing.Dict[str, str] = {}, output_mapping: typing.Dict[str, str] = {}) -> None: ... 
pass class KafkaSourceStage(mrc.core.segment.SegmentObject): @@ -77,6 +82,9 @@ class KafkaSourceStage(mrc.core.segment.SegmentObject): @typing.overload def __init__(self, builder: mrc.core.segment.Builder, name: str, max_batch_size: int, topics: typing.List[str], batch_timeout_ms: int, config: typing.Dict[str, str], disable_commits: bool = False, disable_pre_filtering: bool = False, stop_after: int = 0, async_commits: bool = True, oauth_callback: typing.Optional[function] = None) -> None: ... pass +class PreallocateControlMessageStage(mrc.core.segment.SegmentObject): + def __init__(self, builder: mrc.core.segment.Builder, name: str, needed_columns: typing.List[typing.Tuple[str, morpheus._lib.common.TypeId]]) -> None: ... + pass class PreallocateMessageMetaStage(mrc.core.segment.SegmentObject): def __init__(self, builder: mrc.core.segment.Builder, name: str, needed_columns: typing.List[typing.Tuple[str, morpheus._lib.common.TypeId]]) -> None: ... pass diff --git a/morpheus/_lib/stages/module.cpp b/morpheus/_lib/stages/module.cpp index 7b0d7ea293..6cdba387f0 100644 --- a/morpheus/_lib/stages/module.cpp +++ b/morpheus/_lib/stages/module.cpp @@ -18,6 +18,8 @@ #include "morpheus/messages/control.hpp" #include "morpheus/messages/meta.hpp" #include "morpheus/messages/multi.hpp" +#include "morpheus/messages/multi_inference.hpp" +#include "morpheus/messages/multi_response.hpp" #include "morpheus/objects/file_types.hpp" #include "morpheus/stages/add_classification.hpp" #include "morpheus/stages/add_scores.hpp" @@ -164,11 +166,25 @@ PYBIND11_MODULE(stages, _module) py::arg("filter_source"), py::arg("field_name") = "probs"); - py::class_, + py::class_< + mrc::segment::Object>, + mrc::segment::ObjectProperties, + std::shared_ptr>>>( + _module, "InferenceClientStageMM", py::multiple_inheritance()) + .def(py::init<>(&InferenceClientStageInterfaceProxy::init_mm), + py::arg("builder"), + py::arg("name"), + py::arg("server_url"), + py::arg("model_name"), + py::arg("needs_logits"), + py::arg("input_mapping") = py::dict(), + py::arg("output_mapping") = py::dict()); + + py::class_>, mrc::segment::ObjectProperties, - std::shared_ptr>>( - _module, "InferenceClientStage", py::multiple_inheritance()) - .def(py::init<>(&InferenceClientStageInterfaceProxy::init), + std::shared_ptr>>>( + _module, "InferenceClientStageCM", py::multiple_inheritance()) + .def(py::init<>(&InferenceClientStageInterfaceProxy::init_cm), py::arg("builder"), py::arg("name"), py::arg("server_url"), @@ -206,6 +222,15 @@ PYBIND11_MODULE(stages, _module) py::arg("async_commits") = true, py::arg("oauth_callback") = py::none()); + py::class_>, + mrc::segment::ObjectProperties, + std::shared_ptr>>>( + _module, "PreallocateControlMessageStage", py::multiple_inheritance()) + .def(py::init<>(&PreallocateStageInterfaceProxy::init), + py::arg("builder"), + py::arg("name"), + py::arg("needed_columns")); + py::class_>, mrc::segment::ObjectProperties, std::shared_ptr>>>( diff --git a/morpheus/_lib/tests/stages/test_triton_inference_stage.cpp b/morpheus/_lib/tests/stages/test_triton_inference_stage.cpp index c7a566b011..df7785d259 100644 --- a/morpheus/_lib/tests/stages/test_triton_inference_stage.cpp +++ b/morpheus/_lib/tests/stages/test_triton_inference_stage.cpp @@ -309,7 +309,8 @@ TEST_F(TestTritonInferenceStage, SingleRow) // create the fake triton client used for testing. 
auto triton_client = std::make_unique(); auto triton_inference_client = std::make_unique(std::move(triton_client), ""); - auto stage = morpheus::InferenceClientStage(std::move(triton_inference_client), "", false, {}, {}); + auto stage = morpheus::InferenceClientStage( + std::move(triton_inference_client), "", false, {}, {}); // manually invoke the stage and iterate through the inference responses auto on = std::make_shared(); diff --git a/morpheus/pipeline/preallocator_mixin.py b/morpheus/pipeline/preallocator_mixin.py index 61e9cd3702..c40ed6be04 100644 --- a/morpheus/pipeline/preallocator_mixin.py +++ b/morpheus/pipeline/preallocator_mixin.py @@ -28,6 +28,7 @@ from morpheus.common import TypeId from morpheus.common import typeid_to_numpy_str from morpheus.config import CppConfig +from morpheus.messages import ControlMessage from morpheus.messages import MessageMeta from morpheus.messages import MultiMessage from morpheus.utils.type_aliases import DataFrameType @@ -85,6 +86,10 @@ def _preallocate_multi(self, msg: MultiMessage) -> MultiMessage: self._preallocate_meta(msg.meta) return msg + def _preallocate_control(self, msg: ControlMessage) -> ControlMessage: + self._preallocate_meta(msg.payload()) + return msg + def _post_build_single(self, builder: mrc.Builder, out_node: mrc.SegmentObject) -> mrc.SegmentObject: out_type = self.output_ports[0].output_type pretty_type = pretty_print_type_name(out_type) @@ -92,17 +97,21 @@ def _post_build_single(self, builder: mrc.Builder, out_node: mrc.SegmentObject) if len(self._needed_columns) > 0: node_name = f"{self.unique_name}-preallocate" - if issubclass(out_type, (MessageMeta, MultiMessage)): + if issubclass(out_type, (ControlMessage, MessageMeta, MultiMessage)): # Intentionally not using `_build_cpp_node` because `LinearBoundaryIngressStage` lacks a C++ impl if CppConfig.get_should_use_cpp(): import morpheus._lib.stages as _stages needed_columns = list(self._needed_columns.items()) - if issubclass(out_type, MessageMeta): + if issubclass(out_type, ControlMessage): + node = _stages.PreallocateControlMessageStage(builder, node_name, needed_columns) + elif issubclass(out_type, MessageMeta): node = _stages.PreallocateMessageMetaStage(builder, node_name, needed_columns) else: node = _stages.PreallocateMultiMessageStage(builder, node_name, needed_columns) else: - if issubclass(out_type, MessageMeta): + if issubclass(out_type, ControlMessage): + node = builder.make_node(node_name, ops.map(self._preallocate_control)) + elif issubclass(out_type, MessageMeta): node = builder.make_node(node_name, ops.map(self._preallocate_meta)) else: node = builder.make_node(node_name, ops.map(self._preallocate_multi)) diff --git a/morpheus/pipeline/stage_base.py b/morpheus/pipeline/stage_base.py index 3aa3b2f450..290ed83992 100644 --- a/morpheus/pipeline/stage_base.py +++ b/morpheus/pipeline/stage_base.py @@ -80,6 +80,8 @@ class StageBase(ABC, collections.abc.Hashable): __ID_COUNTER = AtomicInteger(0) + _schema: _pipeline.StageSchema + def __init__(self, config: Config): # Save the config self._config = config diff --git a/morpheus/stages/inference/inference_stage.py b/morpheus/stages/inference/inference_stage.py index 579ddccd53..8b1fa75d3a 100644 --- a/morpheus/stages/inference/inference_stage.py +++ b/morpheus/stages/inference/inference_stage.py @@ -16,7 +16,6 @@ import typing from abc import abstractmethod from functools import partial -from functools import reduce import cupy as cp import mrc @@ -192,13 +191,13 @@ def accepted_types(self) -> typing.Tuple: typing.Tuple 
Tuple of input types. """ - if (self._build_cpp_node()): - return (MultiInferenceMessage, ) - return (MultiInferenceMessage, ControlMessage) def compute_schema(self, schema: StageSchema): - schema.output_schema.set_type(MultiResponseMessage) + if schema.input_type == ControlMessage: + schema.output_schema.set_type(ControlMessage) + else: + schema.output_schema.set_type(MultiResponseMessage) def supports_cpp_node(self): # Default to False unless derived classes override this value @@ -285,10 +284,10 @@ def set_output_fut(resp: TensorMemory, inner_batch, batch_future: mrc.Future): if (isinstance(_message, ControlMessage)): _df = cudf.DataFrame(output_message.get_meta()) if (_df is not None and not _df.empty): - embeddings = output_message.get_probs_tensor() - _df["embedding"] = embeddings.tolist() _message_meta = CppMessageMeta(df=_df) _message.payload(_message_meta) + _message.tensors().set_tensor("probs", output_message.get_probs_tensor()) + print(_df) output_message = _message return output_message @@ -369,54 +368,6 @@ def _split_batches(x: MultiInferenceMessage, max_batch_size: int) -> typing.List return out_resp - @staticmethod - def _convert_response( - x: typing.Tuple[typing.List[MultiInferenceMessage], typing.List[TensorMemory]]) -> MultiResponseMessage: - - # Convert a MultiInferenceMessage into a MultiResponseMessage - in_message = x[0] - out_message = x[1] - - assert len(in_message) == len(out_message) - - # Get the total output size - total_mess_count = reduce(lambda y, z: y + z.mess_count, in_message, 0) - - # Create a message data to store the entire list - probs = cp.zeros((total_mess_count, out_message[0].get_tensor('probs').shape[1])) - - saved_offset = in_message[0].mess_offset - saved_count = 0 - - for inf, res in zip(in_message, out_message): - - # Ensure they all share the same meta object. Otherwise this doesn't work - # assert inf.meta is saved_meta - - # Make sure we have a continuous list - assert inf.mess_offset == saved_offset + saved_count - - assert inf.count == res.count - - # Two scenarios: - if (inf.mess_count == inf.count): - # In message and out message have same count. 
Just use probs as is - probs[inf.offset:inf.offset + inf.count, :] = res.get_output('probs') - else: - mess_ids = inf.get_tensor("seq_ids")[:, 0].get().tolist() - - # Out message has more reponses, so we have to do key based blending of probs - for i, idx in enumerate(mess_ids): - probs[idx, :] = cp.maximum(probs[idx, :], res.get_output('probs')[i, :]) - - saved_count += inf.mess_count - - assert saved_count == total_mess_count, "Did not set every element in output" - - memory = TensorMemory(count=total_mess_count, tensors={'probs': probs}) - - return MultiResponseMessage.from_message(in_message[0], mess_count=saved_count, memory=memory) - @staticmethod def _convert_one_response(output: MultiResponseMessage, inf: MultiInferenceMessage, res: TensorMemory): # Make sure we have a continuous list diff --git a/morpheus/stages/inference/triton_inference_stage.py b/morpheus/stages/inference/triton_inference_stage.py index e6c5c0fbb7..0b8a79dddf 100644 --- a/morpheus/stages/inference/triton_inference_stage.py +++ b/morpheus/stages/inference/triton_inference_stage.py @@ -32,6 +32,7 @@ from morpheus.cli.register_stage import register_stage from morpheus.config import Config from morpheus.config import PipelineModes +from morpheus.messages import ControlMessage from morpheus.messages import MultiInferenceMessage from morpheus.messages.memory.tensor_memory import TensorMemory from morpheus.stages.inference.inference_stage import InferenceStage @@ -774,13 +775,22 @@ def _get_inference_worker(self, inf_queue: ProducerConsumerQueue) -> TritonInfer needs_logits=self._needs_logits) def _get_cpp_inference_node(self, builder: mrc.Builder) -> mrc.SegmentObject: - return _stages.InferenceClientStage(builder, - self.unique_name, - self._server_url, - self._model_name, - self._needs_logits, - self._input_mapping, - self._output_mapping) + if self._schema.input_type == ControlMessage: + return _stages.InferenceClientStageCM(builder, + self.unique_name, + self._server_url, + self._model_name, + self._needs_logits, + self._input_mapping, + self._output_mapping) + + return _stages.InferenceClientStageMM(builder, + self.unique_name, + self._server_url, + self._model_name, + self._needs_logits, + self._input_mapping, + self._output_mapping) def _build_single(self, builder: mrc.Builder, input_node: mrc.SegmentObject) -> mrc.SegmentObject: node = super()._build_single(builder, input_node) diff --git a/morpheus/stages/postprocess/validation_stage.py b/morpheus/stages/postprocess/validation_stage.py index 1d62f18cab..7ae46db06f 100644 --- a/morpheus/stages/postprocess/validation_stage.py +++ b/morpheus/stages/postprocess/validation_stage.py @@ -23,6 +23,7 @@ from morpheus.cli.register_stage import register_stage from morpheus.config import Config +from morpheus.messages import ControlMessage from morpheus.messages import MultiMessage from morpheus.stages.output.compare_dataframe_stage import CompareDataFrameStage @@ -114,7 +115,7 @@ def accepted_types(self) -> typing.Tuple: Accepted input types. 
""" - return (MultiMessage, ) + return (MultiMessage, ControlMessage) def _do_comparison(self): results = self.get_results(clear=False) diff --git a/morpheus/stages/preprocess/preprocess_base_stage.py b/morpheus/stages/preprocess/preprocess_base_stage.py index 3731912026..f115e38053 100644 --- a/morpheus/stages/preprocess/preprocess_base_stage.py +++ b/morpheus/stages/preprocess/preprocess_base_stage.py @@ -61,15 +61,15 @@ def compute_schema(self, schema: StageSchema): if (schema.input_type == ControlMessage): self._use_control_message = True out_type = ControlMessage + self._preprocess_fn = self._get_preprocess_fn() else: self._use_control_message = False - - self._preprocess_fn = self._get_preprocess_fn() - preproc_sig = inspect.signature(self._preprocess_fn) - # If the innerfunction returns a type annotation, update the output type - if (preproc_sig.return_annotation - and typing_utils.issubtype(preproc_sig.return_annotation, MultiInferenceMessage)): - out_type = preproc_sig.return_annotation + self._preprocess_fn = self._get_preprocess_fn() + preproc_sig = inspect.signature(self._preprocess_fn) + # If the innerfunction returns a type annotation, update the output type + if (preproc_sig.return_annotation + and typing_utils.issubtype(preproc_sig.return_annotation, MultiInferenceMessage)): + out_type = preproc_sig.return_annotation schema.output_schema.set_type(out_type) diff --git a/tests/test_abp.py b/tests/test_abp.py index a3248deb7e..334f87afeb 100755 --- a/tests/test_abp.py +++ b/tests/test_abp.py @@ -27,6 +27,7 @@ from morpheus.config import Config from morpheus.config import ConfigFIL from morpheus.config import PipelineModes +from morpheus.messages import ControlMessage from morpheus.messages import MessageMeta from morpheus.messages import MultiInferenceMessage from morpheus.messages import MultiMessage @@ -116,7 +117,8 @@ def test_abp_no_cpp(mock_triton_client: mock.MagicMock, config: Config, tmp_path @pytest.mark.slow @pytest.mark.use_cpp @pytest.mark.usefixtures("launch_mock_triton") -def test_abp_cpp(config: Config, tmp_path: str, morpheus_log_level: int): +@pytest.mark.parametrize("message_type", [MultiMessage, ControlMessage]) +def test_abp_cpp(config: Config, tmp_path: str, message_type: type, morpheus_log_level: int): config.mode = PipelineModes.FIL config.class_labels = ["mining"] config.model_max_batch_size = MODEL_MAX_BATCH_SIZE @@ -134,7 +136,7 @@ def test_abp_cpp(config: Config, tmp_path: str, morpheus_log_level: int): pipe = LinearPipeline(config) pipe.set_source(FileSourceStage(config, filename=val_file_name, iterative=False)) - pipe.add_stage(DeserializeStage(config)) + pipe.add_stage(DeserializeStage(config, message_type=message_type)) pipe.add_stage(PreprocessFILStage(config)) # We are feeding TritonInferenceStage the port to the grpc server because that is what the validation tests do @@ -147,11 +149,16 @@ def test_abp_cpp(config: Config, tmp_path: str, morpheus_log_level: int): pipe.add_stage(AddClassificationsStage(config)) pipe.add_stage(AddScoresStage(config, prefix="score_")) pipe.add_stage( - ValidationStage(config, val_file_name=val_file_name, results_file_name=results_file_name, rel_tol=0.05)) + ValidationStage(config, + val_file_name=val_file_name, + results_file_name=results_file_name, + rel_tol=0.05, + overwrite=True)) pipe.add_stage(SerializeStage(config)) - pipe.add_stage(WriteToFileStage(config, filename=out_file, overwrite=False)) + pipe.add_stage(WriteToFileStage(config, filename=out_file, overwrite=True)) pipe.run() + 
compare_class_to_scores(out_file, config.class_labels, '', 'score_', threshold=0.5) results = calc_error_val(results_file_name) assert results.diff_rows == 0 @@ -243,7 +250,14 @@ def test_abp_multi_segment_no_cpp(mock_triton_client: mock.MagicMock, @pytest.mark.slow @pytest.mark.use_cpp @pytest.mark.usefixtures("launch_mock_triton") -def test_abp_multi_segment_cpp(config, tmp_path): +@pytest.mark.parametrize("message_type", [MultiMessage, ControlMessage]) +def test_abp_multi_segment_cpp(config, tmp_path, message_type): + + def get_boundary_type(boundary_type): + if message_type == ControlMessage: + return ControlMessage + return boundary_type + config.mode = PipelineModes.FIL config.class_labels = ["mining"] config.model_max_batch_size = MODEL_MAX_BATCH_SIZE @@ -261,13 +275,13 @@ def test_abp_multi_segment_cpp(config, tmp_path): pipe = LinearPipeline(config) pipe.set_source(FileSourceStage(config, filename=val_file_name, iterative=False)) - pipe.add_stage(DeserializeStage(config)) + pipe.add_stage(DeserializeStage(config, message_type=message_type)) - pipe.add_segment_boundary(MultiMessage) # Boundary 1 + pipe.add_segment_boundary(get_boundary_type(MultiMessage)) # Boundary 1 pipe.add_stage(PreprocessFILStage(config)) - pipe.add_segment_boundary(MultiInferenceMessage) # Boundary 2 + pipe.add_segment_boundary(get_boundary_type(MultiInferenceMessage)) # Boundary 2 # We are feeding TritonInferenceStage the port to the grpc server because that is what the validation tests do # but the code under-the-hood replaces this with the port number of the http server @@ -275,17 +289,17 @@ def test_abp_multi_segment_cpp(config, tmp_path): TritonInferenceStage(config, model_name='abp-nvsmi-xgb', server_url='localhost:8001', force_convert_inputs=True)) - pipe.add_segment_boundary(MultiResponseMessage) # Boundary 3 + pipe.add_segment_boundary(get_boundary_type(MultiResponseMessage)) # Boundary 3 pipe.add_stage(MonitorStage(config, description="Inference Rate", smoothing=0.001, unit="inf")) pipe.add_stage(AddClassificationsStage(config)) - pipe.add_segment_boundary(MultiResponseMessage) # Boundary 4 + pipe.add_segment_boundary(get_boundary_type(MultiResponseMessage)) # Boundary 4 pipe.add_stage( ValidationStage(config, val_file_name=val_file_name, results_file_name=results_file_name, rel_tol=0.05)) - pipe.add_segment_boundary(MultiResponseMessage) # Boundary 5 + pipe.add_segment_boundary(get_boundary_type(MultiResponseMessage)) # Boundary 5 pipe.add_stage(SerializeStage(config)) diff --git a/tests/test_inference_stage.py b/tests/test_inference_stage.py index ee1989f2f9..e34f5a5bd4 100755 --- a/tests/test_inference_stage.py +++ b/tests/test_inference_stage.py @@ -25,7 +25,6 @@ from _utils.inference_worker import IW from morpheus.messages import ResponseMemory from morpheus.messages.memory.inference_memory import InferenceMemory -from morpheus.messages.memory.tensor_memory import TensorMemory from morpheus.messages.message_meta import MessageMeta from morpheus.messages.multi_inference_message import MultiInferenceMessage from morpheus.messages.multi_response_message import MultiResponseMessage @@ -121,67 +120,6 @@ def test_split_batches(): mock_message.get_slice.assert_has_calls([mock.call(0, 3), mock.call(3, 7), mock.call(7, 10)]) -@pytest.mark.use_python -def test_convert_response(): - # Pylint currently fails to work with classmethod: https://github.com/pylint-dev/pylint/issues/981 - # pylint: disable=no-member - - message_sizes = [3, 2, 1, 7, 4] - total_size = sum(message_sizes) - - full_input = 
_mk_message(mess_count=total_size, count=total_size) - - input_messages = [ - full_input.get_slice(sum(message_sizes[:i]), sum(message_sizes[:i]) + size) for i, - size in enumerate(message_sizes) - ] - - full_output = cp.random.rand(total_size, 3) - output_memory = [] - - for i, count in enumerate(message_sizes): - output_memory.append( - ResponseMemory(count=count, - tensors={"probs": full_output[sum(message_sizes[:i]):sum(message_sizes[:i]) + count, :]})) - - resp = InferenceStageT._convert_response((input_messages, output_memory)) - assert isinstance(resp, MultiResponseMessage) - assert resp.meta == full_input.meta - assert resp.mess_offset == 0 - assert resp.mess_count == total_size - assert isinstance(resp.memory, TensorMemory) - assert resp.offset == 0 - assert resp.count == total_size - assert (resp.memory.get_tensor("probs") == full_output).all() - - -def test_convert_response_errors(): - # Length of input messages doesn't match length of output messages - with pytest.raises(AssertionError): - InferenceStageT._convert_response(([1, 2, 3], [1, 2])) - - # Message offst of the second message doesn't line up offset+count of the first - msg1 = _mk_message() - msg2 = _mk_message(mess_offset=12) - - out_msg1 = ResponseMemory(count=1, tensors={"probs": cp.random.rand(1, 3)}) - out_msg2 = ResponseMemory(count=1, tensors={"probs": cp.random.rand(1, 3)}) - - with pytest.raises(AssertionError): - InferenceStageT._convert_response(([msg1, msg2], [out_msg1, out_msg2])) - - # mess_coutn and count don't match for msg2, and msg2.count != out_msg2.count - msg = _mk_message(mess_count=2, count=2) - msg1 = msg.get_slice(0, 1) - msg2 = msg.get_slice(1, 2) - - out_msg1 = ResponseMemory(count=1, tensors={"probs": cp.random.rand(1, 3)}) - out_msg2 = ResponseMemory(count=2, tensors={"probs": cp.random.rand(2, 3)}) - - with pytest.raises(AssertionError): - InferenceStageT._convert_response(([msg1, msg2], [out_msg1, out_msg2])) - - @pytest.mark.use_python def test_convert_one_response(): # Pylint currently fails to work with classmethod: https://github.com/pylint-dev/pylint/issues/981 diff --git a/tests/test_sid.py b/tests/test_sid.py index 2221abe930..b36903fd82 100755 --- a/tests/test_sid.py +++ b/tests/test_sid.py @@ -28,6 +28,8 @@ from morpheus.config import Config from morpheus.config import CppConfig from morpheus.config import PipelineModes +from morpheus.messages import ControlMessage +from morpheus.messages import MultiMessage from morpheus.pipeline import LinearPipeline from morpheus.stages.general.monitor_stage import MonitorStage from morpheus.stages.inference.triton_inference_stage import TritonInferenceStage @@ -49,6 +51,7 @@ def _run_minibert_pipeline( *, config: Config, tmp_path: str, + message_type: type, model_name: str, truncated: bool, morpheus_log_level: int, @@ -99,7 +102,7 @@ def _run_minibert_pipeline( pipe = LinearPipeline(config) pipe.set_source(FileSourceStage(config, filename=val_file_name, iterative=False)) - pipe.add_stage(DeserializeStage(config)) + pipe.add_stage(DeserializeStage(config, message_type=message_type)) pipe.add_stage( PreprocessNLPStage(config, vocab_hash_file=vocab_file_name, @@ -126,6 +129,7 @@ def _run_minibert_pipeline( def _run_minibert(*, config: Config, tmp_path: str, + message_type: type, model_name: str, truncated: bool, morpheus_log_level: int, @@ -163,6 +167,7 @@ def _run_minibert(*, return _run_minibert_pipeline(config=config, tmp_path=tmp_path, + message_type=message_type, model_name=model_name, truncated=truncated, data_col_name=data_col_name, @@ 
-172,10 +177,12 @@ def _run_minibert(*, @pytest.mark.slow @pytest.mark.use_cpp @pytest.mark.usefixtures("launch_mock_triton") -def test_minibert_no_trunc(config: Config, tmp_path: str, morpheus_log_level: int): +@pytest.mark.parametrize("message_type", [MultiMessage, ControlMessage]) +def test_minibert_no_trunc(config: Config, tmp_path: str, message_type: type, morpheus_log_level: int): results = _run_minibert(config=config, tmp_path=tmp_path, + message_type=message_type, model_name="sid-minibert-onnx-no-trunc", truncated=False, morpheus_log_level=morpheus_log_level) @@ -190,10 +197,16 @@ def test_minibert_no_trunc(config: Config, tmp_path: str, morpheus_log_level: in @pytest.mark.slow @pytest.mark.usefixtures("launch_mock_triton") @pytest.mark.parametrize("data_col_name", ["data", "definitely_not_data"]) -def test_minibert_truncated(config: Config, tmp_path: str, morpheus_log_level: int, data_col_name: str): +@pytest.mark.parametrize("message_type", [MultiMessage, ControlMessage]) +def test_minibert_truncated(config: Config, + tmp_path: str, + message_type: type, + morpheus_log_level: int, + data_col_name: str): results = _run_minibert(config=config, tmp_path=tmp_path, + message_type=message_type, model_name='sid-minibert-onnx', truncated=True, data_col_name=data_col_name, From 57d11a290365b1232a48812cb6ee7bda9ffccb7b Mon Sep 17 00:00:00 2001 From: David Gardner <96306125+dagardner-nv@users.noreply.github.com> Date: Wed, 1 May 2024 12:54:45 -0700 Subject: [PATCH 17/38] Truncate strings exceeding max_length when inserting to Milvus (#1665) * Adds new helper methods to `morpheus.io.utils`, `cudf_string_cols_exceed_max_bytes` and `truncate_string_cols_by_bytes` * When `truncate_long_strings=True` `MilvusVectorDBResourceService` will truncate all `VARCHAR` fields according to the schema's `max_length` * Add `truncate_long_strings=True` in config for `vdb_upload` pipeline * Set C++ mode to default for example LLM pipelines * Remove issues 1650 & 1651 from `known_issues.md` Closes #1650 Closes #1651 ## By Submitting this PR I confirm: - I am familiar with the [Contributing Guidelines](https://github.com/nv-morpheus/Morpheus/blob/main/docs/source/developer_guide/contributing.md). - When the PR is ready for review, new or existing tests cover these changes. - When the PR is ready for review, the documentation is up to date with these changes. Authors: - David Gardner (https://github.com/dagardner-nv) Approvers: - Michael Demoret (https://github.com/mdemoret-nv) URL: https://github.com/nv-morpheus/Morpheus/pull/1665 --- docs/source/extra_info/known_issues.md | 2 - examples/llm/cli.py | 2 +- examples/llm/vdb_upload/pipeline.py | 19 ++- examples/llm/vdb_upload/vdb_utils.py | 3 +- morpheus/io/utils.py | 96 ++++++++++++ .../service/vdb/milvus_vector_db_service.py | 73 ++++++--- morpheus/stages/inference/inference_stage.py | 8 +- morpheus/utils/type_aliases.py | 1 + tests/conftest.py | 6 + tests/io/test_io_utils.py | 134 +++++++++++++++++ tests/test_milvus_vector_db_service.py | 138 ++++++++++++++++++ .../milvus_string_collection_conf.json | 3 + 12 files changed, 457 insertions(+), 28 deletions(-) create mode 100755 tests/io/test_io_utils.py create mode 100644 tests/tests_data/service/milvus_string_collection_conf.json diff --git a/docs/source/extra_info/known_issues.md b/docs/source/extra_info/known_issues.md index 014fac3471..9eeb53508e 100644 --- a/docs/source/extra_info/known_issues.md +++ b/docs/source/extra_info/known_issues.md @@ -19,7 +19,5 @@ limitations under the License. 
- TrainAEStage fails with a Segmentation fault ([#1641](https://github.com/nv-morpheus/Morpheus/pull/1641)) - vdb_upload example pipeline triggers an internal error in Triton ([#1649](https://github.com/nv-morpheus/Morpheus/pull/1649)) -- vdb_upload example pipeline error on inserting large strings ([#1650](https://github.com/nv-morpheus/Morpheus/pull/1650)) -- vdb_upload example pipeline only works with C++ mode disabled ([#1651](https://github.com/nv-morpheus/Morpheus/pull/1651)) Refer to [open issues in the Morpheus project](https://github.com/nv-morpheus/Morpheus/issues) diff --git a/examples/llm/cli.py b/examples/llm/cli.py index 1ea9198dc1..c8aea20320 100644 --- a/examples/llm/cli.py +++ b/examples/llm/cli.py @@ -32,7 +32,7 @@ callback=parse_log_level, help="Specify the logging level to use.") @click.option('--use_cpp', - default=False, + default=True, type=bool, help=("Whether or not to use C++ node and message types or to prefer python. " "Only use as a last resort if bugs are encountered")) diff --git a/examples/llm/vdb_upload/pipeline.py b/examples/llm/vdb_upload/pipeline.py index 494446d16c..5d5fbee8e4 100644 --- a/examples/llm/vdb_upload/pipeline.py +++ b/examples/llm/vdb_upload/pipeline.py @@ -19,7 +19,9 @@ from vdb_upload.helper import process_vdb_sources from morpheus.config import Config +from morpheus.messages import ControlMessage from morpheus.pipeline.pipeline import Pipeline +from morpheus.pipeline.stage_decorator import stage from morpheus.stages.general.monitor_stage import MonitorStage from morpheus.stages.general.trigger_stage import TriggerStage from morpheus.stages.inference.triton_inference_stage import TritonInferenceStage @@ -78,6 +80,20 @@ def pipeline(pipeline_config: Config, monitor_2 = pipe.add_stage( MonitorStage(pipeline_config, description="Inference rate", unit="events", delayed_start=True)) + @stage + def embedding_tensor_to_df(message: ControlMessage, *, embedding_tensor_name='probs') -> ControlMessage: + """ + Copies the probs tensor to the 'embedding' field of the dataframe. 
+ """ + msg_meta = message.payload() + with msg_meta.mutable_dataframe() as df: + embedding_tensor = message.tensors().get_tensor(embedding_tensor_name) + df['embedding'] = embedding_tensor.tolist() + + return message + + embedding_tensor_to_df_stage = pipe.add_stage(embedding_tensor_to_df(pipeline_config)) + vector_db = pipe.add_stage(WriteToVectorDBStage(pipeline_config, **vdb_config)) monitor_3 = pipe.add_stage( @@ -96,7 +112,8 @@ def pipeline(pipeline_config: Config, pipe.add_edge(nlp_stage, monitor_1) pipe.add_edge(monitor_1, embedding_stage) pipe.add_edge(embedding_stage, monitor_2) - pipe.add_edge(monitor_2, vector_db) + pipe.add_edge(monitor_2, embedding_tensor_to_df_stage) + pipe.add_edge(embedding_tensor_to_df_stage, vector_db) pipe.add_edge(vector_db, monitor_3) start_time = time.time() diff --git a/examples/llm/vdb_upload/vdb_utils.py b/examples/llm/vdb_upload/vdb_utils.py index d3aed615d7..7740acbc7c 100644 --- a/examples/llm/vdb_upload/vdb_utils.py +++ b/examples/llm/vdb_upload/vdb_utils.py @@ -315,14 +315,15 @@ def build_cli_configs(source_type, cli_vdb_conf = { # Vector db upload has some significant transaction overhead, batch size here should be as large as possible 'batch_size': 16384, - 'resource_name': vector_db_resource_name, 'embedding_size': embedding_size, 'recreate': True, + 'resource_name': vector_db_resource_name, 'resource_schemas': { vector_db_resource_name: build_defualt_milvus_config(embedding_size) if (vector_db_service == 'milvus') else None, }, 'service': vector_db_service, + 'truncate_long_strings': True, 'uri': vector_db_uri, } diff --git a/morpheus/io/utils.py b/morpheus/io/utils.py index 7c4cfce260..d8b286a8e8 100644 --- a/morpheus/io/utils.py +++ b/morpheus/io/utils.py @@ -14,7 +14,16 @@ # limitations under the License. """IO utilities.""" +import logging + +import pandas as pd + +import cudf + from morpheus.utils.type_aliases import DataFrameType +from morpheus.utils.type_aliases import SeriesType + +logger = logging.getLogger(__name__) def filter_null_data(x: DataFrameType): @@ -31,3 +40,90 @@ def filter_null_data(x: DataFrameType): return x return x[~x['data'].isna()] + + +def cudf_string_cols_exceed_max_bytes(df: cudf.DataFrame, column_max_bytes: dict[str, int]) -> bool: + """ + Checks a cudf DataFrame for string columns that exceed a maximum number of bytes and thus need to be truncated by + calling `truncate_string_cols_by_bytes`. + + This method utilizes a cudf method `Series.str.byte_count()` method that pandas lacks, which can avoid a costly + call to truncate_string_cols_by_bytes. + + Parameters + ---------- + df : DataFrameType + The dataframe to check. + column_max_bytes: dict[str, int] + A mapping of string column names to the maximum number of bytes for each column. + + Returns + ------- + bool + True if truncation is needed, False otherwise. + """ + if not isinstance(df, cudf.DataFrame): + raise ValueError("Expected cudf DataFrame") + + for (col, max_bytes) in column_max_bytes.items(): + series: cudf.Series = df[col] + + assert series.dtype == 'object' + + if series.str.byte_count().max() > max_bytes: + return True + + return False + + +def truncate_string_cols_by_bytes(df: DataFrameType, + column_max_bytes: dict[str, int], + warn_on_truncate: bool = True) -> bool: + """ + Truncates all string columns in a dataframe to a maximum number of bytes. This operation is performed in-place on + the dataframe. + + Parameters + ---------- + df : DataFrameType + The dataframe to truncate. 
+ column_max_bytes: dict[str, int] + A mapping of string column names to the maximum number of bytes for each column. + warn_on_truncate: bool, default True + Whether to log a warning when truncating a column. + + Returns + ------- + bool + True if truncation was performed, False otherwise. + """ + + performed_truncation = False + is_cudf = isinstance(df, cudf.DataFrame) + + for (col, max_bytes) in column_max_bytes.items(): + series: SeriesType = df[col] + + if is_cudf: + series: pd.Series = series.to_pandas() + + assert series.dtype == 'object', f"Expected string column '{col}'" + + encoded_series = series.str.encode(encoding='utf-8', errors='strict') + if encoded_series.str.len().max() > max_bytes: + performed_truncation = True + if warn_on_truncate: + logger.warning("Truncating column '%s' to %d bytes", col, max_bytes) + + truncated_series = encoded_series.str.slice(0, max_bytes) + + # There is a possibility that slicing by max_len will slice a multi-byte character in half setting + # errors='ignore' will cause the resulting string to be truncated after the last full character + decoded_series = truncated_series.str.decode(encoding='utf-8', errors='ignore') + + if is_cudf: + df[col] = cudf.Series.from_pandas(decoded_series) + else: + df[col] = decoded_series + + return performed_truncation diff --git a/morpheus/service/vdb/milvus_vector_db_service.py b/morpheus/service/vdb/milvus_vector_db_service.py index 37cd82d1ba..09c68f15cd 100644 --- a/morpheus/service/vdb/milvus_vector_db_service.py +++ b/morpheus/service/vdb/milvus_vector_db_service.py @@ -20,18 +20,24 @@ import typing from functools import wraps -import pandas as pd - import cudf +from morpheus.io.utils import cudf_string_cols_exceed_max_bytes +from morpheus.io.utils import truncate_string_cols_by_bytes from morpheus.service.vdb.vector_db_service import VectorDBResourceService from morpheus.service.vdb.vector_db_service import VectorDBService +from morpheus.utils.type_aliases import DataFrameType logger = logging.getLogger(__name__) IMPORT_EXCEPTION = None IMPORT_ERROR_MESSAGE = "MilvusVectorDBResourceService requires the milvus and pymilvus packages to be installed." +# Milvus has a max string length in bytes of 65,535. Multi-byte characters like "ñ" will have a string length of 1, the +# byte length encoded as UTF-8 will be 2 +# https://milvus.io/docs/limitations.md#Length-of-a-string +MAX_STRING_LENGTH_BYTES = 65_535 + try: import pymilvus from pymilvus.orm.mutation import MutationResult @@ -222,9 +228,11 @@ class MilvusVectorDBResourceService(VectorDBResourceService): Name of the resource. client : MilvusClient An instance of the MilvusClient for interaction with the Milvus Vector Database. 
+ truncate_long_strings : bool, optional + When true, truncate strings values that are longer than the max length of the field """ - def __init__(self, name: str, client: "MilvusClient") -> None: + def __init__(self, name: str, client: "MilvusClient", truncate_long_strings: bool = False) -> None: if IMPORT_EXCEPTION is not None: raise ImportError(IMPORT_ERROR_MESSAGE) from IMPORT_EXCEPTION @@ -239,13 +247,24 @@ def __init__(self, name: str, client: "MilvusClient") -> None: self._vector_field = None self._fillna_fields_dict = {} + # Mapping of field name to max length for string fields + self._fields_max_length: dict[str, int] = {} + for field in self._fields: if field.dtype == pymilvus.DataType.FLOAT_VECTOR: self._vector_field = field.name else: + # Intentionally excluding pymilvus.DataType.STRING, in our current version it isn't supported, and in + # some database systems string types don't have a max length. + if field.dtype == pymilvus.DataType.VARCHAR: + max_length = field.params.get('max_length') + if max_length is not None: + self._fields_max_length[field.name] = max_length if not field.auto_id: self._fillna_fields_dict[field.name] = field.dtype + self._truncate_long_strings = truncate_long_strings + self._collection.load() def _set_up_collection(self): @@ -275,13 +294,13 @@ def insert(self, data: list[list] | list[dict], **kwargs: dict[str, typing.Any]) return self._insert_result_to_dict(result=result) - def insert_dataframe(self, df: typing.Union[cudf.DataFrame, pd.DataFrame], **kwargs: dict[str, typing.Any]) -> dict: + def insert_dataframe(self, df: DataFrameType, **kwargs: dict[str, typing.Any]) -> dict: """ Insert a dataframe entires into the vector database. Parameters ---------- - df : typing.Union[cudf.DataFrame, pd.DataFrame] + df : DataFrameType Dataframe to be inserted into the collection. **kwargs : dict[str, typing.Any] Extra keyword arguments specific to the vector database implementation. @@ -291,10 +310,6 @@ def insert_dataframe(self, df: typing.Union[cudf.DataFrame, pd.DataFrame], **kwa dict Returns response content as a dictionary. """ - - if isinstance(df, cudf.DataFrame): - df = df.to_pandas() - # Ensure that there are no None values in the DataFrame entries. 
for field_name, dtype in self._fillna_fields_dict.items(): if dtype in (pymilvus.DataType.VARCHAR, pymilvus.DataType.STRING): @@ -311,11 +326,24 @@ def insert_dataframe(self, df: typing.Union[cudf.DataFrame, pd.DataFrame], **kwa else: logger.info("Skipped checking 'None' in the field: %s, with datatype: %s", field_name, dtype) + needs_truncate = self._truncate_long_strings + if needs_truncate and isinstance(df, cudf.DataFrame): + # Cudf specific optimization, we can avoid a costly call to truncate_string_cols_by_bytes if all of the + # string columns are already below the max length + needs_truncate = cudf_string_cols_exceed_max_bytes(df, self._fields_max_length) + # From the schema, this is the list of columns we need, excluding any auto_id columns column_names = [field.name for field in self._fields if not field.auto_id] + collection_df = df[column_names] + if isinstance(collection_df, cudf.DataFrame): + collection_df = collection_df.to_pandas() + + if needs_truncate: + truncate_string_cols_by_bytes(collection_df, self._fields_max_length, warn_on_truncate=True) + # Note: dataframe columns has to be in the order of collection schema fields.s - result = self._collection.insert(data=df[column_names], **kwargs) + result = self._collection.insert(data=collection_df, **kwargs) self._collection.flush() return self._insert_result_to_dict(result=result) @@ -575,6 +603,8 @@ class MilvusVectorDBService(VectorDBService): The port number for connecting to the Milvus server. alias : str, optional Alias for the Milvus connection, by default "default". + truncate_long_strings : bool, optional + When true, truncate strings values that are longer than the max length of the field **kwargs : dict Additional keyword arguments specific to the Milvus connection configuration. """ @@ -589,13 +619,17 @@ def __init__(self, password: str = "", db_name: str = "", token: str = "", + truncate_long_strings: bool = False, **kwargs: dict[str, typing.Any]): + self._truncate_long_strings = truncate_long_strings self._client = MilvusClient(uri=uri, user=user, password=password, db_name=db_name, token=token, **kwargs) def load_resource(self, name: str, **kwargs: dict[str, typing.Any]) -> MilvusVectorDBResourceService: - - return MilvusVectorDBResourceService(name=name, client=self._client, **kwargs) + return MilvusVectorDBResourceService(name=name, + client=self._client, + truncate_long_strings=self._truncate_long_strings, + **kwargs) def has_store_object(self, name: str) -> bool: """ @@ -688,7 +722,7 @@ def create(self, name: str, overwrite: bool = False, **kwargs: dict[str, typing. 
for part in partition_conf["partitions"]: self._client.create_partition(collection_name=name, partition_name=part["name"], timeout=timeout) - def _build_schema_conf(self, df: typing.Union[cudf.DataFrame, pd.DataFrame]) -> list[dict]: + def _build_schema_conf(self, df: DataFrameType) -> list[dict]: fields = [] # Always add a primary key @@ -708,7 +742,7 @@ def _build_schema_conf(self, df: typing.Union[cudf.DataFrame, pd.DataFrame]) -> } if (field_dict["dtype"] == pymilvus.DataType.VARCHAR): - field_dict["max_length"] = 65_535 + field_dict["max_length"] = MAX_STRING_LENGTH_BYTES if (field_dict["dtype"] == pymilvus.DataType.FLOAT_VECTOR or field_dict["dtype"] == pymilvus.DataType.BINARY_VECTOR): @@ -726,7 +760,7 @@ def _build_schema_conf(self, df: typing.Union[cudf.DataFrame, pd.DataFrame]) -> def create_from_dataframe(self, name: str, - df: typing.Union[cudf.DataFrame, pd.DataFrame], + df: DataFrameType, overwrite: bool = False, **kwargs: dict[str, typing.Any]) -> None: """ @@ -736,7 +770,7 @@ def create_from_dataframe(self, ---------- name : str Name of the collection. - df : Union[cudf.DataFrame, pd.DataFrame] + df : DataFrameType The dataframe to create the collection from. overwrite : bool, optional Whether to overwrite the collection if it already exists. Default is False. @@ -797,10 +831,7 @@ def insert(self, name: str, data: list[list] | list[dict], **kwargs: dict[str, return resource.insert(data, **kwargs) @with_collection_lock - def insert_dataframe(self, - name: str, - df: typing.Union[cudf.DataFrame, pd.DataFrame], - **kwargs: dict[str, typing.Any]) -> dict[str, typing.Any]: + def insert_dataframe(self, name: str, df: DataFrameType, **kwargs: dict[str, typing.Any]) -> dict[str, typing.Any]: """ Converts dataframe to rows and insert to a collection in the Milvus vector database. @@ -808,7 +839,7 @@ def insert_dataframe(self, ---------- name : str Name of the collection to be inserted. - df : typing.Union[cudf.DataFrame, pd.DataFrame] + df : DataFrameType Dataframe to be inserted in the collection. **kwargs : dict[str, typing.Any] Additional keyword arguments containing collection configuration. 
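With the pieces above in place, `insert_dataframe` only pays for truncation when a string column actually overflows its schema limit. The following is a minimal sketch of how the two new helpers compose; the column name, data, and dataframe here are illustrative only (the 65,535-byte limit mirrors the Milvus `VARCHAR` cap), not taken from this patch:

```python
import cudf

from morpheus.io.utils import cudf_string_cols_exceed_max_bytes
from morpheus.io.utils import truncate_string_cols_by_bytes

# Hypothetical dataframe with one over-long string column.
df = cudf.DataFrame({"page_content": ["a" * 70_000, "short row"]})
limits = {"page_content": 65_535}  # Milvus VARCHAR max_length, in bytes

# Cheap GPU-side check first: Series.str.byte_count() lets us skip the
# costlier encode/slice pass when nothing actually exceeds its limit.
if cudf_string_cols_exceed_max_bytes(df, limits):
    pdf = df.to_pandas()  # insert_dataframe() also converts to pandas before truncating
    truncate_string_cols_by_bytes(pdf, limits, warn_on_truncate=True)
    df = cudf.DataFrame.from_pandas(pdf)
```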
diff --git a/morpheus/stages/inference/inference_stage.py b/morpheus/stages/inference/inference_stage.py index 8b1fa75d3a..ab12afe4d3 100644 --- a/morpheus/stages/inference/inference_stage.py +++ b/morpheus/stages/inference/inference_stage.py @@ -286,8 +286,12 @@ def set_output_fut(resp: TensorMemory, inner_batch, batch_future: mrc.Future): if (_df is not None and not _df.empty): _message_meta = CppMessageMeta(df=_df) _message.payload(_message_meta) - _message.tensors().set_tensor("probs", output_message.get_probs_tensor()) - print(_df) + + response_tensors = output_message.tensors + cm_tensors = _message.tensors() + for (name, tensor) in response_tensors.items(): + cm_tensors.set_tensor(name, tensor) + output_message = _message return output_message diff --git a/morpheus/utils/type_aliases.py b/morpheus/utils/type_aliases.py index f944c3f9cb..cd394664e6 100644 --- a/morpheus/utils/type_aliases.py +++ b/morpheus/utils/type_aliases.py @@ -20,3 +20,4 @@ import cudf DataFrameType = typing.Union[pd.DataFrame, cudf.DataFrame] +SeriesType = typing.Union[pd.Series, cudf.Series] diff --git a/tests/conftest.py b/tests/conftest.py index 1f8f0ef425..30cc8f869d 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1035,6 +1035,12 @@ def simple_collection_config_fixture(): yield load_json_file(filename="service/milvus_simple_collection_conf.json") +@pytest.fixture(scope="session", name="string_collection_config") +def string_collection_config_fixture(): + from _utils import load_json_file + yield load_json_file(filename="service/milvus_string_collection_conf.json") + + @pytest.fixture(name="nemollm", scope='session') def nemollm_fixture(fail_missing: bool): """ diff --git a/tests/io/test_io_utils.py b/tests/io/test_io_utils.py new file mode 100755 index 0000000000..1ad46b75cb --- /dev/null +++ b/tests/io/test_io_utils.py @@ -0,0 +1,134 @@ +#!/usr/bin/env python +# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from collections.abc import Callable + +import pytest + +import cudf + +from _utils.dataset_manager import DatasetManager +from morpheus.io import utils as io_utils +from morpheus.utils.type_aliases import DataFrameType + +MULTI_BYTE_STRINGS = ["ñäμɛ", "Moρφέας", "taç"] + + +def _mk_df(df_class: Callable[..., DataFrameType], data: dict[str, list[str]]) -> DataFrameType: + """ + Create a dataframe with a 'data' column containing the given data, and some other columns with different data types + """ + num_rows = len(data[list(data.keys())[0]]) + + float_col = [] + int_col = [] + short_str_col = [] + for i in range(num_rows): + float_col.append(i) + int_col.append(i) + short_str_col.append(f"{i}"[0:3]) + + df_data = data.copy() + df_data.update({"float_col": float_col, "int_col": int_col, "short_str_col": short_str_col}) + + return df_class(df_data) + + +@pytest.mark.parametrize( + "data, max_bytes, expected", + [({ + "data": MULTI_BYTE_STRINGS[:] + }, { + "data": 8 + }, True), ({ + "data": MULTI_BYTE_STRINGS[:], "ignored_col": ["a" * 20, "b" * 20, "c" * 20] + }, { + "data": 12 + }, False), ({ + "data": MULTI_BYTE_STRINGS[:] + }, { + "data": 20 + }, False), ({ + "data": ["." * 20] + }, { + "data": 19 + }, True), ({ + "data": ["." * 20] + }, { + "data": 20 + }, False), ({ + "data": ["." * 20] + }, { + "data": 21 + }, False)]) +def test_cudf_needs_truncate(data: list[str], max_bytes: int, expected: bool): + df = _mk_df(cudf.DataFrame, data) + assert io_utils.cudf_string_cols_exceed_max_bytes(df, max_bytes) is expected + + +@pytest.mark.parametrize("warn_on_truncate", [True, False]) +@pytest.mark.parametrize( + "data, max_bytes, expected_data", + [({ + "multibyte_strings": MULTI_BYTE_STRINGS[:], "ascii_strings": ["a" * 20, "b" * 21, "c" * 19] + }, { + "multibyte_strings": 4, "ascii_strings": 20 + }, { + "multibyte_strings": ["ñä", "Moρ", "taç"], "ascii_strings": ["a" * 20, "b" * 20, "c" * 19] + }), + ({ + "data": MULTI_BYTE_STRINGS[:], "ignored_col": ["a" * 20, "b" * 20, "c" * 20] + }, { + "data": 5 + }, { + "data": ["ñä", "Moρ", "taç"], "ignored_col": ["a" * 20, "b" * 20, "c" * 20] + }), ({ + "data": MULTI_BYTE_STRINGS[:] + }, { + "data": 8 + }, { + "data": ["ñäμɛ", "Moρφέ", "taç"] + }), ({ + "data": MULTI_BYTE_STRINGS[:] + }, { + "data": 9 + }, { + "data": ["ñäμɛ", "Moρφέ", "taç"] + }), ({ + "data": MULTI_BYTE_STRINGS[:] + }, { + "data": 12 + }, { + "data": MULTI_BYTE_STRINGS[:] + })]) +def test_truncate_string_cols_by_bytes(dataset: DatasetManager, + data: dict[str, list[str]], + max_bytes: int, + expected_data: dict[str, list[str]], + warn_on_truncate: bool): + df = _mk_df(dataset.df_class, data) + + expect_truncation = (data != expected_data) + expected_df_class = dataset.df_class + + expected_df = _mk_df(expected_df_class, expected_data) + + performed_truncation = io_utils.truncate_string_cols_by_bytes(df, max_bytes, warn_on_truncate=warn_on_truncate) + + assert performed_truncation is expect_truncation + assert isinstance(df, expected_df_class) + + dataset.assert_df_equal(df, expected_df) diff --git a/tests/test_milvus_vector_db_service.py b/tests/test_milvus_vector_db_service.py index 723e7e7f8e..3d0548176d 100644 --- a/tests/test_milvus_vector_db_service.py +++ b/tests/test_milvus_vector_db_service.py @@ -16,14 +16,18 @@ import json import random +import string import numpy as np import pymilvus import pytest from pymilvus import DataType +from pymilvus import MilvusException import cudf +from _utils.dataset_manager import DatasetManager +from 
morpheus.service.vdb.milvus_vector_db_service import MAX_STRING_LENGTH_BYTES from morpheus.service.vdb.milvus_vector_db_service import FieldSchemaEncoder from morpheus.service.vdb.milvus_vector_db_service import MilvusVectorDBService @@ -71,6 +75,45 @@ def sample_field_fixture(): return pymilvus.FieldSchema(name="test_field", dtype=pymilvus.DataType.INT64) +def _mk_long_string(source_chars: str) -> str: + """ + Yields a string longer than MAX_STRING_LENGTH_BYTES from source chars + """ + source_chars_byte_len = len(source_chars.encode("utf-8")) + source_data = list(source_chars) + + byte_len = 0 + long_str_data = [] + while byte_len <= MAX_STRING_LENGTH_BYTES: + long_str_data.extend(source_data) + byte_len += source_chars_byte_len + + return "".join(long_str_data) + + +@pytest.fixture(scope="module", name="long_ascii_string") +def long_ascii_string_fixture(): + """ + Yields a string longer than MAX_STRING_LENGTH_BYTES containing only ascii (single-byte) characters + """ + return _mk_long_string(string.ascii_letters) + + +@pytest.fixture(scope="module", name="long_multibyte_string") +def long_multibyte_string_fixture(): + """ + Yields a string longer than MAX_STRING_LENGTH_BYTES containing a mix of single and multi-byte characters + """ + return _mk_long_string("Moρφέας") + + +def _truncate_string_by_bytes(s: str, max_bytes: int) -> str: + """ + Truncates a string to the given number of bytes + """ + return s.encode("utf-8")[:max_bytes].decode("utf-8", errors="ignore") + + @pytest.mark.milvus def test_create_and_drop_collection(idx_part_collection_config: dict, milvus_service: MilvusVectorDBService): collection_name = "test_collection" @@ -467,3 +510,98 @@ def test_fse_from_dict(): result = FieldSchemaEncoder.from_dict(data) assert result.name == "test_field" assert result.dtype == pymilvus.DataType.INT64 + + +@pytest.mark.milvus +@pytest.mark.slow +@pytest.mark.parametrize("use_multi_byte_strings", [True, False], ids=["multi_byte", "ascii"]) +@pytest.mark.parametrize("truncate_long_strings", [True, False], ids=["truncate", "no_truncate"]) +@pytest.mark.parametrize("exceed_max_str_len", [True, False], ids=["exceed_max_len", "within_max_len"]) +def test_insert_dataframe(milvus_server_uri: str, + string_collection_config: dict, + dataset: DatasetManager, + use_multi_byte_strings: bool, + truncate_long_strings: bool, + exceed_max_str_len: bool, + long_ascii_string: str, + long_multibyte_string: str): + num_rows = 10 + collection_name = "test_insert_dataframe" + + milvus_service = MilvusVectorDBService(uri=milvus_server_uri, truncate_long_strings=truncate_long_strings) + + # Make sure to drop any existing collection from previous runs. + milvus_service.drop(collection_name) + + # Create a collection. + milvus_service.create(collection_name, **string_collection_config) + + short_str_col_len = -1 + long_str_col_len = -1 + for field_conf in string_collection_config["schema_conf"]["schema_fields"]: + if field_conf["name"] == "short_str_col": + short_str_col_len = field_conf["params"]["max_length"] + + elif field_conf["name"] == "long_str_col": + long_str_col_len = field_conf["params"]["max_length"] + + assert short_str_col_len > 0, "short_str_col length is not set" + assert long_str_col_len == MAX_STRING_LENGTH_BYTES, "long_str_col length is not set to MAX_STRING_LENGTH_BYTES" + + # Construct the dataframe. 
+ ids = [] + embedding_data = [] + long_str_col = [] + short_str_col = [] + + if use_multi_byte_strings: + long_str = long_multibyte_string + else: + long_str = long_ascii_string + + short_str = long_str[:7] + if not exceed_max_str_len: + short_str = _truncate_string_by_bytes(short_str, short_str_col_len) + long_str = _truncate_string_by_bytes(long_str, MAX_STRING_LENGTH_BYTES) + + for i in range(num_rows): + ids.append(i) + embedding_data.append([i / 10.0] * 3) + + long_str_col.append(long_str) + short_str_col.append(short_str) + + df = dataset.df_class({ + "id": ids, "embedding": embedding_data, "long_str_col": long_str_col, "short_str_col": short_str_col + }) + + expected_long_str = [] + for long_str in long_str_col: + if truncate_long_strings: + expected_long_str.append( + long_str.encode("utf-8")[:MAX_STRING_LENGTH_BYTES].decode("utf-8", errors="ignore")) + else: + expected_long_str.append(long_str) + + expected_df = dataset.df_class({ + "id": ids, "embedding": embedding_data, "long_str_col": expected_long_str, "short_str_col": short_str_col + }) + + if (exceed_max_str_len and (not truncate_long_strings)): + with pytest.raises(MilvusException, match="string exceeds max length"): + milvus_service.insert_dataframe(collection_name, df) + + return # Skip the rest of the test if the string column exceeds the maximum length. + + milvus_service.insert_dataframe(collection_name, df) + + # Retrieve inserted data by primary keys. + retrieved_data = milvus_service.retrieve_by_keys(collection_name, ids) + assert len(retrieved_data) == num_rows + + # Clean up the collection. + milvus_service.drop(collection_name) + + result_df = dataset.df_class(retrieved_data) + + dataset.compare_df(result_df, expected_df) diff --git a/tests/tests_data/service/milvus_string_collection_conf.json b/tests/tests_data/service/milvus_string_collection_conf.json new file mode 100644 index 0000000000..a75970a361 --- /dev/null +++ b/tests/tests_data/service/milvus_string_collection_conf.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:adbc34ae22c1037c8308b5521a01597a81d0ea117cc691e72566b463c0be6e9a +size 1083 From 9d3de8a5c60d6a9be3f8ff7a71c6a75fce6791f3 Mon Sep 17 00:00:00 2001 From: David Gardner <96306125+dagardner-nv@users.noreply.github.com> Date: Wed, 1 May 2024 15:09:21 -0700 Subject: [PATCH 18/38] Strip HTML & XML tags from RSS feed input (#1670) * Optionally strip HTML & XML tags embedded in RSS feeds Requires PR #1665 to be merged first Closes #1666 ## By Submitting this PR I confirm: - I am familiar with the [Contributing Guidelines](https://github.com/nv-morpheus/Morpheus/blob/main/docs/source/developer_guide/contributing.md). - When the PR is ready for review, new or existing tests cover these changes. - When the PR is ready for review, the documentation is up to date with these changes. 
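The stripping itself (shown in the `rss_controller.py` hunk below) parses each field with BeautifulSoup and keeps only the text. A minimal sketch of that approach, assuming `beautifulsoup4` is installed (the sample markup is illustrative):

```python
from bs4 import BeautifulSoup

# Feeds often embed HTML even in fields advertised as plain text; parsing the
# field and calling get_text() keeps the readable content and drops the tags.
raw = "<p>CISA released <a href='https://example.com'>an advisory</a> today.</p>"
print(BeautifulSoup(raw, features="html.parser").get_text())
# -> CISA released an advisory today.
```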
Authors: - David Gardner (https://github.com/dagardner-nv) - Michael Demoret (https://github.com/mdemoret-nv) Approvers: - Michael Demoret (https://github.com/mdemoret-nv) URL: https://github.com/nv-morpheus/Morpheus/pull/1670 --- .../all_cuda-121_arch-x86_64.yaml | 1 + .../dev_cuda-121_arch-x86_64.yaml | 1 + .../examples_cuda-121_arch-x86_64.yaml | 1 + .../runtime_cuda-121_arch-x86_64.yaml | 1 + dependencies.yaml | 1 + .../llm/vdb_upload/module/rss_source_pipe.py | 3 + examples/llm/vdb_upload/vdb_config.yaml | 1 + examples/llm/vdb_upload/vdb_utils.py | 1 + morpheus/controllers/rss_controller.py | 54 ++++++++++++++- morpheus/modules/input/rss_source.py | 33 +++++---- morpheus/modules/schemas/rss_source_schema.py | 1 + morpheus/stages/input/rss_source_stage.py | 8 ++- tests/controllers/test_rss_controller.py | 67 ++++++++++++++++--- 13 files changed, 142 insertions(+), 31 deletions(-) diff --git a/conda/environments/all_cuda-121_arch-x86_64.yaml b/conda/environments/all_cuda-121_arch-x86_64.yaml index b440991aa3..3b310995fb 100644 --- a/conda/environments/all_cuda-121_arch-x86_64.yaml +++ b/conda/environments/all_cuda-121_arch-x86_64.yaml @@ -13,6 +13,7 @@ dependencies: - appdirs - arxiv=1.4 - automake +- beautifulsoup4 - benchmark=1.8.3 - boost-cpp=1.84 - boto3 diff --git a/conda/environments/dev_cuda-121_arch-x86_64.yaml b/conda/environments/dev_cuda-121_arch-x86_64.yaml index 2ee99333a0..23ff2c707e 100644 --- a/conda/environments/dev_cuda-121_arch-x86_64.yaml +++ b/conda/environments/dev_cuda-121_arch-x86_64.yaml @@ -11,6 +11,7 @@ channels: dependencies: - appdirs - automake +- beautifulsoup4 - benchmark=1.8.3 - boost-cpp=1.84 - breathe=4.35.0 diff --git a/conda/environments/examples_cuda-121_arch-x86_64.yaml b/conda/environments/examples_cuda-121_arch-x86_64.yaml index 857b73aa85..11d5e535ce 100644 --- a/conda/environments/examples_cuda-121_arch-x86_64.yaml +++ b/conda/environments/examples_cuda-121_arch-x86_64.yaml @@ -12,6 +12,7 @@ dependencies: - anyio>=3.7 - appdirs - arxiv=1.4 +- beautifulsoup4 - boto3 - click >=8 - cuml=24.02.* diff --git a/conda/environments/runtime_cuda-121_arch-x86_64.yaml b/conda/environments/runtime_cuda-121_arch-x86_64.yaml index 3f9543d426..80f6f995d2 100644 --- a/conda/environments/runtime_cuda-121_arch-x86_64.yaml +++ b/conda/environments/runtime_cuda-121_arch-x86_64.yaml @@ -10,6 +10,7 @@ channels: - pytorch dependencies: - appdirs +- beautifulsoup4 - click >=8 - datacompy=0.10 - dill=0.3.7 diff --git a/dependencies.yaml b/dependencies.yaml index 616c1db3de..7f1f9145ef 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -249,6 +249,7 @@ dependencies: - &dill dill=0.3.7 - &scikit-learn scikit-learn=1.3.2 - appdirs + - beautifulsoup4 - datacompy=0.10 - elasticsearch==8.9.0 - feedparser=6.0.10 diff --git a/examples/llm/vdb_upload/module/rss_source_pipe.py b/examples/llm/vdb_upload/module/rss_source_pipe.py index ff61940b8c..55b309e032 100644 --- a/examples/llm/vdb_upload/module/rss_source_pipe.py +++ b/examples/llm/vdb_upload/module/rss_source_pipe.py @@ -49,6 +49,7 @@ class RSSSourcePipeSchema(BaseModel): request_timeout_sec: float = 2.0 run_indefinitely: bool = True stop_after_rec: int = 0 + strip_markup: bool = True vdb_resource_name: str web_scraper_config: Optional[Dict[Any, Any]] = None @@ -98,6 +99,7 @@ def _rss_source_pipe(builder: mrc.Builder): - **request_timeout_sec**: Timeout in seconds for RSS feed requests. - **run_indefinitely**: Boolean to indicate continuous running. 
     - **stop_after**: Number of records to process before stopping (0 for indefinite).
+    - **strip_markup**: When True, strip HTML & XML markup from feed content.
     - **web_scraper_config**: Configuration for the web scraper module.
         - **chunk_overlap**: Overlap size for chunks in web scraping.
         - **chunk_size**: Size of content chunks for processing.
@@ -131,6 +133,7 @@ def _rss_source_pipe(builder: mrc.Builder):
         "request_timeout_sec": validated_config.request_timeout_sec,
         "interval_sec": validated_config.interval_sec,
         "stop_after_rec": validated_config.stop_after_rec,
+        "strip_markup": validated_config.strip_markup,
     }
 
     rss_source_loader = RSSSourceLoaderFactory.get_instance("rss_source", {"rss_source": rss_source_config})
diff --git a/examples/llm/vdb_upload/vdb_config.yaml b/examples/llm/vdb_upload/vdb_config.yaml
index ac93a47615..5698cc2e83 100644
--- a/examples/llm/vdb_upload/vdb_config.yaml
+++ b/examples/llm/vdb_upload/vdb_config.yaml
@@ -76,6 +76,7 @@ vdb_pipeline:
       request_timeout_sec: 2.0
       run_indefinitely: true
       stop_after_rec: 0
+      strip_markup: true
       web_scraper_config:
         chunk_overlap: 51
         chunk_size: 512
diff --git a/examples/llm/vdb_upload/vdb_utils.py b/examples/llm/vdb_upload/vdb_utils.py
index 7740acbc7c..d9e39b2553 100644
--- a/examples/llm/vdb_upload/vdb_utils.py
+++ b/examples/llm/vdb_upload/vdb_utils.py
@@ -142,6 +142,7 @@ def _build_default_rss_source(enable_cache,
         "interval_sec": interval_secs,
         "request_timeout_sec": rss_request_timeout_sec,
         "run_indefinitely": run_indefinitely,
+        "strip_markup": True,
         "vdb_resource_name": vector_db_resource_name,
         "web_scraper_config": {
             "chunk_size": content_chunking_size,
diff --git a/morpheus/controllers/rss_controller.py b/morpheus/controllers/rss_controller.py
index 5b9c36f369..a1972c406f 100644
--- a/morpheus/controllers/rss_controller.py
+++ b/morpheus/controllers/rss_controller.py
@@ -70,8 +70,17 @@ class RSSController:
         Cooldown interval in seconds if there is a failure in fetching or parsing the feed.
     request_timeout : float, optional, default = 2.0
         Request timeout in secs to fetch the feed.
+    strip_markup : bool, optional, default = False
+        When true, strip HTML & XML markup from the content, summary and title fields.
     """
 
+    # Fields which may contain HTML or XML content
+    MARKUP_FIELDS = (
+        "content",
+        "summary",
+        "title",
+    )
+
     def __init__(self,
                  feed_input: str | list[str],
                  batch_size: int = 128,
@@ -79,7 +88,8 @@ def __init__(self,
                  enable_cache: bool = False,
                  cache_dir: str = "./.cache/http",
                  cooldown_interval: int = 600,
-                 request_timeout: float = 2.0):
+                 request_timeout: float = 2.0,
+                 strip_markup: bool = False):
 
         if IMPORT_EXCEPTION is not None:
             raise ImportError(IMPORT_ERROR_MESSAGE) from IMPORT_EXCEPTION
@@ -92,6 +102,7 @@ def __init__(self,
         self._previous_entries = set()  # Stores the IDs of previous entries to prevent the processing of duplicates.
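The `_strip_markup_from_fields` helper added a few hunks below has to cope with two shapes feedparser produces: `content` is a list of dicts each carrying `value` and `type`, while `summary` and `title` are bare strings whose mime type lives in a parallel `*_detail` dict. A hedged sketch of those shapes (the entry literal is fabricated for illustration; key names follow feedparser's documented layout):

```python
entry = {
    "title": "Advisory",
    "title_detail": {"type": "text/plain", "value": "Advisory"},
    "summary": "<p>looks plain, still contains HTML</p>",
    "summary_detail": {"type": "text/html", "value": "..."},
    "content": [{"type": "text/html", "value": "<div>body</div>"}],
}

# List shape: each item carries its own mime type.
for item in entry["content"]:
    print(item["type"], "->", item["value"])

# Scalar shape: the mime type comes from the companion *_detail dict.
print(entry["summary_detail"].get("type", "text/plain"))
```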
self._cooldown_interval = cooldown_interval self._request_timeout = request_timeout + self._strip_markup = strip_markup # Validate feed_input for f in self._feed_input: @@ -236,6 +247,44 @@ def _try_parse_feed(self, url: str) -> "feedparser.FeedParserDict": return feed + @staticmethod + def _strip_markup_from_field(field: str, mime_type: str) -> str: + if mime_type.endswith("xml"): + parser = "xml" + else: + parser = "html.parser" + + try: + soup = BeautifulSoup(field, features=parser) + return soup.get_text() + except Exception as ex: + logger.error("Failed to strip tags from field: %s: %s", field, ex) + return field + + def _strip_markup_from_fields(self, entry: "feedparser.FeedParserDict"): + """ + Strip HTML & XML tags from the content, summary and title fields. + + Per note in feedparser documentation even if a field is advertized as plain text, it may still contain HTML + https://feedparser.readthedocs.io/en/latest/html-sanitization.html + """ + for field in self.MARKUP_FIELDS: + field_value = entry.get(field) + if field_value is not None: + if isinstance(field_value, list): + for field_item in field_value: + mime_type = field_item.get("type", "text/plain") + field_item["value"] = self._strip_markup_from_field(field_item["value"], mime_type) + field_item["type"] = "text/plain" + else: + detail_field_name = f"{field}_detail" + detail_field: dict = entry.get(detail_field_name, {}) + mime_type = detail_field.get("type", "text/plain") + + entry[field] = self._strip_markup_from_field(field_value, mime_type) + detail_field["type"] = "text/plain" + entry[detail_field_name] = detail_field + def parse_feeds(self): """ Parse the RSS feed using the feedparser library. @@ -291,6 +340,9 @@ def fetch_dataframes(self): entry_id = entry.get('id') current_entries.add(entry_id) if entry_id not in self._previous_entries: + if self._strip_markup: + self._strip_markup_from_fields(entry) + entry_accumulator.append(entry) if self._batch_size > 0 and len(entry_accumulator) >= self._batch_size: diff --git a/morpheus/modules/input/rss_source.py b/morpheus/modules/input/rss_source.py index 9f5dd6c316..1454a67b05 100644 --- a/morpheus/modules/input/rss_source.py +++ b/morpheus/modules/input/rss_source.py @@ -32,30 +32,26 @@ @register_module("rss_source", "morpheus") def _rss_source(builder: mrc.Builder): """ - A module for applying simple DataFrame schema transform policies. - - This module reads the configuration to determine how to set data types for columns, select, or rename them in the - dataframe. + A module for loading RSS feed items into a DataFrame. Parameters ---------- builder : mrc.Builder The Morpheus pipeline builder object. - Notes - ------------- - The configuration should be passed to the module through the `module_config` attribute of the builder. It should - contain a dictionary where each key is a column name, and the value is another dictionary with keys 'dtype' for - data type, 'op_type' for operation type ('select' or 'rename'), and optionally 'from' for the original column - name (if the column is to be renamed). 
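A hedged usage sketch of the controller API extended above (the feed URL is illustrative; `fetch_dataframes` yields `cudf.DataFrame` batches as shown in this diff):

```python
from morpheus.controllers.rss_controller import RSSController

# strip_markup=True runs the new tag stripping over content, summary and
# title before entries are batched into dataframes.
controller = RSSController(feed_input=["https://example.com/feed.xml"], strip_markup=True)

for df in controller.fetch_dataframes():
    print(df["title"].head())
```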
-
     Example Configuration
     ---------------------
     {
-        "summary": {"dtype": "str", "op_type": "select"},
-        "title": {"dtype": "str", "op_type": "select"},
-        "content": {"from": "page_content", "dtype": "str", "op_type": "rename"},
-        "source": {"from": "link", "dtype": "str", "op_type": "rename"}
+        "batch_size": 32,
+        "cache_dir": "./.cache/http",
+        "cooldown_interval_sec": 600,
+        "enable_cache": True,
+        "feed_input": ["https://nvidianews.nvidia.com/releases.xml"],
+        "interval_sec": 600,
+        "request_timeout_sec": 2.0,
+        "run_indefinitely": True,
+        "stop_after_rec": 0,
+        "strip_markup": True,
     }
     """
 
@@ -77,7 +73,8 @@ def _rss_source(builder: mrc.Builder):
                                enable_cache=validated_config.enable_cache,
                                cache_dir=validated_config.cache_dir,
                                cooldown_interval=validated_config.cooldown_interval_sec,
-                               request_timeout=validated_config.request_timeout_sec)
+                               request_timeout=validated_config.request_timeout_sec,
+                               strip_markup=validated_config.strip_markup)
 
     stop_requested = False
@@ -108,9 +105,9 @@ def fetch_feeds() -> MessageMeta:
 
         except Exception as exc:
             if not controller.run_indefinitely:
-                logger.error("Failed either in the process of fetching or processing entries: %d.", exc)
+                logger.error("Failed either in the process of fetching or processing entries: %s.", exc)
                 raise
-            logger.error("Failed either in the process of fetching or processing entries: %d.", exc)
+            logger.error("Failed either in the process of fetching or processing entries: %s.", exc)
 
         if not controller.run_indefinitely:
             stop_requested = True
diff --git a/morpheus/modules/schemas/rss_source_schema.py b/morpheus/modules/schemas/rss_source_schema.py
index 53c0928391..38facfed0e 100644
--- a/morpheus/modules/schemas/rss_source_schema.py
+++ b/morpheus/modules/schemas/rss_source_schema.py
@@ -31,6 +31,7 @@ class RSSSourceSchema(BaseModel):
     request_timeout_sec: float = 2.0
     interval_sec: int = 600
     stop_after_rec: int = 0
+    strip_markup: bool = True
 
     class Config:
         extra = "forbid"
diff --git a/morpheus/stages/input/rss_source_stage.py b/morpheus/stages/input/rss_source_stage.py
index d56a443542..a67d7997cb 100644
--- a/morpheus/stages/input/rss_source_stage.py
+++ b/morpheus/stages/input/rss_source_stage.py
@@ -52,6 +52,8 @@ class RSSSourceStage(PreallocatorMixin, SingleOutputSource):
         Cooldown interval in seconds if there is a failure in fetching or parsing the feed.
     request_timeout : float, optional, default = 2.0
         Request timeout in secs to fetch the feed.
+    strip_markup : bool, optional, default = False
+        When true, strip HTML & XML markup from the content, summary and title fields.
     """
 
     def __init__(self,
@@ -64,7 +66,8 @@ def __init__(self,
                  enable_cache: bool = False,
                  cache_dir: str = "./.cache/http",
                  cooldown_interval: int = 600,
-                 request_timeout: float = 2.0):
+                 request_timeout: float = 2.0,
+                 strip_markup: bool = False):
         super().__init__(c)
         self._stop_requested = False
@@ -87,7 +90,8 @@ def __init__(self,
                 "enable_cache": enable_cache,
                 "cache_dir": cache_dir,
                 "cooldown_interval_sec": cooldown_interval,
-                "request_timeout_sec": request_timeout
+                "request_timeout_sec": request_timeout,
+                "strip_markup": strip_markup
             }
         }
diff --git a/tests/controllers/test_rss_controller.py b/tests/controllers/test_rss_controller.py
index dad981ad07..9cb42ca815 100644
--- a/tests/controllers/test_rss_controller.py
+++ b/tests/controllers/test_rss_controller.py
@@ -17,15 +17,18 @@
 from os import path
 from unittest.mock import Mock
 from unittest.mock import patch
+from xml.etree import ElementTree
 
 import feedparser
 import pytest
+from bs4 import BeautifulSoup
 
 import cudf
 
 from _utils import TEST_DIRS
 from morpheus.controllers.rss_controller import FeedStats
 from morpheus.controllers.rss_controller import RSSController
+from morpheus.utils.type_aliases import SeriesType
 
 test_urls = ["https://fake.nvidia.com/rss/HomePage.xml"]
@@ -66,6 +69,11 @@ def mock_get_response_fixture() -> Mock:
     return mock_response
 
 
+@pytest.fixture(scope="module", name="cisa_rss_feed")
+def cisa_rss_feed_fixture() -> list[str]:
+    return [path.join(TEST_DIRS.tests_data_dir, 'service/cisa_rss_feed.xml')]
+
+
 @pytest.mark.parametrize("feed_input, expected_output", [(url, True) for url in test_urls])
 def test_run_indefinitely_true(feed_input: str, expected_output: bool):
     controller = RSSController(feed_input=feed_input)
@@ -95,9 +103,11 @@ def test_parse_feed_invalid_input(feed_input: list[str]):
         RSSController(feed_input=feed_input)
 
 
+@pytest.mark.parametrize("strip_markup", [False, True])
 @pytest.mark.parametrize("feed_input, expected_count", [(test_file_paths[0], 30)])
-def test_skip_duplicates_feed_inputs(feed_input: str, expected_count: int):
-    controller = RSSController(feed_input=[feed_input, feed_input])  # Pass duplicate feed inputs
+def test_skip_duplicates_feed_inputs(feed_input: str, expected_count: int, strip_markup: bool):
+    controller = RSSController(feed_input=[feed_input, feed_input],
+                               strip_markup=strip_markup)  # Pass duplicate feed inputs
     dataframes_generator = controller.fetch_dataframes()
     dataframe = next(dataframes_generator, None)
     assert isinstance(dataframe, cudf.DataFrame)
@@ -130,9 +140,10 @@ def test_fetch_dataframes_url(feed_input: str | list[str],
     assert len(dataframe) > 0
 
 
+@pytest.mark.parametrize("strip_markup", [False, True])
 @pytest.mark.parametrize("feed_input", [test_file_paths, test_file_paths[0]])
-def test_fetch_dataframes_filepath(feed_input: str | list[str]):
-    controller = RSSController(feed_input=feed_input)
+def test_fetch_dataframes_filepath(feed_input: str | list[str], strip_markup: bool):
+    controller = RSSController(feed_input=feed_input, strip_markup=strip_markup)
     dataframes_generator = controller.fetch_dataframes()
     dataframe = next(dataframes_generator, None)
     assert isinstance(dataframe, cudf.DataFrame)
@@ -140,18 +151,23 @@ def test_fetch_dataframes_filepath(feed_input: str | list[str]):
     assert len(dataframe) > 0
 
 
+@pytest.mark.parametrize("strip_markup", [False, True])
 @pytest.mark.parametrize("feed_input, batch_size", [(test_file_paths, 5)])
-def test_batch_size(feed_input: list[str], batch_size: int):
-    controller = RSSController(feed_input=feed_input, batch_size=batch_size)
+def test_batch_size(feed_input: list[str], batch_size: int, strip_markup: bool):
+    controller = RSSController(feed_input=feed_input, batch_size=batch_size, strip_markup=strip_markup)
 
     for df in controller.fetch_dataframes():
         assert isinstance(df, cudf.DataFrame)
         assert len(df) <= batch_size
 
 
+@pytest.mark.parametrize("strip_markup", [False, True])
 @pytest.mark.parametrize("feed_input, enable_cache", [(test_file_paths[0], False), (test_urls[0], True),
                                                       (test_urls[0], False)])
-def test_try_parse_feed_with_beautiful_soup(feed_input: str, enable_cache: bool, mock_get_response: Mock):
-    controller = RSSController(feed_input=feed_input, enable_cache=enable_cache)
+def test_try_parse_feed_with_beautiful_soup(feed_input: str,
+                                            enable_cache: bool,
+                                            mock_get_response: Mock,
+                                            strip_markup: bool):
+    controller = RSSController(feed_input=feed_input, enable_cache=enable_cache, strip_markup=strip_markup)
 
     # When enable_cache is set to 'True', the feed content is provided as input.
     feed_data = controller._try_parse_feed_with_beautiful_soup(mock_get_response.text)
@@ -226,13 +242,44 @@ def test_parse_feeds(mock_feed: feedparser.FeedParserDict):
         controller.get_feed_stats("http://testfeed.com")
 
 
+@pytest.mark.parametrize("strip_markup", [False, True])
 @pytest.mark.parametrize("feed_input", [test_urls[0]])
-def test_redundant_fetch(feed_input: str, mock_feed: feedparser.FeedParserDict, mock_get_response: Mock):
+def test_redundant_fetch(feed_input: str,
+                         mock_feed: feedparser.FeedParserDict,
+                         mock_get_response: Mock,
+                         strip_markup: bool):
 
-    controller = RSSController(feed_input=feed_input)
+    controller = RSSController(feed_input=feed_input, strip_markup=strip_markup)
     mock_feedparser_parse = patch("morpheus.controllers.rss_controller.feedparser.parse")
     with mock_feedparser_parse, patch("requests.Session.get", return_value=mock_get_response) as mocked_session_get:
         mock_feedparser_parse.return_value = mock_feed
         dataframes_generator = controller.fetch_dataframes()
         next(dataframes_generator, None)
         assert mocked_session_get.call_count == 1
+
+
+@pytest.mark.parametrize("strip_markup", [False, True])
+def test_strip_markup(cisa_rss_feed: list[str], strip_markup: bool):
+    # Construct expected data
+    tree = ElementTree.parse(cisa_rss_feed[0])
+
+    # feedparser will map the description field to the summary field
+    description_tags = tree.findall('./channel/item/description')
+    expected_summary_col = [(tag.text or "").strip() for tag in description_tags]
+
+    if strip_markup:
+        expected_summary_col = [
+            BeautifulSoup(summary, features="html.parser").get_text() for summary in expected_summary_col
+        ]
+
+    controller = RSSController(feed_input=cisa_rss_feed, strip_markup=strip_markup)
+    dataframes = list(controller.fetch_dataframes())
+
+    # The number of dataframes and rows should be the same regardless of whether strip_markup is True or False
+    assert len(dataframes) == 1
+    dataframe = dataframes[0]
+    assert isinstance(dataframe, cudf.DataFrame)
+    assert len(dataframe) == 10
+
+    series: SeriesType = dataframe["summary"]
+    assert (series.to_pandas().values == expected_summary_col).all()

From 808c52ca1c0ec4a74695c68fff06d67c5fad7e83 Mon Sep 17 00:00:00 2001
From: David Gardner <96306125+dagardner-nv@users.noreply.github.com>
Date: Thu, 2 May 2024 14:15:51 -0700
Subject: [PATCH 19/38] Update examples to execute from the root of the repo
 (#1674)

* Update top-level examples and Triton start-up commands to execute from the root of the repo

* Where possible set default values for cli flags, removing the need to set them for the common use-case

* Where possible remove the need for defining `MORPHEUS_ROOT`

* Ensure C++ Triton pipelines use port 8000 to avoid the warning about the grpc port.

* Optionally cast types in the C++ impl of the Triton stage when `force_convert_inputs=true` and the input and model types didn't match (previously types were always cast)

* Remove `--num_threads=1` restriction and configure logging for the `log_parsing` example

* Remove `--num_threads=8` restriction from `nlp_si_detection` since the pipeline has more than 8 stages.

* Don't invoke the C++ impl of preallocate if the type being requested isn't supported on the C++ side (strings)

* Don't use the C++ impl of the Triton stage if `use_shared_memory` is requested as this isn't supported in C++.

* Add missing `gnn-fraud-classification` stage to CLI alternative for `gnn_fraud_detection_pipeline` example

Closes #1671

## By Submitting this PR I confirm:
- I am familiar with the [Contributing Guidelines](https://github.com/nv-morpheus/Morpheus/blob/main/docs/source/developer_guide/contributing.md).
- When the PR is ready for review, new or existing tests cover these changes.
- When the PR is ready for review, the documentation is up to date with these changes.

Authors:
  - David Gardner (https://github.com/dagardner-nv)

Approvers:
  - Eli Fajardo (https://github.com/efajardo-nv)
  - Michael Demoret (https://github.com/mdemoret-nv)

URL: https://github.com/nv-morpheus/Morpheus/pull/1674
---
 docs/source/examples.md                       |   2 +-
 examples/README.md                            |  22 +-
 examples/abp_pcap_detection/README.md         |  41 ++--
 examples/abp_pcap_detection/run.py            |   7 +-
 .../gnn_fraud_detection_pipeline/README.md    |  15 +-
 examples/gnn_fraud_detection_pipeline/run.py  |   8 +-
 examples/log_parsing/README.md                |  21 +-
 examples/log_parsing/run.py                   |   8 +-
 examples/nlp_si_detection/README.md           |   5 +-
 examples/nlp_si_detection/run.sh              |   2 +-
 examples/ransomware_detection/README.md       |  21 +-
 examples/ransomware_detection/run.py          |   4 +-
 examples/root_cause_analysis/README.md        |   5 +-
 morpheus/_lib/common/__init__.pyi             |   3 +
 morpheus/_lib/common/module.cpp               |   4 +
 .../_lib/include/morpheus/objects/dtype.hpp   | 104 +++++++++-
 .../stages/inference_client_stage.hpp         |   5 +
 .../morpheus/stages/triton_inference.hpp      |   8 +-
 morpheus/_lib/src/objects/dtype.cpp           |  17 ++
 .../src/stages/inference_client_stage.cpp     |  18 +-
 morpheus/_lib/src/stages/triton_inference.cpp |  35 +++-
 morpheus/_lib/stages/__init__.pyi             |   4 +-
 morpheus/_lib/stages/module.cpp               |   2 +
 morpheus/_lib/tests/objects/test_dtype.cpp    |  17 +-
 .../stages/test_triton_inference_stage.cpp    | 191 +++++++++++++++---
 morpheus/common/__init__.py                   |   2 +
 morpheus/pipeline/preallocator_mixin.py       |  10 +-
 .../inference/triton_inference_stage.py       |  11 +-
 28 files changed, 448 insertions(+), 144 deletions(-)

diff --git a/docs/source/examples.md b/docs/source/examples.md
index 5de469f588..bfe4f8e24c 100644
--- a/docs/source/examples.md
+++ b/docs/source/examples.md
@@ -24,7 +24,7 @@ limitations under the License.
 * [Example Ransomware Detection Morpheus Pipeline for AppShield Data](../../examples/ransomware_detection/README.md)
 * [Root Cause Analysis Acceleration & Predictive Maintenance Example](../../examples/root_cause_analysis/README.md)
 * [SID Visualization Example](../../examples/sid_visualization/README.md)
-* [Large Language Models (LLMs)](../../examples/llm/README.md)
+* Large Language Models (LLMs)
   * [Agents](../../examples/llm/agents/README.md)
   * [Completion](../../examples/llm/completion/README.md)
   * [VDB Upload](../../examples/llm/vdb_upload/README.md)
diff --git a/examples/README.md b/examples/README.md
index 1c001ffebe..4bdc94648f 100644
--- a/examples/README.md
+++ b/examples/README.md
@@ -15,10 +15,18 @@ See the License for the specific language governing permissions and
 limitations under the License.
 -->
 
-## Morpheus CLI Examples
-
-Examples run with the Morpheus CLI (`morpheus ...`) should be run from the repository root; otherwise, some filepath arguments may need to be changed.
-
-## Morpheus run.py Examples
-
-Examples run with python (`python run.py`) should be run from the example's directory; otherwise, relative Python imports may be broken.
+# Examples
+* [Anomalous Behavior Profiling with Forest Inference Library (FIL) Example](./abp_nvsmi_detection/README.md)
+* [ABP Detection Example Using Morpheus](./abp_pcap_detection/README.md)
+* [Digital Fingerprinting (DFP)](./digital_fingerprinting/README.md)
+* [GNN Fraud Detection Pipeline](./gnn_fraud_detection_pipeline/README.md)
+* [Example cyBERT Morpheus Pipeline for Apache Log Parsing](./log_parsing/README.md)
+* [Sensitive Information Detection with Natural Language Processing (NLP) Example](./nlp_si_detection/README.md)
+* [Example Ransomware Detection Morpheus Pipeline for AppShield Data](./ransomware_detection/README.md)
+* [Root Cause Analysis Acceleration & Predictive Maintenance Example](./root_cause_analysis/README.md)
+* [SID Visualization Example](./sid_visualization/README.md)
+* Large Language Models (LLMs)
+    * [Agents](./llm/agents/README.md)
+    * [Completion](./llm/completion/README.md)
+    * [VDB Upload](./llm/vdb_upload/README.md)
+    * [Retrieval Augmented Generation (RAG)](./llm/rag/README.md)
diff --git a/examples/abp_pcap_detection/README.md b/examples/abp_pcap_detection/README.md
index 371bd28e35..3cfae25aa9 100644
--- a/examples/abp_pcap_detection/README.md
+++ b/examples/abp_pcap_detection/README.md
@@ -27,14 +27,9 @@ docker pull nvcr.io/nvidia/tritonserver:23.06-py3
 ```
 
 ##### Deploy Triton Inference Server
-From the root of the Morpheus repo, navigate to the anomalous behavior profiling example directory:
+From the root of the Morpheus repo, run the following to launch Triton and load the `abp-pcap-xgb` model:
 ```bash
-cd examples/abp_pcap_detection
-```
-
-The following creates the Triton container, mounts the `abp-pcap-xgb` directory to `/models/abp-pcap-xgb` in the Triton container, and starts the Triton server:
-```bash
-docker run --rm --gpus=all -p 8000:8000 -p 8001:8001 -p 8002:8002 -v $PWD/abp-pcap-xgb:/models/abp-pcap-xgb --name tritonserver nvcr.io/nvidia/tritonserver:23.06-py3 tritonserver --model-repository=/models --exit-on-error=false
+docker run --rm --gpus=all -p 8000:8000 -p 8001:8001 -p 8002:8002 -v $PWD/examples/abp_pcap_detection/abp-pcap-xgb:/models/abp-pcap-xgb --name tritonserver nvcr.io/nvidia/tritonserver:23.06-py3 tritonserver --model-repository=/models --exit-on-error=false
 ```
 
 ##### Verify Model Deployment
@@ -53,8 +48,7 @@ Use Morpheus to run the Anomalous Behavior Profiling
Detection Pipeline with the From the root of the Morpheus repo, run: ```bash -cd examples/abp_pcap_detection -python run.py --help +python examples/abp_pcap_detection/run.py --help ``` Output: @@ -62,44 +56,41 @@ Output: Usage: run.py [OPTIONS] Options: - --num_threads INTEGER RANGE Number of internal pipeline threads to use + --num_threads INTEGER RANGE Number of internal pipeline threads to use. [x>=1] --pipeline_batch_size INTEGER RANGE Internal batch size for the pipeline. Can be much larger than the model batch size. Also - used for Kafka consumers [x>=1] + used for Kafka consumers. [x>=1] --model_max_batch_size INTEGER RANGE - Max batch size to use for the model [x>=1] - --input_file PATH Input filepath [required] + Max batch size to use for the model. [x>=1] + --input_file PATH Input filepath. [required] --output_file TEXT The path to the file where the inference output will be saved. --model_fea_length INTEGER RANGE - Features length to use for the model [x>=1] + Features length to use for the model. + [x>=1] --model_name TEXT The name of the model that is deployed on - Tritonserver + Tritonserver. --iterative Iterative mode will emit dataframes one at a time. Otherwise a list of dataframes is emitted. Iterative mode is good for interleaving source stages. - --server_url TEXT Tritonserver url [required] - --file_type [auto|json|csv] Indicates what type of file to read. + --server_url TEXT Tritonserver url. [required] + --file_type [auto|csv|json] Indicates what type of file to read. Specifying 'auto' will determine the file type from the extension. --help Show this message and exit. ``` -To launch the configured Morpheus pipeline with the sample data that is provided in `examples/data`, from the `examples/abp_pcap_detection` directory run the following: +To launch the configured Morpheus pipeline with the sample data that is provided in `examples/data`, run the following: ```bash -python run.py \ - --input_file ../data/abp_pcap_dump.jsonlines \ - --output_file ./pcap_out.jsonlines \ - --model_name 'abp-pcap-xgb' \ - --server_url localhost:8001 +python examples/abp_pcap_detection/run.py ``` Note: Both Morpheus and Triton Inference Server containers must have access to the same GPUs in order for this example to work. -The pipeline will process the input `pcap_dump.jsonlines` sample data and write it to `pcap_out.jsonlines`. +The pipeline will process the input `abp_pcap_dump.jsonlines` sample data and write it to `pcap_out.jsonlines`. ### CLI Example The above example is illustrative of using the Python API to build a custom Morpheus Pipeline. @@ -123,5 +114,3 @@ morpheus --log_level INFO --plugin "examples/abp_pcap_detection/abp_pcap_preproc to-file --filename "pcap_out.jsonlines" --overwrite \ monitor --description "Write to file rate" --unit "to-file" ``` - -Note: Triton is still needed to be launched from the `examples/abp_pcap_detection` directory. 
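The companion `run.py` diff that follows bakes these defaults into the click options so the common case needs no flags. A minimal sketch of the pattern, with data paths resolved relative to the script so the command works from the repo root (this script is illustrative, not part of the patch):

```python
import os

import click

CUR_DIR = os.path.dirname(__file__)
EX_DATA_DIR = os.path.join(CUR_DIR, "../data")


@click.command()
@click.option("--input_file",
              type=click.Path(exists=True, readable=True),
              default=os.path.join(EX_DATA_DIR, "abp_pcap_dump.jsonlines"),
              help="Input filepath.")
@click.option("--server_url", default="localhost:8001", help="Tritonserver url.")
def main(input_file: str, server_url: str):
    # With defaults in place, running the script with no flags covers the
    # common case from the repository root.
    click.echo(f"reading {input_file}, inferring via {server_url}")


if __name__ == "__main__":
    main()
```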
diff --git a/examples/abp_pcap_detection/run.py b/examples/abp_pcap_detection/run.py index 405f9bde6e..18d5c25e5d 100644 --- a/examples/abp_pcap_detection/run.py +++ b/examples/abp_pcap_detection/run.py @@ -33,6 +33,9 @@ from morpheus.stages.preprocess.deserialize_stage import DeserializeStage from morpheus.utils.logger import configure_logging +CUR_DIR = os.path.dirname(__file__) +EX_DATA_DIR = os.path.join(CUR_DIR, "../data") + @click.command() @click.option( @@ -57,7 +60,7 @@ @click.option( "--input_file", type=click.Path(exists=True, readable=True), - default="pcap.jsonlines", + default=os.path.join(EX_DATA_DIR, "abp_pcap_dump.jsonlines"), required=True, help="Input filepath.", ) @@ -84,7 +87,7 @@ help=("Iterative mode will emit dataframes one at a time. Otherwise a list of dataframes is emitted. " "Iterative mode is good for interleaving source stages."), ) -@click.option("--server_url", required=True, help="Tritonserver url.") +@click.option("--server_url", required=True, help="Tritonserver url.", default="localhost:8001") @click.option( "--file_type", type=click.Choice(FILE_TYPE_NAMES, case_sensitive=False), diff --git a/examples/gnn_fraud_detection_pipeline/README.md b/examples/gnn_fraud_detection_pipeline/README.md index c61f288499..8f05229710 100644 --- a/examples/gnn_fraud_detection_pipeline/README.md +++ b/examples/gnn_fraud_detection_pipeline/README.md @@ -28,17 +28,10 @@ mamba env update \ ``` ## Running - -##### Setup Env Variable -```bash -export MORPHEUS_ROOT=$(pwd) -``` - Use Morpheus to run the GNN fraud detection Pipeline with the transaction data. A pipeline has been configured in `run.py` with several command line options: ```bash -cd ${MORPHEUS_ROOT}/examples/gnn_fraud_detection_pipeline -python run.py --help +python examples/gnn_fraud_detection_pipeline/run.py --help ``` ``` Usage: run.py [OPTIONS] @@ -63,11 +56,10 @@ Options: --help Show this message and exit. 
``` -To launch the configured Morpheus pipeline with the sample data that is provided at `$MORPHEUS_ROOT/models/dataset`, run the following: +To launch the configured Morpheus pipeline, run the following: ```bash -cd ${MORPHEUS_ROOT}/examples/gnn_fraud_detection_pipeline -python run.py +python examples/gnn_fraud_detection_pipeline/run.py ``` ``` ====Registering Pipeline==== @@ -125,6 +117,7 @@ morpheus --log_level INFO \ monitor --description "Graph construction rate" \ gnn-fraud-sage --model_dir examples/gnn_fraud_detection_pipeline/model/ \ monitor --description "Inference rate" \ + gnn-fraud-classification --model_xgb_file examples/gnn_fraud_detection_pipeline/model/xgb.pt \ monitor --description "Add classification rate" \ serialize \ to-file --filename "output.csv" --overwrite diff --git a/examples/gnn_fraud_detection_pipeline/run.py b/examples/gnn_fraud_detection_pipeline/run.py index 58374a8c2b..ae91845b86 100644 --- a/examples/gnn_fraud_detection_pipeline/run.py +++ b/examples/gnn_fraud_detection_pipeline/run.py @@ -32,6 +32,8 @@ from stages.graph_construction_stage import FraudGraphConstructionStage from stages.graph_sage_stage import GraphSAGEStage +CUR_DIR = os.path.dirname(__file__) + @click.command() @click.option( @@ -62,21 +64,21 @@ @click.option( "--input_file", type=click.Path(exists=True, readable=True, dir_okay=False), - default="validation.csv", + default=os.path.join(CUR_DIR, "validation.csv"), required=True, help="Input data filepath.", ) @click.option( "--training_file", type=click.Path(exists=True, readable=True, dir_okay=False), - default="training.csv", + default=os.path.join(CUR_DIR, "training.csv"), required=True, help="Training data filepath.", ) @click.option( "--model_dir", type=click.Path(exists=True, readable=True, file_okay=False, dir_okay=True), - default="model", + default=os.path.join(CUR_DIR, "model"), required=True, help="Path to trained Hinsage & XGB models.", ) diff --git a/examples/log_parsing/README.md b/examples/log_parsing/README.md index ce9790be06..425e1c0b1c 100644 --- a/examples/log_parsing/README.md +++ b/examples/log_parsing/README.md @@ -29,11 +29,6 @@ Example: docker pull nvcr.io/nvidia/tritonserver:23.06-py3 ``` -##### Setup Env Variable -```bash -export MORPHEUS_ROOT=$(pwd) -``` - ##### Start Triton Inference Server Container From the Morpheus repo root directory, run the following to launch Triton and load the `log-parsing-onnx` model: @@ -56,19 +51,15 @@ Once Triton server finishes starting up, it will display the status of all loade ### Run Log Parsing Pipeline -Run the following from the `examples/log_parsing` directory to start the log parsing pipeline: +Run the following from the root of the Morpheus repo to start the log parsing pipeline: ```bash -python run.py \ - --num_threads 1 \ - --input_file ${MORPHEUS_ROOT}/models/datasets/validation-data/log-parsing-validation-data-input.csv \ - --output_file ./log-parsing-output.jsonlines \ +python examples/log_parsing/run.py \ + --input_file=./models/datasets/validation-data/log-parsing-validation-data-input.csv \ --model_vocab_hash_file=data/bert-base-cased-hash.txt \ - --model_vocab_file=${MORPHEUS_ROOT}/models/training-tuning-scripts/sid-models/resources/bert-base-cased-vocab.txt \ - --model_seq_length=256 \ + --model_vocab_file=./models/training-tuning-scripts/sid-models/resources/bert-base-cased-vocab.txt \ --model_name log-parsing-onnx \ - --model_config_file=${MORPHEUS_ROOT}/models/log-parsing-models/log-parsing-config-20220418.json \ - --server_url localhost:8001 + 
--model_config_file=./models/log-parsing-models/log-parsing-config-20220418.json ``` Use `--help` to display information about the command line options: @@ -110,7 +101,7 @@ PYTHONPATH="examples/log_parsing" \ morpheus --log_level INFO \ --plugin "inference" \ --plugin "postprocessing" \ - run --num_threads 1 --pipeline_batch_size 1024 --model_max_batch_size 32 \ + run --pipeline_batch_size 1024 --model_max_batch_size 32 \ pipeline-nlp \ from-file --filename ./models/datasets/validation-data/log-parsing-validation-data-input.csv \ deserialize \ diff --git a/examples/log_parsing/run.py b/examples/log_parsing/run.py index b0dfe76fd3..7fff20bd27 100644 --- a/examples/log_parsing/run.py +++ b/examples/log_parsing/run.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +import logging import os import click @@ -28,6 +29,7 @@ from morpheus.stages.output.write_to_file_stage import WriteToFileStage from morpheus.stages.preprocess.deserialize_stage import DeserializeStage from morpheus.stages.preprocess.preprocess_nlp_stage import PreprocessNLPStage +from morpheus.utils.logger import configure_logging @click.command() @@ -79,7 +81,7 @@ help="The name of the model that is deployed on Tritonserver.", ) @click.option("--model_config_file", required=True, help="Model config file.") -@click.option("--server_url", required=True, help="Tritonserver url.") +@click.option("--server_url", required=True, help="Tritonserver url.", default="localhost:8001") def run_pipeline( num_threads, pipeline_batch_size, @@ -93,6 +95,10 @@ def run_pipeline( model_config_file, server_url, ): + + # Enable the default logger. + configure_logging(log_level=logging.INFO) + config = Config() config.mode = PipelineModes.NLP config.num_threads = num_threads diff --git a/examples/nlp_si_detection/README.md b/examples/nlp_si_detection/README.md index 32cc2f23b5..33081caf00 100644 --- a/examples/nlp_si_detection/README.md +++ b/examples/nlp_si_detection/README.md @@ -103,11 +103,10 @@ The following command line is the entire command to build and launch the pipelin From the Morpheus repo root directory, run: ```bash -export MORPHEUS_ROOT=$(pwd) # Launch Morpheus printing debug messages morpheus --log_level=DEBUG \ - `# Run a pipeline with 8 threads and a model batch size of 32 (Must match Triton config)` \ - run --num_threads=8 --pipeline_batch_size=1024 --model_max_batch_size=32 \ + `# Run a pipeline with a model batch size of 32 (Must match Triton config)` \ + run --pipeline_batch_size=1024 --model_max_batch_size=32 \ `# Specify a NLP pipeline with 256 sequence length (Must match Triton config)` \ pipeline-nlp --model_seq_length=256 \ `# 1st Stage: Read from file` \ diff --git a/examples/nlp_si_detection/run.sh b/examples/nlp_si_detection/run.sh index f702784968..390418e545 100755 --- a/examples/nlp_si_detection/run.sh +++ b/examples/nlp_si_detection/run.sh @@ -19,7 +19,7 @@ SCRIPT_DIR=${SCRIPT_DIR:-"$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null export MORPHEUS_ROOT=${MORPHEUS_ROOT:-"$(realpath ${SCRIPT_DIR}/../..)"} morpheus --log_level=DEBUG \ - run --num_threads=8 --pipeline_batch_size=1024 --model_max_batch_size=32 \ + run --pipeline_batch_size=1024 --model_max_batch_size=32 \ pipeline-nlp --model_seq_length=256 \ from-file --filename=${MORPHEUS_ROOT}/examples/data/pcap_dump.jsonlines \ deserialize \ diff --git a/examples/ransomware_detection/README.md b/examples/ransomware_detection/README.md index 23f44e4ede..6c04feae46 100644 --- 
a/examples/ransomware_detection/README.md +++ b/examples/ransomware_detection/README.md @@ -35,15 +35,15 @@ export MORPHEUS_ROOT=$(pwd) ``` ##### Start Triton Inference Server Container -Run the following from the `examples/ransomware_detection` directory to launch Triton and load the `ransomw-model-short-rf` model: - +From the Morpheus repo root directory, run the following to launch Triton and load the `ransomw-model-short-rf` model: ```bash # Run Triton in explicit mode -docker run --rm -ti --gpus=all -p8000:8000 -p8001:8001 -p8002:8002 -v $PWD/models:/models/triton-model-repo nvcr.io/nvidia/tritonserver:23.06-py3 \ - tritonserver --model-repository=/models/triton-model-repo \ - --exit-on-error=false \ - --model-control-mode=explicit \ - --load-model ransomw-model-short-rf +docker run --rm -ti --gpus=all -p8000:8000 -p8001:8001 -p8002:8002 \ + -v $PWD/examples/ransomware_detection/models:/models/triton-model-repo nvcr.io/nvidia/tritonserver:23.06-py3 \ + tritonserver --model-repository=/models/triton-model-repo \ + --exit-on-error=false \ + --model-control-mode=explicit \ + --load-model ransomw-model-short-rf ``` ##### Verify Model Deployment @@ -67,14 +67,13 @@ mamba install 'dask>=2023.1.1' 'distributed>=2023.1.1' ``` ## Run Ransomware Detection Pipeline -Run the following from the `examples/ransomware_detection` directory to start the ransomware detection pipeline: +Run the following from the root of the Morpheus repo to start the ransomware detection pipeline: ```bash -python run.py --server_url=localhost:8001 \ +python examples/ransomware_detection/run.py --server_url=localhost:8001 \ --sliding_window=3 \ --model_name=ransomw-model-short-rf \ - --conf_file=./config/ransomware_detection.yaml \ - --input_glob=${MORPHEUS_ROOT}/examples/data/appshield/*/snapshot-*/*.json \ + --input_glob=./examples/data/appshield/*/snapshot-*/*.json \ --output_file=./ransomware_detection_output.jsonlines ``` diff --git a/examples/ransomware_detection/run.py b/examples/ransomware_detection/run.py index 58296bd2ae..5a80265996 100644 --- a/examples/ransomware_detection/run.py +++ b/examples/ransomware_detection/run.py @@ -33,6 +33,8 @@ from stages.create_features import CreateFeaturesRWStage from stages.preprocessing import PreprocessingRWStage +CUR_DIR = os.path.dirname(__file__) + @click.command() @click.option('--debug', default=False) @@ -64,7 +66,7 @@ @click.option( "--conf_file", type=click.STRING, - default="./config/ransomware_detection.yaml", + default=os.path.join(CUR_DIR, "config/ransomware_detection.yaml"), help="Ransomware detection configuration filepath.", ) @click.option( diff --git a/examples/root_cause_analysis/README.md b/examples/root_cause_analysis/README.md index 2efd63c0ed..b456c3ff72 100644 --- a/examples/root_cause_analysis/README.md +++ b/examples/root_cause_analysis/README.md @@ -98,9 +98,6 @@ From the Morpheus repo root directory, run: ```bash export MORPHEUS_ROOT=$(pwd) -``` - -```bash morpheus --log_level=DEBUG \ `# Run a pipeline with 5 threads and a model batch size of 32 (Must match Triton config)` \ run --num_threads=8 --edge_buffer_size=4 --use_cpp=True --pipeline_batch_size=1024 --model_max_batch_size=32 \ @@ -113,7 +110,7 @@ deserialize \ `# 3rd Stage: Preprocessing converts the input data into BERT tokens` \ preprocess --column=log --vocab_hash_file=./data/bert-base-uncased-hash.txt --truncation=True --do_lower_case=True --add_special_tokens=False \ `# 4th Stage: Send messages to Triton for inference. 
Specify the binary model loaded in Setup` \ -inf-triton --force_convert_inputs=True --model_name=root-cause-binary-onnx --server_url=localhost:8001 \ +inf-triton --model_name=root-cause-binary-onnx --server_url=localhost:8000 --force_convert_inputs=True \ `# 5th Stage: Monitor stage prints throughput information to the console` \ monitor --description='Inference rate' --smoothing=0.001 --unit inf \ `# 6th Stage: Add scores from inference to the messages` \ diff --git a/morpheus/_lib/common/__init__.pyi b/morpheus/_lib/common/__init__.pyi index a5ec2b692c..39e722bdec 100644 --- a/morpheus/_lib/common/__init__.pyi +++ b/morpheus/_lib/common/__init__.pyi @@ -18,6 +18,7 @@ __all__ = [ "TypeId", "determine_file_type", "read_file_to_df", + "typeid_is_fully_supported", "typeid_to_numpy_str", "write_df_to_file" ] @@ -199,6 +200,8 @@ def determine_file_type(filename: str) -> FileTypes: pass def read_file_to_df(filename: str, file_type: FileTypes = FileTypes.Auto) -> object: pass +def typeid_is_fully_supported(arg0: TypeId) -> bool: + pass def typeid_to_numpy_str(arg0: TypeId) -> str: pass def write_df_to_file(df: object, filename: str, file_type: FileTypes = FileTypes.Auto, **kwargs) -> None: diff --git a/morpheus/_lib/common/module.cpp b/morpheus/_lib/common/module.cpp index 0c2ae40914..0bda85b975 100644 --- a/morpheus/_lib/common/module.cpp +++ b/morpheus/_lib/common/module.cpp @@ -129,6 +129,10 @@ PYBIND11_MODULE(common, _module) return DType(tid).type_str(); }); + _module.def("typeid_is_fully_supported", [](TypeId tid) { + return DType(tid).is_fully_supported(); + }); + _module.def( "determine_file_type", py::overload_cast(&determine_file_type), py::arg("filename")); _module.def("determine_file_type", diff --git a/morpheus/_lib/include/morpheus/objects/dtype.hpp b/morpheus/_lib/include/morpheus/objects/dtype.hpp index 63dbd1594a..2297460b52 100644 --- a/morpheus/_lib/include/morpheus/objects/dtype.hpp +++ b/morpheus/_lib/include/morpheus/objects/dtype.hpp @@ -33,6 +33,13 @@ namespace morpheus { */ // Pulled from cuDF + +/** + * @brief Template function to calculate the size in bits of a given type. + * + * @tparam T The type to calculate the size for. + * @return The size in bits of the given type. + */ template constexpr std::size_t size_in_bits() { @@ -40,8 +47,11 @@ constexpr std::size_t size_in_bits() return sizeof(T) * CHAR_BIT; } -// Pulled from cudf #pragma GCC visibility push(default) + +/** + * @brief Enum class for representing data types used in Tensors and DataFrame columns. + */ enum class TypeId : int32_t { EMPTY, ///< Always null with no underlying data @@ -78,40 +88,112 @@ enum class TypeId : int32_t NUM_TYPE_IDS ///< Total number of type ids }; -/****** DType****************************************/ +/** + * @class DType + * @brief This class represents a data type specified by a TypeId. + */ struct DType { + /** + * @brief Construct a DType for a given type specified by a TypeId. + * + * @param tid The TypeId to initialize the DType object with. + */ DType(TypeId tid); + + /** + * @brief Copy constructor. + * + * @param dtype The DType object to copy from. + */ DType(const DType& dtype) = default; + + /** + * @brief Equality operator. + * + * @param other The DType object to compare with. + * @return True if the two DType objects represent the same TypeId, false otherwise. + */ bool operator==(const DType& other) const; + /** + * @brief Get the TypeId of the DType object. + * + * @return The TypeId of the DType object. 
+     */
     TypeId type_id() const;
 
-    // Number of bytes per item
+    /**
+     * @brief Get the number of bytes per item.
+     *
+     * @return The number of bytes per item.
+     */
     size_t item_size() const;
 
-    // Pretty print
+    /**
+     * @brief Get the name of the DType object.
+     *
+     * @return The name of the DType object.
+     */
     std::string name() const;
 
-    // Returns the numpy string representation
+    /**
+     * @brief Get the numpy string representation of the DType object.
+     *
+     * @return The numpy string representation of the DType object.
+     */
     std::string type_str() const;
 
-    // Cudf representation
+    /**
+     * @brief Get the cudf type id of the DType object.
+     *
+     * @return The cudf type id of the DType object.
+     */
     cudf::type_id cudf_type_id() const;
 
-    // Returns the triton string representation
+    /**
+     * @brief Get the triton string representation of the DType object.
+     *
+     * @return The triton string representation of the DType object.
+     */
     std::string triton_str() const;
 
-    // From cudf
+    /**
+     * @brief Create a DType object from a cudf type id.
+     *
+     * @param tid The cudf type id.
+     * @return A DType object.
+     */
     static DType from_cudf(cudf::type_id tid);
 
-    // From numpy
+    /**
+     * @brief Create a DType object from a numpy type string.
+     *
+     * @param numpy_str The numpy type string.
+     * @return A DType object.
+     */
     static DType from_numpy(const std::string& numpy_str);
 
-    // From triton
+    /**
+     * @brief Create a DType object from a triton type string.
+     *
+     * @param type_str The triton type string.
+     * @return A DType object.
+     */
     static DType from_triton(const std::string& type_str);
 
-    // from template
+    /**
+     * @brief Check if the DType object is fully supported.
+     *
+     * @return True if the DType object is fully supported, false otherwise.
+     */
+    bool is_fully_supported() const;
+
+    /**
+     * @brief Construct a DType object from a C++ type.
+     *
+     * @return A DType object.
+     */
     template <typename T>
     static DType create()
     {
diff --git a/morpheus/_lib/include/morpheus/stages/inference_client_stage.hpp b/morpheus/_lib/include/morpheus/stages/inference_client_stage.hpp
index fd115de5af..24f6934fdd 100644
--- a/morpheus/_lib/include/morpheus/stages/inference_client_stage.hpp
+++ b/morpheus/_lib/include/morpheus/stages/inference_client_stage.hpp
@@ -111,6 +111,7 @@ class MORPHEUS_EXPORT InferenceClientStage
      * @param model_name : Name of the model specifies which model can handle the inference requests that are sent to
      * Triton inference
      * @param needs_logits : Determines if logits are required.
+     * @param force_convert_inputs : Determines if inputs should be converted to the model's input format.
      * @param inout_mapping : Dictionary used to map pipeline input/output names to Triton input/output names. Use this
      * if the Morpheus names do not match the model.
* @return std::shared_ptr>> @@ -164,6 +166,7 @@ struct MORPHEUS_EXPORT InferenceClientStageInterfaceProxy std::string model_name, std::string server_url, bool needs_logits, + bool force_convert_inputs, std::map input_mapping, std::map output_mapping); @@ -176,6 +179,7 @@ struct MORPHEUS_EXPORT InferenceClientStageInterfaceProxy * Triton inference * @param server_url : Triton server URL. * @param needs_logits : Determines if logits are required. + * @param force_convert_inputs : Determines if inputs should be converted to the model's input format. * @param inout_mapping : Dictionary used to map pipeline input/output names to Triton input/output names. Use this * if the Morpheus names do not match the model. * @return std::shared_ptr>> @@ -186,6 +190,7 @@ struct MORPHEUS_EXPORT InferenceClientStageInterfaceProxy std::string model_name, std::string server_url, bool needs_logits, + bool force_convert_inputs, std::map input_mapping, std::map output_mapping); }; diff --git a/morpheus/_lib/include/morpheus/stages/triton_inference.hpp b/morpheus/_lib/include/morpheus/stages/triton_inference.hpp index 923a75e2b7..1cc8af06af 100644 --- a/morpheus/_lib/include/morpheus/stages/triton_inference.hpp +++ b/morpheus/_lib/include/morpheus/stages/triton_inference.hpp @@ -153,9 +153,12 @@ class MORPHEUS_EXPORT TritonInferenceClientSession : public IInferenceClientSess std::vector m_model_inputs; std::vector m_model_outputs; std::shared_ptr m_client; + bool m_force_convert_inputs; public: - TritonInferenceClientSession(std::shared_ptr client, std::string model_name); + TritonInferenceClientSession(std::shared_ptr client, + std::string model_name, + bool force_convert_inputs); /** @brief Gets the inference input mappings for Triton @@ -178,9 +181,10 @@ class MORPHEUS_EXPORT TritonInferenceClient : public IInferenceClient private: std::shared_ptr m_client; std::string m_model_name; + bool m_force_convert_inputs; public: - TritonInferenceClient(std::unique_ptr&& client, std::string model_name); + TritonInferenceClient(std::unique_ptr&& client, std::string model_name, bool force_convert_inputs); /** @brief Creates a TritonInferenceClientSession diff --git a/morpheus/_lib/src/objects/dtype.cpp b/morpheus/_lib/src/objects/dtype.cpp index 870cdb8059..3f167b1e01 100644 --- a/morpheus/_lib/src/objects/dtype.cpp +++ b/morpheus/_lib/src/objects/dtype.cpp @@ -357,4 +357,21 @@ char DType::type_char() const } } +bool DType::is_fully_supported() const +{ + try + { + byte_order_char(); + cudf_type_id(); + item_size(); + triton_str(); + type_char(); + } catch (...) 
+ { + return false; + } + + return true; +} + } // namespace morpheus diff --git a/morpheus/_lib/src/stages/inference_client_stage.cpp b/morpheus/_lib/src/stages/inference_client_stage.cpp index 26428aa159..d53364a650 100644 --- a/morpheus/_lib/src/stages/inference_client_stage.cpp +++ b/morpheus/_lib/src/stages/inference_client_stage.cpp @@ -333,6 +333,10 @@ mrc::coroutines::AsyncGenerator> InferenceClientStage input_mappings, std::map output_mappings) { @@ -393,8 +398,9 @@ InferenceClientStageInterfaceProxy::init_mm(mrc::segment::Builder& builder, output_mappings_.emplace_back(TensorModelMapping{mapping.first, mapping.second}); } - auto triton_client = std::make_unique(server_url); - auto triton_inference_client = std::make_unique(std::move(triton_client), model_name); + auto triton_client = std::make_unique(server_url); + auto triton_inference_client = + std::make_unique(std::move(triton_client), model_name, force_convert_inputs); auto stage = builder.construct_object>( name, std::move(triton_inference_client), model_name, needs_logits, input_mappings_, output_mappings_); @@ -408,6 +414,7 @@ InferenceClientStageInterfaceProxy::init_cm(mrc::segment::Builder& builder, std::string server_url, std::string model_name, bool needs_logits, + bool force_convert_inputs, std::map input_mappings, std::map output_mappings) { @@ -424,9 +431,10 @@ InferenceClientStageInterfaceProxy::init_cm(mrc::segment::Builder& builder, output_mappings_.emplace_back(TensorModelMapping{mapping.first, mapping.second}); } - auto triton_client = std::make_unique(server_url); - auto triton_inference_client = std::make_unique(std::move(triton_client), model_name); - auto stage = builder.construct_object>( + auto triton_client = std::make_unique(server_url); + auto triton_inference_client = + std::make_unique(std::move(triton_client), model_name, force_convert_inputs); + auto stage = builder.construct_object>( name, std::move(triton_inference_client), model_name, needs_logits, input_mappings_, output_mappings_); return stage; diff --git a/morpheus/_lib/src/stages/triton_inference.cpp b/morpheus/_lib/src/stages/triton_inference.cpp index 30f100e7ea..a78beb5d11 100644 --- a/morpheus/_lib/src/stages/triton_inference.cpp +++ b/morpheus/_lib/src/stages/triton_inference.cpp @@ -258,9 +258,11 @@ triton::client::Error HttpTritonClient::async_infer(triton::client::InferenceSer } TritonInferenceClientSession::TritonInferenceClientSession(std::shared_ptr client, - std::string model_name) : + std::string model_name, + bool force_convert_inputs) : m_client(std::move(client)), - m_model_name(std::move(model_name)) + m_model_name(std::move(model_name)), + m_force_convert_inputs(force_convert_inputs) { // Now load the input/outputs for the model @@ -433,8 +435,24 @@ mrc::coroutines::Task TritonInferenceClientSession::infer(TensorMap&& for (auto model_input : m_model_inputs) { - auto inference_input_slice = - inputs[model_input.name].slice({start, 0}, {stop, -1}).as_type(model_input.datatype); + auto inference_input_slice = inputs.at(model_input.name).slice({start, 0}, {stop, -1}); + + if (inference_input_slice.dtype() != model_input.datatype) + { + if (m_force_convert_inputs) + { + inference_input_slice.swap(inference_input_slice.as_type(model_input.datatype)); + } + else + { + std::string err_msg = MORPHEUS_CONCAT_STR( + "Unexpected dtype for Triton input. Cannot automatically convert dtype due to loss of data." 
+ "Input Name: '" + << model_input.name << ", Expected: " << model_input.datatype.name() + << ", Actual dtype:" << inference_input_slice.dtype().name()); + throw std::invalid_argument(err_msg); + } + } inference_inputs.emplace_back( TritonInferInput{model_input.name, @@ -491,14 +509,17 @@ mrc::coroutines::Task TritonInferenceClientSession::infer(TensorMap&& co_return model_output_tensors; }; -TritonInferenceClient::TritonInferenceClient(std::unique_ptr&& client, std::string model_name) : +TritonInferenceClient::TritonInferenceClient(std::unique_ptr&& client, + std::string model_name, + bool force_convert_inputs) : m_client(std::move(client)), - m_model_name(std::move(model_name)) + m_model_name(std::move(model_name)), + m_force_convert_inputs(force_convert_inputs) {} std::unique_ptr TritonInferenceClient::create_session() { - return std::make_unique(m_client, m_model_name); + return std::make_unique(m_client, m_model_name, m_force_convert_inputs); } } // namespace morpheus diff --git a/morpheus/_lib/stages/__init__.pyi b/morpheus/_lib/stages/__init__.pyi index 85767bdcef..78a0ff8091 100644 --- a/morpheus/_lib/stages/__init__.pyi +++ b/morpheus/_lib/stages/__init__.pyi @@ -71,10 +71,10 @@ class HttpServerSourceStage(mrc.core.segment.SegmentObject): def __init__(self, builder: mrc.core.segment.Builder, name: str, bind_address: str = '127.0.0.1', port: int = 8080, endpoint: str = '/message', method: str = 'POST', accept_status: int = 201, sleep_time: float = 0.10000000149011612, queue_timeout: int = 5, max_queue_size: int = 1024, num_server_threads: int = 1, max_payload_size: int = 10485760, request_timeout: int = 30, lines: bool = False, stop_after: int = 0) -> None: ... pass class InferenceClientStageCM(mrc.core.segment.SegmentObject): - def __init__(self, builder: mrc.core.segment.Builder, name: str, server_url: str, model_name: str, needs_logits: bool, input_mapping: typing.Dict[str, str] = {}, output_mapping: typing.Dict[str, str] = {}) -> None: ... + def __init__(self, builder: mrc.core.segment.Builder, name: str, server_url: str, model_name: str, needs_logits: bool, force_convert_inputs: bool, input_mapping: typing.Dict[str, str] = {}, output_mapping: typing.Dict[str, str] = {}) -> None: ... pass class InferenceClientStageMM(mrc.core.segment.SegmentObject): - def __init__(self, builder: mrc.core.segment.Builder, name: str, server_url: str, model_name: str, needs_logits: bool, input_mapping: typing.Dict[str, str] = {}, output_mapping: typing.Dict[str, str] = {}) -> None: ... + def __init__(self, builder: mrc.core.segment.Builder, name: str, server_url: str, model_name: str, needs_logits: bool, force_convert_inputs: bool, input_mapping: typing.Dict[str, str] = {}, output_mapping: typing.Dict[str, str] = {}) -> None: ... 
pass class KafkaSourceStage(mrc.core.segment.SegmentObject): @typing.overload diff --git a/morpheus/_lib/stages/module.cpp b/morpheus/_lib/stages/module.cpp index 6cdba387f0..1cf57663ac 100644 --- a/morpheus/_lib/stages/module.cpp +++ b/morpheus/_lib/stages/module.cpp @@ -177,6 +177,7 @@ PYBIND11_MODULE(stages, _module) py::arg("server_url"), py::arg("model_name"), py::arg("needs_logits"), + py::arg("force_convert_inputs"), py::arg("input_mapping") = py::dict(), py::arg("output_mapping") = py::dict()); @@ -190,6 +191,7 @@ PYBIND11_MODULE(stages, _module) py::arg("server_url"), py::arg("model_name"), py::arg("needs_logits"), + py::arg("force_convert_inputs"), py::arg("input_mapping") = py::dict(), py::arg("output_mapping") = py::dict()); diff --git a/morpheus/_lib/tests/objects/test_dtype.cpp b/morpheus/_lib/tests/objects/test_dtype.cpp index 230d68dcd6..1f1a70bb51 100644 --- a/morpheus/_lib/tests/objects/test_dtype.cpp +++ b/morpheus/_lib/tests/objects/test_dtype.cpp @@ -22,6 +22,8 @@ #include #include +#include // for int32_t +#include // for set #include using namespace morpheus; @@ -283,4 +285,17 @@ TEST_F(TestDType, FromCudfNotSupported) EXPECT_THROW(DType::from_cudf(cudf::type_id::DECIMAL128), std::invalid_argument); EXPECT_THROW(DType::from_cudf(cudf::type_id::STRUCT), std::invalid_argument); EXPECT_THROW(DType::from_cudf(cudf::type_id::NUM_TYPE_IDS), std::invalid_argument); -} \ No newline at end of file +} + +TEST_F(TestDType, IsFullySupported) +{ + std::set unsupported_types = {TypeId::EMPTY, TypeId::STRING, TypeId::NUM_TYPE_IDS}; + for (auto type_id = static_cast(TypeId::EMPTY); type_id <= static_cast(TypeId::NUM_TYPE_IDS); + ++type_id) + { + auto enum_type_id = static_cast(type_id); + auto dtype = DType(enum_type_id); + + ASSERT_EQ(dtype.is_fully_supported(), !unsupported_types.contains(enum_type_id)); + } +} diff --git a/morpheus/_lib/tests/stages/test_triton_inference_stage.cpp b/morpheus/_lib/tests/stages/test_triton_inference_stage.cpp index df7785d259..170655e8c9 100644 --- a/morpheus/_lib/tests/stages/test_triton_inference_stage.cpp +++ b/morpheus/_lib/tests/stages/test_triton_inference_stage.cpp @@ -22,12 +22,14 @@ #include "morpheus/messages/multi_inference.hpp" #include "morpheus/messages/multi_response.hpp" #include "morpheus/objects/dtype.hpp" +#include "morpheus/objects/memory_descriptor.hpp" // for MemoryDescriptor #include "morpheus/objects/tensor.hpp" #include "morpheus/objects/tensor_object.hpp" #include "morpheus/stages/inference_client_stage.hpp" #include "morpheus/stages/triton_inference.hpp" #include "morpheus/types.hpp" #include "morpheus/utilities/cudf_util.hpp" +#include "morpheus/utilities/matx_util.hpp" #include #include @@ -37,6 +39,7 @@ #include #include #include +#include #include #include #include @@ -44,15 +47,19 @@ #include #include #include +#include // for get_current_device_resource #include #include +#include // for initializer_list #include #include #include +#include // for operator<<, basic_ostream #include #include #include +#include class FakeInferResult : public triton::client::InferResult { @@ -117,6 +124,68 @@ class FakeInferResult : public triton::client::InferResult }; class FakeTritonClient : public morpheus::ITritonClient +{ + public: + triton::client::Error is_server_live(bool* live) override + { + *live = true; + return triton::client::Error::Success; + } + + triton::client::Error is_server_ready(bool* ready) override + { + *ready = true; + return triton::client::Error::Success; + } + + triton::client::Error is_model_ready(bool* 
ready, std::string& model_name) override + { + *ready = true; + return triton::client::Error::Success; + } + + triton::client::Error model_config(std::string* model_config, std::string& model_name) override + { + *model_config = R"({ + "max_batch_size": 100 + })"; + + return triton::client::Error::Success; + } + + triton::client::Error model_metadata(std::string* model_metadata, std::string& model_name) override + { + *model_metadata = R"({ + "inputs":[ + { + "name":"seq_ids", + "shape": [0, 1], + "datatype":"INT32" + } + ], + "outputs":[ + { + "name":"seq_ids", + "shape": [0, 1], + "datatype":"INT32" + } + ]})"; + + return triton::client::Error::Success; + } + + triton::client::Error async_infer(triton::client::InferenceServerHttpClient::OnCompleteFn callback, + const triton::client::InferOptions& options, + const std::vector& inputs, + const std::vector& outputs) override + { + callback(new FakeInferResult({{"seq_ids", std::vector({0, 1, 2, 3, 4, 5, 6, 7, 8, 9})}})); + + return triton::client::Error::Success; + } +}; + +class ErrorProneTritonClient : public FakeTritonClient { private: bool m_is_server_live_has_errored = false; @@ -145,7 +214,7 @@ class FakeTritonClient : public morpheus::ITritonClient m_is_server_live = true; } - return triton::client::Error::Success; + return FakeTritonClient::is_server_live(live); } triton::client::Error is_server_ready(bool* ready) override @@ -192,11 +261,7 @@ class FakeTritonClient : public morpheus::ITritonClient return triton::client::Error("model_config error"); } - *model_config = R"({ - "max_batch_size": 100 - })"; - - return triton::client::Error::Success; + return FakeTritonClient::model_config(model_config, model_name); } triton::client::Error model_metadata(std::string* model_metadata, std::string& model_name) override @@ -207,23 +272,7 @@ class FakeTritonClient : public morpheus::ITritonClient return triton::client::Error("model_metadata error"); } - *model_metadata = R"({ - "inputs":[ - { - "name":"seq_ids", - "shape": [0, 1], - "datatype":"INT32" - } - ], - "outputs":[ - { - "name":"seq_ids", - "shape": [0, 1], - "datatype":"INT32" - } - ]})"; - - return triton::client::Error::Success; + return FakeTritonClient::model_metadata(model_metadata, model_name); } triton::client::Error async_infer(triton::client::InferenceServerHttpClient::OnCompleteFn callback, @@ -237,9 +286,7 @@ class FakeTritonClient : public morpheus::ITritonClient return triton::client::Error("async_infer error"); } - callback(new FakeInferResult({{"seq_ids", std::vector({0, 1, 2, 3, 4, 5, 6, 7, 8, 9})}})); - - return triton::client::Error::Success; + return FakeTritonClient::async_infer(callback, options, inputs, outputs); } }; @@ -307,8 +354,9 @@ TEST_F(TestTritonInferenceStage, SingleRow) auto message = std::make_shared(meta, 0, count, memory); // create the fake triton client used for testing. 
- auto triton_client = std::make_unique(); - auto triton_inference_client = std::make_unique(std::move(triton_client), ""); + auto triton_client = std::make_unique(); + auto triton_inference_client = + std::make_unique(std::move(triton_client), "", true); auto stage = morpheus::InferenceClientStage( std::move(triton_inference_client), "", false, {}, {}); @@ -342,3 +390,90 @@ TEST_F(TestTritonInferenceStage, SingleRow) ASSERT_EQ(results.size(), 1); } + +TEST_F(TestTritonInferenceStage, ForceConvert) +{ + using namespace morpheus; + const TypeId model_type = TypeId::INT32; + const std::size_t count = 10; + + std::vector test_types = {TypeId::INT8, + TypeId::INT16, + TypeId::INT32, + TypeId::INT64, + TypeId::UINT8, + TypeId::UINT16, + TypeId::UINT32, + TypeId::UINT64}; + + for (const auto type_id : test_types) + { + for (bool force_convert_inputs : {true, false}) + { + const bool expect_throw = (type_id != model_type) && !force_convert_inputs; + const auto dtype = DType(type_id); + + DVLOG(10) << "Testing type: " << dtype.name() << " with force_convert_inputs: " << force_convert_inputs + << " and expect_throw: " << expect_throw; + + // Create a seq_id tensor + auto md = + std::make_shared(rmm::cuda_stream_per_thread, rmm::mr::get_current_device_resource()); + auto seq_ids_buffer = MatxUtil::create_seq_ids(count, 1, type_id, md); + + auto tensors = TensorMap(); + tensors["seq_ids"].swap(Tensor::create(seq_ids_buffer, dtype, {count, 3}, {})); + + // create the MultiInferenceMessage using the sequence id tensor. + auto memory = std::make_shared(count, std::move(tensors)); + auto table = create_test_table_with_metadata(count); + auto meta = morpheus::MessageMeta::create_from_cpp(std::move(table), 1); + auto message = std::make_shared(meta, 0, count, memory); + + // create the fake triton client used for testing. 
+ auto triton_client = std::make_unique(); + auto triton_inference_client = + std::make_unique(std::move(triton_client), "", force_convert_inputs); + auto stage = + morpheus::InferenceClientStage( + std::move(triton_inference_client), "", false, {}, {}); + + // manually invoke the stage and iterate through the inference responses + auto on = std::make_shared(); + auto results_task = [](auto& stage, auto message, auto on) + -> mrc::coroutines::Task>> { + std::vector> results; + + auto responses_generator = stage.on_data(std::move(message), on); + + auto iter = co_await responses_generator.begin(); + + while (iter != responses_generator.end()) + { + results.emplace_back(std::move(*iter)); + + co_await ++iter; + } + + co_return results; + }(stage, message, on); + + results_task.resume(); + + while (on->resume_next()) {} + + if (expect_throw) + { + ASSERT_THROW(results_task.promise().result(), std::invalid_argument); + } + else + { + ASSERT_NO_THROW(results_task.promise().result()); + + auto results = results_task.promise().result(); + + ASSERT_EQ(results.size(), 1); + } + } + } +} diff --git a/morpheus/common/__init__.py b/morpheus/common/__init__.py index 01b1d97ba0..3170b82e66 100644 --- a/morpheus/common/__init__.py +++ b/morpheus/common/__init__.py @@ -23,6 +23,7 @@ from morpheus._lib.common import TypeId from morpheus._lib.common import determine_file_type from morpheus._lib.common import read_file_to_df +from morpheus._lib.common import typeid_is_fully_supported from morpheus._lib.common import typeid_to_numpy_str from morpheus._lib.common import write_df_to_file @@ -34,6 +35,7 @@ "HttpServer", "read_file_to_df", "Tensor", + "typeid_is_fully_supported", "typeid_to_numpy_str", "TypeId", "write_df_to_file", diff --git a/morpheus/pipeline/preallocator_mixin.py b/morpheus/pipeline/preallocator_mixin.py index c40ed6be04..acec20b9c7 100644 --- a/morpheus/pipeline/preallocator_mixin.py +++ b/morpheus/pipeline/preallocator_mixin.py @@ -26,6 +26,7 @@ import cudf from morpheus.common import TypeId +from morpheus.common import typeid_is_fully_supported from morpheus.common import typeid_to_numpy_str from morpheus.config import CppConfig from morpheus.messages import ControlMessage @@ -90,6 +91,13 @@ def _preallocate_control(self, msg: ControlMessage) -> ControlMessage: self._preallocate_meta(msg.payload()) return msg + def _all_types_supported_in_cpp(self) -> bool: + for column_type in self._needed_columns.values(): + if not typeid_is_fully_supported(column_type): + return False + + return True + def _post_build_single(self, builder: mrc.Builder, out_node: mrc.SegmentObject) -> mrc.SegmentObject: out_type = self.output_ports[0].output_type pretty_type = pretty_print_type_name(out_type) @@ -99,7 +107,7 @@ def _post_build_single(self, builder: mrc.Builder, out_node: mrc.SegmentObject) if issubclass(out_type, (ControlMessage, MessageMeta, MultiMessage)): # Intentionally not using `_build_cpp_node` because `LinearBoundaryIngressStage` lacks a C++ impl - if CppConfig.get_should_use_cpp(): + if CppConfig.get_should_use_cpp() and self._all_types_supported_in_cpp(): import morpheus._lib.stages as _stages needed_columns = list(self._needed_columns.items()) if issubclass(out_type, ControlMessage): diff --git a/morpheus/stages/inference/triton_inference_stage.py b/morpheus/stages/inference/triton_inference_stage.py index 0b8a79dddf..c46cdcab48 100644 --- a/morpheus/stages/inference/triton_inference_stage.py +++ b/morpheus/stages/inference/triton_inference_stage.py @@ -756,7 +756,14 @@ def __init__(self, def 
supports_cpp_node(self) -> bool: # Get the value from the worker class - return TritonInferenceWorker.supports_cpp_node() + if TritonInferenceWorker.supports_cpp_node(): + if not self._use_shared_memory: + return True + + logger.warning("The C++ implementation of TritonInferenceStage does not support the use_shared_memory " + "option. Falling back to Python implementation.") + + return False def _get_inference_worker(self, inf_queue: ProducerConsumerQueue) -> TritonInferenceWorker: """ @@ -781,6 +788,7 @@ def _get_cpp_inference_node(self, builder: mrc.Builder) -> mrc.SegmentObject: self._server_url, self._model_name, self._needs_logits, + self._force_convert_inputs, self._input_mapping, self._output_mapping) @@ -789,6 +797,7 @@ def _get_cpp_inference_node(self, builder: mrc.Builder) -> mrc.SegmentObject: self._server_url, self._model_name, self._needs_logits, + self._force_convert_inputs, self._input_mapping, self._output_mapping) From fefa1cc5319217daf058e6bcd4a4ca2850cdba33 Mon Sep 17 00:00:00 2001 From: HesAnEasyCoder <105108698+HesAnEasyCoder@users.noreply.github.com> Date: Thu, 2 May 2024 22:46:06 -0700 Subject: [PATCH 20/38] Update dfp-model-card.md Updating to fields and presentation of fields for Model Card++ 3.0 Release. --- models/model-cards/dfp-model-card.md | 56 ++++------------------------ 1 file changed, 7 insertions(+), 49 deletions(-) diff --git a/models/model-cards/dfp-model-card.md b/models/model-cards/dfp-model-card.md index 420ceabfe0..d3d3a381f0 100644 --- a/models/model-cards/dfp-model-card.md +++ b/models/model-cards/dfp-model-card.md @@ -106,36 +106,6 @@ The evaluation dataset consists of AWS CloudTrail logs. It contains logs from tw ## Model Card ++ Bias Subcard -### What is the gender balance of the model validation data? -* Not Applicable - -### What is the racial/ethnicity balance of the model validation data? -* Not Applicable - -### What is the age balance of the model validation data? -* Not Applicable - -### What is the language balance of the model validation data? -* English (cloudtrail logs): 100% - -### What is the geographic origin language balance of the model validation data? -* Not Applicable - -### What is the educational background balance of the model validation data? -* Not Applicable - -### What is the accent balance of the model validation data? -* Not Applicable - -### What is the face/key point balance of the model validation data? -* Not Applicable - -### What is the skin/tone balance of the model validation data? -* Not Applicable - -### What is the religion balance of the model validation data? -* Not Applicable - ### Individuals from the following adversely impacted (protected classes) groups participate in model design and testing. * Not Applicable @@ -147,7 +117,7 @@ The evaluation dataset consists of AWS CloudTrail logs. It contains logs from tw ### Name example applications and use cases for this model. * The model is primarily designed for testing purposes and serves as a small pretrained model specifically used to evaluate and validate the DFP pipeline. Its application is focused on assessing the effectiveness of the pipeline rather than being intended for broader use cases or specific applications beyond testing. -### Fill in the blank for the model technique. +### Intended Users. * This model is designed for developers seeking to test the DFP pipeline with a small pretrained model trained on a synthetic dataset. ### Name who is intended to benefit from this model. 
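Stepping back to the inference-stage changes above: the new `force_convert_inputs` argument is surfaced on the Python `TritonInferenceStage`. Below is a minimal usage sketch; the model name, server URL, and input file are placeholder assumptions, not values taken from these patches, and a real pipeline would also include the usual deserialize/preprocess stages.

```python
from morpheus.config import Config, PipelineModes
from morpheus.pipeline.linear_pipeline import LinearPipeline
from morpheus.stages.inference.triton_inference_stage import TritonInferenceStage
from morpheus.stages.input.file_source_stage import FileSourceStage

config = Config()
config.mode = PipelineModes.FIL  # assumed FIL-mode pipeline, for illustration only

pipeline = LinearPipeline(config)
pipeline.set_source(FileSourceStage(config, filename="input.jsonlines"))  # placeholder input file

# With force_convert_inputs=False, a dtype mismatch between the input tensors and
# the model's declared inputs now raises an error instead of converting silently;
# passing True opts back in to the automatic (potentially lossy) conversion.
pipeline.add_stage(
    TritonInferenceStage(config,
                         model_name="example-model",   # placeholder model name
                         server_url="localhost:8000",  # placeholder Triton endpoint
                         force_convert_inputs=True))

pipeline.run()
```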
@@ -157,16 +127,16 @@ The evaluation dataset consists of AWS CloudTrail logs. It contains logs from tw * The model calculates an anomaly score for each input based on the reconstruction loss obtained from the trained Autoencoder. This score represents the level of anomaly detected in the input data. Higher scores indicate a higher likelihood of anomalous behavior. * The model provides the reconstruction loss of each feature to facilitate further testing and debugging of the pipeline. -### List the steps explaining how this model works. +### Describe how this model works. * The model works by training on baseline behaviors and subsequently detecting deviations from the established baseline, triggering alerts accordingly. * [Training notebook](https://github.com/nv-morpheus/Morpheus/blob/branch-24.06/models/training-tuning-scripts/dfp-models/hammah-20211017.ipynb) -### Name the adversely impacted groups (protected classes) this has been tested to deliver comparable outcomes regardless of: -* Not Applicable - ### List the technical limitations of the model. * The model expects cloudtrail logs with specific features that match the training dataset. Data lacking the required features or requiring a different feature set may not be compatible with the model. +### Has this been verified to have met prescribed NVIDIA quality standards? +* Yes + ### What performance metrics were used to affirm the model's performance? * The model's performance was evaluated based on its ability to correctly identify anomalous behavior in the synthetic dataset during testing. @@ -181,10 +151,7 @@ The evaluation dataset consists of AWS CloudTrail logs. It contains logs from tw ### Link the location of the training dataset's repository (if able to share). * https://github.com/nv-morpheus/Morpheus/tree/branch-24.06/models/datasets/training-data/cloudtrail -### Is the model used in an application with physical safety impact? -* No - -### Describe physical safety impact (if present). +### Describe the life critical impact (if present). * None ### Was model and dataset assessed for vulnerability for potential form of attack? @@ -196,12 +163,6 @@ The evaluation dataset consists of AWS CloudTrail logs. It contains logs from tw ### Name use case restrictions for the model. * The model's use case is restricted to testing the Morpheus pipeline and may not be suitable for other applications. -### Has this been verified to have met prescribed quality standards? -* No - -### Name target quality Key Performance Indicators (KPIs) for which this has been tested. -* None - ### Is the model and dataset compliant with National Classification Management Society (NCMS)? * No @@ -236,10 +197,7 @@ The evaluation dataset consists of AWS CloudTrail logs. It contains logs from tw ### If PII collected for the development of this AI model, was it minimized to only what was required? * Not Applicable (no PII collected) -### Is data in dataset traceable? -* No - -### Are we able to identify and trace source of dataset? +### Is there data provenance? * Yes ([fully synthetic dataset](https://github.com/nv-morpheus/Morpheus/tree/branch-24.06/models/datasets/training-data/cloudtrail)) ### Does data labeling (annotation, metadata) comply with privacy laws? 
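One more note on the stage changes earlier in this series: the preallocator mixin now verifies that every requested column dtype is fully supported before selecting the C++ node. A small sketch of the kind of check the new `typeid_is_fully_supported` helper enables; the column map here is illustrative only.

```python
from morpheus.common import TypeId, typeid_is_fully_supported

# Hypothetical columns a stage might ask to have preallocated.
needed_columns = {"score": TypeId.FLOAT32, "label": TypeId.STRING}

# Mirrors the spirit of PreallocatorMixin._all_types_supported_in_cpp():
# only take the C++ code path when every dtype is fully supported there.
use_cpp_node = all(typeid_is_fully_supported(type_id) for type_id in needed_columns.values())

# STRING is one of the types the new TestDType.IsFullySupported test marks as
# unsupported, so this prints False and the Python node would be used instead.
print(use_cpp_node)
```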
From 33599b8077424ae8ddce03d228fb74dfb93cc58e Mon Sep 17 00:00:00 2001 From: HesAnEasyCoder <105108698+HesAnEasyCoder@users.noreply.github.com> Date: Thu, 2 May 2024 22:46:17 -0700 Subject: [PATCH 21/38] Update root-cause-analysis-model-card.md Updating to fields and presentation of fields for Model Card++ 3.0 Release. --- .../root-cause-analysis-model-card.md | 80 ++----------------- 1 file changed, 8 insertions(+), 72 deletions(-) diff --git a/models/model-cards/root-cause-analysis-model-card.md b/models/model-cards/root-cause-analysis-model-card.md index 0f6a332f52..bd8c301faf 100644 --- a/models/model-cards/root-cause-analysis-model-card.md +++ b/models/model-cards/root-cause-analysis-model-card.md @@ -21,63 +21,49 @@ limitations under the License. # Model Overview ## Description: - * Root cause analysis is a binary classifier differentiating between ordinary logs and errors/problems/root causes in the log files.
## References(s): - * Devlin J. et al. (2018), BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding https://arxiv.org/abs/1810.04805
## Model Architecture: - **Architecture Type:** - * Transformers
**Network Architecture:** - * BERT
## Input: (Enter "None" As Needed) **Input Format:** - * CSV
**Input Parameters:** - * kern.log file contents
**Other Properties Related to Output:** - * N/A
## Output: (Enter "None" As Needed) **Output Format:** - * Binary Results, Root Cause or Ordinary
**Output Parameters:** - * N/A
**Other Properties Related to Output:** - * N/A
## Software Integration: **Runtime(s):** - * Morpheus
**Supported Hardware Platform(s):**
- * Ampere/Turing
**Supported Operating System(s):**
- * Linux
## Model Version(s): @@ -88,67 +74,31 @@ limitations under the License. ## Training Dataset: **Link:** - * https://github.com/nv-morpheus/Morpheus/blob/branch-24.06/models/datasets/training-data/root-cause-training-data.csv
**Properties (Quantity, Dataset Descriptions, Sensor(s)):** - * kern.log files from DGX machines
## Evaluation Dataset: **Link:** - * https://github.com/nv-morpheus/Morpheus/blob/branch-24.06/models/datasets/validation-data/root-cause-validation-data-input.jsonlines
**Properties (Quantity, Dataset Descriptions, Sensor(s)):** - * kern.log files from DGX machines
## Inference: **Engine:** - * Triton
**Test Hardware:**
- * Other
# Subcards ## Model Card ++ Bias Subcard -### What is the gender balance of the model validation data? -* Not Applicable - -### What is the racial/ethnicity balance of the model validation data? -* Not Applicable - -### What is the age balance of the model validation data? -* Not Applicable - -### What is the language balance of the model validation data? -* Not Applicable - -### What is the geographic origin language balance of the model validation data? -* Not Applicable - -### What is the educational background balance of the model validation data? -* Not Applicable - -### What is the accent balance of the model validation data? -* Not Applicable - -### What is the face/key point balance of the model validation data? -* Not Applicable - -### What is the skin/tone balance of the model validation data? -* Not Applicable - -### What is the religion balance of the model validation data? -* Not Applicable - ### Individuals from the following adversely impacted (protected classes) groups participate in model design and testing. * Not Applicable @@ -160,26 +110,24 @@ limitations under the License. ### Name example applications and use cases for this model. * The model is primarily designed for testing purposes and serves as a small pre-trained model specifically used to evaluate and validate the Root Cause Analysis pipeline. This model is an example of customized transformer-based root cause analysis. It can be used for pipeline testing purposes. It needs to be re-trained for specific root cause analysis or predictive maintenance needs with the fine-tuning scripts in the repo. The hyperparameters can be optimised to adjust to get the best results with another dataset. The aim is to get the model to predict some false positives that could be previously unknown error types. Users can use this root cause analysis approach with other log types too. If they have known failures in their logs, they can use them to train along with ordinary logs and can detect other root causes they weren't aware of before. -### Fill in the blank for the model technique. - +### Intended Users. * This model is designed for developers seeking to test the root cause analysis pipeline with a small pre-trained model trained on a very small `kern.log` file from a DGX. ### Name who is intended to benefit from this model. - * The intended beneficiaries of this model are developers who aim to test the functionality of the DFP pipeline using synthetic datasets ### Describe the model output. * This model output can be used as a binary result, Root cause or Ordinary -### List the steps explaining how this model works. +### Describe how this model works. * A BERT model gets fine-tuned with the kern.log dataset and in the inference it predicts one of the binary classes. Root cause or Ordinary. -### Name the adversely impacted groups (protected classes) this has been tested to deliver comparable outcomes regardless of: -* Not Applicable - ### List the technical limitations of the model. * For different log types and content, different models need to be trained. +### Has this been verified to have met prescribed NVIDIA quality standards? +* Yes + ### What performance metrics were used to affirm the model's performance? * F1 @@ -195,10 +143,7 @@ limitations under the License. ### Link the location of the training dataset's repository. * https://github.com/nv-morpheus/Morpheus/blob/branch-24.06/models/datasets/training-data/root-cause-training-data.csv -### Is the model used in an application with physical safety impact? 
-* No - -### Describe physical safety impact (if present). +### Describe the life critical impact (if present). * None ### Was model and dataset assessed for vulnerability for potential form of attack? @@ -210,12 +155,6 @@ limitations under the License. ### Name use case restrictions for the model. * Different models need to be trained depending on the log types. -### Has this been verified to have met prescribed quality standards? -* No - -### Name target quality Key Performance Indicators (KPIs) for which this has been tested. -* N/A - ### Is the model and dataset compliant with National Classification Management Society (NCMS)? * No @@ -232,7 +171,7 @@ limitations under the License. ### Generatable or reverse engineerable personally-identifiable information (PII)? -* Neither +* None ### Was consent obtained for any PII used? * N/A @@ -249,12 +188,9 @@ limitations under the License. ### If PII collected for the development of this AI model, was it minimized to only what was required? * N/A -### Is data in dataset traceable? +### Is there data provenance? * Original raw logs are not saved. The small sample in the repo is saved for testing the pipeline. -### Are we able to identify and trace source of dataset? -* N/A - ### Does data labeling (annotation, metadata) comply with privacy laws? * N/A From eb8036f0577ec8aa4aa5579fd886162b7b4624bc Mon Sep 17 00:00:00 2001 From: HesAnEasyCoder <105108698+HesAnEasyCoder@users.noreply.github.com> Date: Mon, 6 May 2024 16:45:11 -0700 Subject: [PATCH 22/38] Update abp-model-card.md Adding "### Describe access restrictions * The Principle of least privilege (PoLP) is applied limiting access for dataset generation and model development. Restrictions enforce dataset access during training, and dataset license constraints adhered to." --- models/model-cards/abp-model-card.md | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/models/model-cards/abp-model-card.md b/models/model-cards/abp-model-card.md index 3f9043db86..f7e49eed37 100644 --- a/models/model-cards/abp-model-card.md +++ b/models/model-cards/abp-model-card.md @@ -210,13 +210,9 @@ NVIDIA believes Trustworthy AI is a shared responsibility and we have establishe * No -### Are there explicit model and dataset restrictions? +### Describe access restrictions -* No - -### Are there access restrictions to systems, model, and data? - -* No +* The Principle of least privilege (PoLP) is applied limiting access for dataset generation and model development. Restrictions enforce dataset access during training, and dataset license constraints adhered to. ### Is there a digital signature? From c010017b8a217ca6738704221100bc61cbd5fcf3 Mon Sep 17 00:00:00 2001 From: HesAnEasyCoder <105108698+HesAnEasyCoder@users.noreply.github.com> Date: Mon, 6 May 2024 16:45:32 -0700 Subject: [PATCH 23/38] Update dfp-model-card.md ### Describe access restrictions * The Principle of least privilege (PoLP) is applied limiting access for dataset generation and model development. Restrictions enforce dataset access during training, and dataset license constraints adhered to." --- models/model-cards/dfp-model-card.md | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/models/model-cards/dfp-model-card.md b/models/model-cards/dfp-model-card.md index 420ceabfe0..a049cf18fa 100644 --- a/models/model-cards/dfp-model-card.md +++ b/models/model-cards/dfp-model-card.md @@ -205,11 +205,9 @@ The evaluation dataset consists of AWS CloudTrail logs. 
It contains logs from tw ### Is the model and dataset compliant with National Classification Management Society (NCMS)? * No -### Are there explicit model and dataset restrictions? -* No +### Describe access restrictions -### Are there access restrictions to systems, model, and data? -* No +* The Principle of least privilege (PoLP) is applied limiting access for dataset generation and model development. Restrictions enforce dataset access during training, and dataset license constraints adhered to. ### Is there a digital signature? * No From 9108041fbbf51c3ee790de76b547a8e19cf0cf6c Mon Sep 17 00:00:00 2001 From: HesAnEasyCoder <105108698+HesAnEasyCoder@users.noreply.github.com> Date: Mon, 6 May 2024 16:45:59 -0700 Subject: [PATCH 24/38] Update gnn-fsi-model-card.md ### Describe access restrictions * The Principle of least privilege (PoLP) is applied limiting access for dataset generation and model development. Restrictions enforce dataset access during training, and dataset license constraints adhered to." --- models/model-cards/gnn-fsi-model-card.md | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/models/model-cards/gnn-fsi-model-card.md b/models/model-cards/gnn-fsi-model-card.md index ae76cd8edd..84ce630c55 100644 --- a/models/model-cards/gnn-fsi-model-card.md +++ b/models/model-cards/gnn-fsi-model-card.md @@ -169,11 +169,9 @@ NVIDIA believes Trustworthy AI is a shared responsibility and we have establishe ### Is the model and dataset compliant with National Classification Management Society (NCMS)? * Not Applicable -### Are there explicit model and dataset restrictions? -* No +### Describe access restrictions -### Are there access restrictions to systems, model, and data? -* No +* The Principle of least privilege (PoLP) is applied limiting access for dataset generation and model development. Restrictions enforce dataset access during training, and dataset license constraints adhered to. ### Is there a digital signature? * No From 4e962d0b21cb4a712744e2ed052f065c3e848642 Mon Sep 17 00:00:00 2001 From: HesAnEasyCoder <105108698+HesAnEasyCoder@users.noreply.github.com> Date: Mon, 6 May 2024 16:46:20 -0700 Subject: [PATCH 25/38] Update phishing-model-card.md ### Describe access restrictions * The Principle of least privilege (PoLP) is applied limiting access for dataset generation and model development. Restrictions enforce dataset access during training, and dataset license constraints adhered to." --- models/model-cards/phishing-model-card.md | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/models/model-cards/phishing-model-card.md b/models/model-cards/phishing-model-card.md index 7699c256b2..a902a3fde5 100644 --- a/models/model-cards/phishing-model-card.md +++ b/models/model-cards/phishing-model-card.md @@ -204,11 +204,9 @@ NVIDIA believes Trustworthy AI is a shared responsibility and we have establishe ### Is the model and dataset compliant with National Classification Management Society (NCMS)? * No -### Are there explicit model and dataset restrictions? -* No +### Describe access restrictions -### Are there access restrictions to systems, model, and data? -* No +* The Principle of least privilege (PoLP) is applied limiting access for dataset generation and model development. Restrictions enforce dataset access during training, and dataset license constraints adhered to. ### Is there a digital signature? 
From 4f2b6c88f645ccb89b4531546ee0d8e0e4611a2b Mon Sep 17 00:00:00 2001 From: HesAnEasyCoder <105108698+HesAnEasyCoder@users.noreply.github.com> Date: Mon, 6 May 2024 16:46:50 -0700 Subject: [PATCH 26/38] Update root-cause-analysis-model-card.md ### Describe access restrictions * The Principle of least privilege (PoLP) is applied limiting access for dataset generation and model development. Restrictions enforce dataset access during training, and dataset license constraints adhered to." --- models/model-cards/root-cause-analysis-model-card.md | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/models/model-cards/root-cause-analysis-model-card.md b/models/model-cards/root-cause-analysis-model-card.md index bd8c301faf..064019756b 100644 --- a/models/model-cards/root-cause-analysis-model-card.md +++ b/models/model-cards/root-cause-analysis-model-card.md @@ -158,11 +158,9 @@ limitations under the License. ### Is the model and dataset compliant with National Classification Management Society (NCMS)? * No -### Are there explicit model and dataset restrictions? -* It is for pipeline testing purposes. +### Describe access restrictions -### Are there access restrictions to systems, model, and data? -* No +* The Principle of least privilege (PoLP) is applied limiting access for dataset generation and model development. Restrictions enforce dataset access during training, and dataset license constraints adhered to. ### Is there a digital signature? * No From c88527bee55a4e6b634e9a553d4a581a128e05a5 Mon Sep 17 00:00:00 2001 From: HesAnEasyCoder <105108698+HesAnEasyCoder@users.noreply.github.com> Date: Tue, 7 May 2024 07:37:51 -0700 Subject: [PATCH 27/38] Update dfp-model-card.md (#1644) Closes ## By Submitting this PR I confirm: - I am familiar with the [Contributing Guidelines](https://github.com/nv-morpheus/Morpheus/blob/main/docs/source/developer_guide/contributing.md). - When the PR is ready for review, new or existing tests cover these changes. - When the PR is ready for review, the documentation is up to date with these changes. Authors: - https://github.com/HesAnEasyCoder Approvers: - Devin Robison (https://github.com/drobison00) - Michael Demoret (https://github.com/mdemoret-nv) URL: https://github.com/nv-morpheus/Morpheus/pull/1644 --- models/model-cards/dfp-model-card.md | 81 +++++++--------------------- 1 file changed, 20 insertions(+), 61 deletions(-) diff --git a/models/model-cards/dfp-model-card.md b/models/model-cards/dfp-model-card.md index 420ceabfe0..1839daba05 100644 --- a/models/model-cards/dfp-model-card.md +++ b/models/model-cards/dfp-model-card.md @@ -45,13 +45,11 @@ The model architecture consists of an Autoencoder, where the reconstruction loss ## Output: **Output Format:** -* Anomaly score and the reconstruction loss for each feature in a pandas dataframe +* Anomaly score (per feature) +* Reconstruction loss (per feature) **Output Parameters:** -* None - -**Other Properties Related to Output:** -* Not Applicable +* Pandas Dataframe ## Software Integration: **Runtime(s):** @@ -102,45 +100,21 @@ The evaluation dataset consists of AWS CloudTrail logs. It contains logs from tw **Test Hardware:** * Other +## Ethical Considerations (For NVIDIA Models Only): +NVIDIA believes Trustworthy AI is a shared responsibility and we have established policies and practices to enable development for a wide array of AI applications. 
When downloaded or used in accordance with our terms of service, developers should work with their internal model team to ensure this model meets requirements for the relevant industry and use case and addresses unforeseen product misuse. For more detailed information on ethical considerations for this model, please see the Model Card++ Explainability, Bias, Safety & Security, and Privacy Subcard + # Subcards ## Model Card ++ Bias Subcard -### What is the gender balance of the model validation data? -* Not Applicable - -### What is the racial/ethnicity balance of the model validation data? -* Not Applicable - -### What is the age balance of the model validation data? -* Not Applicable - ### What is the language balance of the model validation data? * English (cloudtrail logs): 100% -### What is the geographic origin language balance of the model validation data? -* Not Applicable - -### What is the educational background balance of the model validation data? -* Not Applicable - -### What is the accent balance of the model validation data? -* Not Applicable - -### What is the face/key point balance of the model validation data? -* Not Applicable - -### What is the skin/tone balance of the model validation data? -* Not Applicable - -### What is the religion balance of the model validation data? -* Not Applicable - ### Individuals from the following adversely impacted (protected classes) groups participate in model design and testing. -* Not Applicable +* None of the Above. ### Describe measures taken to mitigate against unwanted bias. -* Not Applicable +* None of the Above. ## Model Card ++ Explainability Subcard @@ -161,12 +135,12 @@ The evaluation dataset consists of AWS CloudTrail logs. It contains logs from tw * The model works by training on baseline behaviors and subsequently detecting deviations from the established baseline, triggering alerts accordingly. * [Training notebook](https://github.com/nv-morpheus/Morpheus/blob/branch-24.06/models/training-tuning-scripts/dfp-models/hammah-20211017.ipynb) -### Name the adversely impacted groups (protected classes) this has been tested to deliver comparable outcomes regardless of: -* Not Applicable - ### List the technical limitations of the model. * The model expects cloudtrail logs with specific features that match the training dataset. Data lacking the required features or requiring a different feature set may not be compatible with the model. +### Has this been verified to have met prescribed quality standards? +* Yes + ### What performance metrics were used to affirm the model's performance? * The model's performance was evaluated based on its ability to correctly identify anomalous behavior in the synthetic dataset during testing. @@ -181,10 +155,7 @@ The evaluation dataset consists of AWS CloudTrail logs. It contains logs from tw ### Link the location of the training dataset's repository (if able to share). * https://github.com/nv-morpheus/Morpheus/tree/branch-24.06/models/datasets/training-data/cloudtrail -### Is the model used in an application with physical safety impact? -* No - -### Describe physical safety impact (if present). +### Describe the life critical impact (if present). * None ### Was model and dataset assessed for vulnerability for potential form of attack? @@ -196,30 +167,18 @@ The evaluation dataset consists of AWS CloudTrail logs. It contains logs from tw ### Name use case restrictions for the model. * The model's use case is restricted to testing the Morpheus pipeline and may not be suitable for other applications. 
-### Has this been verified to have met prescribed quality standards? -* No - -### Name target quality Key Performance Indicators (KPIs) for which this has been tested. -* None - -### Is the model and dataset compliant with National Classification Management Society (NCMS)? -* No - -### Are there explicit model and dataset restrictions? -* No +### Name explicit model and/or dataset restrictions. +* The Principle of least privilege (PoLP) is applied limiting access for dataset generation and model development. ### Are there access restrictions to systems, model, and data? * No -### Is there a digital signature? -* No - ## Model Card ++ Privacy Subcard ### Generatable or reverse engineerable personally-identifiable information (PII)? -* Neither +* None ### Was consent obtained for any PII used? * The synthetic data used in this model is generated using the [faker](https://github.com/joke2k/faker/blob/master/LICENSE.txt) python package. The user agent field is generated by faker, which pulls items from its own dataset of fictitious values (located in the linked repo). Similarly, the event source field is randomly chosen from a list of event names provided in the AWS documentation. There are no privacy concerns or PII involved in this synthetic data generation process. @@ -228,22 +187,22 @@ The evaluation dataset consists of AWS CloudTrail logs. It contains logs from tw * Not applicable ### How often is dataset reviewed? -* The dataset is initially reviewed upon addition, and subsequent reviews are conducted as needed or upon request for any changes. +* The dataset is initially reviewed upon addition, and subsequent reviews are conducted as needed or upon request for changes. ### Is a mechanism in place to honor data subject right of access or deletion of personal data? -* No (as the dataset is fully synthetic) +* No (dataset is fully synthetic) ### If PII collected for the development of this AI model, was it minimized to only what was required? * Not Applicable (no PII collected) -### Is data in dataset traceable? +### Is there data provenance? * No ### Are we able to identify and trace source of dataset? * Yes ([fully synthetic dataset](https://github.com/nv-morpheus/Morpheus/tree/branch-24.06/models/datasets/training-data/cloudtrail)) ### Does data labeling (annotation, metadata) comply with privacy laws? -* Not applicable (as the dataset is fully synthetic) +* Not applicable (dataset is fully synthetic) ### Is data compliant with data subject requests for data correction or removal, if such a request was made? -* Not applicable (as the dataset is fully synthetic) +* Not applicable (dataset is fully synthetic) From 4c3fc5d99d269f41458d045bc5d935945a1deac3 Mon Sep 17 00:00:00 2001 From: HesAnEasyCoder <105108698+HesAnEasyCoder@users.noreply.github.com> Date: Wed, 8 May 2024 12:04:27 -0700 Subject: [PATCH 28/38] Update phishing-model-card.md (#1680) Closes ## By Submitting this PR I confirm: - I am familiar with the [Contributing Guidelines](https://github.com/nv-morpheus/Morpheus/blob/main/docs/source/developer_guide/contributing.md). - When the PR is ready for review, new or existing tests cover these changes. - When the PR is ready for review, the documentation is up to date with these changes. 
Authors: - https://github.com/HesAnEasyCoder Approvers: - Devin Robison (https://github.com/drobison00) - Michael Demoret (https://github.com/mdemoret-nv) URL: https://github.com/nv-morpheus/Morpheus/pull/1680 --- models/model-cards/phishing-model-card.md | 48 ++--------------------- 1 file changed, 4 insertions(+), 44 deletions(-) diff --git a/models/model-cards/phishing-model-card.md b/models/model-cards/phishing-model-card.md index 7699c256b2..6cb445b8cc 100644 --- a/models/model-cards/phishing-model-card.md +++ b/models/model-cards/phishing-model-card.md @@ -28,60 +28,47 @@ limitations under the License. * Devlin J. et al. (2018), BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding https://arxiv.org/abs/1810.04805
## Model Architecture: - **Architecture Type:** - * Transformers
**Network Architecture:** - * BERT
## Input: (Enter "None" As Needed) **Input Format:** - * Evaluation script downloads the smsspamcollection.zip and extract tabular information into a dataframe
**Input Parameters:** - * SMS/emails
**Other Properties Related to Output:** - * N/A
## Output: (Enter "None" As Needed) **Output Format:** - * Binary Results, Fraudulent or Benign
**Output Parameters:** - * N/A
**Other Properties Related to Output:** - * N/A
## Software Integration: **Runtime(s):** - * Morpheus
**Supported Hardware Platform(s):**
- * Ampere/Turing
**Supported Operating System(s):**
- * Linux
## Model Version(s): - * v1
# Training & Evaluation: @@ -89,31 +76,25 @@ limitations under the License. ## Training Dataset: **Link:** - * http://archive.ics.uci.edu/ml/machine-learning-databases/00228/smsspamcollection.zip
**Properties (Quantity, Dataset Descriptions, Sensor(s)):** - * Dataset consists of SMSs
## Evaluation Dataset: **Link:** - * https://github.com/nv-morpheus/Morpheus/blob/branch-24.06/models/datasets/validation-data/phishing-email-validation-data.jsonlines
**Properties (Quantity, Dataset Descriptions, Sensor(s)):** - * Dataset consists of SMSs
## Inference: **Engine:** - * Triton
**Test Hardware:**
- * DGX (V100)
## Ethical Considerations: @@ -124,19 +105,15 @@ NVIDIA believes Trustworthy AI is a shared responsibility and we have establishe ## Model Card ++ Bias Subcard ### What is the language balance of the model validation data? - * English ### What is the geographic origin language balance of the model validation data? - * UK ### Individuals from the following adversely impacted (protected classes) groups participate in model design and testing. - * Not Applicable ### Describe measures taken to mitigate against unwanted bias. - * Not Applicable ## Model Card ++ Explainability Subcard @@ -144,28 +121,22 @@ NVIDIA believes Trustworthy AI is a shared responsibility and we have establishe ### Name example applications and use cases for this model. * The model is primarily designed for testing purposes and serves as a small pre-trained model specifically used to evaluate and validate the phishing detection pipeline. Its application is focused on assessing the effectiveness of the pipeline rather than being intended for broader use cases or specific applications beyond testing. -### Fill in the blank for the model technique. - +### Intended Users. * This model is designed for developers seeking to test the phishing detection pipeline with a small pre-trained model. ### Name who is intended to benefit from this model. - * The intended beneficiaries of this model are developers who aim to test the performance and functionality of the phishing pipeline using synthetic datasets. It may not be suitable or provide significant value for real-world phishing messages. ### Describe the model output. * This model output can be used as a binary result, Phishing/Spam or Benign -### List the steps explaining how this model works. +### Describe how this model works. * A BERT model gets fine-tuned with the dataset and in the inference it predicts one of the binary classes. Phishing/Spam or Benign. -### Name the adversely impacted groups (protected classes) this has been tested to deliver comparable outcomes regardless of: -* Not Applicable - ### List the technical limitations of the model. * For different email/SMS types and content, different models need to be trained. ### Has this been verified to have met prescribed NVIDIA standards? - * Yes ### What performance metrics were used to affirm the model's performance? @@ -182,25 +153,18 @@ NVIDIA believes Trustworthy AI is a shared responsibility and we have establishe ### Link the location of the training dataset's repository. * http://archive.ics.uci.edu/ml/machine-learning-databases/00228/smsspamcollection.zip -### Is the model used in an application with physical safety impact? -* No - -### Describe life-critical impact (if present). +### Describe the life critical impact (if present). * None ### Was model and dataset assessed for vulnerability for potential form of attack? * No ### Name applications for the model. - * The primary application for this model is testing the Morpheus phishing detection pipeline ### Name use case restrictions for the model. * This pretrained model's use case is restricted to testing the Morpheus pipeline and may not be suitable for other applications. -### Name target quality Key Performance Indicators (KPIs) for which this has been tested. -* N/A - ### Is the model and dataset compliant with National Classification Management Society (NCMS)? 
* No @@ -230,16 +194,12 @@ NVIDIA believes Trustworthy AI is a shared responsibility and we have establishe * Unknown ### Is a mechanism in place to honor data subject right of access or deletion of personal data? - * N/A ### If PII collected for the development of this AI model, was it minimized to only what was required? * N/A -### Is data in dataset traceable? -* N/A - -### Are we able to identify and trace source of dataset? +### Is there data provenance? * N/A ### Does data labeling (annotation, metadata) comply with privacy laws? From a5a68413cad58c1d2264ca34eb3fa6f70ac32128 Mon Sep 17 00:00:00 2001 From: HesAnEasyCoder <105108698+HesAnEasyCoder@users.noreply.github.com> Date: Wed, 8 May 2024 12:05:03 -0700 Subject: [PATCH 29/38] Update gnn-fsi-model-card.md (#1681) Closes ## By Submitting this PR I confirm: - I am familiar with the [Contributing Guidelines](https://github.com/nv-morpheus/Morpheus/blob/main/docs/source/developer_guide/contributing.md). - When the PR is ready for review, new or existing tests cover these changes. - When the PR is ready for review, the documentation is up to date with these changes. Authors: - https://github.com/HesAnEasyCoder Approvers: - Devin Robison (https://github.com/drobison00) - Michael Demoret (https://github.com/mdemoret-nv) URL: https://github.com/nv-morpheus/Morpheus/pull/1681 --- models/model-cards/gnn-fsi-model-card.md | 27 ++++++------------------ 1 file changed, 7 insertions(+), 20 deletions(-) diff --git a/models/model-cards/gnn-fsi-model-card.md b/models/model-cards/gnn-fsi-model-card.md index ae76cd8edd..92dede0b0e 100644 --- a/models/model-cards/gnn-fsi-model-card.md +++ b/models/model-cards/gnn-fsi-model-card.md @@ -115,23 +115,19 @@ NVIDIA believes Trustworthy AI is a shared responsibility and we have establishe ### Fill in the blank for the model technique. * This model is designed for developers seeking to test the GNN fraud detection pipeline with a small pretrained model on a synthetic dataset. -### Name who is intended to benefit from this model. +### Intended Users. * The intended beneficiaries of this model are developers who aim to test the performance and functionality of the GNN fraud detection pipeline using synthetic datasets. It may not be suitable or provide significant value for real-world transactions. ### Describe the model output. * This model outputs fraud probability score b/n (0 & 1). -### List the steps explaining how this model works. (e.g., ) -* The model uses a bipartite heterogeneous graph representation as input for `GraphSAGE` for feature learning and `XGBoost` as a classifier. Since the input graph is heterogeneous, a heterogeneous implementation of `GraphSAGE` (HinSAGE) is used for feature embedding.
- -### Name the adversely impacted groups (protected classes) this has been tested to deliver comparable outcomes regardless of: -* Not Applicable +### Describe how this model works. +* The model uses a bipartite heterogeneous graph representation as input for `GraphSAGE` for feature learning and `XGBoost` as a classifier. Since the input graph is heterogeneous, a heterogeneous implementation of `GraphSAGE` (HinSAGE) is used for feature embedding. ### List the technical limitations of the model. * This model version requires a transactional data schema with entities (user, merchant, transaction) as requirement for the model. ### Has this been verified to have met prescribed NVIDIA standards? - * Yes ### What performance metrics were used to affirm the model's performance? @@ -148,11 +144,8 @@ NVIDIA believes Trustworthy AI is a shared responsibility and we have establishe ### Link the location of the training dataset's repository (if able to share). * [training dataset](models/datasets/training-data/fraud-detection-training-data.csv) -### Is the model used in an application with physical safety impact? -* No - -### Describe life-critical impact (if present). -* Not Applicable +### Describe the life critical impact (if present). +* None ### Was model and dataset assessed for vulnerability for potential form of attack? * No @@ -163,9 +156,6 @@ NVIDIA believes Trustworthy AI is a shared responsibility and we have establishe ### Name use case restrictions for the model. * The model's use case is restricted to testing the Morpheus pipeline and may not be suitable for other applications. -### Name target quality Key Performance Indicators (KPIs) for which this has been tested. -* Not Applicable - ### Is the model and dataset compliant with National Classification Management Society (NCMS)? * Not Applicable @@ -192,16 +182,13 @@ NVIDIA believes Trustworthy AI is a shared responsibility and we have establishe ### How often is dataset reviewed? * The dataset is initially reviewed upon addition, and subsequent reviews are conducted as needed or upon request for any changes. -### Is a mechanism in place to honor data +### Is a mechanism in place to honor data subject right of access or deletion of personal data? * Yes ### If PII collected for the development of this AI model, was it minimized to only what was required? * Not applicable -### Is data in dataset traceable? -* No - -### Are we able to identify and trace source of dataset? +### Is there data provenance? * Yes ### Does data labeling (annotation, metadata) comply with privacy laws? From 26eb9f866926f54825c89e699086516873dbcc3d Mon Sep 17 00:00:00 2001 From: HesAnEasyCoder <105108698+HesAnEasyCoder@users.noreply.github.com> Date: Wed, 8 May 2024 12:07:18 -0700 Subject: [PATCH 30/38] Update abp-model-card.md (#1683) Closes ## By Submitting this PR I confirm: - I am familiar with the [Contributing Guidelines](https://github.com/nv-morpheus/Morpheus/blob/main/docs/source/developer_guide/contributing.md). - When the PR is ready for review, new or existing tests cover these changes. - When the PR is ready for review, the documentation is up to date with these changes. 
Authors: - https://github.com/HesAnEasyCoder Approvers: - Devin Robison (https://github.com/drobison00) - Michael Demoret (https://github.com/mdemoret-nv) URL: https://github.com/nv-morpheus/Morpheus/pull/1683 --- models/model-cards/abp-model-card.md | 79 +++------------------------- 1 file changed, 6 insertions(+), 73 deletions(-) diff --git a/models/model-cards/abp-model-card.md b/models/model-cards/abp-model-card.md index 3f9043db86..efc48eed7c 100644 --- a/models/model-cards/abp-model-card.md +++ b/models/model-cards/abp-model-card.md @@ -21,31 +21,24 @@ limitations under the License. # Model Overview ## Description: - * This model is an example of a binary XGBoost classifier to differentiate between anomalous GPU behavior, such as crypto mining / GPU malware, and non-anomalous GPU-based workflows (e.g., ML/DL training). This model is for demonstration purposes and not for production usage.
## References(s): - * Chen, Guestrin (2016) XGBoost. A scalable tree boosting system. https://arxiv.org/abs/1603.02754
## Model Architecture: - **Architecture Type:** - * Gradient boosting
**Network Architecture:** - * XGBOOST
## Input: (Enter "None" As Needed) **Input Format:** - * nvidia-smi output
**Input Parameters:** - * GPU statistics that are included in the nvidia-smi output
**Other Properties Related to Output:** N/A
@@ -53,33 +46,26 @@ limitations under the License. ## Output: (Enter "None" As Needed) **Output Format:** - * Binary Results
**Output Parameters:** - * N/A
**Other Properties Related to Output:** - * N/A
## Software Integration: **Runtime(s):** - * Morpheus
**Supported Hardware Platform(s):**
- * Ampere/Turing
**Supported Operating System(s):**
- * Linux
## Model Version(s): - * v1
# Training & Evaluation: @@ -87,31 +73,25 @@ limitations under the License. ## Training Dataset: **Link:** - * https://github.com/nv-morpheus/Morpheus/blob/branch-24.06/models/datasets/training-data/abp-sample-nvsmi-training-data.json
**Properties (Quantity, Dataset Descriptions, Sensor(s)):** - * Sample dataset consists of over 1000 nvidia-smi outputs
## Evaluation Dataset: **Link:** - * https://github.com/nv-morpheus/Morpheus/blob/branch-24.06/models/datasets/validation-data/abp-validation-data.jsonlines
**Properties (Quantity, Dataset Descriptions, Sensor(s)):** - * Sample dataset consists of over 1000 nvidia-smi outputs
## Inference: **Engine:** - * Triton
**Test Hardware:**
- * DGX (V100)
## Ethical Considerations: @@ -122,146 +102,99 @@ NVIDIA believes Trustworthy AI is a shared responsibility and we have establishe ## Model Card ++ Bias Subcard ### Individuals from the following adversely impacted (protected classes) groups participate in model design and testing. - * Not Applicable ### Describe measures taken to mitigate against unwanted bias. - * Not Applicable ## Model Card ++ Explainability Subcard ### Name example applications and use cases for this model. - * The model is primarily designed for testing purposes and serves as a small model specifically used to evaluate and validate the ABP pipeline. Its application is focused on assessing the effectiveness of the pipeline rather than being intended for broader use cases or specific applications beyond testing. -### Fill in the blank for the model technique. - +### Intended Users. * The model is primarily designed for testing purposes. This model is intended to be an example for developers that want to test Morpheus ABP pipeline. ### Name who is intended to benefit from this model. - * The intended beneficiaries of this model are developers who aim to test the functionality of the ABP models for detecting crypto mining. ### Describe the model output. - * This model output can be used as a binary result, Crypto mining or legitimate GPU usage. -### List the steps explaining how this model works. - +### Describe how this model works. * nvidia-smi features are used as the input and the model predicts a label for each row -### Name the adversely impacted groups (protected classes) this has been tested to deliver comparable outcomes regardless of: - -* Not Applicable - ### List the technical limitations of the model. - * For different GPU workloads different models need to be trained. -### Has this been verified to have met prescribed NVIDIA standards? - +### Has this been verified to have met prescribed NVIDIA quality standards? * Yes ### What performance metrics were used to affirm the model's performance? - * Accuracy ### What are the potential known risks to users and stakeholders? - * N/A ### Link the relevant end user license agreement - * [Apache 2.0](http://www.apache.org/licenses/LICENSE-2.0) ## Model Card ++ Saftey & Security Subcard ### Link the location of the training dataset's repository. - * https://github.com/nv-morpheus/Morpheus/blob/branch-24.06/models/datasets/training-data/abp-sample-nvsmi-training-data.json -### Is the model used in an application with physical safety impact? - -* No - -### Describe life-critical impact (if present). - -* N/A +### Describe the life critical impact (if present). +* None ### Was model and dataset assessed for vulnerability for potential form of attack? - * No ### Name applications for the model. - * The primary application for this model is testing the Morpheus pipeline. ### Name use case restrictions for the model. - * The model's use case is restricted to testing the Morpheus pipeline and may not be suitable for other applications. -### Name target quality Key Performance Indicators (KPIs) for which this has been tested. - -* N/A - ### Is the model and dataset compliant with National Classification Management Society (NCMS)? - * No ### Are there explicit model and dataset restrictions? - * No ### Are there access restrictions to systems, model, and data? - * No ### Is there a digital signature? - * No ## Model Card ++ Privacy Subcard ### Generatable or reverse engineerable personally-identifiable information (PII)? - * None ### Was consent obtained for any PII used? 
-
 * N/A
 
 ### Protected classes used to create this model? (The following were used in model the model's training:)
-
 * N/A
-
 
 ### How often is dataset reviewed?
-
 * The dataset is initially reviewed upon addition, and subsequent reviews are conducted as needed or upon request for any changes.
 
 ### Is a mechanism in place to honor data subject right of access or deletion of personal data?
-
 * N/A
 
 ### If PII collected for the development of this AI model, was it minimized to only what was required?
-
-* N/A
-
-### Is data in dataset traceable?
-
 * N/A
 
-### Are we able to identify and trace source of dataset?
-
+### Is there data provenance?
 * Yes
 
 ### Does data labeling (annotation, metadata) comply with privacy laws?
-
 * N/A
 
 ### Is data compliant with data subject requests for data correction or removal, if such a request was made?
-
 * N/A

From 78dab99ddceb8eab500a9abc0cd3ac605b94c4f7 Mon Sep 17 00:00:00 2001
From: David Gardner <96306125+dagardner-nv@users.noreply.github.com>
Date: Wed, 8 May 2024 13:22:53 -0700
Subject: [PATCH 31/38] Enable C++ mode for `abp_pcap_detection` example (#1687)

* Update the `pcap-preprocess` stage to ensure tensors are in row major as required by Triton.
* Update the `pcap-preprocess` stage to cast float64 data to the model's expected input of float32, removing the need to specify `force_convert_inputs`, since this stage is specific to this pipeline & model.

Closes #1675

## By Submitting this PR I confirm:
- I am familiar with the [Contributing Guidelines](https://github.com/nv-morpheus/Morpheus/blob/main/docs/source/developer_guide/contributing.md).
- When the PR is ready for review, new or existing tests cover these changes.
- When the PR is ready for review, the documentation is up to date with these changes.

Authors:
  - David Gardner (https://github.com/dagardner-nv)

Approvers:
  - Michael Demoret (https://github.com/mdemoret-nv)

URL: https://github.com/nv-morpheus/Morpheus/pull/1687
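As a quick illustration of the cast described above, a minimal sketch with CuPy and cuDF; the `merged_df` dataframe and `fea_cols` column list here are hypothetical stand-ins for the stage's internal names:

```python
import cudf
import cupy as cp

# Hypothetical stand-ins for the stage's merged dataframe and feature-column list
merged_df = cudf.DataFrame({"f0": [0.1, 0.2], "f1": [1.0, 2.0]})
fea_cols = ["f0", "f1"]

# cuDF hands back float64 data by default; Triton expects float32 tensors
# in row-major (C) order, so both are requested explicitly in the cast.
data = cp.asarray(merged_df[fea_cols].to_cupy(), order='C', dtype=cp.float32)

assert data.dtype == cp.float32
assert data.flags.c_contiguous  # row-major, as Triton requires
```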
---
 examples/abp_pcap_detection/README.md       |  4 ++--
 .../abp_pcap_preprocessing.py               |  3 ++-
 examples/abp_pcap_detection/run.py          | 17 +++--------------
 .../test_abp_pcap_preprocessing.py          | 12 ++++++++----
 4 files changed, 15 insertions(+), 21 deletions(-)

diff --git a/examples/abp_pcap_detection/README.md b/examples/abp_pcap_detection/README.md
index 3cfae25aa9..440c3fb783 100644
--- a/examples/abp_pcap_detection/README.md
+++ b/examples/abp_pcap_detection/README.md
@@ -99,13 +99,13 @@ Alternately, the Morpheus command line could have been used to accomplish the sa
 From the root of the Morpheus repo, run:
 ```bash
 morpheus --log_level INFO --plugin "examples/abp_pcap_detection/abp_pcap_preprocessing.py" \
-    run --use_cpp False --pipeline_batch_size 100000 --model_max_batch_size 100000 \
+    run --pipeline_batch_size 100000 --model_max_batch_size 100000 \
     pipeline-fil --model_fea_length 13 --label=probs \
     from-file --filename examples/data/abp_pcap_dump.jsonlines --filter_null False \
     deserialize \
     pcap-preprocess \
     monitor --description "Preprocessing rate" \
-    inf-triton --model_name "abp-pcap-xgb" --server_url "localhost:8001" --force_convert_inputs=True \
+    inf-triton --model_name "abp-pcap-xgb" --server_url "localhost:8000" \
     monitor --description "Inference rate" --unit inf \
     add-class --label=probs \
     monitor --description "Add classification rate" --unit "add-class" \
diff --git a/examples/abp_pcap_detection/abp_pcap_preprocessing.py b/examples/abp_pcap_detection/abp_pcap_preprocessing.py
index 453dc2a419..59a8060854 100644
--- a/examples/abp_pcap_detection/abp_pcap_preprocessing.py
+++ b/examples/abp_pcap_detection/abp_pcap_preprocessing.py
@@ -170,7 +170,8 @@ def round_time_kernel(timestamp, rollup_time, secs):
         del df, grouped_df
 
         # Convert the dataframe to cupy the same way cuml does
-        data = cp.asarray(merged_df[fea_cols].to_cupy())
+        # Explicity casting to float32 to match the model's input, and setting row-major as required by Triton
+        data = cp.asarray(merged_df[fea_cols].to_cupy(), order='C', dtype=cp.float32)
         count = data.shape[0]
 
         for col in req_cols:
diff --git a/examples/abp_pcap_detection/run.py b/examples/abp_pcap_detection/run.py
index 18d5c25e5d..8937351d16 100644
--- a/examples/abp_pcap_detection/run.py
+++ b/examples/abp_pcap_detection/run.py
@@ -21,7 +21,6 @@
 from morpheus.cli.commands import FILE_TYPE_NAMES
 from morpheus.cli.utils import str_to_file_type
 from morpheus.config import Config
-from morpheus.config import CppConfig
 from morpheus.config import PipelineModes
 from morpheus.pipeline.linear_pipeline import LinearPipeline
 from morpheus.stages.general.monitor_stage import MonitorStage
@@ -87,7 +86,7 @@
     help=("Iterative mode will emit dataframes one at a time. Otherwise a list of dataframes is emitted. "
           "Iterative mode is good for interleaving source stages."),
 )
-@click.option("--server_url", required=True, help="Tritonserver url.", default="localhost:8001")
+@click.option("--server_url", required=True, help="Tritonserver url.", default="localhost:8000")
 @click.option(
     "--file_type",
     type=click.Choice(FILE_TYPE_NAMES, case_sensitive=False),
@@ -111,8 +110,6 @@ def run_pipeline(
     # Enable the default logger.
     configure_logging(log_level=logging.INFO)
 
-    CppConfig.set_should_use_cpp(False)
-
     # Its necessary to get the global config object and configure it for FIL mode.
     config = Config()
     config.mode = PipelineModes.FIL
@@ -124,8 +121,6 @@ def run_pipeline(
     config.feature_length = model_fea_length
     config.class_labels = ["probs"]
 
-    kwargs = {}
-
     # Create a linear pipeline object.
     pipeline = LinearPipeline(config)
@@ -154,13 +149,7 @@ def run_pipeline(
     # Add a inference stage.
     # This stage sends inference requests to the Tritonserver and captures the response.
-    pipeline.add_stage(
-        TritonInferenceStage(
-            config,
-            model_name=model_name,
-            server_url=server_url,
-            force_convert_inputs=True,
-        ))
+    pipeline.add_stage(TritonInferenceStage(config, model_name=model_name, server_url=server_url))
 
     # Add a monitor stage.
     # This stage logs the metrics (inf/sec) from the above stage.
@@ -176,7 +165,7 @@ def run_pipeline(
     # Add a serialize stage.
     # This stage includes & excludes columns from messages.
-    pipeline.add_stage(SerializeStage(config, **kwargs))
+    pipeline.add_stage(SerializeStage(config))
 
     # Add a monitor stage.
     # This stage logs the metrics (msg/sec) from the above stage.
diff --git a/tests/examples/abp_pcap_detection/test_abp_pcap_preprocessing.py b/tests/examples/abp_pcap_detection/test_abp_pcap_preprocessing.py
index 97443d65d6..90a3c067f4 100755
--- a/tests/examples/abp_pcap_detection/test_abp_pcap_preprocessing.py
+++ b/tests/examples/abp_pcap_detection/test_abp_pcap_preprocessing.py
@@ -62,6 +62,8 @@ def check_inf_message(msg: MultiInferenceFILMessage,
     input__0 = msg.memory.get_tensor('input__0')
     assert input__0.shape == (expected_count, expected_feature_length)
+    assert input__0.dtype == cp.float32
+    assert input__0.strides == (expected_feature_length * 4, 4)
     assert (input__0 == expected_input__0).all()
 
     seq_ids = msg.memory.get_tensor('seq_ids')
@@ -87,10 +89,12 @@ def test_abp_pcap_preprocessing(config: Config, dataset_cudf: DatasetManager,
     input_df = dataset_cudf.get_df(input_file, no_cache=True, filter_nulls=False)
 
     expected_flow_ids = input_df.src_ip + ":" + input_df.src_port + "=" + input_df.dest_ip + ":" + input_df.dest_port
-    expected_input__0 = cp.asarray(
-        np.loadtxt(os.path.join(TEST_DIRS.tests_data_dir, 'examples/abp_pcap_detection/abp_pcap_expected_input_0.csv'),
-                   delimiter=",",
-                   skiprows=0))
+    expected_input__0 = cp.asarray(np.loadtxt(os.path.join(TEST_DIRS.tests_data_dir,
+                                                           'examples/abp_pcap_detection/abp_pcap_expected_input_0.csv'),
+                                              delimiter=",",
+                                              skiprows=0,
+                                              dtype=np.float32),
+                                   order='C')
 
     assert len(input_df) == 20

From bf80d93958e78c7e0312cd75ad71d1b75d0befce Mon Sep 17 00:00:00 2001
From: David Gardner <96306125+dagardner-nv@users.noreply.github.com>
Date: Wed, 8 May 2024 23:12:48 -0700
Subject: [PATCH 32/38] Support the filter_null parameter in the C++ impl of the FileSourceStage (#1689)

* Fixes bug where the `filter_null` constructor argument to the `FileSourceStage` was only implemented in the Python impl of the stage.
* Update the `filter_null` feature to make the column(s) being filtered upon configurable; previously this was hard-coded to `"data"`.
* Add new `get_column_names` helper method to `CuDFTableUtil`.

Closes #1678

## By Submitting this PR I confirm:
- I am familiar with the [Contributing Guidelines](https://github.com/nv-morpheus/Morpheus/blob/main/docs/source/developer_guide/contributing.md).
- When the PR is ready for review, new or existing tests cover these changes.
- When the PR is ready for review, the documentation is up to date with these changes.
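For illustration, a minimal sketch of driving the updated stage from Python; the file name `input.jsonlines` and the column name `other` are assumed here, not taken from the diff:

```python
from morpheus.config import Config
from morpheus.pipeline import LinearPipeline
from morpheus.stages.input.file_source_stage import FileSourceStage

config = Config()

pipe = LinearPipeline(config)
# Drop any row whose "other" column is null before it enters the pipeline;
# with this change the same filtering is applied by the C++ impl as well.
pipe.set_source(
    FileSourceStage(config, filename="input.jsonlines", filter_null=True, filter_null_columns=["other"]))
pipe.run()
```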
Authors:
  - David Gardner (https://github.com/dagardner-nv)

Approvers:
  - Michael Demoret (https://github.com/mdemoret-nv)

URL: https://github.com/nv-morpheus/Morpheus/pull/1689
---
 .../include/morpheus/io/deserializers.hpp     |  2 +-
 .../include/morpheus/stages/file_source.hpp   | 35 +++++---
 .../include/morpheus/utilities/table_util.hpp | 28 ++++++-
 morpheus/_lib/src/io/deserializers.cpp        | 16 ++--
 morpheus/_lib/src/stages/file_source.cpp      | 41 +++++++--
 morpheus/_lib/src/utilities/table_util.cpp    | 45 +++++++++-
 morpheus/_lib/stages/__init__.pyi             |  4 +-
 morpheus/_lib/stages/module.cpp               | 24 ++++--
 morpheus/_lib/tests/CMakeLists.txt            |  6 ++
 .../_lib/tests/utilities/test_table_util.cpp  | 79 +++++++++++++++++
 morpheus/io/deserializers.py                  | 84 +++++++++++--------
 morpheus/io/utils.py                          |  6 +-
 morpheus/stages/input/file_source_stage.py    | 17 +++-
 tests/stages/test_file_source_stage_pipe.py   | 69 +++++++++++++++
 tests/tests_data/file_with_nans.csv           |  3 +
 tests/tests_data/file_with_nans.jsonlines     |  3 +
 tests/tests_data/file_with_nulls.csv          |  3 +
 tests/tests_data/file_with_nulls.jsonlines    |  3 +
 18 files changed, 390 insertions(+), 78 deletions(-)
 create mode 100644 morpheus/_lib/tests/utilities/test_table_util.cpp
 create mode 100755 tests/stages/test_file_source_stage_pipe.py
 create mode 100644 tests/tests_data/file_with_nans.csv
 create mode 100644 tests/tests_data/file_with_nans.jsonlines
 create mode 100644 tests/tests_data/file_with_nulls.csv
 create mode 100644 tests/tests_data/file_with_nulls.jsonlines

diff --git a/morpheus/_lib/include/morpheus/io/deserializers.hpp b/morpheus/_lib/include/morpheus/io/deserializers.hpp
index d98cad6e9a..569d503eb9 100644
--- a/morpheus/_lib/include/morpheus/io/deserializers.hpp
+++ b/morpheus/_lib/include/morpheus/io/deserializers.hpp
@@ -71,7 +71,7 @@ int get_index_col_count(const cudf::io::table_with_metadata& data_table);
 int prepare_df_index(cudf::io::table_with_metadata& data_table);
 
 /**
- * @brief Loads a cudf table from either CSV or JSON file returning the DataFrame as a Python object
+ * @brief Loads a cudf table from either CSV, JSON or Parquet file returning the DataFrame as a Python object
  *
  * @param filename : Name of the file that should be loaded into a table
  * @return pybind11::object
diff --git a/morpheus/_lib/include/morpheus/stages/file_source.hpp b/morpheus/_lib/include/morpheus/stages/file_source.hpp
index 6ed1ea4852..95ec2ebd64 100644
--- a/morpheus/_lib/include/morpheus/stages/file_source.hpp
+++ b/morpheus/_lib/include/morpheus/stages/file_source.hpp
@@ -31,6 +31,7 @@
 #include
 #include
 #include
+#include <vector>
 
 namespace morpheus {
 /****** Component public implementations *******************/
@@ -61,13 +62,19 @@ class FileSourceStage : public mrc::pymrc::PythonSource<std::shared_ptr<MessageMeta>>
      */
-    FileSourceStage(std::string filename, int repeat = 1, std::optional<bool> json_lines = std::nullopt);
+    FileSourceStage(std::string filename,
+                    int repeat = 1,
+                    bool filter_null = true,
+                    std::vector<std::string> filter_null_columns = {},
+                    std::optional<bool> json_lines = std::nullopt);
 
   private:
     subscriber_fn_t build();
 
     std::string m_filename;
     int m_repeat{1};
+    bool m_filter_null{true};
+    std::vector<std::string> m_filter_null_columns;
     std::optional<bool> m_json_lines;
 };
 
@@ -87,16 +94,22 @@ struct FileSourceStageInterfaceProxy
      * @param parser_kwargs : Optional arguments to pass to the file parser.
      * @return std::shared_ptr<mrc::segment::Object<FileSourceStage>>
      */
-    static std::shared_ptr<mrc::segment::Object<FileSourceStage>> init(mrc::segment::Builder& builder,
-                                                                       const std::string& name,
-                                                                       std::string filename,
-                                                                       int repeat = 1,
-                                                                       pybind11::dict parser_kwargs = pybind11::dict());
-    static std::shared_ptr<mrc::segment::Object<FileSourceStage>> init(mrc::segment::Builder& builder,
-                                                                       const std::string& name,
-                                                                       std::filesystem::path filename,
-                                                                       int repeat = 1,
-                                                                       pybind11::dict parser_kwargs = pybind11::dict());
+    static std::shared_ptr<mrc::segment::Object<FileSourceStage>> init(
+        mrc::segment::Builder& builder,
+        const std::string& name,
+        std::string filename,
+        int repeat = 1,
+        bool filter_null = true,
+        std::vector<std::string> filter_null_columns = {},
+        pybind11::dict parser_kwargs = pybind11::dict());
+    static std::shared_ptr<mrc::segment::Object<FileSourceStage>> init(
+        mrc::segment::Builder& builder,
+        const std::string& name,
+        std::filesystem::path filename,
+        int repeat = 1,
+        bool filter_null = true,
+        std::vector<std::string> filter_null_columns = {},
+        pybind11::dict parser_kwargs = pybind11::dict());
 };
 #pragma GCC visibility pop
 /** @} */  // end of group

diff --git a/morpheus/_lib/include/morpheus/utilities/table_util.hpp b/morpheus/_lib/include/morpheus/utilities/table_util.hpp
index b8797901ea..9cef0ee87b 100644
--- a/morpheus/_lib/include/morpheus/utilities/table_util.hpp
+++ b/morpheus/_lib/include/morpheus/utilities/table_util.hpp
@@ -15,10 +15,13 @@
  * limitations under the License.
  */
 
+#include "morpheus/export.h"  // for MORPHEUS_EXPORT
+
 #include
 #include  // IWYU pragma: keep
 
 #include
+#include <vector>
 
 #pragma once
 
@@ -35,12 +38,33 @@ namespace morpheus {
 /**
  * @brief Structure that encapsulates cuDF table utilities.
  */
-struct CuDFTableUtil
+struct MORPHEUS_EXPORT CuDFTableUtil
 {
     /**
-     * TODO(Documentation)
+     * @brief Load a table from a file.
+     *
+     * @param filename The name of the file to load.
+     * @return cudf::io::table_with_metadata The table loaded from the file.
      */
     static cudf::io::table_with_metadata load_table(const std::string& filename);
+
+    /**
+     * @brief Get the column names from a cudf table_with_metadata.
+     *
+     * @param table The table to get the column names from.
+     * @return std::vector<std::string> The column names.
+     */
+    static std::vector<std::string> get_column_names(const cudf::io::table_with_metadata& table);
+
+    /**
+     * @brief Filters rows from a table that contain null values in a given columns.
+     * null values in columns other than those specified in `filter_columns` are not considered.
+     * Any missing columns are ignored.
+     *
+     * @param table The table to filter
+     * @param filter_columns The name of the columns to filter on
+     */
+    static void filter_null_data(cudf::io::table_with_metadata& table, const std::vector<std::string>& filter_columns);
 };
 /** @} */  // end of group
 }  // namespace morpheus
diff --git a/morpheus/_lib/src/io/deserializers.cpp b/morpheus/_lib/src/io/deserializers.cpp
index 4704b1ba3d..032cffd57b 100644
--- a/morpheus/_lib/src/io/deserializers.cpp
+++ b/morpheus/_lib/src/io/deserializers.cpp
@@ -20,6 +20,7 @@
 #include "morpheus/utilities/cudf_util.hpp"  // for CudfHelper
 #include "morpheus/utilities/stage_util.hpp"
 #include "morpheus/utilities/string_util.hpp"
+#include "morpheus/utilities/table_util.hpp"  // for get_column_names
 
 #include
 #include
@@ -29,8 +30,6 @@
 #include  // for cudf::type_id
 #include  // IWYU pragma: keep
 
-#include
-#include
 #include
 #include
 #include
@@ -50,7 +49,9 @@ namespace morpheus {
 
 std::vector<std::string> get_column_names_from_table(const cudf::io::table_with_metadata& table)
 {
-    return foreach_map(table.metadata.schema_info, [](auto schema) { return schema.name; });
+    return foreach_map(table.metadata.schema_info, [](auto schema) {
+        return schema.name;
+    });
 }
 
 cudf::io::table_with_metadata load_table_from_file(const std::string& filename,
@@ -69,7 +70,7 @@ cudf::io::table_with_metadata load_table_from_file(const std::string& filename,
     case FileTypes::JSON: {
         auto options =
             cudf::io::json_reader_options::builder(cudf::io::source_info{filename}).lines(json_lines.value_or(true));
-        table = cudf::io::read_json(options.build());
+        table        = cudf::io::read_json(options.build());
         break;
     }
     case FileTypes::CSV: {
@@ -106,12 +107,9 @@ pybind11::object read_file_to_df(const std::string& filename, FileTypes file_typ
 
 int get_index_col_count(const cudf::io::table_with_metadata& data_table)
 {
-    int index_col_count = 0;
-    auto const& schema  = data_table.metadata.schema_info;
+    int index_col_count = 0;
 
-    std::vector<std::string> names;
-    names.reserve(schema.size());
-    std::transform(schema.cbegin(), schema.cend(), std::back_inserter(names), [](auto const& c) { return c.name; });
+    std::vector<std::string> names = CuDFTableUtil::get_column_names(data_table);
 
     // Check if we have a first column with INT64 data type
     if (names.size() >= 1 && data_table.tbl->get_column(0).type().id() == cudf::type_id::INT64)
diff --git a/morpheus/_lib/src/stages/file_source.cpp b/morpheus/_lib/src/stages/file_source.cpp
index 84a59f5f12..c3dce33693 100644
--- a/morpheus/_lib/src/stages/file_source.cpp
+++ b/morpheus/_lib/src/stages/file_source.cpp
@@ -24,6 +24,7 @@
 #include "morpheus/objects/file_types.hpp"
 #include "morpheus/objects/table_info.hpp"
 #include "morpheus/utilities/cudf_util.hpp"
+#include "morpheus/utilities/table_util.hpp"  // for filter_null_data
 
 #include
 #include
@@ -37,24 +38,39 @@
 #include
 #include
 #include
+#include <stdexcept>  // for invalid_argument
 #include
-// IWYU thinks we need __alloc_traits<>::value_type for vector assignments
-// IWYU pragma: no_include
 
 namespace morpheus {
 // Component public implementations
 // ************ FileSourceStage ************* //
-FileSourceStage::FileSourceStage(std::string filename, int repeat, std::optional<bool> json_lines) :
+FileSourceStage::FileSourceStage(std::string filename,
+                                 int repeat,
+                                 bool filter_null,
+                                 std::vector<std::string> filter_null_columns,
+                                 std::optional<bool> json_lines) :
   PythonSource(build()),
   m_filename(std::move(filename)),
   m_repeat(repeat),
+  m_filter_null(filter_null),
+  m_filter_null_columns(std::move(filter_null_columns)),
  m_json_lines(json_lines)
-{}
+{
+    if (m_filter_null && m_filter_null_columns.empty())
+    {
+        throw std::invalid_argument("Filter null columns must not be empty if filter_null is true");
+    }
+}
 
 FileSourceStage::subscriber_fn_t FileSourceStage::build()
 {
     return [this](rxcpp::subscriber<source_type_t> output) {
-        auto data_table = load_table_from_file(m_filename, FileTypes::Auto, m_json_lines);
+        auto data_table     = load_table_from_file(m_filename, FileTypes::Auto, m_json_lines);
+        if (m_filter_null)
+        {
+            CuDFTableUtil::filter_null_data(data_table, m_filter_null_columns);
+        }
+
         int index_col_count = prepare_df_index(data_table);
 
         // Next, create the message metadata. This gets reused for repeats
@@ -116,6 +132,8 @@ std::shared_ptr<mrc::segment::Object<FileSourceStage>> FileSourceStageInterfaceP
     const std::string& name,
     std::string filename,
     int repeat,
+    bool filter_null,
+    std::vector<std::string> filter_null_columns,
     pybind11::dict parser_kwargs)
 {
     std::optional<bool> json_lines = std::nullopt;
@@ -125,7 +143,8 @@ std::shared_ptr<mrc::segment::Object<FileSourceStage>> FileSourceStageInterfaceP
         json_lines = parser_kwargs["lines"].cast<bool>();
     }
 
-    auto stage = builder.construct_object<FileSourceStage>(name, filename, repeat, json_lines);
+    auto stage = builder.construct_object<FileSourceStage>(
+        name, filename, repeat, filter_null, std::move(filter_null_columns), json_lines);
 
     return stage;
 }
@@ -135,8 +154,16 @@ std::shared_ptr<mrc::segment::Object<FileSourceStage>> FileSourceStageInterfaceP
     const std::string& name,
     std::filesystem::path filename,
     int repeat,
+    bool filter_null,
+    std::vector<std::string> filter_null_columns,
     pybind11::dict parser_kwargs)
 {
-    return init(builder, name, filename.string(), repeat, std::move(parser_kwargs));
+    return init(builder,
+                name,
+                filename.string(),
+                repeat,
+                filter_null,
+                std::move(filter_null_columns),
+                std::move(parser_kwargs));
 }
 }  // namespace morpheus
diff --git a/morpheus/_lib/src/utilities/table_util.cpp b/morpheus/_lib/src/utilities/table_util.cpp
index 1c93493d92..d6aa159b6d 100644
--- a/morpheus/_lib/src/utilities/table_util.cpp
+++ b/morpheus/_lib/src/utilities/table_util.cpp
@@ -19,17 +19,24 @@
 
 #include
 #include
+#include <cudf/stream_compaction.hpp>  // for drop_nulls
+#include <cudf/types.hpp>              // for size_type
 #include
 #include
+#include <algorithm>  // for find, transform
 #include
+#include <iterator>  // for back_insert_iterator, back_inserter
+#include <memory>    // for unique_ptr
 #include  // needed for logging
 #include  // for runtime_error
 
+namespace {
 namespace fs = std::filesystem;
 namespace py = pybind11;
-
-cudf::io::table_with_metadata morpheus::CuDFTableUtil::load_table(const std::string& filename)
+}  // namespace
+namespace morpheus {
+cudf::io::table_with_metadata CuDFTableUtil::load_table(const std::string& filename)
 {
     auto file_path = fs::path(filename);
 
@@ -52,3 +59,37 @@ cudf::io::table_with_metadata morpheus::CuDFTableUtil::load_table(const std::str
         throw std::runtime_error("Unknown extension");
     }
 }
+
+std::vector<std::string> CuDFTableUtil::get_column_names(const cudf::io::table_with_metadata& table)
+{
+    auto const& schema = table.metadata.schema_info;
+
+    std::vector<std::string> names;
+    names.reserve(schema.size());
+    std::transform(schema.cbegin(), schema.cend(), std::back_inserter(names), [](auto const& c) {
+        return c.name;
+    });
+
+    return names;
+}
+
+void CuDFTableUtil::filter_null_data(cudf::io::table_with_metadata& table,
+                                     const std::vector<std::string>& filter_columns)
+{
+    std::vector<cudf::size_type> filter_keys;
+    auto column_names = get_column_names(table);
+    for (const auto& column_name : filter_columns)
+    {
+        auto found_col = std::find(column_names.cbegin(), column_names.cend(), column_name);
+        if (found_col != column_names.cend())
+        {
+            filter_keys.push_back((found_col - column_names.cbegin()));
+        }
+    }
+
+    auto tv             = table.tbl->view();
+    auto filtered_table = cudf::drop_nulls(tv, filter_keys, filter_keys.size());
+
+    table.tbl.swap(filtered_table);
+}
+}  // namespace morpheus
diff --git a/morpheus/_lib/stages/__init__.pyi b/morpheus/_lib/stages/__init__.pyi
index 78a0ff8091..bfd66dcb64 100644
--- a/morpheus/_lib/stages/__init__.pyi
+++ b/morpheus/_lib/stages/__init__.pyi
@@ -60,9 +60,9 @@ class DeserializeMultiMessageStage(mrc.core.segment.SegmentObject):
     pass
 class FileSourceStage(mrc.core.segment.SegmentObject):
     @typing.overload
-    def __init__(self, builder: mrc.core.segment.Builder, name: str, filename: os.PathLike, repeat: int, parser_kwargs: dict) -> None: ...
+    def __init__(self, builder: mrc.core.segment.Builder, name: str, filename: os.PathLike, repeat: int, filter_null: bool, filter_null_columns: typing.List[str], parser_kwargs: dict) -> None: ...
     @typing.overload
-    def __init__(self, builder: mrc.core.segment.Builder, name: str, filename: str, repeat: int, parser_kwargs: dict) -> None: ...
+    def __init__(self, builder: mrc.core.segment.Builder, name: str, filename: str, repeat: int, filter_null: bool, filter_null_columns: typing.List[str], parser_kwargs: dict) -> None: ...
     pass
 class FilterDetectionsStage(mrc.core.segment.SegmentObject):
     def __init__(self, builder: mrc.core.segment.Builder, name: str, threshold: float, copy: bool, filter_source: morpheus._lib.common.FilterSource, field_name: str = 'probs') -> None: ...
diff --git a/morpheus/_lib/stages/module.cpp b/morpheus/_lib/stages/module.cpp
index 1cf57663ac..32c3c5e030 100644
--- a/morpheus/_lib/stages/module.cpp
+++ b/morpheus/_lib/stages/module.cpp
@@ -52,6 +52,7 @@
 #include
 #include
 #include
+#include <vector>
 
 namespace morpheus {
 namespace py = pybind11;
@@ -138,20 +139,33 @@ PYBIND11_MODULE(stages, _module)
                mrc::segment::ObjectProperties,
                std::shared_ptr<mrc::segment::ObjectProperties>>(
         _module, "FileSourceStage", py::multiple_inheritance())
-        .def(py::init(py::overload_cast<mrc::segment::Builder&, const std::string&, std::string, int, py::dict>(
-                 &FileSourceStageInterfaceProxy::init)),
+        .def(py::init(py::overload_cast<mrc::segment::Builder&,
+                                        const std::string&,
+                                        std::string,
+                                        int,
+                                        bool,
+                                        std::vector<std::string>,
+                                        py::dict>(&FileSourceStageInterfaceProxy::init)),
             py::arg("builder"),
             py::arg("name"),
             py::arg("filename"),
             py::arg("repeat"),
+            py::arg("filter_null"),
+            py::arg("filter_null_columns"),
             py::arg("parser_kwargs"))
-        .def(py::init(
-                 py::overload_cast<mrc::segment::Builder&, const std::string&, std::filesystem::path, int, py::dict>(
-                     &FileSourceStageInterfaceProxy::init)),
+        .def(py::init(py::overload_cast<mrc::segment::Builder&,
+                                        const std::string&,
+                                        std::filesystem::path,
+                                        int,
+                                        bool,
+                                        std::vector<std::string>,
+                                        py::dict>(&FileSourceStageInterfaceProxy::init)),
             py::arg("builder"),
             py::arg("name"),
             py::arg("filename"),
             py::arg("repeat"),
+            py::arg("filter_null"),
+            py::arg("filter_null_columns"),
             py::arg("parser_kwargs"));
 
     py::class_<FilterDetectionsStage,
diff --git a/morpheus/_lib/tests/CMakeLists.txt b/morpheus/_lib/tests/CMakeLists.txt
index a17a297aca..e42e7717e8 100644
--- a/morpheus/_lib/tests/CMakeLists.txt
+++ b/morpheus/_lib/tests/CMakeLists.txt
@@ -188,4 +188,10 @@ add_morpheus_test(
     test_type_util.cpp
 )
 
+add_morpheus_test(
+  NAME table_util
+  FILES
+    utilities/test_table_util.cpp
+)
+
 list(POP_BACK CMAKE_MESSAGE_CONTEXT)
diff --git a/morpheus/_lib/tests/utilities/test_table_util.cpp b/morpheus/_lib/tests/utilities/test_table_util.cpp
new file mode 100644
index 0000000000..021b8a8322
--- /dev/null
+++ b/morpheus/_lib/tests/utilities/test_table_util.cpp
@@ -0,0 +1,79 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "../test_utils/common.hpp"  // IWYU pragma: associated
+
+#include "morpheus/io/deserializers.hpp"
+#include "morpheus/utilities/table_util.hpp"  // for filter_null_data
+
+#include <cudf/io/types.hpp>  // for table_with_metadata
+#include <cudf/table/table.hpp>  // for table
+#include
+
+#include
+#include
+#include <utility>  // for pair
+#include
+// IWYU pragma: no_include
+
+using namespace morpheus;
+
+TEST_CLASS(TableUtil);
+
+TEST_F(TestTableUtil, GetColumnNames)
+{
+    auto morpheus_root = test::get_morpheus_root();
+    auto input_files   = {morpheus_root / "tests/tests_data/file_with_nulls.csv",
+                          morpheus_root / "tests/tests_data/file_with_nulls.jsonlines"};
+
+    for (const auto& input_file : input_files)
+    {
+        auto table_w_meta = load_table_from_file(input_file);
+        auto column_names = CuDFTableUtil::get_column_names(table_w_meta);
+
+        EXPECT_EQ(column_names.size(), 2);
+        EXPECT_EQ(column_names[0], "data");
+        EXPECT_EQ(column_names[1], "other");
+    }
+}
+
+TEST_F(TestTableUtil, FilterNullData)
+{
+    auto morpheus_root = test::get_morpheus_root();
+    auto input_files   = {morpheus_root / "tests/tests_data/file_with_nans.csv",
+                          morpheus_root / "tests/tests_data/file_with_nans.jsonlines",
+                          morpheus_root / "tests/tests_data/file_with_nulls.csv",
+                          morpheus_root / "tests/tests_data/file_with_nulls.jsonlines"};
+    std::vector<std::pair<std::vector<std::string>, std::size_t>> expected_row_counts{
+        {{"data"}, 8}, {{"data"}, 8}, {{"other"}, 7}, {{"other"}, 7}, {{"data", "other"}, 5}};
+
+    for (const auto& input_file : input_files)
+    {
+        for (const auto& [filter_columns, expected_row_count] : expected_row_counts)
+        {
+            auto table_w_meta = load_table_from_file(input_file);
+
+            EXPECT_EQ(table_w_meta.tbl->num_columns(), 2);
+            EXPECT_EQ(table_w_meta.tbl->num_rows(), 10);
+
+            CuDFTableUtil::filter_null_data(table_w_meta, filter_columns);
+
+            EXPECT_EQ(table_w_meta.tbl->num_columns(), 2);
+            EXPECT_EQ(table_w_meta.tbl->num_rows(), expected_row_count);
+        }
+    }
+}
diff --git a/morpheus/io/deserializers.py b/morpheus/io/deserializers.py
index 293bc2a303..31499b4359 100644
--- a/morpheus/io/deserializers.py
+++ b/morpheus/io/deserializers.py
@@ -29,40 +29,11 @@
 from morpheus.utils.type_aliases import DataFrameType
 
 
-def read_file_to_df(file_name: typing.Union[str, io.IOBase],
-                    file_type: FileTypes = FileTypes.Auto,
-                    parser_kwargs: dict = None,
-                    filter_nulls: bool = True,
-                    df_type: typing.Literal["cudf", "pandas"] = "pandas") -> DataFrameType:
-    """
-    Reads a file into a dataframe and performs any of the necessary cleanup.
-
-    Parameters
-    ----------
-    file_name : str
-        File to read.
-    file_type : `morpheus.common.FileTypes`
-        Type of file. Leave as Auto to determine from the extension.
-    parser_kwargs : dict, optional
-        Any argument to pass onto the parse, by default {}. Ignored when C++ execution is enabled and `df_type="cudf"`
-    filter_nulls : bool, optional
-        Whether to filter null rows after loading, by default True.
-    df_type : typing.Literal[, optional
-        What type of parser to use. Options are 'cudf' and 'pandas', by default "pandas".
-
-    Returns
-    -------
-    DataFrameType
-        A parsed DataFrame.
- """ - - # The C++ reader only supports cudf dataframes - if (CppConfig.get_should_use_cpp() and df_type == "cudf"): - df = read_file_to_df_cpp(file_name, file_type) - if (filter_nulls): - df = filter_null_data(df) - return df - +def _read_file_to_df_py(*, + file_name: typing.Union[str, io.IOBase], + file_type: FileTypes, + parser_kwargs: dict, + df_type: typing.Literal["cudf", "pandas"]) -> DataFrameType: if (parser_kwargs is None): parser_kwargs = {} @@ -111,7 +82,50 @@ def read_file_to_df(file_name: typing.Union[str, io.IOBase], assert df is not None + return df + + +def read_file_to_df(file_name: typing.Union[str, io.IOBase], + file_type: FileTypes = FileTypes.Auto, + parser_kwargs: dict = None, + filter_nulls: bool = True, + filter_null_columns: list[str] | str = 'data', + df_type: typing.Literal["cudf", "pandas"] = "pandas") -> DataFrameType: + """ + Reads a file into a dataframe and performs any of the necessary cleanup. + + Parameters + ---------- + file_name : str + File to read. + file_type : `morpheus.common.FileTypes` + Type of file. Leave as Auto to determine from the extension. + parser_kwargs : dict, optional + Any argument to pass onto the parse, by default {}. Ignored when C++ execution is enabled and `df_type="cudf"` + filter_nulls : bool, optional + Whether to filter null rows after loading, by default True. + filter_null_columns : list[str]|str, default = 'data' + Column or columns to filter null values from. Ignored when `filter_null` is False. + df_type : typing.Literal[, optional + What type of parser to use. Options are 'cudf' and 'pandas', by default "pandas". + + Returns + ------- + DataFrameType + A parsed DataFrame. + """ + + # The C++ reader only supports cudf dataframes + if (CppConfig.get_should_use_cpp() and df_type == "cudf"): + df = read_file_to_df_cpp(file_name, file_type) + else: + df = _read_file_to_df_py(file_name=file_name, file_type=file_type, parser_kwargs=parser_kwargs, df_type=df_type) + if (filter_nulls): - df = filter_null_data(df) + if isinstance(filter_null_columns, str): + filter_null_columns = [filter_null_columns] + + for col in filter_null_columns: + df = filter_null_data(df, column_name=col) return df diff --git a/morpheus/io/utils.py b/morpheus/io/utils.py index d8b286a8e8..9a20afb4d5 100644 --- a/morpheus/io/utils.py +++ b/morpheus/io/utils.py @@ -26,7 +26,7 @@ logger = logging.getLogger(__name__) -def filter_null_data(x: DataFrameType): +def filter_null_data(x: DataFrameType, column_name: str = "data") -> DataFrameType: """ Filters out null row in a dataframe's 'data' column if it exists. @@ -34,12 +34,14 @@ def filter_null_data(x: DataFrameType): ---------- x : DataFrameType The dataframe to fix. + column_name : str, default 'data' + The column name to filter on. """ if ("data" not in x): return x - return x[~x['data'].isna()] + return x[~x[column_name].isna()] def cudf_string_cols_exceed_max_bytes(df: cudf.DataFrame, column_max_bytes: dict[str, int]) -> bool: diff --git a/morpheus/stages/input/file_source_stage.py b/morpheus/stages/input/file_source_stage.py index eb4630fb3e..9b3551dce6 100644 --- a/morpheus/stages/input/file_source_stage.py +++ b/morpheus/stages/input/file_source_stage.py @@ -57,8 +57,11 @@ class FileSourceStage(PreallocatorMixin, SingleOutputSource): repeat : int, default = 1, min = 1 Repeats the input dataset multiple times. Useful to extend small datasets for debugging. filter_null : bool, default = True - Whether to filter rows with null 'data' column. 
Null values in the 'data' column can cause issues down - the line with processing. Setting this to True is recommended. + Whether to filter rows with null `filter_null_columns` columns. Null values in source data can cause issues + down the line with processing. Setting this to True is recommended. + filter_null_columns : list[str], default = None + Column or columns to filter null values from. Ignored when `filter_null` is False. If None, and `filter_null` + is `True`, this will default to `["data"]` parser_kwargs : dict, default = {} Extra options to pass to the file parser. """ @@ -70,6 +73,7 @@ def __init__(self, file_type: FileTypes = FileTypes.Auto, repeat: int = 1, filter_null: bool = True, + filter_null_columns: list[str] = None, parser_kwargs: dict = None): super().__init__(c) @@ -79,6 +83,12 @@ def __init__(self, self._filename = filename self._file_type = file_type self._filter_null = filter_null + + if filter_null_columns is None or len(filter_null_columns) == 0: + filter_null_columns = ["data"] + + self._filter_null_columns = filter_null_columns + self._parser_kwargs = parser_kwargs or {} self._input_count = None @@ -114,6 +124,8 @@ def _build_source(self, builder: mrc.Builder) -> mrc.SegmentObject: self.unique_name, self._filename, self._repeat_count, + self._filter_null, + self._filter_null_columns, self._parser_kwargs) else: node = builder.make_source(self.unique_name, self._generate_frames()) @@ -126,6 +138,7 @@ def _generate_frames(self) -> typing.Iterable[MessageMeta]: self._filename, self._file_type, filter_nulls=self._filter_null, + filter_null_columns=self._filter_null_columns, parser_kwargs=self._parser_kwargs, df_type="cudf", ) diff --git a/tests/stages/test_file_source_stage_pipe.py b/tests/stages/test_file_source_stage_pipe.py new file mode 100755 index 0000000000..59f9c76d63 --- /dev/null +++ b/tests/stages/test_file_source_stage_pipe.py @@ -0,0 +1,69 @@ +#!/usr/bin/env python +# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+
+import os
+import pathlib
+
+import pandas as pd
+import pytest
+
+from _utils import TEST_DIRS
+from _utils import assert_results
+from morpheus.common import FileTypes
+from morpheus.common import determine_file_type
+from morpheus.config import Config
+from morpheus.io.deserializers import read_file_to_df
+from morpheus.pipeline import LinearPipeline
+from morpheus.stages.input.file_source_stage import FileSourceStage
+from morpheus.stages.output.compare_dataframe_stage import CompareDataFrameStage
+
+
+@pytest.mark.slow
+@pytest.mark.parametrize("input_file",
+                         [
+                             os.path.join(TEST_DIRS.tests_data_dir, "filter_probs.csv"),
+                             os.path.join(TEST_DIRS.tests_data_dir, "filter_probs.parquet"),
+                             os.path.join(TEST_DIRS.tests_data_dir, 'examples/abp_pcap_detection/abp_pcap.jsonlines')
+                         ],
+                         ids=["csv", "parquet", "jsonlines"])
+@pytest.mark.parametrize("filter_null", [False, True], ids=["no_filter", "filter_null"])
+@pytest.mark.parametrize("use_pathlib", [False, True], ids=["no_pathlib", "pathlib"])
+@pytest.mark.parametrize("repeat", [1, 2, 5], ids=["repeat1", "repeat2", "repeat5"])
+def test_file_source_stage_pipe(config: Config, input_file: str, filter_null: bool, use_pathlib: bool, repeat: int):
+    parser_kwargs = {}
+    if determine_file_type(input_file) == FileTypes.JSON:
+        # kwarg specific to pandas.read_json
+        parser_kwargs['convert_dates'] = False
+
+    expected_df = read_file_to_df(file_name=input_file,
+                                  filter_nulls=filter_null,
+                                  df_type="pandas",
+                                  parser_kwargs=parser_kwargs)
+    expected_df = pd.concat([expected_df for _ in range(repeat)])
+
+    expected_df.reset_index(inplace=True)
+    expected_df.drop('index', axis=1, inplace=True)
+
+    if use_pathlib:
+        input_file = pathlib.Path(input_file)
+
+    pipe = LinearPipeline(config)
+    pipe.set_source(FileSourceStage(config, filename=input_file, repeat=repeat, filter_null=filter_null))
+    comp_stage = pipe.add_stage(
+        CompareDataFrameStage(config, compare_df=expected_df, exclude=["index"], reset_index=True))
+    pipe.run()
+
+    assert_results(comp_stage.get_results())
diff --git a/tests/tests_data/file_with_nans.csv b/tests/tests_data/file_with_nans.csv
new file mode 100644
index 0000000000..ff3a8643fa
--- /dev/null
+++ b/tests/tests_data/file_with_nans.csv
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4a173a9d2027a90c7df128dac1f9126160107954fc286a13d04bd94824d668b8
+size 76
diff --git a/tests/tests_data/file_with_nans.jsonlines b/tests/tests_data/file_with_nans.jsonlines
new file mode 100644
index 0000000000..7a9190ce40
--- /dev/null
+++ b/tests/tests_data/file_with_nans.jsonlines
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:559f654cd30742b2fe49ec6f163118b660b61f2c6ebe5acb13bdfeb907fe9865
+size 255
diff --git a/tests/tests_data/file_with_nulls.csv b/tests/tests_data/file_with_nulls.csv
new file mode 100644
index 0000000000..d2416abb19
--- /dev/null
+++ b/tests/tests_data/file_with_nulls.csv
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:65dbc84b9c7ebe0132fbcab419fe681a1628cb7b1c08f09ca62c2b46fbd56c59
+size 46
diff --git a/tests/tests_data/file_with_nulls.jsonlines b/tests/tests_data/file_with_nulls.jsonlines
new file mode 100644
index 0000000000..af82d24f9f
--- /dev/null
+++ b/tests/tests_data/file_with_nulls.jsonlines
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a1dfca1a616e66ebdcdb87d4adb9b15af594ca6fded67b4d9af8181b061e559f
+size 255

From 9719d9f0f5d368381004635e171399bc07380216 Mon Sep 17 00:00:00 2001
From: Aser Garcia
Date: Thu, 9 May 2024 11:19:48 -0400
Subject: [PATCH 33/38] Ensuring consistent use of the export macro `MORPHEUS_EXPORT` (#1672)

This PR makes exporting symbols consistent across header files. The docs and examples for the developer_guide are also updated.

Closes #1595

## By Submitting this PR I confirm:
- I am familiar with the [Contributing Guidelines](https://github.com/nv-morpheus/Morpheus/blob/main/docs/source/developer_guide/contributing.md).
- When the PR is ready for review, new or existing tests cover these changes.
- When the PR is ready for review, the documentation is up to date with these changes.

Authors:
  - Aser Garcia (https://github.com/aserGarcia)
  - David Gardner (https://github.com/dagardner-nv)

Approvers:
  - Yuchen Zhang (https://github.com/yuchenz427)
  - Michael Demoret (https://github.com/mdemoret-nv)

URL: https://github.com/nv-morpheus/Morpheus/pull/1672
---
 .../guides/3_simple_cpp_stage.md              | 29 +++++-----
 .../guides/4_source_cpp_stage.md              | 18 +++----
 docs/source/developer_guide/guides/8_cpp_modules.md |  7 +--
 .../src/simple_cpp_stage/_lib/pass_thru.hpp   | 11 ++--
 .../_lib/rabbitmq_source.hpp                  | 11 ++--
 .../include/morpheus/doca/doca_source.hpp     |  9 ++--
 .../_lib/include/morpheus/io/data_loader.hpp  |  8 ++-
 .../morpheus/io/data_loader_registry.hpp      |  7 ++-
 .../include/morpheus/io/deserializers.hpp     | 18 +++----
 .../_lib/include/morpheus/io/loaders/file.hpp |  7 +--
 .../_lib/include/morpheus/io/loaders/grpc.hpp |  7 +--
 .../include/morpheus/io/loaders/lambda.hpp    |  7 +--
 .../include/morpheus/io/loaders/payload.hpp   |  7 +--
 .../_lib/include/morpheus/io/loaders/rest.hpp |  7 +--
 .../_lib/include/morpheus/io/serializers.hpp  | 54 ++++++++++---------
 .../include/morpheus/messages/control.hpp     | 11 ++--
 .../messages/memory/inference_memory.hpp      |  8 ++-
 .../messages/memory/inference_memory_fil.hpp  |  5 +-
 .../messages/memory/inference_memory_nlp.hpp  |  9 ++--
 .../messages/memory/response_memory.hpp       |  9 ++--
 .../messages/memory/response_memory_probs.hpp |  8 +--
 .../messages/memory/tensor_memory.hpp         |  8 ++-
 .../_lib/include/morpheus/messages/meta.hpp   | 12 ++---
 .../_lib/include/morpheus/messages/multi.hpp  | 17 +++---
 .../morpheus/messages/multi_inference.hpp     |  8 +--
 .../morpheus/messages/multi_inference_fil.hpp |  9 ++--
 .../morpheus/messages/multi_inference_nlp.hpp |  8 +--
 .../morpheus/messages/multi_response.hpp      |  8 +--
 .../messages/multi_response_probs.hpp         |  9 ++--
 .../morpheus/messages/multi_tensor.hpp        |  8 ++-
 .../morpheus/modules/data_loader_module.hpp   |  6 +--
 .../_lib/include/morpheus/objects/dtype.hpp   | 10 ++--
 .../morpheus/objects/factory_registry.hpp     |  7 ++-
 .../include/morpheus/objects/fiber_queue.hpp  |  9 ++--
 .../include/morpheus/objects/file_types.hpp   | 14 +++--
 .../morpheus/objects/filter_source.hpp        |  9 ++--
 .../objects/mutable_table_ctx_mgr.hpp         |  6 +--
 .../include/morpheus/objects/rmm_tensor.hpp   |  7 ++-
 .../_lib/include/morpheus/objects/tensor.hpp  |  6 +--
 .../morpheus/objects/wrapped_tensor.hpp       |  5 +-
 .../morpheus/stages/add_classification.hpp    |  8 ++-
 .../include/morpheus/stages/add_scores.hpp    |  7 ++-
 .../morpheus/stages/add_scores_stage_base.hpp |  6 +--
 .../include/morpheus/stages/deserialize.hpp   |  9 ++--
 .../include/morpheus/stages/file_source.hpp   |  7 ++-
 .../morpheus/stages/filter_detection.hpp      |  8 ++-
 .../stages/http_server_source_stage.hpp       |  8 ++-
 .../include/morpheus/stages/kafka_source.hpp  | 10 ++--
 .../include/morpheus/stages/preallocate.hpp   | 10 ++--
 .../morpheus/stages/preprocess_fil.hpp        |  8 +--
 .../morpheus/stages/preprocess_nlp.hpp        |  9 ++--
 .../include/morpheus/stages/serialize.hpp     |  9 ++--
 .../include/morpheus/stages/write_to_file.hpp |  9 ++--
 .../morpheus/utilities/http_server.hpp        | 13 +++--
 .../morpheus/utilities/python_util.hpp        | 10 ++--
 55 files changed, 257 insertions(+), 297 deletions(-)

diff --git a/docs/source/developer_guide/guides/3_simple_cpp_stage.md b/docs/source/developer_guide/guides/3_simple_cpp_stage.md
index 3b0982d21e..678fc3074f 100644
--- a/docs/source/developer_guide/guides/3_simple_cpp_stage.md
+++ b/docs/source/developer_guide/guides/3_simple_cpp_stage.md
@@ -54,7 +54,7 @@ def supports_cpp_node(self):
     return True
 ```
 
-C++ message object declarations can be found in the header files that are located in the `morpheus/_lib/include/morpheus/messages` directory. For example, the `MessageMeta` class declaration is located in `morpheus/_lib/include/morpheus/messages/meta.hpp`. In code this would be included as:
+C++ message object declarations can be found in the header files that are located in the `morpheus/_lib/include/morpheus/messages` directory. For example, the `MessageMeta` class declaration is located in `morpheus/_lib/include/morpheus/messages/meta.hpp`. Since this code is outside of the morpheus directory it would be included as:
 
 ```cpp
 #include <morpheus/messages/meta.hpp>
@@ -89,6 +89,7 @@ While our Python implementation accepts messages of any type (in the form of Pyt
 To start with, we have our Morpheus and MRC-specific includes:
 
 ```cpp
+#include <morpheus/export.h>
 #include <morpheus/messages/multi.hpp>  // for MultiMessage
 #include <mrc/segment/builder.hpp>  // for Segment Builder
 #include <mrc/segment/object.hpp>   // for Segment Object
 #include <pymrc/node.hpp>
 ```
@@ -100,12 +101,10 @@ We'll want to define our stage in its own namespace. In this case, we will name
 ```cpp
 namespace morpheus_example {
 
-// pybind11 sets visibility to hidden by default; we want to export our symbols
-#pragma GCC visibility push(default)
-
 using namespace morpheus;
 
-class PassThruStage : public mrc::pymrc::PythonNode<std::shared_ptr<MultiMessage>, std::shared_ptr<MultiMessage>>
+// pybind11 sets visibility to hidden by default; we want to export our symbols
+class MORPHEUS_EXPORT PassThruStage : public mrc::pymrc::PythonNode<std::shared_ptr<MultiMessage>, std::shared_ptr<MultiMessage>>
 {
   public:
     using base_t = mrc::pymrc::PythonNode<std::shared_ptr<MultiMessage>, std::shared_ptr<MultiMessage>>;
@@ -119,7 +118,13 @@ class PassThruStage : public mrc::pymrc::PythonNode