From 74402cfd2d47a931b48caa93cf11f94c37de0588 Mon Sep 17 00:00:00 2001 From: Christopher Harris Date: Mon, 9 Sep 2024 16:57:02 +0000 Subject: [PATCH 01/12] remove nvtabular --- .../all_cuda-121_arch-x86_64.yaml | 1 - .../dev_cuda-121_arch-x86_64.yaml | 1 - .../examples_cuda-121_arch-x86_64.yaml | 1 - .../runtime_cuda-121_arch-x86_64.yaml | 1 - dependencies.yaml | 1 - docs/source/conf.py | 1 - .../production/conda_env.yml | 1 - morpheus/utils/column_info.py | 9 - morpheus/utils/downloader.py | 39 +- morpheus/utils/nvt/__init__.py | 13 - morpheus/utils/nvt/decorators.py | 123 ---- morpheus/utils/nvt/extensions/__init__.py | 17 - morpheus/utils/nvt/extensions/morpheus_ext.py | 27 - morpheus/utils/nvt/mutate.py | 195 ------ morpheus/utils/nvt/patches/__init__.py | 15 - morpheus/utils/nvt/patches/merlin_patches.py | 30 - morpheus/utils/nvt/schema_converters.py | 642 ----------------- morpheus/utils/nvt/transforms.py | 64 -- morpheus/utils/schema_transforms.py | 107 +-- .../test_dfp_file_to_df.py | 22 +- tests/test_column_info.py | 62 -- tests/test_downloader.py | 18 +- tests/utils/nvt/__init__.py | 13 - tests/utils/nvt/integration/__init__.py | 13 - tests/utils/nvt/integration/test_mutate_op.py | 65 -- .../utils/nvt/test_json_flatten_transform.py | 68 -- tests/utils/nvt/test_mutate_op.py | 120 ---- tests/utils/nvt/test_schema_converters.py | 661 ------------------ tests/utils/nvt/test_transforms.py | 52 -- 29 files changed, 54 insertions(+), 2328 deletions(-) delete mode 100644 morpheus/utils/nvt/__init__.py delete mode 100644 morpheus/utils/nvt/decorators.py delete mode 100644 morpheus/utils/nvt/extensions/__init__.py delete mode 100644 morpheus/utils/nvt/extensions/morpheus_ext.py delete mode 100644 morpheus/utils/nvt/mutate.py delete mode 100644 morpheus/utils/nvt/patches/__init__.py delete mode 100644 morpheus/utils/nvt/patches/merlin_patches.py delete mode 100644 morpheus/utils/nvt/schema_converters.py delete mode 100644 morpheus/utils/nvt/transforms.py delete mode 100644 tests/utils/nvt/__init__.py delete mode 100644 tests/utils/nvt/integration/__init__.py delete mode 100644 tests/utils/nvt/integration/test_mutate_op.py delete mode 100644 tests/utils/nvt/test_json_flatten_transform.py delete mode 100644 tests/utils/nvt/test_mutate_op.py delete mode 100644 tests/utils/nvt/test_schema_converters.py delete mode 100644 tests/utils/nvt/test_transforms.py diff --git a/conda/environments/all_cuda-121_arch-x86_64.yaml b/conda/environments/all_cuda-121_arch-x86_64.yaml index a6aab42af3..fe924bf8a6 100644 --- a/conda/environments/all_cuda-121_arch-x86_64.yaml +++ b/conda/environments/all_cuda-121_arch-x86_64.yaml @@ -71,7 +71,6 @@ dependencies: - nodejs=18.* - numexpr - numpydoc=1.5 -- nvtabular=23.08.00 - onnx=1.15 - openai=1.13 - papermill=2.4.0 diff --git a/conda/environments/dev_cuda-121_arch-x86_64.yaml b/conda/environments/dev_cuda-121_arch-x86_64.yaml index 468166b3d7..345461bf2c 100644 --- a/conda/environments/dev_cuda-121_arch-x86_64.yaml +++ b/conda/environments/dev_cuda-121_arch-x86_64.yaml @@ -59,7 +59,6 @@ dependencies: - nlohmann_json=3.11 - nodejs=18.* - numpydoc=1.5 -- nvtabular=23.08.00 - pip - pkg-config=0.29 - pluggy=1.3 diff --git a/conda/environments/examples_cuda-121_arch-x86_64.yaml b/conda/environments/examples_cuda-121_arch-x86_64.yaml index ea17b3bdb3..1ec20467f3 100644 --- a/conda/environments/examples_cuda-121_arch-x86_64.yaml +++ b/conda/environments/examples_cuda-121_arch-x86_64.yaml @@ -36,7 +36,6 @@ dependencies: - nodejs=18.* - numexpr - numpydoc=1.5 -- 
nvtabular=23.08.00 - onnx=1.15 - openai=1.13 - papermill=2.4.0 diff --git a/conda/environments/runtime_cuda-121_arch-x86_64.yaml b/conda/environments/runtime_cuda-121_arch-x86_64.yaml index b7ab668e1f..d9f23b252d 100644 --- a/conda/environments/runtime_cuda-121_arch-x86_64.yaml +++ b/conda/environments/runtime_cuda-121_arch-x86_64.yaml @@ -30,7 +30,6 @@ dependencies: - mrc=24.06 - networkx=2.8.8 - numpydoc=1.5 -- nvtabular=23.08.00 - pip - pluggy=1.3 - pydantic diff --git a/dependencies.yaml b/dependencies.yaml index f95295ee52..9c29642132 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -334,7 +334,6 @@ dependencies: - mrc=24.06 - networkx=2.8.8 - numpydoc=1.5 - - nvtabular=23.08.00 - pydantic # - python ## - python-confluent-kafka>=1.9.2,<1.10.0a0 diff --git a/docs/source/conf.py b/docs/source/conf.py index 709e8230e0..70f5e51eb9 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -175,7 +175,6 @@ "langchain_core", "merlin", "morpheus.cli.commands", # Dont document the CLI in Sphinx - "nvtabular", "pandas", "pydantic", "pymilvus", diff --git a/examples/digital_fingerprinting/production/conda_env.yml b/examples/digital_fingerprinting/production/conda_env.yml index 36ebd7d448..0eb8d3b2db 100644 --- a/examples/digital_fingerprinting/production/conda_env.yml +++ b/examples/digital_fingerprinting/production/conda_env.yml @@ -29,7 +29,6 @@ dependencies: - librdkafka - mlflow>=2.10.0,<3 - nodejs=18.* - - nvtabular=23.06 - papermill - s3fs>=2023.6 diff --git a/morpheus/utils/column_info.py b/morpheus/utils/column_info.py index a5e892a8bb..9ebba46f2b 100644 --- a/morpheus/utils/column_info.py +++ b/morpheus/utils/column_info.py @@ -17,7 +17,6 @@ import logging import re import typing -import warnings from datetime import datetime from functools import partial @@ -25,12 +24,6 @@ import cudf -if (typing.TYPE_CHECKING): - with warnings.catch_warnings(): - # Ignore warning regarding tensorflow not being installed - warnings.filterwarnings("ignore", message=".*No module named 'tensorflow'", category=UserWarning) - import nvtabular as nvt - logger = logging.getLogger(f"morpheus.{__name__}") DEFAULT_DATE = '1970-01-01T00:00:00.000000+00:00' @@ -749,7 +742,6 @@ class DataFrameInputSchema: input_columns: typing.Dict[str, str] = dataclasses.field(init=False, repr=False) output_columns: typing.List[tuple[str, str]] = dataclasses.field(init=False, repr=False) - nvt_workflow: "nvt.Workflow" = dataclasses.field(init=False, repr=False) prep_dataframe: typing.Callable[[pd.DataFrame], typing.List[str]] = dataclasses.field(init=False, repr=False) def __post_init__(self): @@ -797,4 +789,3 @@ def __post_init__(self): json_cols=self.json_columns, preserve_re=self.preserve_columns) - self.nvt_workflow = None diff --git a/morpheus/utils/downloader.py b/morpheus/utils/downloader.py index 0a68ae6e14..d352f7ef22 100644 --- a/morpheus/utils/downloader.py +++ b/morpheus/utils/downloader.py @@ -17,16 +17,13 @@ """ import logging -import multiprocessing as mp import os import threading import typing -import warnings from enum import Enum import fsspec import pandas as pd -from merlin.core.utils import Distributed logger = logging.getLogger(__name__) @@ -99,20 +96,22 @@ def get_dask_cluster(self): Returns ------- - dask_cuda.LocalCUDACluster + dask.distributed.LocalCluster """ with Downloader._mutex: if Downloader._dask_cluster is None: - import dask_cuda.utils + import dask + import dask.distributed logger.debug("Creating dask cluster...") n_workers = dask_cuda.utils.get_n_gpus() threads_per_worker = 
mp.cpu_count() // n_workers - Downloader._dask_cluster = dask_cuda.LocalCUDACluster(n_workers=n_workers, - threads_per_worker=threads_per_worker) + Downloader._dask_cluster = dask.distributed.LocalCluster(start=True, + processes=self.download_method + != "dask_thread") logger.debug("Creating dask cluster... Done. Dashboard: %s", Downloader._dask_cluster.dashboard_link) @@ -127,24 +126,18 @@ def get_dask_client(self): dask.distributed.Client """ import dask.distributed + return dask.distributed.Client(self.get_dask_cluster()) - # Up the heartbeat interval which can get violated with long download times - dask.config.set({"distributed.client.heartbeat": self._dask_heartbeat_interval}) + def close(self): + """Close the dask cluster if it exists.""" + if (self._dask_cluster is not None): + logger.debug("Stopping dask cluster...") - if (self._merlin_distributed is None): - with warnings.catch_warnings(): - # Merlin.Distributed will warn if a client already exists, the client in question is the one created - # and are explicitly passing to it in the constructor. - warnings.filterwarnings("ignore", - message="Existing Dask-client object detected in the current context.*", - category=UserWarning) - self._merlin_distributed = Distributed(client=dask.distributed.Client(self.get_dask_cluster())) + self._dask_cluster.close() - return self._merlin_distributed + self._dask_cluster = None - def close(self): - """Cluster management is handled by Merlin.Distributed""" - pass + logger.debug("Stopping dask cluster... Done.") def download(self, download_buckets: fsspec.core.OpenFiles, @@ -169,8 +162,8 @@ def download(self, if (self._download_method.startswith("dask")): # Create the client each time to ensure all connections to the cluster are closed (they can time out) with self.get_dask_client() as dist: - dfs = dist.client.map(download_fn, download_buckets) - dfs = dist.client.gather(dfs) + dfs = dist.map(download_fn, download_buckets) + dfs = dist.gather(dfs) else: # Simply loop diff --git a/morpheus/utils/nvt/__init__.py b/morpheus/utils/nvt/__init__.py deleted file mode 100644 index 66061e580b..0000000000 --- a/morpheus/utils/nvt/__init__.py +++ /dev/null @@ -1,13 +0,0 @@ -# Copyright (c) 2022-2024, NVIDIA CORPORATION. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. diff --git a/morpheus/utils/nvt/decorators.py b/morpheus/utils/nvt/decorators.py deleted file mode 100644 index 6d13dfa444..0000000000 --- a/morpheus/utils/nvt/decorators.py +++ /dev/null @@ -1,123 +0,0 @@ -# Copyright (c) 2022-2024, NVIDIA CORPORATION. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. - -import functools -import inspect -import os -import typing - -import pandas as pd - -import cudf - - -def sync_df_as_pandas(df_arg_name='df'): - """ - This function serves as a decorator that synchronizes cudf.DataFrame to pandas.DataFrame before applying the - function. - - Parameters - ---------- - df_arg_name : str - The name of the DataFrame parameter in the decorated function. - - Returns - ------- - Callable - The decorator. - """ - - # pylint: disable=pointless-string-statement - x_data_frame = typing.TypeVar("x_data_frame", pd.DataFrame, cudf.DataFrame) - """ - Represents a DataFrame that can be either a pandas or cudf DataFrame - """ - - # pylint: disable=pointless-string-statement - _sync_pandas_args = typing.ParamSpec('_sync_pandas_args') - """ - Represents the remaining arguments to the function after the first argument (the DataFrame) - """ - - def decorator(func: typing.Callable[typing.Concatenate[pd.DataFrame, _sync_pandas_args], pd.DataFrame]) -> \ - typing.Callable[typing.Concatenate[x_data_frame, _sync_pandas_args], x_data_frame]: - """ - The actual decorator that wraps the function. - - Parameters - ---------- - func : Callable - The function to apply to the DataFrame. - - Returns - ------- - Callable - The wrapped function. - """ - - def wrapper(*args, **kwargs) -> typing.Union[pd.DataFrame, cudf.DataFrame]: - is_arg = False - arg_index = 0 - df_arg = kwargs.get(df_arg_name) - if df_arg is None: - # try to get DataFrame argument from positional arguments - func_args = inspect.signature(func).parameters - for i, arg in enumerate(func_args): - if arg == df_arg_name: - is_arg = True - arg_index = i - df_arg = args[i] - break - - convert_to_cudf = False - if isinstance(df_arg, cudf.DataFrame): - convert_to_cudf = True - if (is_arg): - args = list(args) - args[arg_index] = df_arg.to_pandas() - args = tuple(args) - else: - kwargs[df_arg_name] = df_arg.to_pandas() - - result = func(*args, **kwargs) - - if convert_to_cudf: - result = cudf.from_pandas(result) - - return result - - return wrapper - - return decorator - - -# Avoid using the annotate decorator in sphinx builds, instead define a simple pass-through decorator -if os.environ.get("MORPHEUS_IN_SPHINX_BUILD") is None: - from merlin.core.dispatch import annotate # pylint: disable=unused-import -else: - - def annotate(*args, **kwargs): # pylint: disable=unused-argument - """ - `merlin.core.dispatch.annotate` - """ - - def decorator(func): - - @functools.wraps(func) - def wrappper(*args, **kwargs): - return func(*args, **kwargs) - - return wrappper - - return decorator diff --git a/morpheus/utils/nvt/extensions/__init__.py b/morpheus/utils/nvt/extensions/__init__.py deleted file mode 100644 index 57d1384dac..0000000000 --- a/morpheus/utils/nvt/extensions/__init__.py +++ /dev/null @@ -1,17 +0,0 @@ -# Copyright (c) 2022-2024, NVIDIA CORPORATION. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from .morpheus_ext import register_morpheus_extensions - -__all__ = ["register_morpheus_extensions"] diff --git a/morpheus/utils/nvt/extensions/morpheus_ext.py b/morpheus/utils/nvt/extensions/morpheus_ext.py deleted file mode 100644 index b85a3bb68d..0000000000 --- a/morpheus/utils/nvt/extensions/morpheus_ext.py +++ /dev/null @@ -1,27 +0,0 @@ -# Copyright (c) 2022-2024, NVIDIA CORPORATION. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -def register_morpheus_extensions(): - from datetime import datetime - - import merlin.dtypes.aliases as mn - from merlin.dtypes import register - from merlin.dtypes.mapping import DTypeMapping - - morpheus_extension = DTypeMapping(mapping={ - mn.datetime64: [datetime], - }, ) - - register("morpheus_ext", morpheus_extension) diff --git a/morpheus/utils/nvt/mutate.py b/morpheus/utils/nvt/mutate.py deleted file mode 100644 index c9228f1310..0000000000 --- a/morpheus/utils/nvt/mutate.py +++ /dev/null @@ -1,195 +0,0 @@ -# Copyright (c) 2022-2024, NVIDIA CORPORATION. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import typing -from inspect import getsourcelines - -import numpy as np -from merlin.core.dispatch import DataFrameType -from merlin.schema import ColumnSchema -from merlin.schema import Schema -from nvtabular.ops.operator import ColumnSelector -from nvtabular.ops.operator import Operator - -from morpheus.utils.nvt.decorators import annotate - - -class MutateOp(Operator): - - def __init__(self, - func: typing.Callable, - output_columns: typing.Optional[typing.List] = None, - dependencies: typing.Optional[typing.List] = None, - label: typing.Optional[str] = None): - """ - Initialize MutateOp class. - - Parameters - ---------- - func : Callable - Function to perform mutation operation. - output_columns : Optional[List], optional - List of output columns, by default None. - dependencies : Optional[List], optional - List of dependencies, by default None. - label : Optional[str], optional - Label for MutateOp, by default None. - """ - - super().__init__() - - self._dependencies = dependencies or [] - self._func = func - self._label = label - self._output_columns = output_columns or [] - - def _remove_deps(self, column_selector: ColumnSelector): - """ - Remove dependencies from column selector. - - Parameters - ---------- - column_selector : ColumnSelector - Instance of ColumnSelector from which dependencies will be removed. - - Returns - ------- - ColumnSelector - Updated instance of ColumnSelector. 
- """ - - to_skip = ColumnSelector( - [dep if isinstance(dep, str) else dep.output_schema.column_names for dep in self._dependencies]) - - return column_selector.filter_columns(to_skip) - - @property - def label(self): - """ - Get the label of the MutateOp instance. - - Returns - ------- - str - The label of the MutateOp instance. - """ - - if (self._label is not None): - return self._label - - # if we have a named function (not a lambda) return the function name - name = self._func.__name__.split(".")[-1] - if name != "": - return f"MutateOp: {name}" - - try: - # otherwise get the lambda source code from the inspect module if possible - source = getsourcelines(self.f)[0][0] # pylint: disable=no-member - lambdas = [op.strip() for op in source.split(">>") if "lambda " in op] - if len(lambdas) == 1 and lambdas[0].count("lambda") == 1: - return lambdas[0] - except Exception: # pylint: disable=broad-except - # we can fail to load the source in distributed environments. Since the - # label is mainly used for diagnostics, don't worry about the error here and - # fallback to the default labelling - pass - - # Failed to figure out the source - return "MutateOp" - - # pylint: disable=arguments-renamed - @annotate("MutateOp", color="darkgreen", domain="nvt_python") - def transform(self, col_selector: ColumnSelector, df: DataFrameType) -> DataFrameType: - """ - Apply the transformation function on the dataframe. - - Parameters - ---------- - col_selector : ColumnSelector - Instance of ColumnSelector. - df : DataFrameType - Input dataframe. - - Returns - ------- - DataFrameType - Transformed dataframe. - """ - - df = self._func(col_selector, df) - - # If our dataframe doesn't contain the expected output columns, even after processing, we add dummy columns. - # This could occur if our JSON data doesn't always contain columns we expect to be expanded. - df_cols_set = set(df.columns) - new_cols = { - col[0]: np.zeros(df.shape[0], dtype=col[1]) - for col in self._output_columns if col[0] not in df_cols_set - } - - df = df.assign(**new_cols) - - return df - - def column_mapping(self, col_selector: ColumnSelector) -> typing.Dict[str, str]: - """ - Generate a column mapping. - - Parameters - ---------- - col_selector : ColumnSelector - Instance of ColumnSelector. - - Returns - ------- - Dict[str, str] - Dictionary of column mappings. - """ - - column_mapping = {} - - for col_name, _ in self._output_columns: - column_mapping[col_name] = col_selector.names - - return column_mapping - - def compute_output_schema( - self, - input_schema: Schema, - col_selector: ColumnSelector, - prev_output_schema: typing.Optional[Schema] = None, - ) -> Schema: - """ - Compute the output schema. - - Parameters - ---------- - input_schema : Schema - The input schema. - col_selector : ColumnSelector - Instance of ColumnSelector. - prev_output_schema : Optional[Schema], optional - Previous output schema, by default None. - - Returns - ------- - Schema - The output schema. - """ - output_schema = super().compute_output_schema(input_schema, col_selector, prev_output_schema) - - # Add new columns to the output schema - for col, dtype in self._output_columns: - output_schema += Schema([ColumnSchema(col, dtype=dtype)]) - - return output_schema diff --git a/morpheus/utils/nvt/patches/__init__.py b/morpheus/utils/nvt/patches/__init__.py deleted file mode 100644 index 03a1b3bc36..0000000000 --- a/morpheus/utils/nvt/patches/__init__.py +++ /dev/null @@ -1,15 +0,0 @@ -# Copyright (c) 2022-2024, NVIDIA CORPORATION. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from .merlin_patches import patch_numpy_dtype_registry diff --git a/morpheus/utils/nvt/patches/merlin_patches.py b/morpheus/utils/nvt/patches/merlin_patches.py deleted file mode 100644 index 9d9d82e81d..0000000000 --- a/morpheus/utils/nvt/patches/merlin_patches.py +++ /dev/null @@ -1,30 +0,0 @@ -# Copyright (c) 2022-2024, NVIDIA CORPORATION. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -def patch_numpy_dtype_registry() -> None: - """ - Patches the Merlin dtypes registry to support conversion from Merlin 'struct' dtypes to the equivalent numpy object. - - This is necessary to support pandas conversion of input dataframes containing 'struct' dtypes within an NVT - operator. Until this is fixed upstream, with the mappings added to `merlin/dtypes/mappings/numpy.py`, this patch - should be used. The function is idempotent, and should be called before any NVT operators are used. - """ - import merlin.dtypes.aliases as mn - import numpy as np - from merlin.dtypes import _dtype_registry - - numpy_dtypes = _dtype_registry.mappings["numpy"].from_merlin_ - if (mn.struct not in numpy_dtypes.keys()): - numpy_dtypes[mn.struct] = [np.dtype("O"), object] diff --git a/morpheus/utils/nvt/schema_converters.py b/morpheus/utils/nvt/schema_converters.py deleted file mode 100644 index 44249618b7..0000000000 --- a/morpheus/utils/nvt/schema_converters.py +++ /dev/null @@ -1,642 +0,0 @@ -# Copyright (c) 2022-2024, NVIDIA CORPORATION. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import dataclasses -import os -import typing -from functools import partial - -import networkx as nx -import nvtabular as nvt -import pandas as pd -from merlin.core.dispatch import DataFrameType -from merlin.core.dispatch import annotate -from merlin.core.dispatch import is_dataframe_object -from merlin.core.dispatch import is_series_object -from merlin.dag import ColumnSelector -from nvtabular.ops import Filter -from nvtabular.ops import LambdaOp -from nvtabular.ops import Rename - -import cudf - -from morpheus.utils.column_info import BoolColumn -from morpheus.utils.column_info import ColumnInfo -from morpheus.utils.column_info import CustomColumn -from morpheus.utils.column_info import DataFrameInputSchema -from morpheus.utils.column_info import DateTimeColumn -from morpheus.utils.column_info import DistinctIncrementColumn -from morpheus.utils.column_info import IncrementColumn -from morpheus.utils.column_info import RenameColumn -from morpheus.utils.column_info import StringCatColumn -from morpheus.utils.column_info import StringJoinColumn -from morpheus.utils.column_info import create_increment_col -from morpheus.utils.nvt.decorators import sync_df_as_pandas -from morpheus.utils.nvt.mutate import MutateOp -from morpheus.utils.nvt.transforms import json_flatten - - -@dataclasses.dataclass -class JSONFlattenInfo(ColumnInfo): - """ - Subclass of `ColumnInfo`. Makes it easier to generate a graph of the column dependencies. - - Attributes - ---------- - input_col_names : list - List of input column names. - output_col_names : list - List of output column names. - """ - - input_col_names: list - output_col_names: list - - def get_input_column_types(self) -> dict[str, str]: - """ - Return a dictionary of input column names and types needed for processing. This is used for schema - validation and should be overridden by subclasses. - """ - return {name: ColumnInfo.convert_pandas_dtype(str) for name in self.input_col_names} - - -# Same in every way to the base, except we don't drop the index -class _MorpheusFilter(Filter): - - @annotate("Filter_op", color="darkgreen", domain="nvt_python") - def transform(self, col_selector: ColumnSelector, df: DataFrameType) -> DataFrameType: - filtered = self.f(df) - if is_dataframe_object(filtered): - new_df = filtered - elif is_series_object(filtered) and filtered.dtype == bool: - new_df = df[filtered] - else: - raise ValueError(f"Invalid output from filter op: f{filtered.__class__}") - - # new_df.reset_index(drop=True, inplace=True) - return new_df - - -def _get_ci_column_selector(col_info) -> typing.Union[str, typing.List[str]]: - """ - Return a column selector based on a ColumnInfo object. - - Parameters - ---------- - col_info : ColumnInfo - The ColumnInfo object. - - Returns - ------- - Union[str, list of str] - A column selector. - - Raises - ------ - TypeError - If the input `ci` is not an instance of ColumnInfo. - Exception - If the type of ColumnInfo is unknown. - """ - - if (not isinstance(col_info, ColumnInfo)): - raise TypeError - - selected_cols = col_info.get_input_column_types() - - if (len(selected_cols) == 0 and isinstance(col_info, CustomColumn)): - return ["*"] - - return list(selected_cols.keys()) - - -def _json_flatten_from_input_schema(json_input_cols: typing.List[str], - json_output_cols: typing.List[typing.Tuple[str, str]]) -> MutateOp: - """ - Return a JSON flatten operation from an input schema. - - Parameters - ---------- - json_input_cols : list of str - A list of JSON input columns. 
- json_output_cols : list of tuple - A list of JSON output columns. - - Returns - ------- - MutateOp - A MutateOp object that represents the JSON flatten operation. - """ - - json_flatten_op = MutateOp(json_flatten, dependencies=json_input_cols, output_columns=json_output_cols) - - return json_flatten_op - - -@sync_df_as_pandas() -def _string_cat_col(df: pd.DataFrame, output_column: str, sep: str) -> pd.DataFrame: - """ - Concatenate the string representation of all supplied columns in a DataFrame. - - Parameters - ---------- - df : pandas.DataFrame - The input DataFrame. - output_column : str - The name of the output column. - sep : str - The separator to use when concatenating the strings. - - Returns - ------- - pandas.DataFrame - The resulting DataFrame. - """ - - cat_col = df.apply(lambda row: sep.join(row.values.astype(str)), axis=1) - - return pd.DataFrame({output_column: cat_col}, index=cat_col.index) - - -# pylint -def _nvt_string_cat_col( - column_selector: ColumnSelector, # pylint: disable=unused-argument - df: typing.Union[pd.DataFrame, cudf.DataFrame], - output_column: str, - input_columns: typing.List[str], - sep: str = ', '): - """ - Concatenates the string representation of the specified columns in a DataFrame. - - Parameters - ---------- - column_selector : ColumnSelector - A ColumnSelector object. - df : Union[pandas.DataFrame, cudf.DataFrame] - The input DataFrame. - output_column : str - The name of the output column. - input_columns : list of str - The input columns to concatenate. - sep : str, default is ', ' - The separator to use when concatenating the strings. - - Returns - ------- - Union[pandas.DataFrame, cudf.DataFrame] - The resulting DataFrame. - """ - - return _string_cat_col(df[input_columns], output_column=output_column, sep=sep) - - -@sync_df_as_pandas() -def _increment_column(df: pd.DataFrame, - output_column: str, - input_column: str, - groupby_column: str, - period: str = 'D') -> pd.DataFrame: - """ - Crete an increment a column in a DataFrame. - - Parameters - ---------- - df : pandas.DataFrame - The input DataFrame. - output_column : str - The name of the output column. - input_column : str - The name of the input column. - period : str, default is 'D' - The period to increment by. - - Returns - ------- - pandas.DataFrame - The resulting DataFrame. - """ - - period_index = pd.to_datetime(df[input_column]).dt.to_period(period) - groupby_col = df.groupby([groupby_column, period_index]).cumcount() - - return pd.DataFrame({output_column: groupby_col}, index=groupby_col.index) - - -def _nvt_increment_column( - column_selector: ColumnSelector, # pylint: disable=unused-argument - df: typing.Union[pd.DataFrame, cudf.DataFrame], - output_column: str, - input_column: str, - groupby_column: str, - period: str = 'D') -> typing.Union[pd.DataFrame, cudf.DataFrame]: - """ - Increment a column in a DataFrame. - - Parameters - ---------- - column_selector : ColumnSelector - A ColumnSelector object. Unused. - df : Union[pandas.DataFrame, cudf.DataFrame] - The input DataFrame. - output_column : str - The name of the output column. - input_column : str - The name of the input column. - groupby_column : str - Name of the column to groupby after creating the increment - period : str, default is 'D' - The period to increment by. - - Returns - ------- - Union[pandas.DataFrame, cudf.DataFrame] - The resulting DataFrame. 
- """ - - return _increment_column(df, output_column, input_column, groupby_column, period) - - -@sync_df_as_pandas() -def _distinct_increment_column(df: pd.DataFrame, - output_column: str, - input_column: str, - groupby_column: str = "username", - period: str = 'D', - timestamp_column: str = "timestamp") -> pd.DataFrame: - - output_series = create_increment_col(df=df, - column_name=input_column, - groupby_column=groupby_column, - period=period, - timestamp_column=timestamp_column) - - return pd.DataFrame({output_column: output_series}, index=output_series.index) - - -def _nvt_distinct_increment_column(_: ColumnSelector, - df: typing.Union[pd.DataFrame, cudf.DataFrame], - output_column: str, - input_column: str, - groupby_column: str = "username", - period: str = 'D', - timestamp_column: str = "timestamp") -> typing.Union[pd.DataFrame, cudf.DataFrame]: - - return _distinct_increment_column(df, output_column, input_column, groupby_column, period, timestamp_column) - - -@sync_df_as_pandas() -def _nvt_try_rename(df: pd.DataFrame, input_col_name: str, output_col_name: str, dtype: None) -> pd.Series: - if (input_col_name in df.columns): - return df.rename(columns={input_col_name: output_col_name}) - - return pd.Series(None, index=df.index, dtype=dtype) - - -# Mappings from ColumnInfo types to functions that create the corresponding NVT operator -ColumnInfoProcessingMap = { - BoolColumn: - lambda ci, - deps: [ - LambdaOp( - lambda series: series.map(ci.value_map).astype(bool), dtype="bool", label=f"[BoolColumn] '{ci.name}'") - ], - ColumnInfo: - lambda ci, - deps: [ - MutateOp(lambda _, - df: df.assign(**{ci.name: df[ci.name].astype(ci.get_pandas_dtype())}) if (ci.name in df.columns) - else df.assign(**{ci.name: pd.Series(None, index=df.index, dtype=ci.get_pandas_dtype())}), - dependencies=deps, - output_columns=[(ci.name, ci.dtype)], - label=f"[ColumnInfo] '{ci.name}'") - ], - # Note(Devin): Custom columns are, potentially, very inefficient, because we have to run the custom function on the - # entire dataset this is because NVT requires the input column be available, but CustomColumn is a generic - # transform taking df->series(ci.name) - CustomColumn: - lambda ci, - deps: [ - MutateOp(lambda _, - df: cudf.DataFrame({ci.name: ci.process_column_fn(df)}, index=df.index), - dependencies=deps, - output_columns=[(ci.name, ci.dtype)], - label=f"[CustomColumn] '{ci.name}'") - ], - DateTimeColumn: - lambda ci, - deps: [ - Rename(f=lambda name: ci.name if name == ci.input_name else name), - LambdaOp(lambda series: series.astype(ci.dtype), dtype=ci.dtype, label=f"[DateTimeColumn] '{ci.name}'") - ], - IncrementColumn: - lambda ci, - deps: [ - MutateOp(partial(_nvt_increment_column, - output_column=ci.name, - input_column=ci.input_name, - groupby_column=ci.groupby_column, - period=ci.period), - dependencies=deps, - output_columns=[(ci.name, ci.dtype)], - label=f"[IncrementColumn] '{ci.input_name}.{ci.groupby_column}' => '{ci.name}'") - ], - DistinctIncrementColumn: - lambda ci, - deps: [ - MutateOp(partial(_nvt_distinct_increment_column, - output_column=ci.name, - input_column=ci.input_name, - groupby_column=ci.groupby_column, - period=ci.period, - timestamp_column=ci.timestamp_column), - dependencies=deps, - output_columns=[(ci.name, ci.dtype)], - label=(f"[DistinctIncrementColumn] " - f"'{ci.input_name}.{ci.groupby_column}.{ci.timestamp_column}' => '{ci.name}'")) - ], - RenameColumn: - lambda ci, - deps: [ - MutateOp(lambda selector, - df: _nvt_try_rename(df, ci.input_name, ci.name, ci.dtype), - 
dependencies=deps, - output_columns=[(ci.name, ci.dtype)], - label=f"[RenameColumn] '{ci.input_name}' => '{ci.name}'") - ], - StringCatColumn: - lambda ci, - deps: [ - MutateOp(partial(_nvt_string_cat_col, output_column=ci.name, input_columns=ci.input_columns, sep=ci.sep), - dependencies=deps, - output_columns=[(ci.name, ci.dtype)], - label=f"[StringCatColumn] '{','.join(ci.input_columns)}' => '{ci.name}'") - ], - StringJoinColumn: - lambda ci, - deps: [ - MutateOp(partial( - _nvt_string_cat_col, output_column=ci.name, input_columns=[ci.name, ci.input_name], sep=ci.sep), - dependencies=deps, - output_columns=[(ci.name, ci.dtype)], - label=f"[StringJoinColumn] '{ci.input_name}' => '{ci.name}'") - ], - JSONFlattenInfo: - lambda ci, - deps: [_json_flatten_from_input_schema(ci.input_col_names, ci.output_col_names)] -} - - -def _build_nx_dependency_graph(column_info_objects: typing.List[ColumnInfo]) -> nx.DiGraph: - """ - Build a networkx directed graph for dependencies among columns. - - Parameters - ---------- - column_info_objects : list of ColumnInfo - List of column information objects. - - Returns - ------- - nx.DiGraph - A networkx DiGraph where nodes represent columns and edges represent dependencies between columns. - - """ - graph = nx.DiGraph() - - def _find_dependent_column(name, current_name): - for col_info in column_info_objects: - if col_info.name == current_name: - continue - - # pylint: disable=no-else-return - if col_info.name == name: - return col_info - elif col_info.__class__ == JSONFlattenInfo: - if name in [c for c, _ in col_info.output_col_names]: - return col_info - - return None - - # For each column, determine the inputs for that column and add edges to the graph - # Some columns will use simple strings - for col_info in column_info_objects: - graph.add_node(col_info.name) - - for input_col_name in col_info.get_input_column_types().keys(): - dep_col_info = _find_dependent_column(input_col_name, col_info.name) - if (dep_col_info): - graph.add_edge(dep_col_info.name, col_info.name) - - return graph - - -def _bfs_traversal_with_op_map(graph: nx.Graph, - ci_map: typing.Dict[str, ColumnInfo], - root_nodes: typing.List[typing.Any]): - """ - Perform Breadth-First Search (BFS) on a given graph. - - Parameters - ---------- - graph : nx.Graph - The graph on which BFS needs to be performed. - ci_map : dict - The dictionary mapping column info. - root_nodes : list - List of root nodes where BFS should start. - - Returns - ------- - tuple - Tuple containing the visited nodes and node-operation mapping. - """ - - visited = set() - queue = list(root_nodes) - node_op_map = {} - - while queue: - node = queue.pop(0) - if node not in visited: - visited.add(node) - - # We need to start an operator chain with a column selector, so root nodes need to prepend a parent - # column selection operator - parent_input = _get_ci_column_selector(ci_map[node]) - - parents = list(graph.predecessors(node)) - - # Thin the parent_input by any actual parents - parent_input = [x for x in parent_input if x not in parents] - - # If we are a single element list, just use the element - if (len(parent_input) == 1): - parent_input = parent_input[0] - - if len(parents) > 0: - # Not a root node, so we need to gather the parent operators, and collect them up. 
- for parent in parents: - if isinstance(parent_input, list) and len(parent_input) == 0: - parent_input = node_op_map[parent] - else: - parent_input = parent_input + node_op_map[parent] - - # Map the column info object to its NVT operator implementation - nvt_ops = ColumnInfoProcessingMap[type(ci_map[node])](ci_map[node], deps=[]) - - # Chain ops together into a compound op - node_op = parent_input - for nvt_op in nvt_ops: - node_op = node_op >> nvt_op - - # Set the op for this node to the compound operator - node_op_map[node] = node_op - - # Add our neighbors to the queue - neighbors = list(graph.neighbors(node)) - for neighbor in neighbors: - queue.append(neighbor) - - return visited, node_op_map - - -def _coalesce_leaf_nodes(node_op_map: typing.Dict[typing.Any, typing.Any], - column_info_objects: list[ColumnInfo]) -> typing.Any: - """ - Coalesce (combine) operations for the leaf nodes of a graph. - - Parameters - ---------- - node_op_map : dict - Dictionary mapping nodes to operations. - graph : nx.Graph - The graph to be processed. - preserve_re : regex - Regular expression for nodes to be preserved. - - Returns - ------- - obj - Coalesced workflow for leaf nodes. - """ - coalesced_workflow = None - - for column_info in column_info_objects: - - nvt_op = node_op_map[column_info.name] - - if coalesced_workflow is None: - coalesced_workflow = nvt_op - else: - coalesced_workflow = coalesced_workflow + nvt_op - - return coalesced_workflow - - -def _coalesce_ops(graph: nx.Graph, column_info_objects: list[ColumnInfo]) -> typing.Any: - """ - Coalesce (combine) operations for a graph. - - Parameters - ---------- - graph : nx.Graph - The graph to be processed. - ci_map : dict - The dictionary mapping column info. - preserve_re : regex, optional - Regular expression for nodes to be preserved. - - Returns - ------- - obj - Coalesced workflow for the graph. - """ - - ci_map = {ci.name: ci for ci in column_info_objects} - - root_nodes = [node for node, in_degree in graph.in_degree() if in_degree == 0] - - _, node_op_map = _bfs_traversal_with_op_map(graph, ci_map, root_nodes) - - coalesced_workflow = _coalesce_leaf_nodes(node_op_map, column_info_objects) - - return coalesced_workflow - - -def create_and_attach_nvt_workflow(input_schema: DataFrameInputSchema, - visualize: typing.Optional[bool] = False) -> DataFrameInputSchema: - """ - Converts an `input_schema` to a `nvt.Workflow` object. - - Parameters - ---------- - input_schema : DataFrameInputSchema - Input schema which specifies how the DataFrame should be processed. - visualize : bool, optional - If True, the resulting workflow graph will be visualized. - Default is False. - - Returns - ------- - nvt.Workflow - A nvt.Workflow object representing the steps specified in the input schema. - - Raises - ------ - ValueError - If the input schema is empty. - - Notes - ----- - First we aggregate all preprocessing steps, which we assume are independent of each other - and can be run in parallel. - - Next we aggregate all column operations, which we assume are independent of each other and - can be run in parallel and pass them the updated schema from the preprocessing steps. - """ - - if (input_schema is None): - input_schema = DataFrameInputSchema() - return input_schema - if (len(input_schema.column_info) == 0): - input_schema.nvt_workflow = None - return input_schema - - # Note(Devin): soft locking problem with nvt operators, skip for now. 
- # column_info_objects.append( - # JSONFlattenInfo(input_col_names=list(json_cols), - # output_col_names=json_output_cols, - # dtype="str", - # name="json_info")) - - graph = _build_nx_dependency_graph(input_schema.column_info) - - if os.getenv('MORPHEUS_NVT_VIS_DEBUG') is not None: - from matplotlib import pyplot as plt - from networkx.drawing.nx_pydot import graphviz_layout - pos = graphviz_layout(graph, prog='neato') - nx.draw(graph, pos, with_labels=True, font_weight='bold') - plt.show() - - coalesced_workflow = _coalesce_ops(graph, input_schema.column_info) - if (input_schema.row_filter is not None): - # Use our own filter here to preserve any index from the DataFrame - coalesced_workflow = coalesced_workflow >> _MorpheusFilter(f=input_schema.row_filter) - - if (visualize): - coalesced_workflow.graph.render(view=True, format='svg') - - input_schema.nvt_workflow = nvt.Workflow(coalesced_workflow) - - return input_schema diff --git a/morpheus/utils/nvt/transforms.py b/morpheus/utils/nvt/transforms.py deleted file mode 100644 index c8aab33b81..0000000000 --- a/morpheus/utils/nvt/transforms.py +++ /dev/null @@ -1,64 +0,0 @@ -# Copyright (c) 2022-2024, NVIDIA CORPORATION. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import json -import typing - -import pandas as pd -from nvtabular import ColumnSelector - -import cudf - - -def json_flatten(col_selector: ColumnSelector, - df: typing.Union[pd.DataFrame, cudf.DataFrame]) -> typing.Union[pd.DataFrame, cudf.DataFrame]: - """ - Flattens JSON columns in the given DataFrame and concatenates them into a single DataFrame. - - Parameters - ---------- - col_selector : ColumnSelector - An instance of ColumnSelector that contains the names of the columns to flatten. - df : Union[pd.DataFrame, cudf.DataFrame] - The input DataFrame that contains the JSON columns to flatten. - - Returns - ------- - Union[pd.DataFrame, cudf.DataFrame] - A new DataFrame with flattened JSON columns. If 'df' was a cudf.DataFrame, - the return type is cudf.DataFrame. Otherwise, it is pd.DataFrame. - """ - convert_to_cudf = False - if isinstance(df, cudf.DataFrame): - convert_to_cudf = True - - # Normalize JSON columns and accumulate into a single dataframe - df_normalized = None - for col in col_selector.names: - pd_series = df[col] if not convert_to_cudf else df[col].to_pandas() - pd_series = pd_series.apply(lambda x: x if isinstance(x, dict) else json.loads(x)) - pdf_norm = pd.json_normalize(pd_series) - pdf_norm.rename(columns=lambda x, col=col: col + "." 
+ x, inplace=True) - pdf_norm.reset_index(drop=True, inplace=True) - - if (df_normalized is None): - df_normalized = pdf_norm - else: - df_normalized = pd.concat([df_normalized, pdf_norm], axis=1) - - # Convert back to cudf if necessary - if convert_to_cudf: - df_normalized = cudf.from_pandas(df_normalized) - - return df_normalized diff --git a/morpheus/utils/schema_transforms.py b/morpheus/utils/schema_transforms.py index c0203d4453..5a306e0878 100644 --- a/morpheus/utils/schema_transforms.py +++ b/morpheus/utils/schema_transforms.py @@ -13,36 +13,13 @@ # limitations under the License. import logging -import os import typing -import warnings import pandas as pd import cudf from morpheus.utils.column_info import DataFrameInputSchema -from morpheus.utils.column_info import PreparedDFInfo -from morpheus.utils.nvt import patches -from morpheus.utils.nvt.extensions import morpheus_ext -from morpheus.utils.nvt.schema_converters import create_and_attach_nvt_workflow - -with warnings.catch_warnings(): - # Ignore warning regarding tensorflow not being installed - warnings.filterwarnings("ignore", message=".*No module named 'tensorflow'", category=UserWarning) - import nvtabular as nvt - - if os.environ.get("MORPHEUS_IN_SPHINX_BUILD") is None: - # Apply patches to NVT - # TODO(Devin): Can be removed, once numpy mappings are updated in Merlin - # ======================================================================== - patches.patch_numpy_dtype_registry() - # ======================================================================== - - # Add morpheus conversion mappings - # ======================================================================== - morpheus_ext.register_morpheus_extensions() - # ========================================================================= logger = logging.getLogger(__name__) @@ -50,7 +27,7 @@ @typing.overload def process_dataframe( df_in: pd.DataFrame, - input_schema: typing.Union[nvt.Workflow, DataFrameInputSchema], + input_schema: DataFrameInputSchema, ) -> pd.DataFrame: ... @@ -58,14 +35,14 @@ def process_dataframe( @typing.overload def process_dataframe( df_in: cudf.DataFrame, - input_schema: typing.Union[nvt.Workflow, DataFrameInputSchema], + input_schema: DataFrameInputSchema, ) -> cudf.DataFrame: ... def process_dataframe( df_in: typing.Union[pd.DataFrame, cudf.DataFrame], - input_schema: typing.Union[nvt.Workflow, DataFrameInputSchema], + input_schema: DataFrameInputSchema, ) -> typing.Union[pd.DataFrame, cudf.DataFrame]: """ Applies column transformations to the input dataframe as defined by the `input_schema`. @@ -77,10 +54,8 @@ def process_dataframe( ---------- df_in : Union[pd.DataFrame, cudf.DataFrame] The input DataFrame to process. - input_schema : Union[nvt.Workflow, DataFrameInputSchema] + input_schema : DataFrameInputSchema Defines the transformations to apply to 'df_in'. - If an instance of nvt.Workflow, it is directly used to transform the dataframe. - If an instance of DataFrameInputSchema, it is first converted to an nvt.Workflow, with JSON columns preprocessed if 'json_preproc' attribute is present. Returns @@ -95,63 +70,31 @@ def process_dataframe( If 'df_in' is a pandas DataFrame, it is temporarily converted into a cudf DataFrame for the transformation. """ - convert_to_pd = False - if (isinstance(df_in, pd.DataFrame)): - convert_to_pd = True + output_df = pd.DataFrame() - # If we're given a nvt_schema, we just use it. 
- nvt_workflow = input_schema - if (isinstance(input_schema, DataFrameInputSchema)): - if (input_schema.nvt_workflow is None): - input_schema = create_and_attach_nvt_workflow(input_schema) + convert_to_cudf = False + if (isinstance(df_in, cudf.DataFrame)): + df_in = df_in.to_pandas() + convert_to_cudf = True - # Note(Devin): pre-flatten to avoid Dask hang when calling json_normalize within an NVT operator - if (input_schema.prep_dataframe is not None): - prepared_df_info: PreparedDFInfo = input_schema.prep_dataframe(df_in) + # Iterate over the column info + for ci in input_schema.column_info: + try: + output_df[ci.name] = ci._process_column(df_in) + except Exception: + logger.exception("Failed to process column '%s'. Dataframe: \n%s", ci.name, df_in, exc_info=True) + raise - nvt_workflow = input_schema.nvt_workflow + if (input_schema.preserve_columns is not None): + # Get the list of remaining columns not already added + df_in_columns = set(df_in.columns) - set(output_df.columns) - preserve_df = None + # Finally, keep any columns that match the preserve filters + match_columns = [y for y in df_in_columns if input_schema.preserve_columns.match(y)] - if prepared_df_info is not None: - df_in = prepared_df_info.df + output_df[match_columns] = df_in[match_columns] - if prepared_df_info.columns_to_preserve: - preserve_df = df_in[prepared_df_info.columns_to_preserve] + if (convert_to_cudf): + return cudf.from_pandas(output_df) - if (convert_to_pd): - df_in = cudf.DataFrame(df_in) - - # NVT will always reset the index, so we need to save it and restore it after the transformation - saved_index = df_in.index - df_in.reset_index(drop=True, inplace=True) - - dataset = nvt.Dataset(df_in) - - if (nvt_workflow is not None): - df_result = nvt_workflow.fit_transform(dataset).to_ddf().compute() - else: - df_result = df_in - - # Now reset the index - if (len(df_result) == len(saved_index)): - df_result.set_index(saved_index, inplace=True) - else: - # Must have done some filtering. 
Use the new index to index into the old index - df_result.set_index(saved_index.take(df_result.index), inplace=True) - - if (convert_to_pd): - df_result = df_result.to_pandas() - - # Restore preserved columns - if (preserve_df is not None): - # Ensure there is no overlap with columns to preserve - columns_to_merge = set(preserve_df.columns) - set(df_result.columns) - columns_to_merge = list(columns_to_merge) - if (columns_to_merge): - if (convert_to_pd): - df_result = pd.concat([df_result, preserve_df[columns_to_merge]], axis=1) - else: - df_result = cudf.concat([df_result, preserve_df[columns_to_merge]], axis=1) - - return df_result + return output_df diff --git a/tests/examples/digital_fingerprinting/test_dfp_file_to_df.py b/tests/examples/digital_fingerprinting/test_dfp_file_to_df.py index 7540836f1e..bb5da24aa1 100644 --- a/tests/examples/digital_fingerprinting/test_dfp_file_to_df.py +++ b/tests/examples/digital_fingerprinting/test_dfp_file_to_df.py @@ -105,7 +105,7 @@ def test_constructor(config: Config): @pytest.mark.parametrize('dl_type', ["single_thread", "dask", "dask_thread"]) @pytest.mark.parametrize('use_convert_to_dataframe', [True, False]) @mock.patch('dask.distributed.Client') -@mock.patch('dask_cuda.LocalCUDACluster') +@mock.patch('dask.distributed.LocalCluster') @mock.patch('morpheus.controllers.file_to_df_controller.single_object_to_dataframe') @mock.patch('morpheus.utils.downloader.Distributed') @mock.patch('morpheus.controllers.file_to_df_controller.process_dataframe') @@ -127,12 +127,6 @@ def test_get_or_create_dataframe_from_batch_cache_miss(mock_proc_df: mock.MagicM mock_dask_client.__enter__.return_value = mock_dask_client mock_dask_client.__exit__.return_value = False - mock_dist_client = mock.MagicMock() - mock_distributed.return_value = mock_distributed - mock_distributed.client = mock_dist_client - mock_distributed.__enter__.return_value = mock_distributed - mock_distributed.__exit__.return_value = False - expected_hash = hashlib.md5(json.dumps([{ 'ukey': single_file_obj.fs.ukey(single_file_obj.path) }]).encode()).hexdigest() @@ -149,8 +143,7 @@ def test_get_or_create_dataframe_from_batch_cache_miss(mock_proc_df: mock.MagicM returned_df = dataset_pandas['filter_probs.csv'] mock_proc_df.return_value = returned_df if dl_type.startswith('dask'): - mock_dist_client.map.return_value = [returned_df] - mock_dist_client.gather.return_value = [returned_df] + mock_dask_client.gather.return_value = [returned_df] else: mock_obf_to_df.return_value = returned_df @@ -174,12 +167,11 @@ def test_get_or_create_dataframe_from_batch_cache_miss(mock_proc_df: mock.MagicM if dl_type.startswith('dask'): mock_dask_client.assert_called_once_with(mock_dask_cluster) - mock_dist_client.map.assert_called_once() - mock_dist_client.gather.assert_called_once() + mock_dask_client.map.assert_called_once() + mock_dask_client.gather.assert_called_once() else: mock_dask_cluster.assert_not_called() - mock_dist_client.map.assert_not_called() - mock_dist_client.gather.assert_not_called() + mock_dask_client.assert_not_called() dataset_pandas.assert_df_equal(output_df, expected_df) @@ -194,7 +186,7 @@ def test_get_or_create_dataframe_from_batch_cache_miss(mock_proc_df: mock.MagicM @pytest.mark.parametrize('use_convert_to_dataframe', [True, False]) @mock.patch('dask.config') @mock.patch('dask.distributed.Client') -@mock.patch('dask_cuda.LocalCUDACluster') +@mock.patch('dask.distributed.LocalCluster') @mock.patch('morpheus.controllers.file_to_df_controller.single_object_to_dataframe') def 
test_get_or_create_dataframe_from_batch_cache_hit(mock_obf_to_df: mock.MagicMock, mock_dask_cluster: mock.MagicMock, @@ -253,7 +245,7 @@ def test_get_or_create_dataframe_from_batch_cache_hit(mock_obf_to_df: mock.Magic @pytest.mark.parametrize('use_convert_to_dataframe', [True, False]) @mock.patch('dask.config') @mock.patch('dask.distributed.Client') -@mock.patch('dask_cuda.LocalCUDACluster') +@mock.patch('dask.distributed.LocalCluster') @mock.patch('morpheus.controllers.file_to_df_controller.single_object_to_dataframe') def test_get_or_create_dataframe_from_batch_none_noop(mock_obf_to_df: mock.MagicMock, mock_dask_cluster: mock.MagicMock, diff --git a/tests/test_column_info.py b/tests/test_column_info.py index 4cd71a9804..4ea8804b78 100644 --- a/tests/test_column_info.py +++ b/tests/test_column_info.py @@ -60,68 +60,6 @@ def azure_ad_logs_cdf_fixture(_azure_ad_logs_pdf: pd.DataFrame): yield cudf.from_pandas(_azure_ad_logs_pdf) -@pytest.mark.use_python -def test_dataframe_input_schema_with_json_cols(azure_ad_logs_cdf: cudf.DataFrame): - raw_data_columns = [ - 'time', - 'resourceId', - 'operationName', - 'operationVersion', - 'category', - 'tenantId', - 'resultType', - 'resultSignature', - 'resultDescription', - 'durationMs', - 'callerIpAddress', - 'correlationId', - 'identity', - 'Level', - 'location', - 'properties' - ] - - assert len(azure_ad_logs_cdf.columns) == 16 - assert list(azure_ad_logs_cdf.columns) == raw_data_columns - - column_info = [ - DateTimeColumn(name="timestamp", dtype='datetime64[ns]', input_name="time"), - RenameColumn(name="userId", dtype='str', input_name="properties.userPrincipalName"), - RenameColumn(name="appDisplayName", dtype='str', input_name="properties.appDisplayName"), - ColumnInfo(name="category", dtype='str'), - RenameColumn(name="clientAppUsed", dtype='str', input_name="properties.clientAppUsed"), - RenameColumn(name="deviceDetailbrowser", dtype='str', input_name="properties.deviceDetail.browser"), - RenameColumn(name="deviceDetaildisplayName", dtype='str', input_name="properties.deviceDetail.displayName"), - RenameColumn(name="deviceDetailoperatingSystem", - dtype='str', - input_name="properties.deviceDetail.operatingSystem"), - StringCatColumn(name="location", - dtype='str', - input_columns=[ - "properties.location.city", - "properties.location.countryOrRegion", - ], - sep=", "), - RenameColumn(name="statusfailureReason", dtype='str', input_name="properties.status.failureReason"), - ] - - schema = DataFrameInputSchema(json_columns=["properties"], column_info=column_info) - - df_processed_schema = process_dataframe(azure_ad_logs_cdf, schema) - processed_df_cols = df_processed_schema.columns - - assert len(azure_ad_logs_cdf) == len(df_processed_schema) - assert len(processed_df_cols) == len(column_info) - assert "timestamp" in processed_df_cols - assert "userId" in processed_df_cols - assert "time" not in processed_df_cols - assert "properties.userPrincipalName" not in processed_df_cols - - nvt_workflow = create_and_attach_nvt_workflow(schema) - df_processed_workflow = process_dataframe(azure_ad_logs_cdf, nvt_workflow) - assert df_processed_schema.equals(df_processed_workflow) - - @pytest.mark.use_python def test_dataframe_input_schema_without_json_cols(azure_ad_logs_pdf: pd.DataFrame): assert len(azure_ad_logs_pdf.columns) == 16 diff --git a/tests/test_downloader.py b/tests/test_downloader.py index 4534b88cbb..9d61401f59 100644 --- a/tests/test_downloader.py +++ b/tests/test_downloader.py @@ -38,14 +38,6 @@ def dask_distributed(fail_missing: bool): 
fail_missing=fail_missing) -@pytest.fixture(autouse=True, scope='session') -def dask_cuda(fail_missing: bool): - """ - Mark tests requiring dask_cuda - """ - yield import_or_skip("dask_cuda", reason="Downloader requires dask_cuda", fail_missing=fail_missing) - - @pytest.mark.usefixtures("restore_environ") @pytest.mark.parametrize('use_env', [True, False]) @pytest.mark.parametrize('dl_method', ["single_thread", "dask", "dask_thread"]) @@ -90,7 +82,7 @@ def test_constructor_invalid_dltype(use_env: bool): @pytest.mark.reload_modules(morpheus.utils.downloader) @pytest.mark.parametrize("dl_method", ["dask", "dask_thread"]) @pytest.mark.usefixtures("reload_modules") -@mock.patch('dask_cuda.LocalCUDACluster') +@mock.patch('dask.distributed.LocalCluster') def test_get_dask_cluster(mock_dask_cluster: mock.MagicMock, dl_method: str): mock_dask_cluster.return_value = mock_dask_cluster downloader1 = Downloader(download_method=dl_method) @@ -107,7 +99,7 @@ def test_get_dask_cluster(mock_dask_cluster: mock.MagicMock, dl_method: str): @pytest.mark.reload_modules(morpheus.utils.downloader) @pytest.mark.parametrize('dl_method', ["dask", "dask_thread"]) @pytest.mark.usefixtures("reload_modules") -@mock.patch('dask_cuda.LocalCUDACluster') +@mock.patch('dask.distributed.LocalCluster') def test_close(mock_dask_cluster: mock.MagicMock, dl_method: str): mock_dask_cluster.return_value = mock_dask_cluster downloader = Downloader(download_method=dl_method) @@ -117,7 +109,7 @@ def test_close(mock_dask_cluster: mock.MagicMock, dl_method: str): downloader.close() -@mock.patch('dask_cuda.LocalCUDACluster') +@mock.patch('dask.distributed.LocalCluster') @pytest.mark.parametrize('dl_method', ["single_thread"]) def test_close_noop(mock_dask_cluster: mock.MagicMock, dl_method: str): mock_dask_cluster.return_value = mock_dask_cluster @@ -135,7 +127,7 @@ def test_close_noop(mock_dask_cluster: mock.MagicMock, dl_method: str): @pytest.mark.parametrize('dl_method', ["single_thread", "dask", "dask_thread"]) @mock.patch('dask.config') @mock.patch('dask.distributed.Client') -@mock.patch('dask_cuda.LocalCUDACluster') +@mock.patch('dask.distributed.LocalCluster') def test_download(mock_dask_cluster: mock.MagicMock, mock_dask_client: mock.MagicMock, mock_dask_config: mock.MagicMock, @@ -177,6 +169,8 @@ def test_download(mock_dask_cluster: mock.MagicMock, mock_dask_client.assert_not_called() mock_dask_config.assert_not_called() + assert results == [returnd_df for _ in range(num_buckets)] + @pytest.mark.usefixtures("restore_environ") @pytest.mark.parametrize('use_env', [True, False]) diff --git a/tests/utils/nvt/__init__.py b/tests/utils/nvt/__init__.py deleted file mode 100644 index 66061e580b..0000000000 --- a/tests/utils/nvt/__init__.py +++ /dev/null @@ -1,13 +0,0 @@ -# Copyright (c) 2022-2024, NVIDIA CORPORATION. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
diff --git a/tests/utils/nvt/integration/__init__.py b/tests/utils/nvt/integration/__init__.py deleted file mode 100644 index 66061e580b..0000000000 --- a/tests/utils/nvt/integration/__init__.py +++ /dev/null @@ -1,13 +0,0 @@ -# Copyright (c) 2022-2024, NVIDIA CORPORATION. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. diff --git a/tests/utils/nvt/integration/test_mutate_op.py b/tests/utils/nvt/integration/test_mutate_op.py deleted file mode 100644 index ce0808681b..0000000000 --- a/tests/utils/nvt/integration/test_mutate_op.py +++ /dev/null @@ -1,65 +0,0 @@ -# Copyright (c) 2022-2024, NVIDIA CORPORATION. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import typing - -import pandas as pd -import pytest -from merlin.dag import ColumnSelector - -import cudf - -from morpheus.utils.nvt.mutate import MutateOp -from morpheus.utils.nvt.transforms import json_flatten - - -@pytest.fixture(name="json_data") -def json_data_fixture(): - yield [ - '{"key1": "value1", "key2": {"subkey1": "subvalue1", "subkey2": "subvalue2"}}', - '{"key1": "value2", "key2": {"subkey1": "subvalue3", "subkey2": "subvalue4"}}', - '{"key1": "value3", "key2": {"subkey1": "subvalue5", "subkey2": "subvalue6"}}' - ] - - -@pytest.fixture(name="expected_pdf") -def expected_pdf_fixture(): - yield pd.DataFrame({ - 'col1.key1': ['value1', 'value2', 'value3'], - 'col1.key2.subkey1': ['subvalue1', 'subvalue3', 'subvalue5'], - 'col1.key2.subkey2': ['subvalue2', 'subvalue4', 'subvalue6'] - }) - - -def test_integration_pandas(json_data: typing.List[str], expected_pdf: pd.DataFrame): - pdf = pd.DataFrame({'col1': json_data}) - col_selector = ColumnSelector(['col1']) - - nvt_op = MutateOp(json_flatten, [("col1.key1", "object"), ("col1.key2.subkey1", "object"), - ("col1.key2.subkey2", "object")]) - result_pdf = nvt_op.transform(col_selector, pdf) - - assert result_pdf.equals(expected_pdf), "Integration test with pandas DataFrame failed" - - -def test_integration_cudf(json_data: typing.List[str], expected_pdf: pd.DataFrame): - cdf = cudf.DataFrame({'col1': json_data}) - col_selector = ColumnSelector(['col1']) - - nvt_op = MutateOp(json_flatten, [("col1.key1", "object"), ("col1.key2.subkey1", "object"), - ("col1.key2.subkey2", "object")]) - result_cdf = nvt_op.transform(col_selector, cdf) - result_pdf = result_cdf.to_pandas() - - assert result_pdf.equals(expected_pdf), "Integration test with cuDF DataFrame failed" diff --git a/tests/utils/nvt/test_json_flatten_transform.py b/tests/utils/nvt/test_json_flatten_transform.py deleted file mode 100644 
index e0657925f5..0000000000 --- a/tests/utils/nvt/test_json_flatten_transform.py +++ /dev/null @@ -1,68 +0,0 @@ -# Copyright (c) 2022-2024, NVIDIA CORPORATION. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import warnings - -import pandas as pd -import pytest - -with warnings.catch_warnings(): - # Ignore warning regarding tensorflow not being installed - warnings.filterwarnings("ignore", message=".*No module named 'tensorflow'", category=UserWarning) - from nvtabular.ops.operator import ColumnSelector - -import cudf - -from morpheus.utils.nvt.transforms import json_flatten - - -@pytest.fixture(name="data") -def data_fixture(): - yield { - "id": [1, 2], - "info": [ - '{"name": "John", "age": 30, "city": "New York"}', '{"name": "Jane", "age": 28, "city": "San Francisco"}' - ] - } - - -def test_json_flatten_pandas(data: dict): - df = pd.DataFrame(data) - col_selector = ColumnSelector(["info"]) - result = json_flatten(col_selector, df) - - expected_data = {"info.name": ["John", "Jane"], "info.age": [30, 28], "info.city": ["New York", "San Francisco"]} - expected_df = pd.DataFrame(expected_data) - - pd.testing.assert_frame_equal(result, expected_df) - - -def test_json_flatten_cudf(data: dict): - df = cudf.DataFrame(data) - col_selector = ColumnSelector(["info"]) - result = json_flatten(col_selector, df) - - expected_data = { - "id": [1, 2], "info.name": ["John", "Jane"], "info.age": [30, 28], "info.city": ["New York", "San Francisco"] - } - expected_df = cudf.DataFrame(expected_data) - - assert_frame_equal(result, expected_df) - - -def assert_frame_equal(df1, df2): - assert len(df1) == len(df2), "DataFrames have different lengths" - for col in df1.columns: - assert col in df2, f"Column {col} not found in the second DataFrame" - assert (df1[col] == df2[col]).all(), f"Column {col} values do not match" diff --git a/tests/utils/nvt/test_mutate_op.py b/tests/utils/nvt/test_mutate_op.py deleted file mode 100644 index 3023d9701e..0000000000 --- a/tests/utils/nvt/test_mutate_op.py +++ /dev/null @@ -1,120 +0,0 @@ -# Copyright (c) 2022-2024, NVIDIA CORPORATION. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import warnings - -import numpy as np -import pandas as pd -import pytest - -with warnings.catch_warnings(): - # Ignore warning regarding tensorflow not being installed - warnings.filterwarnings("ignore", message=".*No module named 'tensorflow'", category=UserWarning) - from merlin.core.dispatch import DataFrameType - from merlin.schema import ColumnSchema - from merlin.schema import Schema - from nvtabular.ops.operator import ColumnSelector - -from morpheus.utils.nvt.mutate import MutateOp - - -@pytest.fixture(name="df") -def df_fixture(): - yield pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6], 'C': [7, 8, 9]}) - - -def example_transform(col_selector: ColumnSelector, df: DataFrameType) -> DataFrameType: - selected_columns = col_selector.names - for col in selected_columns: - df[col + '_new'] = df[col] * 2 - return df - - -def test_transform(df: DataFrameType): - nvt_op = MutateOp(example_transform, output_columns=[('A_new', np.dtype('int64')), ('B_new', np.dtype('int64'))]) - col_selector = ColumnSelector(['A', 'B']) - transformed_df = nvt_op.transform(col_selector, df) - - expected_df = df.copy() - expected_df['A_new'] = df['A'] * 2 - expected_df['B_new'] = df['B'] * 2 - - assert transformed_df.equals(expected_df), "Test transform failed" - - -# Test for lambda function transformation -def test_transform_lambda(df: DataFrameType): - nvt_op = MutateOp(lambda col_selector, - df: df.assign(**{f"{col}_new": df[col] * 2 - for col in col_selector.names}), - output_columns=[('A_new', np.dtype('int64')), ('B_new', np.dtype('int64'))]) - col_selector = ColumnSelector(['A', 'B']) - transformed_df = nvt_op.transform(col_selector, df) - - expected_df = df.copy() - expected_df['A_new'] = df['A'] * 2 - expected_df['B_new'] = df['B'] * 2 - - assert transformed_df.equals(expected_df), "Test transform with lambda failed" - - -def test_transform_additional_columns(df: DataFrameType): - - def additional_transform(col_selector: ColumnSelector, df: DataFrameType) -> DataFrameType: - selected_columns = col_selector.names - for col in selected_columns: - df[col + '_new'] = df[col] * 2 - df['D'] = df['A'] + df['B'] - return df - - nvt_op = MutateOp(additional_transform, - output_columns=[('A_new', np.dtype('int64')), ('B_new', np.dtype('int64')), - ('D', np.dtype('int64'))]) - col_selector = ColumnSelector(['A', 'B']) - transformed_df = nvt_op.transform(col_selector, df) - - expected_df = df.copy() - expected_df['A_new'] = df['A'] * 2 - expected_df['B_new'] = df['B'] * 2 - expected_df['D'] = df['A'] + df['B'] - - assert transformed_df.equals(expected_df), "Test transform with additional columns failed" - - -def test_column_mapping(): - nvt_op = MutateOp(example_transform, output_columns=[('A_new', np.dtype('int64')), ('B_new', np.dtype('int64'))]) - col_selector = ColumnSelector(['A', 'B']) - column_mapping = nvt_op.column_mapping(col_selector) - - expected_mapping = {'A_new': ['A', 'B'], 'B_new': ['A', 'B']} - - assert column_mapping == expected_mapping, "Test column mapping failed" - - -def test_compute_output_schema(): - nvt_op = MutateOp(example_transform, output_columns=[('A_new', np.dtype('int64')), ('B_new', np.dtype('int64'))]) - col_selector = ColumnSelector(['A', 'B']) - - input_schema = Schema([ - ColumnSchema('A', dtype=np.dtype('int64')), - ColumnSchema('B', dtype=np.dtype('int64')), - ColumnSchema('C', dtype=np.dtype('int64')) - ]) - - output_schema = nvt_op.compute_output_schema(input_schema, col_selector) - - expected_schema = Schema( - [ColumnSchema('A_new', dtype=np.dtype('int64')), 
ColumnSchema('B_new', dtype=np.dtype('int64'))]) - - assert str(output_schema) == str(expected_schema), "Test compute output schema failed" diff --git a/tests/utils/nvt/test_schema_converters.py b/tests/utils/nvt/test_schema_converters.py deleted file mode 100644 index 9b00440d1a..0000000000 --- a/tests/utils/nvt/test_schema_converters.py +++ /dev/null @@ -1,661 +0,0 @@ -# Copyright (c) 2022-2024, NVIDIA CORPORATION. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import json -import warnings - -with warnings.catch_warnings(): - # Ignore warning regarding tensorflow not being installed - warnings.filterwarnings("ignore", message=".*No module named 'tensorflow'", category=UserWarning) - import nvtabular as nvt - -import pandas as pd -import pytest - -import cudf - -from morpheus.utils.column_info import BoolColumn -from morpheus.utils.column_info import ColumnInfo -from morpheus.utils.column_info import DataFrameInputSchema -from morpheus.utils.column_info import DateTimeColumn -from morpheus.utils.column_info import DistinctIncrementColumn -from morpheus.utils.column_info import IncrementColumn -from morpheus.utils.column_info import PreparedDFInfo -from morpheus.utils.column_info import RenameColumn -from morpheus.utils.column_info import StringCatColumn -from morpheus.utils.column_info import StringJoinColumn -from morpheus.utils.column_info import _resolve_json_output_columns -from morpheus.utils.nvt.schema_converters import JSONFlattenInfo -from morpheus.utils.nvt.schema_converters import _bfs_traversal_with_op_map -from morpheus.utils.nvt.schema_converters import _build_nx_dependency_graph -from morpheus.utils.nvt.schema_converters import _coalesce_leaf_nodes -from morpheus.utils.nvt.schema_converters import _get_ci_column_selector -from morpheus.utils.nvt.schema_converters import create_and_attach_nvt_workflow -from morpheus.utils.nvt.schema_converters import sync_df_as_pandas -from morpheus.utils.schema_transforms import process_dataframe - -source_column_info = [ - BoolColumn(name="result", - dtype="bool", - input_name="result", - true_values=["success", "SUCCESS"], - false_values=["denied", "Denied", "DENIED", "FRAUD"]), - ColumnInfo(name="reason", dtype=str), - DateTimeColumn(name="timestamp", dtype="datetime64[us]", input_name="timestamp"), - StringCatColumn( - name="location", - dtype="str", - input_columns=["access_device.location.city", "access_device.location.state", "access_device.location.country"], - sep=", "), - RenameColumn(name="authdevicename", dtype="str", input_name="auth_device.name"), - RenameColumn(name="username", dtype="str", input_name="user.name"), - RenameColumn(name="accessdevicebrowser", dtype="str", input_name="access_device.browser"), - RenameColumn(name="accessdeviceos", dtype="str", input_name="access_device.os"), -] - - -def create_test_dataframe(): - return pd.DataFrame({ - "access_device": [ - '{"browser": "Firefox", "os": "Linux", "location": ' - '{"city": "San Francisco", "state": "CA", "country": "USA"}}' - ], - "application": ['{"name": 
"AnotherApp"}'], - "auth_device": ['{"name": "Device2"}'], - "user": ['{"name": "Jane Smith"}'], - "timestamp": [pd.Timestamp("2021-02-02 12:00:00")], - "result": ["denied"], - "reason": ["Denied"] - }) - - -def test_sync_df_as_pandas_pd_dataframe(): - - @sync_df_as_pandas() - def test_func(df: pd.DataFrame, value: int) -> pd.DataFrame: - df['test_col'] = df['test_col'] * value - return df - - df = pd.DataFrame({'test_col': [1, 2, 3]}) - result = test_func(df, value=2) - expected = pd.DataFrame({'test_col': [2, 4, 6]}) - pd.testing.assert_frame_equal(result, expected) - - -def test_sync_df_as_pandas_cudf_dataframe(): - - @sync_df_as_pandas() - def test_func(df: pd.DataFrame, value: int) -> pd.DataFrame: - df['test_col'] = df['test_col'] * value - return df - - df = cudf.DataFrame({'test_col': [1, 2, 3]}) - result = test_func(df, value=2) - expected = cudf.DataFrame({'test_col': [2, 4, 6]}) - cudf.testing.assert_frame_equal(result, expected) - - -def test_json_flatten_info_init(): - col_info = JSONFlattenInfo(name="json_info", - dtype="str", - input_col_names=["json_col1.a", "json_col2.b"], - output_col_names=["json_output_col1", "json_output_col2"]) - assert col_info.name == "json_info" - assert col_info.dtype == "str" - assert col_info.input_col_names == ["json_col1.a", "json_col2.b"] - assert col_info.output_col_names == ["json_output_col1", "json_output_col2"] - - -def test_json_flatten_info_init_missing_input_col_names(): - with pytest.raises(TypeError): - # pylint: disable=no-value-for-parameter - # pylint: disable=unused-variable - col_info = JSONFlattenInfo( # noqa F841 - name="json_info", dtype="str", output_col_names=["json_output_col1", "json_output_col2"]) - - -def test_json_flatten_info_init_missing_output_col_names(): - with pytest.raises(TypeError): - # pylint: disable=no-value-for-parameter - # pylint: disable=unused-variable - col_info = JSONFlattenInfo( # noqa F841 - name="json_info", dtype="str", input_col_names=["json_col1.a", "json_col2.b"]) - - -def test_get_ci_column_selector_rename_column(): - col_info = RenameColumn(input_name="original_name", name="new_name", dtype="str") - result = _get_ci_column_selector(col_info) - assert result == ["original_name"] - - -def test_get_ci_column_selector_bool_column(): - col_info = BoolColumn(input_name="original_name", - name="new_name", - dtype="bool", - true_values=["True"], - false_values=["False"]) - result = _get_ci_column_selector(col_info) - assert result == ["original_name"] - - -def test_get_ci_column_selector_datetime_column(): - col_info = DateTimeColumn(input_name="original_name", name="new_name", dtype="datetime64[ns]") - result = _get_ci_column_selector(col_info) - assert result == ["original_name"] - - -def test_get_ci_column_selector_string_join_column(): - col_info = StringJoinColumn(input_name="original_name", name="new_name", dtype="str", sep=",") - result = _get_ci_column_selector(col_info) - assert result == ["original_name"] - - -def test_get_ci_column_selector_increment_column(): - col_info = IncrementColumn(input_name="original_name", - name="new_name", - dtype="datetime64[ns]", - groupby_column="groupby_col") - result = _get_ci_column_selector(col_info) - assert result == ["original_name", "groupby_col"] - - -def test_get_ci_column_selector_distinct_increment_column(): - col_info = DistinctIncrementColumn(input_name="original_name", - name="new_name", - dtype="datetime64[ns]", - groupby_column="groupby_col", - timestamp_column="timestamp_col") - result = _get_ci_column_selector(col_info) - assert result 
== ["original_name", "groupby_col", "timestamp_col"] - - -def test_get_ci_column_selector_string_cat_column(): - col_info = StringCatColumn(name="new_name", dtype="str", input_columns=["col1", "col2"], sep=", ") - result = _get_ci_column_selector(col_info) - assert result == ["col1", "col2"] - - -def test_get_ci_column_selector_json_flatten_info(): - col_info = JSONFlattenInfo(name="json_info", - dtype="str", - input_col_names=["json_col1.a", "json_col2.b"], - output_col_names=["json_col1_a", "json_col2_b"]) - result = _get_ci_column_selector(col_info) - assert result == ["json_col1.a", "json_col2.b"] - - -def test_resolve_json_output_columns(): - input_schema = DataFrameInputSchema(json_columns=["json_col"], - column_info=[ - BoolColumn(input_name="bool_col", - name="bool_col", - dtype="bool", - true_values=["True"], - false_values=["False"]), - DateTimeColumn(input_name="datetime_col", - name="datetime_col", - dtype="datetime64[ns]"), - RenameColumn(input_name="json_col.a", name="new_rename_col", dtype="str"), - StringCatColumn(name="new_str_cat_col", - dtype="str", - input_columns=["A", "B"], - sep=", "), - ]) - - output_cols = _resolve_json_output_columns(input_schema.json_columns, input_schema.input_columns) - expected_output_cols = [ - ("json_col.a", "str"), - ] - assert output_cols == expected_output_cols - - -def test_resolve_json_output_columns_empty_input_schema(): - input_schema = DataFrameInputSchema() - output_cols = _resolve_json_output_columns(input_schema.json_columns, input_schema.input_columns) - assert not output_cols - - -def test_resolve_json_output_columns_no_json_columns(): - input_schema = DataFrameInputSchema( - column_info=[ColumnInfo(name="column1", dtype="int"), ColumnInfo(name="column2", dtype="str")]) - output_cols = _resolve_json_output_columns(input_schema.json_columns, input_schema.input_columns) - assert not output_cols - - -def test_resolve_json_output_columns_with_json_columns(): - input_schema = DataFrameInputSchema(json_columns=["json_col"], - column_info=[ - ColumnInfo(name="json_col.a", dtype="str"), - ColumnInfo(name="json_col.b", dtype="int"), - ColumnInfo(name="column3", dtype="float") - ]) - output_cols = _resolve_json_output_columns(input_schema.json_columns, input_schema.input_columns) - assert output_cols == [("json_col.a", "str"), ("json_col.b", "int")] - - -def test_resolve_json_output_columns_with_complex_schema(): - input_schema = DataFrameInputSchema(json_columns=["json_col"], - column_info=[ - ColumnInfo(name="json_col.a", dtype="str"), - ColumnInfo(name="json_col.b", dtype="int"), - ColumnInfo(name="column3", dtype="float"), - RenameColumn(name="new_column", dtype="str", input_name="column4") - ]) - output_cols = _resolve_json_output_columns(input_schema.json_columns, input_schema.input_columns) - assert output_cols == [("json_col.a", "str"), ("json_col.b", "int")] - - -def test_bfs_traversal_with_op_map(): - input_schema = DataFrameInputSchema(json_columns=["access_device", "application", "auth_device", "user"], - column_info=source_column_info) - - column_info_objects = list(input_schema.column_info) - column_info_map = {col_info.name: col_info for col_info in column_info_objects} - graph = _build_nx_dependency_graph(column_info_objects) - root_nodes = [node for node, in_degree in graph.in_degree() if in_degree == 0] - visited, node_op_map = _bfs_traversal_with_op_map(graph, column_info_map, root_nodes) - - # Check if all nodes have been visited - assert len(visited) == len(column_info_map) - - # Check if node_op_map is constructed 
for all nodes - assert len(node_op_map) == len(column_info_map) - - -def test_coalesce_leaf_nodes(): - input_schema = DataFrameInputSchema(json_columns=["access_device", "application", "auth_device", "user"], - column_info=source_column_info) - - column_info_objects = list(input_schema.column_info) - column_info_map = {col_info.name: col_info for col_info in column_info_objects} - graph = _build_nx_dependency_graph(column_info_objects) - root_nodes = [node for node, in_degree in graph.in_degree() if in_degree == 0] - - # Call bfs_traversal_with_op_map() and coalesce_leaf_nodes() - _, node_op_map = _bfs_traversal_with_op_map(graph, column_info_map, root_nodes) - coalesced_workflow = _coalesce_leaf_nodes(node_op_map, column_info_objects) - - # Check if the coalesced workflow is not None - assert coalesced_workflow is not None - - # Extract the leaf nodes from the coalesced workflow - leaf_nodes = [] - for node, _ in node_op_map.items(): - neighbors = list(graph.neighbors(node)) - if len(neighbors) == 0: - leaf_nodes.append(node) - - # Define the expected leaf node names - expected_leaf_node_names = [ - "result", - "reason", - "timestamp", - "location", - "authdevicename", - "username", - "accessdevicebrowser", - "accessdeviceos", - ] - - # Compare the expected leaf node names with the actual leaf node names - assert set(leaf_nodes) == set(expected_leaf_node_names) - - -def test_input_schema_conversion_empty_schema(): - empty_schema = DataFrameInputSchema() - - # pylint: disable=unused-variable - empty_schema = create_and_attach_nvt_workflow(empty_schema) # noqa - - -def test_input_schema_conversion_additional_column(): - additional_columns = [ - RenameColumn(name="appname", dtype="str", input_name="application.name"), - ] - - modified_source_column_info = source_column_info + additional_columns - - modified_schema = DataFrameInputSchema(json_columns=["access_device", "application", "auth_device", "user"], - column_info=modified_source_column_info) - test_df = create_test_dataframe() - - output_df = process_dataframe(test_df, modified_schema) - - expected_df = pd.DataFrame({ - "result": [False], - "reason": ["Denied"], - "timestamp": [pd.Timestamp("2021-02-02 12:00:00")], - "location": ["San Francisco, CA, USA"], - "authdevicename": ["Device2"], - "username": ["Jane Smith"], - "accessdevicebrowser": ["Firefox"], - "accessdeviceos": ["Linux"], - "appname": ["AnotherApp"] - }) - - pd.testing.assert_frame_equal(output_df, expected_df) - - -def test_input_schema_conversion_interdependent_columns(): - additional_column_1 = StringCatColumn(name="fullname", - dtype="str", - input_columns=["user.firstname", "user.lastname"], - sep=" ") - additional_column_2 = StringCatColumn(name="appinfo", - dtype="str", - input_columns=["application.name", "application.version"], - sep="-") - - modified_source_column_info = source_column_info + [additional_column_1, additional_column_2] - - modified_schema = DataFrameInputSchema(json_columns=["access_device", "application", "auth_device", "user"], - column_info=modified_source_column_info) - - test_df = create_test_dataframe() - test_df["user"] = ['{"firstname": "Jane", "lastname": "Smith", "name": "Jane Smith"}'] - test_df["application"] = ['{"name": "AnotherApp", "version": "1.0"}'] - - modified_schema = create_and_attach_nvt_workflow(modified_schema) - prepared_df_info: PreparedDFInfo = modified_schema.prep_dataframe(test_df) - dataset = nvt.Dataset(prepared_df_info.df) - output_df = modified_schema.nvt_workflow.transform(dataset).to_ddf().compute().to_pandas() 
- - expected_df = pd.DataFrame({ - "result": [False], - "reason": ["Denied"], - "timestamp": [pd.Timestamp("2021-02-02 12:00:00")], - "location": ["San Francisco, CA, USA"], - "authdevicename": ["Device2"], - "username": ["Jane Smith"], - "accessdevicebrowser": ["Firefox"], - "accessdeviceos": ["Linux"], - "fullname": ["Jane Smith"], - "appinfo": ["AnotherApp-1.0"] - }) - - pd.testing.assert_frame_equal(output_df, expected_df) - - -def test_input_schema_conversion_nested_operations(): - app_column = ColumnInfo(name="application.name", dtype="str") - additional_column = StringCatColumn(name="appname", - dtype="str", - input_columns=["application.name", "appsuffix"], - sep="") - modified_source_column_info = source_column_info + [additional_column, app_column] - - modified_schema = DataFrameInputSchema(json_columns=["access_device", "application", "auth_device", "user"], - column_info=modified_source_column_info) - - test_df = create_test_dataframe() - test_df["appsuffix"] = ["_v1"] - - # Add the 'appsuffix' column to the schema - modified_schema.column_info.append(ColumnInfo(name="appsuffix", dtype="str")) - - modified_schema = create_and_attach_nvt_workflow(modified_schema) - prepared_df_info: PreparedDFInfo = modified_schema.prep_dataframe(test_df) - dataset = nvt.Dataset(prepared_df_info.df) - output_df = modified_schema.nvt_workflow.transform(dataset).to_ddf().compute().to_pandas() - - expected_df = pd.DataFrame({ - "result": [False], - "reason": ["Denied"], - "timestamp": [pd.Timestamp("2021-02-02 12:00:00")], - "location": ["San Francisco, CA, USA"], - "authdevicename": ["Device2"], - "username": ["Jane Smith"], - "accessdevicebrowser": ["Firefox"], - "accessdeviceos": ["Linux"], - "appname": ["AnotherApp_v1"], - "application.name": ["AnotherApp"], - "appsuffix": ["_v1"] - }) - - pd.testing.assert_frame_equal(output_df, expected_df) - - -def test_input_schema_conversion_root_schema_parent_schema_mix_operations(): - additional_column_1 = StringCatColumn(name="rootcat", - dtype="str", - input_columns=["lhs_top_level", "rhs_top_level"], - sep="-") - additional_column_2 = RenameColumn(name="rhs_top_level", dtype="str", input_name="rhs_top_level_pre") - additional_column_3 = ColumnInfo(name="lhs_top_level", dtype="str") - modified_source_column_info = [additional_column_1, additional_column_2, additional_column_3] - - modified_schema = DataFrameInputSchema(json_columns=[], column_info=modified_source_column_info) - - test_df = create_test_dataframe() - test_df["lhs_top_level"] = ["lhs"] - test_df["rhs_top_level_pre"] = ["rhs"] - - modified_schema = create_and_attach_nvt_workflow(modified_schema) - dataset = nvt.Dataset(test_df) - output_df = modified_schema.nvt_workflow.transform(dataset).to_ddf().compute().to_pandas() - - expected_df = pd.DataFrame({ - "rootcat": ["lhs-rhs"], - "rhs_top_level": ["rhs"], - "lhs_top_level": ["lhs"], - }) - - pd.testing.assert_frame_equal(output_df, expected_df) - - -def test_input_schema_conversion_preserve_column(): - additional_column_1 = StringCatColumn(name="rootcat", - dtype="str", - input_columns=["lhs_top_level", "rhs_top_level"], - sep="-") - additional_column_2 = RenameColumn(name="rhs_top_level", dtype="str", input_name="rhs_top_level_pre") - additional_column_3 = ColumnInfo(name="lhs_top_level", dtype="str") - modified_source_column_info = [additional_column_1, additional_column_2, additional_column_3] - - modified_schema = DataFrameInputSchema(json_columns=[], - column_info=modified_source_column_info, - preserve_columns=["to_preserve"]) - - 
test_df = create_test_dataframe() - test_df["lhs_top_level"] = ["lhs"] - test_df["rhs_top_level_pre"] = ["rhs"] - test_df["to_preserve"] = ["preserve me"] - - modified_schema = create_and_attach_nvt_workflow(modified_schema) - dataset = nvt.Dataset(test_df) - output_df = modified_schema.nvt_workflow.transform(dataset).to_ddf().compute().to_pandas() - - # See issue #1074. This should include the `to_preserve` column, but it doesn't. - expected_df = pd.DataFrame({ - "rootcat": ["lhs-rhs"], - "rhs_top_level": ["rhs"], - "lhs_top_level": ["lhs"], # "to_preserve": ["preserve me"], - }) - - pd.testing.assert_frame_equal(output_df, expected_df) - - -# Test the conversion of a DataFrameInputSchema to an nvt.Workflow -def test_input_schema_conversion(): - # Create a DataFrameInputSchema instance with the example schema provided - example_schema = DataFrameInputSchema(json_columns=["access_device", "application", "auth_device", "user"], - column_info=source_column_info) - - # Create a test dataframe with data according to the schema - test_df = pd.DataFrame({ - "access_device": [ - '{"browser": "Chrome", "os": "Windows", "location": {"city": "New York", "state": "NY", "country": "USA"}}' - ], - "application": ['{"name": "TestApp"}'], - "auth_device": ['{"name": "Device1"}'], - "user": ['{"name": "John Doe"}'], - "timestamp": [pd.Timestamp("2021-01-01 00:00:00")], - "result": ["SUCCESS"], - "reason": ["Authorized"] - }) - - # Call `input_schema_to_nvt_workflow` with the created instance - modified_schema = create_and_attach_nvt_workflow(example_schema) - - # Apply the returned nvt.Workflow to the test dataframe - prepared_df_info: PreparedDFInfo = modified_schema.prep_dataframe(test_df) - dataset = nvt.Dataset(prepared_df_info.df) - output_df = modified_schema.nvt_workflow.transform(dataset).to_ddf().compute().to_pandas() - - # Check if the output dataframe has the expected schema and values - expected_df = pd.DataFrame({ - "result": [True], - "reason": ["Authorized"], - "timestamp": [pd.Timestamp("2021-01-01 00:00:00")], - "location": ["New York, NY, USA"], - "authdevicename": ["Device1"], - "username": ["John Doe"], - "accessdevicebrowser": ["Chrome"], - "accessdeviceos": ["Windows"], - }) - - pd.set_option('display.max_columns', None) - pd.testing.assert_frame_equal(output_df, expected_df) - - -def test_input_schema_conversion_with_trivial_filter(): - # Create a DataFrameInputSchema instance with the example schema provided - example_schema = DataFrameInputSchema(json_columns=["access_device", "application", "auth_device", "user"], - column_info=source_column_info, - row_filter=lambda df: df) - - # Create a test dataframe with data according to the schema - test_df = pd.DataFrame({ - "access_device": [ - '{"browser": "Chrome", "os": "Windows", "location": {"city": "New York", "state": "NY", "country": "USA"}}' - ], - "application": ['{"name": "TestApp"}'], - "auth_device": ['{"name": "Device1"}'], - "user": ['{"name": "John Doe"}'], - "timestamp": [pd.Timestamp("2021-01-01 00:00:00")], - "result": ["SUCCESS"], - "reason": ["Authorized"] - }) - - output_df = process_dataframe(test_df, example_schema) - - # Check if the output dataframe has the expected schema and values - expected_df = pd.DataFrame({ - "result": [True], - "reason": ["Authorized"], - "timestamp": [pd.Timestamp("2021-01-01 00:00:00")], - "location": ["New York, NY, USA"], - "authdevicename": ["Device1"], - "username": ["John Doe"], - "accessdevicebrowser": ["Chrome"], - "accessdeviceos": ["Windows"], - }) - - 
pd.set_option('display.max_columns', None) - pd.testing.assert_frame_equal(output_df, expected_df) - - -def test_input_schema_conversion_with_functional_filter(): - # Create a DataFrameInputSchema instance with the example schema provided - example_schema = DataFrameInputSchema( - json_columns=["access_device", "application", "auth_device", "user"], - column_info=source_column_info, - # pylint: disable=singleton-comparison - row_filter=lambda df: df[df["result"] == True]) # noqa E712 - - # Create a test dataframe with data according to the schema - test_df = pd.DataFrame({ - "access_device": [ - '{"browser": "Chrome", "os": "Windows", "location": {"city": "New York", "state": "NY", "country": "USA"}}', - '{"browser": "Firefox", "os": "Linux", "location": ' - '{"city": "San Francisco", "state": "CA", "country": "USA"}}' - ], - "application": ['{"name": "TestApp"}', '{"name": "AnotherApp"}'], - "auth_device": ['{"name": "Device1"}', '{"name": "Device2"}'], - "user": ['{"name": "John Doe"}', '{"name": "Jane Smith"}'], - "timestamp": [pd.Timestamp("2021-01-01 00:00:00"), pd.Timestamp("2021-02-02 12:00:00")], - "result": ["SUCCESS", "FAILURE"], - "reason": ["Authorized", "Unauthorized"] - }) - - # Call `input_schema_to_nvt_workflow` with the created instance - example_schema = create_and_attach_nvt_workflow(example_schema) - - # Apply the returned nvt.Workflow to the test dataframe - prepared_df_info: PreparedDFInfo = example_schema.prep_dataframe(test_df) - dataset = nvt.Dataset(prepared_df_info.df) - output_df = example_schema.nvt_workflow.transform(dataset).to_ddf().compute().to_pandas() - - # Check if the output dataframe has the expected schema and values - expected_df = pd.DataFrame({ - "result": [True], - "reason": ["Authorized"], - "timestamp": [pd.Timestamp("2021-01-01 00:00:00")], - "location": ["New York, NY, USA"], - "authdevicename": ["Device1"], - "username": ["John Doe"], - "accessdevicebrowser": ["Chrome"], - "accessdeviceos": ["Windows"], - }) - - pd.set_option('display.max_columns', None) - pd.testing.assert_frame_equal(output_df, expected_df) - - -def test_input_schema_conversion_with_filter_and_index(): - # Create a DataFrameInputSchema instance with the example schema provided - example_schema = DataFrameInputSchema( - json_columns=["access_device"], - column_info=[ - BoolColumn(name="result", - dtype="bool", - input_name="result", - true_values=["success", "SUCCESS"], - false_values=["denied", "Denied", "DENIED", "FRAUD"]), - RenameColumn(name="accessdeviceos", dtype="str", input_name="access_device.os"), - ], - # pylint: disable=singleton-comparison - row_filter=lambda df: df[df["result"] == True]) # noqa E712 - - # Create a test dataframe with data according to the schema - test_df = pd.DataFrame({ - "access_device": [ - '{"browser": "Chrome", "os": "Windows", "location": {"city": "New York", "state": "NY", "country": "USA"}}', - '{"browser": "Firefox", "os": "Linux", "location": ' - '{"city": "San Francisco", "state": "CA", "country": "USA"}}', - '{"browser": "Chrome", "os": "Windows", "location": {"city": "New York", "state": "NY", "country": "USA"}}', - '{"browser": "Firefox", "os": "Linux", "location": ' - '{"city": "San Francisco", "state": "CA", "country": "USA"}}', - ], - "result": ["SUCCESS", "FAILURE", "FAILURE", "SUCCESS"], - }) - - # Offset the index - test_df.index += 5 - - # Apply the returned nvt.Workflow to the test dataframe - output_df = process_dataframe(test_df, example_schema) - - # Check if the output dataframe has the expected schema and values - 
expected_df = test_df.copy() - - # Filter the rows - expected_df = expected_df[expected_df["result"] == "SUCCESS"] - - expected_df["result"] = expected_df["result"] == "SUCCESS" - expected_df["accessdeviceos"] = expected_df["access_device"].apply(lambda x: json.loads(x)["os"]) - expected_df = expected_df[["result", "accessdeviceos"]] - - pd.set_option('display.max_columns', None) - pd.testing.assert_frame_equal(output_df, expected_df) diff --git a/tests/utils/nvt/test_transforms.py b/tests/utils/nvt/test_transforms.py deleted file mode 100644 index 96df15447c..0000000000 --- a/tests/utils/nvt/test_transforms.py +++ /dev/null @@ -1,52 +0,0 @@ -# Copyright (c) 2022-2024, NVIDIA CORPORATION. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import warnings - -import pandas as pd -import pytest - -with warnings.catch_warnings(): - # Ignore warning regarding tensorflow not being installed - warnings.filterwarnings("ignore", message=".*No module named 'tensorflow'", category=UserWarning) - from nvtabular.ops.operator import ColumnSelector - -from _utils.dataset_manager import DatasetManager -from morpheus.utils.nvt.transforms import json_flatten -from morpheus.utils.type_aliases import DataFrameType - - -@pytest.fixture(name="data") -def data_fixture(): - yield { - "id": [1, 2], - "info": [ - '{"name": "John", "age": 30, "city": "New York"}', '{"name": "Jane", "age": 28, "city": "San Francisco"}' - ] - } - - -@pytest.fixture(name="df") -def df_fixture(dataset: DatasetManager, data: dict): - yield dataset.df_class(data) - - -def test_json_flatten(df: DataFrameType): - col_selector = ColumnSelector(["info"]) - result = json_flatten(col_selector, df) - - expected_data = {"info.name": ["John", "Jane"], "info.age": [30, 28], "info.city": ["New York", "San Francisco"]} - expected_df = pd.DataFrame(expected_data) - - DatasetManager.assert_df_equal(result, expected_df) From 769eba156260ec2819a6b34e4fe02cbbdfc04e62 Mon Sep 17 00:00:00 2001 From: Christopher Harris Date: Mon, 9 Sep 2024 17:05:10 +0000 Subject: [PATCH 02/12] finish removing nvtabular related code --- morpheus/utils/downloader.py | 1 - tests/examples/digital_fingerprinting/test_dfp_file_to_df.py | 1 - tests/test_column_info.py | 1 - 3 files changed, 3 deletions(-) diff --git a/morpheus/utils/downloader.py b/morpheus/utils/downloader.py index d352f7ef22..846d73b65a 100644 --- a/morpheus/utils/downloader.py +++ b/morpheus/utils/downloader.py @@ -66,7 +66,6 @@ def __init__(self, download_method: typing.Union[DownloadMethods, str] = DownloadMethods.DASK_THREAD, dask_heartbeat_interval: str = "30s"): - self._merlin_distributed = None self._dask_heartbeat_interval = dask_heartbeat_interval download_method = os.environ.get("MORPHEUS_FILE_DOWNLOAD_TYPE", download_method) diff --git a/tests/examples/digital_fingerprinting/test_dfp_file_to_df.py b/tests/examples/digital_fingerprinting/test_dfp_file_to_df.py index bb5da24aa1..db70e475fe 100644 --- a/tests/examples/digital_fingerprinting/test_dfp_file_to_df.py +++ 
b/tests/examples/digital_fingerprinting/test_dfp_file_to_df.py @@ -110,7 +110,6 @@ def test_constructor(config: Config): @mock.patch('morpheus.utils.downloader.Distributed') @mock.patch('morpheus.controllers.file_to_df_controller.process_dataframe') def test_get_or_create_dataframe_from_batch_cache_miss(mock_proc_df: mock.MagicMock, - mock_distributed: mock.MagicMock, mock_obf_to_df: mock.MagicMock, mock_dask_cluster: mock.MagicMock, mock_dask_client: mock.MagicMock, diff --git a/tests/test_column_info.py b/tests/test_column_info.py index 4ea8804b78..c40e7854ac 100644 --- a/tests/test_column_info.py +++ b/tests/test_column_info.py @@ -35,7 +35,6 @@ from morpheus.utils.column_info import RenameColumn from morpheus.utils.column_info import StringCatColumn from morpheus.utils.column_info import StringJoinColumn -from morpheus.utils.nvt.schema_converters import create_and_attach_nvt_workflow from morpheus.utils.schema_transforms import process_dataframe From 7a0c823ab2621bd4c760fc3fb19910487df8569d Mon Sep 17 00:00:00 2001 From: Christopher Harris Date: Mon, 9 Sep 2024 17:09:08 +0000 Subject: [PATCH 03/12] finish removing nvtabular related code --- morpheus/utils/downloader.py | 3 --- tests/examples/digital_fingerprinting/test_dfp_file_to_df.py | 1 - tests/test_downloader.py | 1 - 3 files changed, 5 deletions(-) diff --git a/morpheus/utils/downloader.py b/morpheus/utils/downloader.py index 846d73b65a..a43ac6ff51 100644 --- a/morpheus/utils/downloader.py +++ b/morpheus/utils/downloader.py @@ -105,9 +105,6 @@ def get_dask_cluster(self): logger.debug("Creating dask cluster...") - n_workers = dask_cuda.utils.get_n_gpus() - threads_per_worker = mp.cpu_count() // n_workers - Downloader._dask_cluster = dask.distributed.LocalCluster(start=True, processes=self.download_method != "dask_thread") diff --git a/tests/examples/digital_fingerprinting/test_dfp_file_to_df.py b/tests/examples/digital_fingerprinting/test_dfp_file_to_df.py index db70e475fe..19fa6add61 100644 --- a/tests/examples/digital_fingerprinting/test_dfp_file_to_df.py +++ b/tests/examples/digital_fingerprinting/test_dfp_file_to_df.py @@ -107,7 +107,6 @@ def test_constructor(config: Config): @mock.patch('dask.distributed.Client') @mock.patch('dask.distributed.LocalCluster') @mock.patch('morpheus.controllers.file_to_df_controller.single_object_to_dataframe') -@mock.patch('morpheus.utils.downloader.Distributed') @mock.patch('morpheus.controllers.file_to_df_controller.process_dataframe') def test_get_or_create_dataframe_from_batch_cache_miss(mock_proc_df: mock.MagicMock, mock_obf_to_df: mock.MagicMock, diff --git a/tests/test_downloader.py b/tests/test_downloader.py index 9d61401f59..451c6cde64 100644 --- a/tests/test_downloader.py +++ b/tests/test_downloader.py @@ -153,7 +153,6 @@ def test_download(mock_dask_cluster: mock.MagicMock, downloader = Downloader(download_method=dl_method) results = downloader.download(download_buckets, download_fn) - assert results == [returnd_df for _ in range(num_buckets)] if dl_method == "single_thread": download_fn.assert_has_calls([mock.call(bucket) for bucket in download_buckets]) From c718e7e410367dd986b9923c139682b01ea15af3 Mon Sep 17 00:00:00 2001 From: Christopher Harris Date: Mon, 9 Sep 2024 17:11:17 +0000 Subject: [PATCH 04/12] finish removing nvtabular related code --- ci/conda/recipes/morpheus/meta.yaml | 1 - 1 file changed, 1 deletion(-) diff --git a/ci/conda/recipes/morpheus/meta.yaml b/ci/conda/recipes/morpheus/meta.yaml index 2d3921bbe9..fc55e54f30 100644 --- a/ci/conda/recipes/morpheus/meta.yaml 
+++ b/ci/conda/recipes/morpheus/meta.yaml @@ -101,7 +101,6 @@ outputs: - mrc - networkx=2.8.8 - numpydoc =1.5.* - - nvtabular =23.08.00 - pydantic - pluggy =1.3.* - python From 8a45bba3b81368e5e76f68c83683de91a36970bb Mon Sep 17 00:00:00 2001 From: Christopher Harris Date: Mon, 9 Sep 2024 17:11:40 +0000 Subject: [PATCH 05/12] finish removing nvtabular related code --- morpheus/utils/column_info.py | 1 - 1 file changed, 1 deletion(-) diff --git a/morpheus/utils/column_info.py b/morpheus/utils/column_info.py index 9ebba46f2b..eaef229666 100644 --- a/morpheus/utils/column_info.py +++ b/morpheus/utils/column_info.py @@ -788,4 +788,3 @@ def __post_init__(self): input_columns=self.input_columns, json_cols=self.json_columns, preserve_re=self.preserve_columns) - From f174055e96ba074cf3356ea6ab1104155fe54f4c Mon Sep 17 00:00:00 2001 From: Christopher Harris Date: Mon, 9 Sep 2024 17:24:30 +0000 Subject: [PATCH 06/12] updates for rapids-24.06 --- .github/workflows/pr.yaml | 4 +- CMakeLists.txt | 2 +- .../recipes/morpheus/conda_build_config.yaml | 8 +- ci/conda/recipes/morpheus/meta.yaml | 4 +- ci/runner/Dockerfile | 4 +- cmake/dependencies.cmake | 16 +--- ..._64.yaml => all_cuda-125_arch-x86_64.yaml} | 54 ++++++------ ..._64.yaml => dev_cuda-125_arch-x86_64.yaml} | 42 ++++----- ...aml => examples_cuda-125_arch-x86_64.yaml} | 22 ++--- ... => model-utils_cuda-125_arch-x86_64.yaml} | 4 +- ...yaml => runtime_cuda-125_arch-x86_64.yaml} | 24 +++--- dependencies.yaml | 85 ++++++++++--------- docs/README.md | 2 +- docs/source/developer_guide/contributing.md | 2 +- .../3_simple_cpp_stage/README.md | 2 +- .../4_rabbitmq_cpp_stage/README.md | 2 +- .../production/Dockerfile | 2 +- .../production/morpheus/benchmarks/README.md | 2 +- examples/doca/vdb_realtime/README.md | 2 +- .../gnn_fraud_detection_pipeline/README.md | 4 +- examples/llm/agents/README.md | 4 +- examples/llm/completion/README.md | 4 +- examples/llm/rag/README.md | 2 +- examples/llm/vdb_upload/README.md | 2 +- external/utilities | 2 +- .../fraud-detection-models/README.md | 2 +- morpheus/_lib/cmake/libmorpheus.cmake | 2 + morpheus/_lib/cudf_helpers.pyx | 26 ++++-- morpheus/_lib/src/messages/multi.cpp | 10 ++- morpheus/_lib/src/utilities/matx_util.cu | 4 +- morpheus/_lib/tests/CMakeLists.txt | 1 + morpheus/llm/services/nemo_llm_service.py | 2 +- .../llm/services/nvfoundation_llm_service.py | 2 +- morpheus/llm/services/openai_chat_service.py | 2 +- morpheus/messages/multi_message.py | 11 ++- morpheus/parsers/event_parser.py | 2 +- morpheus/stages/input/arxiv_source.py | 4 +- tests/benchmarks/README.md | 2 +- tests/conftest.py | 6 +- tests/examples/llm/common/conftest.py | 2 +- tests/stages/arxiv/conftest.py | 2 +- 41 files changed, 209 insertions(+), 172 deletions(-) rename conda/environments/{all_cuda-121_arch-x86_64.yaml => all_cuda-125_arch-x86_64.yaml} (78%) rename conda/environments/{dev_cuda-121_arch-x86_64.yaml => dev_cuda-125_arch-x86_64.yaml} (76%) rename conda/environments/{examples_cuda-121_arch-x86_64.yaml => examples_cuda-125_arch-x86_64.yaml} (85%) rename conda/environments/{model-utils_cuda-121_arch-x86_64.yaml => model-utils_cuda-125_arch-x86_64.yaml} (88%) rename conda/environments/{runtime_cuda-121_arch-x86_64.yaml => runtime_cuda-125_arch-x86_64.yaml} (75%) diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index 42a261fdc0..3fb3b618ca 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -49,7 +49,7 @@ jobs: - checks - ci_pipe secrets: inherit - uses: 
rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@branch-24.02 + uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@branch-24.06 prepare: # Executes the get-pr-info action to determine if the PR has the skip-ci label, if the action fails we assume the @@ -76,7 +76,7 @@ jobs: # Only run the CI pipeline if the PR does not have the skip-ci label and we are on a PR branch if: ${{ !fromJSON(needs.prepare.outputs.has_skip_ci_label) && fromJSON(needs.prepare.outputs.is_pr )}} secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@branch-24.02 + uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@branch-24.06 with: enable_check_generated_files: false diff --git a/CMakeLists.txt b/CMakeLists.txt index f2658237ab..90ebdc4f7e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -36,7 +36,7 @@ option(MORPHEUS_USE_IWYU "Enable running include-what-you-use as part of the bui set(MORPHEUS_PY_INSTALL_DIR "${CMAKE_CURRENT_BINARY_DIR}/wheel" CACHE STRING "Location to install the python directory") -set(MORPHEUS_RAPIDS_VERSION "24.02" CACHE STRING "Sets default versions for RAPIDS libraries.") +set(MORPHEUS_RAPIDS_VERSION "24.06" CACHE STRING "Sets default versions for RAPIDS libraries.") set(MORPHEUS_CACHE_DIR "${CMAKE_SOURCE_DIR}/.cache" CACHE PATH "Directory to contain all CPM and CCache data") mark_as_advanced(MORPHEUS_CACHE_DIR) diff --git a/ci/conda/recipes/morpheus/conda_build_config.yaml b/ci/conda/recipes/morpheus/conda_build_config.yaml index 4b051dc074..a5681ca12e 100644 --- a/ci/conda/recipes/morpheus/conda_build_config.yaml +++ b/ci/conda/recipes/morpheus/conda_build_config.yaml @@ -14,19 +14,19 @@ # limitations under the License. c_compiler_version: - - 11.2 + - 12.1 cxx_compiler_version: - - 11.2 + - 12.1 cuda_compiler: - cuda-nvcc cuda_compiler_version: - - 12.1 + - 12.5 python: - 3.10 rapids_version: - - 24.02 + - 24.06 diff --git a/ci/conda/recipes/morpheus/meta.yaml b/ci/conda/recipes/morpheus/meta.yaml index fc55e54f30..2961b541b1 100644 --- a/ci/conda/recipes/morpheus/meta.yaml +++ b/ci/conda/recipes/morpheus/meta.yaml @@ -56,7 +56,7 @@ outputs: - libtool # Needed for DOCA build - ninja =1.11 - pkg-config =0.29 # for mrc cmake - - sysroot_linux-64 =2.17 + - sysroot_linux-64 =2.28 host: # CUDA dependencies - cuda-cudart-dev {{ cuda_compiler_version }}.* @@ -68,7 +68,7 @@ outputs: # Non-CUDA dependencies - cudf {{ rapids_version }} - cython 3.0.* - - glog 0.6.* + - glog >=0.7.1,<8 - libcudf {{ rapids_version }} - librdkafka >=1.9.2,<1.10.0a0 - mrc {{ minor_version }} diff --git a/ci/runner/Dockerfile b/ci/runner/Dockerfile index 40b035c402..e3d7347268 100644 --- a/ci/runner/Dockerfile +++ b/ci/runner/Dockerfile @@ -16,8 +16,8 @@ # Args used in FROM commands must come first ARG FROM_IMAGE="rapidsai/ci-conda" ARG CUDA_PKG_VER=12-0 -ARG CUDA_SHORT_VER=12.1 -ARG CUDA_VER=12.1.1 +ARG CUDA_SHORT_VER=12.5 +ARG CUDA_VER=12.5.1 ARG LINUX_DISTRO=ubuntu ARG LINUX_VER=22.04 ARG PROJ_NAME=morpheus diff --git a/cmake/dependencies.cmake b/cmake/dependencies.cmake index 5bc821773e..0353fc1373 100644 --- a/cmake/dependencies.cmake +++ b/cmake/dependencies.cmake @@ -43,12 +43,8 @@ if(MORPHEUS_BUILD_BENCHMARKS) # google benchmark # - Expects package to pre-exist in the build environment # ================ - rapids_find_package(benchmark REQUIRED - GLOBAL_TARGETS benchmark::benchmark - BUILD_EXPORT_SET ${PROJECT_NAME}-core-exports - INSTALL_EXPORT_SET ${PROJECT_NAME}-core-exports - FIND_ARGS CONFIG - ) + include(${rapids-cmake-dir}/cpm/gbench.cmake) 
+ rapids_cpm_gbench(BUILD_STATIC) endif() # glog @@ -59,12 +55,8 @@ if(MORPHEUS_BUILD_TESTS) # google test # - Expects package to pre-exist in the build environment # =========== - rapids_find_package(GTest REQUIRED - GLOBAL_TARGETS GTest::gtest GTest::gmock GTest::gtest_main GTest::gmock_main - BUILD_EXPORT_SET ${PROJECT_NAME}-core-exports - INSTALL_EXPORT_SET ${PROJECT_NAME}-core-exports - FIND_ARGS CONFIG - ) + include(${rapids-cmake-dir}/cpm/gtest.cmake) + rapids_cpm_gtest(BUILD_STATIC) endif() # cccl -- get an explicit cccl build, matx tries to pull a tag that doesn't exist. diff --git a/conda/environments/all_cuda-121_arch-x86_64.yaml b/conda/environments/all_cuda-125_arch-x86_64.yaml similarity index 78% rename from conda/environments/all_cuda-121_arch-x86_64.yaml rename to conda/environments/all_cuda-125_arch-x86_64.yaml index fe924bf8a6..4b403c88ae 100644 --- a/conda/environments/all_cuda-121_arch-x86_64.yaml +++ b/conda/environments/all_cuda-125_arch-x86_64.yaml @@ -21,18 +21,18 @@ dependencies: - clangdev=16 - click>=8 - cmake=3.27 -- cuda-cudart-dev=12.1 -- cuda-cudart=12.1 -- cuda-nvcc=12.1 -- cuda-nvml-dev=12.1 -- cuda-nvrtc-dev=12.1 -- cuda-nvrtc=12.1 -- cuda-nvtx-dev=12.1 -- cuda-nvtx=12.1 -- cuda-tools=12.1 -- cuda-version=12.1 -- cudf=24.02 -- cuml=24.02.* +- cuda-cudart-dev=12.5 +- cuda-cudart=12.5 +- cuda-nvcc=12.5 +- cuda-nvml-dev=12.5 +- cuda-nvrtc-dev=12.5 +- cuda-nvrtc=12.5 +- cuda-nvtx-dev=12.5 +- cuda-nvtx=12.5 +- cuda-sanitizer-api +- cuda-version=12.5 +- cudf=24.06 +- cuml=24.06.6 - cupy - cxx-compiler - cython=3.0 @@ -44,23 +44,25 @@ dependencies: - exhale=0.3.6 - feedparser=6.0 - flake8 -- gcc_linux-64=11.2 - git-lfs -- glog=0.6 -- grpcio-status=1.59 -- grpcio=1.59 -- gxx_linux-64=11.2 +- glog>=0.7.1,<0.8 +- grpcio +- grpcio-status +- gtest=1.14 +- gxx=12.1 - huggingface_hub=0.20.2 - include-what-you-use=0.20 - ipython - isort - jsonpatch>=1.33 - kfp -- libcudf=24.02 +- libcudf=24.06 +- libcufft-dev - librdkafka>=1.9.2,<1.10.0a0 - libtool - libwebp=1.3.2 -- mlflow>=2.10.0,<3 +- libzlib >=1.3.1,<2 +- mlflow - mrc=24.06 - myst-parser=0.18.1 - nbsphinx @@ -91,22 +93,20 @@ dependencies: - python-docx==1.1.0 - python-graphviz - python=3.10 -- pytorch-cuda -- pytorch=*=*cuda* - rapidjson=1.1.0 -- rapids-dask-dependency=24.02 +- rapids-dask-dependency=24.06 - rdma-core>=48 - requests - requests-cache=1.1 - requests-toolbelt=1.0 -- s3fs=2023.12.2 +- s3fs - scikit-build=0.17.6 - scikit-learn=1.3.2 - sentence-transformers=2.7 - sphinx - sphinx_rtd_theme - sqlalchemy<2.0 -- sysroot_linux-64=2.17 +- sysroot_linux-64>=2.28 - tqdm=4 - transformers=4.36.2 - tritonclient=2.34 @@ -118,13 +118,14 @@ dependencies: - yapf=0.40.1 - zlib=1.2.13 - pip: + - --extra-index-url https://download.pytorch.org/whl/cu124 - --find-links https://data.dgl.ai/wheels-test/repo.html - --find-links https://data.dgl.ai/wheels/cu121/repo.html - databricks-cli < 0.100 - databricks-connect - dgl==2.0.0 - dglgo - - faiss-gpu==1.7.* + - faiss-cpu - google-search-results==2.4 - langchain-nvidia-ai-endpoints==0.0.11 - langchain==0.1.16 @@ -132,4 +133,5 @@ dependencies: - nemollm==0.3.5 - pymilvus==2.3.6 - pytest-kafka==0.6.0 -name: all_cuda-121_arch-x86_64 + - torch==2.4.0+cu124 +name: all_cuda-125_arch-x86_64 diff --git a/conda/environments/dev_cuda-121_arch-x86_64.yaml b/conda/environments/dev_cuda-125_arch-x86_64.yaml similarity index 76% rename from conda/environments/dev_cuda-121_arch-x86_64.yaml rename to conda/environments/dev_cuda-125_arch-x86_64.yaml index 345461bf2c..0a3bd79236 100644 --- 
a/conda/environments/dev_cuda-121_arch-x86_64.yaml +++ b/conda/environments/dev_cuda-125_arch-x86_64.yaml @@ -18,14 +18,14 @@ dependencies: - clangdev=16 - click>=8 - cmake=3.27 -- cuda-cudart-dev=12.1 -- cuda-nvcc=12.1 -- cuda-nvml-dev=12.1 -- cuda-nvrtc-dev=12.1 -- cuda-nvtx-dev=12.1 -- cuda-tools=12.1 -- cuda-version=12.1 -- cudf=24.02 +- cuda-cudart-dev=12.5 +- cuda-nvcc=12.5 +- cuda-nvml-dev=12.5 +- cuda-nvrtc-dev=12.5 +- cuda-nvtx-dev=12.5 +- cuda-sanitizer-api +- cuda-version=12.5 +- cudf=24.06 - cupy - cxx-compiler - cython=3.0 @@ -37,20 +37,22 @@ dependencies: - exhale=0.3.6 - feedparser=6.0 - flake8 -- gcc_linux-64=11.2 - git-lfs -- glog=0.6 -- grpcio-status=1.59 -- grpcio=1.59 -- gxx_linux-64=11.2 +- glog>=0.7.1,<0.8 +- grpcio +- grpcio-status +- gtest=1.14 +- gxx=12.1 - include-what-you-use=0.20 - ipython - isort -- libcudf=24.02 +- libcudf=24.06 +- libcufft-dev - librdkafka>=1.9.2,<1.10.0a0 - libtool - libwebp=1.3.2 -- mlflow>=2.10.0,<3 +- libzlib >=1.3.1,<2 +- mlflow - mrc=24.06 - myst-parser=0.18.1 - nbsphinx @@ -75,10 +77,8 @@ dependencies: - python-docx==1.1.0 - python-graphviz - python=3.10 -- pytorch-cuda -- pytorch=*=*cuda* - rapidjson=1.1.0 -- rapids-dask-dependency=24.02 +- rapids-dask-dependency=24.06 - rdma-core>=48 - requests - requests-cache=1.1 @@ -87,7 +87,7 @@ dependencies: - sphinx - sphinx_rtd_theme - sqlalchemy<2.0 -- sysroot_linux-64=2.17 +- sysroot_linux-64>=2.28 - tqdm=4 - tritonclient=2.34 - typing_utils=0.1 @@ -98,9 +98,11 @@ dependencies: - yapf=0.40.1 - zlib=1.2.13 - pip: + - --extra-index-url https://download.pytorch.org/whl/cu124 - databricks-cli < 0.100 - databricks-connect - milvus==2.3.5 - pymilvus==2.3.6 - pytest-kafka==0.6.0 -name: dev_cuda-121_arch-x86_64 + - torch==2.4.0+cu124 +name: dev_cuda-125_arch-x86_64 diff --git a/conda/environments/examples_cuda-121_arch-x86_64.yaml b/conda/environments/examples_cuda-125_arch-x86_64.yaml similarity index 85% rename from conda/environments/examples_cuda-121_arch-x86_64.yaml rename to conda/environments/examples_cuda-125_arch-x86_64.yaml index 1ec20467f3..2578571367 100644 --- a/conda/environments/examples_cuda-121_arch-x86_64.yaml +++ b/conda/environments/examples_cuda-125_arch-x86_64.yaml @@ -15,21 +15,21 @@ dependencies: - beautifulsoup4=4.12 - boto3 - click>=8 -- cudf=24.02 -- cuml=24.02.* +- cudf=24.06 +- cuml=24.06.6 - cupy - datacompy=0.10 - dill=0.3.7 - docker-py=5.0 - elasticsearch==8.9.0 - feedparser=6.0 -- grpcio-status=1.59 -- grpcio=1.59 +- grpcio +- grpcio-status - huggingface_hub=0.20.2 - jsonpatch>=1.33 - kfp - libwebp=1.3.2 -- mlflow>=2.10.0,<3 +- mlflow - mrc=24.06 - networkx=2.8.8 - newspaper3k=0.2 @@ -48,13 +48,11 @@ dependencies: - python-docx==1.1.0 - python-graphviz - python=3.10 -- pytorch-cuda -- pytorch=*=*cuda* -- rapids-dask-dependency=24.02 +- rapids-dask-dependency=24.06 - requests - requests-cache=1.1 - requests-toolbelt=1.0 -- s3fs=2023.12.2 +- s3fs - scikit-learn=1.3.2 - sentence-transformers=2.7 - sqlalchemy<2.0 @@ -65,17 +63,19 @@ dependencies: - watchdog=3.0 - websockets - pip: + - --extra-index-url https://download.pytorch.org/whl/cu124 - --find-links https://data.dgl.ai/wheels-test/repo.html - --find-links https://data.dgl.ai/wheels/cu121/repo.html - databricks-cli < 0.100 - databricks-connect - dgl==2.0.0 - dglgo - - faiss-gpu==1.7.* + - faiss-cpu - google-search-results==2.4 - langchain-nvidia-ai-endpoints==0.0.11 - langchain==0.1.16 - milvus==2.3.5 - nemollm==0.3.5 - pymilvus==2.3.6 -name: examples_cuda-121_arch-x86_64 + - torch==2.4.0+cu124 +name: 
examples_cuda-125_arch-x86_64 diff --git a/conda/environments/model-utils_cuda-121_arch-x86_64.yaml b/conda/environments/model-utils_cuda-125_arch-x86_64.yaml similarity index 88% rename from conda/environments/model-utils_cuda-121_arch-x86_64.yaml rename to conda/environments/model-utils_cuda-125_arch-x86_64.yaml index 761f19aaa0..5b1f535718 100644 --- a/conda/environments/model-utils_cuda-121_arch-x86_64.yaml +++ b/conda/environments/model-utils_cuda-125_arch-x86_64.yaml @@ -9,7 +9,7 @@ channels: - nvidia/label/dev - pytorch dependencies: -- cuml=24.02.* +- cuml=24.06.6 - jupyterlab - matplotlib - onnx @@ -20,4 +20,4 @@ dependencies: - seqeval=1.2.2 - transformers=4.36.2 - xgboost -name: model-utils_cuda-121_arch-x86_64 +name: model-utils_cuda-125_arch-x86_64 diff --git a/conda/environments/runtime_cuda-121_arch-x86_64.yaml b/conda/environments/runtime_cuda-125_arch-x86_64.yaml similarity index 75% rename from conda/environments/runtime_cuda-121_arch-x86_64.yaml rename to conda/environments/runtime_cuda-125_arch-x86_64.yaml index d9f23b252d..b375d2b63e 100644 --- a/conda/environments/runtime_cuda-121_arch-x86_64.yaml +++ b/conda/environments/runtime_cuda-125_arch-x86_64.yaml @@ -12,21 +12,21 @@ dependencies: - appdirs - beautifulsoup4=4.12 - click>=8 -- cuda-cudart=12.1 -- cuda-nvrtc=12.1 -- cuda-nvtx=12.1 -- cuda-version=12.1 -- cudf=24.02 +- cuda-cudart=12.5 +- cuda-nvrtc=12.5 +- cuda-nvtx=12.5 +- cuda-version=12.5 +- cudf=24.06 - cupy - datacompy=0.10 - dill=0.3.7 - docker-py=5.0 - elasticsearch==8.9.0 - feedparser=6.0 -- grpcio-status=1.59 -- grpcio=1.59 +- grpcio +- grpcio-status - libwebp=1.3.2 -- mlflow>=2.10.0,<3 +- mlflow - mrc=24.06 - networkx=2.8.8 - numpydoc=1.5 @@ -36,9 +36,7 @@ dependencies: - python-confluent-kafka>=1.9.2,<1.10.0a0 - python-graphviz - python=3.10 -- pytorch-cuda -- pytorch=*=*cuda* -- rapids-dask-dependency=24.02 +- rapids-dask-dependency=24.06 - requests - requests-cache=1.1 - scikit-learn=1.3.2 @@ -49,8 +47,10 @@ dependencies: - watchdog=3.0 - websockets - pip: + - --extra-index-url https://download.pytorch.org/whl/cu124 - databricks-cli < 0.100 - databricks-connect - milvus==2.3.5 - pymilvus==2.3.6 -name: runtime_cuda-121_arch-x86_64 + - torch==2.4.0+cu124 +name: runtime_cuda-125_arch-x86_64 diff --git a/dependencies.yaml b/dependencies.yaml index 9c29642132..d239bb59e1 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -19,7 +19,7 @@ files: all: output: conda matrix: - cuda: ["12.1"] + cuda: ["12.5"] arch: [x86_64] includes: - benchmark_cpp @@ -45,7 +45,7 @@ files: dev: output: conda matrix: - cuda: ["12.1"] + cuda: ["12.5"] arch: [x86_64] includes: - benchmark_cpp @@ -67,7 +67,7 @@ files: build: output: none matrix: - cuda: ["12.1"] + cuda: ["12.5"] arch: [x86_64] includes: - benchmark_cpp @@ -87,7 +87,7 @@ files: test: output: none matrix: - cuda: ["12.1"] + cuda: ["12.5"] arch: [x86_64] includes: - benchmark_cpp @@ -108,7 +108,7 @@ files: docs: output: none matrix: - cuda: ["12.1"] + cuda: ["12.5"] arch: [x86_64] includes: - benchmark_cpp @@ -131,7 +131,7 @@ files: runtime: output: conda matrix: - cuda: ["12.1"] + cuda: ["12.5"] arch: [x86_64] includes: - cudatoolkit @@ -145,7 +145,7 @@ files: examples: output: conda matrix: - cuda: ["12.1"] + cuda: ["12.5"] arch: [x86_64] includes: - cve-mitigation @@ -161,7 +161,7 @@ files: model-utils: output: conda matrix: - cuda: ["12.1"] + cuda: ["12.5"] arch: [x86_64] includes: - model-training-tuning @@ -196,26 +196,28 @@ dependencies: - output_types: [conda] matrices: - matrix: - cuda: "12.1" + cuda: 
"12.5" packages: - - cuda-cudart=12.1 - - cuda-nvrtc=12.1 - - cuda-nvtx=12.1 - - cuda-version=12.1 + - cuda-cudart=12.5 + - cuda-nvrtc=12.5 + - cuda-nvtx=12.5 + - cuda-version=12.5 cudatoolkit-dev: specific: - output_types: [conda] matrices: - matrix: - cuda: "12.1" + cuda: "12.5" packages: - - cuda-cudart-dev=12.1 - - cuda-nvml-dev=12.1 - - cuda-nvrtc-dev=12.1 - - cuda-nvtx-dev=12.1 - - cuda-tools=12.1 - - cuda-version=12.1 + - cuda-cudart-dev=12.5 + - cuda-nvml-dev=12.5 + - cuda-nvrtc-dev=12.5 + - cuda-nvtx-dev=12.5 + # - cuda-tools=12.5 + - cuda-version=12.5 + - libcufft-dev # required by matx + - cuda-sanitizer-api @@ -226,21 +228,20 @@ dependencies: - output_types: [conda] packages: # Compilers - - cuda-nvcc=12.1 + - cuda-nvcc=12.5 - cxx-compiler - - gcc_linux-64=11.2 - - gxx_linux-64=11.2 + - gxx=12.1 # Non-Compiler Dependencies - automake=1.16.5 # Needed for DOCA build - ccache - cmake=3.27 - - cuda-cudart-dev=12.1 - - cuda-version=12.1 + - cuda-cudart-dev=12.5 + - cuda-version=12.5 - libtool # Needed for DOCA build - ninja=1.11 - pkg-config=0.29 # for mrc cmake - - sysroot_linux-64=2.17 + - sysroot_linux-64>=2.28 # Build dependencies for Morpheus on the host arch. Mirrors the `host` section in # ci/conda/recipes/morpheus/meta.yaml @@ -249,11 +250,13 @@ dependencies: - output_types: [conda] packages: # Include: cudatoolkit-dev - - cudf=24.02 + - cudf=24.06 - cython=3.0 - - glog=0.6 - - libcudf=24.02 + - glog>=0.7.1,<0.8 + - gtest=1.14 + - libcudf=24.06 - librdkafka>=1.9.2,<1.10.0a0 + - libzlib >=1.3.1,<2 - mrc=24.06 - nlohmann_json=3.11 - pybind11-stubgen=0.10.5 @@ -319,18 +322,18 @@ dependencies: - appdirs - beautifulsoup4=4.12 - click>=8 - # - cuda-version=12.1 ## - - cudf=24.02 + # - cuda-version=12.5 ## + - cudf=24.06 - cupy # Version determined from cudf - datacompy=0.10 - dill=0.3.7 - docker-py=5.0 - elasticsearch==8.9.0 - feedparser=6.0 - - grpcio=1.59 - - grpcio-status=1.59 + - grpcio + - grpcio-status # - libwebp=1.3.2 # Required for CVE mitigation: https://nvd.nist.gov/vuln/detail/CVE-2023-4863 ## - - mlflow>=2.10.0,<3 + - mlflow #>=2.10.0,<3 - mrc=24.06 - networkx=2.8.8 - numpydoc=1.5 @@ -338,10 +341,10 @@ dependencies: # - python ## - python-confluent-kafka>=1.9.2,<1.10.0a0 - python-graphviz - - pytorch-cuda - - pytorch=*=*cuda* + # - pytorch-cuda + # - pytorch=*=*cuda* - pluggy=1.3 - - rapids-dask-dependency=24.02 # provides dask and distributed + - rapids-dask-dependency=24.06 # provides dask and distribute6 - requests - requests-cache=1.1 - scikit-learn=1.3.2 @@ -353,10 +356,12 @@ dependencies: - websockets - pip - pip: + - --extra-index-url https://download.pytorch.org/whl/cu124 - databricks-cli < 0.100 - databricks-connect - milvus==2.3.5 # update to match pymilvus when available - pymilvus==2.3.6 + - torch==2.4.0+cu124 test_python_morpheus: common: @@ -381,13 +386,13 @@ dependencies: - boto3 - kfp - papermill=2.4.0 - - s3fs=2023.12.2 + - s3fs example-gnn: common: - output_types: [conda] packages: - - &cuml cuml=24.02.* + - &cuml cuml=24.06.6 - pip - pip: - --find-links https://data.dgl.ai/wheels/cu121/repo.html @@ -417,7 +422,7 @@ dependencies: - pip: - langchain==0.1.16 - langchain-nvidia-ai-endpoints==0.0.11 - - faiss-gpu==1.7.* + - faiss-cpu - google-search-results==2.4 - nemollm==0.3.5 diff --git a/docs/README.md b/docs/README.md index 469303430e..4fe4c43e58 100644 --- a/docs/README.md +++ b/docs/README.md @@ -22,7 +22,7 @@ Additional packages required for building the documentation are defined in `./co ## Install Additional Dependencies From the root of the 
Morpheus repo: ```bash -conda env update --solver=libmamba -n morpheus --file conda/environments/dev_cuda-121_arch-x86_64.yaml --prune +conda env update --solver=libmamba -n morpheus --file conda/environments/dev_cuda-125_arch-x86_64.yaml --prune ``` ## Build Morpheus and Documentation diff --git a/docs/source/developer_guide/contributing.md b/docs/source/developer_guide/contributing.md index 1c9299f1a4..fd6227e1b1 100644 --- a/docs/source/developer_guide/contributing.md +++ b/docs/source/developer_guide/contributing.md @@ -186,7 +186,7 @@ git submodule update --init --recursive 1. Create the Morpheus Conda environment ```bash - conda env create --solver=libmamba -n morpheus --file conda/environments/dev_cuda-121_arch-x86_64.yaml + conda env create --solver=libmamba -n morpheus --file conda/environments/dev_cuda-125_arch-x86_64.yaml conda activate morpheus ``` diff --git a/examples/developer_guide/3_simple_cpp_stage/README.md b/examples/developer_guide/3_simple_cpp_stage/README.md index 6e62534325..51573b0ad4 100644 --- a/examples/developer_guide/3_simple_cpp_stage/README.md +++ b/examples/developer_guide/3_simple_cpp_stage/README.md @@ -21,5 +21,5 @@ limitations under the License. |-------------|-----------|-------| | Conda | ✔ | | | Morpheus Docker Container | ✔ | | -| Morpheus Release Container | ✔ | Requires adding development packages to the container's Conda environment via `conda env update --solver=libmamba -n morpheus --file /workspace/conda/environments/dev_cuda-121_arch-x86_64.yaml` | +| Morpheus Release Container | ✔ | Requires adding development packages to the container's Conda environment via `conda env update --solver=libmamba -n morpheus --file /workspace/conda/environments/dev_cuda-125_arch-x86_64.yaml` | | Dev Container | ✔ | | diff --git a/examples/developer_guide/4_rabbitmq_cpp_stage/README.md b/examples/developer_guide/4_rabbitmq_cpp_stage/README.md index 313fa34f98..b1246d02c0 100644 --- a/examples/developer_guide/4_rabbitmq_cpp_stage/README.md +++ b/examples/developer_guide/4_rabbitmq_cpp_stage/README.md @@ -25,7 +25,7 @@ This example adds two flags to the `read_simple.py` script. 
A `--use_cpp` flag w |-------------|-----------|-------| | Conda | ✔ | | | Morpheus Docker Container | ✔ | Requires launching the RabbitMQ container on the host | -| Morpheus Release Container | ✔ | Requires launching the RabbitMQ container on the host, and adding development packages to the container's Conda environment via `conda env update --solver=libmamba -n morpheus --file /workspace/conda/environments/dev_cuda-121_arch-x86_64.yaml` | +| Morpheus Release Container | ✔ | Requires launching the RabbitMQ container on the host, and adding development packages to the container's Conda environment via `conda env update --solver=libmamba -n morpheus --file /workspace/conda/environments/dev_cuda-125_arch-x86_64.yaml` | | Dev Container | ✘ | | ## Installing Pika diff --git a/examples/digital_fingerprinting/production/Dockerfile b/examples/digital_fingerprinting/production/Dockerfile index 14e9ffa38b..0dc4995514 100644 --- a/examples/digital_fingerprinting/production/Dockerfile +++ b/examples/digital_fingerprinting/production/Dockerfile @@ -28,7 +28,7 @@ WORKDIR /workspace/examples/digital_fingerprinting/ # Install DFP dependencies RUN source activate morpheus \ - && /opt/conda/bin/conda env update --solver=libmamba -n morpheus --file /workspace/conda/environments/examples_cuda-121_arch-x86_64.yaml + && /opt/conda/bin/conda env update --solver=libmamba -n morpheus --file /workspace/conda/environments/examples_cuda-125_arch-x86_64.yaml # Set the tracking URI for mlflow ENV MLFLOW_TRACKING_URI="http://mlflow:5000" diff --git a/examples/digital_fingerprinting/production/morpheus/benchmarks/README.md b/examples/digital_fingerprinting/production/morpheus/benchmarks/README.md index a9c09197d2..76d38c1b04 100644 --- a/examples/digital_fingerprinting/production/morpheus/benchmarks/README.md +++ b/examples/digital_fingerprinting/production/morpheus/benchmarks/README.md @@ -47,7 +47,7 @@ Install additonal required dependencies: ```bash mamba env update \ -n ${CONDA_DEFAULT_ENV} \ - --file ./conda/environments/examples_cuda-121_arch-x86_64.yaml + --file ./conda/environments/examples_cuda-125_arch-x86_64.yaml ``` diff --git a/examples/doca/vdb_realtime/README.md b/examples/doca/vdb_realtime/README.md index e4b02e6042..e416031c79 100644 --- a/examples/doca/vdb_realtime/README.md +++ b/examples/doca/vdb_realtime/README.md @@ -94,7 +94,7 @@ export NGC_API_KEY="" Then install basic requirements: ```bash -conda env update --solver=libmamba -n morpheus --file conda/environments/examples_cuda-121_arch-x86_64.yaml --prune +conda env update --solver=libmamba -n morpheus --file conda/environments/examples_cuda-125_arch-x86_64.yaml --prune ``` Run the RAG example to query the Milvus database: diff --git a/examples/gnn_fraud_detection_pipeline/README.md b/examples/gnn_fraud_detection_pipeline/README.md index 9084471400..9aa6c1afc3 100644 --- a/examples/gnn_fraud_detection_pipeline/README.md +++ b/examples/gnn_fraud_detection_pipeline/README.md @@ -17,7 +17,7 @@ limitations under the License. # GNN Fraud Detection Pipeline ## Supported Environments -All environments require additional Conda packages which can be installed with either the `conda/environments/all_cuda-121_arch-x86_64.yaml` or `conda/environments/examples_cuda-121_arch-x86_64.yaml` environment files. Refer to the [Requirements](#requirements) section for more information. 
+All environments require additional Conda packages which can be installed with either the `conda/environments/all_cuda-125_arch-x86_64.yaml` or `conda/environments/examples_cuda-125_arch-x86_64.yaml` environment files. Refer to the [Requirements](#requirements) section for more information. | Environment | Supported | Notes | |-------------|-----------|-------| | Conda | ✔ | | @@ -32,7 +32,7 @@ Prior to running the GNN fraud detection pipeline, additional requirements must ```bash mamba env update \ -n ${CONDA_DEFAULT_ENV} \ - --file ./conda/environments/examples_cuda-121_arch-x86_64.yaml + --file ./conda/environments/examples_cuda-125_arch-x86_64.yaml ``` ## Running diff --git a/examples/llm/agents/README.md b/examples/llm/agents/README.md index 7f51397c14..b7a570b885 100644 --- a/examples/llm/agents/README.md +++ b/examples/llm/agents/README.md @@ -35,7 +35,7 @@ limitations under the License. - [Run example (Kafka Pipeline)](#run-example-kafka-pipeline) ## Supported Environments -All environments require additional Conda packages which can be installed with either the `conda/environments/all_cuda-121_arch-x86_64.yaml` or `conda/environments/examples_cuda-121_arch-x86_64.yaml` environment files. Refer to the [Install Dependencies](#install-dependencies) section for more information. +All environments require additional Conda packages which can be installed with either the `conda/environments/all_cuda-125_arch-x86_64.yaml` or `conda/environments/examples_cuda-125_arch-x86_64.yaml` environment files. Refer to the [Install Dependencies](#install-dependencies) section for more information. | Environment | Supported | Notes | |-------------|-----------|-------| | Conda | ✔ | | @@ -106,7 +106,7 @@ Install the required dependencies. ```bash mamba env update \ -n ${CONDA_DEFAULT_ENV} \ - --file ./conda/environments/examples_cuda-121_arch-x86_64.yaml + --file ./conda/environments/examples_cuda-125_arch-x86_64.yaml ``` diff --git a/examples/llm/completion/README.md b/examples/llm/completion/README.md index c619546c47..e72ffe1ce6 100644 --- a/examples/llm/completion/README.md +++ b/examples/llm/completion/README.md @@ -31,7 +31,7 @@ limitations under the License. - [Running the Morpheus Pipeline](#running-the-morpheus-pipeline) ## Supported Environments -All environments require additional Conda packages which can be installed with either the `conda/environments/all_cuda-121_arch-x86_64.yaml` or `conda/environments/examples_cuda-121_arch-x86_64.yaml` environment files. Refer to the [Install Dependencies](#install-dependencies) section for more information. +All environments require additional Conda packages which can be installed with either the `conda/environments/all_cuda-125_arch-x86_64.yaml` or `conda/environments/examples_cuda-125_arch-x86_64.yaml` environment files. Refer to the [Install Dependencies](#install-dependencies) section for more information. | Environment | Supported | Notes | |-------------|-----------|-------| | Conda | ✔ | | @@ -80,7 +80,7 @@ Install the required dependencies. ```bash mamba env update \ -n ${CONDA_DEFAULT_ENV} \ - --file ./conda/environments/examples_cuda-121_arch-x86_64.yaml + --file ./conda/environments/examples_cuda-125_arch-x86_64.yaml ``` diff --git a/examples/llm/rag/README.md b/examples/llm/rag/README.md index 1fb5d451f7..60003e53b3 100644 --- a/examples/llm/rag/README.md +++ b/examples/llm/rag/README.md @@ -18,7 +18,7 @@ limitations under the License. 
# Retrieval Augmented Generation (RAG) Pipeline ## Supported Environments -All environments require additional Conda packages which can be installed with either the `conda/environments/all_cuda-121_arch-x86_64.yaml` or `conda/environments/examples_cuda-121_arch-x86_64.yaml` environment files. This example also requires the [VDB upload](../vdb_upload/README.md) pipeline to have been run previously. +All environments require additional Conda packages which can be installed with either the `conda/environments/all_cuda-125_arch-x86_64.yaml` or `conda/environments/examples_cuda-125_arch-x86_64.yaml` environment files. This example also requires the [VDB upload](../vdb_upload/README.md) pipeline to have been run previously. | Environment | Supported | Notes | |-------------|-----------|-------| | Conda | ✔ | | diff --git a/examples/llm/vdb_upload/README.md b/examples/llm/vdb_upload/README.md index b8a3ef35e5..9b5c698052 100644 --- a/examples/llm/vdb_upload/README.md +++ b/examples/llm/vdb_upload/README.md @@ -34,7 +34,7 @@ limitations under the License. - [Exporting and Deploying a Different Model from Huggingface](#exporting-and-deploying-a-different-model-from-huggingface) ## Supported Environments -All environments require additional Conda packages which can be installed with either the `conda/environments/all_cuda-121_arch-x86_64.yaml` or `conda/environments/examples_cuda-121_arch-x86_64.yaml` environment files. +All environments require additional Conda packages which can be installed with either the `conda/environments/all_cuda-125_arch-x86_64.yaml` or `conda/environments/examples_cuda-125_arch-x86_64.yaml` environment files. | Environment | Supported | Notes | |-------------|-----------|-------| | Conda | ✔ | | diff --git a/external/utilities b/external/utilities index 54be32e6d3..af13da47e7 160000 --- a/external/utilities +++ b/external/utilities @@ -1 +1 @@ -Subproject commit 54be32e6d3e1c7dea65ede5d721ef4496a225aec +Subproject commit af13da47e72419c5ad675df504d5c176d43f3a6e diff --git a/models/training-tuning-scripts/fraud-detection-models/README.md b/models/training-tuning-scripts/fraud-detection-models/README.md index 14e4b32084..b228742ea0 100644 --- a/models/training-tuning-scripts/fraud-detection-models/README.md +++ b/models/training-tuning-scripts/fraud-detection-models/README.md @@ -26,7 +26,7 @@ Install packages for training GNN model. ```bash mamba env update \ -n ${CONDA_DEFAULT_ENV} \ - --file ./conda/environments/model-utils-121_arch-x86_64.yaml + --file ./conda/environments/model-utils-125_arch-x86_64.yaml ``` ### Options for training and tuning models. 
diff --git a/morpheus/_lib/cmake/libmorpheus.cmake b/morpheus/_lib/cmake/libmorpheus.cmake index b4d3e5baaf..2df8ae7cb3 100644 --- a/morpheus/_lib/cmake/libmorpheus.cmake +++ b/morpheus/_lib/cmake/libmorpheus.cmake @@ -100,6 +100,8 @@ target_link_libraries(morpheus $ cudf::cudf CUDA::nvtx3 + cudf::cudf + glog::glog mrc::pymrc RDKAFKA::RDKAFKA TritonClient::httpclient_static diff --git a/morpheus/_lib/cudf_helpers.pyx b/morpheus/_lib/cudf_helpers.pyx index 0940fd8f18..fbda85cf72 100644 --- a/morpheus/_lib/cudf_helpers.pyx +++ b/morpheus/_lib/cudf_helpers.pyx @@ -21,16 +21,32 @@ from libcpp.utility cimport move from libcpp.vector cimport vector from cudf._lib.column cimport Column -from cudf._lib.cpp.io.types cimport column_name_info -from cudf._lib.cpp.io.types cimport table_metadata -from cudf._lib.cpp.io.types cimport table_with_metadata -from cudf._lib.cpp.table.table_view cimport table_view -from cudf._lib.cpp.types cimport size_type +from cudf._lib.pylibcudf.libcudf.io.types cimport column_name_info +from cudf._lib.pylibcudf.libcudf.io.types cimport table_metadata +from cudf._lib.pylibcudf.libcudf.io.types cimport table_with_metadata +from cudf._lib.pylibcudf.libcudf.table.table_view cimport table_view +from cudf._lib.pylibcudf.libcudf.types cimport size_type from cudf._lib.utils cimport data_from_unique_ptr from cudf._lib.utils cimport get_column_names from cudf._lib.utils cimport table_view_from_table +cdef vector[string] get_column_names(object tbl, object index): + cdef vector[string] column_names + if index is not False: + if isinstance(tbl._index, cudf.core.multiindex.MultiIndex): + for idx_name in tbl._index.names: + column_names.push_back(str.encode(idx_name)) + else: + if tbl._index.name is not None: + column_names.push_back(str.encode(tbl._index.name)) + + for col_name in tbl._column_names: + column_names.push_back(str.encode(col_name)) + + return column_names + + cdef extern from "morpheus/objects/table_info.hpp" namespace "morpheus" nogil: diff --git a/morpheus/_lib/src/messages/multi.cpp b/morpheus/_lib/src/messages/multi.cpp index 6e42e839d7..987be62377 100644 --- a/morpheus/_lib/src/messages/multi.cpp +++ b/morpheus/_lib/src/messages/multi.cpp @@ -397,7 +397,15 @@ void MultiMessageInterfaceProxy::set_meta(MultiMessage& self, pybind11::object c } // Perform the update via slices - df.attr("loc")[pybind11::make_tuple(df.attr("index")[row_indexer], columns)] = value; + auto is_string_dtype = pybind11::module_::import("cudf.api.types").attr("is_string_dtype"); + auto series = pybind11::module_::import("cudf").attr("Series"); + + if (is_string_dtype(series(value)).cast()) { + df[columns] = pybind11::str(); + df[columns].attr("iloc")[row_indexer] = value; + } else { + df.attr("loc")[pybind11::make_tuple(df.attr("index")[row_indexer], columns)] = value; + } // Reset the index if we changed it if (!saved_index.is_none()) diff --git a/morpheus/_lib/src/utilities/matx_util.cu b/morpheus/_lib/src/utilities/matx_util.cu index a1dc626242..b5bf6c6b22 100644 --- a/morpheus/_lib/src/utilities/matx_util.cu +++ b/morpheus/_lib/src/utilities/matx_util.cu @@ -274,7 +274,7 @@ struct MatxUtil__MatxThreshold auto output_tensor = matx::make_tensor(static_cast(output_data), output_shape); // Convert max value to bool - (output_tensor = matx::rmax(input_tensor, {1}) > (InputT)threshold).run(stream.value()); + (output_tensor = matx::max(input_tensor, {1}) > (InputT)threshold).run(stream.value()); } /** @@ -362,7 +362,7 @@ struct MatxUtil__MatxReduceMax auto output_slice = output_tensor.template 
Slice<1>({output_idx, 0}, {matx::matxDropDim, matx::matxEnd}); - (output_slice = matx::rmax(input_slice.Permute({1, 0}))).run(stream.value()); + (output_slice = matx::max(input_slice.Permute({1, 0}))).run(stream.value()); } }; } // namespace diff --git a/morpheus/_lib/tests/CMakeLists.txt b/morpheus/_lib/tests/CMakeLists.txt index 788055a7c0..16379378b8 100644 --- a/morpheus/_lib/tests/CMakeLists.txt +++ b/morpheus/_lib/tests/CMakeLists.txt @@ -26,6 +26,7 @@ add_executable(test_cuda target_link_libraries(test_cuda PRIVATE + glog::glog GTest::gtest GTest::gtest_main matx::matx diff --git a/morpheus/llm/services/nemo_llm_service.py b/morpheus/llm/services/nemo_llm_service.py index 364b3928bc..e90460ed05 100644 --- a/morpheus/llm/services/nemo_llm_service.py +++ b/morpheus/llm/services/nemo_llm_service.py @@ -27,7 +27,7 @@ IMPORT_ERROR_MESSAGE = ( "NemoLLM not found. Install it and other additional dependencies by running the following command:\n" "`conda env update --solver=libmamba -n morpheus " - "--file conda/environments/dev_cuda-121_arch-x86_64.yaml --prune`") + "--file conda/environments/dev_cuda-125_arch-x86_64.yaml --prune`") try: import nemollm diff --git a/morpheus/llm/services/nvfoundation_llm_service.py b/morpheus/llm/services/nvfoundation_llm_service.py index da6820d0d9..5955932344 100644 --- a/morpheus/llm/services/nvfoundation_llm_service.py +++ b/morpheus/llm/services/nvfoundation_llm_service.py @@ -26,7 +26,7 @@ "The `langchain-nvidia-ai-endpoints` package was not found. Install it and other additional dependencies by " "running the following command:" "`conda env update --solver=libmamba -n morpheus " - "--file conda/environments/examples_cuda-121_arch-x86_64.yaml`") + "--file conda/environments/examples_cuda-125_arch-x86_64.yaml`") try: from langchain_core.prompt_values import StringPromptValue diff --git a/morpheus/llm/services/openai_chat_service.py b/morpheus/llm/services/openai_chat_service.py index 3b2c87b4f2..d644f35dd3 100644 --- a/morpheus/llm/services/openai_chat_service.py +++ b/morpheus/llm/services/openai_chat_service.py @@ -32,7 +32,7 @@ IMPORT_ERROR_MESSAGE = ("OpenAIChatService & OpenAIChatClient require the openai package to be installed. 
" "Install it by running the following command:\n" "`conda env update --solver=libmamba -n morpheus " - "--file conda/environments/dev_cuda-121_arch-x86_64.yaml --prune`") + "--file conda/environments/dev_cuda-125_arch-x86_64.yaml --prune`") try: import openai diff --git a/morpheus/messages/multi_message.py b/morpheus/messages/multi_message.py index 44e1bb6cba..eb8f1863bf 100644 --- a/morpheus/messages/multi_message.py +++ b/morpheus/messages/multi_message.py @@ -291,7 +291,16 @@ def set_meta(self, columns: typing.Union[None, str, typing.List[str]], value): saved_index = df.index df.reset_index(drop=True, inplace=True) - df.loc[df.index[row_indexer], columns] = value + + # TODO: when value is strings, make all values empty strings + from cudf.api.types import is_string_dtype + + if is_string_dtype(cudf.Series(value)): + df[columns] = "" + df[columns].iloc[row_indexer] = value + else: + df.loc[df.index[row_indexer], columns] = value + df.set_index(saved_index, inplace=True) else: # Need to determine the boolean mask to use indexes with df.loc diff --git a/morpheus/parsers/event_parser.py b/morpheus/parsers/event_parser.py index 998232c130..a82785b48a 100644 --- a/morpheus/parsers/event_parser.py +++ b/morpheus/parsers/event_parser.py @@ -98,7 +98,7 @@ def parse_raw_event(self, text: cudf.Series, event_regex: typing.Dict[str, any]) # Applies regex pattern for each expected output column to raw data for col in event_specific_columns: regex_pattern = event_regex.get(col) - extracted_gdf = text.str.extract(regex_pattern) + extracted_gdf = text.str.extract(regex_pattern).reset_index() if not extracted_gdf.empty: parsed_gdf[col] = extracted_gdf[0] diff --git a/morpheus/stages/input/arxiv_source.py b/morpheus/stages/input/arxiv_source.py index c1ed77c0cb..dc03858c93 100644 --- a/morpheus/stages/input/arxiv_source.py +++ b/morpheus/stages/input/arxiv_source.py @@ -37,7 +37,7 @@ IMPORT_ERROR_MESSAGE = ( "ArxivSource requires additional dependencies to be installed. Install them by running the following command: " "`conda env update --solver=libmamba -n morpheus" - "--file conda/environments/all_cuda-121_arch-x86_64.yaml --prune`") + "--file conda/environments/all_cuda-125_arch-x86_64.yaml --prune`") @register_stage("from-arxiv") @@ -47,7 +47,7 @@ class ArxivSource(PreallocatorMixin, SingleOutputSource): This stage requires several additional dependencies to be installed. 
Install them by running the following command: `conda env update --solver=libmamba -n morpheus " - "--file conda/environments/all_cuda-121_arch-x86_64.yaml --prune` + "--file conda/environments/all_cuda-125_arch-x86_64.yaml --prune` Parameters ---------- diff --git a/tests/benchmarks/README.md b/tests/benchmarks/README.md index 905e6e30b5..afcb4ffe6f 100644 --- a/tests/benchmarks/README.md +++ b/tests/benchmarks/README.md @@ -200,5 +200,5 @@ You can use the same Dev container created here to run the Production DFP benchm ```bash mamba env update \ -n ${CONDA_DEFAULT_ENV} \ - --file ./conda/environments/examples_cuda-121_arch-x86_64.yaml + --file ./conda/environments/examples_cuda-125_arch-x86_64.yaml ``` diff --git a/tests/conftest.py b/tests/conftest.py index 732dee996c..733dd65d43 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1051,7 +1051,7 @@ def nemollm_fixture(fail_missing: bool): """ skip_reason = ("Tests for the NeMoLLMService require the nemollm package to be installed, to install this run:\n" "`conda env update --solver=libmamba -n morpheus " - "--file conda/environments/all_cuda-121_arch-x86_64.yaml --prune`") + "--file conda/environments/all_cuda-125_arch-x86_64.yaml --prune`") yield import_or_skip("nemollm", reason=skip_reason, fail_missing=fail_missing) @@ -1063,7 +1063,7 @@ def nvfoundationllm_fixture(fail_missing: bool): skip_reason = ( "Tests for NVFoundation require the langchain-nvidia-ai-endpoints package to be installed, to install this " "run:\n `conda env update --solver=libmamba -n morpheus " - "--file conda/environments/all_cuda-121_arch-x86_64.yaml --prune`") + "--file conda/environments/all_cuda-125_arch-x86_64.yaml --prune`") yield import_or_skip("langchain_nvidia_ai_endpoints", reason=skip_reason, fail_missing=fail_missing) @@ -1074,7 +1074,7 @@ def openai_fixture(fail_missing: bool): """ skip_reason = ("Tests for the OpenAIChatService require the openai package to be installed, to install this run:\n" "`conda env update --solver=libmamba -n morpheus " - "--file conda/environments/all_cuda-121_arch-x86_64.yaml --prune`") + "--file conda/environments/all_cuda-125_arch-x86_64.yaml --prune`") yield import_or_skip("openai", reason=skip_reason, fail_missing=fail_missing) diff --git a/tests/examples/llm/common/conftest.py b/tests/examples/llm/common/conftest.py index 259b535304..591ed21cba 100644 --- a/tests/examples/llm/common/conftest.py +++ b/tests/examples/llm/common/conftest.py @@ -60,5 +60,5 @@ def langchain_fixture(fail_missing: bool): skip_reason = ("Tests for the WebScraperStage require the langchain package to be installed, to install this run:\n" "`conda env update --solver=libmamba -n morpheus " - "--file conda/environments/dev_cuda-121_arch-x86_64.yaml --prune`") + "--file conda/environments/dev_cuda-125_arch-x86_64.yaml --prune`") yield import_or_skip("langchain", reason=skip_reason, fail_missing=fail_missing) diff --git a/tests/stages/arxiv/conftest.py b/tests/stages/arxiv/conftest.py index 0865faada0..6166cb1250 100644 --- a/tests/stages/arxiv/conftest.py +++ b/tests/stages/arxiv/conftest.py @@ -23,7 +23,7 @@ SKIP_REASON = ("Tests for the arxiv_source require a number of packages not installed in the Morpheus development " "environment. 
To install these run:\n" "`conda env update --solver=libmamba -n morpheus " - "--file conda/environments/dev_cuda-121_arch-x86_64.yaml --prune`") + "--file conda/environments/dev_cuda-125_arch-x86_64.yaml --prune`") @pytest.fixture(name="arxiv", autouse=True, scope='session') From 39954e42bc109391cd0d02bbc8c70a0f1ccdc602 Mon Sep 17 00:00:00 2001 From: Christopher Harris Date: Mon, 9 Sep 2024 17:46:30 +0000 Subject: [PATCH 07/12] updates for rapids-24.06 --- conda/environments/all_cuda-125_arch-x86_64.yaml | 3 +-- conda/environments/dev_cuda-125_arch-x86_64.yaml | 1 - conda/environments/examples_cuda-125_arch-x86_64.yaml | 2 +- conda/environments/model-utils_cuda-125_arch-x86_64.yaml | 2 +- dependencies.yaml | 3 +-- 5 files changed, 4 insertions(+), 7 deletions(-) diff --git a/conda/environments/all_cuda-125_arch-x86_64.yaml b/conda/environments/all_cuda-125_arch-x86_64.yaml index 4b403c88ae..5d392aea16 100644 --- a/conda/environments/all_cuda-125_arch-x86_64.yaml +++ b/conda/environments/all_cuda-125_arch-x86_64.yaml @@ -32,7 +32,7 @@ dependencies: - cuda-sanitizer-api - cuda-version=12.5 - cudf=24.06 -- cuml=24.06.6 +- cuml=24.06 - cupy - cxx-compiler - cython=3.0 @@ -116,7 +116,6 @@ dependencies: - watchdog=3.0 - websockets - yapf=0.40.1 -- zlib=1.2.13 - pip: - --extra-index-url https://download.pytorch.org/whl/cu124 - --find-links https://data.dgl.ai/wheels-test/repo.html diff --git a/conda/environments/dev_cuda-125_arch-x86_64.yaml b/conda/environments/dev_cuda-125_arch-x86_64.yaml index 0a3bd79236..da42e11f9c 100644 --- a/conda/environments/dev_cuda-125_arch-x86_64.yaml +++ b/conda/environments/dev_cuda-125_arch-x86_64.yaml @@ -96,7 +96,6 @@ dependencies: - watchdog=3.0 - websockets - yapf=0.40.1 -- zlib=1.2.13 - pip: - --extra-index-url https://download.pytorch.org/whl/cu124 - databricks-cli < 0.100 diff --git a/conda/environments/examples_cuda-125_arch-x86_64.yaml b/conda/environments/examples_cuda-125_arch-x86_64.yaml index 2578571367..e0cd6a30f1 100644 --- a/conda/environments/examples_cuda-125_arch-x86_64.yaml +++ b/conda/environments/examples_cuda-125_arch-x86_64.yaml @@ -16,7 +16,7 @@ dependencies: - boto3 - click>=8 - cudf=24.06 -- cuml=24.06.6 +- cuml=24.06 - cupy - datacompy=0.10 - dill=0.3.7 diff --git a/conda/environments/model-utils_cuda-125_arch-x86_64.yaml b/conda/environments/model-utils_cuda-125_arch-x86_64.yaml index 5b1f535718..b2dfab6133 100644 --- a/conda/environments/model-utils_cuda-125_arch-x86_64.yaml +++ b/conda/environments/model-utils_cuda-125_arch-x86_64.yaml @@ -9,7 +9,7 @@ channels: - nvidia/label/dev - pytorch dependencies: -- cuml=24.06.6 +- cuml=24.06 - jupyterlab - matplotlib - onnx diff --git a/dependencies.yaml b/dependencies.yaml index d239bb59e1..a74fd54516 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -264,7 +264,6 @@ dependencies: - rdma-core>=48 # Needed for DOCA. 
- scikit-build=0.17.6 - versioneer-518 - - zlib=1.2.13 checks: common: @@ -392,7 +391,7 @@ dependencies: common: - output_types: [conda] packages: - - &cuml cuml=24.06.6 + - &cuml cuml=24.06 - pip - pip: - --find-links https://data.dgl.ai/wheels/cu121/repo.html From 6357df879f01ea12fa44f951f118322d5373a33c Mon Sep 17 00:00:00 2001 From: Christopher Harris Date: Mon, 9 Sep 2024 19:33:19 +0000 Subject: [PATCH 08/12] rapids 24.06 updates --- morpheus/_lib/cudf_helpers.pyx | 1 - tests/dfencoder/test_autoencoder.py | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/morpheus/_lib/cudf_helpers.pyx b/morpheus/_lib/cudf_helpers.pyx index fbda85cf72..6db47c1b13 100644 --- a/morpheus/_lib/cudf_helpers.pyx +++ b/morpheus/_lib/cudf_helpers.pyx @@ -27,7 +27,6 @@ from cudf._lib.pylibcudf.libcudf.io.types cimport table_with_metadata from cudf._lib.pylibcudf.libcudf.table.table_view cimport table_view from cudf._lib.pylibcudf.libcudf.types cimport size_type from cudf._lib.utils cimport data_from_unique_ptr -from cudf._lib.utils cimport get_column_names from cudf._lib.utils cimport table_view_from_table diff --git a/tests/dfencoder/test_autoencoder.py b/tests/dfencoder/test_autoencoder.py index 43a1b7574b..bd02907f92 100755 --- a/tests/dfencoder/test_autoencoder.py +++ b/tests/dfencoder/test_autoencoder.py @@ -503,4 +503,4 @@ def test_auto_encoder_num_only_convergence(train_ae: autoencoder.AutoEncoder): for loss in train_ae.logger.train_fts.values()], axis=0) / len(train_ae.logger.train_fts) # Make sure the model converges with numerical feats only - assert avg_loss[-1] < avg_loss[0] / 2 + assert avg_loss[-1] < avg_loss[0] * 0.75 From 7204629dcd31948f022763e243ba9a727117c2c9 Mon Sep 17 00:00:00 2001 From: Christopher Harris Date: Wed, 18 Sep 2024 19:50:01 +0000 Subject: [PATCH 09/12] update utilities branch --- external/utilities | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/external/utilities b/external/utilities index af13da47e7..ce8410887a 160000 --- a/external/utilities +++ b/external/utilities @@ -1 +1 @@ -Subproject commit af13da47e72419c5ad675df504d5c176d43f3a6e +Subproject commit ce8410887ad229e9c84d216985ada77eda263b49 From b5007a82ecf626a4bc572d39480ab563412203c4 Mon Sep 17 00:00:00 2001 From: Christopher Harris Date: Wed, 18 Sep 2024 19:59:21 +0000 Subject: [PATCH 10/12] fix get-pr-info gha action reference --- .github/workflows/pr.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index 3fb3b618ca..55fabe7961 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -61,7 +61,7 @@ jobs: steps: - name: Get PR Info id: get-pr-info - uses: rapidsai/shared-action-workflows/get-pr-info@branch-23.08 + uses: nv-gha-runners/get-pr-info@branch-23.08 if: ${{ startsWith(github.ref_name, 'pull-request/') }} outputs: is_pr: ${{ startsWith(github.ref_name, 'pull-request/') }} From fccddbb181d0d92bbb77aa2bb4ff5fdaa421cb29 Mon Sep 17 00:00:00 2001 From: Christopher Harris Date: Wed, 18 Sep 2024 20:04:02 +0000 Subject: [PATCH 11/12] fix get-pr-info gha action reference --- .github/workflows/pr.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index 55fabe7961..b4cb867e78 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -61,7 +61,7 @@ jobs: steps: - name: Get PR Info id: get-pr-info - uses: nv-gha-runners/get-pr-info@branch-23.08 + uses: nv-gha-runners/get-pr-info@main if: ${{ 
startsWith(github.ref_name, 'pull-request/') }} outputs: is_pr: ${{ startsWith(github.ref_name, 'pull-request/') }} From c9b7dcd051e35dbf614ccf6391fccb68b9e16834 Mon Sep 17 00:00:00 2001 From: Christopher Harris Date: Wed, 18 Sep 2024 15:46:28 -0500 Subject: [PATCH 12/12] . --- .github/workflows/pr.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index b4cb867e78..de4b43f66f 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -49,7 +49,7 @@ jobs: - checks - ci_pipe secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@branch-24.06 + uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@branch-24.10 prepare: # Executes the get-pr-info action to determine if the PR has the skip-ci label, if the action fails we assume the @@ -76,7 +76,7 @@ jobs: # Only run the CI pipeline if the PR does not have the skip-ci label and we are on a PR branch if: ${{ !fromJSON(needs.prepare.outputs.has_skip_ci_label) && fromJSON(needs.prepare.outputs.is_pr )}} secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@branch-24.06 + uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@branch-24.10 with: enable_check_generated_files: false
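
The `conda/environments/*_cuda-125_arch-x86_64.yaml` files renamed and edited throughout this series follow the generator schema visible in `dependencies.yaml` (the `files:`/`includes:`/`matrix:` layout), so they would normally be regenerated from `dependencies.yaml` rather than edited by hand. Below is a minimal sketch of that regeneration step, assuming the standard `rapids-dependency-file-generator` CLI used with this schema; the install method and default behavior shown here are assumptions and are not taken from this patch.

```bash
# Assumed regeneration workflow (not part of this patch): from the repo root,
# rebuild every environment file declared under `files:` in dependencies.yaml
# (all, dev, examples, runtime, model-utils) instead of editing them one by one.
pip install rapids-dependency-file-generator   # assumption: generator installed from PyPI
rapids-dependency-file-generator               # assumption: reads ./dependencies.yaml and rewrites conda/environments/*.yaml
```

After regeneration, the `conda env update --solver=libmamba -n morpheus --file conda/environments/dev_cuda-125_arch-x86_64.yaml --prune` commands quoted in the READMEs above would pick up the CUDA 12.5 pins from the rebuilt files.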