diff --git a/docs/source/developer_guide/guides/6_digital_fingerprinting_reference.md b/docs/source/developer_guide/guides/6_digital_fingerprinting_reference.md index 19965504d7..fa3a37dd61 100644 --- a/docs/source/developer_guide/guides/6_digital_fingerprinting_reference.md +++ b/docs/source/developer_guide/guides/6_digital_fingerprinting_reference.md @@ -35,7 +35,7 @@ from morpheus.config import Config from morpheus.config import ConfigAutoEncoder from morpheus.config import CppConfig from morpheus.cli.utils import get_package_relative_file -from morpheus.cli.utils import load_labels_file +from morpheus.utils.file_utils import load_labels_file ``` ```python CppConfig.set_should_use_cpp(False) diff --git a/examples/digital_fingerprinting/production/grafana/run.py b/examples/digital_fingerprinting/production/grafana/run.py index f5768eab1c..2bb7ade0e4 100644 --- a/examples/digital_fingerprinting/production/grafana/run.py +++ b/examples/digital_fingerprinting/production/grafana/run.py @@ -40,7 +40,6 @@ from morpheus.cli.utils import get_log_levels from morpheus.cli.utils import get_package_relative_file -from morpheus.cli.utils import load_labels_file from morpheus.cli.utils import parse_log_level from morpheus.common import FileTypes from morpheus.common import FilterSource @@ -60,6 +59,7 @@ from morpheus.utils.column_info import RenameColumn from morpheus.utils.column_info import StringCatColumn from morpheus.utils.file_utils import date_extractor +from morpheus.utils.file_utils import load_labels_file from morpheus.utils.logger import configure_logging diff --git a/examples/digital_fingerprinting/production/morpheus/dfp/utils/config_generator.py b/examples/digital_fingerprinting/production/morpheus/dfp/utils/config_generator.py index 927c9bfbb7..e6726b18c4 100644 --- a/examples/digital_fingerprinting/production/morpheus/dfp/utils/config_generator.py +++ b/examples/digital_fingerprinting/production/morpheus/dfp/utils/config_generator.py @@ -21,11 +21,11 @@ from 
dfp.utils.schema_utils import Schema from morpheus.cli.utils import get_package_relative_file -from morpheus.cli.utils import load_labels_file from morpheus.config import Config from morpheus.config import ConfigAutoEncoder from morpheus.config import CppConfig from morpheus.messages import ControlMessage +from morpheus.utils.file_utils import load_labels_file from morpheus.utils.module_ids import MORPHEUS_MODULE_NAMESPACE diff --git a/examples/digital_fingerprinting/production/morpheus/dfp_azure_pipeline.py b/examples/digital_fingerprinting/production/morpheus/dfp_azure_pipeline.py index 81de60094d..e35c3d5f02 100644 --- a/examples/digital_fingerprinting/production/morpheus/dfp_azure_pipeline.py +++ b/examples/digital_fingerprinting/production/morpheus/dfp_azure_pipeline.py @@ -38,7 +38,6 @@ from morpheus.cli.utils import get_log_levels from morpheus.cli.utils import get_package_relative_file -from morpheus.cli.utils import load_labels_file from morpheus.cli.utils import parse_log_level from morpheus.common import FileTypes from morpheus.common import FilterSource @@ -58,6 +57,7 @@ from morpheus.utils.column_info import RenameColumn from morpheus.utils.column_info import StringCatColumn from morpheus.utils.file_utils import date_extractor +from morpheus.utils.file_utils import load_labels_file from morpheus.utils.logger import configure_logging diff --git a/examples/digital_fingerprinting/production/morpheus/dfp_duo_pipeline.py b/examples/digital_fingerprinting/production/morpheus/dfp_duo_pipeline.py index 4ba43aced1..4f8333d632 100644 --- a/examples/digital_fingerprinting/production/morpheus/dfp_duo_pipeline.py +++ b/examples/digital_fingerprinting/production/morpheus/dfp_duo_pipeline.py @@ -38,7 +38,6 @@ from morpheus.cli.utils import get_log_levels from morpheus.cli.utils import get_package_relative_file -from morpheus.cli.utils import load_labels_file from morpheus.cli.utils import parse_log_level from morpheus.common import FileTypes from morpheus.common 
import FilterSource @@ -59,6 +58,7 @@ from morpheus.utils.column_info import RenameColumn from morpheus.utils.column_info import StringCatColumn from morpheus.utils.file_utils import date_extractor +from morpheus.utils.file_utils import load_labels_file from morpheus.utils.logger import configure_logging diff --git a/examples/digital_fingerprinting/production/morpheus/notebooks/dfp_azure_inference.ipynb b/examples/digital_fingerprinting/production/morpheus/notebooks/dfp_azure_inference.ipynb index 8e5413f71c..1b40052f04 100644 --- a/examples/digital_fingerprinting/production/morpheus/notebooks/dfp_azure_inference.ipynb +++ b/examples/digital_fingerprinting/production/morpheus/notebooks/dfp_azure_inference.ipynb @@ -1,497 +1,497 @@ { - "cells": [ - { - "cell_type": "markdown", - "id": "2941e94f-db20-44a5-ab87-2cab499825f7", - "metadata": {}, - "source": [ - "# Digital Finger Printing (DFP) with Morpheus - Azure Inference\n", - "## Introduction\n", - "\n", - "In this notebook, we will be building and running a DFP pipeline that performs inference on Azure logs. The goal is to use the pretrained models generated in the Duo Training notebook to generate anomaly scores for each log. These anomaly scores can be used by security teams to detect abnormal behavior when it happens so the proper action can be taken.\n", - "\n", - "
\n", - "Note: For more information on DFP, the Morpheus pipeline, and setup steps to run this notebook, please refer to the coresponding DFP training materials.\n", - "
" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b6c1cb50-74f2-445d-b865-8c22c3b3798b", - "metadata": {}, - "outputs": [], - "source": [ - "%load_ext autoreload\n", - "%autoreload 2\n", - "\n", - "# Ensure that the morpheus directory is in the python path. This may not need to be run depending on the environment setup\n", - "import sys\n", - "import os\n", - "sys.path.insert(0, os.path.abspath(\"../../morpheus\"))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "102ce011-3ca3-4f96-a72d-de28fad32003", - "metadata": {}, - "outputs": [], - "source": [ - "import functools\n", - "import logging\n", - "import os\n", - "import typing\n", - "import mlflow\n", - "\n", - "from datetime import datetime\n", - "from functools import partial\n", - "\n", - "from dfp.stages.dfp_file_batcher_stage import DFPFileBatcherStage\n", - "from dfp.stages.dfp_file_to_df import DFPFileToDataFrameStage\n", - "from dfp.stages.dfp_inference_stage import DFPInferenceStage\n", - "from dfp.stages.dfp_postprocessing_stage import DFPPostprocessingStage\n", - "from dfp.stages.dfp_preprocessing_stage import DFPPreprocessingStage\n", - "from dfp.stages.dfp_rolling_window_stage import DFPRollingWindowStage\n", - "from dfp.stages.dfp_split_users_stage import DFPSplitUsersStage\n", - "from dfp.stages.multi_file_source import MultiFileSource\n", - "from dfp.utils.regex_utils import iso_date_regex\n", - "\n", - "from morpheus.common import FileTypes\n", - "from morpheus.common import FilterSource\n", - "from morpheus.cli.utils import get_log_levels\n", - "from morpheus.cli.utils import get_package_relative_file\n", - "from morpheus.cli.utils import load_labels_file\n", - "from morpheus.cli.utils import parse_log_level\n", - "from morpheus.config import Config\n", - "from morpheus.config import ConfigAutoEncoder\n", - "from morpheus.config import CppConfig\n", - "from morpheus.pipeline import LinearPipeline\n", - "from 
morpheus.stages.output.write_to_file_stage import WriteToFileStage\n", - "from morpheus.utils.column_info import ColumnInfo\n", - "from morpheus.utils.column_info import DataFrameInputSchema\n", - "from morpheus.utils.column_info import DateTimeColumn\n", - "from morpheus.utils.column_info import DistinctIncrementColumn\n", - "from morpheus.utils.column_info import IncrementColumn\n", - "from morpheus.utils.column_info import RenameColumn\n", - "from morpheus.utils.column_info import StringCatColumn\n", - "from morpheus.utils.file_utils import date_extractor\n", - "from morpheus.stages.postprocess.filter_detections_stage import FilterDetectionsStage\n", - "from morpheus.stages.postprocess.serialize_stage import SerializeStage\n", - "from morpheus.utils.logger import configure_logging\n", - "\n", - "# Left align all tables\n", - "from IPython.core.display import HTML\n", - "table_css = 'table {align:left;display:block}'\n", - "HTML(''.format(table_css))" - ] - }, - { - "cell_type": "markdown", - "id": "ca38c1b7-ce84-43e0-ac53-280562dc1642", - "metadata": {}, - "source": [ - "## High Level Configuration\n", - "\n", - "The following options significantly alter the functionality of the pipeline. These options are separated from the individual stage options since they may effect more than one stage. Additionally, the matching python script to this notebook, `dfp_azure_pipeline.py`, configures these options via command line arguments.\n", - "\n", - "### Options\n", - "\n", - "| Name | Type | Description |\n", - "| --- | --- | :-- |\n", - "| `train_users` | One of `[\"none\"]` | For inference, this option should always be `\"none\"` |\n", - "| `skip_users` | List of strings | Any user in this list will be dropped from the pipeline. Useful for debugging to remove automated accounts with many logs. |\n", - "| `cache_dir` | string | The location to store cached files. 
To aid with development and reduce bandwidth, the Morpheus pipeline will cache data from several stages of the pipeline. This option configures the location for those caches. |\n", - "| `input_files` | List of strings | List of files to process. Can specify multiple arguments for multiple files. Also accepts glob (\\*) wildcards and schema prefixes such as `s3://`. For example, to make a local cache of an s3 bucket, use `filecache::s3://mybucket/*`. Refer to `fsspec` documentation for list of possible options. |\n", - "| `model_name_formatter` | string | A format string to use when building the model name. The model name is constructed by calling `model_name_formatter.format(user_id=user_id)`. For example, with `model_name_formatter=\"my_model-{user_id}\"` and a user ID of `\"first:last\"` would result in the model name of `\"my_model-first:last\"`. This should match the value used in `DFPMLFlowModelWriterStage`. Available keyword arguments: `user_id`, `user_md5`. |\n", - "| `experiment_name_formatter` | string | A format string (without the `f`) that will be used when creating an experiment in ML Flow. Available keyword arguments: `user_id`, `user_md5`, `reg_model_name`. 
|\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "9ee00703-75c5-46fc-890c-86733da906c4", - "metadata": {}, - "outputs": [], - "source": [ - "# Global options\n", - "train_users = \"none\"\n", - "\n", - "# Enter any users to skip here\n", - "skip_users: typing.List[str] = []\n", - "\n", - "# Location where cache objects will be saved\n", - "cache_dir = \"./.cache/dfp\"\n", - "\n", - "# Input files to read from\n", - "input_files = [\n", - " \"../../../../data/dfp/azure-inference-data/AZUREAD_*.json\",\n", - "]\n", - "\n", - "# The format to use for models\n", - "model_name_formatter = \"DFP-azure-{user_id}\"\n", - "\n", - "# === Derived Options ===\n", - "# To include the generic, we must be training all or generic\n", - "include_generic = train_users == \"all\" or train_users == \"generic\"\n", - "\n", - "# To include individual, we must be either training or inferring\n", - "include_individual = train_users != \"generic\"\n", - "\n", - "# None indicates we arent training anything\n", - "is_training = train_users != \"none\"\n", - "\n", - "# Tracking URI\n", - "tracking_uri = \"http://mlflow:5000\"" - ] - }, - { - "cell_type": "markdown", - "id": "9b586016", - "metadata": {}, - "source": [ - "### Set MLFlow Tracking URI\n", - "Set MLFlow tracking URI to make inference calls." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "5ea82337", - "metadata": {}, - "outputs": [], - "source": [ - "mlflow.set_tracking_uri(tracking_uri)" - ] - }, - { - "cell_type": "markdown", - "id": "1cfc24c9-c85e-4977-a348-692c8f0aceaa", - "metadata": {}, - "source": [ - "### Global Config Object\n", - "Before creating the pipeline, we need to setup logging and set the parameters for the Morpheus config object. This config object is responsible for the following:\n", - " - Indicating whether to use C++ or Python stages\n", - " - C++ stages are not supported for the DFP pipeline. 
This should always be `False`\n", - " - Setting the number of threads to use in the pipeline. Defaults to the thread count of the OS.\n", - " - Sets the feature column names that will be used in model training\n", - " - This option allows extra columns to be used in the pipeline that will not be part of the training algorithm.\n", - " - The final features that the model will be trained on will be an intersection of this list with the log columns.\n", - " - The column name that indicates the user's unique identifier\n", - " - It is required for DFP to have a user ID column\n", - " - The column name that indicates the timestamp for the log\n", - " - It is required for DFP to know when each log occurred" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "01abd537-9162-49dc-8e83-d9465592f1d5", - "metadata": {}, - "outputs": [], - "source": [ - "# Enable the Morpheus logger\n", - "configure_logging(log_level=logging.DEBUG)\n", - "\n", - "config = Config()\n", - "\n", - "CppConfig.set_should_use_cpp(False)\n", - "\n", - "config.num_threads = len(os.sched_getaffinity(0))\n", - "\n", - "config.ae = ConfigAutoEncoder()\n", - "\n", - "config.ae.feature_columns = [\n", - " \"appDisplayName\", \"clientAppUsed\", \"deviceDetailbrowser\", \"deviceDetaildisplayName\", \"deviceDetailoperatingSystem\", \"statusfailureReason\", \"appincrement\", \"locincrement\", \"logcount\",\n", - "]\n", - "config.ae.userid_column_name = \"username\"\n", - "config.ae.timestamp_column_name = \"timestamp\"" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a73a4d53-32b6-4ab8-a5d7-c0104b31c69b", - "metadata": {}, - "outputs": [], - "source": [ - "# Specify the column names to ensure all data is uniform\n", - "source_column_info = [\n", - " DateTimeColumn(name=config.ae.timestamp_column_name, dtype=datetime, input_name=\"time\"),\n", - " RenameColumn(name=config.ae.userid_column_name, dtype=str, input_name=\"properties.userPrincipalName\"),\n", - " 
RenameColumn(name=\"appDisplayName\", dtype=str, input_name=\"properties.appDisplayName\"),\n", - " ColumnInfo(name=\"category\", dtype=str),\n", - " RenameColumn(name=\"clientAppUsed\", dtype=str, input_name=\"properties.clientAppUsed\"),\n", - " RenameColumn(name=\"deviceDetailbrowser\", dtype=str, input_name=\"properties.deviceDetail.browser\"),\n", - " RenameColumn(name=\"deviceDetaildisplayName\", dtype=str, input_name=\"properties.deviceDetail.displayName\"),\n", - " RenameColumn(name=\"deviceDetailoperatingSystem\",\n", - " dtype=str,\n", - " input_name=\"properties.deviceDetail.operatingSystem\"),\n", - " StringCatColumn(name=\"location\",\n", - " dtype=str,\n", - " input_columns=[\n", - " \"properties.location.city\",\n", - " \"properties.location.countryOrRegion\",\n", - " ],\n", - " sep=\", \"),\n", - " RenameColumn(name=\"statusfailureReason\", dtype=str, input_name=\"properties.status.failureReason\"),\n", - "]\n", - "\n", - "source_schema = DataFrameInputSchema(json_columns=[\"properties\"], column_info=source_column_info)\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "f7a0cb0a-e65a-444a-a06c-a4525d543790", - "metadata": {}, - "outputs": [], - "source": [ - "# Preprocessing schema\n", - "preprocess_column_info = [\n", - " ColumnInfo(name=config.ae.timestamp_column_name, dtype=datetime),\n", - " ColumnInfo(name=config.ae.userid_column_name, dtype=str),\n", - " ColumnInfo(name=\"appDisplayName\", dtype=str),\n", - " ColumnInfo(name=\"clientAppUsed\", dtype=str),\n", - " ColumnInfo(name=\"deviceDetailbrowser\", dtype=str),\n", - " ColumnInfo(name=\"deviceDetaildisplayName\", dtype=str),\n", - " ColumnInfo(name=\"deviceDetailoperatingSystem\", dtype=str),\n", - " ColumnInfo(name=\"statusfailureReason\", dtype=str),\n", - "\n", - " # Derived columns\n", - " IncrementColumn(name=\"logcount\",\n", - " dtype=int,\n", - " input_name=config.ae.timestamp_column_name,\n", - " groupby_column=config.ae.userid_column_name),\n", - " 
DistinctIncrementColumn(name=\"locincrement\",\n", - " dtype=int,\n", - " input_name=\"location\",\n", - " groupby_column=config.ae.userid_column_name,\n", - " timestamp_column=config.ae.timestamp_column_name),\n", - " DistinctIncrementColumn(name=\"appincrement\",\n", - " dtype=int,\n", - " input_name=\"appDisplayName\",\n", - " groupby_column=config.ae.userid_column_name,\n", - " timestamp_column=config.ae.timestamp_column_name)\n", - "]\n", - "\n", - "preprocess_schema = DataFrameInputSchema(column_info=preprocess_column_info, preserve_columns=[\"_batch_id\"])\n" - ] - }, - { - "cell_type": "markdown", - "id": "bdfc59de-dea8-4e5f-98e3-98eba1e4621d", - "metadata": {}, - "source": [ - "## Pipeline Construction\n", - "From this point on we begin constructing the stages that will make up the pipeline. To make testing easier, constructing the pipeline object, adding the stages, and running the pipeline, is provided as a single cell. The below cell can be rerun multiple times as needed for debugging.\n", - "\n", - "### Source Stage (`MultiFileSource`)\n", - "\n", - "This pipeline read input logs from one or more input files. This source stage will construct a list of files to be processed and pass to downstream stages. It is capable of reading files from many different source types, both local and remote. This is possible by utilizing the `fsspec` library (similar to `pandas`). Refer to the [`fsspec`](https://filesystem-spec.readthedocs.io/) documentation for more information on the supported file types. Once all of the logs have been read, the source completes. \n", - "\n", - "| Name | Type | Default | Description |\n", - "| --- | --- | --- | :-- |\n", - "| `filenames` | List of strings | | Any files to read into the pipeline. 
All files will be combined into a single `DataFrame` |\n", - "\n", - "### File Batcher Stage (`DFPFileBatcherStage`)\n", - "\n", - "To improve performance, multiple small input files can be batched together into a single DataFrame for processing. This stage is responsible for determining the timestamp of input files, grouping input files into batches by time, and sending the batches to be processed into a single DataFrame. Repeated batches of files will be loaded from cache resulting in increased performance. For example, when performaing a 60 day training run, 59 days can be cached with a period of `\"D\"` and retraining once per day.\n", - "\n", - "| Name | Type | Default | Description |\n", - "| --- | --- | --- | :-- |\n", - "| `period` | `str` | `\"D\"` | The period to create batches. Refer to `pandas` windowing frequency documentation for available options. |\n", - "| `date_conversion_func` | Function of `typing.Callable[[fsspec.core.OpenFile], datetime]` | | A callback which is responsible for determining the date for a specified file. |\n", - "\n", - "### File to DataFrame Stage (`DFPFileToDataFrameStage`)\n", - "\n", - "After files have been batched into groups, this stage is responsible for reading the files and converting into a DataFrame. The specified input schema converts the raw DataFrame into one suitable for caching and processing. Any columns that are not needed should be excluded from the schema.\n", - "\n", - "| Name | Type | Default | Description |\n", - "| --- | --- | --- | :-- |\n", - "| `schema` | `DataFrameInputSchema` | | After the raw `DataFrame` is read from each file, this schema will be applied to ensure a consisten output from the source. |\n", - "| `file_type` | `FileTypes` | `FileTypes.Auto` | Allows overriding the file type. When set to `Auto`, the file extension will be used. Options are `CSV`, `JSON`, `PARQUET`, `Auto`. |\n", - "| `parser_kwargs` | `dict` | `{}` | This dictionary will be passed to the `DataFrame` parser class. 
Allows for customization of log parsing. |\n", - "| `cache_dir` | `str` | `./.cache/dfp` | The location to write cached input files to. |\n", - "\n", - "### Split Users Stage (`DFPSplitUsersStage`)\n", - "\n", - "Once the input logs have been read into a `DataFrame`, this stage is responsible for breaking that single `DataFrame` with many users into multiple `DataFrame`s for each user. This is also where the pipeline chooses whether to train individual users or the generic user (or both).\n", - "\n", - "| Name | Type | Default | Description |\n", - "| --- | --- | --- | :-- |\n", - "| `include_generic` | `bool` | | Whether or not to combine all user logs into a single `DataFrame` with the username 'generic_user' |\n", - "| `include_individual` | `bool` | | Whether or not to output individual `DataFrame` objects for each user |\n", - "| `skip_users` | List of `str` | `[]` | Any users to remove from the `DataFrame`. Useful for debugging to remove automated accounts with many logs. Mutually exclusive with `only_users`. |\n", - "| `only_users` | List of `str` | `[]` | Only allow these users in the final `DataFrame`. Useful for debugging to focus on specific users. Mutually exclusive with `skip_users`. |\n", - "\n", - "### Rolling Window Stage (`DFPRollingWindowStage`)\n", - "\n", - "The Rolling Window Stage performs several key pieces of functionality for DFP.\n", - "1. This stage keeps a moving window of logs on a per user basis\n", - " 1. These logs are saved to disk to reduce memory requirements between logs from the same user\n", - "1. It only emits logs when the window history requirements are met\n", - " 1. Until all of the window history requirements are met, no messages will be sent to the rest of the pipeline.\n", - " 1. Configuration options for defining the window history requirements are detailed below.\n", - "1. It repeats the necessary logs to properly calculate log dependent features.\n", - " 1. 
To support all column feature types, incoming log messages can be combined with existing history and sent to downstream stages.\n", - " 1. For example, to calculate a feature that increments a counter for the number of logs a particular user has generated in a single day, we must have the user's log history for the past 24 hours. To support this, this stage will combine new logs with existing history into a single `DataFrame`.\n", - " 1. It is the responsibility of downstream stages to distinguish between new logs and existing history.\n", - "\n", - "| Name | Type | Default | Description |\n", - "| --- | --- | --- | :-- |\n", - "| `min_history` | `int` | `1` | The minimum number of logs a user must have before emitting any messages. Logs below this threshold will be saved to disk. |\n", - "| `min_increment` | `int` or `str` | `0` | Once the min history requirement is met, this stage must receive `min_increment` *new* logs before emmitting another message. Logs received before this threshold is met will be saved to disk. Can be specified as an integer count or a string duration. |\n", - "| `max_history` | `int` or `str` | `\"1d\"` | Once `min_history` and `min_increment` requirements have been met, this puts an upper bound on the maximum number of messages to forward into the pipeline and also the maximum amount of messages to retain in the history. Can be specified as an integer count or a string duration. |\n", - "| `cache_dir` | `str` | `./.cache/dfp` | The location to write log history to disk. |\n", - "\n", - "### Preprocessing Stage (`DFPPreprocessingStage`)\n", - "\n", - "This stage performs the final, row dependent, feature calculations as specified by the input schema object. 
Once calculated, this stage can forward on all received logs, or optionally can only forward on new logs, removing any history information.\n", - "\n", - "| Name | Type | Default | Description |\n", - "| --- | --- | --- | :-- |\n", - "| `input_schema` | `DataFrameInputSchema` | | The final, row dependent, schema to apply to the incoming columns |\n", - "\n", - "### Inference Stage (`DFPInference`)\n", - "\n", - "This stage performs several tasks to aid in performing inference. This stage will:\n", - "1. Download models as needed from MLFlow\n", - "1. Cache previously downloaded models to improve performance\n", - " 1. Models in the cache will be periodically refreshed from MLFlow at a configured rate\n", - "1. Perform inference using the downloaded model\n", - "\n", - "| Name | Type | Default | Description |\n", - "| --- | --- | --- | :-- |\n", - "| `model_name_formatter` | `str` | `\"\"` | A format string to use when building the model name. The model name is constructed by calling `model_name_formatter.format(user_id=user_id)`. For example, with `model_name_formatter=\"my_model-{user_id}\"` and a user ID of `\"first:last\"` would result in the model name of `\"my_model-first:last\"`. This should match the value used in `DFPMLFlowModelWriterStage` |\n", - "\n", - "### Filter Detection Stage (`FilterDetectionsStage`)\n", - "This stage filters the output from the inference stage for any anomalous messages. Logs which exceed the specified Z-Score will be passed onto the next stage. All remaining logs which are below the threshold will be dropped. For the purposes of the DFP pipeline, this stage is configured to use the `mean_abs_z` column of the DataFrame as the filter criteria.\n", - "\n", - "| Name | Type | Default | Description |\n", - "| --- | --- | --- | :-- |\n", - "| `threshold` | `float` | `0.5` | The threshold value above which logs are considered to be anomalous. The default is `0.5`, however the DFP pipeline uses a value of `2.0`. 
All normal logs will be filtered out and anomalous logs will be passed on. |\n", - "| `copy` | `bool` | `True` | When the `copy` argument is `True` (default), rows that meet the filter criteria are copied into a new dataframe. When `False` sliced views are used instead. This is a performance optimization, and has no functional impact. |\n", - "| `filter_source` | `FilterSource` | `FilterSource.Auto` | Indicates if the filter criteria exists in an output tensor (`FilterSource.TENSOR`) or a column in a DataFrame (`FilterSource.DATAFRAME`). |\n", - "| `field_name` | `str` | `probs` | Name of the tensor (`filter_source=FilterSource.TENSOR`) or DataFrame column (`filter_source=FilterSource.DATAFRAME`) to use as the filter criteria. |\n", - "\n", - "### Post Processing Stage (`DFPPostprocessingStage`)\n", - "This stage adds a new `event_time` column to the DataFrame indicating the time which Morpheus detected the anomalous messages, and replaces any `NAN` values with the a string value of `'NaN'`.\n", - "\n", - "### Serialize Stage (`SerializeStage`)\n", - "This stage controls which columns in the DataFrame will be included in the output. For the purposes of the DFP pipeline, we will exclude columns that are used internally by the pipeline which are not of interest to the end-user.\n", - "\n", - "| Name | Type | Default | Description |\n", - "| --- | --- | --- | :-- |\n", - "| `include` | List of `str` | `[]` | List of regular expression patterns matching columns to include in the output. Specifying an empty list causes all columns to be included not explicitly excluded. |\n", - "| `exclude` | List of `str` | `[r'^ID$', r'^_ts_']` | List of regular expression patterns matching columns to exclude from the output. |\n", - "| `fixed_columns` | `bool` | `True` | When `True` it is assumed that the Dataframe in all messages contain the same columns as the first message received. 
|\n", - "\n", - "### Write to File Stage (`WriteToFileStage`)\n", - "This final stage will write all received messages to a single output file in either CSV or JSON format.\n", - "\n", - "| Name | Type | Default | Description |\n", - "| --- | --- | --- | :-- |\n", - "| `filename` | `str` | | The file to write anomalous log messages to. |\n", - "| `overwrite` | `bool` | `False` | If the file specified in `filename` already exists, it will be overwritten if this option is set to `True` |" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "825390ad-ce64-4949-b324-33039ffdf264", - "metadata": {}, - "outputs": [], - "source": [ - "# Create a linear pipeline object\n", - "pipeline = LinearPipeline(config)\n", - "\n", - "# Source stage\n", - "pipeline.set_source(MultiFileSource(config, filenames=input_files))\n", - "\n", - "# Batch files into batches by time. Use the default ISO date extractor from the filename\n", - "pipeline.add_stage(\n", - " DFPFileBatcherStage(config,\n", - " period=\"D\",\n", - " date_conversion_func=functools.partial(date_extractor, filename_regex=iso_date_regex)))\n", - "\n", - "# Output is a list of fsspec files. Convert to DataFrames. 
This caches downloaded data\n", - "pipeline.add_stage(\n", - " DFPFileToDataFrameStage(config,\n", - " schema=source_schema,\n", - " file_type=FileTypes.JSON,\n", - " parser_kwargs={\n", - " \"lines\": False, \"orient\": \"records\"\n", - " },\n", - " cache_dir=cache_dir))\n", - "\n", - "\n", - "# This will split users or just use one single user\n", - "pipeline.add_stage(\n", - " DFPSplitUsersStage(config,\n", - " include_generic=include_generic,\n", - " include_individual=include_individual,\n", - " skip_users=skip_users))\n", - "\n", - "# Next, have a stage that will create rolling windows\n", - "pipeline.add_stage(\n", - " DFPRollingWindowStage(\n", - " config,\n", - " min_history=300 if is_training else 1,\n", - " min_increment=300 if is_training else 0,\n", - " # For inference, we only ever want 1 day max\n", - " max_history=\"60d\" if is_training else \"1d\",\n", - " cache_dir=cache_dir))\n", - "\n", - "# Output is UserMessageMeta -- Cached frame set\n", - "pipeline.add_stage(DFPPreprocessingStage(config, input_schema=preprocess_schema))\n", - "\n", - "# Perform inference on the preprocessed data\n", - "pipeline.add_stage(DFPInferenceStage(config, model_name_formatter=model_name_formatter))\n", - "\n", - "# Filter for only the anomalous logs\n", - "pipeline.add_stage(\n", - " FilterDetectionsStage(config, threshold=2.0, filter_source=FilterSource.DATAFRAME, field_name='mean_abs_z'))\n", - "pipeline.add_stage(DFPPostprocessingStage(config))\n", - "\n", - "# Exclude the columns we don't want in our output\n", - "pipeline.add_stage(SerializeStage(config, exclude=['batch_count', 'origin_hash', '_row_hash', '_batch_id']))\n", - "\n", - "# Write all anomalies to a CSV file\n", - "pipeline.add_stage(WriteToFileStage(config, filename=\"dfp_detections_azure.csv\", overwrite=True))\n", - "\n", - "# Run the pipeline\n", - "await pipeline.run_async()\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "39971052-6211-4bfe-82d8-88f5845562bc", - 
"metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "morpheus", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.14" - } - }, - "nbformat": 4, - "nbformat_minor": 5 + "cells": [ + { + "cell_type": "markdown", + "id": "2941e94f-db20-44a5-ab87-2cab499825f7", + "metadata": {}, + "source": [ + "# Digital Finger Printing (DFP) with Morpheus - Azure Inference\n", + "## Introduction\n", + "\n", + "In this notebook, we will be building and running a DFP pipeline that performs inference on Azure logs. The goal is to use the pretrained models generated in the Duo Training notebook to generate anomaly scores for each log. These anomaly scores can be used by security teams to detect abnormal behavior when it happens so the proper action can be taken.\n", + "\n", + "
\n", + "Note: For more information on DFP, the Morpheus pipeline, and setup steps to run this notebook, please refer to the corresponding DFP training materials.\n", + "
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b6c1cb50-74f2-445d-b865-8c22c3b3798b", + "metadata": {}, + "outputs": [], + "source": [ + "%load_ext autoreload\n", + "%autoreload 2\n", + "\n", + "# Ensure that the morpheus directory is in the python path. This may not need to be run depending on the environment setup\n", + "import sys\n", + "import os\n", + "sys.path.insert(0, os.path.abspath(\"../../morpheus\"))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "102ce011-3ca3-4f96-a72d-de28fad32003", + "metadata": {}, + "outputs": [], + "source": [ + "import functools\n", + "import logging\n", + "import os\n", + "import typing\n", + "import mlflow\n", + "\n", + "from datetime import datetime\n", + "from functools import partial\n", + "\n", + "from dfp.stages.dfp_file_batcher_stage import DFPFileBatcherStage\n", + "from dfp.stages.dfp_file_to_df import DFPFileToDataFrameStage\n", + "from dfp.stages.dfp_inference_stage import DFPInferenceStage\n", + "from dfp.stages.dfp_postprocessing_stage import DFPPostprocessingStage\n", + "from dfp.stages.dfp_preprocessing_stage import DFPPreprocessingStage\n", + "from dfp.stages.dfp_rolling_window_stage import DFPRollingWindowStage\n", + "from dfp.stages.dfp_split_users_stage import DFPSplitUsersStage\n", + "from dfp.stages.multi_file_source import MultiFileSource\n", + "from dfp.utils.regex_utils import iso_date_regex\n", + "\n", + "from morpheus.common import FileTypes\n", + "from morpheus.common import FilterSource\n", + "from morpheus.cli.utils import get_log_levels\n", + "from morpheus.cli.utils import get_package_relative_file\n", + "from morpheus.utils.file_utils import load_labels_file\n", + "from morpheus.cli.utils import parse_log_level\n", + "from morpheus.config import Config\n", + "from morpheus.config import ConfigAutoEncoder\n", + "from morpheus.config import CppConfig\n", + "from morpheus.pipeline import LinearPipeline\n", + "from 
morpheus.stages.output.write_to_file_stage import WriteToFileStage\n", + "from morpheus.utils.column_info import ColumnInfo\n", + "from morpheus.utils.column_info import DataFrameInputSchema\n", + "from morpheus.utils.column_info import DateTimeColumn\n", + "from morpheus.utils.column_info import DistinctIncrementColumn\n", + "from morpheus.utils.column_info import IncrementColumn\n", + "from morpheus.utils.column_info import RenameColumn\n", + "from morpheus.utils.column_info import StringCatColumn\n", + "from morpheus.utils.file_utils import date_extractor\n", + "from morpheus.stages.postprocess.filter_detections_stage import FilterDetectionsStage\n", + "from morpheus.stages.postprocess.serialize_stage import SerializeStage\n", + "from morpheus.utils.logger import configure_logging\n", + "\n", + "# Left align all tables\n", + "from IPython.core.display import HTML\n", + "table_css = 'table {align:left;display:block}'\n", + "HTML(''.format(table_css))" + ] + }, + { + "cell_type": "markdown", + "id": "ca38c1b7-ce84-43e0-ac53-280562dc1642", + "metadata": {}, + "source": [ + "## High Level Configuration\n", + "\n", + "The following options significantly alter the functionality of the pipeline. These options are separated from the individual stage options since they may affect more than one stage. Additionally, the matching python script to this notebook, `dfp_azure_pipeline.py`, configures these options via command line arguments.\n", + "\n", + "### Options\n", + "\n", + "| Name | Type | Description |\n", + "| --- | --- | :-- |\n", + "| `train_users` | One of `[\"none\"]` | For inference, this option should always be `\"none\"` |\n", + "| `skip_users` | List of strings | Any user in this list will be dropped from the pipeline. Useful for debugging to remove automated accounts with many logs. |\n", + "| `cache_dir` | string | The location to store cached files. 
To aid with development and reduce bandwidth, the Morpheus pipeline will cache data from several stages of the pipeline. This option configures the location for those caches. |\n", + "| `input_files` | List of strings | List of files to process. Can specify multiple arguments for multiple files. Also accepts glob (\\*) wildcards and schema prefixes such as `s3://`. For example, to make a local cache of an s3 bucket, use `filecache::s3://mybucket/*`. Refer to `fsspec` documentation for list of possible options. |\n", + "| `model_name_formatter` | string | A format string to use when building the model name. The model name is constructed by calling `model_name_formatter.format(user_id=user_id)`. For example, with `model_name_formatter=\"my_model-{user_id}\"` and a user ID of `\"first:last\"` would result in the model name of `\"my_model-first:last\"`. This should match the value used in `DFPMLFlowModelWriterStage`. Available keyword arguments: `user_id`, `user_md5`. |\n", + "| `experiment_name_formatter` | string | A format string (without the `f`) that will be used when creating an experiment in ML Flow. Available keyword arguments: `user_id`, `user_md5`, `reg_model_name`. 
|\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9ee00703-75c5-46fc-890c-86733da906c4", + "metadata": {}, + "outputs": [], + "source": [ + "# Global options\n", + "train_users = \"none\"\n", + "\n", + "# Enter any users to skip here\n", + "skip_users: typing.List[str] = []\n", + "\n", + "# Location where cache objects will be saved\n", + "cache_dir = \"./.cache/dfp\"\n", + "\n", + "# Input files to read from\n", + "input_files = [\n", + " \"../../../../data/dfp/azure-inference-data/AZUREAD_*.json\",\n", + "]\n", + "\n", + "# The format to use for models\n", + "model_name_formatter = \"DFP-azure-{user_id}\"\n", + "\n", + "# === Derived Options ===\n", + "# To include the generic, we must be training all or generic\n", + "include_generic = train_users == \"all\" or train_users == \"generic\"\n", + "\n", + "# To include individual, we must be either training or inferring\n", + "include_individual = train_users != \"generic\"\n", + "\n", + "# None indicates we arent training anything\n", + "is_training = train_users != \"none\"\n", + "\n", + "# Tracking URI\n", + "tracking_uri = \"http://mlflow:5000\"" + ] + }, + { + "cell_type": "markdown", + "id": "9b586016", + "metadata": {}, + "source": [ + "### Set MLFlow Tracking URI\n", + "Set MLFlow tracking URI to make inference calls." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5ea82337", + "metadata": {}, + "outputs": [], + "source": [ + "mlflow.set_tracking_uri(tracking_uri)" + ] + }, + { + "cell_type": "markdown", + "id": "1cfc24c9-c85e-4977-a348-692c8f0aceaa", + "metadata": {}, + "source": [ + "### Global Config Object\n", + "Before creating the pipeline, we need to setup logging and set the parameters for the Morpheus config object. This config object is responsible for the following:\n", + " - Indicating whether to use C++ or Python stages\n", + " - C++ stages are not supported for the DFP pipeline. 
This should always be `False`\n", + " - Setting the number of threads to use in the pipeline. Defaults to the thread count of the OS.\n", + " - Sets the feature column names that will be used in model training\n", + " - This option allows extra columns to be used in the pipeline that will not be part of the training algorithm.\n", + " - The final features that the model will be trained on will be an intersection of this list with the log columns.\n", + " - The column name that indicates the user's unique identifier\n", + " - It is required for DFP to have a user ID column\n", + " - The column name that indicates the timestamp for the log\n", + " - It is required for DFP to know when each log occurred" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "01abd537-9162-49dc-8e83-d9465592f1d5", + "metadata": {}, + "outputs": [], + "source": [ + "# Enable the Morpheus logger\n", + "configure_logging(log_level=logging.DEBUG)\n", + "\n", + "config = Config()\n", + "\n", + "CppConfig.set_should_use_cpp(False)\n", + "\n", + "config.num_threads = len(os.sched_getaffinity(0))\n", + "\n", + "config.ae = ConfigAutoEncoder()\n", + "\n", + "config.ae.feature_columns = [\n", + " \"appDisplayName\", \"clientAppUsed\", \"deviceDetailbrowser\", \"deviceDetaildisplayName\", \"deviceDetailoperatingSystem\", \"statusfailureReason\", \"appincrement\", \"locincrement\", \"logcount\",\n", + "]\n", + "config.ae.userid_column_name = \"username\"\n", + "config.ae.timestamp_column_name = \"timestamp\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a73a4d53-32b6-4ab8-a5d7-c0104b31c69b", + "metadata": {}, + "outputs": [], + "source": [ + "# Specify the column names to ensure all data is uniform\n", + "source_column_info = [\n", + " DateTimeColumn(name=config.ae.timestamp_column_name, dtype=datetime, input_name=\"time\"),\n", + " RenameColumn(name=config.ae.userid_column_name, dtype=str, input_name=\"properties.userPrincipalName\"),\n", + " 
RenameColumn(name=\"appDisplayName\", dtype=str, input_name=\"properties.appDisplayName\"),\n", + " ColumnInfo(name=\"category\", dtype=str),\n", + " RenameColumn(name=\"clientAppUsed\", dtype=str, input_name=\"properties.clientAppUsed\"),\n", + " RenameColumn(name=\"deviceDetailbrowser\", dtype=str, input_name=\"properties.deviceDetail.browser\"),\n", + " RenameColumn(name=\"deviceDetaildisplayName\", dtype=str, input_name=\"properties.deviceDetail.displayName\"),\n", + " RenameColumn(name=\"deviceDetailoperatingSystem\",\n", + " dtype=str,\n", + " input_name=\"properties.deviceDetail.operatingSystem\"),\n", + " StringCatColumn(name=\"location\",\n", + " dtype=str,\n", + " input_columns=[\n", + " \"properties.location.city\",\n", + " \"properties.location.countryOrRegion\",\n", + " ],\n", + " sep=\", \"),\n", + " RenameColumn(name=\"statusfailureReason\", dtype=str, input_name=\"properties.status.failureReason\"),\n", + "]\n", + "\n", + "source_schema = DataFrameInputSchema(json_columns=[\"properties\"], column_info=source_column_info)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f7a0cb0a-e65a-444a-a06c-a4525d543790", + "metadata": {}, + "outputs": [], + "source": [ + "# Preprocessing schema\n", + "preprocess_column_info = [\n", + " ColumnInfo(name=config.ae.timestamp_column_name, dtype=datetime),\n", + " ColumnInfo(name=config.ae.userid_column_name, dtype=str),\n", + " ColumnInfo(name=\"appDisplayName\", dtype=str),\n", + " ColumnInfo(name=\"clientAppUsed\", dtype=str),\n", + " ColumnInfo(name=\"deviceDetailbrowser\", dtype=str),\n", + " ColumnInfo(name=\"deviceDetaildisplayName\", dtype=str),\n", + " ColumnInfo(name=\"deviceDetailoperatingSystem\", dtype=str),\n", + " ColumnInfo(name=\"statusfailureReason\", dtype=str),\n", + "\n", + " # Derived columns\n", + " IncrementColumn(name=\"logcount\",\n", + " dtype=int,\n", + " input_name=config.ae.timestamp_column_name,\n", + " groupby_column=config.ae.userid_column_name),\n", + " 
DistinctIncrementColumn(name=\"locincrement\",\n", + " dtype=int,\n", + " input_name=\"location\",\n", + " groupby_column=config.ae.userid_column_name,\n", + " timestamp_column=config.ae.timestamp_column_name),\n", + " DistinctIncrementColumn(name=\"appincrement\",\n", + " dtype=int,\n", + " input_name=\"appDisplayName\",\n", + " groupby_column=config.ae.userid_column_name,\n", + " timestamp_column=config.ae.timestamp_column_name)\n", + "]\n", + "\n", + "preprocess_schema = DataFrameInputSchema(column_info=preprocess_column_info, preserve_columns=[\"_batch_id\"])\n" + ] + }, + { + "cell_type": "markdown", + "id": "bdfc59de-dea8-4e5f-98e3-98eba1e4621d", + "metadata": {}, + "source": [ + "## Pipeline Construction\n", + "From this point on we begin constructing the stages that will make up the pipeline. To make testing easier, constructing the pipeline object, adding the stages, and running the pipeline, is provided as a single cell. The below cell can be rerun multiple times as needed for debugging.\n", + "\n", + "### Source Stage (`MultiFileSource`)\n", + "\n", + "This pipeline reads input logs from one or more input files. This source stage will construct a list of files to be processed and pass to downstream stages. It is capable of reading files from many different source types, both local and remote. This is possible by utilizing the `fsspec` library (similar to `pandas`). Refer to the [`fsspec`](https://filesystem-spec.readthedocs.io/) documentation for more information on the supported file types. Once all of the logs have been read, the source completes. \n", + "\n", + "| Name | Type | Default | Description |\n", + "| --- | --- | --- | :-- |\n", + "| `filenames` | List of strings | | Any files to read into the pipeline. 
All files will be combined into a single `DataFrame` |\n", + "\n", + "### File Batcher Stage (`DFPFileBatcherStage`)\n", + "\n", + "To improve performance, multiple small input files can be batched together into a single DataFrame for processing. This stage is responsible for determining the timestamp of input files, grouping input files into batches by time, and sending the batches to be processed into a single DataFrame. Repeated batches of files will be loaded from cache resulting in increased performance. For example, when performing a 60 day training run, 59 days can be cached with a period of `\"D\"` and retraining once per day.\n", + "\n", + "| Name | Type | Default | Description |\n", + "| --- | --- | --- | :-- |\n", + "| `period` | `str` | `\"D\"` | The period to create batches. Refer to `pandas` windowing frequency documentation for available options. |\n", + "| `date_conversion_func` | Function of `typing.Callable[[fsspec.core.OpenFile], datetime]` | | A callback which is responsible for determining the date for a specified file. |\n", + "\n", + "### File to DataFrame Stage (`DFPFileToDataFrameStage`)\n", + "\n", + "After files have been batched into groups, this stage is responsible for reading the files and converting into a DataFrame. The specified input schema converts the raw DataFrame into one suitable for caching and processing. Any columns that are not needed should be excluded from the schema.\n", + "\n", + "| Name | Type | Default | Description |\n", + "| --- | --- | --- | :-- |\n", + "| `schema` | `DataFrameInputSchema` | | After the raw `DataFrame` is read from each file, this schema will be applied to ensure a consistent output from the source. |\n", + "| `file_type` | `FileTypes` | `FileTypes.Auto` | Allows overriding the file type. When set to `Auto`, the file extension will be used. Options are `CSV`, `JSON`, `PARQUET`, `Auto`. |\n", + "| `parser_kwargs` | `dict` | `{}` | This dictionary will be passed to the `DataFrame` parser class. 
Allows for customization of log parsing. |\n", + "| `cache_dir` | `str` | `./.cache/dfp` | The location to write cached input files to. |\n", + "\n", + "### Split Users Stage (`DFPSplitUsersStage`)\n", + "\n", + "Once the input logs have been read into a `DataFrame`, this stage is responsible for breaking that single `DataFrame` with many users into multiple `DataFrame`s for each user. This is also where the pipeline chooses whether to train individual users or the generic user (or both).\n", + "\n", + "| Name | Type | Default | Description |\n", + "| --- | --- | --- | :-- |\n", + "| `include_generic` | `bool` | | Whether or not to combine all user logs into a single `DataFrame` with the username 'generic_user' |\n", + "| `include_individual` | `bool` | | Whether or not to output individual `DataFrame` objects for each user |\n", + "| `skip_users` | List of `str` | `[]` | Any users to remove from the `DataFrame`. Useful for debugging to remove automated accounts with many logs. Mutually exclusive with `only_users`. |\n", + "| `only_users` | List of `str` | `[]` | Only allow these users in the final `DataFrame`. Useful for debugging to focus on specific users. Mutually exclusive with `skip_users`. |\n", + "\n", + "### Rolling Window Stage (`DFPRollingWindowStage`)\n", + "\n", + "The Rolling Window Stage performs several key pieces of functionality for DFP.\n", + "1. This stage keeps a moving window of logs on a per user basis\n", + " 1. These logs are saved to disk to reduce memory requirements between logs from the same user\n", + "1. It only emits logs when the window history requirements are met\n", + " 1. Until all of the window history requirements are met, no messages will be sent to the rest of the pipeline.\n", + " 1. Configuration options for defining the window history requirements are detailed below.\n", + "1. It repeats the necessary logs to properly calculate log dependent features.\n", + " 1. 
To support all column feature types, incoming log messages can be combined with existing history and sent to downstream stages.\n", + " 1. For example, to calculate a feature that increments a counter for the number of logs a particular user has generated in a single day, we must have the user's log history for the past 24 hours. To support this, this stage will combine new logs with existing history into a single `DataFrame`.\n", + " 1. It is the responsibility of downstream stages to distinguish between new logs and existing history.\n", + "\n", + "| Name | Type | Default | Description |\n", + "| --- | --- | --- | :-- |\n", + "| `min_history` | `int` | `1` | The minimum number of logs a user must have before emitting any messages. Logs below this threshold will be saved to disk. |\n", + "| `min_increment` | `int` or `str` | `0` | Once the min history requirement is met, this stage must receive `min_increment` *new* logs before emitting another message. Logs received before this threshold is met will be saved to disk. Can be specified as an integer count or a string duration. |\n", + "| `max_history` | `int` or `str` | `\"1d\"` | Once `min_history` and `min_increment` requirements have been met, this puts an upper bound on the maximum number of messages to forward into the pipeline and also the maximum amount of messages to retain in the history. Can be specified as an integer count or a string duration. |\n", + "| `cache_dir` | `str` | `./.cache/dfp` | The location to write log history to disk. |\n", + "\n", + "### Preprocessing Stage (`DFPPreprocessingStage`)\n", + "\n", + "This stage performs the final, row dependent, feature calculations as specified by the input schema object. 
Once calculated, this stage can forward on all received logs, or optionally can only forward on new logs, removing any history information.\n", + "\n", + "| Name | Type | Default | Description |\n", + "| --- | --- | --- | :-- |\n", + "| `input_schema` | `DataFrameInputSchema` | | The final, row dependent, schema to apply to the incoming columns |\n", + "\n", + "### Inference Stage (`DFPInference`)\n", + "\n", + "This stage performs several tasks to aid in performing inference. This stage will:\n", + "1. Download models as needed from MLFlow\n", + "1. Cache previously downloaded models to improve performance\n", + " 1. Models in the cache will be periodically refreshed from MLFlow at a configured rate\n", + "1. Perform inference using the downloaded model\n", + "\n", + "| Name | Type | Default | Description |\n", + "| --- | --- | --- | :-- |\n", + "| `model_name_formatter` | `str` | `\"\"` | A format string to use when building the model name. The model name is constructed by calling `model_name_formatter.format(user_id=user_id)`. For example, with `model_name_formatter=\"my_model-{user_id}\"` and a user ID of `\"first:last\"` would result in the model name of `\"my_model-first:last\"`. This should match the value used in `DFPMLFlowModelWriterStage` |\n", + "\n", + "### Filter Detection Stage (`FilterDetectionsStage`)\n", + "This stage filters the output from the inference stage for any anomalous messages. Logs which exceed the specified Z-Score will be passed onto the next stage. All remaining logs which are below the threshold will be dropped. For the purposes of the DFP pipeline, this stage is configured to use the `mean_abs_z` column of the DataFrame as the filter criteria.\n", + "\n", + "| Name | Type | Default | Description |\n", + "| --- | --- | --- | :-- |\n", + "| `threshold` | `float` | `0.5` | The threshold value above which logs are considered to be anomalous. The default is `0.5`, however the DFP pipeline uses a value of `2.0`. 
All normal logs will be filtered out and anomalous logs will be passed on. |\n", + "| `copy` | `bool` | `True` | When the `copy` argument is `True` (default), rows that meet the filter criteria are copied into a new dataframe. When `False` sliced views are used instead. This is a performance optimization, and has no functional impact. |\n", + "| `filter_source` | `FilterSource` | `FilterSource.Auto` | Indicates if the filter criteria exists in an output tensor (`FilterSource.TENSOR`) or a column in a DataFrame (`FilterSource.DATAFRAME`). |\n", + "| `field_name` | `str` | `probs` | Name of the tensor (`filter_source=FilterSource.TENSOR`) or DataFrame column (`filter_source=FilterSource.DATAFRAME`) to use as the filter criteria. |\n", + "\n", + "### Post Processing Stage (`DFPPostprocessingStage`)\n", + "This stage adds a new `event_time` column to the DataFrame indicating the time at which Morpheus detected the anomalous messages, and replaces any `NAN` values with a string value of `'NaN'`.\n", + "\n", + "### Serialize Stage (`SerializeStage`)\n", + "This stage controls which columns in the DataFrame will be included in the output. For the purposes of the DFP pipeline, we will exclude columns that are used internally by the pipeline which are not of interest to the end-user.\n", + "\n", + "| Name | Type | Default | Description |\n", + "| --- | --- | --- | :-- |\n", + "| `include` | List of `str` | `[]` | List of regular expression patterns matching columns to include in the output. Specifying an empty list causes all columns to be included not explicitly excluded. |\n", + "| `exclude` | List of `str` | `[r'^ID$', r'^_ts_']` | List of regular expression patterns matching columns to exclude from the output. |\n", + "| `fixed_columns` | `bool` | `True` | When `True` it is assumed that the Dataframe in all messages contain the same columns as the first message received. 
|\n", + "\n", + "### Write to File Stage (`WriteToFileStage`)\n", + "This final stage will write all received messages to a single output file in either CSV or JSON format.\n", + "\n", + "| Name | Type | Default | Description |\n", + "| --- | --- | --- | :-- |\n", + "| `filename` | `str` | | The file to write anomalous log messages to. |\n", + "| `overwrite` | `bool` | `False` | If the file specified in `filename` already exists, it will be overwritten if this option is set to `True` |" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "825390ad-ce64-4949-b324-33039ffdf264", + "metadata": {}, + "outputs": [], + "source": [ + "# Create a linear pipeline object\n", + "pipeline = LinearPipeline(config)\n", + "\n", + "# Source stage\n", + "pipeline.set_source(MultiFileSource(config, filenames=input_files))\n", + "\n", + "# Batch files into batches by time. Use the default ISO date extractor from the filename\n", + "pipeline.add_stage(\n", + " DFPFileBatcherStage(config,\n", + " period=\"D\",\n", + " date_conversion_func=functools.partial(date_extractor, filename_regex=iso_date_regex)))\n", + "\n", + "# Output is a list of fsspec files. Convert to DataFrames. 
This caches downloaded data\n", + "pipeline.add_stage(\n", + " DFPFileToDataFrameStage(config,\n", + " schema=source_schema,\n", + " file_type=FileTypes.JSON,\n", + " parser_kwargs={\n", + " \"lines\": False, \"orient\": \"records\"\n", + " },\n", + " cache_dir=cache_dir))\n", + "\n", + "\n", + "# This will split users or just use one single user\n", + "pipeline.add_stage(\n", + " DFPSplitUsersStage(config,\n", + " include_generic=include_generic,\n", + " include_individual=include_individual,\n", + " skip_users=skip_users))\n", + "\n", + "# Next, have a stage that will create rolling windows\n", + "pipeline.add_stage(\n", + " DFPRollingWindowStage(\n", + " config,\n", + " min_history=300 if is_training else 1,\n", + " min_increment=300 if is_training else 0,\n", + " # For inference, we only ever want 1 day max\n", + " max_history=\"60d\" if is_training else \"1d\",\n", + " cache_dir=cache_dir))\n", + "\n", + "# Output is UserMessageMeta -- Cached frame set\n", + "pipeline.add_stage(DFPPreprocessingStage(config, input_schema=preprocess_schema))\n", + "\n", + "# Perform inference on the preprocessed data\n", + "pipeline.add_stage(DFPInferenceStage(config, model_name_formatter=model_name_formatter))\n", + "\n", + "# Filter for only the anomalous logs\n", + "pipeline.add_stage(\n", + " FilterDetectionsStage(config, threshold=2.0, filter_source=FilterSource.DATAFRAME, field_name='mean_abs_z'))\n", + "pipeline.add_stage(DFPPostprocessingStage(config))\n", + "\n", + "# Exclude the columns we don't want in our output\n", + "pipeline.add_stage(SerializeStage(config, exclude=['batch_count', 'origin_hash', '_row_hash', '_batch_id']))\n", + "\n", + "# Write all anomalies to a CSV file\n", + "pipeline.add_stage(WriteToFileStage(config, filename=\"dfp_detections_azure.csv\", overwrite=True))\n", + "\n", + "# Run the pipeline\n", + "await pipeline.run_async()\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "39971052-6211-4bfe-82d8-88f5845562bc", + 
"metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "morpheus", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.14" + } + }, + "nbformat": 4, + "nbformat_minor": 5 } diff --git a/examples/digital_fingerprinting/production/morpheus/notebooks/dfp_azure_training.ipynb b/examples/digital_fingerprinting/production/morpheus/notebooks/dfp_azure_training.ipynb index fc49b736b6..acb759341b 100644 --- a/examples/digital_fingerprinting/production/morpheus/notebooks/dfp_azure_training.ipynb +++ b/examples/digital_fingerprinting/production/morpheus/notebooks/dfp_azure_training.ipynb @@ -1,464 +1,464 @@ { - "cells": [ - { - "cell_type": "markdown", - "id": "2941e94f-db20-44a5-ab87-2cab499825f7", - "metadata": {}, - "source": [ - "# Digital Finger Printing (DFP) with Morpheus - Azure Training\n", - "## Introduction\n", - "\n", - "In this notebook, we will be building and running a DFP pipeline that performs training on Azure logs. The goal is to train an autoencoder PyTorch model to recogize the patterns of users in the sample data. The model will then be used by a second Morpheus pipeline to generate anomaly scores for each individual log. These anomaly scores can be used by security teams to detect abnormal behavior when it happens so the proper action can be taken.\n", - "\n", - "
\n", - "Note: For more information on DFP, the Morpheus pipeline, and setup steps to run this notebook, please refer to the coresponding DFP training materials.\n", - "
" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b6c1cb50-74f2-445d-b865-8c22c3b3798b", - "metadata": {}, - "outputs": [], - "source": [ - "%load_ext autoreload\n", - "%autoreload 2\n", - "\n", - "# Ensure that the morpheus directory is in the python path. This may not need to be run depending on the environment setup\n", - "import sys\n", - "import os\n", - "sys.path.insert(0, os.path.abspath(\"../../morpheus\"))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "102ce011-3ca3-4f96-a72d-de28fad32003", - "metadata": {}, - "outputs": [], - "source": [ - "import functools\n", - "import logging\n", - "import os\n", - "import mlflow\n", - "import typing\n", - "from datetime import datetime\n", - "\n", - "from dfp.stages.dfp_file_batcher_stage import DFPFileBatcherStage\n", - "from dfp.stages.dfp_file_to_df import DFPFileToDataFrameStage\n", - "from dfp.stages.dfp_mlflow_model_writer import DFPMLFlowModelWriterStage\n", - "from dfp.stages.dfp_preprocessing_stage import DFPPreprocessingStage\n", - "from dfp.stages.dfp_rolling_window_stage import DFPRollingWindowStage\n", - "from dfp.stages.dfp_split_users_stage import DFPSplitUsersStage\n", - "from dfp.stages.dfp_training import DFPTraining\n", - "from dfp.stages.multi_file_source import MultiFileSource\n", - "from dfp.utils.regex_utils import iso_date_regex\n", - "\n", - "from morpheus.common import FileTypes\n", - "from morpheus.cli.utils import get_log_levels\n", - "from morpheus.cli.utils import get_package_relative_file\n", - "from morpheus.cli.utils import load_labels_file\n", - "from morpheus.cli.utils import parse_log_level\n", - "from morpheus.config import Config\n", - "from morpheus.config import ConfigAutoEncoder\n", - "from morpheus.config import CppConfig\n", - "from morpheus.pipeline import LinearPipeline\n", - "from morpheus.utils.column_info import ColumnInfo\n", - "from morpheus.utils.column_info import DataFrameInputSchema\n", - "from 
morpheus.utils.column_info import DateTimeColumn\n", - "from morpheus.utils.column_info import DistinctIncrementColumn\n", - "from morpheus.utils.column_info import IncrementColumn\n", - "from morpheus.utils.column_info import RenameColumn\n", - "from morpheus.utils.column_info import StringCatColumn\n", - "from morpheus.utils.file_utils import date_extractor\n", - "from morpheus.utils.logger import configure_logging\n", - "\n", - "# Left align all tables\n", - "from IPython.core.display import HTML\n", - "table_css = 'table {align:left;display:block}'\n", - "HTML(''.format(table_css))" - ] - }, - { - "cell_type": "markdown", - "id": "ca38c1b7-ce84-43e0-ac53-280562dc1642", - "metadata": {}, - "source": [ - "## High Level Configuration\n", - "\n", - "The following options significantly alter the functionality of the pipeline. These options are separated from the individual stage options since they may effect more than one stage. Additionally, the matching python script to this notebook, `dfp_azure_pipeline.py`, configures these options via command line arguments.\n", - "\n", - "### Options\n", - "\n", - "| Name | Type | Description |\n", - "| --- | --- | :-- |\n", - "| `train_users` | One of `[\"all\", \"generic\", \"individual\"]` | This indicates which users to train for this pipeline:|\n", - "| `skip_users` | List of strings | Any user in this list will be dropped from the pipeline. Useful for debugging to remove automated accounts with many logs. |\n", - "| `cache_dir` | string | The location to store cached files. To aid with development and reduce bandwidth, the Morpheus pipeline will cache data from several stages of the pipeline. This option configures the location for those caches. |\n", - "| `input_files` | List of strings | List of files to process. Can specify multiple arguments for multiple files. Also accepts glob (\\*) wildcards and schema prefixes such as `s3://`. For example, to make a local cache of an s3 bucket, use `filecache::s3://mybucket/*`. 
Refer to `fsspec` documentation for list of possible options. |\n", - "| `model_name_formatter` | string | A format string to use when building the model name. The model name is constructed by calling `model_name_formatter.format(user_id=user_id)`. For example, with `model_name_formatter=\"my_model-{user_id}\"` and a user ID of `\"first:last\"` would result in the model name of `\"my_model-first:last\"`. This should match the value used in `DFPMLFlowModelWriterStage`. Available keyword arguments: `user_id`, `user_md5`. |\n", - "| `experiment_name_formatter` | string | A format string (without the `f`) that will be used when creating an experiment in ML Flow. Available keyword arguments: `user_id`, `user_md5`, `reg_model_name`. |\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "9ee00703-75c5-46fc-890c-86733da906c4", - "metadata": {}, - "outputs": [], - "source": [ - "# Global options\n", - "train_users = \"all\"\n", - "\n", - "# Enter any users to skip here\n", - "skip_users: typing.List[str] = []\n", - "\n", - "# Location where cache objects will be saved\n", - "cache_dir = \"/workspace/.cache/dfp\"\n", - "\n", - "# Input files to read from\n", - "input_files = [\n", - " \"../../../../data/dfp/azure-training-data/AZUREAD_2022-08-0*.json\"\n", - "]\n", - "\n", - "# The format to use for models\n", - "model_name_formatter = \"DFP-azure-{user_id}\"\n", - "\n", - "# The format to use for experiment names\n", - "experiment_name_formatter = \"dfp/azure/training/{reg_model_name}\"\n", - "\n", - "# === Derived Options ===\n", - "# To include the generic, we must be training all or generic\n", - "include_generic = train_users == \"all\" or train_users == \"generic\"\n", - "\n", - "# To include individual, we must be either training or inferring\n", - "include_individual = train_users != \"generic\"\n", - "\n", - "# None indicates we arent training anything\n", - "is_training = train_users != \"none\"\n", - "\n", - "# Tracking URI\n", - 
"tracking_uri = \"http://mlflow:5000\"" - ] - }, - { - "cell_type": "markdown", - "id": "61fd858e", - "metadata": {}, - "source": [ - "### Set MLFlow Tracking URI\n", - "Set MLFlow tracking URI to make inference calls." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "30c80a17", - "metadata": {}, - "outputs": [], - "source": [ - "mlflow.set_tracking_uri(tracking_uri)" - ] - }, - { - "cell_type": "markdown", - "id": "1cfc24c9-c85e-4977-a348-692c8f0aceaa", - "metadata": {}, - "source": [ - "### Global Config Object\n", - "Before creating the pipeline, we need to setup logging and set the parameters for the Morpheus config object. This config object is responsible for the following:\n", - " - Indicating whether to use C++ or Python stages\n", - " - C++ stages are not supported for the DFP pipeline. This should always be `False`\n", - " - Setting the number of threads to use in the pipeline. Defaults to the thread count of the OS.\n", - " - Sets the feature column names that will be used in model training\n", - " - This option allows extra columns to be used in the pipeline that will not be part of the training algorithm.\n", - " - The final features that the model will be trained on will be an intersection of this list with the log columns.\n", - " - The column name that indicates the user's unique identifier\n", - " - It is required for DFP to have a user ID column\n", - " - The column name that indicates the timestamp for the log\n", - " - It is required for DFP to know when each log occurred" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "01abd537-9162-49dc-8e83-d9465592f1d5", - "metadata": {}, - "outputs": [], - "source": [ - "# Enable the Morpheus logger\n", - "configure_logging(log_level=logging.DEBUG)\n", - "\n", - "config = Config()\n", - "\n", - "CppConfig.set_should_use_cpp(False)\n", - "\n", - "config.num_threads = len(os.sched_getaffinity(0))\n", - "\n", - "config.ae = ConfigAutoEncoder()\n", - "\n", - 
"config.ae.feature_columns = [\n", - " \"appDisplayName\", \"clientAppUsed\", \"deviceDetailbrowser\", \"deviceDetaildisplayName\", \"deviceDetailoperatingSystem\", \"statusfailureReason\", \"appincrement\", \"locincrement\", \"logcount\",\n", - "]\n", - "config.ae.userid_column_name = \"username\"\n", - "config.ae.timestamp_column_name = \"timestamp\"" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a73a4d53-32b6-4ab8-a5d7-c0104b31c69b", - "metadata": {}, - "outputs": [], - "source": [ - "# Specify the column names to ensure all data is uniform\n", - "source_column_info = [\n", - " DateTimeColumn(name=config.ae.timestamp_column_name, dtype=datetime, input_name=\"time\"),\n", - " RenameColumn(name=config.ae.userid_column_name, dtype=str, input_name=\"properties.userPrincipalName\"),\n", - " RenameColumn(name=\"appDisplayName\", dtype=str, input_name=\"properties.appDisplayName\"),\n", - " ColumnInfo(name=\"category\", dtype=str),\n", - " RenameColumn(name=\"clientAppUsed\", dtype=str, input_name=\"properties.clientAppUsed\"),\n", - " RenameColumn(name=\"deviceDetailbrowser\", dtype=str, input_name=\"properties.deviceDetail.browser\"),\n", - " RenameColumn(name=\"deviceDetaildisplayName\", dtype=str, input_name=\"properties.deviceDetail.displayName\"),\n", - " RenameColumn(name=\"deviceDetailoperatingSystem\",\n", - " dtype=str,\n", - " input_name=\"properties.deviceDetail.operatingSystem\"),\n", - " StringCatColumn(name=\"location\",\n", - " dtype=str,\n", - " input_columns=[\n", - " \"properties.location.city\",\n", - " \"properties.location.countryOrRegion\",\n", - " ],\n", - " sep=\", \"),\n", - " RenameColumn(name=\"statusfailureReason\", dtype=str, input_name=\"properties.status.failureReason\"),\n", - "]\n", - "\n", - "source_schema = DataFrameInputSchema(json_columns=[\"properties\"], column_info=source_column_info)\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "f7a0cb0a-e65a-444a-a06c-a4525d543790", - 
"metadata": {}, - "outputs": [], - "source": [ - "# Preprocessing schema\n", - "preprocess_column_info = [\n", - " ColumnInfo(name=config.ae.timestamp_column_name, dtype=datetime),\n", - " ColumnInfo(name=config.ae.userid_column_name, dtype=str),\n", - " ColumnInfo(name=\"appDisplayName\", dtype=str),\n", - " ColumnInfo(name=\"clientAppUsed\", dtype=str),\n", - " ColumnInfo(name=\"deviceDetailbrowser\", dtype=str),\n", - " ColumnInfo(name=\"deviceDetaildisplayName\", dtype=str),\n", - " ColumnInfo(name=\"deviceDetailoperatingSystem\", dtype=str),\n", - " ColumnInfo(name=\"statusfailureReason\", dtype=str),\n", - "\n", - " # Derived columns\n", - " IncrementColumn(name=\"logcount\",\n", - " dtype=int,\n", - " input_name=config.ae.timestamp_column_name,\n", - " groupby_column=config.ae.userid_column_name),\n", - " DistinctIncrementColumn(name=\"locincrement\",\n", - " dtype=int,\n", - " input_name=\"location\",\n", - " groupby_column=config.ae.userid_column_name,\n", - " timestamp_column=config.ae.timestamp_column_name),\n", - " DistinctIncrementColumn(name=\"appincrement\",\n", - " dtype=int,\n", - " input_name=\"appDisplayName\",\n", - " groupby_column=config.ae.userid_column_name,\n", - " timestamp_column=config.ae.timestamp_column_name)\n", - "]\n", - "\n", - "preprocess_schema = DataFrameInputSchema(column_info=preprocess_column_info, preserve_columns=[\"_batch_id\"])\n" - ] - }, - { - "cell_type": "markdown", - "id": "bdfc59de-dea8-4e5f-98e3-98eba1e4621d", - "metadata": {}, - "source": [ - "## Pipeline Construction\n", - "From this point on we begin constructing the stages that will make up the pipeline. To make testing easier, constructing the pipeline object, adding the stages, and running the pipeline, is provided as a single cell. The below cell can be rerun multiple times as needed for debugging.\n", - "\n", - "### Source Stage (`MultiFileSource`)\n", - "\n", - "This pipeline read input logs from one or more input files. 
This source stage will construct a list of files to be processed and pass to downstream stages. It is capable of reading files from many different source types, both local and remote. This is possible by utilizing the `fsspec` library (similar to `pandas`). Refer to the [`fsspec`](https://filesystem-spec.readthedocs.io/) documentation for more information on the supported file types. Once all of the logs have been read, the source completes. \n", - "\n", - "| Name | Type | Default | Description |\n", - "| --- | --- | --- | :-- |\n", - "| `filenames` | List of strings | | Any files to read into the pipeline. All files will be combined into a single `DataFrame` |\n", - "\n", - "### File Batcher Stage (`DFPFileBatcherStage`)\n", - "\n", - "To improve performance, multiple small input files can be batched together into a single DataFrame for processing. This stage is responsible for determining the timestamp of input files, grouping input files into batches by time, and sending the batches to be processed into a single DataFrame. Repeated batches of files will be loaded from cache resulting in increased performance. For example, when performaing a 60 day training run, 59 days can be cached with a period of `\"D\"` and retraining once per day.\n", - "\n", - "| Name | Type | Default | Description |\n", - "| --- | --- | --- | :-- |\n", - "| `period` | `str` | `\"D\"` | The period to create batches. Refer to `pandas` windowing frequency documentation for available options. |\n", - "| `date_conversion_func` | Function of `typing.Callable[[fsspec.core.OpenFile], datetime]` | | A callback which is responsible for determining the date for a specified file. |\n", - "\n", - "### File to DataFrame Stage (`DFPFileToDataFrameStage`)\n", - "\n", - "After files have been batched into groups, this stage is responsible for reading the files and converting into a DataFrame. The specified input schema converts the raw DataFrame into one suitable for caching and processing. 
Any columns that are not needed should be excluded from the schema.\n", - "\n", - "| Name | Type | Default | Description |\n", - "| --- | --- | --- | :-- |\n", - "| `schema` | `DataFrameInputSchema` | | After the raw `DataFrame` is read from each file, this schema will be applied to ensure a consisten output from the source. |\n", - "| `file_type` | `FileTypes` | `FileTypes.Auto` | Allows overriding the file type. When set to `Auto`, the file extension will be used. Options are `CSV`, `JSON`, `PARQUET`, `Auto`. |\n", - "| `parser_kwargs` | `dict` | `{}` | This dictionary will be passed to the `DataFrame` parser class. Allows for customization of log parsing. |\n", - "| `cache_dir` | `str` | `./.cache/dfp` | The location to write cached input files to. |\n", - "\n", - "### Split Users Stage (`DFPSplitUsersStage`)\n", - "\n", - "Once the input logs have been read into a `DataFrame`, this stage is responsible for breaking that single `DataFrame` with many users into multiple `DataFrame`s for each user. This is also where the pipeline chooses whether to train individual users or the generic user (or both).\n", - "\n", - "| Name | Type | Default | Description |\n", - "| --- | --- | --- | :-- |\n", - "| `include_generic` | `bool` | | Whether or not to combine all user logs into a single `DataFrame` with the username 'generic_user' |\n", - "| `include_individual` | `bool` | | Whether or not to output individual `DataFrame` objects for each user |\n", - "| `skip_users` | List of `str` | `[]` | Any users to remove from the `DataFrame`. Useful for debugging to remove automated accounts with many logs. Mutually exclusive with `only_users`. |\n", - "| `only_users` | List of `str` | `[]` | Only allow these users in the final `DataFrame`. Useful for debugging to focus on specific users. Mutually exclusive with `skip_users`. 
|\n", - "\n", - "### Rolling Window Stage (`DFPRollingWindowStage`)\n", - "\n", - "The Rolling Window Stage performs several key pieces of functionality for DFP.\n", - "1. This stage keeps a moving window of logs on a per user basis\n", - " 1. These logs are saved to disk to reduce memory requirements between logs from the same user\n", - "1. It only emits logs when the window history requirements are met\n", - " 1. Until all of the window history requirements are met, no messages will be sent to the rest of the pipeline.\n", - " 1. Configuration options for defining the window history requirements are detailed below.\n", - "1. It repeats the necessary logs to properly calculate log dependent features.\n", - " 1. To support all column feature types, incoming log messages can be combined with existing history and sent to downstream stages.\n", - " 1. For example, to calculate a feature that increments a counter for the number of logs a particular user has generated in a single day, we must have the user's log history for the past 24 hours. To support this, this stage will combine new logs with existing history into a single `DataFrame`.\n", - " 1. It is the responsibility of downstream stages to distinguish between new logs and existing history.\n", - "\n", - "| Name | Type | Default | Description |\n", - "| --- | --- | --- | :-- |\n", - "| `min_history` | `int` | `300` | The minimum number of logs a user must have before emitting any messages. Logs below this threshold will be saved to disk. |\n", - "| `min_increment` | `int` or `str` | `300` | Once the min history requirement is met, this stage must receive `min_increment` *new* logs before emmitting another message. Logs received before this threshold is met will be saved to disk. Can be specified as an integer count or a string duration. 
|\n", - "| `max_history` | `int` or `str` | `\"60d\"` | Once `min_history` and `min_increment` requirements have been met, this puts an upper bound on the maximum number of messages to forward into the pipeline and also the maximum amount of messages to retain in the history. Can be specified as an integer count or a string duration. |\n", - "| `cache_dir` | `str` | `./.cache/dfp` | The location to write log history to disk. |\n", - "\n", - "### Preprocessing Stage (`DFPPreprocessingStage`)\n", - "\n", - "This stage performs the final, row dependent, feature calculations as specified by the input schema object. Once calculated, this stage can forward on all received logs, or optionally can only forward on new logs, removing any history information.\n", - "\n", - "| Name | Type | Default | Description |\n", - "| --- | --- | --- | :-- |\n", - "| `input_schema` | `DataFrameInputSchema` | | The final, row dependent, schema to apply to the incoming columns |\n", - "\n", - "### Training Stage (`DFPTraining`)\n", - "\n", - "This stage is responsible for performing the actual training calculations. Training will be performed on all received data. Resulting message will contain the input data paired with the trained model.\n", - "\n", - "| Name | Type | Default | Description |\n", - "| --- | --- | --- | :-- |\n", - "| `model_kwargs` | `dict` | `{}` | The options to use when creating a new model instance. Refer to `DFPAutoEncoder` for information on the available options. |\n", - "\n", - "### MLFlow Model Writer Stage (`DFPMLFlowModelWriterStage`)\n", - "\n", - "This stage is the last step in training. It will upload the trained model from the previous stage to MLFlow. The tracking URI for which MLFlow instance to use is configured using the static method `mlflow.set_tracking_uri()`.\n", - "\n", - "| Name | Type | Default | Description |\n", - "| --- | --- | --- | :-- |\n", - "| `model_name_formatter` | `str` | `\"\"` | A format string to use when building the model name. 
The model name is constructed by calling `model_name_formatter.format(user_id=user_id)`. For example, with `model_name_formatter=\"my_model-{user_id}\"` and a user ID of `\"first:last\"` would result in the model name of `\"my_model-first:last\"` |\n", - "| `experiment_name` | `str` | | All models are created inside of an experiment to allow metrics to be saved with each model. This option specifies the experiment name. The final experiment name for each model will be in the form of `{experiment_name}/{model_name}` |" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "825390ad-ce64-4949-b324-33039ffdf264", - "metadata": {}, - "outputs": [], - "source": [ - "# Create a linear pipeline object\n", - "pipeline = LinearPipeline(config)\n", - "\n", - "# Source stage\n", - "pipeline.set_source(MultiFileSource(config, filenames=input_files))\n", - "\n", - "# Batch files into batches by time. Use the default ISO date extractor from the filename\n", - "pipeline.add_stage(\n", - " DFPFileBatcherStage(config,\n", - " period=\"D\",\n", - " date_conversion_func=functools.partial(date_extractor, filename_regex=iso_date_regex)))\n", - "\n", - "# Output is a list of fsspec files. Convert to DataFrames. 
This caches downloaded data\n", - "pipeline.add_stage(\n", - " DFPFileToDataFrameStage(config,\n", - " schema=source_schema,\n", - " file_type=FileTypes.JSON,\n", - " parser_kwargs={\n", - " \"lines\": False, \"orient\": \"records\"\n", - " },\n", - " cache_dir=cache_dir))\n", - "\n", - "\n", - "# This will split users or just use one single user\n", - "pipeline.add_stage(\n", - " DFPSplitUsersStage(config,\n", - " include_generic=include_generic,\n", - " include_individual=include_individual,\n", - " skip_users=skip_users))\n", - "\n", - "# Next, have a stage that will create rolling windows\n", - "pipeline.add_stage(\n", - " DFPRollingWindowStage(\n", - " config,\n", - " min_history=300 if is_training else 1,\n", - " min_increment=300 if is_training else 0,\n", - " # For inference, we only ever want 1 day max\n", - " max_history=\"60d\" if is_training else \"1d\",\n", - " cache_dir=cache_dir))\n", - "\n", - "# Output is UserMessageMeta -- Cached frame set\n", - "pipeline.add_stage(DFPPreprocessingStage(config, input_schema=preprocess_schema))\n", - "\n", - "# Finally, perform training which will output a model\n", - "pipeline.add_stage(DFPTraining(config, validation_size=0.10))\n", - "\n", - "# Write that model to MLFlow\n", - "pipeline.add_stage(\n", - " DFPMLFlowModelWriterStage(config,\n", - " model_name_formatter=model_name_formatter,\n", - " experiment_name_formatter=experiment_name_formatter))\n", - "\n", - "# Run the pipeline\n", - "await pipeline.run_async()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "4d2f7b52-bd77-4130-bbfa-97371b31ce24", - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "morpheus", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": 
"ipython3", - "version": "3.10.14" - } - }, - "nbformat": 4, - "nbformat_minor": 5 + "cells": [ + { + "cell_type": "markdown", + "id": "2941e94f-db20-44a5-ab87-2cab499825f7", + "metadata": {}, + "source": [ + "# Digital Finger Printing (DFP) with Morpheus - Azure Training\n", + "## Introduction\n", + "\n", + "In this notebook, we will be building and running a DFP pipeline that performs training on Azure logs. The goal is to train an autoencoder PyTorch model to recognize the patterns of users in the sample data. The model will then be used by a second Morpheus pipeline to generate anomaly scores for each individual log. These anomaly scores can be used by security teams to detect abnormal behavior when it happens so the proper action can be taken.\n", + "\n", + "
\n", + "Note: For more information on DFP, the Morpheus pipeline, and setup steps to run this notebook, please refer to the corresponding DFP training materials.\n", + "
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b6c1cb50-74f2-445d-b865-8c22c3b3798b", + "metadata": {}, + "outputs": [], + "source": [ + "%load_ext autoreload\n", + "%autoreload 2\n", + "\n", + "# Ensure that the morpheus directory is in the python path. This may not need to be run depending on the environment setup\n", + "import sys\n", + "import os\n", + "sys.path.insert(0, os.path.abspath(\"../../morpheus\"))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "102ce011-3ca3-4f96-a72d-de28fad32003", + "metadata": {}, + "outputs": [], + "source": [ + "import functools\n", + "import logging\n", + "import os\n", + "import mlflow\n", + "import typing\n", + "from datetime import datetime\n", + "\n", + "from dfp.stages.dfp_file_batcher_stage import DFPFileBatcherStage\n", + "from dfp.stages.dfp_file_to_df import DFPFileToDataFrameStage\n", + "from dfp.stages.dfp_mlflow_model_writer import DFPMLFlowModelWriterStage\n", + "from dfp.stages.dfp_preprocessing_stage import DFPPreprocessingStage\n", + "from dfp.stages.dfp_rolling_window_stage import DFPRollingWindowStage\n", + "from dfp.stages.dfp_split_users_stage import DFPSplitUsersStage\n", + "from dfp.stages.dfp_training import DFPTraining\n", + "from dfp.stages.multi_file_source import MultiFileSource\n", + "from dfp.utils.regex_utils import iso_date_regex\n", + "\n", + "from morpheus.common import FileTypes\n", + "from morpheus.cli.utils import get_log_levels\n", + "from morpheus.cli.utils import get_package_relative_file\n", + "from morpheus.utils.file_utils import load_labels_file\n", + "from morpheus.cli.utils import parse_log_level\n", + "from morpheus.config import Config\n", + "from morpheus.config import ConfigAutoEncoder\n", + "from morpheus.config import CppConfig\n", + "from morpheus.pipeline import LinearPipeline\n", + "from morpheus.utils.column_info import ColumnInfo\n", + "from morpheus.utils.column_info import DataFrameInputSchema\n", + "from 
morpheus.utils.column_info import DateTimeColumn\n", + "from morpheus.utils.column_info import DistinctIncrementColumn\n", + "from morpheus.utils.column_info import IncrementColumn\n", + "from morpheus.utils.column_info import RenameColumn\n", + "from morpheus.utils.column_info import StringCatColumn\n", + "from morpheus.utils.file_utils import date_extractor\n", + "from morpheus.utils.logger import configure_logging\n", + "\n", + "# Left align all tables\n", + "from IPython.core.display import HTML\n", + "table_css = 'table {align:left;display:block}'\n", + "HTML(''.format(table_css))" + ] + }, + { + "cell_type": "markdown", + "id": "ca38c1b7-ce84-43e0-ac53-280562dc1642", + "metadata": {}, + "source": [ + "## High Level Configuration\n", + "\n", + "The following options significantly alter the functionality of the pipeline. These options are separated from the individual stage options since they may affect more than one stage. Additionally, the matching python script to this notebook, `dfp_azure_pipeline.py`, configures these options via command line arguments.\n", + "\n", + "### Options\n", + "\n", + "| Name | Type | Description |\n", + "| --- | --- | :-- |\n", + "| `train_users` | One of `[\"all\", \"generic\", \"individual\"]` | This indicates which users to train for this pipeline:|\n", + "| `skip_users` | List of strings | Any user in this list will be dropped from the pipeline. Useful for debugging to remove automated accounts with many logs. |\n", + "| `cache_dir` | string | The location to store cached files. To aid with development and reduce bandwidth, the Morpheus pipeline will cache data from several stages of the pipeline. This option configures the location for those caches. |\n", + "| `input_files` | List of strings | List of files to process. Can specify multiple arguments for multiple files. Also accepts glob (\\*) wildcards and schema prefixes such as `s3://`. For example, to make a local cache of an s3 bucket, use `filecache::s3://mybucket/*`. 
Refer to `fsspec` documentation for list of possible options. |\n", + "| `model_name_formatter` | string | A format string to use when building the model name. The model name is constructed by calling `model_name_formatter.format(user_id=user_id)`. For example, with `model_name_formatter=\"my_model-{user_id}\"` and a user ID of `\"first:last\"` would result in the model name of `\"my_model-first:last\"`. This should match the value used in `DFPMLFlowModelWriterStage`. Available keyword arguments: `user_id`, `user_md5`. |\n", + "| `experiment_name_formatter` | string | A format string (without the `f`) that will be used when creating an experiment in ML Flow. Available keyword arguments: `user_id`, `user_md5`, `reg_model_name`. |\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9ee00703-75c5-46fc-890c-86733da906c4", + "metadata": {}, + "outputs": [], + "source": [ + "# Global options\n", + "train_users = \"all\"\n", + "\n", + "# Enter any users to skip here\n", + "skip_users: typing.List[str] = []\n", + "\n", + "# Location where cache objects will be saved\n", + "cache_dir = \"/workspace/.cache/dfp\"\n", + "\n", + "# Input files to read from\n", + "input_files = [\n", + " \"../../../../data/dfp/azure-training-data/AZUREAD_2022-08-0*.json\"\n", + "]\n", + "\n", + "# The format to use for models\n", + "model_name_formatter = \"DFP-azure-{user_id}\"\n", + "\n", + "# The format to use for experiment names\n", + "experiment_name_formatter = \"dfp/azure/training/{reg_model_name}\"\n", + "\n", + "# === Derived Options ===\n", + "# To include the generic, we must be training all or generic\n", + "include_generic = train_users == \"all\" or train_users == \"generic\"\n", + "\n", + "# To include individual, we must be either training or inferring\n", + "include_individual = train_users != \"generic\"\n", + "\n", + "# None indicates we arent training anything\n", + "is_training = train_users != \"none\"\n", + "\n", + "# Tracking URI\n", + 
"tracking_uri = \"http://mlflow:5000\"" + ] + }, + { + "cell_type": "markdown", + "id": "61fd858e", + "metadata": {}, + "source": [ + "### Set MLFlow Tracking URI\n", + "Set MLFlow tracking URI to make inference calls." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "30c80a17", + "metadata": {}, + "outputs": [], + "source": [ + "mlflow.set_tracking_uri(tracking_uri)" + ] + }, + { + "cell_type": "markdown", + "id": "1cfc24c9-c85e-4977-a348-692c8f0aceaa", + "metadata": {}, + "source": [ + "### Global Config Object\n", + "Before creating the pipeline, we need to setup logging and set the parameters for the Morpheus config object. This config object is responsible for the following:\n", + " - Indicating whether to use C++ or Python stages\n", + " - C++ stages are not supported for the DFP pipeline. This should always be `False`\n", + " - Setting the number of threads to use in the pipeline. Defaults to the thread count of the OS.\n", + " - Sets the feature column names that will be used in model training\n", + " - This option allows extra columns to be used in the pipeline that will not be part of the training algorithm.\n", + " - The final features that the model will be trained on will be an intersection of this list with the log columns.\n", + " - The column name that indicates the user's unique identifier\n", + " - It is required for DFP to have a user ID column\n", + " - The column name that indicates the timestamp for the log\n", + " - It is required for DFP to know when each log occurred" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "01abd537-9162-49dc-8e83-d9465592f1d5", + "metadata": {}, + "outputs": [], + "source": [ + "# Enable the Morpheus logger\n", + "configure_logging(log_level=logging.DEBUG)\n", + "\n", + "config = Config()\n", + "\n", + "CppConfig.set_should_use_cpp(False)\n", + "\n", + "config.num_threads = len(os.sched_getaffinity(0))\n", + "\n", + "config.ae = ConfigAutoEncoder()\n", + "\n", + 
"config.ae.feature_columns = [\n", + " \"appDisplayName\", \"clientAppUsed\", \"deviceDetailbrowser\", \"deviceDetaildisplayName\", \"deviceDetailoperatingSystem\", \"statusfailureReason\", \"appincrement\", \"locincrement\", \"logcount\",\n", + "]\n", + "config.ae.userid_column_name = \"username\"\n", + "config.ae.timestamp_column_name = \"timestamp\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a73a4d53-32b6-4ab8-a5d7-c0104b31c69b", + "metadata": {}, + "outputs": [], + "source": [ + "# Specify the column names to ensure all data is uniform\n", + "source_column_info = [\n", + " DateTimeColumn(name=config.ae.timestamp_column_name, dtype=datetime, input_name=\"time\"),\n", + " RenameColumn(name=config.ae.userid_column_name, dtype=str, input_name=\"properties.userPrincipalName\"),\n", + " RenameColumn(name=\"appDisplayName\", dtype=str, input_name=\"properties.appDisplayName\"),\n", + " ColumnInfo(name=\"category\", dtype=str),\n", + " RenameColumn(name=\"clientAppUsed\", dtype=str, input_name=\"properties.clientAppUsed\"),\n", + " RenameColumn(name=\"deviceDetailbrowser\", dtype=str, input_name=\"properties.deviceDetail.browser\"),\n", + " RenameColumn(name=\"deviceDetaildisplayName\", dtype=str, input_name=\"properties.deviceDetail.displayName\"),\n", + " RenameColumn(name=\"deviceDetailoperatingSystem\",\n", + " dtype=str,\n", + " input_name=\"properties.deviceDetail.operatingSystem\"),\n", + " StringCatColumn(name=\"location\",\n", + " dtype=str,\n", + " input_columns=[\n", + " \"properties.location.city\",\n", + " \"properties.location.countryOrRegion\",\n", + " ],\n", + " sep=\", \"),\n", + " RenameColumn(name=\"statusfailureReason\", dtype=str, input_name=\"properties.status.failureReason\"),\n", + "]\n", + "\n", + "source_schema = DataFrameInputSchema(json_columns=[\"properties\"], column_info=source_column_info)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f7a0cb0a-e65a-444a-a06c-a4525d543790", + 
"metadata": {}, + "outputs": [], + "source": [ + "# Preprocessing schema\n", + "preprocess_column_info = [\n", + " ColumnInfo(name=config.ae.timestamp_column_name, dtype=datetime),\n", + " ColumnInfo(name=config.ae.userid_column_name, dtype=str),\n", + " ColumnInfo(name=\"appDisplayName\", dtype=str),\n", + " ColumnInfo(name=\"clientAppUsed\", dtype=str),\n", + " ColumnInfo(name=\"deviceDetailbrowser\", dtype=str),\n", + " ColumnInfo(name=\"deviceDetaildisplayName\", dtype=str),\n", + " ColumnInfo(name=\"deviceDetailoperatingSystem\", dtype=str),\n", + " ColumnInfo(name=\"statusfailureReason\", dtype=str),\n", + "\n", + " # Derived columns\n", + " IncrementColumn(name=\"logcount\",\n", + " dtype=int,\n", + " input_name=config.ae.timestamp_column_name,\n", + " groupby_column=config.ae.userid_column_name),\n", + " DistinctIncrementColumn(name=\"locincrement\",\n", + " dtype=int,\n", + " input_name=\"location\",\n", + " groupby_column=config.ae.userid_column_name,\n", + " timestamp_column=config.ae.timestamp_column_name),\n", + " DistinctIncrementColumn(name=\"appincrement\",\n", + " dtype=int,\n", + " input_name=\"appDisplayName\",\n", + " groupby_column=config.ae.userid_column_name,\n", + " timestamp_column=config.ae.timestamp_column_name)\n", + "]\n", + "\n", + "preprocess_schema = DataFrameInputSchema(column_info=preprocess_column_info, preserve_columns=[\"_batch_id\"])\n" + ] + }, + { + "cell_type": "markdown", + "id": "bdfc59de-dea8-4e5f-98e3-98eba1e4621d", + "metadata": {}, + "source": [ + "## Pipeline Construction\n", + "From this point on we begin constructing the stages that will make up the pipeline. To make testing easier, constructing the pipeline object, adding the stages, and running the pipeline, is provided as a single cell. The below cell can be rerun multiple times as needed for debugging.\n", + "\n", + "### Source Stage (`MultiFileSource`)\n", + "\n", + "This pipeline read input logs from one or more input files. 
This source stage will construct a list of files to be processed and pass to downstream stages. It is capable of reading files from many different source types, both local and remote. This is possible by utilizing the `fsspec` library (similar to `pandas`). Refer to the [`fsspec`](https://filesystem-spec.readthedocs.io/) documentation for more information on the supported file types. Once all of the logs have been read, the source completes. \n", + "\n", + "| Name | Type | Default | Description |\n", + "| --- | --- | --- | :-- |\n", + "| `filenames` | List of strings | | Any files to read into the pipeline. All files will be combined into a single `DataFrame` |\n", + "\n", + "### File Batcher Stage (`DFPFileBatcherStage`)\n", + "\n", + "To improve performance, multiple small input files can be batched together into a single DataFrame for processing. This stage is responsible for determining the timestamp of input files, grouping input files into batches by time, and sending the batches to be processed into a single DataFrame. Repeated batches of files will be loaded from cache resulting in increased performance. For example, when performing a 60 day training run, 59 days can be cached with a period of `\"D\"` and retraining once per day.\n", + "\n", + "| Name | Type | Default | Description |\n", + "| --- | --- | --- | :-- |\n", + "| `period` | `str` | `\"D\"` | The period to create batches. Refer to `pandas` windowing frequency documentation for available options. |\n", + "| `date_conversion_func` | Function of `typing.Callable[[fsspec.core.OpenFile], datetime]` | | A callback which is responsible for determining the date for a specified file. |\n", + "\n", + "### File to DataFrame Stage (`DFPFileToDataFrameStage`)\n", + "\n", + "After files have been batched into groups, this stage is responsible for reading the files and converting into a DataFrame. The specified input schema converts the raw DataFrame into one suitable for caching and processing. 
Any columns that are not needed should be excluded from the schema.\n", + "\n", + "| Name | Type | Default | Description |\n", + "| --- | --- | --- | :-- |\n", + "| `schema` | `DataFrameInputSchema` | | After the raw `DataFrame` is read from each file, this schema will be applied to ensure a consistent output from the source. |\n", + "| `file_type` | `FileTypes` | `FileTypes.Auto` | Allows overriding the file type. When set to `Auto`, the file extension will be used. Options are `CSV`, `JSON`, `PARQUET`, `Auto`. |\n", + "| `parser_kwargs` | `dict` | `{}` | This dictionary will be passed to the `DataFrame` parser class. Allows for customization of log parsing. |\n", + "| `cache_dir` | `str` | `./.cache/dfp` | The location to write cached input files to. |\n", + "\n", + "### Split Users Stage (`DFPSplitUsersStage`)\n", + "\n", + "Once the input logs have been read into a `DataFrame`, this stage is responsible for breaking that single `DataFrame` with many users into multiple `DataFrame`s for each user. This is also where the pipeline chooses whether to train individual users or the generic user (or both).\n", + "\n", + "| Name | Type | Default | Description |\n", + "| --- | --- | --- | :-- |\n", + "| `include_generic` | `bool` | | Whether or not to combine all user logs into a single `DataFrame` with the username 'generic_user' |\n", + "| `include_individual` | `bool` | | Whether or not to output individual `DataFrame` objects for each user |\n", + "| `skip_users` | List of `str` | `[]` | Any users to remove from the `DataFrame`. Useful for debugging to remove automated accounts with many logs. Mutually exclusive with `only_users`. |\n", + "| `only_users` | List of `str` | `[]` | Only allow these users in the final `DataFrame`. Useful for debugging to focus on specific users. Mutually exclusive with `skip_users`. 
|\n", + "\n", + "### Rolling Window Stage (`DFPRollingWindowStage`)\n", + "\n", + "The Rolling Window Stage performs several key pieces of functionality for DFP.\n", + "1. This stage keeps a moving window of logs on a per user basis\n", + " 1. These logs are saved to disk to reduce memory requirements between logs from the same user\n", + "1. It only emits logs when the window history requirements are met\n", + " 1. Until all of the window history requirements are met, no messages will be sent to the rest of the pipeline.\n", + " 1. Configuration options for defining the window history requirements are detailed below.\n", + "1. It repeats the necessary logs to properly calculate log dependent features.\n", + " 1. To support all column feature types, incoming log messages can be combined with existing history and sent to downstream stages.\n", + " 1. For example, to calculate a feature that increments a counter for the number of logs a particular user has generated in a single day, we must have the user's log history for the past 24 hours. To support this, this stage will combine new logs with existing history into a single `DataFrame`.\n", + " 1. It is the responsibility of downstream stages to distinguish between new logs and existing history.\n", + "\n", + "| Name | Type | Default | Description |\n", + "| --- | --- | --- | :-- |\n", + "| `min_history` | `int` | `300` | The minimum number of logs a user must have before emitting any messages. Logs below this threshold will be saved to disk. |\n", + "| `min_increment` | `int` or `str` | `300` | Once the min history requirement is met, this stage must receive `min_increment` *new* logs before emitting another message. Logs received before this threshold is met will be saved to disk. Can be specified as an integer count or a string duration. 
|\n", + "| `max_history` | `int` or `str` | `\"60d\"` | Once `min_history` and `min_increment` requirements have been met, this puts an upper bound on the maximum number of messages to forward into the pipeline and also the maximum amount of messages to retain in the history. Can be specified as an integer count or a string duration. |\n", + "| `cache_dir` | `str` | `./.cache/dfp` | The location to write log history to disk. |\n", + "\n", + "### Preprocessing Stage (`DFPPreprocessingStage`)\n", + "\n", + "This stage performs the final, row dependent, feature calculations as specified by the input schema object. Once calculated, this stage can forward on all received logs, or optionally can only forward on new logs, removing any history information.\n", + "\n", + "| Name | Type | Default | Description |\n", + "| --- | --- | --- | :-- |\n", + "| `input_schema` | `DataFrameInputSchema` | | The final, row dependent, schema to apply to the incoming columns |\n", + "\n", + "### Training Stage (`DFPTraining`)\n", + "\n", + "This stage is responsible for performing the actual training calculations. Training will be performed on all received data. Resulting message will contain the input data paired with the trained model.\n", + "\n", + "| Name | Type | Default | Description |\n", + "| --- | --- | --- | :-- |\n", + "| `model_kwargs` | `dict` | `{}` | The options to use when creating a new model instance. Refer to `DFPAutoEncoder` for information on the available options. |\n", + "\n", + "### MLFlow Model Writer Stage (`DFPMLFlowModelWriterStage`)\n", + "\n", + "This stage is the last step in training. It will upload the trained model from the previous stage to MLFlow. The tracking URI for which MLFlow instance to use is configured using the static method `mlflow.set_tracking_uri()`.\n", + "\n", + "| Name | Type | Default | Description |\n", + "| --- | --- | --- | :-- |\n", + "| `model_name_formatter` | `str` | `\"\"` | A format string to use when building the model name. 
The model name is constructed by calling `model_name_formatter.format(user_id=user_id)`. For example, with `model_name_formatter=\"my_model-{user_id}\"` and a user ID of `\"first:last\"` would result in the model name of `\"my_model-first:last\"` |\n", + "| `experiment_name` | `str` | | All models are created inside of an experiment to allow metrics to be saved with each model. This option specifies the experiment name. The final experiment name for each model will be in the form of `{experiment_name}/{model_name}` |" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "825390ad-ce64-4949-b324-33039ffdf264", + "metadata": {}, + "outputs": [], + "source": [ + "# Create a linear pipeline object\n", + "pipeline = LinearPipeline(config)\n", + "\n", + "# Source stage\n", + "pipeline.set_source(MultiFileSource(config, filenames=input_files))\n", + "\n", + "# Batch files into batches by time. Use the default ISO date extractor from the filename\n", + "pipeline.add_stage(\n", + " DFPFileBatcherStage(config,\n", + " period=\"D\",\n", + " date_conversion_func=functools.partial(date_extractor, filename_regex=iso_date_regex)))\n", + "\n", + "# Output is a list of fsspec files. Convert to DataFrames. 
This caches downloaded data\n", + "pipeline.add_stage(\n", + " DFPFileToDataFrameStage(config,\n", + " schema=source_schema,\n", + " file_type=FileTypes.JSON,\n", + " parser_kwargs={\n", + " \"lines\": False, \"orient\": \"records\"\n", + " },\n", + " cache_dir=cache_dir))\n", + "\n", + "\n", + "# This will split users or just use one single user\n", + "pipeline.add_stage(\n", + " DFPSplitUsersStage(config,\n", + " include_generic=include_generic,\n", + " include_individual=include_individual,\n", + " skip_users=skip_users))\n", + "\n", + "# Next, have a stage that will create rolling windows\n", + "pipeline.add_stage(\n", + " DFPRollingWindowStage(\n", + " config,\n", + " min_history=300 if is_training else 1,\n", + " min_increment=300 if is_training else 0,\n", + " # For inference, we only ever want 1 day max\n", + " max_history=\"60d\" if is_training else \"1d\",\n", + " cache_dir=cache_dir))\n", + "\n", + "# Output is UserMessageMeta -- Cached frame set\n", + "pipeline.add_stage(DFPPreprocessingStage(config, input_schema=preprocess_schema))\n", + "\n", + "# Finally, perform training which will output a model\n", + "pipeline.add_stage(DFPTraining(config, validation_size=0.10))\n", + "\n", + "# Write that model to MLFlow\n", + "pipeline.add_stage(\n", + " DFPMLFlowModelWriterStage(config,\n", + " model_name_formatter=model_name_formatter,\n", + " experiment_name_formatter=experiment_name_formatter))\n", + "\n", + "# Run the pipeline\n", + "await pipeline.run_async()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4d2f7b52-bd77-4130-bbfa-97371b31ce24", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "morpheus", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": 
"ipython3", + "version": "3.10.14" + } + }, + "nbformat": 4, + "nbformat_minor": 5 } diff --git a/examples/digital_fingerprinting/production/morpheus/notebooks/dfp_duo_inference.ipynb b/examples/digital_fingerprinting/production/morpheus/notebooks/dfp_duo_inference.ipynb index 8bb35d5f78..c2b126d2cc 100644 --- a/examples/digital_fingerprinting/production/morpheus/notebooks/dfp_duo_inference.ipynb +++ b/examples/digital_fingerprinting/production/morpheus/notebooks/dfp_duo_inference.ipynb @@ -1,496 +1,496 @@ { - "cells": [ - { - "cell_type": "markdown", - "id": "2941e94f-db20-44a5-ab87-2cab499825f7", - "metadata": {}, - "source": [ - "# Digital Finger Printing (DFP) with Morpheus - DUO Inference\n", - "## Introduction\n", - "\n", - "In this notebook, we will be building and running a DFP pipeline that performs inference on Duo authentication logs. The goal is to use the pretrained models generated in the Duo Training notebook to generate anomaly scores for each log. These anomaly scores can be used by security teams to detect abnormal behavior when it happens so the proper action can be taken.\n", - "\n", - "
\n", - "Note: For more information on DFP, the Morpheus pipeline, and setup steps to run this notebook, please refer to the coresponding DFP training materials.\n", - "
" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b6c1cb50-74f2-445d-b865-8c22c3b3798b", - "metadata": {}, - "outputs": [], - "source": [ - "%load_ext autoreload\n", - "%autoreload 2\n", - "\n", - "# Ensure that the morpheus directory is in the python path. This may not need to be run depending on the environment setup\n", - "import sys\n", - "import os\n", - "sys.path.insert(0, os.path.abspath(\"../../morpheus\"))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "102ce011-3ca3-4f96-a72d-de28fad32003", - "metadata": {}, - "outputs": [], - "source": [ - "import functools\n", - "import logging\n", - "import os\n", - "import mlflow\n", - "import typing\n", - "from datetime import datetime\n", - "\n", - "from dfp.stages.dfp_file_batcher_stage import DFPFileBatcherStage\n", - "from dfp.stages.dfp_file_to_df import DFPFileToDataFrameStage\n", - "from dfp.stages.dfp_inference_stage import DFPInferenceStage\n", - "from dfp.stages.dfp_postprocessing_stage import DFPPostprocessingStage\n", - "from dfp.stages.dfp_preprocessing_stage import DFPPreprocessingStage\n", - "from dfp.stages.dfp_rolling_window_stage import DFPRollingWindowStage\n", - "from dfp.stages.dfp_split_users_stage import DFPSplitUsersStage\n", - "from dfp.stages.multi_file_source import MultiFileSource\n", - "from dfp.utils.regex_utils import iso_date_regex\n", - "\n", - "from morpheus.common import FileTypes\n", - "from morpheus.common import FilterSource\n", - "from morpheus.cli.utils import get_log_levels\n", - "from morpheus.cli.utils import get_package_relative_file\n", - "from morpheus.cli.utils import load_labels_file\n", - "from morpheus.cli.utils import parse_log_level\n", - "from morpheus.config import Config\n", - "from morpheus.config import ConfigAutoEncoder\n", - "from morpheus.config import CppConfig\n", - "from morpheus.pipeline import LinearPipeline\n", - "from morpheus.stages.output.write_to_file_stage import WriteToFileStage\n", - "from 
morpheus.utils.column_info import BoolColumn\n", - "from morpheus.utils.column_info import ColumnInfo\n", - "from morpheus.utils.column_info import DataFrameInputSchema\n", - "from morpheus.utils.column_info import DateTimeColumn\n", - "from morpheus.utils.column_info import DistinctIncrementColumn\n", - "from morpheus.utils.column_info import IncrementColumn\n", - "from morpheus.utils.column_info import RenameColumn\n", - "from morpheus.utils.column_info import StringCatColumn\n", - "from morpheus.utils.file_utils import date_extractor\n", - "from morpheus.stages.postprocess.filter_detections_stage import FilterDetectionsStage\n", - "from morpheus.stages.postprocess.serialize_stage import SerializeStage\n", - "from morpheus.utils.logger import configure_logging\n", - "\n", - "# Left align all tables\n", - "from IPython.core.display import HTML\n", - "table_css = 'table {align:left;display:block}'\n", - "HTML(''.format(table_css))" - ] - }, - { - "cell_type": "markdown", - "id": "ca38c1b7-ce84-43e0-ac53-280562dc1642", - "metadata": {}, - "source": [ - "## High Level Configuration\n", - "\n", - "The following options significantly alter the functionality of the pipeline. These options are separated from the individual stage options since they may effect more than one stage. Additionally, the matching python script to this notebook, `dfp_duo_pipeline.py`, configures these options via command line arguments.\n", - "\n", - "### Options\n", - "\n", - "| Name | Type | Description |\n", - "| --- | --- | :-- |\n", - "| `train_users` | One of `[\"none\"]` | For inference, this option should always be `\"none\"` |\n", - "| `skip_users` | List of strings | Any user in this list will be dropped from the pipeline. Useful for debugging to remove automated accounts with many logs. |\n", - "| `cache_dir` | string | The location to store cached files. To aid with development and reduce bandwidth, the Morpheus pipeline will cache data from several stages of the pipeline. 
This option configures the location for those caches. |\n", - "| `input_files` | List of strings | List of files to process. Can specify multiple arguments for multiple files. Also accepts glob (\\*) wildcards and schema prefixes such as `s3://`. For example, to make a local cache of an s3 bucket, use `filecache::s3://mybucket/*`. Refer to `fsspec` documentation for list of possible options. |\n", - "| `model_name_formatter` | string | A format string to use when building the model name. The model name is constructed by calling `model_name_formatter.format(user_id=user_id)`. For example, with `model_name_formatter=\"my_model-{user_id}\"` and a user ID of `\"first:last\"` would result in the model name of `\"my_model-first:last\"`. This should match the value used in `DFPMLFlowModelWriterStage`. Available keyword arguments: `user_id`, `user_md5`. |\n", - "| `experiment_name_formatter` | string | A format string (without the `f`) that will be used when creating an experiment in ML Flow. Available keyword arguments: `user_id`, `user_md5`, `reg_model_name`. 
|\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "9ee00703-75c5-46fc-890c-86733da906c4", - "metadata": {}, - "outputs": [], - "source": [ - "# Global options\n", - "train_users = \"none\"\n", - "\n", - "# Enter any users to skip here\n", - "skip_users: typing.List[str] = []\n", - "\n", - "# Location where cache objects will be saved\n", - "cache_dir = \"./.cache/dfp\"\n", - "\n", - "# Input files to read from\n", - "input_files = [\n", - " \"../../../../data/dfp/duo-inference-data/DUO_2022-08-*.json\"\n", - "]\n", - "\n", - "# The format to use for models\n", - "model_name_formatter = \"DFP-duo-{user_id}\"\n", - "\n", - "# === Derived Options ===\n", - "# To include the generic, we must be training all or generic\n", - "include_generic = train_users == \"all\" or train_users == \"generic\"\n", - "\n", - "# To include individual, we must be either training or inferring\n", - "include_individual = train_users != \"generic\"\n", - "\n", - "# None indicates we arent training anything\n", - "is_training = train_users != \"none\"\n", - "\n", - "# Tracking URI\n", - "tracking_uri = \"http://mlflow:5000\"" - ] - }, - { - "cell_type": "markdown", - "id": "fb6b7d75", - "metadata": {}, - "source": [ - "### Set MLFlow Tracking URI\n", - "Set MLFlow tracking URI to make inference calls." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "2a01ceb8", - "metadata": {}, - "outputs": [], - "source": [ - "mlflow.set_tracking_uri(tracking_uri)" - ] - }, - { - "cell_type": "markdown", - "id": "1cfc24c9-c85e-4977-a348-692c8f0aceaa", - "metadata": {}, - "source": [ - "### Global Config Object\n", - "Before creating the pipeline, we need to setup logging and set the parameters for the Morpheus config object. This config object is responsible for the following:\n", - " - Indicating whether to use C++ or Python stages\n", - " - C++ stages are not supported for the DFP pipeline. 
This should always be `False`\n", - " - Setting the number of threads to use in the pipeline. Defaults to the thread count of the OS.\n", - " - Sets the feature column names that will be used in model training\n", - " - This option allows extra columns to be used in the pipeline that will not be part of the training algorithm.\n", - " - The final features that the model will be trained on will be an intersection of this list with the log columns.\n", - " - The column name that indicates the user's unique identifier\n", - " - It is required for DFP to have a user ID column\n", - " - The column name that indicates the timestamp for the log\n", - " - It is required for DFP to know when each log occurred" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "01abd537-9162-49dc-8e83-d9465592f1d5", - "metadata": {}, - "outputs": [], - "source": [ - "# Enable the Morpheus logger\n", - "configure_logging(log_level=logging.DEBUG)\n", - "\n", - "config = Config()\n", - "\n", - "CppConfig.set_should_use_cpp(False)\n", - "\n", - "config.num_threads = len(os.sched_getaffinity(0))\n", - "\n", - "config.ae = ConfigAutoEncoder()\n", - "\n", - "config.ae.feature_columns = [\n", - " 'accessdevicebrowser', 'accessdeviceos', 'authdevicename', 'result', 'reason', 'logcount', \"locincrement\"\n", - "]\n", - "config.ae.userid_column_name = \"username\"\n", - "config.ae.timestamp_column_name = \"timestamp\"" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a73a4d53-32b6-4ab8-a5d7-c0104b31c69b", - "metadata": {}, - "outputs": [], - "source": [ - "# Specify the column names to ensure all data is uniform\n", - "source_column_info = [\n", - " DateTimeColumn(name=config.ae.timestamp_column_name, dtype=datetime, input_name=\"timestamp\"),\n", - " RenameColumn(name=config.ae.userid_column_name, dtype=str, input_name=\"user.name\"),\n", - " RenameColumn(name=\"accessdevicebrowser\", dtype=str, input_name=\"access_device.browser\"),\n", - " 
RenameColumn(name=\"accessdeviceos\", dtype=str, input_name=\"access_device.os\"),\n", - " StringCatColumn(name=\"location\",\n", - " dtype=str,\n", - " input_columns=[\n", - " \"access_device.location.city\",\n", - " \"access_device.location.state\",\n", - " \"access_device.location.country\"\n", - " ],\n", - " sep=\", \"),\n", - " RenameColumn(name=\"authdevicename\", dtype=str, input_name=\"auth_device.name\"),\n", - " BoolColumn(name=\"result\",\n", - " dtype=bool,\n", - " input_name=\"result\",\n", - " true_values=[\"success\", \"SUCCESS\"],\n", - " false_values=[\"denied\", \"DENIED\", \"FRAUD\"]),\n", - " ColumnInfo(name=\"reason\", dtype=str),\n", - " # CustomColumn(name=\"user.groups\", dtype=str, process_column_fn=partial(column_listjoin, col_name=\"user.groups\"))\n", - "]\n", - "\n", - "source_schema = DataFrameInputSchema(json_columns=[\"access_device\", \"application\", \"auth_device\", \"user\"],\n", - " column_info=source_column_info)\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "f7a0cb0a-e65a-444a-a06c-a4525d543790", - "metadata": {}, - "outputs": [], - "source": [ - "# Preprocessing schema\n", - "preprocess_column_info = [\n", - " ColumnInfo(name=config.ae.timestamp_column_name, dtype=datetime),\n", - " ColumnInfo(name=config.ae.userid_column_name, dtype=str),\n", - " ColumnInfo(name=\"accessdevicebrowser\", dtype=str),\n", - " ColumnInfo(name=\"accessdeviceos\", dtype=str),\n", - " ColumnInfo(name=\"authdevicename\", dtype=str),\n", - " ColumnInfo(name=\"result\", dtype=bool),\n", - " ColumnInfo(name=\"reason\", dtype=str),\n", - " # Derived columns\n", - " IncrementColumn(name=\"logcount\",\n", - " dtype=int,\n", - " input_name=config.ae.timestamp_column_name,\n", - " groupby_column=config.ae.userid_column_name),\n", - " DistinctIncrementColumn(name=\"locincrement\",\n", - " dtype=int,\n", - " input_name=\"location\",\n", - " groupby_column=config.ae.userid_column_name,\n", - " 
timestamp_column=config.ae.timestamp_column_name)\n", - "]\n", - "\n", - "preprocess_schema = DataFrameInputSchema(column_info=preprocess_column_info, preserve_columns=[\"_batch_id\"])\n" - ] - }, - { - "cell_type": "markdown", - "id": "bdfc59de-dea8-4e5f-98e3-98eba1e4621d", - "metadata": { - "tags": [] - }, - "source": [ - "## Pipeline Construction\n", - "From this point on we begin constructing the stages that will make up the pipeline. To make testing easier, constructing the pipeline object, adding the stages, and running the pipeline, is provided as a single cell. The below cell can be rerun multiple times as needed for debugging.\n", - "\n", - "### Source Stage (`MultiFileSource`)\n", - "\n", - "This pipeline read input logs from one or more input files. This source stage will read all specified log files, combine them into a single `DataFrame`, and pass it into the pipeline. Once all of the logs have been read, the source completes. \n", - "\n", - "| Name | Type | Default | Description |\n", - "| --- | --- | --- | :-- |\n", - "| `filenames` | List of strings | | Any files to read into the pipeline. All files will be combined into a single `DataFrame` |\n", - "\n", - "### File Batcher Stage (`DFPFileBatcherStage`)\n", - "\n", - "To improve performance, multiple small input files can be batched together into a single DataFrame for processing. This stage is responsible for determining the timestamp of input files, grouping input files into batches by time, and sending the batches to be processed into a single DataFrame. Repeated batches of files will be loaded from cache resulting in increased performance. For example, when performaing a 60 day training run, 59 days can be cached with a period of `\"D\"` and retraining once per day.\n", - "\n", - "| Name | Type | Default | Description |\n", - "| --- | --- | --- | :-- |\n", - "| `period` | `str` | `\"D\"` | The period to create batches. Refer to `pandas` windowing frequency documentation for available options. 
|\n", - "| `date_conversion_func` | Function of `typing.Callable[[fsspec.core.OpenFile], datetime]` | | A callback which is responsible for determining the date for a specified file. |\n", - "\n", - "### File to DataFrame Stage (`DFPFileToDataFrameStage`)\n", - "\n", - "After files have been batched into groups, this stage is responsible for reading the files and converting into a DataFrame. The specified input schema converts the raw DataFrame into one suitable for caching and processing. Any columns that are not needed should be excluded from the schema.\n", - "\n", - "| Name | Type | Default | Description |\n", - "| --- | --- | --- | :-- |\n", - "| `schema` | `DataFrameInputSchema` | | After the raw `DataFrame` is read from each file, this schema will be applied to ensure a consisten output from the source. |\n", - "| `file_type` | `FileTypes` | `FileTypes.Auto` | Allows overriding the file type. When set to `Auto`, the file extension will be used. Options are `CSV`, `JSON`, `PARQUET`, `Auto`. |\n", - "| `parser_kwargs` | `dict` | `{}` | This dictionary will be passed to the `DataFrame` parser class. Allows for customization of log parsing. |\n", - "| `cache_dir` | `str` | `./.cache/dfp` | The location to write cached input files to. |\n", - "\n", - "### Split Users Stage (`DFPSplitUsersStage`)\n", - "\n", - "Once the input logs have been read into a `DataFrame`, this stage is responsible for breaking that single `DataFrame` with many users into multiple `DataFrame`s for each user. 
This is also where the pipeline chooses whether to train individual users or the generic user (or both).\n", - "\n", - "| Name | Type | Default | Description |\n", - "| --- | --- | --- | :-- |\n", - "| `include_generic` | `bool` | | Whether or not to combine all user logs into a single `DataFrame` with the username 'generic_user' |\n", - "| `include_individual` | `bool` | | Whether or not to output individual `DataFrame` objects for each user |\n", - "| `skip_users` | List of `str` | `[]` | Any users to remove from the `DataFrame`. Useful for debugging to remove automated accounts with many logs. Mutually exclusive with `only_users`. |\n", - "| `only_users` | List of `str` | `[]` | Only allow these users in the final `DataFrame`. Useful for debugging to focus on specific users. Mutually exclusive with `skip_users`. |\n", - "\n", - "### Rolling Window Stage (`DFPRollingWindowStage`)\n", - "\n", - "The Rolling Window Stage performs several key pieces of functionality for DFP.\n", - "1. This stage keeps a moving window of logs on a per user basis\n", - " 1. These logs are saved to disk to reduce memory requirements between logs from the same user\n", - "1. It only emits logs when the window history requirements are met\n", - " 1. Until all of the window history requirements are met, no messages will be sent to the rest of the pipeline.\n", - " 1. Configuration options for defining the window history requirements are detailed below.\n", - "1. It repeats the necessary logs to properly calculate log dependent features.\n", - " 1. To support all column feature types, incoming log messages can be combined with existing history and sent to downstream stages.\n", - " 1. For example, to calculate a feature that increments a counter for the number of logs a particular user has generated in a single day, we must have the user's log history for the past 24 hours. To support this, this stage will combine new logs with existing history into a single `DataFrame`.\n", - " 1. 
It is the responsibility of downstream stages to distinguish between new logs and existing history.\n", - "\n", - "| Name | Type | Default | Description |\n", - "| --- | --- | --- | :-- |\n", - "| `min_history` | `int` | `1` | The minimum number of logs a user must have before emitting any messages. Logs below this threshold will be saved to disk. |\n", - "| `min_increment` | `int` or `str` | `0` | Once the min history requirement is met, this stage must receive `min_increment` *new* logs before emmitting another message. Logs received before this threshold is met will be saved to disk. Can be specified as an integer count or a string duration. |\n", - "| `max_history` | `int` or `str` | `\"1d\"` | Once `min_history` and `min_increment` requirements have been met, this puts an upper bound on the maximum number of messages to forward into the pipeline and also the maximum amount of messages to retain in the history. Can be specified as an integer count or a string duration. |\n", - "| `cache_dir` | `str` | `./.cache/dfp` | The location to write log history to disk. |\n", - "\n", - "### Preprocessing Stage (`DFPPreprocessingStage`)\n", - "\n", - "This stage performs the final, row dependent, feature calculations as specified by the input schema object. Once calculated, this stage can forward on all received logs, or optionally can only forward on new logs, removing any history information.\n", - "\n", - "| Name | Type | Default | Description |\n", - "| --- | --- | --- | :-- |\n", - "| `input_schema` | `DataFrameInputSchema` | | The final, row dependent, schema to apply to the incoming columns |\n", - "\n", - "### Inference Stage (`DFPInference`)\n", - "\n", - "This stage performs several tasks to aid in performing inference. This stage will:\n", - "1. Download models as needed from MLFlow\n", - "1. Cache previously downloaded models to improve performance\n", - " 1. Models in the cache will be periodically refreshed from MLFlow at a configured rate\n", - "1. 
Perform inference using the downloaded model\n", - "\n", - "| Name | Type | Default | Description |\n", - "| --- | --- | --- | :-- |\n", - "| `model_name_formatter` | `str` | `\"\"` | A format string to use when building the model name. The model name is constructed by calling `model_name_formatter.format(user_id=user_id)`. For example, with `model_name_formatter=\"my_model-{user_id}\"` and a user ID of `\"first:last\"` would result in the model name of `\"my_model-first:last\"`. This should match the value used in `DFPMLFlowModelWriterStage` |\n", - "\n", - "### Filter Detection Stage (`FilterDetectionsStage`)\n", - "This stage filters the output from the inference stage for any anomalous messages. Logs which exceed the specified Z-Score will be passed onto the next stage. All remaining logs which are below the threshold will be dropped. For the purposes of the DFP pipeline, this stage is configured to use the `mean_abs_z` column of the DataFrame as the filter criteria.\n", - "\n", - "| Name | Type | Default | Description |\n", - "| --- | --- | --- | :-- |\n", - "| `threshold` | `float` | `0.5` | The threshold value above which logs are considered to be anomalous. The default is `0.5`, however the DFP pipeline uses a value of `2.0`. All normal logs will be filtered out and anomalous logs will be passed on. |\n", - "| `copy` | `bool` | `True` | When the `copy` argument is `True` (default), rows that meet the filter criteria are copied into a new dataframe. When `False` sliced views are used instead. This is a performance optimization, and has no functional impact. |\n", - "| `filter_source` | `FilterSource` | `FilterSource.Auto` | Indicates if the filter criteria exists in an output tensor (`FilterSource.TENSOR`) or a column in a DataFrame (`FilterSource.DATAFRAME`). |\n", - "| `field_name` | `str` | `probs` | Name of the tensor (`filter_source=FilterSource.TENSOR`) or DataFrame column (`filter_source=FilterSource.DATAFRAME`) to use as the filter criteria. 
|\n", - "\n", - "\n", - "### Post Processing Stage (`DFPPostprocessingStage`)\n", - "This stage adds a new `event_time` column to the DataFrame indicating the time which Morpheus detected the anomalous messages, and replaces any `NAN` values with the a string value of `'NaN'`.\n", - "\n", - "### Serialize Stage (`SerializeStage`)\n", - "This stage controls which columns in the DataFrame will be included in the output. For the purposes of the DFP pipeline, we will exclude columns that are used internally by the pipeline which are not of interest to the end-user.\n", - "\n", - "| Name | Type | Default | Description |\n", - "| --- | --- | --- | :-- |\n", - "| `include` | List of `str` | `[]` | List of regular expression patterns matching columns to include in the output. Specifying an empty list causes all columns to be included not explicitly excluded. |\n", - "| `exclude` | List of `str` | `[r'^ID$', r'^_ts_']` | List of regular expression patterns matching columns to exclude from the output. |\n", - "| `fixed_columns` | `bool` | `True` | When `True` it is assumed that the Dataframe in all messages contain the same columns as the first message received. |\n", - "\n", - "### Write to File Stage (`WriteToFileStage`)\n", - "\n", - "This final stage will write all received messages to a single output file in either CSV or JSON format.\n", - "\n", - "| Name | Type | Default | Description |\n", - "| --- | --- | --- | :-- |\n", - "| `filename` | `str` | | The file to write anomalous log messages to. 
|\n", - "| `overwrite` | `bool` | `False` | If the file specified in `filename` already exists, it will be overwritten if this option is set to `True` |" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "825390ad-ce64-4949-b324-33039ffdf264", - "metadata": {}, - "outputs": [], - "source": [ - "# Create a linear pipeline object\n", - "pipeline = LinearPipeline(config)\n", - "\n", - "# Source stage\n", - "pipeline.set_source(MultiFileSource(config, filenames=input_files))\n", - "\n", - "# Batch files into batches by time. Use the default ISO date extractor from the filename\n", - "pipeline.add_stage(\n", - " DFPFileBatcherStage(config,\n", - " period=\"D\",\n", - " date_conversion_func=functools.partial(date_extractor, filename_regex=iso_date_regex)))\n", - "\n", - "# Output is a list of fsspec files. Convert to DataFrames. This caches downloaded data\n", - "pipeline.add_stage(\n", - " DFPFileToDataFrameStage(config,\n", - " schema=source_schema,\n", - " file_type=FileTypes.JSON,\n", - " parser_kwargs={\n", - " \"lines\": False, \"orient\": \"records\"\n", - " },\n", - " cache_dir=cache_dir))\n", - "\n", - "\n", - "# This will split users or just use one single user\n", - "pipeline.add_stage(\n", - " DFPSplitUsersStage(config,\n", - " include_generic=include_generic,\n", - " include_individual=include_individual,\n", - " skip_users=skip_users))\n", - "\n", - "# Next, have a stage that will create rolling windows\n", - "pipeline.add_stage(\n", - " DFPRollingWindowStage(\n", - " config,\n", - " min_history=300 if is_training else 1,\n", - " min_increment=300 if is_training else 0,\n", - " # For inference, we only ever want 1 day max\n", - " max_history=\"60d\" if is_training else \"1d\",\n", - " cache_dir=cache_dir))\n", - "\n", - "# Output is UserMessageMeta -- Cached frame set\n", - "pipeline.add_stage(DFPPreprocessingStage(config, input_schema=preprocess_schema))\n", - "\n", - "# Perform inference on the preprocessed data\n", - 
"pipeline.add_stage(DFPInferenceStage(config, model_name_formatter=model_name_formatter))\n", - "\n", - "# Filter for only the anomalous logs\n", - "pipeline.add_stage(\n", - " FilterDetectionsStage(config, threshold=2.0, filter_source=FilterSource.DATAFRAME, field_name='mean_abs_z'))\n", - "pipeline.add_stage(DFPPostprocessingStage(config))\n", - "\n", - "# Exclude the columns we don't want in our output\n", - "pipeline.add_stage(SerializeStage(config, exclude=['batch_count', 'origin_hash', '_row_hash', '_batch_id']))\n", - "\n", - "# Write all anomalies to a CSV file\n", - "pipeline.add_stage(WriteToFileStage(config, filename=\"dfp_detections_duo.csv\", overwrite=True))\n", - "\n", - "# Run the pipeline\n", - "await pipeline.run_async()\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "75c0cf6b-8255-4d90-b67c-151518c7423b", - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "morpheus", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.14" - } - }, - "nbformat": 4, - "nbformat_minor": 5 + "cells": [ + { + "cell_type": "markdown", + "id": "2941e94f-db20-44a5-ab87-2cab499825f7", + "metadata": {}, + "source": [ + "# Digital Finger Printing (DFP) with Morpheus - DUO Inference\n", + "## Introduction\n", + "\n", + "In this notebook, we will be building and running a DFP pipeline that performs inference on Duo authentication logs. The goal is to use the pretrained models generated in the Duo Training notebook to generate anomaly scores for each log. These anomaly scores can be used by security teams to detect abnormal behavior when it happens so the proper action can be taken.\n", + "\n", + "
\n", + "Note: For more information on DFP, the Morpheus pipeline, and setup steps to run this notebook, please refer to the corresponding DFP training materials.\n", + "
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b6c1cb50-74f2-445d-b865-8c22c3b3798b", + "metadata": {}, + "outputs": [], + "source": [ + "%load_ext autoreload\n", + "%autoreload 2\n", + "\n", + "# Ensure that the morpheus directory is in the python path. This may not need to be run depending on the environment setup\n", + "import sys\n", + "import os\n", + "sys.path.insert(0, os.path.abspath(\"../../morpheus\"))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "102ce011-3ca3-4f96-a72d-de28fad32003", + "metadata": {}, + "outputs": [], + "source": [ + "import functools\n", + "import logging\n", + "import os\n", + "import mlflow\n", + "import typing\n", + "from datetime import datetime\n", + "\n", + "from dfp.stages.dfp_file_batcher_stage import DFPFileBatcherStage\n", + "from dfp.stages.dfp_file_to_df import DFPFileToDataFrameStage\n", + "from dfp.stages.dfp_inference_stage import DFPInferenceStage\n", + "from dfp.stages.dfp_postprocessing_stage import DFPPostprocessingStage\n", + "from dfp.stages.dfp_preprocessing_stage import DFPPreprocessingStage\n", + "from dfp.stages.dfp_rolling_window_stage import DFPRollingWindowStage\n", + "from dfp.stages.dfp_split_users_stage import DFPSplitUsersStage\n", + "from dfp.stages.multi_file_source import MultiFileSource\n", + "from dfp.utils.regex_utils import iso_date_regex\n", + "\n", + "from morpheus.common import FileTypes\n", + "from morpheus.common import FilterSource\n", + "from morpheus.cli.utils import get_log_levels\n", + "from morpheus.cli.utils import get_package_relative_file\n", + "from morpheus.utils.file_utils import load_labels_file\n", + "from morpheus.cli.utils import parse_log_level\n", + "from morpheus.config import Config\n", + "from morpheus.config import ConfigAutoEncoder\n", + "from morpheus.config import CppConfig\n", + "from morpheus.pipeline import LinearPipeline\n", + "from morpheus.stages.output.write_to_file_stage import WriteToFileStage\n", + "from 
morpheus.utils.column_info import BoolColumn\n", + "from morpheus.utils.column_info import ColumnInfo\n", + "from morpheus.utils.column_info import DataFrameInputSchema\n", + "from morpheus.utils.column_info import DateTimeColumn\n", + "from morpheus.utils.column_info import DistinctIncrementColumn\n", + "from morpheus.utils.column_info import IncrementColumn\n", + "from morpheus.utils.column_info import RenameColumn\n", + "from morpheus.utils.column_info import StringCatColumn\n", + "from morpheus.utils.file_utils import date_extractor\n", + "from morpheus.stages.postprocess.filter_detections_stage import FilterDetectionsStage\n", + "from morpheus.stages.postprocess.serialize_stage import SerializeStage\n", + "from morpheus.utils.logger import configure_logging\n", + "\n", + "# Left align all tables\n", + "from IPython.core.display import HTML\n", + "table_css = 'table {align:left;display:block}'\n", + "HTML(''.format(table_css))" + ] + }, + { + "cell_type": "markdown", + "id": "ca38c1b7-ce84-43e0-ac53-280562dc1642", + "metadata": {}, + "source": [ + "## High Level Configuration\n", + "\n", + "The following options significantly alter the functionality of the pipeline. These options are separated from the individual stage options since they may affect more than one stage. Additionally, the matching python script to this notebook, `dfp_duo_pipeline.py`, configures these options via command line arguments.\n", + "\n", + "### Options\n", + "\n", + "| Name | Type | Description |\n", + "| --- | --- | :-- |\n", + "| `train_users` | One of `[\"none\"]` | For inference, this option should always be `\"none\"` |\n", + "| `skip_users` | List of strings | Any user in this list will be dropped from the pipeline. Useful for debugging to remove automated accounts with many logs. |\n", + "| `cache_dir` | string | The location to store cached files. To aid with development and reduce bandwidth, the Morpheus pipeline will cache data from several stages of the pipeline. 
This option configures the location for those caches. |\n", + "| `input_files` | List of strings | List of files to process. Can specify multiple arguments for multiple files. Also accepts glob (\\*) wildcards and schema prefixes such as `s3://`. For example, to make a local cache of an s3 bucket, use `filecache::s3://mybucket/*`. Refer to `fsspec` documentation for list of possible options. |\n", + "| `model_name_formatter` | string | A format string to use when building the model name. The model name is constructed by calling `model_name_formatter.format(user_id=user_id)`. For example, with `model_name_formatter=\"my_model-{user_id}\"` and a user ID of `\"first:last\"` would result in the model name of `\"my_model-first:last\"`. This should match the value used in `DFPMLFlowModelWriterStage`. Available keyword arguments: `user_id`, `user_md5`. |\n", + "| `experiment_name_formatter` | string | A format string (without the `f`) that will be used when creating an experiment in ML Flow. Available keyword arguments: `user_id`, `user_md5`, `reg_model_name`. 
|\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9ee00703-75c5-46fc-890c-86733da906c4", + "metadata": {}, + "outputs": [], + "source": [ + "# Global options\n", + "train_users = \"none\"\n", + "\n", + "# Enter any users to skip here\n", + "skip_users: typing.List[str] = []\n", + "\n", + "# Location where cache objects will be saved\n", + "cache_dir = \"./.cache/dfp\"\n", + "\n", + "# Input files to read from\n", + "input_files = [\n", + " \"../../../../data/dfp/duo-inference-data/DUO_2022-08-*.json\"\n", + "]\n", + "\n", + "# The format to use for models\n", + "model_name_formatter = \"DFP-duo-{user_id}\"\n", + "\n", + "# === Derived Options ===\n", + "# To include the generic, we must be training all or generic\n", + "include_generic = train_users == \"all\" or train_users == \"generic\"\n", + "\n", + "# To include individual, we must be either training or inferring\n", + "include_individual = train_users != \"generic\"\n", + "\n", + "# None indicates we arent training anything\n", + "is_training = train_users != \"none\"\n", + "\n", + "# Tracking URI\n", + "tracking_uri = \"http://mlflow:5000\"" + ] + }, + { + "cell_type": "markdown", + "id": "fb6b7d75", + "metadata": {}, + "source": [ + "### Set MLFlow Tracking URI\n", + "Set MLFlow tracking URI to make inference calls." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2a01ceb8", + "metadata": {}, + "outputs": [], + "source": [ + "mlflow.set_tracking_uri(tracking_uri)" + ] + }, + { + "cell_type": "markdown", + "id": "1cfc24c9-c85e-4977-a348-692c8f0aceaa", + "metadata": {}, + "source": [ + "### Global Config Object\n", + "Before creating the pipeline, we need to setup logging and set the parameters for the Morpheus config object. This config object is responsible for the following:\n", + " - Indicating whether to use C++ or Python stages\n", + " - C++ stages are not supported for the DFP pipeline. 
This should always be `False`\n", + " - Setting the number of threads to use in the pipeline. Defaults to the thread count of the OS.\n", + " - Sets the feature column names that will be used in model training\n", + " - This option allows extra columns to be used in the pipeline that will not be part of the training algorithm.\n", + " - The final features that the model will be trained on will be an intersection of this list with the log columns.\n", + " - The column name that indicates the user's unique identifier\n", + " - It is required for DFP to have a user ID column\n", + " - The column name that indicates the timestamp for the log\n", + " - It is required for DFP to know when each log occurred" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "01abd537-9162-49dc-8e83-d9465592f1d5", + "metadata": {}, + "outputs": [], + "source": [ + "# Enable the Morpheus logger\n", + "configure_logging(log_level=logging.DEBUG)\n", + "\n", + "config = Config()\n", + "\n", + "CppConfig.set_should_use_cpp(False)\n", + "\n", + "config.num_threads = len(os.sched_getaffinity(0))\n", + "\n", + "config.ae = ConfigAutoEncoder()\n", + "\n", + "config.ae.feature_columns = [\n", + " 'accessdevicebrowser', 'accessdeviceos', 'authdevicename', 'result', 'reason', 'logcount', \"locincrement\"\n", + "]\n", + "config.ae.userid_column_name = \"username\"\n", + "config.ae.timestamp_column_name = \"timestamp\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a73a4d53-32b6-4ab8-a5d7-c0104b31c69b", + "metadata": {}, + "outputs": [], + "source": [ + "# Specify the column names to ensure all data is uniform\n", + "source_column_info = [\n", + " DateTimeColumn(name=config.ae.timestamp_column_name, dtype=datetime, input_name=\"timestamp\"),\n", + " RenameColumn(name=config.ae.userid_column_name, dtype=str, input_name=\"user.name\"),\n", + " RenameColumn(name=\"accessdevicebrowser\", dtype=str, input_name=\"access_device.browser\"),\n", + " 
RenameColumn(name=\"accessdeviceos\", dtype=str, input_name=\"access_device.os\"),\n", + " StringCatColumn(name=\"location\",\n", + " dtype=str,\n", + " input_columns=[\n", + " \"access_device.location.city\",\n", + " \"access_device.location.state\",\n", + " \"access_device.location.country\"\n", + " ],\n", + " sep=\", \"),\n", + " RenameColumn(name=\"authdevicename\", dtype=str, input_name=\"auth_device.name\"),\n", + " BoolColumn(name=\"result\",\n", + " dtype=bool,\n", + " input_name=\"result\",\n", + " true_values=[\"success\", \"SUCCESS\"],\n", + " false_values=[\"denied\", \"DENIED\", \"FRAUD\"]),\n", + " ColumnInfo(name=\"reason\", dtype=str),\n", + " # CustomColumn(name=\"user.groups\", dtype=str, process_column_fn=partial(column_listjoin, col_name=\"user.groups\"))\n", + "]\n", + "\n", + "source_schema = DataFrameInputSchema(json_columns=[\"access_device\", \"application\", \"auth_device\", \"user\"],\n", + " column_info=source_column_info)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f7a0cb0a-e65a-444a-a06c-a4525d543790", + "metadata": {}, + "outputs": [], + "source": [ + "# Preprocessing schema\n", + "preprocess_column_info = [\n", + " ColumnInfo(name=config.ae.timestamp_column_name, dtype=datetime),\n", + " ColumnInfo(name=config.ae.userid_column_name, dtype=str),\n", + " ColumnInfo(name=\"accessdevicebrowser\", dtype=str),\n", + " ColumnInfo(name=\"accessdeviceos\", dtype=str),\n", + " ColumnInfo(name=\"authdevicename\", dtype=str),\n", + " ColumnInfo(name=\"result\", dtype=bool),\n", + " ColumnInfo(name=\"reason\", dtype=str),\n", + " # Derived columns\n", + " IncrementColumn(name=\"logcount\",\n", + " dtype=int,\n", + " input_name=config.ae.timestamp_column_name,\n", + " groupby_column=config.ae.userid_column_name),\n", + " DistinctIncrementColumn(name=\"locincrement\",\n", + " dtype=int,\n", + " input_name=\"location\",\n", + " groupby_column=config.ae.userid_column_name,\n", + " 
timestamp_column=config.ae.timestamp_column_name)\n", + "]\n", + "\n", + "preprocess_schema = DataFrameInputSchema(column_info=preprocess_column_info, preserve_columns=[\"_batch_id\"])\n" + ] + }, + { + "cell_type": "markdown", + "id": "bdfc59de-dea8-4e5f-98e3-98eba1e4621d", + "metadata": { + "tags": [] + }, + "source": [ + "## Pipeline Construction\n", + "From this point on we begin constructing the stages that will make up the pipeline. To make testing easier, constructing the pipeline object, adding the stages, and running the pipeline, is provided as a single cell. The below cell can be rerun multiple times as needed for debugging.\n", + "\n", + "### Source Stage (`MultiFileSource`)\n", + "\n", + "This pipeline reads input logs from one or more input files. This source stage will read all specified log files, combine them into a single `DataFrame`, and pass it into the pipeline. Once all of the logs have been read, the source completes. \n", + "\n", + "| Name | Type | Default | Description |\n", + "| --- | --- | --- | :-- |\n", + "| `filenames` | List of strings | | Any files to read into the pipeline. All files will be combined into a single `DataFrame` |\n", + "\n", + "### File Batcher Stage (`DFPFileBatcherStage`)\n", + "\n", + "To improve performance, multiple small input files can be batched together into a single DataFrame for processing. This stage is responsible for determining the timestamp of input files, grouping input files into batches by time, and sending the batches to be processed into a single DataFrame. Repeated batches of files will be loaded from cache resulting in increased performance. For example, when performing a 60 day training run, 59 days can be cached with a period of `\"D\"` and retraining once per day.\n", + "\n", + "| Name | Type | Default | Description |\n", + "| --- | --- | --- | :-- |\n", + "| `period` | `str` | `\"D\"` | The period to create batches. Refer to `pandas` windowing frequency documentation for available options. 
|\n", + "| `date_conversion_func` | Function of `typing.Callable[[fsspec.core.OpenFile], datetime]` | | A callback which is responsible for determining the date for a specified file. |\n", + "\n", + "### File to DataFrame Stage (`DFPFileToDataFrameStage`)\n", + "\n", + "After files have been batched into groups, this stage is responsible for reading the files and converting into a DataFrame. The specified input schema converts the raw DataFrame into one suitable for caching and processing. Any columns that are not needed should be excluded from the schema.\n", + "\n", + "| Name | Type | Default | Description |\n", + "| --- | --- | --- | :-- |\n", + "| `schema` | `DataFrameInputSchema` | | After the raw `DataFrame` is read from each file, this schema will be applied to ensure a consistent output from the source. |\n", + "| `file_type` | `FileTypes` | `FileTypes.Auto` | Allows overriding the file type. When set to `Auto`, the file extension will be used. Options are `CSV`, `JSON`, `PARQUET`, `Auto`. |\n", + "| `parser_kwargs` | `dict` | `{}` | This dictionary will be passed to the `DataFrame` parser class. Allows for customization of log parsing. |\n", + "| `cache_dir` | `str` | `./.cache/dfp` | The location to write cached input files to. |\n", + "\n", + "### Split Users Stage (`DFPSplitUsersStage`)\n", + "\n", + "Once the input logs have been read into a `DataFrame`, this stage is responsible for breaking that single `DataFrame` with many users into multiple `DataFrame`s for each user. 
This is also where the pipeline chooses whether to train individual users or the generic user (or both).\n", + "\n", + "| Name | Type | Default | Description |\n", + "| --- | --- | --- | :-- |\n", + "| `include_generic` | `bool` | | Whether or not to combine all user logs into a single `DataFrame` with the username 'generic_user' |\n", + "| `include_individual` | `bool` | | Whether or not to output individual `DataFrame` objects for each user |\n", + "| `skip_users` | List of `str` | `[]` | Any users to remove from the `DataFrame`. Useful for debugging to remove automated accounts with many logs. Mutually exclusive with `only_users`. |\n", + "| `only_users` | List of `str` | `[]` | Only allow these users in the final `DataFrame`. Useful for debugging to focus on specific users. Mutually exclusive with `skip_users`. |\n", + "\n", + "### Rolling Window Stage (`DFPRollingWindowStage`)\n", + "\n", + "The Rolling Window Stage performs several key pieces of functionality for DFP.\n", + "1. This stage keeps a moving window of logs on a per user basis\n", + " 1. These logs are saved to disk to reduce memory requirements between logs from the same user\n", + "1. It only emits logs when the window history requirements are met\n", + " 1. Until all of the window history requirements are met, no messages will be sent to the rest of the pipeline.\n", + " 1. Configuration options for defining the window history requirements are detailed below.\n", + "1. It repeats the necessary logs to properly calculate log dependent features.\n", + " 1. To support all column feature types, incoming log messages can be combined with existing history and sent to downstream stages.\n", + " 1. For example, to calculate a feature that increments a counter for the number of logs a particular user has generated in a single day, we must have the user's log history for the past 24 hours. To support this, this stage will combine new logs with existing history into a single `DataFrame`.\n", + " 1. 
It is the responsibility of downstream stages to distinguish between new logs and existing history.\n", + "\n", + "| Name | Type | Default | Description |\n", + "| --- | --- | --- | :-- |\n", + "| `min_history` | `int` | `1` | The minimum number of logs a user must have before emitting any messages. Logs below this threshold will be saved to disk. |\n", + "| `min_increment` | `int` or `str` | `0` | Once the min history requirement is met, this stage must receive `min_increment` *new* logs before emitting another message. Logs received before this threshold is met will be saved to disk. Can be specified as an integer count or a string duration. |\n", + "| `max_history` | `int` or `str` | `\"1d\"` | Once `min_history` and `min_increment` requirements have been met, this puts an upper bound on the maximum number of messages to forward into the pipeline and also the maximum amount of messages to retain in the history. Can be specified as an integer count or a string duration. |\n", + "| `cache_dir` | `str` | `./.cache/dfp` | The location to write log history to disk. |\n", + "\n", + "### Preprocessing Stage (`DFPPreprocessingStage`)\n", + "\n", + "This stage performs the final, row dependent, feature calculations as specified by the input schema object. Once calculated, this stage can forward on all received logs, or optionally can only forward on new logs, removing any history information.\n", + "\n", + "| Name | Type | Default | Description |\n", + "| --- | --- | --- | :-- |\n", + "| `input_schema` | `DataFrameInputSchema` | | The final, row dependent, schema to apply to the incoming columns |\n", + "\n", + "### Inference Stage (`DFPInferenceStage`)\n", + "\n", + "This stage performs several tasks to aid in performing inference. This stage will:\n", + "1. Download models as needed from MLFlow\n", + "1. Cache previously downloaded models to improve performance\n", + " 1. Models in the cache will be periodically refreshed from MLFlow at a configured rate\n", + "1. 
Perform inference using the downloaded model\n", + "\n", + "| Name | Type | Default | Description |\n", + "| --- | --- | --- | :-- |\n", + "| `model_name_formatter` | `str` | `\"\"` | A format string to use when building the model name. The model name is constructed by calling `model_name_formatter.format(user_id=user_id)`. For example, with `model_name_formatter=\"my_model-{user_id}\"` and a user ID of `\"first:last\"` would result in the model name of `\"my_model-first:last\"`. This should match the value used in `DFPMLFlowModelWriterStage` |\n", + "\n", + "### Filter Detection Stage (`FilterDetectionsStage`)\n", + "This stage filters the output from the inference stage for any anomalous messages. Logs which exceed the specified Z-Score will be passed onto the next stage. All remaining logs which are below the threshold will be dropped. For the purposes of the DFP pipeline, this stage is configured to use the `mean_abs_z` column of the DataFrame as the filter criteria.\n", + "\n", + "| Name | Type | Default | Description |\n", + "| --- | --- | --- | :-- |\n", + "| `threshold` | `float` | `0.5` | The threshold value above which logs are considered to be anomalous. The default is `0.5`, however the DFP pipeline uses a value of `2.0`. All normal logs will be filtered out and anomalous logs will be passed on. |\n", + "| `copy` | `bool` | `True` | When the `copy` argument is `True` (default), rows that meet the filter criteria are copied into a new dataframe. When `False` sliced views are used instead. This is a performance optimization, and has no functional impact. |\n", + "| `filter_source` | `FilterSource` | `FilterSource.Auto` | Indicates if the filter criteria exists in an output tensor (`FilterSource.TENSOR`) or a column in a DataFrame (`FilterSource.DATAFRAME`). |\n", + "| `field_name` | `str` | `probs` | Name of the tensor (`filter_source=FilterSource.TENSOR`) or DataFrame column (`filter_source=FilterSource.DATAFRAME`) to use as the filter criteria. 
|\n", + "\n", + "\n", + "### Post Processing Stage (`DFPPostprocessingStage`)\n", + "This stage adds a new `event_time` column to the DataFrame indicating the time at which Morpheus detected the anomalous messages, and replaces any `NAN` values with a string value of `'NaN'`.\n", + "\n", + "### Serialize Stage (`SerializeStage`)\n", + "This stage controls which columns in the DataFrame will be included in the output. For the purposes of the DFP pipeline, we will exclude columns that are used internally by the pipeline which are not of interest to the end-user.\n", + "\n", + "| Name | Type | Default | Description |\n", + "| --- | --- | --- | :-- |\n", + "| `include` | List of `str` | `[]` | List of regular expression patterns matching columns to include in the output. Specifying an empty list causes all columns that are not explicitly excluded to be included. |\n", + "| `exclude` | List of `str` | `[r'^ID$', r'^_ts_']` | List of regular expression patterns matching columns to exclude from the output. |\n", + "| `fixed_columns` | `bool` | `True` | When `True` it is assumed that the DataFrame in all messages contains the same columns as the first message received. |\n", + "\n", + "### Write to File Stage (`WriteToFileStage`)\n", + "\n", + "This final stage will write all received messages to a single output file in either CSV or JSON format.\n", + "\n", + "| Name | Type | Default | Description |\n", + "| --- | --- | --- | :-- |\n", + "| `filename` | `str` | | The file to write anomalous log messages to. 
|\n", + "| `overwrite` | `bool` | `False` | If the file specified in `filename` already exists, it will be overwritten if this option is set to `True` |" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "825390ad-ce64-4949-b324-33039ffdf264", + "metadata": {}, + "outputs": [], + "source": [ + "# Create a linear pipeline object\n", + "pipeline = LinearPipeline(config)\n", + "\n", + "# Source stage\n", + "pipeline.set_source(MultiFileSource(config, filenames=input_files))\n", + "\n", + "# Batch files into batches by time. Use the default ISO date extractor from the filename\n", + "pipeline.add_stage(\n", + " DFPFileBatcherStage(config,\n", + " period=\"D\",\n", + " date_conversion_func=functools.partial(date_extractor, filename_regex=iso_date_regex)))\n", + "\n", + "# Output is a list of fsspec files. Convert to DataFrames. This caches downloaded data\n", + "pipeline.add_stage(\n", + " DFPFileToDataFrameStage(config,\n", + " schema=source_schema,\n", + " file_type=FileTypes.JSON,\n", + " parser_kwargs={\n", + " \"lines\": False, \"orient\": \"records\"\n", + " },\n", + " cache_dir=cache_dir))\n", + "\n", + "\n", + "# This will split users or just use one single user\n", + "pipeline.add_stage(\n", + " DFPSplitUsersStage(config,\n", + " include_generic=include_generic,\n", + " include_individual=include_individual,\n", + " skip_users=skip_users))\n", + "\n", + "# Next, have a stage that will create rolling windows\n", + "pipeline.add_stage(\n", + " DFPRollingWindowStage(\n", + " config,\n", + " min_history=300 if is_training else 1,\n", + " min_increment=300 if is_training else 0,\n", + " # For inference, we only ever want 1 day max\n", + " max_history=\"60d\" if is_training else \"1d\",\n", + " cache_dir=cache_dir))\n", + "\n", + "# Output is UserMessageMeta -- Cached frame set\n", + "pipeline.add_stage(DFPPreprocessingStage(config, input_schema=preprocess_schema))\n", + "\n", + "# Perform inference on the preprocessed data\n", + 
"pipeline.add_stage(DFPInferenceStage(config, model_name_formatter=model_name_formatter))\n", + "\n", + "# Filter for only the anomalous logs\n", + "pipeline.add_stage(\n", + " FilterDetectionsStage(config, threshold=2.0, filter_source=FilterSource.DATAFRAME, field_name='mean_abs_z'))\n", + "pipeline.add_stage(DFPPostprocessingStage(config))\n", + "\n", + "# Exclude the columns we don't want in our output\n", + "pipeline.add_stage(SerializeStage(config, exclude=['batch_count', 'origin_hash', '_row_hash', '_batch_id']))\n", + "\n", + "# Write all anomalies to a CSV file\n", + "pipeline.add_stage(WriteToFileStage(config, filename=\"dfp_detections_duo.csv\", overwrite=True))\n", + "\n", + "# Run the pipeline\n", + "await pipeline.run_async()\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "75c0cf6b-8255-4d90-b67c-151518c7423b", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "morpheus", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.14" + } + }, + "nbformat": 4, + "nbformat_minor": 5 } diff --git a/examples/digital_fingerprinting/production/morpheus/notebooks/dfp_duo_training.ipynb b/examples/digital_fingerprinting/production/morpheus/notebooks/dfp_duo_training.ipynb index 1b1837d3e5..598903af35 100644 --- a/examples/digital_fingerprinting/production/morpheus/notebooks/dfp_duo_training.ipynb +++ b/examples/digital_fingerprinting/production/morpheus/notebooks/dfp_duo_training.ipynb @@ -1,461 +1,461 @@ { - "cells": [ - { - "cell_type": "markdown", - "id": "2941e94f-db20-44a5-ab87-2cab499825f7", - "metadata": {}, - "source": [ - "# Digital Finger Printing (DFP) with Morpheus - DUO Training\n", - "## Introduction\n", - "\n", - "In this 
notebook, we will be building and running a DFP pipeline that performs training on Duo authentication logs. The goal is to train an autoencoder PyTorch model to recogize the patterns of users in the sample data. The model will then be used by a second Morpheus pipeline to generate anomaly scores for each individual log. These anomaly scores can be used by security teams to detect abnormal behavior when it happens so the proper action can be taken.\n", - "\n", - "
\n", - "Note: For more information on DFP, the Morpheus pipeline, and setup steps to run this notebook, please refer to the coresponding DFP training materials.\n", - "
" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b6c1cb50-74f2-445d-b865-8c22c3b3798b", - "metadata": {}, - "outputs": [], - "source": [ - "%load_ext autoreload\n", - "%autoreload 2\n", - "\n", - "# Ensure that the morpheus directory is in the python path. This may not need to be run depending on the environment setup\n", - "import sys\n", - "import os\n", - "sys.path.insert(0, os.path.abspath(\"../../morpheus\"))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "102ce011-3ca3-4f96-a72d-de28fad32003", - "metadata": {}, - "outputs": [], - "source": [ - "import functools\n", - "import logging\n", - "import os\n", - "import mlflow\n", - "import typing\n", - "from datetime import datetime\n", - "\n", - "from dfp.stages.dfp_file_batcher_stage import DFPFileBatcherStage\n", - "from dfp.stages.dfp_file_to_df import DFPFileToDataFrameStage\n", - "from dfp.stages.dfp_mlflow_model_writer import DFPMLFlowModelWriterStage\n", - "from dfp.stages.dfp_preprocessing_stage import DFPPreprocessingStage\n", - "from dfp.stages.dfp_rolling_window_stage import DFPRollingWindowStage\n", - "from dfp.stages.dfp_split_users_stage import DFPSplitUsersStage\n", - "from dfp.stages.dfp_training import DFPTraining\n", - "from dfp.stages.multi_file_source import MultiFileSource\n", - "from dfp.utils.regex_utils import iso_date_regex\n", - "\n", - "from morpheus.common import FileTypes\n", - "from morpheus.cli.utils import get_log_levels\n", - "from morpheus.cli.utils import get_package_relative_file\n", - "from morpheus.cli.utils import load_labels_file\n", - "from morpheus.cli.utils import parse_log_level\n", - "from morpheus.config import Config\n", - "from morpheus.config import ConfigAutoEncoder\n", - "from morpheus.config import CppConfig\n", - "from morpheus.pipeline import LinearPipeline\n", - "from morpheus.utils.column_info import BoolColumn\n", - "from morpheus.utils.column_info import ColumnInfo\n", - "from morpheus.utils.column_info 
import DataFrameInputSchema\n", - "from morpheus.utils.column_info import DateTimeColumn\n", - "from morpheus.utils.column_info import DistinctIncrementColumn\n", - "from morpheus.utils.column_info import IncrementColumn\n", - "from morpheus.utils.column_info import RenameColumn\n", - "from morpheus.utils.column_info import StringCatColumn\n", - "from morpheus.utils.file_utils import date_extractor\n", - "from morpheus.utils.logger import configure_logging\n", - "\n", - "# Left align all tables\n", - "from IPython.core.display import HTML\n", - "table_css = 'table {align:left;display:block}'\n", - "HTML(''.format(table_css))" - ] - }, - { - "cell_type": "markdown", - "id": "ca38c1b7-ce84-43e0-ac53-280562dc1642", - "metadata": {}, - "source": [ - "## High Level Configuration\n", - "\n", - "The following options significantly alter the functionality of the pipeline. These options are separated from the individual stage options since they may effect more than one stage. Additionally, the matching python script to this notebook, `dfp_pipeline_duo.py`, configures these options via command line arguments.\n", - "\n", - "### Options\n", - "\n", - "| Name | Type | Description |\n", - "| --- | --- | :-- |\n", - "| `train_users` | One of `[\"all\", \"generic\", \"individual\"]` | This indicates which users to train for this pipeline:|\n", - "| `skip_users` | List of strings | Any user in this list will be dropped from the pipeline. Useful for debugging to remove automated accounts with many logs. |\n", - "| `cache_dir` | string | The location to store cached files. To aid with development and reduce bandwidth, the Morpheus pipeline will cache data from several stages of the pipeline. This option configures the location for those caches. |\n", - "| `input_files` | List of strings | List of files to process. Can specify multiple arguments for multiple files. Also accepts glob (\\*) wildcards and schema prefixes such as `s3://`. 
For example, to make a local cache of an s3 bucket, use `filecache::s3://mybucket/*`. Refer to `fsspec` documentation for list of possible options. |\n", - "| `model_name_formatter` | string | A format string to use when building the model name. The model name is constructed by calling `model_name_formatter.format(user_id=user_id)`. For example, with `model_name_formatter=\"my_model-{user_id}\"` and a user ID of `\"first:last\"` would result in the model name of `\"my_model-first:last\"`. This should match the value used in `DFPMLFlowModelWriterStage`. Available keyword arguments: `user_id`, `user_md5`. |\n", - "| `experiment_name_formatter` | string | A format string (without the `f`) that will be used when creating an experiment in ML Flow. Available keyword arguments: `user_id`, `user_md5`, `reg_model_name`. |\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "9ee00703-75c5-46fc-890c-86733da906c4", - "metadata": {}, - "outputs": [], - "source": [ - "# Global options\n", - "train_users = \"all\"\n", - "\n", - "# Enter any users to skip here\n", - "skip_users: typing.List[str] = []\n", - "\n", - "# Location where cache objects will be saved\n", - "cache_dir = \"/workspace/.cache/dfp\"\n", - "\n", - "# Input files to read from\n", - "input_files = [\n", - " \"../../../../data/dfp/duo-training-data/DUO_2022-08-*.json\"\n", - "]\n", - "\n", - "# The format to use for models\n", - "model_name_formatter = \"DFP-duo-{user_id}\"\n", - "\n", - "# The format to use for experiment names\n", - "experiment_name_formatter = \"dfp/duo/training/{reg_model_name}\"\n", - "\n", - "# === Derived Options ===\n", - "# To include the generic, we must be training all or generic\n", - "include_generic = train_users == \"all\" or train_users == \"generic\"\n", - "\n", - "# To include individual, we must be either training or inferring\n", - "include_individual = train_users != \"generic\"\n", - "\n", - "# None indicates we arent training anything\n", - "is_training 
= train_users != \"none\"\n", - "\n", - "# Tracking URI\n", - "tracking_uri = \"http://mlflow:5000\"" - ] - }, - { - "cell_type": "markdown", - "id": "31b16a47", - "metadata": {}, - "source": [ - "### Set MLFlow Tracking URI\n", - "Set MLFlow tracking URI to make inference calls." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "586e6b5e", - "metadata": {}, - "outputs": [], - "source": [ - "mlflow.set_tracking_uri(tracking_uri)" - ] - }, - { - "cell_type": "markdown", - "id": "1cfc24c9-c85e-4977-a348-692c8f0aceaa", - "metadata": {}, - "source": [ - "### Global Config Object\n", - "Before creating the pipeline, we need to setup logging and set the parameters for the Morpheus config object. This config object is responsible for the following:\n", - " - Indicating whether to use C++ or Python stages\n", - " - C++ stages are not supported for the DFP pipeline. This should always be `False`\n", - " - Setting the number of threads to use in the pipeline. Defaults to the thread count of the OS.\n", - " - Sets the feature column names that will be used in model training\n", - " - This option allows extra columns to be used in the pipeline that will not be part of the training algorithm.\n", - " - The final features that the model will be trained on will be an intersection of this list with the log columns.\n", - " - The column name that indicates the user's unique identifier\n", - " - It is required for DFP to have a user ID column\n", - " - The column name that indicates the timestamp for the log\n", - " - It is required for DFP to know when each log occurred" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "01abd537-9162-49dc-8e83-d9465592f1d5", - "metadata": {}, - "outputs": [], - "source": [ - "# Enable the Morpheus logger\n", - "configure_logging(log_level=logging.DEBUG)\n", - "\n", - "config = Config()\n", - "\n", - "CppConfig.set_should_use_cpp(False)\n", - "\n", - "config.num_threads = len(os.sched_getaffinity(0))\n", - 
"\n", - "config.ae = ConfigAutoEncoder()\n", - "\n", - "config.ae.feature_columns = [\n", - " 'accessdevicebrowser', 'accessdeviceos', 'authdevicename', 'result', 'reason', 'logcount', \"locincrement\"\n", - "]\n", - "config.ae.userid_column_name = \"username\"\n", - "config.ae.timestamp_column_name = \"timestamp\"" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a73a4d53-32b6-4ab8-a5d7-c0104b31c69b", - "metadata": {}, - "outputs": [], - "source": [ - "# Specify the column names to ensure all data is uniform\n", - "source_column_info = [\n", - " DateTimeColumn(name=config.ae.timestamp_column_name, dtype=datetime, input_name=\"timestamp\"),\n", - " RenameColumn(name=config.ae.userid_column_name, dtype=str, input_name=\"user.name\"),\n", - " RenameColumn(name=\"accessdevicebrowser\", dtype=str, input_name=\"access_device.browser\"),\n", - " RenameColumn(name=\"accessdeviceos\", dtype=str, input_name=\"access_device.os\"),\n", - " StringCatColumn(name=\"location\",\n", - " dtype=str,\n", - " input_columns=[\n", - " \"access_device.location.city\",\n", - " \"access_device.location.state\",\n", - " \"access_device.location.country\"\n", - " ],\n", - " sep=\", \"),\n", - " RenameColumn(name=\"authdevicename\", dtype=str, input_name=\"auth_device.name\"),\n", - " BoolColumn(name=\"result\",\n", - " dtype=bool,\n", - " input_name=\"result\",\n", - " true_values=[\"success\", \"SUCCESS\"],\n", - " false_values=[\"denied\", \"DENIED\", \"FRAUD\"]),\n", - " ColumnInfo(name=\"reason\", dtype=str),\n", - " # CustomColumn(name=\"user.groups\", dtype=str, process_column_fn=partial(column_listjoin, col_name=\"user.groups\"))\n", - "]\n", - "\n", - "source_schema = DataFrameInputSchema(json_columns=[\"access_device\", \"application\", \"auth_device\", \"user\"],\n", - " column_info=source_column_info)\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "f7a0cb0a-e65a-444a-a06c-a4525d543790", - "metadata": {}, - "outputs": [], - 
"source": [ - "# Preprocessing schema\n", - "preprocess_column_info = [\n", - " ColumnInfo(name=config.ae.timestamp_column_name, dtype=datetime),\n", - " ColumnInfo(name=config.ae.userid_column_name, dtype=str),\n", - " ColumnInfo(name=\"accessdevicebrowser\", dtype=str),\n", - " ColumnInfo(name=\"accessdeviceos\", dtype=str),\n", - " ColumnInfo(name=\"authdevicename\", dtype=str),\n", - " ColumnInfo(name=\"result\", dtype=bool),\n", - " ColumnInfo(name=\"reason\", dtype=str),\n", - " # Derived columns\n", - " IncrementColumn(name=\"logcount\",\n", - " dtype=int,\n", - " input_name=config.ae.timestamp_column_name,\n", - " groupby_column=config.ae.userid_column_name),\n", - " DistinctIncrementColumn(name=\"locincrement\",\n", - " dtype=int,\n", - " input_name=\"location\",\n", - " groupby_column=config.ae.userid_column_name,\n", - " timestamp_column=config.ae.timestamp_column_name)\n", - "]\n", - "\n", - "preprocess_schema = DataFrameInputSchema(column_info=preprocess_column_info, preserve_columns=[\"_batch_id\"])\n" - ] - }, - { - "cell_type": "markdown", - "id": "bdfc59de-dea8-4e5f-98e3-98eba1e4621d", - "metadata": {}, - "source": [ - "## Pipeline Construction\n", - "From this point on we begin constructing the stages that will make up the pipeline. To make testing easier, constructing the pipeline object, adding the stages, and running the pipeline, is provided as a single cell. The below cell can be rerun multiple times as needed for debugging.\n", - "\n", - "### Source Stage (`MultiFileSource`)\n", - "\n", - "This pipeline read input logs from one or more input files. This source stage will construct a list of files to be processed and pass to downstream stages. It is capable of reading files from many different source types, both local and remote. This is possible by utilizing the `fsspec` library (similar to `pandas`). Refer to the [`fsspec`](https://filesystem-spec.readthedocs.io/) documentation for more information on the supported file types. 
Once all of the logs have been read, the source completes. \n", - "\n", - "| Name | Type | Default | Description |\n", - "| --- | --- | --- | :-- |\n", - "| `filenames` | List of strings | | Any files to read into the pipeline. All files will be combined into a single `DataFrame` |\n", - "\n", - "### File Batcher Stage (`DFPFileBatcherStage`)\n", - "\n", - "To improve performance, multiple small input files can be batched together into a single DataFrame for processing. This stage is responsible for determining the timestamp of input files, grouping input files into batches by time, and sending the batches to be processed into a single DataFrame. Repeated batches of files will be loaded from cache resulting in increased performance. For example, when performaing a 60 day training run, 59 days can be cached with a period of `\"D\"` and retraining once per day.\n", - "\n", - "| Name | Type | Default | Description |\n", - "| --- | --- | --- | :-- |\n", - "| `period` | `str` | `\"D\"` | The period to create batches. Refer to `pandas` windowing frequency documentation for available options. |\n", - "| `date_conversion_func` | Function of `typing.Callable[[fsspec.core.OpenFile], datetime]` | | A callback which is responsible for determining the date for a specified file. |\n", - "\n", - "### File to DataFrame Stage (`DFPFileToDataFrameStage`)\n", - "\n", - "After files have been batched into groups, this stage is responsible for reading the files and converting into a DataFrame. The specified input schema converts the raw DataFrame into one suitable for caching and processing. Any columns that are not needed should be excluded from the schema.\n", - "\n", - "| Name | Type | Default | Description |\n", - "| --- | --- | --- | :-- |\n", - "| `schema` | `DataFrameInputSchema` | | After the raw `DataFrame` is read from each file, this schema will be applied to ensure a consisten output from the source. 
|\n", - "| `file_type` | `FileTypes` | `FileTypes.Auto` | Allows overriding the file type. When set to `Auto`, the file extension will be used. Options are `CSV`, `JSON`, `PARQUET`, `Auto`. |\n", - "| `parser_kwargs` | `dict` | `{}` | This dictionary will be passed to the `DataFrame` parser class. Allows for customization of log parsing. |\n", - "| `cache_dir` | `str` | `./.cache/dfp` | The location to write cached input files to. |\n", - "\n", - "### Split Users Stage (`DFPSplitUsersStage`)\n", - "\n", - "Once the input logs have been read into a `DataFrame`, this stage is responsible for breaking that single `DataFrame` with many users into multiple `DataFrame`s for each user. This is also where the pipeline chooses whether to train individual users or the generic user (or both).\n", - "\n", - "| Name | Type | Default | Description |\n", - "| --- | --- | --- | :-- |\n", - "| `include_generic` | `bool` | | Whether or not to combine all user logs into a single `DataFrame` with the username 'generic_user' |\n", - "| `include_individual` | `bool` | | Whether or not to output individual `DataFrame` objects for each user |\n", - "| `skip_users` | List of `str` | `[]` | Any users to remove from the `DataFrame`. Useful for debugging to remove automated accounts with many logs. Mutually exclusive with `only_users`. |\n", - "| `only_users` | List of `str` | `[]` | Only allow these users in the final `DataFrame`. Useful for debugging to focus on specific users. Mutually exclusive with `skip_users`. |\n", - "\n", - "### Rolling Window Stage (`DFPRollingWindowStage`)\n", - "\n", - "The Rolling Window Stage performs several key pieces of functionality for DFP.\n", - "1. This stage keeps a moving window of logs on a per user basis\n", - " 1. These logs are saved to disk to reduce memory requirements between logs from the same user\n", - "1. It only emits logs when the window history requirements are met\n", - " 1. 
Until all of the window history requirements are met, no messages will be sent to the rest of the pipeline.\n", - " 1. Configuration options for defining the window history requirements are detailed below.\n", - "1. It repeats the necessary logs to properly calculate log dependent features.\n", - " 1. To support all column feature types, incoming log messages can be combined with existing history and sent to downstream stages.\n", - " 1. For example, to calculate a feature that increments a counter for the number of logs a particular user has generated in a single day, we must have the user's log history for the past 24 hours. To support this, this stage will combine new logs with existing history into a single `DataFrame`.\n", - " 1. It is the responsibility of downstream stages to distinguish between new logs and existing history.\n", - "\n", - "| Name | Type | Default | Description |\n", - "| --- | --- | --- | :-- |\n", - "| `min_history` | `int` | `300` | The minimum number of logs a user must have before emitting any messages. Logs below this threshold will be saved to disk. |\n", - "| `min_increment` | `int` or `str` | `300` | Once the min history requirement is met, this stage must receive `min_increment` *new* logs before emmitting another message. Logs received before this threshold is met will be saved to disk. Can be specified as an integer count or a string duration. |\n", - "| `max_history` | `int` or `str` | `\"60d\"` | Once `min_history` and `min_increment` requirements have been met, this puts an upper bound on the maximum number of messages to forward into the pipeline and also the maximum amount of messages to retain in the history. Can be specified as an integer count or a string duration. |\n", - "| `cache_dir` | `str` | `./.cache/dfp` | The location to write log history to disk. 
|\n", - "\n", - "### Preprocessing Stage (`DFPPreprocessingStage`)\n", - "\n", - "This stage performs the final, row dependent, feature calculations as specified by the input schema object. Once calculated, this stage can forward on all received logs, or optionally can only forward on new logs, removing any history information.\n", - "\n", - "| Name | Type | Default | Description |\n", - "| --- | --- | --- | :-- |\n", - "| `input_schema` | `DataFrameInputSchema` | | The final, row dependent, schema to apply to the incoming columns |\n", - "\n", - "### Training Stage (`DFPTraining`)\n", - "\n", - "This stage is responsible for performing the actual training calculations. Training will be performed on all received data. Resulting message will contain the input data paired with the trained model.\n", - "\n", - "| Name | Type | Default | Description |\n", - "| --- | --- | --- | :-- |\n", - "| `model_kwargs` | `dict` | `{}` | The options to use when creating a new model instance. Refer to `DFPAutoEncoder` for information on the available options. |\n", - "\n", - "### MLFlow Model Writer Stage (`DFPMLFlowModelWriterStage`)\n", - "\n", - "This stage is the last step in training. It will upload the trained model from the previous stage to MLFlow. The tracking URI for which MLFlow instance to use is configured using the static method `mlflow.set_tracking_uri()`.\n", - "\n", - "| Name | Type | Default | Description |\n", - "| --- | --- | --- | :-- |\n", - "| `model_name_formatter` | `str` | `\"\"` | A format string to use when building the model name. The model name is constructed by calling `model_name_formatter.format(user_id=user_id)`. For example, with `model_name_formatter=\"my_model-{user_id}\"` and a user ID of `\"first:last\"` would result in the model name of `\"my_model-first:last\"` |\n", - "| `experiment_name` | `str` | | All models are created inside of an experiment to allow metrics to be saved with each model. This option specifies the experiment name. 
The final experiment name for each model will be in the form of `{experiment_name}/{model_name}` |" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "825390ad-ce64-4949-b324-33039ffdf264", - "metadata": {}, - "outputs": [], - "source": [ - "# Create a linear pipeline object\n", - "pipeline = LinearPipeline(config)\n", - "\n", - "# Source stage\n", - "pipeline.set_source(MultiFileSource(config, filenames=input_files))\n", - "\n", - "# Batch files into batches by time. Use the default ISO date extractor from the filename\n", - "pipeline.add_stage(\n", - " DFPFileBatcherStage(config,\n", - " period=\"D\",\n", - " date_conversion_func=functools.partial(date_extractor, filename_regex=iso_date_regex)))\n", - "\n", - "# Output is a list of fsspec files. Convert to DataFrames. This caches downloaded data\n", - "pipeline.add_stage(\n", - " DFPFileToDataFrameStage(config,\n", - " schema=source_schema,\n", - " file_type=FileTypes.JSON,\n", - " parser_kwargs={\n", - " \"lines\": False, \"orient\": \"records\"\n", - " },\n", - " cache_dir=cache_dir))\n", - "\n", - "\n", - "# This will split users or just use one single user\n", - "pipeline.add_stage(\n", - " DFPSplitUsersStage(config,\n", - " include_generic=include_generic,\n", - " include_individual=include_individual,\n", - " skip_users=skip_users))\n", - "\n", - "# Next, have a stage that will create rolling windows\n", - "pipeline.add_stage(\n", - " DFPRollingWindowStage(\n", - " config,\n", - " min_history=300 if is_training else 1,\n", - " min_increment=300 if is_training else 0,\n", - " # For inference, we only ever want 1 day max\n", - " max_history=\"60d\" if is_training else \"1d\",\n", - " cache_dir=cache_dir))\n", - "\n", - "# Output is UserMessageMeta -- Cached frame set\n", - "pipeline.add_stage(DFPPreprocessingStage(config, input_schema=preprocess_schema))\n", - "\n", - "# Finally, perform training which will output a model\n", - "pipeline.add_stage(DFPTraining(config, 
validation_size=0.10))\n", - "\n", - "# Write that model to MLFlow\n", - "pipeline.add_stage(\n", - " DFPMLFlowModelWriterStage(config,\n", - " model_name_formatter=model_name_formatter,\n", - " experiment_name_formatter=experiment_name_formatter))\n", - "\n", - "# Run the pipeline\n", - "await pipeline.run_async()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "6eed0657-6f4b-4f21-97fa-051eeb7f4fee", - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "morpheus", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.14" - } - }, - "nbformat": 4, - "nbformat_minor": 5 + "cells": [ + { + "cell_type": "markdown", + "id": "2941e94f-db20-44a5-ab87-2cab499825f7", + "metadata": {}, + "source": [ + "# Digital Finger Printing (DFP) with Morpheus - DUO Training\n", + "## Introduction\n", + "\n", + "In this notebook, we will be building and running a DFP pipeline that performs training on Duo authentication logs. The goal is to train an autoencoder PyTorch model to recognize the patterns of users in the sample data. The model will then be used by a second Morpheus pipeline to generate anomaly scores for each individual log. These anomaly scores can be used by security teams to detect abnormal behavior when it happens so the proper action can be taken.\n", + "\n", + "
\n", + "Note: For more information on DFP, the Morpheus pipeline, and setup steps to run this notebook, please refer to the corresponding DFP training materials.\n", + "
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b6c1cb50-74f2-445d-b865-8c22c3b3798b", + "metadata": {}, + "outputs": [], + "source": [ + "%load_ext autoreload\n", + "%autoreload 2\n", + "\n", + "# Ensure that the morpheus directory is in the python path. This may not need to be run depending on the environment setup\n", + "import sys\n", + "import os\n", + "sys.path.insert(0, os.path.abspath(\"../../morpheus\"))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "102ce011-3ca3-4f96-a72d-de28fad32003", + "metadata": {}, + "outputs": [], + "source": [ + "import functools\n", + "import logging\n", + "import os\n", + "import mlflow\n", + "import typing\n", + "from datetime import datetime\n", + "\n", + "from dfp.stages.dfp_file_batcher_stage import DFPFileBatcherStage\n", + "from dfp.stages.dfp_file_to_df import DFPFileToDataFrameStage\n", + "from dfp.stages.dfp_mlflow_model_writer import DFPMLFlowModelWriterStage\n", + "from dfp.stages.dfp_preprocessing_stage import DFPPreprocessingStage\n", + "from dfp.stages.dfp_rolling_window_stage import DFPRollingWindowStage\n", + "from dfp.stages.dfp_split_users_stage import DFPSplitUsersStage\n", + "from dfp.stages.dfp_training import DFPTraining\n", + "from dfp.stages.multi_file_source import MultiFileSource\n", + "from dfp.utils.regex_utils import iso_date_regex\n", + "\n", + "from morpheus.common import FileTypes\n", + "from morpheus.cli.utils import get_log_levels\n", + "from morpheus.cli.utils import get_package_relative_file\n", + "from morpheus.utils.file_utils import load_labels_file\n", + "from morpheus.cli.utils import parse_log_level\n", + "from morpheus.config import Config\n", + "from morpheus.config import ConfigAutoEncoder\n", + "from morpheus.config import CppConfig\n", + "from morpheus.pipeline import LinearPipeline\n", + "from morpheus.utils.column_info import BoolColumn\n", + "from morpheus.utils.column_info import ColumnInfo\n", + "from 
morpheus.utils.column_info import DataFrameInputSchema\n", + "from morpheus.utils.column_info import DateTimeColumn\n", + "from morpheus.utils.column_info import DistinctIncrementColumn\n", + "from morpheus.utils.column_info import IncrementColumn\n", + "from morpheus.utils.column_info import RenameColumn\n", + "from morpheus.utils.column_info import StringCatColumn\n", + "from morpheus.utils.file_utils import date_extractor\n", + "from morpheus.utils.logger import configure_logging\n", + "\n", + "# Left align all tables\n", + "from IPython.core.display import HTML\n", + "table_css = 'table {align:left;display:block}'\n", + "HTML(''.format(table_css))" + ] + }, + { + "cell_type": "markdown", + "id": "ca38c1b7-ce84-43e0-ac53-280562dc1642", + "metadata": {}, + "source": [ + "## High Level Configuration\n", + "\n", + "The following options significantly alter the functionality of the pipeline. These options are separated from the individual stage options since they may affect more than one stage. Additionally, the matching python script to this notebook, `dfp_pipeline_duo.py`, configures these options via command line arguments.\n", + "\n", + "### Options\n", + "\n", + "| Name | Type | Description |\n", + "| --- | --- | :-- |\n", + "| `train_users` | One of `[\"all\", \"generic\", \"individual\"]` | This indicates which users to train for this pipeline:|\n", + "| `skip_users` | List of strings | Any user in this list will be dropped from the pipeline. Useful for debugging to remove automated accounts with many logs. |\n", + "| `cache_dir` | string | The location to store cached files. To aid with development and reduce bandwidth, the Morpheus pipeline will cache data from several stages of the pipeline. This option configures the location for those caches. |\n", + "| `input_files` | List of strings | List of files to process. Can specify multiple arguments for multiple files. Also accepts glob (\\*) wildcards and schema prefixes such as `s3://`.
For example, to make a local cache of an s3 bucket, use `filecache::s3://mybucket/*`. Refer to `fsspec` documentation for list of possible options. |\n", + "| `model_name_formatter` | string | A format string to use when building the model name. The model name is constructed by calling `model_name_formatter.format(user_id=user_id)`. For example, with `model_name_formatter=\"my_model-{user_id}\"` and a user ID of `\"first:last\"` would result in the model name of `\"my_model-first:last\"`. This should match the value used in `DFPMLFlowModelWriterStage`. Available keyword arguments: `user_id`, `user_md5`. |\n", + "| `experiment_name_formatter` | string | A format string (without the `f`) that will be used when creating an experiment in ML Flow. Available keyword arguments: `user_id`, `user_md5`, `reg_model_name`. |\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9ee00703-75c5-46fc-890c-86733da906c4", + "metadata": {}, + "outputs": [], + "source": [ + "# Global options\n", + "train_users = \"all\"\n", + "\n", + "# Enter any users to skip here\n", + "skip_users: typing.List[str] = []\n", + "\n", + "# Location where cache objects will be saved\n", + "cache_dir = \"/workspace/.cache/dfp\"\n", + "\n", + "# Input files to read from\n", + "input_files = [\n", + " \"../../../../data/dfp/duo-training-data/DUO_2022-08-*.json\"\n", + "]\n", + "\n", + "# The format to use for models\n", + "model_name_formatter = \"DFP-duo-{user_id}\"\n", + "\n", + "# The format to use for experiment names\n", + "experiment_name_formatter = \"dfp/duo/training/{reg_model_name}\"\n", + "\n", + "# === Derived Options ===\n", + "# To include the generic, we must be training all or generic\n", + "include_generic = train_users == \"all\" or train_users == \"generic\"\n", + "\n", + "# To include individual, we must be either training or inferring\n", + "include_individual = train_users != \"generic\"\n", + "\n", + "# None indicates we arent training anything\n", + "is_training 
= train_users != \"none\"\n", + "\n", + "# Tracking URI\n", + "tracking_uri = \"http://mlflow:5000\"" + ] + }, + { + "cell_type": "markdown", + "id": "31b16a47", + "metadata": {}, + "source": [ + "### Set MLFlow Tracking URI\n", + "Set MLFlow tracking URI to make inference calls." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "586e6b5e", + "metadata": {}, + "outputs": [], + "source": [ + "mlflow.set_tracking_uri(tracking_uri)" + ] + }, + { + "cell_type": "markdown", + "id": "1cfc24c9-c85e-4977-a348-692c8f0aceaa", + "metadata": {}, + "source": [ + "### Global Config Object\n", + "Before creating the pipeline, we need to setup logging and set the parameters for the Morpheus config object. This config object is responsible for the following:\n", + " - Indicating whether to use C++ or Python stages\n", + " - C++ stages are not supported for the DFP pipeline. This should always be `False`\n", + " - Setting the number of threads to use in the pipeline. Defaults to the thread count of the OS.\n", + " - Sets the feature column names that will be used in model training\n", + " - This option allows extra columns to be used in the pipeline that will not be part of the training algorithm.\n", + " - The final features that the model will be trained on will be an intersection of this list with the log columns.\n", + " - The column name that indicates the user's unique identifier\n", + " - It is required for DFP to have a user ID column\n", + " - The column name that indicates the timestamp for the log\n", + " - It is required for DFP to know when each log occurred" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "01abd537-9162-49dc-8e83-d9465592f1d5", + "metadata": {}, + "outputs": [], + "source": [ + "# Enable the Morpheus logger\n", + "configure_logging(log_level=logging.DEBUG)\n", + "\n", + "config = Config()\n", + "\n", + "CppConfig.set_should_use_cpp(False)\n", + "\n", + "config.num_threads = len(os.sched_getaffinity(0))\n", + 
"\n", + "config.ae = ConfigAutoEncoder()\n", + "\n", + "config.ae.feature_columns = [\n", + " 'accessdevicebrowser', 'accessdeviceos', 'authdevicename', 'result', 'reason', 'logcount', \"locincrement\"\n", + "]\n", + "config.ae.userid_column_name = \"username\"\n", + "config.ae.timestamp_column_name = \"timestamp\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a73a4d53-32b6-4ab8-a5d7-c0104b31c69b", + "metadata": {}, + "outputs": [], + "source": [ + "# Specify the column names to ensure all data is uniform\n", + "source_column_info = [\n", + " DateTimeColumn(name=config.ae.timestamp_column_name, dtype=datetime, input_name=\"timestamp\"),\n", + " RenameColumn(name=config.ae.userid_column_name, dtype=str, input_name=\"user.name\"),\n", + " RenameColumn(name=\"accessdevicebrowser\", dtype=str, input_name=\"access_device.browser\"),\n", + " RenameColumn(name=\"accessdeviceos\", dtype=str, input_name=\"access_device.os\"),\n", + " StringCatColumn(name=\"location\",\n", + " dtype=str,\n", + " input_columns=[\n", + " \"access_device.location.city\",\n", + " \"access_device.location.state\",\n", + " \"access_device.location.country\"\n", + " ],\n", + " sep=\", \"),\n", + " RenameColumn(name=\"authdevicename\", dtype=str, input_name=\"auth_device.name\"),\n", + " BoolColumn(name=\"result\",\n", + " dtype=bool,\n", + " input_name=\"result\",\n", + " true_values=[\"success\", \"SUCCESS\"],\n", + " false_values=[\"denied\", \"DENIED\", \"FRAUD\"]),\n", + " ColumnInfo(name=\"reason\", dtype=str),\n", + " # CustomColumn(name=\"user.groups\", dtype=str, process_column_fn=partial(column_listjoin, col_name=\"user.groups\"))\n", + "]\n", + "\n", + "source_schema = DataFrameInputSchema(json_columns=[\"access_device\", \"application\", \"auth_device\", \"user\"],\n", + " column_info=source_column_info)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f7a0cb0a-e65a-444a-a06c-a4525d543790", + "metadata": {}, + "outputs": [], + 
"source": [ + "# Preprocessing schema\n", + "preprocess_column_info = [\n", + " ColumnInfo(name=config.ae.timestamp_column_name, dtype=datetime),\n", + " ColumnInfo(name=config.ae.userid_column_name, dtype=str),\n", + " ColumnInfo(name=\"accessdevicebrowser\", dtype=str),\n", + " ColumnInfo(name=\"accessdeviceos\", dtype=str),\n", + " ColumnInfo(name=\"authdevicename\", dtype=str),\n", + " ColumnInfo(name=\"result\", dtype=bool),\n", + " ColumnInfo(name=\"reason\", dtype=str),\n", + " # Derived columns\n", + " IncrementColumn(name=\"logcount\",\n", + " dtype=int,\n", + " input_name=config.ae.timestamp_column_name,\n", + " groupby_column=config.ae.userid_column_name),\n", + " DistinctIncrementColumn(name=\"locincrement\",\n", + " dtype=int,\n", + " input_name=\"location\",\n", + " groupby_column=config.ae.userid_column_name,\n", + " timestamp_column=config.ae.timestamp_column_name)\n", + "]\n", + "\n", + "preprocess_schema = DataFrameInputSchema(column_info=preprocess_column_info, preserve_columns=[\"_batch_id\"])\n" + ] + }, + { + "cell_type": "markdown", + "id": "bdfc59de-dea8-4e5f-98e3-98eba1e4621d", + "metadata": {}, + "source": [ + "## Pipeline Construction\n", + "From this point on we begin constructing the stages that will make up the pipeline. To make testing easier, constructing the pipeline object, adding the stages, and running the pipeline, is provided as a single cell. The below cell can be rerun multiple times as needed for debugging.\n", + "\n", + "### Source Stage (`MultiFileSource`)\n", + "\n", + "This pipeline read input logs from one or more input files. This source stage will construct a list of files to be processed and pass to downstream stages. It is capable of reading files from many different source types, both local and remote. This is possible by utilizing the `fsspec` library (similar to `pandas`). Refer to the [`fsspec`](https://filesystem-spec.readthedocs.io/) documentation for more information on the supported file types. 
Once all of the logs have been read, the source completes. \n", + "\n", + "| Name | Type | Default | Description |\n", + "| --- | --- | --- | :-- |\n", + "| `filenames` | List of strings | | Any files to read into the pipeline. All files will be combined into a single `DataFrame` |\n", + "\n", + "### File Batcher Stage (`DFPFileBatcherStage`)\n", + "\n", + "To improve performance, multiple small input files can be batched together into a single DataFrame for processing. This stage is responsible for determining the timestamp of input files, grouping input files into batches by time, and sending the batches to be processed into a single DataFrame. Repeated batches of files will be loaded from cache, resulting in increased performance. For example, when performing a 60-day training run, 59 days can be cached with a period of `\"D\"` and retraining once per day.\n", + "\n", + "| Name | Type | Default | Description |\n", + "| --- | --- | --- | :-- |\n", + "| `period` | `str` | `\"D\"` | The period to create batches. Refer to `pandas` windowing frequency documentation for available options. |\n", + "| `date_conversion_func` | Function of `typing.Callable[[fsspec.core.OpenFile], datetime]` | | A callback which is responsible for determining the date for a specified file. |\n", + "\n", + "### File to DataFrame Stage (`DFPFileToDataFrameStage`)\n", + "\n", + "After files have been batched into groups, this stage is responsible for reading the files and converting them into a DataFrame. The specified input schema converts the raw DataFrame into one suitable for caching and processing. Any columns that are not needed should be excluded from the schema.\n", + "\n", + "| Name | Type | Default | Description |\n", + "| --- | --- | --- | :-- |\n", + "| `schema` | `DataFrameInputSchema` | | After the raw `DataFrame` is read from each file, this schema will be applied to ensure a consistent output from the source. 
|\n", + "| `file_type` | `FileTypes` | `FileTypes.Auto` | Allows overriding the file type. When set to `Auto`, the file extension will be used. Options are `CSV`, `JSON`, `PARQUET`, `Auto`. |\n", + "| `parser_kwargs` | `dict` | `{}` | This dictionary will be passed to the `DataFrame` parser class. Allows for customization of log parsing. |\n", + "| `cache_dir` | `str` | `./.cache/dfp` | The location to write cached input files to. |\n", + "\n", + "### Split Users Stage (`DFPSplitUsersStage`)\n", + "\n", + "Once the input logs have been read into a `DataFrame`, this stage is responsible for breaking that single `DataFrame` with many users into multiple `DataFrame`s for each user. This is also where the pipeline chooses whether to train individual users or the generic user (or both).\n", + "\n", + "| Name | Type | Default | Description |\n", + "| --- | --- | --- | :-- |\n", + "| `include_generic` | `bool` | | Whether or not to combine all user logs into a single `DataFrame` with the username 'generic_user' |\n", + "| `include_individual` | `bool` | | Whether or not to output individual `DataFrame` objects for each user |\n", + "| `skip_users` | List of `str` | `[]` | Any users to remove from the `DataFrame`. Useful for debugging to remove automated accounts with many logs. Mutually exclusive with `only_users`. |\n", + "| `only_users` | List of `str` | `[]` | Only allow these users in the final `DataFrame`. Useful for debugging to focus on specific users. Mutually exclusive with `skip_users`. |\n", + "\n", + "### Rolling Window Stage (`DFPRollingWindowStage`)\n", + "\n", + "The Rolling Window Stage performs several key pieces of functionality for DFP.\n", + "1. This stage keeps a moving window of logs on a per user basis\n", + " 1. These logs are saved to disk to reduce memory requirements between logs from the same user\n", + "1. It only emits logs when the window history requirements are met\n", + " 1. 
Until all of the window history requirements are met, no messages will be sent to the rest of the pipeline.\n", + " 1. Configuration options for defining the window history requirements are detailed below.\n", + "1. It repeats the necessary logs to properly calculate log-dependent features.\n", + " 1. To support all column feature types, incoming log messages can be combined with existing history and sent to downstream stages.\n", + " 1. For example, to calculate a feature that increments a counter for the number of logs a particular user has generated in a single day, we must have the user's log history for the past 24 hours. To support this, this stage will combine new logs with existing history into a single `DataFrame`.\n", + " 1. It is the responsibility of downstream stages to distinguish between new logs and existing history.\n", + "\n", + "| Name | Type | Default | Description |\n", + "| --- | --- | --- | :-- |\n", + "| `min_history` | `int` | `300` | The minimum number of logs a user must have before emitting any messages. Logs below this threshold will be saved to disk. |\n", + "| `min_increment` | `int` or `str` | `300` | Once the min history requirement is met, this stage must receive `min_increment` *new* logs before emitting another message. Logs received before this threshold is met will be saved to disk. Can be specified as an integer count or a string duration. |\n", + "| `max_history` | `int` or `str` | `\"60d\"` | Once `min_history` and `min_increment` requirements have been met, this puts an upper bound on the maximum number of messages to forward into the pipeline and also the maximum number of messages to retain in the history. Can be specified as an integer count or a string duration. |\n", + "| `cache_dir` | `str` | `./.cache/dfp` | The location to write log history to disk. 
|\n", + "\n", + "### Preprocessing Stage (`DFPPreprocessingStage`)\n", + "\n", + "This stage performs the final, row-dependent, feature calculations as specified by the input schema object. Once calculated, this stage can forward on all received logs, or optionally can only forward on new logs, removing any history information.\n", + "\n", + "| Name | Type | Default | Description |\n", + "| --- | --- | --- | :-- |\n", + "| `input_schema` | `DataFrameInputSchema` | | The final, row-dependent, schema to apply to the incoming columns |\n", + "\n", + "### Training Stage (`DFPTraining`)\n", + "\n", + "This stage is responsible for performing the actual training calculations. Training will be performed on all received data. The resulting message will contain the input data paired with the trained model.\n", + "\n", + "| Name | Type | Default | Description |\n", + "| --- | --- | --- | :-- |\n", + "| `model_kwargs` | `dict` | `{}` | The options to use when creating a new model instance. Refer to `DFPAutoEncoder` for information on the available options. |\n", + "\n", + "### MLFlow Model Writer Stage (`DFPMLFlowModelWriterStage`)\n", + "\n", + "This stage is the last step in training. It will upload the trained model from the previous stage to MLFlow. The tracking URI for which MLFlow instance to use is configured using the static method `mlflow.set_tracking_uri()`.\n", + "\n", + "| Name | Type | Default | Description |\n", + "| --- | --- | --- | :-- |\n", + "| `model_name_formatter` | `str` | `\"\"` | A format string to use when building the model name. The model name is constructed by calling `model_name_formatter.format(user_id=user_id)`. For example, `model_name_formatter=\"my_model-{user_id}\"` with a user ID of `\"first:last\"` would result in the model name `\"my_model-first:last\"` |\n", + "| `experiment_name_formatter` | `str` | | All models are created inside of an experiment to allow metrics to be saved with each model. This option specifies the experiment name. 
The final experiment name for each model will be in the form of `{experiment_name}/{model_name}` |" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "825390ad-ce64-4949-b324-33039ffdf264", + "metadata": {}, + "outputs": [], + "source": [ + "# Create a linear pipeline object\n", + "pipeline = LinearPipeline(config)\n", + "\n", + "# Source stage\n", + "pipeline.set_source(MultiFileSource(config, filenames=input_files))\n", + "\n", + "# Batch files into batches by time. Use the default ISO date extractor from the filename\n", + "pipeline.add_stage(\n", + " DFPFileBatcherStage(config,\n", + " period=\"D\",\n", + " date_conversion_func=functools.partial(date_extractor, filename_regex=iso_date_regex)))\n", + "\n", + "# Output is a list of fsspec files. Convert to DataFrames. This caches downloaded data\n", + "pipeline.add_stage(\n", + " DFPFileToDataFrameStage(config,\n", + " schema=source_schema,\n", + " file_type=FileTypes.JSON,\n", + " parser_kwargs={\n", + " \"lines\": False, \"orient\": \"records\"\n", + " },\n", + " cache_dir=cache_dir))\n", + "\n", + "\n", + "# This will split users or just use one single user\n", + "pipeline.add_stage(\n", + " DFPSplitUsersStage(config,\n", + " include_generic=include_generic,\n", + " include_individual=include_individual,\n", + " skip_users=skip_users))\n", + "\n", + "# Next, have a stage that will create rolling windows\n", + "pipeline.add_stage(\n", + " DFPRollingWindowStage(\n", + " config,\n", + " min_history=300 if is_training else 1,\n", + " min_increment=300 if is_training else 0,\n", + " # For inference, we only ever want 1 day max\n", + " max_history=\"60d\" if is_training else \"1d\",\n", + " cache_dir=cache_dir))\n", + "\n", + "# Output is UserMessageMeta -- Cached frame set\n", + "pipeline.add_stage(DFPPreprocessingStage(config, input_schema=preprocess_schema))\n", + "\n", + "# Finally, perform training which will output a model\n", + "pipeline.add_stage(DFPTraining(config, 
validation_size=0.10))\n", + "\n", + "# Write that model to MLFlow\n", + "pipeline.add_stage(\n", + " DFPMLFlowModelWriterStage(config,\n", + " model_name_formatter=model_name_formatter,\n", + " experiment_name_formatter=experiment_name_formatter))\n", + "\n", + "# Run the pipeline\n", + "await pipeline.run_async()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6eed0657-6f4b-4f21-97fa-051eeb7f4fee", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "morpheus", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.14" + } + }, + "nbformat": 4, + "nbformat_minor": 5 } diff --git a/examples/digital_fingerprinting/visualization/dfp_viz_azure_pipeline.py b/examples/digital_fingerprinting/visualization/dfp_viz_azure_pipeline.py index f67460a6bd..afa9706ff7 100644 --- a/examples/digital_fingerprinting/visualization/dfp_viz_azure_pipeline.py +++ b/examples/digital_fingerprinting/visualization/dfp_viz_azure_pipeline.py @@ -37,7 +37,6 @@ from morpheus.cli.utils import get_log_levels from morpheus.cli.utils import get_package_relative_file -from morpheus.cli.utils import load_labels_file from morpheus.cli.utils import parse_log_level from morpheus.common import FileTypes from morpheus.config import Config @@ -53,6 +52,7 @@ from morpheus.utils.column_info import RenameColumn from morpheus.utils.column_info import StringCatColumn from morpheus.utils.file_utils import date_extractor +from morpheus.utils.file_utils import load_labels_file from morpheus.utils.logger import configure_logging diff --git a/examples/digital_fingerprinting/visualization/dfp_viz_duo_pipeline.py b/examples/digital_fingerprinting/visualization/dfp_viz_duo_pipeline.py index 
961f82d676..28f36995c1 100644 --- a/examples/digital_fingerprinting/visualization/dfp_viz_duo_pipeline.py +++ b/examples/digital_fingerprinting/visualization/dfp_viz_duo_pipeline.py @@ -37,7 +37,6 @@ from morpheus.cli.utils import get_log_levels from morpheus.cli.utils import get_package_relative_file -from morpheus.cli.utils import load_labels_file from morpheus.cli.utils import parse_log_level from morpheus.common import FileTypes from morpheus.config import Config @@ -54,6 +53,7 @@ from morpheus.utils.column_info import RenameColumn from morpheus.utils.column_info import StringCatColumn from morpheus.utils.file_utils import date_extractor +from morpheus.utils.file_utils import load_labels_file from morpheus.utils.logger import configure_logging diff --git a/python/morpheus/morpheus/cli/commands.py b/python/morpheus/morpheus/cli/commands.py index e7df1d3b75..f90faadd52 100644 --- a/python/morpheus/morpheus/cli/commands.py +++ b/python/morpheus/morpheus/cli/commands.py @@ -28,7 +28,6 @@ from morpheus.cli.utils import get_enum_keys from morpheus.cli.utils import get_log_levels from morpheus.cli.utils import get_pipeline_from_ctx -from morpheus.cli.utils import load_labels_file from morpheus.cli.utils import parse_enum from morpheus.cli.utils import parse_log_level from morpheus.cli.utils import prepare_command @@ -39,6 +38,7 @@ from morpheus.config import ConfigOnnxToTRT from morpheus.config import CppConfig from morpheus.config import PipelineModes +from morpheus.utils.file_utils import load_labels_file from morpheus.utils.logger import configure_logging # pylint: disable=line-too-long, import-outside-toplevel, invalid-name, global-at-module-level, unused-argument diff --git a/python/morpheus/morpheus/cli/utils.py b/python/morpheus/morpheus/cli/utils.py index bdb7989acb..d4924282d4 100644 --- a/python/morpheus/morpheus/cli/utils.py +++ b/python/morpheus/morpheus/cli/utils.py @@ -27,6 +27,8 @@ import morpheus from morpheus.config import Config from morpheus.config 
import ConfigBase +# For backwards compatibility +from morpheus.utils.file_utils import load_labels_file # pylint: disable=unused-import # noqa: F401 # Ignore pipeline unless we are typechecking since it takes a while to import if (typing.TYPE_CHECKING): @@ -193,12 +195,6 @@ def parse_enum(_: click.Context, _2: click.Parameter, value: str, enum_class: ty return result -def load_labels_file(labels_file: str) -> typing.List[str]: - """Returns a list of labels from the given file, where each line is a label.""" - with open(labels_file, "r", encoding='UTF-8') as fh: - return [x.strip() for x in fh.readlines()] - - def get_package_relative_file(filename: str): """ If `filename` is a relative path, and does not exist, attempt to locate the file relative to the directory of the diff --git a/python/morpheus/morpheus/utils/file_utils.py b/python/morpheus/morpheus/utils/file_utils.py index 1a8b62e8f1..f3457b9296 100644 --- a/python/morpheus/morpheus/utils/file_utils.py +++ b/python/morpheus/morpheus/utils/file_utils.py @@ -16,7 +16,6 @@ import os import re -import typing from datetime import datetime from datetime import timezone @@ -60,7 +59,7 @@ def get_data_file_path(data_filename: str) -> str: return data_filename -def load_labels_file(labels_filename: str) -> typing.List[str]: +def load_labels_file(labels_filename: str) -> list[str]: """ Get list of labels from file. @@ -71,7 +70,7 @@ def load_labels_file(labels_filename: str) -> typing.List[str]: Returns ------- - typing.List[str] + list[str] List of labels """ with open(labels_filename, "r", encoding='UTF-8') as fh: