From a527ee7ca8f1a90eab9c7b76f9770543407fc847 Mon Sep 17 00:00:00 2001
From: David Gardner <96306125+dagardner-nv@users.noreply.github.com>
Date: Thu, 25 Jul 2024 11:30:30 -0700
Subject: [PATCH] Fix log parsing undefined variable and duplicate sequence id
 errors (#1809)

* Fixes two related bugs which occur when running the `log_parsing` example with unexpected data.
* Fix error in `LogParsingPostProcessingStage::__get_label_dicts` where there is a possibility that the `new_label` and `new_confidence` variables are undefined.

* Fix error where the `tokenize_text_series` method can return a sequence_ids list with duplicate ids when a text series is passed in and one of the values has a length longer than `seq_len` and `truncation=False`, this prevents a down-stream error in the inference stage
```
RuntimeError: Inconsistent ID column. Last element in 'seq_ids' tensor, [1], must not extend beyond last message, [0]
```
Not sure if this is the best place to catch this error.


Closes #1765

## By Submitting this PR I confirm:
- I am familiar with the [Contributing Guidelines](https://github.com/nv-morpheus/Morpheus/blob/main/docs/source/developer_guide/contributing.md).
- When the PR is ready for review, new or existing tests cover these changes.
- When the PR is ready for review, the documentation is up to date with these changes.

Authors:
  - David Gardner (https://github.com/dagardner-nv)

Approvers:
  - Michael Demoret (https://github.com/mdemoret-nv)

URL: https://github.com/nv-morpheus/Morpheus/pull/1809
---
 examples/log_parsing/postprocessing.py        | 19 +++++--
 morpheus/utils/cudf_subword_helper.py         |  7 +++
 tests/conftest.py                             | 12 +++++
 .../log_parsing/test_log_parsing_pipe.py      | 36 +++++++++----
 .../log_parsing/test_postprocessing.py        | 52 ++++++++++++++++--
 tests/test_cli.py                             |  5 +-
 tests/utils/test_cudf_subword_helper.py       | 53 +++++++++++++++++++
 7 files changed, 164 insertions(+), 20 deletions(-)
 create mode 100644 tests/utils/test_cudf_subword_helper.py

diff --git a/examples/log_parsing/postprocessing.py b/examples/log_parsing/postprocessing.py
index 72b1e34fc6..830ff9f7ef 100644
--- a/examples/log_parsing/postprocessing.py
+++ b/examples/log_parsing/postprocessing.py
@@ -13,6 +13,7 @@
 # limitations under the License.
 
 import json
+import logging
 import pathlib
 import typing
 from collections import defaultdict
@@ -31,6 +32,8 @@
 from morpheus.pipeline.single_port_stage import SinglePortStage
 from morpheus.pipeline.stage_schema import StageSchema
 
+logger = logging.getLogger(f"morpheus.{__name__}")
+
 
 @register_stage("log-postprocess", modes=[PipelineModes.NLP])
 class LogParsingPostProcessingStage(SinglePortStage):
@@ -117,17 +120,25 @@ def _postprocess(self, x: MultiResponseMessage):
     def __get_label_dicts(self, row):
         token_dict = defaultdict(str)
         confidence_dict = defaultdict(list)
+        new_label = None
+        new_confidence = None
         for label, confidence, token_id in zip(row["labels"], row["confidences"], row["token_ids"]):
             text_token = self._vocab_lookup[token_id]
             if text_token[:2] != "##" and text_token[0] != '.':
                 # if not a subword use the current label, else use previous
                 new_label = label
                 new_confidence = confidence
-            if self._label_map[new_label] in token_dict:
-                token_dict[self._label_map[new_label]] = (token_dict[self._label_map[new_label]] + " " + text_token)
+
+            if new_label is not None and new_confidence is not None:
+                if self._label_map[new_label] in token_dict:
+                    token_dict[self._label_map[new_label]] = (token_dict[self._label_map[new_label]] + " " + text_token)
+                else:
+                    token_dict[self._label_map[new_label]] = text_token
+
+                confidence_dict[self._label_map[label]].append(new_confidence)
             else:
-                token_dict[self._label_map[new_label]] = text_token
-            confidence_dict[self._label_map[label]].append(new_confidence)
+                logger.warning("Ignoring unexecpected subword token: %s", text_token)
+
         return token_dict, confidence_dict
 
     def __decode_cleanup(self, df):
diff --git a/morpheus/utils/cudf_subword_helper.py b/morpheus/utils/cudf_subword_helper.py
index 550100a054..c3af4f010d 100644
--- a/morpheus/utils/cudf_subword_helper.py
+++ b/morpheus/utils/cudf_subword_helper.py
@@ -150,6 +150,13 @@ def tokenize_text_series(vocab_hash_file: str,
     max_rows_tensor = len(text_ser) * 2
     max_length = seq_len
 
+    # Preflight check to ensure that the input strings are not too long
+    if not truncation:
+        max_value_length = text_ser.str.len().max()
+        if max_value_length > max_length:
+            raise ValueError(
+                f"Input strings are too long ({max_value_length}) to be tokenized without truncation seq_len={seq_len}")
+
     # Call the tokenizer
     tokenizer_output = tokenizer(text_ser,
                                  max_length=max_length,
diff --git a/tests/conftest.py b/tests/conftest.py
index 732dee996c..19b92df1b1 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -1044,6 +1044,18 @@ def string_collection_config_fixture():
     yield load_json_file(filename="service/milvus_string_collection_conf.json")
 
 
+@pytest.fixture(scope="session", name="bert_cased_hash")
+def bert_cased_hash_fixture():
+    from _utils import TEST_DIRS
+    yield os.path.join(TEST_DIRS.data_dir, 'bert-base-cased-hash.txt')
+
+
+@pytest.fixture(scope="session", name="bert_cased_vocab")
+def bert_cased_vocab_fixture():
+    from _utils import TEST_DIRS
+    yield os.path.join(TEST_DIRS.data_dir, 'bert-base-cased-vocab.txt')
+
+
 @pytest.fixture(name="nemollm", scope='session')
 def nemollm_fixture(fail_missing: bool):
     """
diff --git a/tests/examples/log_parsing/test_log_parsing_pipe.py b/tests/examples/log_parsing/test_log_parsing_pipe.py
index 4aac2d439b..0658dc879d 100755
--- a/tests/examples/log_parsing/test_log_parsing_pipe.py
+++ b/tests/examples/log_parsing/test_log_parsing_pipe.py
@@ -36,7 +36,11 @@
 MODEL_MAX_BATCH_SIZE = 32
 
 
-def _run_pipeline(config: Config, dataset_cudf: DatasetManager, import_mod: typing.List[types.ModuleType]):
+def _run_pipeline(config: Config,
+                  dataset_cudf: DatasetManager,
+                  import_mod: typing.List[types.ModuleType],
+                  bert_cased_hash: str,
+                  bert_cased_vocab: str):
     """
     Runs just the Log Parsing Pipeline
     """
@@ -47,8 +51,6 @@ def _run_pipeline(config: Config, dataset_cudf: DatasetManager, import_mod: typi
     config.model_max_batch_size = MODEL_MAX_BATCH_SIZE
     config.feature_length = FEATURE_LENGTH
 
-    model_vocab_file = os.path.join(TEST_DIRS.data_dir, 'bert-base-cased-vocab.txt')
-    vocab_hash_file_name = os.path.join(TEST_DIRS.data_dir, 'bert-base-cased-hash.txt')
     log_test_data_dir = os.path.join(TEST_DIRS.tests_data_dir, 'examples/log_parsing')
 
     # Not actually the real model config, just the subset that LogParsingPostProcessingStage uses
@@ -67,7 +69,7 @@ def _run_pipeline(config: Config, dataset_cudf: DatasetManager, import_mod: typi
     pipe.add_stage(DeserializeStage(config))
     pipe.add_stage(
         PreprocessNLPStage(config,
-                           vocab_hash_file=vocab_hash_file_name,
+                           vocab_hash_file=bert_cased_hash,
                            truncation=False,
                            do_lower_case=False,
                            stride=64,
@@ -80,7 +82,7 @@ def _run_pipeline(config: Config, dataset_cudf: DatasetManager, import_mod: typi
                                                force_convert_inputs=True))
     pipe.add_stage(
         postprocessing_mod.LogParsingPostProcessingStage(config,
-                                                         vocab_path=model_vocab_file,
+                                                         vocab_path=bert_cased_vocab,
                                                          model_config_path=model_config_file))
 
     comp_stage = pipe.add_stage(CompareDataFrameStage(config, expected_df))
@@ -90,7 +92,11 @@ def _run_pipeline(config: Config, dataset_cudf: DatasetManager, import_mod: typi
     assert_results(comp_stage.get_results())
 
 
-def _run_mocked_pipeline(config: Config, dataset_cudf: DatasetManager, import_mod: typing.List[types.ModuleType]):
+def _run_mocked_pipeline(config: Config,
+                         dataset_cudf: DatasetManager,
+                         import_mod: typing.List[types.ModuleType],
+                         bert_cased_hash: str,
+                         bert_cased_vocab: str):
     """
     Runs the minibert pipeline and mocks the Triton Python interface
     """
@@ -122,7 +128,11 @@ def _run_mocked_pipeline(config: Config, dataset_cudf: DatasetManager, import_mo
         async_infer = mk_async_infer(inf_results)
         mock_triton_client.async_infer.side_effect = async_infer
 
-        return _run_pipeline(config, dataset_cudf, import_mod)
+        return _run_pipeline(config,
+                             dataset_cudf,
+                             import_mod,
+                             bert_cased_hash=bert_cased_hash,
+                             bert_cased_vocab=bert_cased_vocab)
 
 
 @pytest.mark.slow
@@ -130,5 +140,13 @@ def _run_mocked_pipeline(config: Config, dataset_cudf: DatasetManager, import_mo
     os.path.join(TEST_DIRS.examples_dir, 'log_parsing', 'inference.py'),
     os.path.join(TEST_DIRS.examples_dir, 'log_parsing', 'postprocessing.py')
 ])
-def test_pipe(config: Config, dataset_cudf: DatasetManager, import_mod: typing.List[types.ModuleType]):
-    _run_mocked_pipeline(config, dataset_cudf, import_mod)
+def test_pipe(config: Config,
+              dataset_cudf: DatasetManager,
+              import_mod: typing.List[types.ModuleType],
+              bert_cased_hash: str,
+              bert_cased_vocab: str):
+    _run_mocked_pipeline(config,
+                         dataset_cudf,
+                         import_mod,
+                         bert_cased_hash=bert_cased_hash,
+                         bert_cased_vocab=bert_cased_vocab)
diff --git a/tests/examples/log_parsing/test_postprocessing.py b/tests/examples/log_parsing/test_postprocessing.py
index c32ec5623d..c76c7dcab4 100644
--- a/tests/examples/log_parsing/test_postprocessing.py
+++ b/tests/examples/log_parsing/test_postprocessing.py
@@ -13,7 +13,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import logging
 import os
+import re
 import types
 import typing
 
@@ -29,6 +31,11 @@
 from morpheus.messages import TensorMemory
 
 
+@pytest.fixture(scope='module', name="model_config_file")
+def fixture_model_config_file():
+    return os.path.join(TEST_DIRS.tests_data_dir, 'examples/log_parsing/log-parsing-config.json')
+
+
 def build_post_proc_message(dataset_cudf: DatasetManager, log_test_data_dir: str):
     input_file = os.path.join(TEST_DIRS.validation_data_dir, 'log-parsing-validation-data-input.csv')
     input_df = dataset_cudf[input_file]
@@ -55,15 +62,15 @@ def build_post_proc_message(dataset_cudf: DatasetManager, log_test_data_dir: str
 @pytest.mark.import_mod(os.path.join(TEST_DIRS.examples_dir, 'log_parsing', 'postprocessing.py'))
 def test_log_parsing_post_processing_stage(config: Config,
                                            dataset_cudf: DatasetManager,
-                                           import_mod: typing.List[types.ModuleType]):
+                                           import_mod: typing.List[types.ModuleType],
+                                           bert_cased_vocab: str,
+                                           model_config_file: str):
     postprocessing_mod = import_mod
 
-    model_vocab_file = os.path.join(TEST_DIRS.data_dir, 'bert-base-cased-vocab.txt')
     log_test_data_dir = os.path.join(TEST_DIRS.tests_data_dir, 'examples/log_parsing')
-    model_config_file = os.path.join(log_test_data_dir, 'log-parsing-config.json')
 
     stage = postprocessing_mod.LogParsingPostProcessingStage(config,
-                                                             vocab_path=model_vocab_file,
+                                                             vocab_path=bert_cased_vocab,
                                                              model_config_path=model_config_file)
 
     post_proc_message = build_post_proc_message(dataset_cudf, log_test_data_dir)
@@ -73,3 +80,40 @@ def test_log_parsing_post_processing_stage(config: Config,
 
     assert isinstance(out_meta, MessageMeta)
     DatasetManager.assert_compare_df(out_meta.df, expected_df)
+
+
+@pytest.mark.import_mod(os.path.join(TEST_DIRS.examples_dir, 'log_parsing', 'postprocessing.py'))
+def test_undefined_variable_error(caplog: pytest.LogCaptureFixture,
+                                  config: Config,
+                                  dataset_cudf: DatasetManager,
+                                  import_mod: typing.List[types.ModuleType],
+                                  bert_cased_vocab: str,
+                                  model_config_file: str):
+    """
+    Test for undefined variable error, which occurrs when the first token_id is unexpected resulting in the `new_label`
+    and `new_confidence` variables being undefined.
+    """
+    postprocessing_mod = import_mod
+
+    log_test_data_dir = os.path.join(TEST_DIRS.tests_data_dir, 'examples/log_parsing')
+
+    stage = postprocessing_mod.LogParsingPostProcessingStage(config,
+                                                             vocab_path=bert_cased_vocab,
+                                                             model_config_path=model_config_file)
+
+    post_proc_message = build_post_proc_message(dataset_cudf, log_test_data_dir)
+    post_proc_message.get_tensor('input_ids')[0] = 27716.0
+
+    expected_log_re = re.compile(r"^Ignoring unexecpected subword token:.*")
+
+    caplog.clear()
+    with caplog.at_level(logging.WARNING):
+        stage._postprocess(post_proc_message)
+
+        logged_warning = False
+        for rec in caplog.records:
+            if rec.levelno == logging.WARNING and expected_log_re.match(rec.message) is not None:
+                logged_warning = True
+                break
+
+            assert logged_warning, "Expected warning message not found in logs"
diff --git a/tests/test_cli.py b/tests/test_cli.py
index 93d11f1949..1f578e5990 100755
--- a/tests/test_cli.py
+++ b/tests/test_cli.py
@@ -914,12 +914,11 @@ def test_pipeline_alias(self, config, callback_values):  # pylint: disable=unuse
         assert config.mode == PipelineModes.NLP
 
     @pytest.mark.replace_callback('pipeline_nlp')
-    def test_pipeline_nlp_relative_paths(self, config, callback_values):
+    def test_pipeline_nlp_relative_paths(self, config, callback_values, bert_cased_hash: str):
         """
         Ensure that the default paths in the nlp pipeline are valid when run from outside the morpheus repo
         """
 
-        vocab_file_name = os.path.join(TEST_DIRS.data_dir, 'bert-base-cased-hash.txt')
         args = (GENERAL_ARGS + ['pipeline-nlp'] + FILE_SRC_ARGS + [
             'deserialize',
             'preprocess',
@@ -941,7 +940,7 @@ def test_pipeline_nlp_relative_paths(self, config, callback_values):
         # pylint: disable=unused-variable
         [file_source, deserialize, process_nlp, triton_inf, monitor, add_class, validation, serialize, to_file] = stages
 
-        assert process_nlp._vocab_hash_file == os.path.realpath(vocab_file_name)
+        assert process_nlp._vocab_hash_file == os.path.realpath(bert_cased_hash)
 
     @pytest.mark.replace_callback('pipeline_nlp')
     def test_pipeline_nlp_relative_path_precedence(self, config, callback_values, tmp_path):
diff --git a/tests/utils/test_cudf_subword_helper.py b/tests/utils/test_cudf_subword_helper.py
new file mode 100644
index 0000000000..3dc376a9a9
--- /dev/null
+++ b/tests/utils/test_cudf_subword_helper.py
@@ -0,0 +1,53 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import string
+
+import pytest
+
+import cudf
+
+from morpheus.utils.cudf_subword_helper import tokenize_text_series
+
+
+@pytest.mark.parametrize("seq_length", [10, 256, 1024])
+@pytest.mark.parametrize("do_lower_case", [False, True])
+@pytest.mark.parametrize("add_special_tokens", [False, True])
+def test_needs_trunc_error(bert_cased_hash: str, seq_length: int, do_lower_case: bool, add_special_tokens: bool):
+    """
+    Feeding the subword tokenizer with a string that is too long should raise an error rather than
+    a duplicate in the id list
+    """
+
+    short_string = string.ascii_lowercase[0:seq_length - 1]
+
+    long_string = list(string.ascii_lowercase)
+    while len(long_string) <= seq_length:
+        long_string.extend(string.ascii_lowercase)
+
+    long_string = "".join(long_string)
+
+    series = cudf.Series([short_string, long_string])
+
+    # Infer the value of stride the same way that the PreprocessNLPStage does
+    stride = (seq_length // 2) + (seq_length // 4)
+
+    with pytest.raises(ValueError):
+        tokenize_text_series(vocab_hash_file=bert_cased_hash,
+                             do_lower_case=do_lower_case,
+                             text_ser=series,
+                             seq_len=seq_length,
+                             truncation=False,
+                             stride=stride,
+                             add_special_tokens=add_special_tokens)