Fix for duplicate row IDs in log_parsing output (#2031)

* Use the index of the incoming DF as the source of the `doc` field, rather than the first column of the `seq_ids` tensor
* Fix typo in docstring for `TableInfo::num_indices` (unrelated)
* Remove declaration for unimplemented/unused function in `python/morpheus/morpheus/_lib/include/morpheus/stages/deserialize.hpp` (unrelated)

Closes #2029

## By Submitting this PR I confirm:
- I am familiar with the [Contributing Guidelines](https://github.com/nv-morpheus/Morpheus/blob/main/docs/source/developer_guide/contributing.md).
- When the PR is ready for review, new or existing tests cover these changes.
- When the PR is ready for review, the documentation is up to date with these changes.

Authors:
- David Gardner (https://github.com/dagardner-nv)

Approvers:
- Yuchen Zhang (https://github.com/yczhang-nv)

URL: #2031
nv-morpheus · Nov 1, 2024 · 986abdb · 986abdb
1 parent de72aaf
commit 986abdb
Show file tree

Hide file tree

Showing 4 changed files with 10 additions and 11 deletions.
diff --git a/examples/log_parsing/postprocessing.py b/examples/log_parsing/postprocessing.py
@@ -83,8 +83,12 @@ def compute_schema(self, schema: StageSchema):
         schema.output_schema.set_type(MessageMeta)
 
     def _postprocess(self, msg: ControlMessage):
-        infer_pdf = pd.DataFrame(msg.tensors().get_tensor('seq_ids').get()).astype(int)
-        infer_pdf.columns = ["doc", "start", "stop"]
+        with msg.payload().mutable_dataframe() as src_df:
+            src_index = src_df.index.to_pandas()
+
+        seq_ids = msg.tensors().get_tensor('seq_ids').get()
+        infer_pdf = pd.DataFrame({"doc": src_index, "start": seq_ids[:, 1], "stop": seq_ids[:, 2]})
+
         infer_pdf["confidences"] = msg.tensors().get_tensor('confidences').tolist()
         infer_pdf["labels"] = msg.tensors().get_tensor('labels').tolist()
         infer_pdf["token_ids"] = msg.tensors().get_tensor('input_ids').tolist()

diff --git a/python/morpheus/morpheus/_lib/include/morpheus/objects/table_info.hpp b/python/morpheus/morpheus/_lib/include/morpheus/objects/table_info.hpp
@@ -70,7 +70,7 @@ struct MORPHEUS_EXPORT TableInfoBase
     std::vector<std::string> get_column_names() const;
 
     /**
-     * @brief Get size of a index names in a data table
+     * @brief Get the number of indices in a data table
      *
      * @return cudf::size_type
      */

diff --git a/python/morpheus/morpheus/_lib/include/morpheus/stages/deserialize.hpp b/python/morpheus/morpheus/_lib/include/morpheus/stages/deserialize.hpp
@@ -45,12 +45,6 @@ namespace morpheus {
  * @file
  */
 
-void make_output_message(std::shared_ptr<MessageMeta>& incoming_message,
-                         TensorIndex start,
-                         TensorIndex stop,
-                         control_message_task_t* task,
-                         std::shared_ptr<ControlMessage>& windowed_message);
-
 /****** DeserializationStage********************************/
 class MORPHEUS_EXPORT DeserializeStage
   : public mrc::pymrc::PythonNode<std::shared_ptr<MessageMeta>, std::shared_ptr<ControlMessage>>

diff --git a/tests/examples/log_parsing/test_postprocessing.py b/tests/examples/log_parsing/test_postprocessing.py
@@ -38,10 +38,11 @@ def fixture_model_config_file():
 
 def build_post_proc_message(dataset_cudf: DatasetManager, log_test_data_dir: str):
     input_file = os.path.join(TEST_DIRS.validation_data_dir, 'log-parsing-validation-data-input.csv')
-    input_df = dataset_cudf[input_file]
-    meta = MessageMeta(input_df)
 
     # we have tensor data for the first five rows
+    input_df = dataset_cudf[input_file][:5]
+    meta = MessageMeta(input_df)
+
     count = 5
     tensors = {}
     for tensor_name in ['confidences', 'input_ids', 'labels']: