From 0612e2a865708180a6ebd95f6f54600e7c2f3ed7 Mon Sep 17 00:00:00 2001
From: gkumbhat
Date: Sat, 23 Sep 2023 16:59:44 -0500
Subject: [PATCH 01/10] :sparkles: Add stepwise logging for prompt tuning

Signed-off-by: gkumbhat
---
 .../text_generation/peft_prompt_tuning.py     | 25 +++++++++++++------
 1 file changed, 17 insertions(+), 8 deletions(-)

diff --git a/caikit_nlp/modules/text_generation/peft_prompt_tuning.py b/caikit_nlp/modules/text_generation/peft_prompt_tuning.py
index 599571d7..53658130 100644
--- a/caikit_nlp/modules/text_generation/peft_prompt_tuning.py
+++ b/caikit_nlp/modules/text_generation/peft_prompt_tuning.py
@@ -1042,7 +1042,10 @@ def _execute_train_loop(
 
         training_loss_tracker = []
 
+        step_count = 0
+
         for epoch in range(num_epochs):
+            step_loss_log = {}
             model.train()
             total_loss = 0
             tqdm_loader = tqdm(train_dataloader, disable=silence_progress_bars)
@@ -1060,6 +1063,8 @@ def _execute_train_loop(
                     optimizer.step()
                     lr_scheduler.step()
                     optimizer.zero_grad()
+                    step_loss_log[step_count] = loss
+                    step_count += 1
             except torch.cuda.OutOfMemoryError:
                 error(
                     "",
@@ -1067,14 +1072,18 @@ def _execute_train_loop(
                 )
 
             log.info("", {"loss": float(loss), "epoch": epoch})
-            # Below is added to be propagated and stored as training_metadata
-            training_loss_tracker.append(
-                {
-                    "epoch": epoch,
-                    "value": float(loss),
-                    "timestamp": datetime.isoformat(datetime.now()),
-                }
-            )
+
+            for step, loss_val in step_loss_log.items():
+
+                # Below is added to be propagated and stored as training_metadata
+                training_loss_tracker.append(
+                    {
+                        "epoch": epoch,
+                        "step": step,
+                        "value": loss_val,
+                        "timestamp": datetime.isoformat(datetime.now()),
+                    }
+                )
 
             if eval_dataloader is not None:
                 model.eval()
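Note: the pattern patch 01 introduces is a per-step loss buffer (step_loss_log) that is flushed into training_loss_tracker once per epoch. A minimal, self-contained sketch of the same pattern — the toy model, data, and optimizer are stand-ins, not the module's actual objects:

    # Sketch of the step_loss_log pattern from patch 01 (illustrative only).
    import torch

    model = torch.nn.Linear(4, 1)
    optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
    batches = [(torch.randn(8, 4), torch.randn(8, 1)) for _ in range(5)]

    step_loss_log = {}
    step_count = 0
    for inputs, targets in batches:
        loss = torch.nn.functional.mse_loss(model(inputs), targets)
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        # Store a plain float rather than the loss tensor so no autograd
        # state is retained; the patch stores the raw tensor here, and a
        # later commit in this series converts it with float().
        step_loss_log[step_count] = float(loss)
        step_count += 1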
From ff895d59d98ebcdba825fc2ec3eb506ecd8e446d Mon Sep 17 00:00:00 2001
From: gkumbhat
Date: Sat, 23 Sep 2023 17:10:08 -0500
Subject: [PATCH 02/10] :construction: Changing logging in FT HF Trainer to step level

Signed-off-by: gkumbhat
---
 .../modules/text_generation/text_generation_local.py | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/caikit_nlp/modules/text_generation/text_generation_local.py b/caikit_nlp/modules/text_generation/text_generation_local.py
index 3b2ff535..14af6264 100644
--- a/caikit_nlp/modules/text_generation/text_generation_local.py
+++ b/caikit_nlp/modules/text_generation/text_generation_local.py
@@ -359,13 +359,16 @@ def train(
             "dataloader_pin_memory": False,
             "gradient_accumulation_steps": accumulate_steps,
             "gradient_checkpointing": True,
+            "logging_strategy": "steps",
+            "logging_steps": 1, #logging at every step
             # NOTE: This is explicitly set to false since it will
             # negatively impact the performance
             "full_determinism": False,
             # Required for iterable dataset
-            "max_steps": cls.infer_max_steps(
-                num_epochs, batch_size, training_dataset
-            ),
+            # "max_steps": cls.infer_max_steps(
+            #     num_epochs, batch_size, training_dataset
+            # ),
+            "max_steps": 5,
             # Some interesting parameters:
             "auto_find_batch_size": True,
@@ -585,6 +588,7 @@ def _launch_training(
 
         # Start training via Trainer.train function
         trainer.train()
+        breakpoint()
         # save the model temporarily and reload it
         # this is done, since otherwise the model might be distributed in different
         # devices, in which case its better to use trainer's `prediction_step`

From 1d204d4b20354523506068bb7971fddedffd49a7 Mon Sep 17 00:00:00 2001
From: gkumbhat
Date: Mon, 25 Sep 2023 15:45:33 -0500
Subject: [PATCH 03/10] :bug: Remove epoch number validation for training loss

Signed-off-by: gkumbhat
---
 caikit_nlp/modules/text_generation/peft_prompt_tuning.py | 3 +--
 tests/modules/text_generation/test_peft_prompt_tuning.py | 6 +++---
 2 files changed, 4 insertions(+), 5 deletions(-)

diff --git a/caikit_nlp/modules/text_generation/peft_prompt_tuning.py b/caikit_nlp/modules/text_generation/peft_prompt_tuning.py
index 53658130..e4e27355 100644
--- a/caikit_nlp/modules/text_generation/peft_prompt_tuning.py
+++ b/caikit_nlp/modules/text_generation/peft_prompt_tuning.py
@@ -1080,7 +1080,7 @@ def _execute_train_loop(
                     {
                         "epoch": epoch,
                         "step": step,
-                        "value": loss_val,
+                        "value": float(loss_val),
                         "timestamp": datetime.isoformat(datetime.now()),
                     }
                 )
@@ -1141,7 +1141,6 @@ def _execute_train_loop(
                     eval_epoch_loss,
                 )
 
-        error.value_check("", len(training_loss_tracker) == num_epochs)
         return {"loss": training_loss_tracker}
 
     @classmethod
diff --git a/tests/modules/text_generation/test_peft_prompt_tuning.py b/tests/modules/text_generation/test_peft_prompt_tuning.py
index 1d600305..4e325f44 100644
--- a/tests/modules/text_generation/test_peft_prompt_tuning.py
+++ b/tests/modules/text_generation/test_peft_prompt_tuning.py
@@ -69,12 +69,12 @@ def test_save_log_loss_file(causal_lm_dummy_model):
     """Ensure saving a model saves the log loss file"""
     with tempfile.TemporaryDirectory() as model_dir:
         causal_lm_dummy_model.save(model_dir, save_base_model=False)
-        assert os.path.isfile(
-            os.path.join(
+        file_path = os.path.join(
                 model_dir,
                 caikit_nlp.modules.text_generation.peft_prompt_tuning.TRAINING_LOSS_LOG_FILENAME,
             )
-        )
+
+        assert os.path.isfile(file_path)
 
 
 def test_run_model(causal_lm_dummy_model):
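With the fix in patch 03, every tracker entry is JSON-serializable and there is one entry per optimizer step, so the record count scales with steps rather than epochs — hence the removed value_check against num_epochs. A hedged sketch of the resulting shape (the values are illustrative, not real output):

    # Illustrative shape of the {"loss": [...]} dict returned by the train loop.
    result = {
        "loss": [
            {"epoch": 0, "step": 0, "value": 2.31, "timestamp": "2023-09-25T15:45:33"},
            {"epoch": 0, "step": 1, "value": 2.12, "timestamp": "2023-09-25T15:45:34"},
        ]
    }
    records = result["loss"]
    assert all({"epoch", "step", "value", "timestamp"} <= rec.keys() for rec in records)
    # One record per step: len(records) == steps_per_epoch * num_epochs.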
From 50b1f1abfaff86f3e06c553fd07486e6ed638d0e Mon Sep 17 00:00:00 2001
From: gkumbhat
Date: Tue, 26 Sep 2023 16:45:34 -0500
Subject: [PATCH 04/10] :construction: Make changes to enable logging for FT in distributed computation

Signed-off-by: gkumbhat
---
 .../text_generation/text_generation_local.py  | 55 ++++++++++++++++---
 caikit_nlp/resources/pretrained_model/base.py | 17 +++++-
 .../pretrained_model/hf_auto_seq2seq_lm.py    | 47 +++++++++++++++-
 examples/run_fine_tuning.py                   |  2 +-
 4 files changed, 107 insertions(+), 14 deletions(-)

diff --git a/caikit_nlp/modules/text_generation/text_generation_local.py b/caikit_nlp/modules/text_generation/text_generation_local.py
index 14af6264..4acebc13 100644
--- a/caikit_nlp/modules/text_generation/text_generation_local.py
+++ b/caikit_nlp/modules/text_generation/text_generation_local.py
@@ -14,15 +14,17 @@
 
 # Standard
-from typing import Optional, Union
+from datetime import datetime
+from typing import Any, Dict, Optional, Union
 import gc
+import json
 import os
 import tempfile
 
 # Third Party
 from datasets import Dataset
 from datasets import IterableDataset as TransformersIterableDataset
-from transformers import AutoConfig, AutoTokenizer
+from transformers import AutoConfig, AutoTokenizer, TrainerCallback
 import torch
@@ -52,6 +54,8 @@
 
 error = error_handler.get(log)
 
+TRAINING_LOSS_LOG_FILENAME = "training_logs.jsonl"
+
 # pylint: disable=too-many-lines,too-many-instance-attributes
 @module(
     id="f9181353-4ccf-4572-bd1e-f12bcda26792",
@@ -95,6 +99,7 @@ def __init__(
         sep_token: Optional[str] = None,
         eos_token: Optional[str] = None,
         pad_token: Optional[str] = None,
+        training_metadata: Union[Dict[str, Any], None] = None,
     ):
         super().__init__()
@@ -106,6 +111,7 @@ def __init__(
         self._sep_token = sep_token
         self._eos_token = eos_token
         self._pad_token = pad_token
+        self.training_metadata = training_metadata
 
     # pylint: disable=duplicate-code
     def __del__(self):
@@ -365,15 +371,15 @@ def train(
             # negatively impact the performance
             "full_determinism": False,
             # Required for iterable dataset
+            "max_steps": 5,
             # "max_steps": cls.infer_max_steps(
             #     num_epochs, batch_size, training_dataset
             # ),
-            "max_steps": 5,
             # Some interesting parameters:
             "auto_find_batch_size": True,
             # NOTE: following can override above arguments in order
             **filtered_training_arguments,
-            **processing_configuration,
+            # **processing_configuration,
             **dtype_based_params,
         }
@@ -394,15 +400,21 @@ def train(
             get_config().master_port,
         )
 
+
         if torch.cuda.is_available():
             # NOTE: torch distributed can hang if run on CPUs,
             # to avoid that, specially for unit tests, we are only
             # running below when GPUs are available
-            torch.distributed.launcher.api.elastic_launch(
+            training_loss_history = torch.distributed.launcher.api.elastic_launch(
                 launch_config, cls._launch_training
             )(base_model, training_dataset, training_args, checkpoint_dir)
+
+            # NOTE: We are currently only storing the loss information from
+            # rank 0, i.e main process. training_loss_history is dictionary containing
+            # rank of the process as key
+            training_loss_history = training_loss_history[0]
         else:
-            cls._launch_training(
+            training_loss_history = cls._launch_training(
                 base_model, training_dataset, training_args, checkpoint_dir
             )
@@ -426,6 +438,7 @@ def train(
             sep_token=model.tokenizer.sep_token or None,
             eos_token=model.tokenizer.eos_token or None,
             pad_token=model.tokenizer.pad_token or None,
+            training_metadata=training_loss_history
         )
@@ -488,6 +501,24 @@ def save(self, model_path):
                 base_model_dirname=artifacts_dir,
             )
 
+            training_loss_filename = TRAINING_LOSS_LOG_FILENAME
+
+            saver.update_config({"training_logs": training_loss_filename})
+
+            # We are currently only saving logs containing loss in jsonl format
+            if "loss" in self.training_metadata:
+                loss_log_lines = self.training_metadata.get("loss")
+                error.type_check("", list, loss_log_lines=loss_log_lines)
+                with open(
+                    os.path.join(model_path, training_loss_filename),
+                    "w",
+                    encoding="utf-8",
+                ) as f:
+                    for loss_log in loss_log_lines:
+                        loss_log = {"name": "loss", "data": loss_log}
+                        json.dump(loss_log, f)
+                        f.write("\n")
+
     def run(
         self,
         text: str,
@@ -581,14 +612,18 @@ def _launch_training(
     ) -> None:
         """Utility function to wrap trainer and execute training"""
 
+        # logging_callback = LoggingCallback()
+
         trainer = base_model.get_trainer(
             train_dataset=training_dataset, **training_args
         )
 
+        # Add logging callback
+        # trainer.add_callback(logging_callback)
+
         # Start training via Trainer.train function
         trainer.train()
-        breakpoint()
         # save the model temporarily and reload it
         # this is done, since otherwise the model might be distributed in different
         # devices, in which case its better to use trainer's `prediction_step`
@@ -600,6 +635,10 @@ def _launch_training(
         # save tokenizer explicitly
         base_model.tokenizer.save_pretrained(checkpoint_dir)
 
+        # Below will return log history but launch will automatically attach rank to it.
+        # if started in distributed fashion
+        return trainer.state.log_history
+
     @staticmethod
     def infer_max_steps(
@@ -627,4 +666,4 @@
 
 def get(train_stream):
     for data in train_stream:
-        yield {"input": data.input, "output": data.output}
+        yield {"input": data.input, "output": data.output}
\ No newline at end of file
diff --git a/caikit_nlp/resources/pretrained_model/base.py b/caikit_nlp/resources/pretrained_model/base.py
index 3f6aa498..3fea4f13 100644
--- a/caikit_nlp/resources/pretrained_model/base.py
+++ b/caikit_nlp/resources/pretrained_model/base.py
@@ -15,18 +15,21 @@
 # Standard
 from abc import ABC, abstractmethod
 from collections.abc import Mapping
-from typing import Callable, List, Optional, Tuple, Type, Union
+from typing import Callable, Dict, List, Optional, Tuple, Type, Union
 import json
 import os
+from torch import nn
 
 # Third Party
-from torch.utils.data import IterableDataset
+from torch.utils.data import Dataset, IterableDataset
 from transformers import (
     AutoTokenizer,
+    DataCollator,
     DataCollatorWithPadding,
     Trainer,
     TrainingArguments,
 )
+from transformers.modeling_utils import PreTrainedModel
 from transformers.models.auto.auto_factory import _BaseAutoModelClass
 import torch
@@ -36,6 +39,9 @@
 from caikit.core.modules import ModuleBase, ModuleConfig, ModuleSaver
 from caikit.core.toolkit import error_handler
 import alog
+from transformers.tokenization_utils_base import PreTrainedTokenizerBase
+from transformers.trainer_callback import TrainerCallback
+from transformers.trainer_utils import EvalPrediction
 
 # Local
 from ...data_model import GenerationTrainRecord, PromptOutputModelType
@@ -45,6 +51,11 @@
 
 error = error_handler.get(log)
 
+class LoggingTrainer(Trainer):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.training_loss_history = []
+
 class PretrainedModelBase(ABC, ModuleBase):
     """Common abstractions and requirements for pretrained model resources"""
 
@@ -281,7 +292,7 @@ def get_trainer(
             "eval_dataset": eval_dataset,
         }
 
-        return Trainer(self._model, training_args, **trainer_arguments)
+        return LoggingTrainer(self._model, training_args, **trainer_arguments)
 
     def _get_data_collator(self, **kwargs):
         """Function to return appropriate data collator based on resource.
diff --git a/caikit_nlp/resources/pretrained_model/hf_auto_seq2seq_lm.py b/caikit_nlp/resources/pretrained_model/hf_auto_seq2seq_lm.py
index 4a6c9ceb..8bb37888 100644
--- a/caikit_nlp/resources/pretrained_model/hf_auto_seq2seq_lm.py
+++ b/caikit_nlp/resources/pretrained_model/hf_auto_seq2seq_lm.py
@@ -15,8 +15,10 @@
 Huggingface auto causal LM resource type
 """
 # Standard
+import os
 from collections.abc import Mapping
-from typing import List, Union
+from datetime import datetime
+from typing import Dict, List, Union
 
 # Third Party
 from torch.utils.data import IterableDataset
@@ -27,6 +29,7 @@
     Seq2SeqTrainingArguments,
 )
 from transformers.models.auto import modeling_auto
+import torch
 
 # First Party
 from caikit.core.modules import module
@@ -44,6 +47,46 @@
 
 IGNORE_ID = -100
 
+
+class LoggingTrainer(Seq2SeqTrainer):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.training_loss_history = []
+
+    def log(self, logs: Dict[str, float]) -> None:
+        """
+        Log `logs` on the various objects watching training.
+
+        Subclass and override this method to inject custom behavior.
+
+        Args:
+            logs (`Dict[str, float]`):
+                The values to log.
+        """
+        if self.state.epoch is not None:
+            logs["epoch"] = round(self.state.epoch, 2)
+
+        # output = {**logs, **{"step": self.state.global_step}}
+        # Get Rank
+        if torch.distributed.is_initialized():
+            rank = torch.distributed.get_rank()
+        else:
+            rank = 0
+
+        if "loss" in logs:
+            print("loss in logs. {} rank".format(os.getenv("RANK")))
+            log.debug(f"process rank: {rank} loss: {float(logs['loss'])} step: {self.state.global_step}")
+            output = {
+                "epoch": float(logs["epoch"]),
+                "step": self.state.global_step,
+                "value": float(logs["loss"]),
+                "timestamp": datetime.isoformat(datetime.now()),
+            }
+            print("loss: ", output)
+            self.state.log_history.append(output)
+            self.control = self.callback_handler.on_log(self.args, self.state, self.control, logs)
+        else:
+            print("loss not in logs")
+
 @module(
     id="6759e891-287b-405b-bd8b-54a4a4d51c25",
     name="HF Transformers Auto Seq2Seq LM",
@@ -110,7 +153,7 @@ def get_trainer(
             # "generation_max_length": max_target_length,
         }
 
-        return Seq2SeqTrainer(self._model, training_args, **trainer_arguments)
+        return LoggingTrainer(self._model, training_args, **trainer_arguments)
 
     def _get_data_collator(self, **kwargs):
         """Function to return appropriate data collator based on resource.
diff --git a/examples/run_fine_tuning.py b/examples/run_fine_tuning.py
index 6187d2f8..c386f1f4 100644
--- a/examples/run_fine_tuning.py
+++ b/examples/run_fine_tuning.py
@@ -344,7 +344,7 @@ def export_model_preds(preds_file, predictions, validation_stream):
 
     sample_text = "summarize: The Inflation Reduction Act lowers prescription drug costs, health care costs, and energy costs. It's the most aggressive action on tackling the climate crisis in American history, which will lift up American workers and create good-paying, union jobs across the country. It'll lower the deficit and ask the ultra-wealthy and corporations to pay their fair share. And no one making under $400,000 per year will pay a penny more in taxes."
 
     prediction_results = model.run(sample_text)
-    print("Generated text: ", prediction_results)
+    # print("Generated text: ", prediction_results)
 
     # Saving model
     model.save(args.output_dir)
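Patch 04's save() writes one JSON object per line to training_logs.jsonl, each wrapped as {"name": "loss", "data": <record>}. A sketch of reading the file back — the reader itself is not part of the patch:

    # Sketch: consuming the training_logs.jsonl emitted by save() above.
    import json
    import os

    def read_training_logs(model_path, filename="training_logs.jsonl"):
        records = []
        with open(os.path.join(model_path, filename), encoding="utf-8") as f:
            for line in f:
                records.append(json.loads(line))
        return records

    # Each record["data"] is one per-step entry with keys like
    # "epoch", "step", "value", and "timestamp".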
From 4132c2a6d02851d29a48bab93075409275780952 Mon Sep 17 00:00:00 2001
From: gkumbhat
Date: Tue, 26 Sep 2023 19:18:11 -0500
Subject: [PATCH 05/10] :construction: Update trainers to include base classes and create log utility function

Signed-off-by: gkumbhat
---
 .../text_generation/text_generation_local.py  | 11 ++++---
 caikit_nlp/resources/pretrained_model/base.py | 18 +++++++++--
 .../pretrained_model/hf_auto_seq2seq_lm.py    | 30 ++-----------------
 3 files changed, 23 insertions(+), 36 deletions(-)

diff --git a/caikit_nlp/modules/text_generation/text_generation_local.py b/caikit_nlp/modules/text_generation/text_generation_local.py
index 4acebc13..47f94e6f 100644
--- a/caikit_nlp/modules/text_generation/text_generation_local.py
+++ b/caikit_nlp/modules/text_generation/text_generation_local.py
@@ -371,15 +371,14 @@ def train(
             # negatively impact the performance
             "full_determinism": False,
             # Required for iterable dataset
-            "max_steps": 5,
-            # "max_steps": cls.infer_max_steps(
-            #     num_epochs, batch_size, training_dataset
-            # ),
+            "max_steps": cls.infer_max_steps(
+                num_epochs, batch_size, training_dataset
+            ),
             # Some interesting parameters:
             "auto_find_batch_size": True,
             # NOTE: following can override above arguments in order
             **filtered_training_arguments,
-            # **processing_configuration,
+            **processing_configuration,
             **dtype_based_params,
         }
diff --git a/caikit_nlp/resources/pretrained_model/base.py b/caikit_nlp/resources/pretrained_model/base.py
index 3fea4f13..05345982 100644
--- a/caikit_nlp/resources/pretrained_model/base.py
+++ b/caikit_nlp/resources/pretrained_model/base.py
@@ -46,15 +46,27 @@
 # Local
 from ...data_model import GenerationTrainRecord, PromptOutputModelType
 from ...toolkit.data_type_utils import get_torch_dtype, str_to_torch_dtype
+from ...toolkit.trainer_utils import log_step
 
 log = alog.use_channel("HFRBAS")
 error = error_handler.get(log)
 
+
 class LoggingTrainer(Trainer):
-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)
-        self.training_loss_history = []
+
+    def log(self, logs: Dict[str, float]) -> None:
+        """
+        Log `logs` on the various objects watching training.
+
+        Subclass and override this method to inject custom behavior.
+
+        Args:
+            logs (`Dict[str, float]`):
+                The values to log.
+        """
+        self.state = log_step(self.state, logs)
+        self.control = self.callback_handler.on_log(self.args, self.state, self.control, logs)
+
 
 class PretrainedModelBase(ABC, ModuleBase):
     """Common abstractions and requirements for pretrained model resources"""
diff --git a/caikit_nlp/resources/pretrained_model/hf_auto_seq2seq_lm.py b/caikit_nlp/resources/pretrained_model/hf_auto_seq2seq_lm.py
index 8bb37888..a0afc21e 100644
--- a/caikit_nlp/resources/pretrained_model/hf_auto_seq2seq_lm.py
+++ b/caikit_nlp/resources/pretrained_model/hf_auto_seq2seq_lm.py
@@ -38,6 +38,7 @@
 # Local
 from ...data_model import GenerationTrainRecord, PromptOutputModelType
+from ...toolkit.trainer_utils import log_step
 from ...toolkit.verbalizer_utils import render_verbalizer
 from .base import PretrainedModelBase
 
@@ -48,9 +49,6 @@
 
 
 class LoggingTrainer(Seq2SeqTrainer):
-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)
-        self.training_loss_history = []
 
     def log(self, logs: Dict[str, float]) -> None:
         """
@@ -62,30 +60,8 @@ def log(self, logs: Dict[str, float]) -> None:
             logs (`Dict[str, float]`):
                 The values to log.
         """
-        if self.state.epoch is not None:
-            logs["epoch"] = round(self.state.epoch, 2)
-
-        # output = {**logs, **{"step": self.state.global_step}}
-        # Get Rank
-        if torch.distributed.is_initialized():
-            rank = torch.distributed.get_rank()
-        else:
-            rank = 0
-
-        if "loss" in logs:
-            print("loss in logs. {} rank".format(os.getenv("RANK")))
-            log.debug(f"process rank: {rank} loss: {float(logs['loss'])} step: {self.state.global_step}")
-            output = {
-                "epoch": float(logs["epoch"]),
-                "step": self.state.global_step,
-                "value": float(logs["loss"]),
-                "timestamp": datetime.isoformat(datetime.now()),
-            }
-            print("loss: ", output)
-            self.state.log_history.append(output)
-            self.control = self.callback_handler.on_log(self.args, self.state, self.control, logs)
-        else:
-            print("loss not in logs")
+        self.state = log_step(self.state, logs)
+        self.control = self.callback_handler.on_log(self.args, self.state, self.control, logs)
 
 
 @module(
From 03505b8f7a91fa89f75731447ba3a50b3c47f6eb Mon Sep 17 00:00:00 2001
From: gkumbhat
Date: Tue, 26 Sep 2023 19:44:44 -0500
Subject: [PATCH 06/10] :art: Fix formatting and linting

Signed-off-by: gkumbhat
---
 .../modules/text_generation/peft_prompt_tuning.py |  2 +-
 .../text_generation/text_generation_local.py      | 15 ++++-----------
 caikit_nlp/resources/pretrained_model/base.py     | 13 ++++---------
 .../pretrained_model/hf_auto_seq2seq_lm.py        |  9 ++++-----
 .../text_generation/test_peft_prompt_tuning.py    |  6 +++---
 5 files changed, 16 insertions(+), 29 deletions(-)

diff --git a/caikit_nlp/modules/text_generation/peft_prompt_tuning.py b/caikit_nlp/modules/text_generation/peft_prompt_tuning.py
index e4e27355..fcf4269b 100644
--- a/caikit_nlp/modules/text_generation/peft_prompt_tuning.py
+++ b/caikit_nlp/modules/text_generation/peft_prompt_tuning.py
@@ -1042,7 +1042,7 @@ def _execute_train_loop(
 
         training_loss_tracker = []
 
-        step_count = 0
+        step_count = 1
 
         for epoch in range(num_epochs):
             step_loss_log = {}
diff --git a/caikit_nlp/modules/text_generation/text_generation_local.py b/caikit_nlp/modules/text_generation/text_generation_local.py
index 47f94e6f..395cceb4 100644
--- a/caikit_nlp/modules/text_generation/text_generation_local.py
+++ b/caikit_nlp/modules/text_generation/text_generation_local.py
@@ -14,7 +14,6 @@
 
 # Standard
-from datetime import datetime
 from typing import Any, Dict, Optional, Union
 import gc
 import json
 import os
 import tempfile
@@ -23,7 +22,7 @@
 # Third Party
 from datasets import Dataset
 from datasets import IterableDataset as TransformersIterableDataset
-from transformers import AutoConfig, AutoTokenizer, TrainerCallback
+from transformers import AutoConfig, AutoTokenizer
 import torch
 
 # First Party
@@ -365,7 +365,7 @@ def train(
             "gradient_accumulation_steps": accumulate_steps,
             "gradient_checkpointing": True,
             "logging_strategy": "steps",
-            "logging_steps": 1, #logging at every step
+            "logging_steps": 1,  # logging at every step
             # NOTE: This is explicitly set to false since it will
             # negatively impact the performance
             "full_determinism": False,
@@ -398,7 +398,6 @@ def train(
             get_config().master_port,
         )
 
-
         if torch.cuda.is_available():
             # NOTE: torch distributed can hang if run on CPUs,
             # to avoid that, specially for unit tests, we are only
@@ -435,7 +435,7 @@ def train(
             sep_token=model.tokenizer.sep_token or None,
             eos_token=model.tokenizer.eos_token or None,
             pad_token=model.tokenizer.pad_token or None,
-            training_metadata=training_loss_history
+            training_metadata={"loss": training_loss_history},
         )
@@ -611,15 +609,10 @@ def _launch_training(
     ) -> None:
         """Utility function to wrap trainer and execute training"""
 
-        # logging_callback = LoggingCallback()
-
         trainer = base_model.get_trainer(
             train_dataset=training_dataset, **training_args
         )
 
-        # Add logging callback
-        # trainer.add_callback(logging_callback)
-
         # Start training via Trainer.train function
         trainer.train()
@@ -665,4 +658,4 @@
 
 def get(train_stream):
     for data in train_stream:
-        yield {"input": data.input, "output": data.output}
\ No newline at end of file
+        yield {"input": data.input, "output": data.output}
diff --git a/caikit_nlp/resources/pretrained_model/base.py b/caikit_nlp/resources/pretrained_model/base.py
index 05345982..dd1c7a8a 100644
--- a/caikit_nlp/resources/pretrained_model/base.py
+++ b/caikit_nlp/resources/pretrained_model/base.py
@@ -18,18 +18,15 @@
 from typing import Callable, Dict, List, Optional, Tuple, Type, Union
 import json
 import os
-from torch import nn
 
 # Third Party
-from torch.utils.data import Dataset, IterableDataset
+from torch.utils.data import IterableDataset
 from transformers import (
     AutoTokenizer,
-    DataCollator,
     DataCollatorWithPadding,
     Trainer,
     TrainingArguments,
 )
-from transformers.modeling_utils import PreTrainedModel
 from transformers.models.auto.auto_factory import _BaseAutoModelClass
 import torch
@@ -39,9 +36,6 @@
 from caikit.core.modules import ModuleBase, ModuleConfig, ModuleSaver
 from caikit.core.toolkit import error_handler
 import alog
-from transformers.tokenization_utils_base import PreTrainedTokenizerBase
-from transformers.trainer_callback import TrainerCallback
-from transformers.trainer_utils import EvalPrediction
 
 # Local
 from ...data_model import GenerationTrainRecord, PromptOutputModelType
@@ -53,7 +47,6 @@
 
 
 class LoggingTrainer(Trainer):
-
     def log(self, logs: Dict[str, float]) -> None:
         """
         Log `logs` on the various objects watching training.
@@ -65,7 +58,9 @@ def log(self, logs: Dict[str, float]) -> None:
                 The values to log.
         """
         self.state = log_step(self.state, logs)
-        self.control = self.callback_handler.on_log(self.args, self.state, self.control, logs)
+        self.control = self.callback_handler.on_log(
+            self.args, self.state, self.control, logs
+        )
 
 
 class PretrainedModelBase(ABC, ModuleBase):
diff --git a/caikit_nlp/resources/pretrained_model/hf_auto_seq2seq_lm.py b/caikit_nlp/resources/pretrained_model/hf_auto_seq2seq_lm.py
index a0afc21e..9ca054fe 100644
--- a/caikit_nlp/resources/pretrained_model/hf_auto_seq2seq_lm.py
+++ b/caikit_nlp/resources/pretrained_model/hf_auto_seq2seq_lm.py
@@ -15,9 +15,7 @@
 Huggingface auto causal LM resource type
 """
 # Standard
-import os
 from collections.abc import Mapping
-from datetime import datetime
 from typing import Dict, List, Union
 
 # Third Party
@@ -29,7 +27,6 @@
     Seq2SeqTrainingArguments,
 )
 from transformers.models.auto import modeling_auto
-import torch
 
 # First Party
 from caikit.core.modules import module
@@ -49,7 +46,6 @@
 
 class LoggingTrainer(Seq2SeqTrainer):
-
     def log(self, logs: Dict[str, float]) -> None:
         """
         Log `logs` on the various objects watching training.
@@ -61,7 +57,10 @@ def log(self, logs: Dict[str, float]) -> None:
                 The values to log.
         """
         self.state = log_step(self.state, logs)
-        self.control = self.callback_handler.on_log(self.args, self.state, self.control, logs)
+        self.control = self.callback_handler.on_log(
+            self.args, self.state, self.control, logs
+        )
+
 
 @module(
     id="6759e891-287b-405b-bd8b-54a4a4d51c25",
diff --git a/tests/modules/text_generation/test_peft_prompt_tuning.py b/tests/modules/text_generation/test_peft_prompt_tuning.py
index 4e325f44..36a10024 100644
--- a/tests/modules/text_generation/test_peft_prompt_tuning.py
+++ b/tests/modules/text_generation/test_peft_prompt_tuning.py
@@ -69,12 +69,12 @@ def test_save_log_loss_file(causal_lm_dummy_model):
     """Ensure saving a model saves the log loss file"""
     with tempfile.TemporaryDirectory() as model_dir:
         causal_lm_dummy_model.save(model_dir, save_base_model=False)
         file_path = os.path.join(
-                model_dir,
-                caikit_nlp.modules.text_generation.peft_prompt_tuning.TRAINING_LOSS_LOG_FILENAME,
-            )
+            model_dir,
+            caikit_nlp.modules.text_generation.peft_prompt_tuning.TRAINING_LOSS_LOG_FILENAME,
+        )
 
         assert os.path.isfile(file_path)
 
 
 def test_run_model(causal_lm_dummy_model):
From 1d90036e1660be181832688a80ec21215cbce9d7 Mon Sep 17 00:00:00 2001
From: gkumbhat
Date: Tue, 26 Sep 2023 19:46:36 -0500
Subject: [PATCH 07/10] :bug: Fix empty training metadata issue

Signed-off-by: gkumbhat
---
 caikit_nlp/modules/text_generation/text_generation_local.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/caikit_nlp/modules/text_generation/text_generation_local.py b/caikit_nlp/modules/text_generation/text_generation_local.py
index 395cceb4..538fd6b7 100644
--- a/caikit_nlp/modules/text_generation/text_generation_local.py
+++ b/caikit_nlp/modules/text_generation/text_generation_local.py
@@ -110,7 +110,9 @@ def __init__(
         self._sep_token = sep_token
         self._eos_token = eos_token
         self._pad_token = pad_token
-        self.training_metadata = training_metadata
+        self.training_metadata = (
+            training_metadata if training_metadata is not None else {}
+        )
 
     # pylint: disable=duplicate-code
     def __del__(self):
From 203a1ff9d0e3435c948454083cff6ae6bb30c376 Mon Sep 17 00:00:00 2001
From: gkumbhat
Date: Tue, 26 Sep 2023 19:48:54 -0500
Subject: [PATCH 08/10] :recycle: Revert back run fine tuning print statement

Signed-off-by: gkumbhat
---
 examples/run_fine_tuning.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/run_fine_tuning.py b/examples/run_fine_tuning.py
index c386f1f4..6187d2f8 100644
--- a/examples/run_fine_tuning.py
+++ b/examples/run_fine_tuning.py
@@ -344,7 +344,7 @@ def export_model_preds(preds_file, predictions, validation_stream):
 
     sample_text = "summarize: The Inflation Reduction Act lowers prescription drug costs, health care costs, and energy costs. It's the most aggressive action on tackling the climate crisis in American history, which will lift up American workers and create good-paying, union jobs across the country. It'll lower the deficit and ask the ultra-wealthy and corporations to pay their fair share. And no one making under $400,000 per year will pay a penny more in taxes."
 
     prediction_results = model.run(sample_text)
-    # print("Generated text: ", prediction_results)
+    print("Generated text: ", prediction_results)
 
     # Saving model
     model.save(args.output_dir)

From c9852cd2a1791b42ea954e8588a6f7b7046195ee Mon Sep 17 00:00:00 2001
From: gkumbhat
Date: Tue, 26 Sep 2023 23:13:53 -0500
Subject: [PATCH 09/10] :sparkles: Add trainer util file

Signed-off-by: gkumbhat
---
 caikit_nlp/toolkit/trainer_utils.py | 60 +++++++++++++++++++++++++++++
 1 file changed, 60 insertions(+)
 create mode 100644 caikit_nlp/toolkit/trainer_utils.py

diff --git a/caikit_nlp/toolkit/trainer_utils.py b/caikit_nlp/toolkit/trainer_utils.py
new file mode 100644
index 00000000..de62f959
--- /dev/null
+++ b/caikit_nlp/toolkit/trainer_utils.py
@@ -0,0 +1,60 @@
+# Copyright The Caikit Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Contains toolkit functionality for huggingface Trainer"""
+# Standard
+from datetime import datetime
+
+# First Party
+import alog
+
+# Third Party
+import torch
+
+log = alog.use_channel("TRNR_UTILS")
+
+def log_step(state, logs):
+    if state.epoch is not None:
+        logs["epoch"] = round(state.epoch, 2)
+
+
+    # Get Rank
+    if torch.distributed.is_initialized():
+        rank = torch.distributed.get_rank()
+    else:
+        rank = 0
+
+    if "loss" in logs:
+        if state.epoch is not None:
+            logs["epoch"] = round(state.epoch, 2)
+
+        log.debug(
+            "process rank: {} loss: {} step: {}".format(
+                rank,
+                float(logs['loss']),
+                state.global_step
+            )
+
+        )
+        output = {
+          "epoch": float(logs["epoch"]),
+          "step": state.global_step,
+          "value": float(logs["loss"]),
+          "timestamp": datetime.isoformat(datetime.now()),
+        }
+        state.log_history.append(output)
+    else:
+        output = {**logs, **{"step": state.global_step}}
+        state.log_history.append(output)
+
+    return state

From b1f4d81e12a9b5f14e65d8f887f233d757c40a1b Mon Sep 17 00:00:00 2001
From: gkumbhat
Date: Tue, 26 Sep 2023 23:33:06 -0500
Subject: [PATCH 10/10] :art: Fix formatting for trainer utils

Signed-off-by: gkumbhat
---
 caikit_nlp/toolkit/trainer_utils.py | 25 +++++++++++--------------
 1 file changed, 11 insertions(+), 14 deletions(-)

diff --git a/caikit_nlp/toolkit/trainer_utils.py b/caikit_nlp/toolkit/trainer_utils.py
index de62f959..736e9d90 100644
--- a/caikit_nlp/toolkit/trainer_utils.py
+++ b/caikit_nlp/toolkit/trainer_utils.py
@@ -15,19 +15,19 @@
 # Standard
 from datetime import datetime
 
-# First Party
-import alog
-
 # Third Party
 import torch
 
+# First Party
+import alog
+
 log = alog.use_channel("TRNR_UTILS")
 
+
 def log_step(state, logs):
     if state.epoch is not None:
         logs["epoch"] = round(state.epoch, 2)
 
-
     # Get Rank
     if torch.distributed.is_initialized():
         rank = torch.distributed.get_rank()
@@ -40,18 +40,15 @@ def log_step(state, logs):
 
         log.debug(
             "process rank: {} loss: {} step: {}".format(
-                rank,
-                float(logs['loss']),
-                state.global_step
+                rank, float(logs["loss"]), state.global_step
             )
-
         )
-        output = {
-          "epoch": float(logs["epoch"]),
-          "step": state.global_step,
-          "value": float(logs["loss"]),
-          "timestamp": datetime.isoformat(datetime.now()),
-        }
+        output = {
+            "epoch": float(logs["epoch"]),
+            "step": state.global_step,
+            "value": float(logs["loss"]),
+            "timestamp": datetime.isoformat(datetime.now()),
+        }
         state.log_history.append(output)
     else:
         output = {**logs, **{"step": state.global_step}}