From 63fea6b9a22ef2000ba1f0ec43f611d95fa09865 Mon Sep 17 00:00:00 2001 From: mlahariya <40852060+mlahariya@users.noreply.github.com> Date: Mon, 16 Dec 2024 12:46:26 +0100 Subject: [PATCH 1/7] Update callbacks expo scheduler --- qadence/ml_tools/callbacks/callback.py | 57 ++++++++++++++++++++++++++ 1 file changed, 57 insertions(+) diff --git a/qadence/ml_tools/callbacks/callback.py b/qadence/ml_tools/callbacks/callback.py index bda56130..cf7709d6 100644 --- a/qadence/ml_tools/callbacks/callback.py +++ b/qadence/ml_tools/callbacks/callback.py @@ -449,3 +449,60 @@ def run_callback(self, trainer: Any, config: TrainConfig, writer: BaseWriter) -> writer.log_model( model, trainer.train_dataloader, trainer.val_dataloader, trainer.test_dataloader ) + +class LRSchedulerExponentialDecay(Callback): + """ + Applies exponential decay to the learning rate during training. + + This callback adjusts the learning rate at regular intervals by multiplying + it with a decay factor. The learning rate is updated as: + lr = lr * gamma + + Example Usage in `TrainConfig`: + To use `LRSchedulerExponentialDecay`, include it in the `callbacks` list + when setting up your `TrainConfig`: + ```python exec="on" source="material-block" result="json" + from qadence.ml_tools import TrainConfig + from qadence.ml_tools.callbacks import LRSchedulerExponentialDecay + + # Create an instance of the LRSchedulerExponentialDecay callback + lr_exponential_decay = LRSchedulerExponentialDecay(on = "train_epoch_end", + called_every = 100, + gamma = 0.9) + + config = TrainConfig( + max_iter=10000, + # Print metrics every 1000 training epochs + print_every=1000, + # Add the custom callback that runs every 100 val_batch_end + callbacks=[lr_exponential_decay] + ) + ``` + """ + + def __init__(self, on: str, called_every: int, gamma: float = 0.9): + """Initializes the LRSchedulerExponentialDecay callback. + + Args: + on (str): The event to trigger the callback. + called_every (int): Frequency of callback calls in terms of iterations. + gamma (float, optional): The decay factor applied to the learning rate. + A value < 1 reduces the learning rate over time. + Default is 0.9. + """ + super().__init__(on=on, called_every=called_every) + if gamma > 1: + raise ValueError(f"Gamma must be less than or equal to 1, but got {gamma}.") + self.gamma = gamma + + def run_callback(self, trainer: Any, config: TrainConfig, writer: BaseWriter) -> None: + """ + Runs the callback to apply exponential decay to the learning rate. + + Args: + trainer (Any): The training object. + config (TrainConfig): The configuration object. + writer (BaseWriter): The writer object for logging. 
+ """ + for param_group in trainer.optimizer.param_groups: + param_group["lr"] *= self.gamma \ No newline at end of file From 1f583858eea03b57cb20900b02d126fe8e6d6eb9 Mon Sep 17 00:00:00 2001 From: mlahariya <40852060+mlahariya@users.noreply.github.com> Date: Tue, 7 Jan 2025 13:14:47 +0100 Subject: [PATCH 2/7] Update new callbacks --- qadence/ml_tools/callbacks/callback.py | 311 +++++++++++++++++++++++-- 1 file changed, 289 insertions(+), 22 deletions(-) diff --git a/qadence/ml_tools/callbacks/callback.py b/qadence/ml_tools/callbacks/callback.py index cf7709d6..cebe71d2 100644 --- a/qadence/ml_tools/callbacks/callback.py +++ b/qadence/ml_tools/callbacks/callback.py @@ -1,5 +1,7 @@ from __future__ import annotations +import math +from logging import getLogger from typing import Any, Callable from qadence.ml_tools.callbacks.saveload import load_checkpoint, write_checkpoint @@ -12,6 +14,8 @@ CallbackFunction = Callable[..., Any] CallbackConditionFunction = Callable[..., bool] +logger = getLogger("ml_tools") + class Callback: """Base class for defining various training callbacks. @@ -258,7 +262,7 @@ def run_callback(self, trainer: Any, config: TrainConfig, writer: BaseWriter) -> writer (BaseWriter ): The writer object for logging. """ opt_result = trainer.opt_result - writer.write(opt_result) + writer.write(opt_result.iteration, opt_result.metrics) class PlotMetrics(Callback): @@ -450,54 +454,172 @@ def run_callback(self, trainer: Any, config: TrainConfig, writer: BaseWriter) -> model, trainer.train_dataloader, trainer.val_dataloader, trainer.test_dataloader ) -class LRSchedulerExponentialDecay(Callback): + +class LRSchedulerStepDecay(Callback): """ - Applies exponential decay to the learning rate during training. + Reduces the learning rate by a factor at regular intervals. - This callback adjusts the learning rate at regular intervals by multiplying - it with a decay factor. The learning rate is updated as: + This callback adjusts the learning rate by multiplying it with a decay factor + after a specified number of iterations. The learning rate is updated as: lr = lr * gamma Example Usage in `TrainConfig`: - To use `LRSchedulerExponentialDecay`, include it in the `callbacks` list - when setting up your `TrainConfig`: + To use `LRSchedulerStepDecay`, include it in the `callbacks` list when setting + up your `TrainConfig`: ```python exec="on" source="material-block" result="json" from qadence.ml_tools import TrainConfig - from qadence.ml_tools.callbacks import LRSchedulerExponentialDecay + from qadence.ml_tools.callbacks import LRSchedulerStepDecay - # Create an instance of the LRSchedulerExponentialDecay callback - lr_exponential_decay = LRSchedulerExponentialDecay(on = "train_epoch_end", - called_every = 100, - gamma = 0.9) + # Create an instance of the LRSchedulerStepDecay callback + lr_step_decay = LRSchedulerStepDecay(on="train_epoch_end", + called_every=100, + gamma=0.5) config = TrainConfig( max_iter=10000, # Print metrics every 1000 training epochs print_every=1000, - # Add the custom callback that runs every 100 val_batch_end - callbacks=[lr_exponential_decay] + # Add the custom callback + callbacks=[lr_step_decay] ) ``` """ - def __init__(self, on: str, called_every: int, gamma: float = 0.9): - """Initializes the LRSchedulerExponentialDecay callback. + def __init__(self, on: str, called_every: int, gamma: float = 0.5): + """Initializes the LRSchedulerStepDecay callback. Args: on (str): The event to trigger the callback. 
called_every (int): Frequency of callback calls in terms of iterations. gamma (float, optional): The decay factor applied to the learning rate. - A value < 1 reduces the learning rate over time. - Default is 0.9. + A value < 1 reduces the learning rate over time. Default is 0.5. """ super().__init__(on=on, called_every=called_every) - if gamma > 1: - raise ValueError(f"Gamma must be less than or equal to 1, but got {gamma}.") self.gamma = gamma def run_callback(self, trainer: Any, config: TrainConfig, writer: BaseWriter) -> None: """ - Runs the callback to apply exponential decay to the learning rate. + Runs the callback to apply step decay to the learning rate. + + Args: + trainer (Any): The training object. + config (TrainConfig): The configuration object. + writer (BaseWriter): The writer object for logging. + """ + for param_group in trainer.optimizer.param_groups: + param_group["lr"] *= self.gamma + + +class LRSchedulerCyclic(Callback): + """ + Applies a cyclic learning rate schedule during training. + + This callback oscillates the learning rate between a minimum (base_lr) + and a maximum (max_lr) over a defined cycle length (step_size). The learning + rate follows a triangular wave pattern. + + Example Usage in `TrainConfig`: + To use `LRSchedulerCyclic`, include it in the `callbacks` list when setting + up your `TrainConfig`: + ```python exec="on" source="material-block" result="json" + from qadence.ml_tools import TrainConfig + from qadence.ml_tools.callbacks import LRSchedulerCyclic + + # Create an instance of the LRSchedulerCyclic callback + lr_cyclic = LRSchedulerCyclic(on="train_batch_end", + called_every=1, + base_lr=0.001, + max_lr=0.01, + step_size=2000) + + config = TrainConfig( + max_iter=10000, + # Print metrics every 1000 training epochs + print_every=1000, + # Add the custom callback + callbacks=[lr_cyclic] + ) + ``` + """ + + def __init__(self, on: str, called_every: int, base_lr: float, max_lr: float, step_size: int): + """Initializes the LRSchedulerCyclic callback. + + Args: + on (str): The event to trigger the callback. + called_every (int): Frequency of callback calls in terms of iterations. + base_lr (float): The minimum learning rate. + max_lr (float): The maximum learning rate. + step_size (int): Number of iterations for half a cycle. + """ + super().__init__(on=on, called_every=called_every) + self.base_lr = base_lr + self.max_lr = max_lr + self.step_size = step_size + + def run_callback(self, trainer: Any, config: TrainConfig, writer: BaseWriter) -> None: + """ + Adjusts the learning rate cyclically. + + Args: + trainer (Any): The training object. + config (TrainConfig): The configuration object. + writer (BaseWriter): The writer object for logging. + """ + cycle = trainer.opt_result.iteration // (2 * self.step_size) + x = abs(trainer.opt_result.iteration / self.step_size - 2 * cycle - 1) + scale = max(0, (1 - x)) + new_lr = self.base_lr + (self.max_lr - self.base_lr) * scale + for param_group in trainer.optimizer.param_groups: + param_group["lr"] = new_lr + + +class LRSchedulerCosineAnnealing(Callback): + """ + Applies cosine annealing to the learning rate during training. + + This callback decreases the learning rate following a cosine curve, + starting from the initial learning rate and annealing to a minimum (min_lr). 
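+
+    Assuming the implementation below, the update at iteration t is:
+        lr(t) = min_lr + (lr - min_lr) * (1 + cos(pi * t / t_max)) / 2
+    where lr is the learning rate of the parameter group at the time the
+    callback fires.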
+
+    Example Usage in `TrainConfig`:
+        To use `LRSchedulerCosineAnnealing`, include it in the `callbacks` list
+        when setting up your `TrainConfig`:
+        ```python exec="on" source="material-block" result="json"
+        from qadence.ml_tools import TrainConfig
+        from qadence.ml_tools.callbacks import LRSchedulerCosineAnnealing
+
+        # Create an instance of the LRSchedulerCosineAnnealing callback
+        lr_cosine = LRSchedulerCosineAnnealing(on="train_batch_end",
+                                               called_every=1,
+                                               t_max=5000,
+                                               min_lr=1e-6)
+
+        config = TrainConfig(
+            max_iter=10000,
+            # Print metrics every 1000 training epochs
+            print_every=1000,
+            # Add the custom callback
+            callbacks=[lr_cosine]
+        )
+        ```
+    """
+
+    def __init__(self, on: str, called_every: int, t_max: int, min_lr: float = 0.0):
+        """Initializes the LRSchedulerCosineAnnealing callback.
+
+        Args:
+            on (str): The event to trigger the callback.
+            called_every (int): Frequency of callback calls in terms of iterations.
+            t_max (int): The total number of iterations for one annealing cycle.
+            min_lr (float, optional): The minimum learning rate. Default is 0.0.
+        """
+        super().__init__(on=on, called_every=called_every)
+        self.t_max = t_max
+        self.min_lr = min_lr
+
+    def run_callback(self, trainer: Any, config: TrainConfig, writer: BaseWriter) -> None:
+        """
+        Adjusts the learning rate using cosine annealing.

         Args:
             trainer (Any): The training object.
@@ -505,4 +627,149 @@ def run_callback(self, trainer: Any, config: TrainConfig, writer: BaseWriter) ->
             writer (BaseWriter): The writer object for logging.
         """
         for param_group in trainer.optimizer.param_groups:
-            param_group["lr"] *= self.gamma
\ No newline at end of file
+            max_lr = param_group["lr"]
+            new_lr = (
+                self.min_lr
+                + (max_lr - self.min_lr)
+                * (1 + math.cos(math.pi * trainer.opt_result.iteration / self.t_max))
+                / 2
+            )
+            param_group["lr"] = new_lr
+
+
+class EarlyStopping(Callback):
+    """
+    Stops training when a monitored metric has not improved for a specified number of epochs.
+
+    This callback monitors a specified metric (e.g., validation loss or accuracy). If the metric
+    does not improve for a given patience period, training is stopped.
+
+    Example Usage in `TrainConfig`:
+        To use `EarlyStopping`, include it in the `callbacks` list when setting up your `TrainConfig`:
+        ```python exec="on" source="material-block" result="json"
+        from qadence.ml_tools import TrainConfig
+        from qadence.ml_tools.callbacks import EarlyStopping
+
+        # Create an instance of the EarlyStopping callback
+        early_stopping = EarlyStopping(on="val_epoch_end",
+                                       called_every=1,
+                                       monitor="val_loss",
+                                       patience=5,
+                                       mode="min")
+
+        config = TrainConfig(
+            max_iter=10000,
+            print_every=1000,
+            callbacks=[early_stopping]
+        )
+        ```
+    """
+
+    def __init__(
+        self, on: str, called_every: int, monitor: str, patience: int = 5, mode: str = "min"
+    ):
+        """Initializes the EarlyStopping callback.
+
+        Args:
+            on (str): The event to trigger the callback (e.g., "val_epoch_end").
+            called_every (int): Frequency of callback calls in terms of iterations.
+            monitor (str): The metric to monitor (e.g., "val_loss" or "train_loss").
+                All metrics returned by the optimize step are available to monitor.
+                Prefix the metric name with "val_" or "train_" to select the
+                validation or training value.
+            patience (int, optional): Number of consecutive calls without improvement
+                after which training stops. Default is 5.
+            mode (str, optional): Whether to minimize ("min") or maximize ("max") the metric.
+                Default is "min".
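+
+        Note: once patience is exceeded, this callback sets
+        `trainer.stop_training = True`; the trainer checks this flag at the
+        start of each epoch and skips any further training.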
+ """ + super().__init__(on=on, called_every=called_every) + self.monitor = monitor + self.patience = patience + self.mode = mode + self.best_value = float("inf") if mode == "min" else -float("inf") + self.counter = 0 + + def run_callback(self, trainer: Any, config: TrainConfig, writer: BaseWriter) -> None: + """ + Monitors the metric and stops training if no improvement is observed. + + Args: + trainer (Any): The training object. + config (TrainConfig): The configuration object. + writer (BaseWriter): The writer object for logging. + """ + current_value = trainer.opt_result.metrics.get(self.monitor) + if current_value is None: + raise ValueError(f"Metric '{self.monitor}' is not available in the trainer's metrics.") + + if (self.mode == "min" and current_value < self.best_value) or ( + self.mode == "max" and current_value > self.best_value + ): + self.best_value = current_value + self.counter = 0 + else: + self.counter += 1 + + if self.counter >= self.patience: + logger.info( + f"EarlyStopping: No improvement in '{self.monitor}' for {self.patience} epochs. " + "Stopping training." + ) + trainer.stop_training = True + + +class GradientMonitoring(Callback): + """ + Logs gradient statistics (e.g., mean, standard deviation, max) during training. + + This callback monitors and logs statistics about the gradients of the model parameters + to help debug or optimize the training process. + + Example Usage in `TrainConfig`: + To use `GradientMonitoring`, include it in the `callbacks` list when + setting up your `TrainConfig`: + ```python exec="on" source="material-block" result="json" + from qadence.ml_tools import TrainConfig + from qadence.ml_tools.callbacks import GradientMonitoring + + # Create an instance of the GradientMonitoring callback + gradient_monitoring = GradientMonitoring(on="train_batch_end", called_every=10) + + config = TrainConfig( + max_iter=10000, + print_every=1000, + callbacks=[gradient_monitoring] + ) + ``` + """ + + def __init__(self, on: str, called_every: int = 1): + """Initializes the GradientMonitoring callback. + + Args: + on (str): The event to trigger the callback (e.g., "train_batch_end"). + called_every (int): Frequency of callback calls in terms of iterations. + """ + super().__init__(on=on, called_every=called_every) + + def run_callback(self, trainer: Any, config: TrainConfig, writer: BaseWriter) -> None: + """ + Logs gradient statistics. + + Args: + trainer (Any): The training object. + config (TrainConfig): The configuration object. + writer (BaseWriter): The writer object for logging. 
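+
+        The statistics are written as a flat dictionary with keys of the form
+        `<parameter_name>_mean`, `_std`, `_max`, and `_min`, logged against the
+        current iteration.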
+ """ + gradient_stats = {} + for name, param in trainer.model.named_parameters(): + if param.grad is not None: + grad = param.grad + gradient_stats.update( + { + name + "_mean": grad.mean().item(), + name + "_std": grad.std().item(), + name + "_max": grad.max().item(), + name + "_min": grad.min().item(), + } + ) + + writer.write(trainer.opt_result.iteration, gradient_stats) From 2e8d285cab6d79768893e809bcb10547dbb184b9 Mon Sep 17 00:00:00 2001 From: mlahariya <40852060+mlahariya@users.noreply.github.com> Date: Tue, 7 Jan 2025 13:14:57 +0100 Subject: [PATCH 3/7] Update callback tests --- tests/ml_tools/test_callbacks.py | 97 +++++++++++++++++++++++++++++++- 1 file changed, 96 insertions(+), 1 deletion(-) diff --git a/tests/ml_tools/test_callbacks.py b/tests/ml_tools/test_callbacks.py index 4cb6911d..100c5960 100644 --- a/tests/ml_tools/test_callbacks.py +++ b/tests/ml_tools/test_callbacks.py @@ -1,5 +1,6 @@ from __future__ import annotations +import math from pathlib import Path from unittest.mock import Mock @@ -9,9 +10,14 @@ from qadence.ml_tools import TrainConfig, Trainer from qadence.ml_tools.callbacks import ( + EarlyStopping, + GradientMonitoring, LoadCheckpoint, LogHyperparameters, LogModelTracker, + LRSchedulerCosineAnnealing, + LRSchedulerCyclic, + LRSchedulerStepDecay, PlotMetrics, PrintMetrics, SaveBestCheckpoint, @@ -94,7 +100,7 @@ def test_write_metrics(trainer: Trainer) -> None: stage = trainer.training_stage callback = WriteMetrics(on=stage, called_every=1) callback(stage, trainer, trainer.config, writer) - writer.write.assert_called_once_with(trainer.opt_result) + writer.write.assert_called_once_with(trainer.opt_result.iteration, trainer.opt_result.metrics) def test_plot_metrics(trainer: Trainer) -> None: @@ -143,3 +149,92 @@ def test_log_model_tracker(trainer: Trainer) -> None: trainer.val_dataloader, trainer.test_dataloader, ) + + +def test_lr_scheduler_step_decay(trainer: Trainer) -> None: + writer = trainer.callback_manager.writer = Mock() + stage = trainer.training_stage + initial_lr = trainer.optimizer.param_groups[0]["lr"] # type: ignore + decay_factor = 0.5 + callback = LRSchedulerStepDecay(on=stage, called_every=1, gamma=decay_factor) + callback(stage, trainer, trainer.config, writer) + + new_lr = trainer.optimizer.param_groups[0]["lr"] # type: ignore + assert new_lr == initial_lr * decay_factor + + +def test_lr_scheduler_cyclic(trainer: Trainer) -> None: + writer = trainer.callback_manager.writer = Mock() + stage = trainer.training_stage + base_lr = 0.001 + max_lr = 0.01 + step_size = 2000 + callback = LRSchedulerCyclic( + on=stage, called_every=1, base_lr=base_lr, max_lr=max_lr, step_size=step_size + ) + + # Set trainer's iteration to simulate training progress + trainer.opt_result.iteration = step_size // 2 # Middle of the cycle + callback(stage, trainer, trainer.config, writer) + expected_lr = base_lr + (max_lr - base_lr) * 0.5 + new_lr = trainer.optimizer.param_groups[0]["lr"] # type: ignore + assert math.isclose(new_lr, expected_lr, rel_tol=1e-6) + + +def test_lr_scheduler_cosine_annealing(trainer: Trainer) -> None: + writer = trainer.callback_manager.writer = Mock() + stage = trainer.training_stage + min_lr = 1e-6 + t_max = 5000 + initial_lr = trainer.optimizer.param_groups[0]["lr"] # type: ignore + callback = LRSchedulerCosineAnnealing(on=stage, called_every=1, t_max=t_max, min_lr=min_lr) + + trainer.opt_result.iteration = t_max // 2 # Halfway through the cycle + callback(stage, trainer, trainer.config, writer) + + expected_lr = min_lr + (initial_lr - 
min_lr) * (1 + math.cos(math.pi * 0.5)) / 2 + new_lr = trainer.optimizer.param_groups[0]["lr"] # type: ignore + assert math.isclose(new_lr, expected_lr, rel_tol=1e-6) + + +def test_early_stopping(trainer: Trainer) -> None: + writer = trainer.callback_manager.writer = Mock() + stage = trainer.training_stage + patience = 2 + monitor_metric = "val_loss" + mode = "min" + callback = EarlyStopping( + on=stage, called_every=1, monitor=monitor_metric, patience=patience, mode=mode + ) + + # Simulate metric values + trainer.opt_result.metrics = {monitor_metric: 0.5} + callback(stage, trainer, trainer.config, writer) + assert trainer.stop_training is False + + trainer.opt_result.metrics[monitor_metric] = 0.6 + callback(stage, trainer, trainer.config, writer) + assert trainer.stop_training is False + + trainer.opt_result.metrics[monitor_metric] = 0.7 + callback(stage, trainer, trainer.config, writer) + assert trainer.stop_training is True # Should stop training after patience exceeded + + +def test_gradient_monitoring(trainer: Trainer) -> None: + writer = trainer.callback_manager.writer = Mock() + stage = trainer.training_stage + callback = GradientMonitoring(on=stage, called_every=1) + + for param in trainer.model.parameters(): + param.grad = torch.ones_like(param) * 0.1 + + callback(stage, trainer, trainer.config, writer) + expected_keys = { + f"{name}_{stat}" + for name, param in trainer.model.named_parameters() + for stat in ["mean", "std", "max", "min"] + } + + written_keys = writer.write.call_args[0][1].keys() + assert set(written_keys) == expected_keys From 4b70f67c4e93a6411c92564b564b55f7bf3a4c48 Mon Sep 17 00:00:00 2001 From: mlahariya <40852060+mlahariya@users.noreply.github.com> Date: Tue, 7 Jan 2025 13:15:12 +0100 Subject: [PATCH 4/7] Update writer write method --- qadence/ml_tools/callbacks/writer_registry.py | 34 +++++++++---------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/qadence/ml_tools/callbacks/writer_registry.py b/qadence/ml_tools/callbacks/writer_registry.py index 30facf49..579c416e 100644 --- a/qadence/ml_tools/callbacks/writer_registry.py +++ b/qadence/ml_tools/callbacks/writer_registry.py @@ -60,12 +60,14 @@ def close(self) -> None: raise NotImplementedError("Writers must implement a close method.") @abstractmethod - def write(self, result: OptimizeResult) -> None: + def write(self, iteration: int, metrics: dict) -> None: """ Logs the results of the current iteration. Args: - result (OptimizeResult): The optimization results to log. + iteration (int): The current training iteration. + metrics (dict): A dictionary of metrics to log, where keys are metric names + and values are the corresponding metric values. """ raise NotImplementedError("Writers must implement a write method.") @@ -166,23 +168,22 @@ def close(self) -> None: if self.writer: self.writer.close() - def write(self, result: OptimizeResult) -> None: + def write(self, iteration: int, metrics: dict) -> None: """ Logs the results of the current iteration to TensorBoard. Args: - result (OptimizeResult): The optimization results to log. + iteration (int): The current training iteration. + metrics (dict): A dictionary of metrics to log, where keys are metric names + and values are the corresponding metric values. 
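+
+        Raises:
+            RuntimeError: If the writer has not been opened with `open()`
+                before writing.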
""" - # Not writing loss as loss is available in the metrics - # if result.loss is not None: - # self.writer.add_scalar("loss", float(result.loss), result.iteration) if self.writer: - for key, value in result.metrics.items(): - self.writer.add_scalar(key, value, result.iteration) + for key, value in metrics.items(): + self.writer.add_scalar(key, value, iteration) else: raise RuntimeError( "The writer is not initialized." - "Please call the 'writer.open()' method before writing" + "Please call the 'writer.open()' method before writing." ) def log_hyperparams(self, hyperparams: dict) -> None: @@ -305,22 +306,21 @@ def close(self) -> None: if self.run: self.mlflow.end_run() - def write(self, result: OptimizeResult) -> None: + def write(self, iteration: int, metrics: dict) -> None: """ Logs the results of the current iteration to MLflow. Args: - result (OptimizeResult): The optimization results to log. + iteration (int): The current training iteration. + metrics (dict): A dictionary of metrics to log, where keys are metric names + and values are the corresponding metric values. """ - # Not writing loss as loss is available in the metrics - # if result.loss is not None: - # self.mlflow.log_metric("loss", float(result.loss), step=result.iteration) if self.mlflow: - self.mlflow.log_metrics(result.metrics, step=result.iteration) + self.mlflow.log_metrics(metrics, step=iteration) else: raise RuntimeError( "The writer is not initialized." - "Please call the 'writer.open()' method before writing" + "Please call the 'writer.open()' method before writing." ) def log_hyperparams(self, hyperparams: dict) -> None: From 3e5fbd442bc1bf8783bcc0e8e27cb063e8317373 Mon Sep 17 00:00:00 2001 From: mlahariya <40852060+mlahariya@users.noreply.github.com> Date: Tue, 7 Jan 2025 13:15:26 +0100 Subject: [PATCH 5/7] Update trainer with train stop --- qadence/ml_tools/trainer.py | 41 ++++++++++++++++++++----------------- 1 file changed, 22 insertions(+), 19 deletions(-) diff --git a/qadence/ml_tools/trainer.py b/qadence/ml_tools/trainer.py index af5150ee..c9062e62 100644 --- a/qadence/ml_tools/trainer.py +++ b/qadence/ml_tools/trainer.py @@ -281,6 +281,7 @@ def __init__( self.device: torch_device | None = device self.dtype: torch_dtype | None = dtype self.data_dtype: torch_dtype | None = None + self.stop_training: bool = False if self.dtype: self.data_dtype = float64 if (self.dtype == complex128) else float32 @@ -321,6 +322,7 @@ def _fit_setup(self) -> None: The callback_manager.start_training takes care of loading checkpoint, and setting up the writer. 
""" + self.stop_training = False self.config_manager.initialize_config() self.callback_manager.start_training(trainer=self) @@ -377,25 +379,26 @@ def _train(self) -> list[list[tuple[torch.Tensor, dict[str, Any]]]]: for epoch in range( self.global_step, self.global_step + self.config_manager.config.max_iter + 1 ): - try: - self.current_epoch = epoch - self.on_train_epoch_start() - train_epoch_loss_metrics = self.run_training(self.train_dataloader) - train_losses.append(train_epoch_loss_metrics) - self.on_train_epoch_end(train_epoch_loss_metrics) - - # Run validation periodically if specified - if self.perform_val and self.current_epoch % self.config.val_every == 0: - self.on_val_epoch_start() - val_epoch_loss_metrics = self.run_validation(self.val_dataloader) - val_losses.append(val_epoch_loss_metrics) - self.on_val_epoch_end(val_epoch_loss_metrics) - self.progress.update(val_task, advance=1) - - self.progress.update(train_task, advance=1) - except KeyboardInterrupt: - logger.info("Terminating training gracefully after the current iteration.") - break + if not self.stop_training: + try: + self.current_epoch = epoch + self.on_train_epoch_start() + train_epoch_loss_metrics = self.run_training(self.train_dataloader) + train_losses.append(train_epoch_loss_metrics) + self.on_train_epoch_end(train_epoch_loss_metrics) + + # Run validation periodically if specified + if self.perform_val and self.current_epoch % self.config.val_every == 0: + self.on_val_epoch_start() + val_epoch_loss_metrics = self.run_validation(self.val_dataloader) + val_losses.append(val_epoch_loss_metrics) + self.on_val_epoch_end(val_epoch_loss_metrics) + self.progress.update(val_task, advance=1) + + self.progress.update(train_task, advance=1) + except KeyboardInterrupt: + logger.info("Terminating training gracefully after the current iteration.") + break self.on_train_end(train_losses, val_losses) return train_losses From 0d8910b7cf85848dcbfef8eed658d93a44496c9b Mon Sep 17 00:00:00 2001 From: mlahariya <40852060+mlahariya@users.noreply.github.com> Date: Tue, 7 Jan 2025 13:15:40 +0100 Subject: [PATCH 6/7] Update callbacks init --- qadence/ml_tools/callbacks/__init__.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/qadence/ml_tools/callbacks/__init__.py b/qadence/ml_tools/callbacks/__init__.py index 2eca2271..8bd56c54 100644 --- a/qadence/ml_tools/callbacks/__init__.py +++ b/qadence/ml_tools/callbacks/__init__.py @@ -2,9 +2,14 @@ from .callback import ( Callback, + EarlyStopping, + GradientMonitoring, LoadCheckpoint, LogHyperparameters, LogModelTracker, + LRSchedulerCosineAnnealing, + LRSchedulerCyclic, + LRSchedulerStepDecay, PlotMetrics, PrintMetrics, SaveBestCheckpoint, @@ -26,5 +31,10 @@ "SaveBestCheckpoint", "SaveCheckpoint", "WriteMetrics", + "GradientMonitoring", + "LRSchedulerStepDecay", + "LRSchedulerCyclic", + "LRSchedulerCosineAnnealing", + "EarlyStopping", "get_writer", ] From eca954e86d7f37eb149751255ce08dd654b03970 Mon Sep 17 00:00:00 2001 From: mlahariya <40852060+mlahariya@users.noreply.github.com> Date: Tue, 7 Jan 2025 13:15:58 +0100 Subject: [PATCH 7/7] Update Docs --- docs/tutorials/qml/ml_tools/callbacks.md | 79 ++++++++++++++++++++++++ docs/tutorials/qml/ml_tools/trainer.md | 4 +- 2 files changed, 81 insertions(+), 2 deletions(-) diff --git a/docs/tutorials/qml/ml_tools/callbacks.md b/docs/tutorials/qml/ml_tools/callbacks.md index 383af0bf..4fe40fea 100644 --- a/docs/tutorials/qml/ml_tools/callbacks.md +++ b/docs/tutorials/qml/ml_tools/callbacks.md @@ -146,6 +146,85 @@ config = TrainConfig( ) 
 ```
+
+### 1.9. `LRSchedulerStepDecay`
+
+Reduces the learning rate by a factor at regular intervals.
+
+```python exec="on" source="material-block" html="1"
+from qadence.ml_tools import TrainConfig
+from qadence.ml_tools.callbacks import LRSchedulerStepDecay
+
+lr_step_decay = LRSchedulerStepDecay(on="train_epoch_end", called_every=100, gamma=0.5)
+
+config = TrainConfig(
+    max_iter=10000,
+    callbacks=[lr_step_decay]
+)
+```
+
+### 1.10. `LRSchedulerCyclic`
+
+Applies a cyclic learning rate schedule during training.
+
+```python exec="on" source="material-block" html="1"
+from qadence.ml_tools import TrainConfig
+from qadence.ml_tools.callbacks import LRSchedulerCyclic
+
+lr_cyclic = LRSchedulerCyclic(on="train_batch_end",
+                              called_every=1,
+                              base_lr=0.001,
+                              max_lr=0.01,
+                              step_size=2000)
+
+config = TrainConfig(
+    max_iter=10000,
+    callbacks=[lr_cyclic]
+)
+```
+
+### 1.11. `LRSchedulerCosineAnnealing`
+
+Applies cosine annealing to the learning rate during training.
+
+```python exec="on" source="material-block" html="1"
+from qadence.ml_tools import TrainConfig
+from qadence.ml_tools.callbacks import LRSchedulerCosineAnnealing
+
+lr_cosine = LRSchedulerCosineAnnealing(on="train_batch_end",
+                                       called_every=1,
+                                       t_max=5000,
+                                       min_lr=1e-6)
+
+config = TrainConfig(
+    max_iter=10000,
+    callbacks=[lr_cosine]
+)
+```
+
+### 1.12. `EarlyStopping`
+
+Stops training when a monitored metric has not improved for a specified number of epochs.
+
+```python exec="on" source="material-block" html="1"
+from qadence.ml_tools import TrainConfig
+from qadence.ml_tools.callbacks import EarlyStopping
+
+early_stopping = EarlyStopping(on="val_epoch_end",
+                               called_every=1,
+                               monitor="val_loss",
+                               patience=5,
+                               mode="min")
+
+config = TrainConfig(
+    max_iter=10000,
+    callbacks=[early_stopping]
+)
+```
+
+### 1.13. `GradientMonitoring`
+
+Logs gradient statistics (e.g., mean, standard deviation, max) during training.
+
+```python exec="on" source="material-block" html="1"
+from qadence.ml_tools import TrainConfig
+from qadence.ml_tools.callbacks import GradientMonitoring
+
+gradient_monitoring = GradientMonitoring(on="train_batch_end", called_every=10)
+
+config = TrainConfig(
+    max_iter=10000,
+    callbacks=[gradient_monitoring]
+)
+```

 ## 2. Custom Callbacks
diff --git a/docs/tutorials/qml/ml_tools/trainer.md b/docs/tutorials/qml/ml_tools/trainer.md
index 5d1dd292..cefd1ecd 100644
--- a/docs/tutorials/qml/ml_tools/trainer.md
+++ b/docs/tutorials/qml/ml_tools/trainer.md
@@ -531,7 +531,7 @@ def train(
             writer.print_metrics(OptimizeResult(iteration, model, optimizer, loss, metrics))

         if iteration % config.write_every == 0:
-            writer.write(OptimizeResult(iteration, model, optimizer, loss, metrics))
+            writer.write(iteration, metrics)

         if config.log_folder:
             if iteration % config.checkpoint_every == 0:
@@ -540,7 +540,7 @@ def train(
     # Final writing and checkpointing
     if config.log_folder:
         write_checkpoint(config.log_folder, model, optimizer, iteration)
-        writer.write(OptimizeResult(iteration, model, optimizer, loss, metrics))
+        writer.write(iteration, metrics)
     writer.close()

     return model, optimizer
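
A quick end-to-end view of the series: patch 2 adds the five new callbacks, patch 3 tests them, patch 4 narrows the writer interface to `write(iteration, metrics)`, patch 5 makes the trainer honor `stop_training`, patch 6 exports the callbacks, and patch 7 documents them. Below is a minimal sketch of how the pieces combine, assuming the `TrainConfig` and callback signatures introduced above; model, optimizer, and data setup are omitted as placeholders:

```python
from qadence.ml_tools import TrainConfig
from qadence.ml_tools.callbacks import (
    EarlyStopping,
    GradientMonitoring,
    LRSchedulerCosineAnnealing,
)

# Anneal the learning rate every batch, log gradient statistics every
# 10 batches, and stop once validation loss fails to improve 5 times.
config = TrainConfig(
    max_iter=10000,
    print_every=1000,
    callbacks=[
        LRSchedulerCosineAnnealing(
            on="train_batch_end", called_every=1, t_max=5000, min_lr=1e-6
        ),
        GradientMonitoring(on="train_batch_end", called_every=10),
        EarlyStopping(
            on="val_epoch_end", called_every=1, monitor="val_loss",
            patience=5, mode="min"
        ),
    ],
)
```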