Skip to content

Commit

Permalink
Remove tensor option for _global_exception_occurred (mosaicml#3611)
Browse files Browse the repository at this point in the history
Co-authored-by: Mihir Patel <[email protected]>
  • Loading branch information
irenedea and mvpatel2000 authored Sep 11, 2024
1 parent a9cd768 commit fea4a88
Showing 1 changed file with 2 additions and 8 deletions.
10 changes: 2 additions & 8 deletions composer/loggers/mlflow_logger.py
Original file line number Diff line number Diff line change
Expand Up @@ -312,10 +312,7 @@ def init(self, state: State, logger: Logger) -> None:
if self.run_name is None:
self.run_name = state.run_name

if hasattr(state, 'device'):
self._global_exception_occurred = state.device.tensor_to_device(torch.tensor([0], dtype=torch.uint8),)
else:
self._global_exception_occurred = 0
self._global_exception_occurred = 0

# Store the Composer run name in the MLFlow run tags so it can be retrieved for autoresume
self.tags['run_name'] = os.environ.get('RUN_NAME', state.run_name)
Expand Down Expand Up @@ -615,10 +612,7 @@ def post_close(self):
if hasattr(self, 'monitor_process'):
# Check if there is an uncaught exception, which means `post_close()` is triggered
# due to program crash.
if isinstance(self._global_exception_occurred, torch.Tensor):
finish_with_exception = (self._global_exception_occurred == 1).item()
else:
finish_with_exception = (self._global_exception_occurred == 1)
finish_with_exception = self._global_exception_occurred == 1
if finish_with_exception:
self.monitor_process.crash()
return
Expand Down

0 comments on commit fea4a88

Please sign in to comment.