From e1728f6509470eb571fe30594bb79553e0ca9478 Mon Sep 17 00:00:00 2001 From: Mihir Patel Date: Wed, 17 Jan 2024 13:52:38 -0500 Subject: [PATCH] Update monkeypatch to put barrier in optim load (#2874) * wip * bugfix * increase retries and jitter * logs * logs * remove kadabra * add sync * remove * no sync * logs * tweak * strip print * strip * upload file * remove comment * remove --------- Co-authored-by: Abhinav Venigalla --- composer/trainer/mosaic_fsdp_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/composer/trainer/mosaic_fsdp_utils.py b/composer/trainer/mosaic_fsdp_utils.py index 24496ec4f4..d04449c4ae 100644 --- a/composer/trainer/mosaic_fsdp_utils.py +++ b/composer/trainer/mosaic_fsdp_utils.py @@ -1163,7 +1163,6 @@ def _shard_orig_param_state( optim_state, pg=fsdp_state.process_group, device=fsdp_state.compute_device, - cpu_offload=True, ) if not shard_param_info.in_shard: return {} @@ -1179,6 +1178,7 @@ def _shard_orig_param_state( ): value = value.flatten()[intra_param_start_idx : intra_param_end_idx + 1].clone() # type: ignore[operator] new_optim_state[state_name] = value + torch.cuda.synchronize() return new_optim_state def fsdp_state_has_default_pg(state: '_FSDPState') -> bool: