From c3f7b44f89af55f3eb1acc14fa6ef2c24d11dcc3 Mon Sep 17 00:00:00 2001 From: Masahiro Tanaka Date: Sat, 12 Oct 2024 06:28:09 +0000 Subject: [PATCH 1/2] ignore reuse_dist_env --- tests/unit/common.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/tests/unit/common.py b/tests/unit/common.py index c9eb7ffaa5f4..eacb7bc85e4a 100644 --- a/tests/unit/common.py +++ b/tests/unit/common.py @@ -25,6 +25,8 @@ # Worker timeout for tests that hang DEEPSPEED_TEST_TIMEOUT = int(os.environ.get('DS_UNITTEST_TIMEOUT', '600')) +warn_reuse_dist_env = False + def is_rocm_pytorch(): return hasattr(torch.version, 'hip') and torch.version.hip is not None @@ -179,6 +181,12 @@ def _launch_daemonic_procs(self, num_procs): print("Ignoring reuse_dist_env for hpu") self.reuse_dist_env = False + global warn_reuse_dist_env + if self.reuse_dist_env and not warn_reuse_dist_env: + # Currently we see memory leak for tests that reuse distributed environment + print("Ignoring reuse_dist_env and forcibly setting it to False") + warn_reuse_dist_env = True + if self.reuse_dist_env: if num_procs not in self._pool_cache: self._pool_cache[num_procs] = mp.Pool(processes=num_procs) From c6768e3f8396c59d0135105a44e6f8ab7816fbcf Mon Sep 17 00:00:00 2001 From: Masahiro Tanaka Date: Sat, 12 Oct 2024 06:31:02 +0000 Subject: [PATCH 2/2] set reuse_dist_env to False --- tests/unit/common.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/unit/common.py b/tests/unit/common.py index eacb7bc85e4a..69ba4c2708ac 100644 --- a/tests/unit/common.py +++ b/tests/unit/common.py @@ -186,6 +186,7 @@ def _launch_daemonic_procs(self, num_procs): # Currently we see memory leak for tests that reuse distributed environment print("Ignoring reuse_dist_env and forcibly setting it to False") warn_reuse_dist_env = True + self.reuse_dist_env = False if self.reuse_dist_env: if num_procs not in self._pool_cache: