From 7714b84ffdcd5f58c594ecd95db6737e90cdfdb4 Mon Sep 17 00:00:00 2001 From: Anyang Peng <137014849+anyangml@users.noreply.github.com> Date: Wed, 20 May 2026 18:19:59 +0800 Subject: [PATCH 01/19] fix: try remove memory footprint --- deepmd/pt_expt/train/training.py | 28 +++++++++++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-) diff --git a/deepmd/pt_expt/train/training.py b/deepmd/pt_expt/train/training.py index 5692b019cd..7bbc9c3c17 100644 --- a/deepmd/pt_expt/train/training.py +++ b/deepmd/pt_expt/train/training.py @@ -288,6 +288,13 @@ def _expand(t: torch.Tensor | None) -> torch.Tensor | None: decomposition_table=decomp_table, )(ext_coord, ext_atype, nlist, mapping, fparam, aparam) + # make_fx has captured the graph; input tensors are no longer needed. + del ext_coord, ext_atype, nlist, mapping + if fparam is not None: + del fparam + if aparam is not None: + del aparam + # make_fx inserts aten.detach.default for saved tensors used in the # decomposed autograd.grad backward ops. These detach nodes break # second-order gradient flow (d(force)/d(params) for force training). @@ -316,12 +323,16 @@ def _expand(t: torch.Tensor | None) -> torch.Tensor | None: if compile_opts: inductor_options.update(compile_opts) - return torch.compile( + compiled = torch.compile( traced_lower, backend="inductor", dynamic=True, options=inductor_options, ) + del traced_lower + if torch.cuda.is_available(): + torch.cuda.empty_cache() + return compiled class _CompiledModel(torch.nn.Module): @@ -994,6 +1005,21 @@ def _compile_model(self, compile_opts: dict[str, Any]) -> None: ) wrapper_mod.model[task_key] = _CompiledModel(model, compiled_lower) + + # Release all intermediate tensors built for this task so they don't + # accumulate across tasks in multi-task scenarios. + del ext_coord, ext_atype, mapping, nlist_t + del coord, atype, coord_3d, coord_norm + if box is not None: + del box, box_flat + if fparam is not None: + del fparam + if aparam is not None: + del aparam + del inp + if torch.cuda.is_available(): + torch.cuda.empty_cache() + log.info( "Model compiled (task=%s, tracing_mode=symbolic, " "dynamic=True, backend=inductor).", From ee413c3c075266fa7843eaa28cc5c302ccc89d5d Mon Sep 17 00:00:00 2001 From: Anyang Peng <137014849+anyangml@users.noreply.github.com> Date: Wed, 20 May 2026 19:33:49 +0800 Subject: [PATCH 02/19] Potential fix for pull request finding Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com> Signed-off-by: Anyang Peng <137014849+anyangml@users.noreply.github.com> --- deepmd/pt_expt/train/training.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/deepmd/pt_expt/train/training.py b/deepmd/pt_expt/train/training.py index 7bbc9c3c17..4b29afb162 100644 --- a/deepmd/pt_expt/train/training.py +++ b/deepmd/pt_expt/train/training.py @@ -330,7 +330,14 @@ def _expand(t: torch.Tensor | None) -> torch.Tensor | None: options=inductor_options, ) del traced_lower - if torch.cuda.is_available(): + model_uses_cuda = any(param.is_cuda for param in model.parameters()) or any( + buffer.is_cuda for buffer in model.buffers() + ) + if ( + model_uses_cuda + and torch.cuda.is_available() + and torch.cuda.is_initialized() + ): torch.cuda.empty_cache() return compiled From eb239ef726371c81c33b0a95352eb86a0f488e00 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 20 May 2026 11:34:30 +0000 Subject: [PATCH 03/19] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- deepmd/pt_expt/train/training.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/deepmd/pt_expt/train/training.py b/deepmd/pt_expt/train/training.py index 4b29afb162..14205a0d6d 100644 --- a/deepmd/pt_expt/train/training.py +++ b/deepmd/pt_expt/train/training.py @@ -333,11 +333,7 @@ def _expand(t: torch.Tensor | None) -> torch.Tensor | None: model_uses_cuda = any(param.is_cuda for param in model.parameters()) or any( buffer.is_cuda for buffer in model.buffers() ) - if ( - model_uses_cuda - and torch.cuda.is_available() - and torch.cuda.is_initialized() - ): + if model_uses_cuda and torch.cuda.is_available() and torch.cuda.is_initialized(): torch.cuda.empty_cache() return compiled From c7d9f57a6381cc2c3e2e857ce46b1b7342f5a028 Mon Sep 17 00:00:00 2001 From: Anyang Peng <137014849+anyangml@users.noreply.github.com> Date: Wed, 20 May 2026 19:35:46 +0800 Subject: [PATCH 04/19] fix: comment --- deepmd/pt_expt/train/training.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deepmd/pt_expt/train/training.py b/deepmd/pt_expt/train/training.py index 14205a0d6d..d3d135936f 100644 --- a/deepmd/pt_expt/train/training.py +++ b/deepmd/pt_expt/train/training.py @@ -1020,7 +1020,7 @@ def _compile_model(self, compile_opts: dict[str, Any]) -> None: if aparam is not None: del aparam del inp - if torch.cuda.is_available(): + if torch.cuda.is_initialized(): torch.cuda.empty_cache() log.info( From 2a105324804509ca18faeeff5810addf4a581a57 Mon Sep 17 00:00:00 2001 From: Anyang Peng <137014849+anyangml@users.noreply.github.com> Date: Wed, 20 May 2026 20:00:49 +0800 Subject: [PATCH 05/19] Potential fix for pull request finding Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com> Signed-off-by: Anyang Peng <137014849+anyangml@users.noreply.github.com> --- deepmd/pt_expt/train/training.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deepmd/pt_expt/train/training.py b/deepmd/pt_expt/train/training.py index d3d135936f..ac3c79fe41 100644 --- a/deepmd/pt_expt/train/training.py +++ b/deepmd/pt_expt/train/training.py @@ -1020,7 +1020,7 @@ def _compile_model(self, compile_opts: dict[str, Any]) -> None: if aparam is not None: del aparam del inp - if torch.cuda.is_initialized(): + if DEVICE.type == "cuda" and torch.cuda.is_initialized(): torch.cuda.empty_cache() log.info( From 8b7584a061c37bb7d74ccb7447e9e7f6bca8738c Mon Sep 17 00:00:00 2001 From: Anyang Peng <137014849+anyangml@users.noreply.github.com> Date: Thu, 21 May 2026 10:47:17 +0800 Subject: [PATCH 06/19] fix: remove graph --- deepmd/pt_expt/train/training.py | 44 +++++++++++++++++++++++++------- 1 file changed, 35 insertions(+), 9 deletions(-) diff --git a/deepmd/pt_expt/train/training.py b/deepmd/pt_expt/train/training.py index ac3c79fe41..ef67d04bb5 100644 --- a/deepmd/pt_expt/train/training.py +++ b/deepmd/pt_expt/train/training.py @@ -1019,7 +1019,7 @@ def _compile_model(self, compile_opts: dict[str, Any]) -> None: del fparam if aparam is not None: del aparam - del inp + del inp, _ if DEVICE.type == "cuda" and torch.cuda.is_initialized(): torch.cuda.empty_cache() @@ -1205,7 +1205,9 @@ def run(self) -> None: if self.rank == 0: if not self.multi_task: train_results = { - k: v for k, v in more_loss.items() if "l2_" not in k + k: (v.item() if isinstance(v, torch.Tensor) else v) + for k, v in more_loss.items() + if "l2_" not in k } # validation @@ -1225,9 +1227,13 @@ def run(self) -> None: sum_natoms += natoms for k, v in _vmore.items(): if "l2_" not in k: - valid_results[k] = ( - valid_results.get(k, 0.0) + v * natoms - ) + valid_results[k] = valid_results.get( + k, 0.0 + ) + ( + v.item() + if isinstance(v, torch.Tensor) + else v + ) * natoms if sum_natoms > 0: valid_results = { k: v / sum_natoms for k, v in valid_results.items() @@ -1239,13 +1245,15 @@ def run(self) -> None: # current task already has loss train_results[task_key] = { - k: v for k, v in more_loss.items() if "l2_" not in k + k: (v.item() if isinstance(v, torch.Tensor) else v) + for k, v in more_loss.items() + if "l2_" not in k } # compute loss for other tasks for _key in self.model_keys: if _key != task_key: - self.optimizer.zero_grad() + self.optimizer.zero_grad(set_to_none=True) _inp, _lab = self.get_data(is_train=True, task_key=_key) _, _loss, _more = self._unwrapped( **_inp, @@ -1253,9 +1261,23 @@ def run(self) -> None: label=_lab, task_key=_key, ) + # Use .item() so the backward graph (and its + # saved activations) can be freed immediately. + # Display passes never call loss.backward(), so + # without this the computation graphs for all + # tasks accumulate simultaneously in GPU memory. train_results[_key] = { - k: v for k, v in _more.items() if "l2_" not in k + k: ( + v.item() + if isinstance(v, torch.Tensor) + else v + ) + for k, v in _more.items() + if "l2_" not in k } + del _loss, _more, _inp, _lab + if torch.cuda.is_available() and torch.cuda.is_initialized(): + torch.cuda.empty_cache() # validation for each task _vdata = self.validation_data[_key] @@ -1278,7 +1300,11 @@ def run(self) -> None: _sum_natoms += natoms for k, v in _vmore.items(): if "l2_" not in k: - _vres[k] = _vres.get(k, 0.0) + v * natoms + _vres[k] = _vres.get(k, 0.0) + ( + v.item() + if isinstance(v, torch.Tensor) + else v + ) * natoms if _sum_natoms > 0: _vres = { k: v / _sum_natoms for k, v in _vres.items() From 87e4e469d476ef5092ff2a9bb8295ca7ee13840f Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 21 May 2026 02:48:13 +0000 Subject: [PATCH 07/19] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- deepmd/pt_expt/train/training.py | 41 ++++++++++++++++++-------------- 1 file changed, 23 insertions(+), 18 deletions(-) diff --git a/deepmd/pt_expt/train/training.py b/deepmd/pt_expt/train/training.py index ef67d04bb5..647ac2c624 100644 --- a/deepmd/pt_expt/train/training.py +++ b/deepmd/pt_expt/train/training.py @@ -1227,13 +1227,15 @@ def run(self) -> None: sum_natoms += natoms for k, v in _vmore.items(): if "l2_" not in k: - valid_results[k] = valid_results.get( - k, 0.0 - ) + ( - v.item() - if isinstance(v, torch.Tensor) - else v - ) * natoms + valid_results[k] = ( + valid_results.get(k, 0.0) + + ( + v.item() + if isinstance(v, torch.Tensor) + else v + ) + * natoms + ) if sum_natoms > 0: valid_results = { k: v / sum_natoms for k, v in valid_results.items() @@ -1267,16 +1269,15 @@ def run(self) -> None: # without this the computation graphs for all # tasks accumulate simultaneously in GPU memory. train_results[_key] = { - k: ( - v.item() - if isinstance(v, torch.Tensor) - else v - ) + k: (v.item() if isinstance(v, torch.Tensor) else v) for k, v in _more.items() if "l2_" not in k } del _loss, _more, _inp, _lab - if torch.cuda.is_available() and torch.cuda.is_initialized(): + if ( + torch.cuda.is_available() + and torch.cuda.is_initialized() + ): torch.cuda.empty_cache() # validation for each task @@ -1300,11 +1301,15 @@ def run(self) -> None: _sum_natoms += natoms for k, v in _vmore.items(): if "l2_" not in k: - _vres[k] = _vres.get(k, 0.0) + ( - v.item() - if isinstance(v, torch.Tensor) - else v - ) * natoms + _vres[k] = ( + _vres.get(k, 0.0) + + ( + v.item() + if isinstance(v, torch.Tensor) + else v + ) + * natoms + ) if _sum_natoms > 0: _vres = { k: v / _sum_natoms for k, v in _vres.items() From 4ffc15a19765dfc9489a2583b24a89855844fbf4 Mon Sep 17 00:00:00 2001 From: Anyang Peng <137014849+anyangml@users.noreply.github.com> Date: Thu, 21 May 2026 14:26:14 +0800 Subject: [PATCH 08/19] fix: lazy compile in multitask NCCL timeout --- deepmd/pt_expt/train/training.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/deepmd/pt_expt/train/training.py b/deepmd/pt_expt/train/training.py index 647ac2c624..7e39b8561e 100644 --- a/deepmd/pt_expt/train/training.py +++ b/deepmd/pt_expt/train/training.py @@ -1007,6 +1007,19 @@ def _compile_model(self, compile_opts: dict[str, Any]) -> None: compile_opts, ) + # torch.compile is lazy: inductor only compiles on the first + # call. In DDP multi-task training, different ranks may first + # hit a task at different training steps, so one rank can block + # inside inductor for minutes while others spin in AllReduce — + # causing an NCCL timeout. Warmup here, while sample inputs + # still exist, forces eager compilation before training starts. + _warmup_out = compiled_lower( + ext_coord, ext_atype, nlist_t, mapping, fparam, aparam + ) + del _warmup_out + if DEVICE.type == "cuda" and torch.cuda.is_initialized(): + torch.cuda.synchronize() + wrapper_mod.model[task_key] = _CompiledModel(model, compiled_lower) # Release all intermediate tensors built for this task so they don't @@ -1029,6 +1042,12 @@ def _compile_model(self, compile_opts: dict[str, Any]) -> None: task_key, ) + # All tasks compiled on this rank — wait for all ranks before + # training starts so no rank enters the training loop while another + # is still blocked in inductor compilation. + if self.is_distributed: + dist.barrier() + # ------------------------------------------------------------------ # Data helpers # ------------------------------------------------------------------ From 36d57a7e45d60aa48947143b2ab673fd7b1e2cc7 Mon Sep 17 00:00:00 2001 From: Anyang Peng <137014849+anyangml@users.noreply.github.com> Date: Thu, 21 May 2026 14:53:07 +0800 Subject: [PATCH 09/19] fix: mark variable-size dimensions as dynamic to prevent NCCL timeout --- deepmd/pt_expt/train/training.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/deepmd/pt_expt/train/training.py b/deepmd/pt_expt/train/training.py index 7e39b8561e..65c57b777d 100644 --- a/deepmd/pt_expt/train/training.py +++ b/deepmd/pt_expt/train/training.py @@ -1013,6 +1013,20 @@ def _compile_model(self, compile_opts: dict[str, Any]) -> None: # inside inductor for minutes while others spin in AllReduce — # causing an NCCL timeout. Warmup here, while sample inputs # still exist, forces eager compilation before training starts. + # + # Mark variable-size dimensions as dynamic so Dynamo creates range + # guards rather than equality guards. Without this, each new value + # of nall or nloc in a training batch breaks the equality guard and + # triggers a full recompilation, which can stall one rank for + # minutes while others wait in a collective — causing NCCL timeout. + # ext_coord / ext_atype / mapping dim 1 = nall (ghost+real atoms) + # nlist_t dim 1 = nloc (real atoms only) + # Both vary per batch because system sizes differ across structures. + torch._dynamo.mark_dynamic(ext_coord, 1) # [nframes, nall, 3] + torch._dynamo.mark_dynamic(ext_atype, 1) # [nframes, nall] + torch._dynamo.mark_dynamic(nlist_t, 1) # [nframes, nloc, max_nnei] + if mapping.dim() >= 2: + torch._dynamo.mark_dynamic(mapping, 1) # [nframes, nall] _warmup_out = compiled_lower( ext_coord, ext_atype, nlist_t, mapping, fparam, aparam ) From 1978e3363d6ddfaad14a54d5d638ee26fa410dcf Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 21 May 2026 06:53:58 +0000 Subject: [PATCH 10/19] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- deepmd/pt_expt/train/training.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/deepmd/pt_expt/train/training.py b/deepmd/pt_expt/train/training.py index 65c57b777d..b696b08097 100644 --- a/deepmd/pt_expt/train/training.py +++ b/deepmd/pt_expt/train/training.py @@ -1022,9 +1022,9 @@ def _compile_model(self, compile_opts: dict[str, Any]) -> None: # ext_coord / ext_atype / mapping dim 1 = nall (ghost+real atoms) # nlist_t dim 1 = nloc (real atoms only) # Both vary per batch because system sizes differ across structures. - torch._dynamo.mark_dynamic(ext_coord, 1) # [nframes, nall, 3] - torch._dynamo.mark_dynamic(ext_atype, 1) # [nframes, nall] - torch._dynamo.mark_dynamic(nlist_t, 1) # [nframes, nloc, max_nnei] + torch._dynamo.mark_dynamic(ext_coord, 1) # [nframes, nall, 3] + torch._dynamo.mark_dynamic(ext_atype, 1) # [nframes, nall] + torch._dynamo.mark_dynamic(nlist_t, 1) # [nframes, nloc, max_nnei] if mapping.dim() >= 2: torch._dynamo.mark_dynamic(mapping, 1) # [nframes, nall] _warmup_out = compiled_lower( From 937b742b2ae1f77b42c4b6987dbf58a92cadcb6b Mon Sep 17 00:00:00 2001 From: Anyang Peng <137014849+anyangml@users.noreply.github.com> Date: Thu, 21 May 2026 16:54:28 +0800 Subject: [PATCH 11/19] fix: mark tensors as dynamic to prevent NCCL timeout during training --- deepmd/pt_expt/train/training.py | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/deepmd/pt_expt/train/training.py b/deepmd/pt_expt/train/training.py index b696b08097..e5ff2d86cc 100644 --- a/deepmd/pt_expt/train/training.py +++ b/deepmd/pt_expt/train/training.py @@ -394,6 +394,23 @@ def forward( ext_coord = ext_coord.reshape(nframes, -1, 3) ext_coord = ext_coord.detach().requires_grad_(True) + # Mark nall and nloc as dynamic on every call so Dynamo's guard always + # matches the warmup compilation (which was also built with mark_dynamic). + # Without this, training tensors are new Python objects that Dynamo sees + # as "fresh", triggering a second per-task compilation on first training + # use. With random task sampling across DDP ranks, that second + # compilation can happen at different times on different ranks, causing + # an NCCL timeout. + torch._dynamo.mark_dynamic(ext_coord, 1) # nall + torch._dynamo.mark_dynamic(ext_atype, 1) # nall + torch._dynamo.mark_dynamic(nlist, 1) # nloc + if mapping.dim() >= 2: + torch._dynamo.mark_dynamic(mapping, 1) # nall + if fparam is not None: + torch._dynamo.mark_dynamic(fparam, 0) # nframes (may differ per task) + if aparam is not None: + torch._dynamo.mark_dynamic(aparam, 1) # nloc + result = self.compiled_forward_lower( ext_coord, ext_atype, nlist, mapping, fparam, aparam ) @@ -1022,11 +1039,22 @@ def _compile_model(self, compile_opts: dict[str, Any]) -> None: # ext_coord / ext_atype / mapping dim 1 = nall (ghost+real atoms) # nlist_t dim 1 = nloc (real atoms only) # Both vary per batch because system sizes differ across structures. + # Match _CompiledModel.forward which sets requires_grad_(True) on + # ext_coord before calling compiled_forward_lower. Dynamo's guard + # includes requires_grad, so a mismatch here would cause every + # task's first training call to miss the warmup cache and trigger + # a new compilation — at a random time on each rank — creating the + # exact NCCL desync we are trying to prevent. + ext_coord = ext_coord.detach().requires_grad_(True) torch._dynamo.mark_dynamic(ext_coord, 1) # [nframes, nall, 3] torch._dynamo.mark_dynamic(ext_atype, 1) # [nframes, nall] torch._dynamo.mark_dynamic(nlist_t, 1) # [nframes, nloc, max_nnei] if mapping.dim() >= 2: torch._dynamo.mark_dynamic(mapping, 1) # [nframes, nall] + if fparam is not None: + torch._dynamo.mark_dynamic(fparam, 0) # [nframes, dim_fparam] + if aparam is not None: + torch._dynamo.mark_dynamic(aparam, 1) # [nframes, nloc, dim_aparam] _warmup_out = compiled_lower( ext_coord, ext_atype, nlist_t, mapping, fparam, aparam ) From 2ee8af155b93c8167448e99b5fe89d35eb8e4995 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 21 May 2026 08:55:21 +0000 Subject: [PATCH 12/19] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- deepmd/pt_expt/train/training.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/deepmd/pt_expt/train/training.py b/deepmd/pt_expt/train/training.py index e5ff2d86cc..3cba47c354 100644 --- a/deepmd/pt_expt/train/training.py +++ b/deepmd/pt_expt/train/training.py @@ -401,15 +401,15 @@ def forward( # use. With random task sampling across DDP ranks, that second # compilation can happen at different times on different ranks, causing # an NCCL timeout. - torch._dynamo.mark_dynamic(ext_coord, 1) # nall - torch._dynamo.mark_dynamic(ext_atype, 1) # nall - torch._dynamo.mark_dynamic(nlist, 1) # nloc + torch._dynamo.mark_dynamic(ext_coord, 1) # nall + torch._dynamo.mark_dynamic(ext_atype, 1) # nall + torch._dynamo.mark_dynamic(nlist, 1) # nloc if mapping.dim() >= 2: torch._dynamo.mark_dynamic(mapping, 1) # nall if fparam is not None: - torch._dynamo.mark_dynamic(fparam, 0) # nframes (may differ per task) + torch._dynamo.mark_dynamic(fparam, 0) # nframes (may differ per task) if aparam is not None: - torch._dynamo.mark_dynamic(aparam, 1) # nloc + torch._dynamo.mark_dynamic(aparam, 1) # nloc result = self.compiled_forward_lower( ext_coord, ext_atype, nlist, mapping, fparam, aparam @@ -1052,7 +1052,7 @@ def _compile_model(self, compile_opts: dict[str, Any]) -> None: if mapping.dim() >= 2: torch._dynamo.mark_dynamic(mapping, 1) # [nframes, nall] if fparam is not None: - torch._dynamo.mark_dynamic(fparam, 0) # [nframes, dim_fparam] + torch._dynamo.mark_dynamic(fparam, 0) # [nframes, dim_fparam] if aparam is not None: torch._dynamo.mark_dynamic(aparam, 1) # [nframes, nloc, dim_aparam] _warmup_out = compiled_lower( From c05542acf79f610a5f7331219b6d66ae1878452b Mon Sep 17 00:00:00 2001 From: Anyang Peng <137014849+anyangml@users.noreply.github.com> Date: Thu, 21 May 2026 18:03:20 +0800 Subject: [PATCH 13/19] fix: dynamic shape --- deepmd/pt_expt/train/training.py | 56 ++++++++++++++++++++------------ 1 file changed, 36 insertions(+), 20 deletions(-) diff --git a/deepmd/pt_expt/train/training.py b/deepmd/pt_expt/train/training.py index 3cba47c354..21500573f9 100644 --- a/deepmd/pt_expt/train/training.py +++ b/deepmd/pt_expt/train/training.py @@ -394,21 +394,27 @@ def forward( ext_coord = ext_coord.reshape(nframes, -1, 3) ext_coord = ext_coord.detach().requires_grad_(True) - # Mark nall and nloc as dynamic on every call so Dynamo's guard always - # matches the warmup compilation (which was also built with mark_dynamic). - # Without this, training tensors are new Python objects that Dynamo sees - # as "fresh", triggering a second per-task compilation on first training - # use. With random task sampling across DDP ranks, that second - # compilation can happen at different times on different ranks, causing - # an NCCL timeout. + # Mark nframes, nall, and nloc as dynamic on every call so Dynamo's + # guard always matches the warmup compilation. nframes varies when + # different systems have different per-system batch sizes. nall/nloc + # vary because system atom counts differ. Marking dim 0 (nframes) on + # ext_coord/ext_atype/nlist/mapping is required: without it Dynamo + # specialises ext_coord.shape[0] to the warmup value, propagates that + # constant into fparam.shape[0] via the reshape in general_fitting, and + # conflicts with mark_dynamic(fparam, 0) when nframes changes. + torch._dynamo.mark_dynamic(ext_coord, 0) # nframes torch._dynamo.mark_dynamic(ext_coord, 1) # nall + torch._dynamo.mark_dynamic(ext_atype, 0) # nframes torch._dynamo.mark_dynamic(ext_atype, 1) # nall + torch._dynamo.mark_dynamic(nlist, 0) # nframes torch._dynamo.mark_dynamic(nlist, 1) # nloc if mapping.dim() >= 2: + torch._dynamo.mark_dynamic(mapping, 0) # nframes torch._dynamo.mark_dynamic(mapping, 1) # nall if fparam is not None: - torch._dynamo.mark_dynamic(fparam, 0) # nframes (may differ per task) + torch._dynamo.mark_dynamic(fparam, 0) # nframes if aparam is not None: + torch._dynamo.mark_dynamic(aparam, 0) # nframes torch._dynamo.mark_dynamic(aparam, 1) # nloc result = self.compiled_forward_lower( @@ -1033,12 +1039,17 @@ def _compile_model(self, compile_opts: dict[str, Any]) -> None: # # Mark variable-size dimensions as dynamic so Dynamo creates range # guards rather than equality guards. Without this, each new value - # of nall or nloc in a training batch breaks the equality guard and - # triggers a full recompilation, which can stall one rank for - # minutes while others wait in a collective — causing NCCL timeout. - # ext_coord / ext_atype / mapping dim 1 = nall (ghost+real atoms) - # nlist_t dim 1 = nloc (real atoms only) - # Both vary per batch because system sizes differ across structures. + # of nall, nloc, or nframes in a training batch breaks the equality + # guard and triggers a full recompilation, which can stall one rank + # for minutes while others wait in a collective — causing NCCL timeout. + # ext_coord / ext_atype / mapping dim 0 = nframes, dim 1 = nall + # nlist_t dim 0 = nframes, dim 1 = nloc + # nframes varies across batches when different systems have different + # per-system batch sizes. nall/nloc vary because system atom counts + # differ. fparam/aparam share dim 0 = nframes with ext_coord; if + # ext_coord dim 0 is not marked dynamic, Dynamo specialises it to the + # warmup value and propagates that constant into fparam's shape via + # the reshape in general_fitting, conflicting with mark_dynamic(fparam,0). # Match _CompiledModel.forward which sets requires_grad_(True) on # ext_coord before calling compiled_forward_lower. Dynamo's guard # includes requires_grad, so a mismatch here would cause every @@ -1046,15 +1057,20 @@ def _compile_model(self, compile_opts: dict[str, Any]) -> None: # a new compilation — at a random time on each rank — creating the # exact NCCL desync we are trying to prevent. ext_coord = ext_coord.detach().requires_grad_(True) - torch._dynamo.mark_dynamic(ext_coord, 1) # [nframes, nall, 3] - torch._dynamo.mark_dynamic(ext_atype, 1) # [nframes, nall] - torch._dynamo.mark_dynamic(nlist_t, 1) # [nframes, nloc, max_nnei] + torch._dynamo.mark_dynamic(ext_coord, 0) # nframes + torch._dynamo.mark_dynamic(ext_coord, 1) # nall + torch._dynamo.mark_dynamic(ext_atype, 0) # nframes + torch._dynamo.mark_dynamic(ext_atype, 1) # nall + torch._dynamo.mark_dynamic(nlist_t, 0) # nframes + torch._dynamo.mark_dynamic(nlist_t, 1) # nloc if mapping.dim() >= 2: - torch._dynamo.mark_dynamic(mapping, 1) # [nframes, nall] + torch._dynamo.mark_dynamic(mapping, 0) # nframes + torch._dynamo.mark_dynamic(mapping, 1) # nall if fparam is not None: - torch._dynamo.mark_dynamic(fparam, 0) # [nframes, dim_fparam] + torch._dynamo.mark_dynamic(fparam, 0) # nframes if aparam is not None: - torch._dynamo.mark_dynamic(aparam, 1) # [nframes, nloc, dim_aparam] + torch._dynamo.mark_dynamic(aparam, 0) # nframes + torch._dynamo.mark_dynamic(aparam, 1) # nloc _warmup_out = compiled_lower( ext_coord, ext_atype, nlist_t, mapping, fparam, aparam ) From 697b24f933af1395b0f751c6c445ef73f0a8af8a Mon Sep 17 00:00:00 2001 From: Anyang Peng <137014849+anyangml@users.noreply.github.com> Date: Fri, 22 May 2026 13:52:07 +0800 Subject: [PATCH 14/19] chore: revert extra mark_dynamic --- deepmd/pt_expt/train/training.py | 57 ++------------------------------ 1 file changed, 2 insertions(+), 55 deletions(-) diff --git a/deepmd/pt_expt/train/training.py b/deepmd/pt_expt/train/training.py index 21500573f9..2203f877e9 100644 --- a/deepmd/pt_expt/train/training.py +++ b/deepmd/pt_expt/train/training.py @@ -394,29 +394,6 @@ def forward( ext_coord = ext_coord.reshape(nframes, -1, 3) ext_coord = ext_coord.detach().requires_grad_(True) - # Mark nframes, nall, and nloc as dynamic on every call so Dynamo's - # guard always matches the warmup compilation. nframes varies when - # different systems have different per-system batch sizes. nall/nloc - # vary because system atom counts differ. Marking dim 0 (nframes) on - # ext_coord/ext_atype/nlist/mapping is required: without it Dynamo - # specialises ext_coord.shape[0] to the warmup value, propagates that - # constant into fparam.shape[0] via the reshape in general_fitting, and - # conflicts with mark_dynamic(fparam, 0) when nframes changes. - torch._dynamo.mark_dynamic(ext_coord, 0) # nframes - torch._dynamo.mark_dynamic(ext_coord, 1) # nall - torch._dynamo.mark_dynamic(ext_atype, 0) # nframes - torch._dynamo.mark_dynamic(ext_atype, 1) # nall - torch._dynamo.mark_dynamic(nlist, 0) # nframes - torch._dynamo.mark_dynamic(nlist, 1) # nloc - if mapping.dim() >= 2: - torch._dynamo.mark_dynamic(mapping, 0) # nframes - torch._dynamo.mark_dynamic(mapping, 1) # nall - if fparam is not None: - torch._dynamo.mark_dynamic(fparam, 0) # nframes - if aparam is not None: - torch._dynamo.mark_dynamic(aparam, 0) # nframes - torch._dynamo.mark_dynamic(aparam, 1) # nloc - result = self.compiled_forward_lower( ext_coord, ext_atype, nlist, mapping, fparam, aparam ) @@ -1037,40 +1014,10 @@ def _compile_model(self, compile_opts: dict[str, Any]) -> None: # causing an NCCL timeout. Warmup here, while sample inputs # still exist, forces eager compilation before training starts. # - # Mark variable-size dimensions as dynamic so Dynamo creates range - # guards rather than equality guards. Without this, each new value - # of nall, nloc, or nframes in a training batch breaks the equality - # guard and triggers a full recompilation, which can stall one rank - # for minutes while others wait in a collective — causing NCCL timeout. - # ext_coord / ext_atype / mapping dim 0 = nframes, dim 1 = nall - # nlist_t dim 0 = nframes, dim 1 = nloc - # nframes varies across batches when different systems have different - # per-system batch sizes. nall/nloc vary because system atom counts - # differ. fparam/aparam share dim 0 = nframes with ext_coord; if - # ext_coord dim 0 is not marked dynamic, Dynamo specialises it to the - # warmup value and propagates that constant into fparam's shape via - # the reshape in general_fitting, conflicting with mark_dynamic(fparam,0). # Match _CompiledModel.forward which sets requires_grad_(True) on - # ext_coord before calling compiled_forward_lower. Dynamo's guard - # includes requires_grad, so a mismatch here would cause every - # task's first training call to miss the warmup cache and trigger - # a new compilation — at a random time on each rank — creating the - # exact NCCL desync we are trying to prevent. + # ext_coord: Dynamo's guard includes requires_grad, so a mismatch + # causes every task's first training call to miss the warmup cache. ext_coord = ext_coord.detach().requires_grad_(True) - torch._dynamo.mark_dynamic(ext_coord, 0) # nframes - torch._dynamo.mark_dynamic(ext_coord, 1) # nall - torch._dynamo.mark_dynamic(ext_atype, 0) # nframes - torch._dynamo.mark_dynamic(ext_atype, 1) # nall - torch._dynamo.mark_dynamic(nlist_t, 0) # nframes - torch._dynamo.mark_dynamic(nlist_t, 1) # nloc - if mapping.dim() >= 2: - torch._dynamo.mark_dynamic(mapping, 0) # nframes - torch._dynamo.mark_dynamic(mapping, 1) # nall - if fparam is not None: - torch._dynamo.mark_dynamic(fparam, 0) # nframes - if aparam is not None: - torch._dynamo.mark_dynamic(aparam, 0) # nframes - torch._dynamo.mark_dynamic(aparam, 1) # nloc _warmup_out = compiled_lower( ext_coord, ext_atype, nlist_t, mapping, fparam, aparam ) From c27c95d64167d4698b8c62bc9bb0c4fbfb1fe866 Mon Sep 17 00:00:00 2001 From: Anyang Peng <137014849+anyangml@users.noreply.github.com> Date: Fri, 22 May 2026 15:49:12 +0800 Subject: [PATCH 15/19] fix: prevent GC error --- deepmd/pt_expt/train/training.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/deepmd/pt_expt/train/training.py b/deepmd/pt_expt/train/training.py index 2203f877e9..733bd2c88a 100644 --- a/deepmd/pt_expt/train/training.py +++ b/deepmd/pt_expt/train/training.py @@ -329,6 +329,13 @@ def _expand(t: torch.Tensor | None) -> torch.Tensor | None: dynamic=True, options=inductor_options, ) + # Keep the traced FX graph alive as long as the compiled callable. + # _remove_detach_nodes makes saved activations alias the graph's symbolic + # tensors; if the FX graph is GC'd, its SymInt shape objects lose their + # Python references while C++ view metadata still holds raw pointers to + # them — causing apply_view_meta_sequence to read garbage (crash at + # random training steps, earlier under higher GC pressure from many tasks). + compiled._traced_lower_ref = traced_lower del traced_lower model_uses_cuda = any(param.is_cuda for param in model.parameters()) or any( buffer.is_cuda for buffer in model.buffers() From b5236de8f66a2b54e462b8f64734e7c0ffe68ec8 Mon Sep 17 00:00:00 2001 From: Anyang Peng <137014849+anyangml@users.noreply.github.com> Date: Fri, 22 May 2026 16:34:52 +0800 Subject: [PATCH 16/19] fix:recursion --- deepmd/pt_expt/train/training.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/deepmd/pt_expt/train/training.py b/deepmd/pt_expt/train/training.py index 733bd2c88a..5377b43044 100644 --- a/deepmd/pt_expt/train/training.py +++ b/deepmd/pt_expt/train/training.py @@ -335,7 +335,11 @@ def _expand(t: torch.Tensor | None) -> torch.Tensor | None: # Python references while C++ view metadata still holds raw pointers to # them — causing apply_view_meta_sequence to read garbage (crash at # random training steps, earlier under higher GC pressure from many tasks). - compiled._traced_lower_ref = traced_lower + # Use object.__setattr__ to bypass nn.Module.__setattr__: traced_lower is + # an nn.Module, and normal assignment would register it as a submodule of + # compiled (also an nn.Module), creating a cycle in the module tree that + # causes RecursionError in trainer.wrapper.train(). + object.__setattr__(compiled, "_traced_lower_ref", traced_lower) del traced_lower model_uses_cuda = any(param.is_cuda for param in model.parameters()) or any( buffer.is_cuda for buffer in model.buffers() From 2377b1a438f14b70821e212e5b36e480398f9361 Mon Sep 17 00:00:00 2001 From: Anyang Peng <137014849+anyangml@users.noreply.github.com> Date: Mon, 25 May 2026 09:43:25 +0800 Subject: [PATCH 17/19] fix: try fix remove_detach_node --- deepmd/pt_expt/train/training.py | 60 +++++++++++--------------------- 1 file changed, 21 insertions(+), 39 deletions(-) diff --git a/deepmd/pt_expt/train/training.py b/deepmd/pt_expt/train/training.py index 5377b43044..fe5bd28362 100644 --- a/deepmd/pt_expt/train/training.py +++ b/deepmd/pt_expt/train/training.py @@ -147,23 +147,30 @@ def get_additional_data_requirement(_model: Any) -> list[DataRequirementItem]: def _remove_detach_nodes(gm: torch.fx.GraphModule) -> None: - """Remove ``aten.detach.default`` nodes from an FX graph in-place. - - ``make_fx`` inserts these nodes when recording saved tensors from the - autograd backward pass (``autograd.grad`` with ``create_graph=True``). - The detach breaks the gradient connection between saved activations and - model parameters, causing incorrect second-order derivatives — e.g. - bias gradients become zero for force-loss training. - - Removing these nodes restores the gradient path so that higher-order - derivatives flow correctly through the decomposed backward ops. + """Replace ``aten.detach.default`` nodes with ``aten.clone.default``. + + ``make_fx`` inserts detach nodes for saved tensors in the decomposed + autograd backward. The detach breaks the gradient path from saved + activations back to model parameters, causing incorrect second-order + derivatives (e.g. bias gradients become zero for force-loss training). + + We replace detach with clone rather than erasing the node entirely. + Erasing makes the output alias the input — AOT autograd detects the + alias and stores SymInt shape values as raw pointers in a C++ + ``view_meta_sequence``. When Python GC later collects those SymInt + objects the pointers dangle, producing a crash of the form + ``shape '[139...008, ...]' is invalid for input of size N``. + Clone breaks the alias so no ``view_meta_sequence`` is generated. """ graph = gm.graph for node in list(graph.nodes): if node.op == "call_function" and node.target == torch.ops.aten.detach.default: - input_node = node.args[0] - node.replace_all_uses_with(input_node) - graph.erase_node(node) + # Replace detach with clone to break the input-output alias. + # Alias-free outputs mean AOT autograd never writes SymInt raw + # pointers into C++ view_meta_sequence, so GC of SymInt objects + # cannot produce dangling pointers and apply_view_meta_sequence + # crashes (shape '[139...008, ...]' is invalid for input ...). + node.target = torch.ops.aten.clone.default graph.lint() gm.recompile() @@ -323,30 +330,12 @@ def _expand(t: torch.Tensor | None) -> torch.Tensor | None: if compile_opts: inductor_options.update(compile_opts) - compiled = torch.compile( + return torch.compile( traced_lower, backend="inductor", dynamic=True, options=inductor_options, ) - # Keep the traced FX graph alive as long as the compiled callable. - # _remove_detach_nodes makes saved activations alias the graph's symbolic - # tensors; if the FX graph is GC'd, its SymInt shape objects lose their - # Python references while C++ view metadata still holds raw pointers to - # them — causing apply_view_meta_sequence to read garbage (crash at - # random training steps, earlier under higher GC pressure from many tasks). - # Use object.__setattr__ to bypass nn.Module.__setattr__: traced_lower is - # an nn.Module, and normal assignment would register it as a submodule of - # compiled (also an nn.Module), creating a cycle in the module tree that - # causes RecursionError in trainer.wrapper.train(). - object.__setattr__(compiled, "_traced_lower_ref", traced_lower) - del traced_lower - model_uses_cuda = any(param.is_cuda for param in model.parameters()) or any( - buffer.is_cuda for buffer in model.buffers() - ) - if model_uses_cuda and torch.cuda.is_available() and torch.cuda.is_initialized(): - torch.cuda.empty_cache() - return compiled class _CompiledModel(torch.nn.Module): @@ -1049,8 +1038,6 @@ def _compile_model(self, compile_opts: dict[str, Any]) -> None: if aparam is not None: del aparam del inp, _ - if DEVICE.type == "cuda" and torch.cuda.is_initialized(): - torch.cuda.empty_cache() log.info( "Model compiled (task=%s, tracing_mode=symbolic, " @@ -1309,11 +1296,6 @@ def run(self) -> None: if "l2_" not in k } del _loss, _more, _inp, _lab - if ( - torch.cuda.is_available() - and torch.cuda.is_initialized() - ): - torch.cuda.empty_cache() # validation for each task _vdata = self.validation_data[_key] From 211608bdad7a2a760169d7d7f1f32d0c1efbdb99 Mon Sep 17 00:00:00 2001 From: Anyang Peng <137014849+anyangml@users.noreply.github.com> Date: Mon, 25 May 2026 12:00:10 +0800 Subject: [PATCH 18/19] fix: add new charge_spin para --- deepmd/pt_expt/train/training.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/deepmd/pt_expt/train/training.py b/deepmd/pt_expt/train/training.py index 003e1049a5..8e8e29b59e 100644 --- a/deepmd/pt_expt/train/training.py +++ b/deepmd/pt_expt/train/training.py @@ -1045,7 +1045,7 @@ def _compile_model(self, compile_opts: dict[str, Any]) -> None: # causes every task's first training call to miss the warmup cache. ext_coord = ext_coord.detach().requires_grad_(True) _warmup_out = compiled_lower( - ext_coord, ext_atype, nlist_t, mapping, fparam, aparam + ext_coord, ext_atype, nlist_t, mapping, fparam, aparam, charge_spin ) del _warmup_out if DEVICE.type == "cuda" and torch.cuda.is_initialized(): @@ -1063,6 +1063,8 @@ def _compile_model(self, compile_opts: dict[str, Any]) -> None: del fparam if aparam is not None: del aparam + if charge_spin is not None: + del charge_spin del inp, _ log.info( From ec7296ca87800aec33efa5a738462d4fabb22e3b Mon Sep 17 00:00:00 2001 From: Anyang Peng <137014849+anyangml@users.noreply.github.com> Date: Mon, 25 May 2026 13:10:47 +0800 Subject: [PATCH 19/19] fix: comment --- deepmd/pt_expt/train/training.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/deepmd/pt_expt/train/training.py b/deepmd/pt_expt/train/training.py index 8e8e29b59e..2d5d47ea78 100644 --- a/deepmd/pt_expt/train/training.py +++ b/deepmd/pt_expt/train/training.py @@ -324,6 +324,8 @@ def _expand(t: torch.Tensor | None) -> torch.Tensor | None: del fparam if aparam is not None: del aparam + if charge_spin is not None: + del charge_spin # make_fx inserts aten.detach.default for saved tensors used in the # decomposed autograd.grad backward ops. These detach nodes break @@ -1048,7 +1050,7 @@ def _compile_model(self, compile_opts: dict[str, Any]) -> None: ext_coord, ext_atype, nlist_t, mapping, fparam, aparam, charge_spin ) del _warmup_out - if DEVICE.type == "cuda" and torch.cuda.is_initialized(): + if DEVICE.type == "cuda": torch.cuda.synchronize() wrapper_mod.model[task_key] = _CompiledModel(model, compiled_lower)