From 7714b84ffdcd5f58c594ecd95db6737e90cdfdb4 Mon Sep 17 00:00:00 2001
From: Anyang Peng <137014849+anyangml@users.noreply.github.com>
Date: Wed, 20 May 2026 18:19:59 +0800
Subject: [PATCH 01/19] fix: try to reduce memory footprint

---
 deepmd/pt_expt/train/training.py | 28 +++++++++++++++++++++++++++-
 1 file changed, 27 insertions(+), 1 deletion(-)

diff --git a/deepmd/pt_expt/train/training.py b/deepmd/pt_expt/train/training.py
index 5692b019cd..7bbc9c3c17 100644
--- a/deepmd/pt_expt/train/training.py
+++ b/deepmd/pt_expt/train/training.py
@@ -288,6 +288,13 @@ def _expand(t: torch.Tensor | None) -> torch.Tensor | None:
         decomposition_table=decomp_table,
     )(ext_coord, ext_atype, nlist, mapping, fparam, aparam)
 
+    # make_fx has captured the graph; input tensors are no longer needed.
+    del ext_coord, ext_atype, nlist, mapping
+    if fparam is not None:
+        del fparam
+    if aparam is not None:
+        del aparam
+
     # make_fx inserts aten.detach.default for saved tensors used in the
     # decomposed autograd.grad backward ops.  These detach nodes break
     # second-order gradient flow (d(force)/d(params) for force training).
@@ -316,12 +323,16 @@ def _expand(t: torch.Tensor | None) -> torch.Tensor | None:
     if compile_opts:
         inductor_options.update(compile_opts)
 
-    return torch.compile(
+    compiled = torch.compile(
         traced_lower,
         backend="inductor",
         dynamic=True,
         options=inductor_options,
     )
+    del traced_lower
+    if torch.cuda.is_available():
+        torch.cuda.empty_cache()
+    return compiled
 
 
 class _CompiledModel(torch.nn.Module):
@@ -994,6 +1005,21 @@ def _compile_model(self, compile_opts: dict[str, Any]) -> None:
             )
 
             wrapper_mod.model[task_key] = _CompiledModel(model, compiled_lower)
+
+            # Release all intermediate tensors built for this task so they don't
+            # accumulate across tasks in multi-task scenarios.
+            del ext_coord, ext_atype, mapping, nlist_t
+            del coord, atype, coord_3d, coord_norm
+            if box is not None:
+                del box, box_flat
+            if fparam is not None:
+                del fparam
+            if aparam is not None:
+                del aparam
+            del inp
+            if torch.cuda.is_available():
+                torch.cuda.empty_cache()
+
             log.info(
                 "Model compiled (task=%s, tracing_mode=symbolic, "
                 "dynamic=True, backend=inductor).",

From ee413c3c075266fa7843eaa28cc5c302ccc89d5d Mon Sep 17 00:00:00 2001
From: Anyang Peng <137014849+anyangml@users.noreply.github.com>
Date: Wed, 20 May 2026 19:33:49 +0800
Subject: [PATCH 02/19] Potential fix for pull request finding

Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com>
Signed-off-by: Anyang Peng <137014849+anyangml@users.noreply.github.com>
---
 deepmd/pt_expt/train/training.py | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/deepmd/pt_expt/train/training.py b/deepmd/pt_expt/train/training.py
index 7bbc9c3c17..4b29afb162 100644
--- a/deepmd/pt_expt/train/training.py
+++ b/deepmd/pt_expt/train/training.py
@@ -330,7 +330,14 @@ def _expand(t: torch.Tensor | None) -> torch.Tensor | None:
         options=inductor_options,
     )
     del traced_lower
-    if torch.cuda.is_available():
+    model_uses_cuda = any(param.is_cuda for param in model.parameters()) or any(
+        buffer.is_cuda for buffer in model.buffers()
+    )
+    if (
+        model_uses_cuda
+        and torch.cuda.is_available()
+        and torch.cuda.is_initialized()
+    ):
         torch.cuda.empty_cache()
     return compiled
 

From eb239ef726371c81c33b0a95352eb86a0f488e00 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Wed, 20 May 2026 11:34:30 +0000
Subject: [PATCH 03/19] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 deepmd/pt_expt/train/training.py | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/deepmd/pt_expt/train/training.py b/deepmd/pt_expt/train/training.py
index 4b29afb162..14205a0d6d 100644
--- a/deepmd/pt_expt/train/training.py
+++ b/deepmd/pt_expt/train/training.py
@@ -333,11 +333,7 @@ def _expand(t: torch.Tensor | None) -> torch.Tensor | None:
     model_uses_cuda = any(param.is_cuda for param in model.parameters()) or any(
         buffer.is_cuda for buffer in model.buffers()
     )
-    if (
-        model_uses_cuda
-        and torch.cuda.is_available()
-        and torch.cuda.is_initialized()
-    ):
+    if model_uses_cuda and torch.cuda.is_available() and torch.cuda.is_initialized():
         torch.cuda.empty_cache()
     return compiled
 

From c7d9f57a6381cc2c3e2e857ce46b1b7342f5a028 Mon Sep 17 00:00:00 2001
From: Anyang Peng <137014849+anyangml@users.noreply.github.com>
Date: Wed, 20 May 2026 19:35:46 +0800
Subject: [PATCH 04/19] fix: guard empty_cache with cuda.is_initialized

---
 deepmd/pt_expt/train/training.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/deepmd/pt_expt/train/training.py b/deepmd/pt_expt/train/training.py
index 14205a0d6d..d3d135936f 100644
--- a/deepmd/pt_expt/train/training.py
+++ b/deepmd/pt_expt/train/training.py
@@ -1020,7 +1020,7 @@ def _compile_model(self, compile_opts: dict[str, Any]) -> None:
             if aparam is not None:
                 del aparam
             del inp
-            if torch.cuda.is_available():
+            if torch.cuda.is_initialized():
                 torch.cuda.empty_cache()
 
             log.info(

From 2a105324804509ca18faeeff5810addf4a581a57 Mon Sep 17 00:00:00 2001
From: Anyang Peng <137014849+anyangml@users.noreply.github.com>
Date: Wed, 20 May 2026 20:00:49 +0800
Subject: [PATCH 05/19] Potential fix for pull request finding

Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com>
Signed-off-by: Anyang Peng <137014849+anyangml@users.noreply.github.com>
---
 deepmd/pt_expt/train/training.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/deepmd/pt_expt/train/training.py b/deepmd/pt_expt/train/training.py
index d3d135936f..ac3c79fe41 100644
--- a/deepmd/pt_expt/train/training.py
+++ b/deepmd/pt_expt/train/training.py
@@ -1020,7 +1020,7 @@ def _compile_model(self, compile_opts: dict[str, Any]) -> None:
             if aparam is not None:
                 del aparam
             del inp
-            if torch.cuda.is_initialized():
+            if DEVICE.type == "cuda" and torch.cuda.is_initialized():
                 torch.cuda.empty_cache()
 
             log.info(

From 8b7584a061c37bb7d74ccb7447e9e7f6bca8738c Mon Sep 17 00:00:00 2001
From: Anyang Peng <137014849+anyangml@users.noreply.github.com>
Date: Thu, 21 May 2026 10:47:17 +0800
Subject: [PATCH 06/19] fix: free autograd graph by logging losses via .item()

---
 deepmd/pt_expt/train/training.py | 44 +++++++++++++++++++++++++-------
 1 file changed, 35 insertions(+), 9 deletions(-)

diff --git a/deepmd/pt_expt/train/training.py b/deepmd/pt_expt/train/training.py
index ac3c79fe41..ef67d04bb5 100644
--- a/deepmd/pt_expt/train/training.py
+++ b/deepmd/pt_expt/train/training.py
@@ -1019,7 +1019,7 @@ def _compile_model(self, compile_opts: dict[str, Any]) -> None:
                 del fparam
             if aparam is not None:
                 del aparam
-            del inp
+            del inp, _
             if DEVICE.type == "cuda" and torch.cuda.is_initialized():
                 torch.cuda.empty_cache()
 
@@ -1205,7 +1205,9 @@ def run(self) -> None:
                 if self.rank == 0:
                     if not self.multi_task:
                         train_results = {
-                            k: v for k, v in more_loss.items() if "l2_" not in k
+                            k: (v.item() if isinstance(v, torch.Tensor) else v)
+                            for k, v in more_loss.items()
+                            if "l2_" not in k
                         }
 
                         # validation
@@ -1225,9 +1227,13 @@ def run(self) -> None:
                                 sum_natoms += natoms
                                 for k, v in _vmore.items():
                                     if "l2_" not in k:
-                                        valid_results[k] = (
-                                            valid_results.get(k, 0.0) + v * natoms
-                                        )
+                                        valid_results[k] = valid_results.get(
+                                            k, 0.0
+                                        ) + (
+                                            v.item()
+                                            if isinstance(v, torch.Tensor)
+                                            else v
+                                        ) * natoms
                             if sum_natoms > 0:
                                 valid_results = {
                                     k: v / sum_natoms for k, v in valid_results.items()
@@ -1239,13 +1245,15 @@ def run(self) -> None:
 
                         # current task already has loss
                         train_results[task_key] = {
-                            k: v for k, v in more_loss.items() if "l2_" not in k
+                            k: (v.item() if isinstance(v, torch.Tensor) else v)
+                            for k, v in more_loss.items()
+                            if "l2_" not in k
                         }
 
                         # compute loss for other tasks
                         for _key in self.model_keys:
                             if _key != task_key:
-                                self.optimizer.zero_grad()
+                                self.optimizer.zero_grad(set_to_none=True)
                                 _inp, _lab = self.get_data(is_train=True, task_key=_key)
                                 _, _loss, _more = self._unwrapped(
                                     **_inp,
@@ -1253,9 +1261,23 @@ def run(self) -> None:
                                     label=_lab,
                                     task_key=_key,
                                 )
+                                # Use .item() so the backward graph (and its
+                                # saved activations) can be freed immediately.
+                                # Display passes never call loss.backward(), so
+                                # without this the computation graphs for all
+                                # tasks accumulate simultaneously in GPU memory.
                                 train_results[_key] = {
-                                    k: v for k, v in _more.items() if "l2_" not in k
+                                    k: (
+                                        v.item()
+                                        if isinstance(v, torch.Tensor)
+                                        else v
+                                    )
+                                    for k, v in _more.items()
+                                    if "l2_" not in k
                                 }
+                                del _loss, _more, _inp, _lab
+                                if torch.cuda.is_available() and torch.cuda.is_initialized():
+                                    torch.cuda.empty_cache()
 
                             # validation for each task
                             _vdata = self.validation_data[_key]
@@ -1278,7 +1300,11 @@ def run(self) -> None:
                                     _sum_natoms += natoms
                                     for k, v in _vmore.items():
                                         if "l2_" not in k:
-                                            _vres[k] = _vres.get(k, 0.0) + v * natoms
+                                            _vres[k] = _vres.get(k, 0.0) + (
+                                                v.item()
+                                                if isinstance(v, torch.Tensor)
+                                                else v
+                                            ) * natoms
                                 if _sum_natoms > 0:
                                     _vres = {
                                         k: v / _sum_natoms for k, v in _vres.items()

From 87e4e469d476ef5092ff2a9bb8295ca7ee13840f Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Thu, 21 May 2026 02:48:13 +0000
Subject: [PATCH 07/19] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 deepmd/pt_expt/train/training.py | 41 ++++++++++++++++++--------------
 1 file changed, 23 insertions(+), 18 deletions(-)

diff --git a/deepmd/pt_expt/train/training.py b/deepmd/pt_expt/train/training.py
index ef67d04bb5..647ac2c624 100644
--- a/deepmd/pt_expt/train/training.py
+++ b/deepmd/pt_expt/train/training.py
@@ -1227,13 +1227,15 @@ def run(self) -> None:
                                 sum_natoms += natoms
                                 for k, v in _vmore.items():
                                     if "l2_" not in k:
-                                        valid_results[k] = valid_results.get(
-                                            k, 0.0
-                                        ) + (
-                                            v.item()
-                                            if isinstance(v, torch.Tensor)
-                                            else v
-                                        ) * natoms
+                                        valid_results[k] = (
+                                            valid_results.get(k, 0.0)
+                                            + (
+                                                v.item()
+                                                if isinstance(v, torch.Tensor)
+                                                else v
+                                            )
+                                            * natoms
+                                        )
                             if sum_natoms > 0:
                                 valid_results = {
                                     k: v / sum_natoms for k, v in valid_results.items()
@@ -1267,16 +1269,15 @@ def run(self) -> None:
                                 # without this the computation graphs for all
                                 # tasks accumulate simultaneously in GPU memory.
                                 train_results[_key] = {
-                                    k: (
-                                        v.item()
-                                        if isinstance(v, torch.Tensor)
-                                        else v
-                                    )
+                                    k: (v.item() if isinstance(v, torch.Tensor) else v)
                                     for k, v in _more.items()
                                     if "l2_" not in k
                                 }
                                 del _loss, _more, _inp, _lab
-                                if torch.cuda.is_available() and torch.cuda.is_initialized():
+                                if (
+                                    torch.cuda.is_available()
+                                    and torch.cuda.is_initialized()
+                                ):
                                     torch.cuda.empty_cache()
 
                             # validation for each task
@@ -1300,11 +1301,15 @@ def run(self) -> None:
                                     _sum_natoms += natoms
                                     for k, v in _vmore.items():
                                         if "l2_" not in k:
-                                            _vres[k] = _vres.get(k, 0.0) + (
-                                                v.item()
-                                                if isinstance(v, torch.Tensor)
-                                                else v
-                                            ) * natoms
+                                            _vres[k] = (
+                                                _vres.get(k, 0.0)
+                                                + (
+                                                    v.item()
+                                                    if isinstance(v, torch.Tensor)
+                                                    else v
+                                                )
+                                                * natoms
+                                            )
                                 if _sum_natoms > 0:
                                     _vres = {
                                         k: v / _sum_natoms for k, v in _vres.items()

From 4ffc15a19765dfc9489a2583b24a89855844fbf4 Mon Sep 17 00:00:00 2001
From: Anyang Peng <137014849+anyangml@users.noreply.github.com>
Date: Thu, 21 May 2026 14:26:14 +0800
Subject: [PATCH 08/19] fix: warm up compiled model to avoid multitask NCCL timeout

---
 deepmd/pt_expt/train/training.py | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/deepmd/pt_expt/train/training.py b/deepmd/pt_expt/train/training.py
index 647ac2c624..7e39b8561e 100644
--- a/deepmd/pt_expt/train/training.py
+++ b/deepmd/pt_expt/train/training.py
@@ -1007,6 +1007,19 @@ def _compile_model(self, compile_opts: dict[str, Any]) -> None:
                 compile_opts,
             )
 
+            # torch.compile is lazy: inductor only compiles on the first
+            # call.  In DDP multi-task training, different ranks may first
+            # hit a task at different training steps, so one rank can block
+            # inside inductor for minutes while others spin in AllReduce —
+            # causing an NCCL timeout.  Warmup here, while sample inputs
+            # still exist, forces eager compilation before training starts.
+            _warmup_out = compiled_lower(
+                ext_coord, ext_atype, nlist_t, mapping, fparam, aparam
+            )
+            del _warmup_out
+            if DEVICE.type == "cuda" and torch.cuda.is_initialized():
+                torch.cuda.synchronize()
+
             wrapper_mod.model[task_key] = _CompiledModel(model, compiled_lower)
 
             # Release all intermediate tensors built for this task so they don't
@@ -1029,6 +1042,12 @@ def _compile_model(self, compile_opts: dict[str, Any]) -> None:
                 task_key,
             )
 
+        # All tasks compiled on this rank — wait for all ranks before
+        # training starts so no rank enters the training loop while another
+        # is still blocked in inductor compilation.
+        if self.is_distributed:
+            dist.barrier()
+
     # ------------------------------------------------------------------
     # Data helpers
     # ------------------------------------------------------------------

From 36d57a7e45d60aa48947143b2ab673fd7b1e2cc7 Mon Sep 17 00:00:00 2001
From: Anyang Peng <137014849+anyangml@users.noreply.github.com>
Date: Thu, 21 May 2026 14:53:07 +0800
Subject: [PATCH 09/19] fix: mark variable-size dimensions as dynamic to
 prevent NCCL timeout

---
 deepmd/pt_expt/train/training.py | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/deepmd/pt_expt/train/training.py b/deepmd/pt_expt/train/training.py
index 7e39b8561e..65c57b777d 100644
--- a/deepmd/pt_expt/train/training.py
+++ b/deepmd/pt_expt/train/training.py
@@ -1013,6 +1013,20 @@ def _compile_model(self, compile_opts: dict[str, Any]) -> None:
             # inside inductor for minutes while others spin in AllReduce —
             # causing an NCCL timeout.  Warmup here, while sample inputs
             # still exist, forces eager compilation before training starts.
+            #
+            # Mark variable-size dimensions as dynamic so Dynamo creates range
+            # guards rather than equality guards.  Without this, each new value
+            # of nall or nloc in a training batch breaks the equality guard and
+            # triggers a full recompilation, which can stall one rank for
+            # minutes while others wait in a collective — causing NCCL timeout.
+            #   ext_coord / ext_atype / mapping  dim 1 = nall (ghost+real atoms)
+            #   nlist_t                          dim 1 = nloc (real atoms only)
+            # Both vary per batch because system sizes differ across structures.
+            torch._dynamo.mark_dynamic(ext_coord, 1)   # [nframes, nall, 3]
+            torch._dynamo.mark_dynamic(ext_atype, 1)   # [nframes, nall]
+            torch._dynamo.mark_dynamic(nlist_t, 1)     # [nframes, nloc, max_nnei]
+            if mapping.dim() >= 2:
+                torch._dynamo.mark_dynamic(mapping, 1)  # [nframes, nall]
             _warmup_out = compiled_lower(
                 ext_coord, ext_atype, nlist_t, mapping, fparam, aparam
             )

From 1978e3363d6ddfaad14a54d5d638ee26fa410dcf Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Thu, 21 May 2026 06:53:58 +0000
Subject: [PATCH 10/19] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 deepmd/pt_expt/train/training.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/deepmd/pt_expt/train/training.py b/deepmd/pt_expt/train/training.py
index 65c57b777d..b696b08097 100644
--- a/deepmd/pt_expt/train/training.py
+++ b/deepmd/pt_expt/train/training.py
@@ -1022,9 +1022,9 @@ def _compile_model(self, compile_opts: dict[str, Any]) -> None:
             #   ext_coord / ext_atype / mapping  dim 1 = nall (ghost+real atoms)
             #   nlist_t                          dim 1 = nloc (real atoms only)
             # Both vary per batch because system sizes differ across structures.
-            torch._dynamo.mark_dynamic(ext_coord, 1)   # [nframes, nall, 3]
-            torch._dynamo.mark_dynamic(ext_atype, 1)   # [nframes, nall]
-            torch._dynamo.mark_dynamic(nlist_t, 1)     # [nframes, nloc, max_nnei]
+            torch._dynamo.mark_dynamic(ext_coord, 1)  # [nframes, nall, 3]
+            torch._dynamo.mark_dynamic(ext_atype, 1)  # [nframes, nall]
+            torch._dynamo.mark_dynamic(nlist_t, 1)  # [nframes, nloc, max_nnei]
             if mapping.dim() >= 2:
                 torch._dynamo.mark_dynamic(mapping, 1)  # [nframes, nall]
             _warmup_out = compiled_lower(

From 937b742b2ae1f77b42c4b6987dbf58a92cadcb6b Mon Sep 17 00:00:00 2001
From: Anyang Peng <137014849+anyangml@users.noreply.github.com>
Date: Thu, 21 May 2026 16:54:28 +0800
Subject: [PATCH 11/19] fix: mark tensors as dynamic to prevent NCCL timeout
 during training

---
 deepmd/pt_expt/train/training.py | 28 ++++++++++++++++++++++++++++
 1 file changed, 28 insertions(+)

diff --git a/deepmd/pt_expt/train/training.py b/deepmd/pt_expt/train/training.py
index b696b08097..e5ff2d86cc 100644
--- a/deepmd/pt_expt/train/training.py
+++ b/deepmd/pt_expt/train/training.py
@@ -394,6 +394,23 @@ def forward(
         ext_coord = ext_coord.reshape(nframes, -1, 3)
         ext_coord = ext_coord.detach().requires_grad_(True)
 
+        # Mark nall and nloc as dynamic on every call so Dynamo's guard always
+        # matches the warmup compilation (which was also built with mark_dynamic).
+        # Without this, training tensors are new Python objects that Dynamo sees
+        # as "fresh", triggering a second per-task compilation on first training
+        # use.  With random task sampling across DDP ranks, that second
+        # compilation can happen at different times on different ranks, causing
+        # an NCCL timeout.
+        torch._dynamo.mark_dynamic(ext_coord, 1)   # nall
+        torch._dynamo.mark_dynamic(ext_atype, 1)   # nall
+        torch._dynamo.mark_dynamic(nlist, 1)        # nloc
+        if mapping.dim() >= 2:
+            torch._dynamo.mark_dynamic(mapping, 1)  # nall
+        if fparam is not None:
+            torch._dynamo.mark_dynamic(fparam, 0)   # nframes (may differ per task)
+        if aparam is not None:
+            torch._dynamo.mark_dynamic(aparam, 1)   # nloc
+
         result = self.compiled_forward_lower(
             ext_coord, ext_atype, nlist, mapping, fparam, aparam
         )
@@ -1022,11 +1039,22 @@ def _compile_model(self, compile_opts: dict[str, Any]) -> None:
             #   ext_coord / ext_atype / mapping  dim 1 = nall (ghost+real atoms)
             #   nlist_t                          dim 1 = nloc (real atoms only)
             # Both vary per batch because system sizes differ across structures.
+            # Match _CompiledModel.forward which sets requires_grad_(True) on
+            # ext_coord before calling compiled_forward_lower.  Dynamo's guard
+            # includes requires_grad, so a mismatch here would cause every
+            # task's first training call to miss the warmup cache and trigger
+            # a new compilation — at a random time on each rank — creating the
+            # exact NCCL desync we are trying to prevent.
+            ext_coord = ext_coord.detach().requires_grad_(True)
             torch._dynamo.mark_dynamic(ext_coord, 1)  # [nframes, nall, 3]
             torch._dynamo.mark_dynamic(ext_atype, 1)  # [nframes, nall]
             torch._dynamo.mark_dynamic(nlist_t, 1)  # [nframes, nloc, max_nnei]
             if mapping.dim() >= 2:
                 torch._dynamo.mark_dynamic(mapping, 1)  # [nframes, nall]
+            if fparam is not None:
+                torch._dynamo.mark_dynamic(fparam, 0)   # [nframes, dim_fparam]
+            if aparam is not None:
+                torch._dynamo.mark_dynamic(aparam, 1)  # [nframes, nloc, dim_aparam]
             _warmup_out = compiled_lower(
                 ext_coord, ext_atype, nlist_t, mapping, fparam, aparam
             )

From 2ee8af155b93c8167448e99b5fe89d35eb8e4995 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Thu, 21 May 2026 08:55:21 +0000
Subject: [PATCH 12/19] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 deepmd/pt_expt/train/training.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/deepmd/pt_expt/train/training.py b/deepmd/pt_expt/train/training.py
index e5ff2d86cc..3cba47c354 100644
--- a/deepmd/pt_expt/train/training.py
+++ b/deepmd/pt_expt/train/training.py
@@ -401,15 +401,15 @@ def forward(
         # use.  With random task sampling across DDP ranks, that second
         # compilation can happen at different times on different ranks, causing
         # an NCCL timeout.
-        torch._dynamo.mark_dynamic(ext_coord, 1)   # nall
-        torch._dynamo.mark_dynamic(ext_atype, 1)   # nall
-        torch._dynamo.mark_dynamic(nlist, 1)        # nloc
+        torch._dynamo.mark_dynamic(ext_coord, 1)  # nall
+        torch._dynamo.mark_dynamic(ext_atype, 1)  # nall
+        torch._dynamo.mark_dynamic(nlist, 1)  # nloc
         if mapping.dim() >= 2:
             torch._dynamo.mark_dynamic(mapping, 1)  # nall
         if fparam is not None:
-            torch._dynamo.mark_dynamic(fparam, 0)   # nframes (may differ per task)
+            torch._dynamo.mark_dynamic(fparam, 0)  # nframes (may differ per task)
         if aparam is not None:
-            torch._dynamo.mark_dynamic(aparam, 1)   # nloc
+            torch._dynamo.mark_dynamic(aparam, 1)  # nloc
 
         result = self.compiled_forward_lower(
             ext_coord, ext_atype, nlist, mapping, fparam, aparam
@@ -1052,7 +1052,7 @@ def _compile_model(self, compile_opts: dict[str, Any]) -> None:
             if mapping.dim() >= 2:
                 torch._dynamo.mark_dynamic(mapping, 1)  # [nframes, nall]
             if fparam is not None:
-                torch._dynamo.mark_dynamic(fparam, 0)   # [nframes, dim_fparam]
+                torch._dynamo.mark_dynamic(fparam, 0)  # [nframes, dim_fparam]
             if aparam is not None:
                 torch._dynamo.mark_dynamic(aparam, 1)  # [nframes, nloc, dim_aparam]
             _warmup_out = compiled_lower(

From c05542acf79f610a5f7331219b6d66ae1878452b Mon Sep 17 00:00:00 2001
From: Anyang Peng <137014849+anyangml@users.noreply.github.com>
Date: Thu, 21 May 2026 18:03:20 +0800
Subject: [PATCH 13/19] fix: also mark nframes (dim 0) as dynamic

---
 deepmd/pt_expt/train/training.py | 56 ++++++++++++++++++++------------
 1 file changed, 36 insertions(+), 20 deletions(-)

diff --git a/deepmd/pt_expt/train/training.py b/deepmd/pt_expt/train/training.py
index 3cba47c354..21500573f9 100644
--- a/deepmd/pt_expt/train/training.py
+++ b/deepmd/pt_expt/train/training.py
@@ -394,21 +394,27 @@ def forward(
         ext_coord = ext_coord.reshape(nframes, -1, 3)
         ext_coord = ext_coord.detach().requires_grad_(True)
 
-        # Mark nall and nloc as dynamic on every call so Dynamo's guard always
-        # matches the warmup compilation (which was also built with mark_dynamic).
-        # Without this, training tensors are new Python objects that Dynamo sees
-        # as "fresh", triggering a second per-task compilation on first training
-        # use.  With random task sampling across DDP ranks, that second
-        # compilation can happen at different times on different ranks, causing
-        # an NCCL timeout.
+        # Mark nframes, nall, and nloc as dynamic on every call so Dynamo's
+        # guard always matches the warmup compilation.  nframes varies when
+        # different systems have different per-system batch sizes.  nall/nloc
+        # vary because system atom counts differ.  Marking dim 0 (nframes) on
+        # ext_coord/ext_atype/nlist/mapping is required: without it Dynamo
+        # specialises ext_coord.shape[0] to the warmup value, propagates that
+        # constant into fparam.shape[0] via the reshape in general_fitting, and
+        # conflicts with mark_dynamic(fparam, 0) when nframes changes.
+        torch._dynamo.mark_dynamic(ext_coord, 0)  # nframes
         torch._dynamo.mark_dynamic(ext_coord, 1)  # nall
+        torch._dynamo.mark_dynamic(ext_atype, 0)  # nframes
         torch._dynamo.mark_dynamic(ext_atype, 1)  # nall
+        torch._dynamo.mark_dynamic(nlist, 0)  # nframes
         torch._dynamo.mark_dynamic(nlist, 1)  # nloc
         if mapping.dim() >= 2:
+            torch._dynamo.mark_dynamic(mapping, 0)  # nframes
             torch._dynamo.mark_dynamic(mapping, 1)  # nall
         if fparam is not None:
-            torch._dynamo.mark_dynamic(fparam, 0)  # nframes (may differ per task)
+            torch._dynamo.mark_dynamic(fparam, 0)  # nframes
         if aparam is not None:
+            torch._dynamo.mark_dynamic(aparam, 0)  # nframes
             torch._dynamo.mark_dynamic(aparam, 1)  # nloc
 
         result = self.compiled_forward_lower(
@@ -1033,12 +1039,17 @@ def _compile_model(self, compile_opts: dict[str, Any]) -> None:
             #
             # Mark variable-size dimensions as dynamic so Dynamo creates range
             # guards rather than equality guards.  Without this, each new value
-            # of nall or nloc in a training batch breaks the equality guard and
-            # triggers a full recompilation, which can stall one rank for
-            # minutes while others wait in a collective — causing NCCL timeout.
-            #   ext_coord / ext_atype / mapping  dim 1 = nall (ghost+real atoms)
-            #   nlist_t                          dim 1 = nloc (real atoms only)
-            # Both vary per batch because system sizes differ across structures.
+            # of nall, nloc, or nframes in a training batch breaks the equality
+            # guard and triggers a full recompilation, which can stall one rank
+            # for minutes while others wait in a collective — causing NCCL timeout.
+            #   ext_coord / ext_atype / mapping  dim 0 = nframes, dim 1 = nall
+            #   nlist_t                          dim 0 = nframes, dim 1 = nloc
+            # nframes varies across batches when different systems have different
+            # per-system batch sizes.  nall/nloc vary because system atom counts
+            # differ.  fparam/aparam share dim 0 = nframes with ext_coord; if
+            # ext_coord dim 0 is not marked dynamic, Dynamo specialises it to the
+            # warmup value and propagates that constant into fparam's shape via
+            # the reshape in general_fitting, conflicting with mark_dynamic(fparam,0).
             # Match _CompiledModel.forward which sets requires_grad_(True) on
             # ext_coord before calling compiled_forward_lower.  Dynamo's guard
             # includes requires_grad, so a mismatch here would cause every
@@ -1046,15 +1057,20 @@ def _compile_model(self, compile_opts: dict[str, Any]) -> None:
             # a new compilation — at a random time on each rank — creating the
             # exact NCCL desync we are trying to prevent.
             ext_coord = ext_coord.detach().requires_grad_(True)
-            torch._dynamo.mark_dynamic(ext_coord, 1)  # [nframes, nall, 3]
-            torch._dynamo.mark_dynamic(ext_atype, 1)  # [nframes, nall]
-            torch._dynamo.mark_dynamic(nlist_t, 1)  # [nframes, nloc, max_nnei]
+            torch._dynamo.mark_dynamic(ext_coord, 0)  # nframes
+            torch._dynamo.mark_dynamic(ext_coord, 1)  # nall
+            torch._dynamo.mark_dynamic(ext_atype, 0)  # nframes
+            torch._dynamo.mark_dynamic(ext_atype, 1)  # nall
+            torch._dynamo.mark_dynamic(nlist_t, 0)  # nframes
+            torch._dynamo.mark_dynamic(nlist_t, 1)  # nloc
             if mapping.dim() >= 2:
-                torch._dynamo.mark_dynamic(mapping, 1)  # [nframes, nall]
+                torch._dynamo.mark_dynamic(mapping, 0)  # nframes
+                torch._dynamo.mark_dynamic(mapping, 1)  # nall
             if fparam is not None:
-                torch._dynamo.mark_dynamic(fparam, 0)  # [nframes, dim_fparam]
+                torch._dynamo.mark_dynamic(fparam, 0)  # nframes
             if aparam is not None:
-                torch._dynamo.mark_dynamic(aparam, 1)  # [nframes, nloc, dim_aparam]
+                torch._dynamo.mark_dynamic(aparam, 0)  # nframes
+                torch._dynamo.mark_dynamic(aparam, 1)  # nloc
             _warmup_out = compiled_lower(
                 ext_coord, ext_atype, nlist_t, mapping, fparam, aparam
             )

From 697b24f933af1395b0f751c6c445ef73f0a8af8a Mon Sep 17 00:00:00 2001
From: Anyang Peng <137014849+anyangml@users.noreply.github.com>
Date: Fri, 22 May 2026 13:52:07 +0800
Subject: [PATCH 14/19] chore: revert extra mark_dynamic

---
 deepmd/pt_expt/train/training.py | 57 ++------------------------------
 1 file changed, 2 insertions(+), 55 deletions(-)

diff --git a/deepmd/pt_expt/train/training.py b/deepmd/pt_expt/train/training.py
index 21500573f9..2203f877e9 100644
--- a/deepmd/pt_expt/train/training.py
+++ b/deepmd/pt_expt/train/training.py
@@ -394,29 +394,6 @@ def forward(
         ext_coord = ext_coord.reshape(nframes, -1, 3)
         ext_coord = ext_coord.detach().requires_grad_(True)
 
-        # Mark nframes, nall, and nloc as dynamic on every call so Dynamo's
-        # guard always matches the warmup compilation.  nframes varies when
-        # different systems have different per-system batch sizes.  nall/nloc
-        # vary because system atom counts differ.  Marking dim 0 (nframes) on
-        # ext_coord/ext_atype/nlist/mapping is required: without it Dynamo
-        # specialises ext_coord.shape[0] to the warmup value, propagates that
-        # constant into fparam.shape[0] via the reshape in general_fitting, and
-        # conflicts with mark_dynamic(fparam, 0) when nframes changes.
-        torch._dynamo.mark_dynamic(ext_coord, 0)  # nframes
-        torch._dynamo.mark_dynamic(ext_coord, 1)  # nall
-        torch._dynamo.mark_dynamic(ext_atype, 0)  # nframes
-        torch._dynamo.mark_dynamic(ext_atype, 1)  # nall
-        torch._dynamo.mark_dynamic(nlist, 0)  # nframes
-        torch._dynamo.mark_dynamic(nlist, 1)  # nloc
-        if mapping.dim() >= 2:
-            torch._dynamo.mark_dynamic(mapping, 0)  # nframes
-            torch._dynamo.mark_dynamic(mapping, 1)  # nall
-        if fparam is not None:
-            torch._dynamo.mark_dynamic(fparam, 0)  # nframes
-        if aparam is not None:
-            torch._dynamo.mark_dynamic(aparam, 0)  # nframes
-            torch._dynamo.mark_dynamic(aparam, 1)  # nloc
-
         result = self.compiled_forward_lower(
             ext_coord, ext_atype, nlist, mapping, fparam, aparam
         )
@@ -1037,40 +1014,10 @@ def _compile_model(self, compile_opts: dict[str, Any]) -> None:
             # causing an NCCL timeout.  Warmup here, while sample inputs
             # still exist, forces eager compilation before training starts.
             #
-            # Mark variable-size dimensions as dynamic so Dynamo creates range
-            # guards rather than equality guards.  Without this, each new value
-            # of nall, nloc, or nframes in a training batch breaks the equality
-            # guard and triggers a full recompilation, which can stall one rank
-            # for minutes while others wait in a collective — causing NCCL timeout.
-            #   ext_coord / ext_atype / mapping  dim 0 = nframes, dim 1 = nall
-            #   nlist_t                          dim 0 = nframes, dim 1 = nloc
-            # nframes varies across batches when different systems have different
-            # per-system batch sizes.  nall/nloc vary because system atom counts
-            # differ.  fparam/aparam share dim 0 = nframes with ext_coord; if
-            # ext_coord dim 0 is not marked dynamic, Dynamo specialises it to the
-            # warmup value and propagates that constant into fparam's shape via
-            # the reshape in general_fitting, conflicting with mark_dynamic(fparam,0).
             # Match _CompiledModel.forward which sets requires_grad_(True) on
-            # ext_coord before calling compiled_forward_lower.  Dynamo's guard
-            # includes requires_grad, so a mismatch here would cause every
-            # task's first training call to miss the warmup cache and trigger
-            # a new compilation — at a random time on each rank — creating the
-            # exact NCCL desync we are trying to prevent.
+            # ext_coord: Dynamo's guard includes requires_grad, so a mismatch
+            # causes every task's first training call to miss the warmup cache.
             ext_coord = ext_coord.detach().requires_grad_(True)
-            torch._dynamo.mark_dynamic(ext_coord, 0)  # nframes
-            torch._dynamo.mark_dynamic(ext_coord, 1)  # nall
-            torch._dynamo.mark_dynamic(ext_atype, 0)  # nframes
-            torch._dynamo.mark_dynamic(ext_atype, 1)  # nall
-            torch._dynamo.mark_dynamic(nlist_t, 0)  # nframes
-            torch._dynamo.mark_dynamic(nlist_t, 1)  # nloc
-            if mapping.dim() >= 2:
-                torch._dynamo.mark_dynamic(mapping, 0)  # nframes
-                torch._dynamo.mark_dynamic(mapping, 1)  # nall
-            if fparam is not None:
-                torch._dynamo.mark_dynamic(fparam, 0)  # nframes
-            if aparam is not None:
-                torch._dynamo.mark_dynamic(aparam, 0)  # nframes
-                torch._dynamo.mark_dynamic(aparam, 1)  # nloc
             _warmup_out = compiled_lower(
                 ext_coord, ext_atype, nlist_t, mapping, fparam, aparam
             )

From c27c95d64167d4698b8c62bc9bb0c4fbfb1fe866 Mon Sep 17 00:00:00 2001
From: Anyang Peng <137014849+anyangml@users.noreply.github.com>
Date: Fri, 22 May 2026 15:49:12 +0800
Subject: [PATCH 15/19] fix: prevent GC error

---
 deepmd/pt_expt/train/training.py | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/deepmd/pt_expt/train/training.py b/deepmd/pt_expt/train/training.py
index 2203f877e9..733bd2c88a 100644
--- a/deepmd/pt_expt/train/training.py
+++ b/deepmd/pt_expt/train/training.py
@@ -329,6 +329,13 @@ def _expand(t: torch.Tensor | None) -> torch.Tensor | None:
         dynamic=True,
         options=inductor_options,
     )
+    # Keep the traced FX graph alive as long as the compiled callable.
+    # _remove_detach_nodes makes saved activations alias the graph's symbolic
+    # tensors; if the FX graph is GC'd, its SymInt shape objects lose their
+    # Python references while C++ view metadata still holds raw pointers to
+    # them — causing apply_view_meta_sequence to read garbage (crash at
+    # random training steps, earlier under higher GC pressure from many tasks).
+    compiled._traced_lower_ref = traced_lower
     del traced_lower
     model_uses_cuda = any(param.is_cuda for param in model.parameters()) or any(
         buffer.is_cuda for buffer in model.buffers()

From b5236de8f66a2b54e462b8f64734e7c0ffe68ec8 Mon Sep 17 00:00:00 2001
From: Anyang Peng <137014849+anyangml@users.noreply.github.com>
Date: Fri, 22 May 2026 16:34:52 +0800
Subject: [PATCH 16/19] fix: avoid RecursionError when attaching traced_lower ref

---
 deepmd/pt_expt/train/training.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/deepmd/pt_expt/train/training.py b/deepmd/pt_expt/train/training.py
index 733bd2c88a..5377b43044 100644
--- a/deepmd/pt_expt/train/training.py
+++ b/deepmd/pt_expt/train/training.py
@@ -335,7 +335,11 @@ def _expand(t: torch.Tensor | None) -> torch.Tensor | None:
     # Python references while C++ view metadata still holds raw pointers to
     # them — causing apply_view_meta_sequence to read garbage (crash at
     # random training steps, earlier under higher GC pressure from many tasks).
-    compiled._traced_lower_ref = traced_lower
+    # Use object.__setattr__ to bypass nn.Module.__setattr__: traced_lower is
+    # an nn.Module, and normal assignment would register it as a submodule of
+    # compiled (also an nn.Module), creating a cycle in the module tree that
+    # causes RecursionError in trainer.wrapper.train().
+    object.__setattr__(compiled, "_traced_lower_ref", traced_lower)
     del traced_lower
     model_uses_cuda = any(param.is_cuda for param in model.parameters()) or any(
         buffer.is_cuda for buffer in model.buffers()

From 2377b1a438f14b70821e212e5b36e480398f9361 Mon Sep 17 00:00:00 2001
From: Anyang Peng <137014849+anyangml@users.noreply.github.com>
Date: Mon, 25 May 2026 09:43:25 +0800
Subject: [PATCH 17/19] fix: try to fix _remove_detach_nodes

---
 deepmd/pt_expt/train/training.py | 60 +++++++++++---------------------
 1 file changed, 21 insertions(+), 39 deletions(-)

diff --git a/deepmd/pt_expt/train/training.py b/deepmd/pt_expt/train/training.py
index 5377b43044..fe5bd28362 100644
--- a/deepmd/pt_expt/train/training.py
+++ b/deepmd/pt_expt/train/training.py
@@ -147,23 +147,30 @@ def get_additional_data_requirement(_model: Any) -> list[DataRequirementItem]:
 
 
 def _remove_detach_nodes(gm: torch.fx.GraphModule) -> None:
-    """Remove ``aten.detach.default`` nodes from an FX graph in-place.
-
-    ``make_fx`` inserts these nodes when recording saved tensors from the
-    autograd backward pass (``autograd.grad`` with ``create_graph=True``).
-    The detach breaks the gradient connection between saved activations and
-    model parameters, causing incorrect second-order derivatives — e.g.
-    bias gradients become zero for force-loss training.
-
-    Removing these nodes restores the gradient path so that higher-order
-    derivatives flow correctly through the decomposed backward ops.
+    """Replace ``aten.detach.default`` nodes with ``aten.clone.default``.
+
+    ``make_fx`` inserts detach nodes for saved tensors in the decomposed
+    autograd backward.  The detach breaks the gradient path from saved
+    activations back to model parameters, causing incorrect second-order
+    derivatives (e.g. bias gradients become zero for force-loss training).
+
+    We replace detach with clone rather than erasing the node entirely.
+    Erasing makes the output alias the input — AOT autograd detects the
+    alias and stores SymInt shape values as raw pointers in a C++
+    ``view_meta_sequence``.  When Python GC later collects those SymInt
+    objects the pointers dangle, producing a crash of the form
+    ``shape '[139...008, ...]' is invalid for input of size N``.
+    Clone breaks the alias so no ``view_meta_sequence`` is generated.
     """
     graph = gm.graph
     for node in list(graph.nodes):
         if node.op == "call_function" and node.target == torch.ops.aten.detach.default:
-            input_node = node.args[0]
-            node.replace_all_uses_with(input_node)
-            graph.erase_node(node)
+            # Replace detach with clone to break the input-output alias.
+            # With no alias, AOT autograd never stores raw SymInt pointers in
+            # the C++ view_meta_sequence, so GC of SymInt objects can no longer
+            # leave dangling pointers that crash apply_view_meta_sequence
+            # (shape '[139...008, ...]' is invalid for input ...).
+            node.target = torch.ops.aten.clone.default
     graph.lint()
     gm.recompile()
 
@@ -323,30 +330,12 @@ def _expand(t: torch.Tensor | None) -> torch.Tensor | None:
     if compile_opts:
         inductor_options.update(compile_opts)
 
-    compiled = torch.compile(
+    return torch.compile(
         traced_lower,
         backend="inductor",
         dynamic=True,
         options=inductor_options,
     )
-    # Keep the traced FX graph alive as long as the compiled callable.
-    # _remove_detach_nodes makes saved activations alias the graph's symbolic
-    # tensors; if the FX graph is GC'd, its SymInt shape objects lose their
-    # Python references while C++ view metadata still holds raw pointers to
-    # them — causing apply_view_meta_sequence to read garbage (crash at
-    # random training steps, earlier under higher GC pressure from many tasks).
-    # Use object.__setattr__ to bypass nn.Module.__setattr__: traced_lower is
-    # an nn.Module, and normal assignment would register it as a submodule of
-    # compiled (also an nn.Module), creating a cycle in the module tree that
-    # causes RecursionError in trainer.wrapper.train().
-    object.__setattr__(compiled, "_traced_lower_ref", traced_lower)
-    del traced_lower
-    model_uses_cuda = any(param.is_cuda for param in model.parameters()) or any(
-        buffer.is_cuda for buffer in model.buffers()
-    )
-    if model_uses_cuda and torch.cuda.is_available() and torch.cuda.is_initialized():
-        torch.cuda.empty_cache()
-    return compiled
 
 
 class _CompiledModel(torch.nn.Module):
@@ -1049,8 +1038,6 @@ def _compile_model(self, compile_opts: dict[str, Any]) -> None:
             if aparam is not None:
                 del aparam
             del inp, _
-            if DEVICE.type == "cuda" and torch.cuda.is_initialized():
-                torch.cuda.empty_cache()
 
             log.info(
                 "Model compiled (task=%s, tracing_mode=symbolic, "
@@ -1309,11 +1296,6 @@ def run(self) -> None:
                                     if "l2_" not in k
                                 }
                                 del _loss, _more, _inp, _lab
-                                if (
-                                    torch.cuda.is_available()
-                                    and torch.cuda.is_initialized()
-                                ):
-                                    torch.cuda.empty_cache()
 
                             # validation for each task
                             _vdata = self.validation_data[_key]

From 211608bdad7a2a760169d7d7f1f32d0c1efbdb99 Mon Sep 17 00:00:00 2001
From: Anyang Peng <137014849+anyangml@users.noreply.github.com>
Date: Mon, 25 May 2026 12:00:10 +0800
Subject: [PATCH 18/19] fix: add new charge_spin parameter

---
 deepmd/pt_expt/train/training.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/deepmd/pt_expt/train/training.py b/deepmd/pt_expt/train/training.py
index 003e1049a5..8e8e29b59e 100644
--- a/deepmd/pt_expt/train/training.py
+++ b/deepmd/pt_expt/train/training.py
@@ -1045,7 +1045,7 @@ def _compile_model(self, compile_opts: dict[str, Any]) -> None:
             # causes every task's first training call to miss the warmup cache.
             ext_coord = ext_coord.detach().requires_grad_(True)
             _warmup_out = compiled_lower(
-                ext_coord, ext_atype, nlist_t, mapping, fparam, aparam
+                ext_coord, ext_atype, nlist_t, mapping, fparam, aparam, charge_spin
             )
             del _warmup_out
             if DEVICE.type == "cuda" and torch.cuda.is_initialized():
@@ -1063,6 +1063,8 @@ def _compile_model(self, compile_opts: dict[str, Any]) -> None:
                 del fparam
             if aparam is not None:
                 del aparam
+            if charge_spin is not None:
+                del charge_spin
             del inp, _
 
             log.info(

From ec7296ca87800aec33efa5a738462d4fabb22e3b Mon Sep 17 00:00:00 2001
From: Anyang Peng <137014849+anyangml@users.noreply.github.com>
Date: Mon, 25 May 2026 13:10:47 +0800
Subject: [PATCH 19/19] fix: address review comment (del charge_spin, CUDA sync guard)

---
 deepmd/pt_expt/train/training.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/deepmd/pt_expt/train/training.py b/deepmd/pt_expt/train/training.py
index 8e8e29b59e..2d5d47ea78 100644
--- a/deepmd/pt_expt/train/training.py
+++ b/deepmd/pt_expt/train/training.py
@@ -324,6 +324,8 @@ def _expand(t: torch.Tensor | None) -> torch.Tensor | None:
         del fparam
     if aparam is not None:
         del aparam
+    if charge_spin is not None:
+        del charge_spin
 
     # make_fx inserts aten.detach.default for saved tensors used in the
     # decomposed autograd.grad backward ops.  These detach nodes break
@@ -1048,7 +1050,7 @@ def _compile_model(self, compile_opts: dict[str, Any]) -> None:
                 ext_coord, ext_atype, nlist_t, mapping, fparam, aparam, charge_spin
             )
             del _warmup_out
-            if DEVICE.type == "cuda" and torch.cuda.is_initialized():
+            if DEVICE.type == "cuda":
                 torch.cuda.synchronize()
 
             wrapper_mod.model[task_key] = _CompiledModel(model, compiled_lower)