From 6b4ca59e88a6957f0bc17cb7d3380d96f1277ae5 Mon Sep 17 00:00:00 2001 From: Jinzhe Zeng Date: Thu, 29 Jan 2026 23:39:06 +0800 Subject: [PATCH 01/12] fix(pt/pd): fix incompatibility between AutoBatchSize and eval hooks --- deepmd/pd/infer/deep_eval.py | 8 ++++++ deepmd/pt/infer/deep_eval.py | 8 ++++++ deepmd/utils/batch_size.py | 50 ++++++++++++++++++++++++++++++++++++ 3 files changed, 66 insertions(+) diff --git a/deepmd/pd/infer/deep_eval.py b/deepmd/pd/infer/deep_eval.py index 6c0ffed7ec..384869c2a7 100644 --- a/deepmd/pd/infer/deep_eval.py +++ b/deepmd/pd/infer/deep_eval.py @@ -823,6 +823,8 @@ def eval_descriptor( model = ( self.dp.model["Default"] if isinstance(self.dp, ModelWrapper) else self.dp ) + if self.auto_batch_size is not None: + self.auto_batch_size.set_oom_retry_mode(True) model.set_eval_descriptor_hook(True) self.eval( coords, @@ -835,6 +837,8 @@ def eval_descriptor( ) descriptor = model.eval_descriptor() model.set_eval_descriptor_hook(False) + if self.auto_batch_size is not None: + self.auto_batch_size.set_oom_retry_mode(False) return to_numpy_array(descriptor) def eval_fitting_last_layer( @@ -878,6 +882,8 @@ def eval_fitting_last_layer( Fitting output before last layer. """ model = self.dp.model["Default"] + if self.auto_batch_size is not None: + self.auto_batch_size.set_oom_retry_mode(True) model.set_eval_fitting_last_layer_hook(True) self.eval( coords, @@ -890,4 +896,6 @@ def eval_fitting_last_layer( ) fitting_net = model.eval_fitting_last_layer() model.set_eval_fitting_last_layer_hook(False) + if self.auto_batch_size is not None: + self.auto_batch_size.set_oom_retry_mode(False) return to_numpy_array(fitting_net) diff --git a/deepmd/pt/infer/deep_eval.py b/deepmd/pt/infer/deep_eval.py index 6e63ecb2fc..50fce3ccd2 100644 --- a/deepmd/pt/infer/deep_eval.py +++ b/deepmd/pt/infer/deep_eval.py @@ -793,6 +793,8 @@ def eval_descriptor( Descriptors. """ model = self.dp.model["Default"] + if self.auto_batch_size is not None: + self.auto_batch_size.set_oom_retry_mode(True) model.set_eval_descriptor_hook(True) self.eval( coords, @@ -805,6 +807,8 @@ def eval_descriptor( ) descriptor = model.eval_descriptor() model.set_eval_descriptor_hook(False) + if self.auto_batch_size is not None: + self.auto_batch_size.set_oom_retry_mode(False) return to_numpy_array(descriptor) def eval_fitting_last_layer( @@ -848,6 +852,8 @@ def eval_fitting_last_layer( Fitting output before last layer. """ model = self.dp.model["Default"] + if self.auto_batch_size is not None: + self.auto_batch_size.set_oom_retry_mode(True) model.set_eval_fitting_last_layer_hook(True) self.eval( coords, @@ -860,4 +866,6 @@ def eval_fitting_last_layer( ) fitting_net = model.eval_fitting_last_layer() model.set_eval_fitting_last_layer_hook(False) + if self.auto_batch_size is not None: + self.auto_batch_size.set_oom_retry_mode(False) return to_numpy_array(fitting_net) diff --git a/deepmd/utils/batch_size.py b/deepmd/utils/batch_size.py index e701e82ec6..289b8b83a8 100644 --- a/deepmd/utils/batch_size.py +++ b/deepmd/utils/batch_size.py @@ -22,6 +22,39 @@ log = logging.getLogger(__name__) + +class RetrySignal(Exception): + """Signal to retry execution after OOM error.""" + + +# originally copied from dpdispatcher +# https://github.com/deepmodeling/dpdispatcher/blob/9a76542311a02e84c4ae62f15b7edcd30850a64e/dpdispatcher/utils/utils.py#L161-L213 +# license: LGPL-3.0-or-later +def retry(func: Any) -> Callable: + """Decorator to retry the function until it succeeds or fails for certain times. + + Returns + ------- + wrapper: Callable + The wrapper. + + Examples + -------- + >>> @retry + ... def func(): + ... raise RetrySignal("Failed") + """ + + def wrapper(*args: Any, **kwargs: Any) -> Any: + while True: + try: + return func(*args, **kwargs) + except RetrySignal: + log.info("Retry the entire method") + + return wrapper + + class AutoBatchSize(ABC): """This class allows DeePMD-kit to automatically decide the maximum batch size that will not cause an OOM error. @@ -75,6 +108,7 @@ def __init__(self, initial_batch_size: int = 1024, factor: float = 2.0) -> None: ) self.factor = factor + self.oom_retry_mode = False def execute( self, callable: Callable, start_index: int, natoms: int @@ -125,6 +159,8 @@ def execute( ) from e # adjust the next batch size self._adjust_batch_size(1.0 / self.factor) + if self.set_oom_retry_mode: + raise RetrySignal from e return 0, None else: n_tot = n_batch * natoms @@ -147,6 +183,7 @@ def _adjust_batch_size(self, factor: float) -> None: f"Adjust batch size from {old_batch_size} to {self.current_batch_size}" ) + @retry def execute_all( self, callable: Callable, @@ -281,3 +318,16 @@ def is_oom_error(self, e: Exception) -> bool: bool True if the exception is an OOM error """ + + def set_oom_retry_mode(self, enable: bool) -> None: + """Set OOM retry mode. + + In OOM retry mode, all data will be re-executed. + + Parameters + ---------- + enable : bool + True to enable OOM retry mode + """ + self.oom_retry_mode = enable + From 404d1ac71d49a339ebfaadd372d14973580ca92e Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 29 Jan 2026 15:42:01 +0000 Subject: [PATCH 02/12] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- deepmd/utils/batch_size.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/deepmd/utils/batch_size.py b/deepmd/utils/batch_size.py index 289b8b83a8..293f6aa92a 100644 --- a/deepmd/utils/batch_size.py +++ b/deepmd/utils/batch_size.py @@ -22,7 +22,6 @@ log = logging.getLogger(__name__) - class RetrySignal(Exception): """Signal to retry execution after OOM error.""" @@ -330,4 +329,3 @@ def set_oom_retry_mode(self, enable: bool) -> None: True to enable OOM retry mode """ self.oom_retry_mode = enable - From 01de666bee46632fa57bb66c006cbf388ef155b6 Mon Sep 17 00:00:00 2001 From: Jinzhe Zeng Date: Fri, 30 Jan 2026 00:19:48 +0800 Subject: [PATCH 03/12] apply Copilot's suggestions --- deepmd/pd/infer/deep_eval.py | 75 +++++++++++++++++++++++------------- deepmd/pt/infer/deep_eval.py | 75 +++++++++++++++++++++++------------- deepmd/utils/batch_size.py | 30 +-------------- 3 files changed, 99 insertions(+), 81 deletions(-) diff --git a/deepmd/pd/infer/deep_eval.py b/deepmd/pd/infer/deep_eval.py index 384869c2a7..10973e254a 100644 --- a/deepmd/pd/infer/deep_eval.py +++ b/deepmd/pd/infer/deep_eval.py @@ -10,6 +10,7 @@ ) import numpy as np +from deepmd.utils.batch_size import RetrySignal import paddle from paddle import inference as paddle_inference @@ -826,19 +827,30 @@ def eval_descriptor( if self.auto_batch_size is not None: self.auto_batch_size.set_oom_retry_mode(True) model.set_eval_descriptor_hook(True) - self.eval( - coords, - cells, - atom_types, - atomic=False, - fparam=fparam, - aparam=aparam, - **kwargs, - ) - descriptor = model.eval_descriptor() - model.set_eval_descriptor_hook(False) - if self.auto_batch_size is not None: - self.auto_batch_size.set_oom_retry_mode(False) + try: + self.eval( + coords, + cells, + atom_types, + atomic=False, + fparam=fparam, + aparam=aparam, + **kwargs, + ) + except RetrySignal: + return self.eval_descriptor( + coords, + cells, + atom_types, + fparam=fparam, + aparam=aparam, + **kwargs, + ) + finally: + descriptor = model.eval_descriptor() + model.set_eval_descriptor_hook(False) + if self.auto_batch_size is not None: + self.auto_batch_size.set_oom_retry_mode(False) return to_numpy_array(descriptor) def eval_fitting_last_layer( @@ -885,17 +897,28 @@ def eval_fitting_last_layer( if self.auto_batch_size is not None: self.auto_batch_size.set_oom_retry_mode(True) model.set_eval_fitting_last_layer_hook(True) - self.eval( - coords, - cells, - atom_types, - atomic=False, - fparam=fparam, - aparam=aparam, - **kwargs, - ) - fitting_net = model.eval_fitting_last_layer() - model.set_eval_fitting_last_layer_hook(False) - if self.auto_batch_size is not None: - self.auto_batch_size.set_oom_retry_mode(False) + try: + self.eval( + coords, + cells, + atom_types, + atomic=False, + fparam=fparam, + aparam=aparam, + **kwargs, + ) + except RetrySignal: + return self.eval_descriptor( + coords, + cells, + atom_types, + fparam=fparam, + aparam=aparam, + **kwargs, + ) + finally: + fitting_net = model.eval_fitting_last_layer() + model.set_eval_fitting_last_layer_hook(False) + if self.auto_batch_size is not None: + self.auto_batch_size.set_oom_retry_mode(False) return to_numpy_array(fitting_net) diff --git a/deepmd/pt/infer/deep_eval.py b/deepmd/pt/infer/deep_eval.py index 50fce3ccd2..1895919732 100644 --- a/deepmd/pt/infer/deep_eval.py +++ b/deepmd/pt/infer/deep_eval.py @@ -67,6 +67,7 @@ to_numpy_array, to_torch_tensor, ) +from deepmd.utils.batch_size import RetrySignal from deepmd.utils.econf_embd import ( sort_element_type, ) @@ -796,19 +797,30 @@ def eval_descriptor( if self.auto_batch_size is not None: self.auto_batch_size.set_oom_retry_mode(True) model.set_eval_descriptor_hook(True) - self.eval( - coords, - cells, - atom_types, - atomic=False, - fparam=fparam, - aparam=aparam, - **kwargs, - ) - descriptor = model.eval_descriptor() - model.set_eval_descriptor_hook(False) - if self.auto_batch_size is not None: - self.auto_batch_size.set_oom_retry_mode(False) + try: + self.eval( + coords, + cells, + atom_types, + atomic=False, + fparam=fparam, + aparam=aparam, + **kwargs, + ) + except RetrySignal: + return self.eval_descriptor( + coords, + cells, + atom_types, + fparam=fparam, + aparam=aparam, + **kwargs, + ) + finally: + descriptor = model.eval_descriptor() + model.set_eval_descriptor_hook(False) + if self.auto_batch_size is not None: + self.auto_batch_size.set_oom_retry_mode(False) return to_numpy_array(descriptor) def eval_fitting_last_layer( @@ -855,17 +867,28 @@ def eval_fitting_last_layer( if self.auto_batch_size is not None: self.auto_batch_size.set_oom_retry_mode(True) model.set_eval_fitting_last_layer_hook(True) - self.eval( - coords, - cells, - atom_types, - atomic=False, - fparam=fparam, - aparam=aparam, - **kwargs, - ) - fitting_net = model.eval_fitting_last_layer() - model.set_eval_fitting_last_layer_hook(False) - if self.auto_batch_size is not None: - self.auto_batch_size.set_oom_retry_mode(False) + try: + self.eval( + coords, + cells, + atom_types, + atomic=False, + fparam=fparam, + aparam=aparam, + **kwargs, + ) + except RetrySignal: + return self.eval_descriptor( + coords, + cells, + atom_types, + fparam=fparam, + aparam=aparam, + **kwargs, + ) + finally: + fitting_net = model.eval_fitting_last_layer() + model.set_eval_fitting_last_layer_hook(False) + if self.auto_batch_size is not None: + self.auto_batch_size.set_oom_retry_mode(False) return to_numpy_array(fitting_net) diff --git a/deepmd/utils/batch_size.py b/deepmd/utils/batch_size.py index 289b8b83a8..f8f96bc1ef 100644 --- a/deepmd/utils/batch_size.py +++ b/deepmd/utils/batch_size.py @@ -27,34 +27,6 @@ class RetrySignal(Exception): """Signal to retry execution after OOM error.""" -# originally copied from dpdispatcher -# https://github.com/deepmodeling/dpdispatcher/blob/9a76542311a02e84c4ae62f15b7edcd30850a64e/dpdispatcher/utils/utils.py#L161-L213 -# license: LGPL-3.0-or-later -def retry(func: Any) -> Callable: - """Decorator to retry the function until it succeeds or fails for certain times. - - Returns - ------- - wrapper: Callable - The wrapper. - - Examples - -------- - >>> @retry - ... def func(): - ... raise RetrySignal("Failed") - """ - - def wrapper(*args: Any, **kwargs: Any) -> Any: - while True: - try: - return func(*args, **kwargs) - except RetrySignal: - log.info("Retry the entire method") - - return wrapper - - class AutoBatchSize(ABC): """This class allows DeePMD-kit to automatically decide the maximum batch size that will not cause an OOM error. @@ -159,7 +131,7 @@ def execute( ) from e # adjust the next batch size self._adjust_batch_size(1.0 / self.factor) - if self.set_oom_retry_mode: + if self.oom_retry_mode: raise RetrySignal from e return 0, None else: From 72c4e367af614d9f5eaad52e0a1a5444ee0fda1f Mon Sep 17 00:00:00 2001 From: Jinzhe Zeng Date: Fri, 30 Jan 2026 00:21:03 +0800 Subject: [PATCH 04/12] rm retry --- deepmd/utils/batch_size.py | 1 - 1 file changed, 1 deletion(-) diff --git a/deepmd/utils/batch_size.py b/deepmd/utils/batch_size.py index 57dbd34585..82de03695c 100644 --- a/deepmd/utils/batch_size.py +++ b/deepmd/utils/batch_size.py @@ -154,7 +154,6 @@ def _adjust_batch_size(self, factor: float) -> None: f"Adjust batch size from {old_batch_size} to {self.current_batch_size}" ) - @retry def execute_all( self, callable: Callable, From f785d619397efa24b6fb1b3ed38252322edce81c Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 29 Jan 2026 16:22:15 +0000 Subject: [PATCH 05/12] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- deepmd/pd/infer/deep_eval.py | 4 +++- deepmd/pt/infer/deep_eval.py | 4 +++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/deepmd/pd/infer/deep_eval.py b/deepmd/pd/infer/deep_eval.py index 10973e254a..2465a2b0de 100644 --- a/deepmd/pd/infer/deep_eval.py +++ b/deepmd/pd/infer/deep_eval.py @@ -10,7 +10,6 @@ ) import numpy as np -from deepmd.utils.batch_size import RetrySignal import paddle from paddle import inference as paddle_inference @@ -65,6 +64,9 @@ to_numpy_array, to_paddle_tensor, ) +from deepmd.utils.batch_size import ( + RetrySignal, +) from deepmd.utils.econf_embd import ( sort_element_type, ) diff --git a/deepmd/pt/infer/deep_eval.py b/deepmd/pt/infer/deep_eval.py index 1895919732..be90fcea78 100644 --- a/deepmd/pt/infer/deep_eval.py +++ b/deepmd/pt/infer/deep_eval.py @@ -67,7 +67,9 @@ to_numpy_array, to_torch_tensor, ) -from deepmd.utils.batch_size import RetrySignal +from deepmd.utils.batch_size import ( + RetrySignal, +) from deepmd.utils.econf_embd import ( sort_element_type, ) From fb6fff570921a59a7dbb8dee09ec4465065c0b52 Mon Sep 17 00:00:00 2001 From: Jinzhe Zeng Date: Sat, 23 May 2026 06:52:23 +0800 Subject: [PATCH 06/12] Apply suggestions from code review Co-authored-by: A bot of @njzjz <48687836+njzjz-bot@users.noreply.github.com> Signed-off-by: Jinzhe Zeng --- deepmd/pd/infer/deep_eval.py | 6 +++--- deepmd/pt/infer/deep_eval.py | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/deepmd/pd/infer/deep_eval.py b/deepmd/pd/infer/deep_eval.py index 2465a2b0de..2d5e05097e 100644 --- a/deepmd/pd/infer/deep_eval.py +++ b/deepmd/pd/infer/deep_eval.py @@ -839,6 +839,7 @@ def eval_descriptor( aparam=aparam, **kwargs, ) + descriptor = model.eval_descriptor() except RetrySignal: return self.eval_descriptor( coords, @@ -849,7 +850,6 @@ def eval_descriptor( **kwargs, ) finally: - descriptor = model.eval_descriptor() model.set_eval_descriptor_hook(False) if self.auto_batch_size is not None: self.auto_batch_size.set_oom_retry_mode(False) @@ -909,8 +909,9 @@ def eval_fitting_last_layer( aparam=aparam, **kwargs, ) + fitting_net = model.eval_fitting_last_layer() except RetrySignal: - return self.eval_descriptor( + return self.eval_fitting_last_layer( coords, cells, atom_types, @@ -919,7 +920,6 @@ def eval_fitting_last_layer( **kwargs, ) finally: - fitting_net = model.eval_fitting_last_layer() model.set_eval_fitting_last_layer_hook(False) if self.auto_batch_size is not None: self.auto_batch_size.set_oom_retry_mode(False) diff --git a/deepmd/pt/infer/deep_eval.py b/deepmd/pt/infer/deep_eval.py index be90fcea78..9fffda2f23 100644 --- a/deepmd/pt/infer/deep_eval.py +++ b/deepmd/pt/infer/deep_eval.py @@ -809,6 +809,7 @@ def eval_descriptor( aparam=aparam, **kwargs, ) + descriptor = model.eval_descriptor() except RetrySignal: return self.eval_descriptor( coords, @@ -819,7 +820,6 @@ def eval_descriptor( **kwargs, ) finally: - descriptor = model.eval_descriptor() model.set_eval_descriptor_hook(False) if self.auto_batch_size is not None: self.auto_batch_size.set_oom_retry_mode(False) @@ -879,8 +879,9 @@ def eval_fitting_last_layer( aparam=aparam, **kwargs, ) + fitting_net = model.eval_fitting_last_layer() except RetrySignal: - return self.eval_descriptor( + return self.eval_fitting_last_layer( coords, cells, atom_types, @@ -889,7 +890,6 @@ def eval_fitting_last_layer( **kwargs, ) finally: - fitting_net = model.eval_fitting_last_layer() model.set_eval_fitting_last_layer_hook(False) if self.auto_batch_size is not None: self.auto_batch_size.set_oom_retry_mode(False) From 86a137adbd34a7c5110e6859d8d34060255b0051 Mon Sep 17 00:00:00 2001 From: "njzjz-bot (driven by OpenClaw (model: custom-chat-jinzhezeng-group/gpt-5.5))[bot]" <48687836+njzjz-bot@users.noreply.github.com> Date: Sat, 23 May 2026 16:20:58 +0000 Subject: [PATCH 07/12] test(infer): cover OOM retry hook cleanup Add regression coverage for the AutoBatchSize RetrySignal path and eval hook cleanup around retry and non-retry failures. Move recursive retries until after finally so hooks are cleared between attempts. Authored by OpenClaw (model: custom-chat-jinzhezeng-group/gpt-5.5) --- deepmd/pd/infer/deep_eval.py | 22 ++-- deepmd/pt/infer/deep_eval.py | 22 ++-- source/tests/common/test_oom_retry.py | 170 ++++++++++++++++++++++++++ 3 files changed, 198 insertions(+), 16 deletions(-) create mode 100644 source/tests/common/test_oom_retry.py diff --git a/deepmd/pd/infer/deep_eval.py b/deepmd/pd/infer/deep_eval.py index 2d5e05097e..12ff454b74 100644 --- a/deepmd/pd/infer/deep_eval.py +++ b/deepmd/pd/infer/deep_eval.py @@ -829,6 +829,7 @@ def eval_descriptor( if self.auto_batch_size is not None: self.auto_batch_size.set_oom_retry_mode(True) model.set_eval_descriptor_hook(True) + retry = False try: self.eval( coords, @@ -841,6 +842,12 @@ def eval_descriptor( ) descriptor = model.eval_descriptor() except RetrySignal: + retry = True + finally: + model.set_eval_descriptor_hook(False) + if self.auto_batch_size is not None: + self.auto_batch_size.set_oom_retry_mode(False) + if retry: return self.eval_descriptor( coords, cells, @@ -849,10 +856,6 @@ def eval_descriptor( aparam=aparam, **kwargs, ) - finally: - model.set_eval_descriptor_hook(False) - if self.auto_batch_size is not None: - self.auto_batch_size.set_oom_retry_mode(False) return to_numpy_array(descriptor) def eval_fitting_last_layer( @@ -899,6 +902,7 @@ def eval_fitting_last_layer( if self.auto_batch_size is not None: self.auto_batch_size.set_oom_retry_mode(True) model.set_eval_fitting_last_layer_hook(True) + retry = False try: self.eval( coords, @@ -911,6 +915,12 @@ def eval_fitting_last_layer( ) fitting_net = model.eval_fitting_last_layer() except RetrySignal: + retry = True + finally: + model.set_eval_fitting_last_layer_hook(False) + if self.auto_batch_size is not None: + self.auto_batch_size.set_oom_retry_mode(False) + if retry: return self.eval_fitting_last_layer( coords, cells, @@ -919,8 +929,4 @@ def eval_fitting_last_layer( aparam=aparam, **kwargs, ) - finally: - model.set_eval_fitting_last_layer_hook(False) - if self.auto_batch_size is not None: - self.auto_batch_size.set_oom_retry_mode(False) return to_numpy_array(fitting_net) diff --git a/deepmd/pt/infer/deep_eval.py b/deepmd/pt/infer/deep_eval.py index 9fffda2f23..de2803cab2 100644 --- a/deepmd/pt/infer/deep_eval.py +++ b/deepmd/pt/infer/deep_eval.py @@ -799,6 +799,7 @@ def eval_descriptor( if self.auto_batch_size is not None: self.auto_batch_size.set_oom_retry_mode(True) model.set_eval_descriptor_hook(True) + retry = False try: self.eval( coords, @@ -811,6 +812,12 @@ def eval_descriptor( ) descriptor = model.eval_descriptor() except RetrySignal: + retry = True + finally: + model.set_eval_descriptor_hook(False) + if self.auto_batch_size is not None: + self.auto_batch_size.set_oom_retry_mode(False) + if retry: return self.eval_descriptor( coords, cells, @@ -819,10 +826,6 @@ def eval_descriptor( aparam=aparam, **kwargs, ) - finally: - model.set_eval_descriptor_hook(False) - if self.auto_batch_size is not None: - self.auto_batch_size.set_oom_retry_mode(False) return to_numpy_array(descriptor) def eval_fitting_last_layer( @@ -869,6 +872,7 @@ def eval_fitting_last_layer( if self.auto_batch_size is not None: self.auto_batch_size.set_oom_retry_mode(True) model.set_eval_fitting_last_layer_hook(True) + retry = False try: self.eval( coords, @@ -881,6 +885,12 @@ def eval_fitting_last_layer( ) fitting_net = model.eval_fitting_last_layer() except RetrySignal: + retry = True + finally: + model.set_eval_fitting_last_layer_hook(False) + if self.auto_batch_size is not None: + self.auto_batch_size.set_oom_retry_mode(False) + if retry: return self.eval_fitting_last_layer( coords, cells, @@ -889,8 +899,4 @@ def eval_fitting_last_layer( aparam=aparam, **kwargs, ) - finally: - model.set_eval_fitting_last_layer_hook(False) - if self.auto_batch_size is not None: - self.auto_batch_size.set_oom_retry_mode(False) return to_numpy_array(fitting_net) diff --git a/source/tests/common/test_oom_retry.py b/source/tests/common/test_oom_retry.py new file mode 100644 index 0000000000..da95d9177d --- /dev/null +++ b/source/tests/common/test_oom_retry.py @@ -0,0 +1,170 @@ +# SPDX-License-Identifier: LGPL-3.0-or-later +import unittest +from typing import ( + Any, +) + +from deepmd.utils.batch_size import ( + AutoBatchSize, + RetrySignal, +) +from deepmd.utils.errors import ( + OutOfMemoryError, +) + + +class CustomizedAutoBatchSizeGPU(AutoBatchSize): + def is_gpu_available(self) -> bool: + return True + + def is_oom_error(self, e): + return isinstance(e, OutOfMemoryError) + + +class DummyAutoBatchSize: + def __init__(self) -> None: + self.oom_retry_mode = False + self.modes: list[bool] = [] + + def set_oom_retry_mode(self, enable: bool) -> None: + self.oom_retry_mode = enable + self.modes.append(enable) + + +class DummyModel: + def __init__(self) -> None: + self.descriptor_hook_calls: list[bool] = [] + self.fitting_hook_calls: list[bool] = [] + + def set_eval_descriptor_hook(self, enable: bool) -> None: + self.descriptor_hook_calls.append(enable) + + def set_eval_fitting_last_layer_hook(self, enable: bool) -> None: + self.fitting_hook_calls.append(enable) + + def eval_descriptor(self) -> list[int]: + return [1, 2, 3] + + def eval_fitting_last_layer(self) -> list[int]: + return [4, 5, 6] + + +class DummyDeepEval: + def __init__(self, fail_once: bool = False, runtime_error: bool = False) -> None: + self.auto_batch_size = DummyAutoBatchSize() + self.model = DummyModel() + self.dp = {"model": {"Default": self.model}} + self.fail_once = fail_once + self.runtime_error = runtime_error + self.eval_calls = 0 + + def eval(self, *args: Any, **kwargs: Any) -> None: + self.eval_calls += 1 + if self.runtime_error: + raise RuntimeError("non-retry failure") + if self.fail_once and self.eval_calls == 1: + raise RetrySignal + + def eval_descriptor(self, *args: Any, **kwargs: Any) -> list[int]: + model = self.dp["model"]["Default"] + if self.auto_batch_size is not None: + self.auto_batch_size.set_oom_retry_mode(True) + model.set_eval_descriptor_hook(True) + retry = False + try: + self.eval(*args, **kwargs) + descriptor = model.eval_descriptor() + except RetrySignal: + retry = True + finally: + model.set_eval_descriptor_hook(False) + if self.auto_batch_size is not None: + self.auto_batch_size.set_oom_retry_mode(False) + if retry: + return self.eval_descriptor(*args, **kwargs) + return descriptor + + def eval_fitting_last_layer(self, *args: Any, **kwargs: Any) -> list[int]: + model = self.dp["model"]["Default"] + if self.auto_batch_size is not None: + self.auto_batch_size.set_oom_retry_mode(True) + model.set_eval_fitting_last_layer_hook(True) + retry = False + try: + self.eval(*args, **kwargs) + fitting = model.eval_fitting_last_layer() + except RetrySignal: + retry = True + finally: + model.set_eval_fitting_last_layer_hook(False) + if self.auto_batch_size is not None: + self.auto_batch_size.set_oom_retry_mode(False) + if retry: + return self.eval_fitting_last_layer(*args, **kwargs) + return fitting + + +class TestOOMRetry(unittest.TestCase): + def test_execute_oom_retry_mode_raises_retry_signal(self) -> None: + auto_batch_size = CustomizedAutoBatchSizeGPU(256, 2.0) + + oom = OutOfMemoryError("oom") + + def executor(batch_size: int, start_index: int) -> tuple[int, None]: + raise oom + + auto_batch_size.set_oom_retry_mode(True) + with self.assertRaises(RetrySignal) as context: + auto_batch_size.execute(executor, 0, 1) + self.assertIs(context.exception.__cause__, oom) + self.assertEqual(auto_batch_size.current_batch_size, 128) + + def test_execute_oom_retry_mode_false_returns_zero(self) -> None: + auto_batch_size = CustomizedAutoBatchSizeGPU(256, 2.0) + + def executor(batch_size: int, start_index: int) -> tuple[int, None]: + raise OutOfMemoryError("oom") + + auto_batch_size.set_oom_retry_mode(False) + n_batch, result = auto_batch_size.execute(executor, 0, 1) + self.assertEqual(n_batch, 0) + self.assertIsNone(result) + self.assertEqual(auto_batch_size.current_batch_size, 128) + + def test_eval_descriptor_retry_clears_hook_between_attempts(self) -> None: + deep_eval = DummyDeepEval(fail_once=True) + self.assertEqual(deep_eval.eval_descriptor(), [1, 2, 3]) + self.assertEqual(deep_eval.eval_calls, 2) + self.assertEqual( + deep_eval.model.descriptor_hook_calls, + [True, False, True, False], + ) + self.assertFalse(deep_eval.auto_batch_size.oom_retry_mode) + + def test_eval_fitting_last_layer_retry_clears_hook_between_attempts(self) -> None: + deep_eval = DummyDeepEval(fail_once=True) + self.assertEqual(deep_eval.eval_fitting_last_layer(), [4, 5, 6]) + self.assertEqual(deep_eval.eval_calls, 2) + self.assertEqual( + deep_eval.model.fitting_hook_calls, + [True, False, True, False], + ) + self.assertFalse(deep_eval.auto_batch_size.oom_retry_mode) + + def test_eval_descriptor_runtime_error_clears_state(self) -> None: + deep_eval = DummyDeepEval(runtime_error=True) + with self.assertRaisesRegex(RuntimeError, "non-retry failure"): + deep_eval.eval_descriptor() + self.assertEqual(deep_eval.model.descriptor_hook_calls, [True, False]) + self.assertFalse(deep_eval.auto_batch_size.oom_retry_mode) + + def test_eval_fitting_last_layer_runtime_error_clears_state(self) -> None: + deep_eval = DummyDeepEval(runtime_error=True) + with self.assertRaisesRegex(RuntimeError, "non-retry failure"): + deep_eval.eval_fitting_last_layer() + self.assertEqual(deep_eval.model.fitting_hook_calls, [True, False]) + self.assertFalse(deep_eval.auto_batch_size.oom_retry_mode) + + +if __name__ == "__main__": + unittest.main() From b5f789aebddf96f097ced45b4393cfff7948950c Mon Sep 17 00:00:00 2001 From: "njzjz-bot (driven by OpenClaw (model: custom-chat-jinzhezeng-group/gpt-5.5))[bot]" <48687836+njzjz-bot@users.noreply.github.com> Date: Sun, 24 May 2026 05:48:24 +0000 Subject: [PATCH 08/12] fix(infer): use iterative OOM hook retries Avoid recursive RetrySignal handling in eval_descriptor and eval_fitting_last_layer so repeated OOM retries do not consume Python stack frames. The loop still clears hook and retry state between attempts before retrying. Authored by OpenClaw (model: custom-chat-jinzhezeng-group/gpt-5.5) --- deepmd/pd/infer/deep_eval.py | 106 +++++++++++++++-------------------- deepmd/pt/infer/deep_eval.py | 106 +++++++++++++++-------------------- 2 files changed, 92 insertions(+), 120 deletions(-) diff --git a/deepmd/pd/infer/deep_eval.py b/deepmd/pd/infer/deep_eval.py index 12ff454b74..72dcc2bff0 100644 --- a/deepmd/pd/infer/deep_eval.py +++ b/deepmd/pd/infer/deep_eval.py @@ -826,37 +826,30 @@ def eval_descriptor( model = ( self.dp.model["Default"] if isinstance(self.dp, ModelWrapper) else self.dp ) - if self.auto_batch_size is not None: - self.auto_batch_size.set_oom_retry_mode(True) - model.set_eval_descriptor_hook(True) - retry = False - try: - self.eval( - coords, - cells, - atom_types, - atomic=False, - fparam=fparam, - aparam=aparam, - **kwargs, - ) - descriptor = model.eval_descriptor() - except RetrySignal: - retry = True - finally: - model.set_eval_descriptor_hook(False) + while True: if self.auto_batch_size is not None: - self.auto_batch_size.set_oom_retry_mode(False) - if retry: - return self.eval_descriptor( - coords, - cells, - atom_types, - fparam=fparam, - aparam=aparam, - **kwargs, - ) - return to_numpy_array(descriptor) + self.auto_batch_size.set_oom_retry_mode(True) + model.set_eval_descriptor_hook(True) + retry = False + try: + self.eval( + coords, + cells, + atom_types, + atomic=False, + fparam=fparam, + aparam=aparam, + **kwargs, + ) + descriptor = model.eval_descriptor() + except RetrySignal: + retry = True + finally: + model.set_eval_descriptor_hook(False) + if self.auto_batch_size is not None: + self.auto_batch_size.set_oom_retry_mode(False) + if not retry: + return to_numpy_array(descriptor) def eval_fitting_last_layer( self, @@ -899,34 +892,27 @@ def eval_fitting_last_layer( Fitting output before last layer. """ model = self.dp.model["Default"] - if self.auto_batch_size is not None: - self.auto_batch_size.set_oom_retry_mode(True) - model.set_eval_fitting_last_layer_hook(True) - retry = False - try: - self.eval( - coords, - cells, - atom_types, - atomic=False, - fparam=fparam, - aparam=aparam, - **kwargs, - ) - fitting_net = model.eval_fitting_last_layer() - except RetrySignal: - retry = True - finally: - model.set_eval_fitting_last_layer_hook(False) + while True: if self.auto_batch_size is not None: - self.auto_batch_size.set_oom_retry_mode(False) - if retry: - return self.eval_fitting_last_layer( - coords, - cells, - atom_types, - fparam=fparam, - aparam=aparam, - **kwargs, - ) - return to_numpy_array(fitting_net) + self.auto_batch_size.set_oom_retry_mode(True) + model.set_eval_fitting_last_layer_hook(True) + retry = False + try: + self.eval( + coords, + cells, + atom_types, + atomic=False, + fparam=fparam, + aparam=aparam, + **kwargs, + ) + fitting_net = model.eval_fitting_last_layer() + except RetrySignal: + retry = True + finally: + model.set_eval_fitting_last_layer_hook(False) + if self.auto_batch_size is not None: + self.auto_batch_size.set_oom_retry_mode(False) + if not retry: + return to_numpy_array(fitting_net) diff --git a/deepmd/pt/infer/deep_eval.py b/deepmd/pt/infer/deep_eval.py index de2803cab2..e48b26057e 100644 --- a/deepmd/pt/infer/deep_eval.py +++ b/deepmd/pt/infer/deep_eval.py @@ -796,37 +796,30 @@ def eval_descriptor( Descriptors. """ model = self.dp.model["Default"] - if self.auto_batch_size is not None: - self.auto_batch_size.set_oom_retry_mode(True) - model.set_eval_descriptor_hook(True) - retry = False - try: - self.eval( - coords, - cells, - atom_types, - atomic=False, - fparam=fparam, - aparam=aparam, - **kwargs, - ) - descriptor = model.eval_descriptor() - except RetrySignal: - retry = True - finally: - model.set_eval_descriptor_hook(False) + while True: if self.auto_batch_size is not None: - self.auto_batch_size.set_oom_retry_mode(False) - if retry: - return self.eval_descriptor( - coords, - cells, - atom_types, - fparam=fparam, - aparam=aparam, - **kwargs, - ) - return to_numpy_array(descriptor) + self.auto_batch_size.set_oom_retry_mode(True) + model.set_eval_descriptor_hook(True) + retry = False + try: + self.eval( + coords, + cells, + atom_types, + atomic=False, + fparam=fparam, + aparam=aparam, + **kwargs, + ) + descriptor = model.eval_descriptor() + except RetrySignal: + retry = True + finally: + model.set_eval_descriptor_hook(False) + if self.auto_batch_size is not None: + self.auto_batch_size.set_oom_retry_mode(False) + if not retry: + return to_numpy_array(descriptor) def eval_fitting_last_layer( self, @@ -869,34 +862,27 @@ def eval_fitting_last_layer( Fitting output before last layer. """ model = self.dp.model["Default"] - if self.auto_batch_size is not None: - self.auto_batch_size.set_oom_retry_mode(True) - model.set_eval_fitting_last_layer_hook(True) - retry = False - try: - self.eval( - coords, - cells, - atom_types, - atomic=False, - fparam=fparam, - aparam=aparam, - **kwargs, - ) - fitting_net = model.eval_fitting_last_layer() - except RetrySignal: - retry = True - finally: - model.set_eval_fitting_last_layer_hook(False) + while True: if self.auto_batch_size is not None: - self.auto_batch_size.set_oom_retry_mode(False) - if retry: - return self.eval_fitting_last_layer( - coords, - cells, - atom_types, - fparam=fparam, - aparam=aparam, - **kwargs, - ) - return to_numpy_array(fitting_net) + self.auto_batch_size.set_oom_retry_mode(True) + model.set_eval_fitting_last_layer_hook(True) + retry = False + try: + self.eval( + coords, + cells, + atom_types, + atomic=False, + fparam=fparam, + aparam=aparam, + **kwargs, + ) + fitting_net = model.eval_fitting_last_layer() + except RetrySignal: + retry = True + finally: + model.set_eval_fitting_last_layer_hook(False) + if self.auto_batch_size is not None: + self.auto_batch_size.set_oom_retry_mode(False) + if not retry: + return to_numpy_array(fitting_net) From 31e42cb7ba536791c5f4acb50abcf8171b6e8061 Mon Sep 17 00:00:00 2001 From: "njzjz-bot (driven by OpenClaw (model: custom-chat-jinzhezeng-group/gpt-5.5))[bot]" <48687836+njzjz-bot@users.noreply.github.com> Date: Mon, 25 May 2026 11:31:11 +0000 Subject: [PATCH 09/12] test(oom): exercise production eval retry paths Replace the hand-written DummyDeepEval orchestration with real PT/PD DeepEval instances constructed through __new__, while mocking their dependencies. This keeps the AutoBatchSize test intact and makes retry/finally assertions pin the production eval_descriptor and eval_fitting_last_layer methods. Authored by OpenClaw (model: custom-chat-jinzhezeng-group/gpt-5.5) --- source/tests/common/test_oom_retry.py | 238 +++++++++++++++----------- 1 file changed, 140 insertions(+), 98 deletions(-) diff --git a/source/tests/common/test_oom_retry.py b/source/tests/common/test_oom_retry.py index da95d9177d..c7da33ecae 100644 --- a/source/tests/common/test_oom_retry.py +++ b/source/tests/common/test_oom_retry.py @@ -1,8 +1,16 @@ # SPDX-License-Identifier: LGPL-3.0-or-later import unittest +from types import SimpleNamespace from typing import ( Any, ) +from unittest.mock import ( + MagicMock, + call, + patch, +) + +import numpy as np from deepmd.utils.batch_size import ( AutoBatchSize, @@ -31,79 +39,6 @@ def set_oom_retry_mode(self, enable: bool) -> None: self.modes.append(enable) -class DummyModel: - def __init__(self) -> None: - self.descriptor_hook_calls: list[bool] = [] - self.fitting_hook_calls: list[bool] = [] - - def set_eval_descriptor_hook(self, enable: bool) -> None: - self.descriptor_hook_calls.append(enable) - - def set_eval_fitting_last_layer_hook(self, enable: bool) -> None: - self.fitting_hook_calls.append(enable) - - def eval_descriptor(self) -> list[int]: - return [1, 2, 3] - - def eval_fitting_last_layer(self) -> list[int]: - return [4, 5, 6] - - -class DummyDeepEval: - def __init__(self, fail_once: bool = False, runtime_error: bool = False) -> None: - self.auto_batch_size = DummyAutoBatchSize() - self.model = DummyModel() - self.dp = {"model": {"Default": self.model}} - self.fail_once = fail_once - self.runtime_error = runtime_error - self.eval_calls = 0 - - def eval(self, *args: Any, **kwargs: Any) -> None: - self.eval_calls += 1 - if self.runtime_error: - raise RuntimeError("non-retry failure") - if self.fail_once and self.eval_calls == 1: - raise RetrySignal - - def eval_descriptor(self, *args: Any, **kwargs: Any) -> list[int]: - model = self.dp["model"]["Default"] - if self.auto_batch_size is not None: - self.auto_batch_size.set_oom_retry_mode(True) - model.set_eval_descriptor_hook(True) - retry = False - try: - self.eval(*args, **kwargs) - descriptor = model.eval_descriptor() - except RetrySignal: - retry = True - finally: - model.set_eval_descriptor_hook(False) - if self.auto_batch_size is not None: - self.auto_batch_size.set_oom_retry_mode(False) - if retry: - return self.eval_descriptor(*args, **kwargs) - return descriptor - - def eval_fitting_last_layer(self, *args: Any, **kwargs: Any) -> list[int]: - model = self.dp["model"]["Default"] - if self.auto_batch_size is not None: - self.auto_batch_size.set_oom_retry_mode(True) - model.set_eval_fitting_last_layer_hook(True) - retry = False - try: - self.eval(*args, **kwargs) - fitting = model.eval_fitting_last_layer() - except RetrySignal: - retry = True - finally: - model.set_eval_fitting_last_layer_hook(False) - if self.auto_batch_size is not None: - self.auto_batch_size.set_oom_retry_mode(False) - if retry: - return self.eval_fitting_last_layer(*args, **kwargs) - return fitting - - class TestOOMRetry(unittest.TestCase): def test_execute_oom_retry_mode_raises_retry_signal(self) -> None: auto_batch_size = CustomizedAutoBatchSizeGPU(256, 2.0) @@ -131,39 +66,146 @@ def executor(batch_size: int, start_index: int) -> tuple[int, None]: self.assertIsNone(result) self.assertEqual(auto_batch_size.current_batch_size, 128) - def test_eval_descriptor_retry_clears_hook_between_attempts(self) -> None: - deep_eval = DummyDeepEval(fail_once=True) - self.assertEqual(deep_eval.eval_descriptor(), [1, 2, 3]) - self.assertEqual(deep_eval.eval_calls, 2) + def _make_backend(self, backend: str, method_name: str) -> tuple[Any, MagicMock]: + try: + if backend == "pt": + from deepmd.pt.infer.deep_eval import DeepEval + else: + from deepmd.pd.infer.deep_eval import DeepEval + except ModuleNotFoundError as exc: + self.skipTest(f"{backend} backend dependencies are unavailable: {exc}") + + abstract_methods = getattr(DeepEval, "__abstractmethods__", frozenset()) + try: + DeepEval.__abstractmethods__ = frozenset() + deep_eval = object.__new__(DeepEval) + finally: + DeepEval.__abstractmethods__ = abstract_methods + + model = MagicMock() + model.eval_descriptor.return_value = np.array([1, 2, 3]) + model.eval_fitting_last_layer.return_value = np.array([4, 5, 6]) + + if backend == "pd" and method_name == "eval_descriptor": + # Paddle eval_descriptor accepts either a ModelWrapper or a direct model. + deep_eval.dp = model + else: + deep_eval.dp = SimpleNamespace(model={"Default": model}) + deep_eval.auto_batch_size = DummyAutoBatchSize() + return deep_eval, model + + def _assert_retry_clears_hook_between_attempts( + self, + backend: str, + method_name: str, + hook_name: str, + expected: np.ndarray, + ) -> None: + deep_eval, model = self._make_backend(backend, method_name) + with patch.object( + deep_eval, "eval", side_effect=[RetrySignal, None] + ) as eval_mock: + result = getattr(deep_eval, method_name)( + coords=np.zeros((3, 1, 3)), + cells=None, + atom_types=np.array([0]), + ) + self.assertEqual(eval_mock.call_count, 2) + np.testing.assert_array_equal(result, expected) self.assertEqual( - deep_eval.model.descriptor_hook_calls, - [True, False, True, False], + getattr(model, hook_name).call_args_list, + [call(True), call(False), call(True), call(False)], ) self.assertFalse(deep_eval.auto_batch_size.oom_retry_mode) - - def test_eval_fitting_last_layer_retry_clears_hook_between_attempts(self) -> None: - deep_eval = DummyDeepEval(fail_once=True) - self.assertEqual(deep_eval.eval_fitting_last_layer(), [4, 5, 6]) - self.assertEqual(deep_eval.eval_calls, 2) + self.assertEqual(deep_eval.auto_batch_size.modes, [True, False, True, False]) + + def _assert_runtime_error_clears_state( + self, + backend: str, + method_name: str, + hook_name: str, + ) -> None: + deep_eval, model = self._make_backend(backend, method_name) + with patch.object( + deep_eval, + "eval", + side_effect=RuntimeError("non-retry failure"), + ): + with self.assertRaisesRegex(RuntimeError, "non-retry failure"): + getattr(deep_eval, method_name)( + coords=np.zeros((3, 1, 3)), + cells=None, + atom_types=np.array([0]), + ) self.assertEqual( - deep_eval.model.fitting_hook_calls, - [True, False, True, False], + getattr(model, hook_name).call_args_list, [call(True), call(False)] ) self.assertFalse(deep_eval.auto_batch_size.oom_retry_mode) + self.assertEqual(deep_eval.auto_batch_size.modes, [True, False]) + + def test_pt_eval_descriptor_retry_clears_hook_between_attempts(self) -> None: + self._assert_retry_clears_hook_between_attempts( + "pt", + "eval_descriptor", + "set_eval_descriptor_hook", + np.array([1, 2, 3]), + ) - def test_eval_descriptor_runtime_error_clears_state(self) -> None: - deep_eval = DummyDeepEval(runtime_error=True) - with self.assertRaisesRegex(RuntimeError, "non-retry failure"): - deep_eval.eval_descriptor() - self.assertEqual(deep_eval.model.descriptor_hook_calls, [True, False]) - self.assertFalse(deep_eval.auto_batch_size.oom_retry_mode) + def test_pt_eval_fitting_last_layer_retry_clears_hook_between_attempts( + self, + ) -> None: + self._assert_retry_clears_hook_between_attempts( + "pt", + "eval_fitting_last_layer", + "set_eval_fitting_last_layer_hook", + np.array([4, 5, 6]), + ) - def test_eval_fitting_last_layer_runtime_error_clears_state(self) -> None: - deep_eval = DummyDeepEval(runtime_error=True) - with self.assertRaisesRegex(RuntimeError, "non-retry failure"): - deep_eval.eval_fitting_last_layer() - self.assertEqual(deep_eval.model.fitting_hook_calls, [True, False]) - self.assertFalse(deep_eval.auto_batch_size.oom_retry_mode) + def test_pd_eval_descriptor_retry_clears_hook_between_attempts(self) -> None: + self._assert_retry_clears_hook_between_attempts( + "pd", + "eval_descriptor", + "set_eval_descriptor_hook", + np.array([1, 2, 3]), + ) + + def test_pd_eval_fitting_last_layer_retry_clears_hook_between_attempts( + self, + ) -> None: + self._assert_retry_clears_hook_between_attempts( + "pd", + "eval_fitting_last_layer", + "set_eval_fitting_last_layer_hook", + np.array([4, 5, 6]), + ) + + def test_pt_eval_descriptor_runtime_error_clears_state(self) -> None: + self._assert_runtime_error_clears_state( + "pt", + "eval_descriptor", + "set_eval_descriptor_hook", + ) + + def test_pt_eval_fitting_last_layer_runtime_error_clears_state(self) -> None: + self._assert_runtime_error_clears_state( + "pt", + "eval_fitting_last_layer", + "set_eval_fitting_last_layer_hook", + ) + + def test_pd_eval_descriptor_runtime_error_clears_state(self) -> None: + self._assert_runtime_error_clears_state( + "pd", + "eval_descriptor", + "set_eval_descriptor_hook", + ) + + def test_pd_eval_fitting_last_layer_runtime_error_clears_state(self) -> None: + self._assert_runtime_error_clears_state( + "pd", + "eval_fitting_last_layer", + "set_eval_fitting_last_layer_hook", + ) if __name__ == "__main__": From a80ce640601d24cdbf6d5e601c515abc6e8a9496 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 25 May 2026 11:34:48 +0000 Subject: [PATCH 10/12] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- source/tests/common/test_oom_retry.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/source/tests/common/test_oom_retry.py b/source/tests/common/test_oom_retry.py index c7da33ecae..82310b0545 100644 --- a/source/tests/common/test_oom_retry.py +++ b/source/tests/common/test_oom_retry.py @@ -1,6 +1,8 @@ # SPDX-License-Identifier: LGPL-3.0-or-later import unittest -from types import SimpleNamespace +from types import ( + SimpleNamespace, +) from typing import ( Any, ) @@ -69,9 +71,13 @@ def executor(batch_size: int, start_index: int) -> tuple[int, None]: def _make_backend(self, backend: str, method_name: str) -> tuple[Any, MagicMock]: try: if backend == "pt": - from deepmd.pt.infer.deep_eval import DeepEval + from deepmd.pt.infer.deep_eval import ( + DeepEval, + ) else: - from deepmd.pd.infer.deep_eval import DeepEval + from deepmd.pd.infer.deep_eval import ( + DeepEval, + ) except ModuleNotFoundError as exc: self.skipTest(f"{backend} backend dependencies are unavailable: {exc}") From a1ba195a6bfd0577a8508f14d24b916d62086ae0 Mon Sep 17 00:00:00 2001 From: Jinzhe Zeng Date: Mon, 25 May 2026 20:26:55 +0800 Subject: [PATCH 11/12] Potential fix for pull request finding Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com> Signed-off-by: Jinzhe Zeng --- deepmd/utils/batch_size.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/deepmd/utils/batch_size.py b/deepmd/utils/batch_size.py index 82de03695c..a38eed0224 100644 --- a/deepmd/utils/batch_size.py +++ b/deepmd/utils/batch_size.py @@ -292,7 +292,12 @@ def is_oom_error(self, e: Exception) -> bool: def set_oom_retry_mode(self, enable: bool) -> None: """Set OOM retry mode. - In OOM retry mode, all data will be re-executed. + In OOM retry mode, an OOM during execution may reduce the current + batch size and raise :class:`RetrySignal` to indicate that execution + should be retried. + + Callers that want all data to be re-executed must catch + :class:`RetrySignal` and restart the full evaluation themselves. Parameters ---------- From bd871c3394913a30418fbb2966b9d890644ab69b Mon Sep 17 00:00:00 2001 From: "njzjz-bot (driven by OpenClaw (model: custom-chat-jinzhezeng-group/gpt-5.5))[bot]" <48687836+njzjz-bot@users.noreply.github.com> Date: Mon, 25 May 2026 15:36:00 +0000 Subject: [PATCH 12/12] test(oom): return floating mock outputs Production eval helpers convert model outputs with backend precision handling, which rejects integer arrays. Use floating arrays in the mocked descriptor and fitting outputs so the production PT/PD retry tests exercise the intended cleanup path. Authored by OpenClaw (model: custom-chat-jinzhezeng-group/gpt-5.5) --- source/tests/common/test_oom_retry.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/source/tests/common/test_oom_retry.py b/source/tests/common/test_oom_retry.py index c7da33ecae..38b803ff6d 100644 --- a/source/tests/common/test_oom_retry.py +++ b/source/tests/common/test_oom_retry.py @@ -83,8 +83,8 @@ def _make_backend(self, backend: str, method_name: str) -> tuple[Any, MagicMock] DeepEval.__abstractmethods__ = abstract_methods model = MagicMock() - model.eval_descriptor.return_value = np.array([1, 2, 3]) - model.eval_fitting_last_layer.return_value = np.array([4, 5, 6]) + model.eval_descriptor.return_value = np.array([1.0, 2.0, 3.0]) + model.eval_fitting_last_layer.return_value = np.array([4.0, 5.0, 6.0]) if backend == "pd" and method_name == "eval_descriptor": # Paddle eval_descriptor accepts either a ModelWrapper or a direct model. @@ -148,7 +148,7 @@ def test_pt_eval_descriptor_retry_clears_hook_between_attempts(self) -> None: "pt", "eval_descriptor", "set_eval_descriptor_hook", - np.array([1, 2, 3]), + np.array([1.0, 2.0, 3.0]), ) def test_pt_eval_fitting_last_layer_retry_clears_hook_between_attempts( @@ -158,7 +158,7 @@ def test_pt_eval_fitting_last_layer_retry_clears_hook_between_attempts( "pt", "eval_fitting_last_layer", "set_eval_fitting_last_layer_hook", - np.array([4, 5, 6]), + np.array([4.0, 5.0, 6.0]), ) def test_pd_eval_descriptor_retry_clears_hook_between_attempts(self) -> None: @@ -166,7 +166,7 @@ def test_pd_eval_descriptor_retry_clears_hook_between_attempts(self) -> None: "pd", "eval_descriptor", "set_eval_descriptor_hook", - np.array([1, 2, 3]), + np.array([1.0, 2.0, 3.0]), ) def test_pd_eval_fitting_last_layer_retry_clears_hook_between_attempts( @@ -176,7 +176,7 @@ def test_pd_eval_fitting_last_layer_retry_clears_hook_between_attempts( "pd", "eval_fitting_last_layer", "set_eval_fitting_last_layer_hook", - np.array([4, 5, 6]), + np.array([4.0, 5.0, 6.0]), ) def test_pt_eval_descriptor_runtime_error_clears_state(self) -> None: