diff --git a/CHANGELOG.md b/CHANGELOG.md index e7db2a263..4ac3dd171 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,6 +11,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Local type handler registries. - Expose `merge_trees` publicly: this function can be use to merge trees into a single tree using a comprehensive recursive strategy +- #v1 Add `DeletionOptions` to configure V1 Checkpointer's checkpoint deletion +behavior. ### Changed - The PyPi `orbax` package is deprecated in favor of domain-specific namespace diff --git a/checkpoint/orbax/checkpoint/experimental/v1/_src/context/context.py b/checkpoint/orbax/checkpoint/experimental/v1/_src/context/context.py index a5bc38590..ea6d9ba7d 100644 --- a/checkpoint/orbax/checkpoint/experimental/v1/_src/context/context.py +++ b/checkpoint/orbax/checkpoint/experimental/v1/_src/context/context.py @@ -115,6 +115,7 @@ def __init__( checkpointables_options: options_lib.CheckpointablesOptions | None = None, pathways_options: options_lib.PathwaysOptions | None = None, checkpoint_layout: options_lib.CheckpointLayout | None = None, + deletion_options: options_lib.DeletionOptions | None = None, ): self._pytree_options = pytree_options or ( context.pytree_options if context else options_lib.PyTreeOptions() @@ -146,6 +147,9 @@ def __init__( if context else options_lib.CheckpointLayout.ORBAX ) + self._deletion_options = deletion_options or ( + context.deletion_options if context else options_lib.DeletionOptions() + ) @property def pytree_options(self) -> options_lib.PyTreeOptions: @@ -179,6 +183,10 @@ def pathways_options(self) -> options_lib.PathwaysOptions: def checkpoint_layout(self) -> options_lib.CheckpointLayout: return self._checkpoint_layout + @property + def deletion_options(self) -> options_lib.DeletionOptions: + return self._deletion_options + def operation_id(self) -> str: return synchronization.OperationIdGenerator.get_current_operation_id() diff --git 
a/checkpoint/orbax/checkpoint/experimental/v1/_src/context/options.py b/checkpoint/orbax/checkpoint/experimental/v1/_src/context/options.py index 545f30ca0..f9fcaeb93 100644 --- a/checkpoint/orbax/checkpoint/experimental/v1/_src/context/options.py +++ b/checkpoint/orbax/checkpoint/experimental/v1/_src/context/options.py @@ -472,6 +472,35 @@ class PathwaysOptions: checkpointing_impl: pathways_types.CheckpointingImpl | None = None +@dataclasses.dataclass(frozen=True, kw_only=True) +class DeletionOptions: + """Options used to configure checkpoint deletion behavior. + + Attributes: + todelete_subdir: If set, checkpoints to be deleted will only be renamed into + a subdirectory with the provided name. Otherwise, they will be directly + deleted from the file system. Useful if checkpoint deletion is time + consuming. By default, delete the checkpoint assets. Ignored if the file + system is Google Cloud Storage (directory is prefixed with gs://). + todelete_full_path: Specifies a path relative to the bucket root for + "soft-deleting" checkpoints on Google Cloud Storage (GCS). Instead of + being permanently removed, checkpoints are moved to this new location + within the same bucket. For instance, if a checkpoint is in + gs://my-bucket/experiments/run1/, providing the value trash/ will move a + deleted step to gs://my-bucket/trash/. Useful when direct + deletion is time consuming. It gathers all deleted items in a centralized + path for future cleanup. + enable_background_delete: If True, old checkpoint deletions will be done in + a background thread; otherwise, they are done at the end of each save. + When it's enabled, make sure to call Checkpointer.close() or use + context to make sure all old steps are deleted before exit. + """ + + todelete_subdir: str | None = None + todelete_full_path: str | None = None + enable_background_delete: bool = False + + class CheckpointLayout(enum.Enum): """The layout of the checkpoint. 
diff --git a/checkpoint/orbax/checkpoint/experimental/v1/_src/training/checkpointer.py b/checkpoint/orbax/checkpoint/experimental/v1/_src/training/checkpointer.py index dc4f6a957..c6da631ea 100644 --- a/checkpoint/orbax/checkpoint/experimental/v1/_src/training/checkpointer.py +++ b/checkpoint/orbax/checkpoint/experimental/v1/_src/training/checkpointer.py @@ -169,8 +169,9 @@ def __init__( preservation_policy=preservation_policy, step_name_format=step_name_format, max_to_keep=None, # Unlimited. - # TODO(b/401541834) Configure todelete_subdir. - # TODO(b/401541834) Enable background deletion. + todelete_subdir=context.deletion_options.todelete_subdir, + todelete_full_path=context.deletion_options.todelete_full_path, + enable_background_delete=context.deletion_options.enable_background_delete, async_options=context.async_options.v0(), file_options=context.file_options.v0(), multiprocessing_options=context.multiprocessing_options.v0(),