|
import inspect
import logging
from pathlib import Path
from typing import Optional, Tuple

import torch
from transformers import PreTrainedModel

from diffusionLM.model.transformers_model import DiffusionConfig, DiffusionLLM
| 8 | + |
| 9 | +logger = logging.getLogger(__name__) |
| 10 | + |
class ModelSaveError(Exception):
    """Raised when a model checkpoint cannot be saved or loaded."""
| 14 | + |
def save_model(
    model: DiffusionLLM,
    optimizer: torch.optim.Optimizer,
    save_path: str,
    final: bool = False,
) -> None:
    """Save model and optimizer state to a checkpoint file.

    Args:
        model: Model to save. Its ``config.__dict__`` and (optional)
            ``current_step`` attribute are stored alongside the weights.
        optimizer: Optimizer whose state dict accompanies the model.
        save_path: Directory in which the checkpoint file is written
            (created if it does not exist).
        final: If True, name the file ``final_model.pt`` instead of
            ``step_<n>_model.pt``.

    Raises:
        ModelSaveError: If the checkpoint could not be written.
    """
    try:
        save_dir = Path(save_path)
        save_dir.mkdir(parents=True, exist_ok=True)

        # Default to step 1 when the model does not track training progress.
        step = getattr(model, "current_step", 1)
        prefix = "final" if final else f"step_{step}"
        save_name = save_dir / f"{prefix}_model.pt"

        torch.save(
            {
                "model_state_dict": model.state_dict(),
                "optimizer_state_dict": optimizer.state_dict(),
                "step": step,
                "config": model.config.__dict__,
            },
            save_name,
        )
        # Lazy %-style args avoid formatting when the level is disabled.
        logger.info("Model saved to %s", save_name)

    except Exception as e:
        # logger.exception records the traceback; chain the cause so callers
        # can still inspect the underlying error.
        logger.exception("Failed to save model")
        raise ModelSaveError(f"Failed to save model: {str(e)}") from e
| 45 | + |
def load_model(
    load_path: str,
    device: Optional[torch.device] = None,
) -> Tuple[DiffusionLLM, torch.optim.Optimizer]:
    """Load a model and a fresh AdamW optimizer from a saved checkpoint.

    Args:
        load_path: Path to a checkpoint file produced by :func:`save_model`.
        device: Target device; defaults to CUDA when available, else CPU.

    Returns:
        Tuple of the restored model (moved to ``device``) and an AdamW
        optimizer with its state restored from the checkpoint.

    Raises:
        ModelSaveError: If the checkpoint is missing, has no config, or
            cannot be loaded.
    """
    try:
        if device is None:
            device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        if not Path(load_path).exists():
            raise ModelSaveError(f"Checkpoint not found at {load_path}")

        # NOTE(review): the checkpoint stores optimizer state and a plain
        # config dict, so weights-only loading is not applicable here — only
        # load checkpoints from trusted sources (torch.load unpickles).
        checkpoint = torch.load(load_path, map_location=device)

        config_dict = checkpoint.get("config", {})
        if not config_dict:
            raise ModelSaveError("No config found in checkpoint")

        # Keep only keys DiffusionConfig.__init__ actually accepts.
        # inspect.signature yields parameters only, unlike __code__.co_varnames
        # which also lists the function's local variables and could let
        # unexpected keys through.
        expected_keys = set(inspect.signature(DiffusionConfig.__init__).parameters)
        filtered_config_dict = {
            k: v for k, v in config_dict.items() if k in expected_keys
        }
        config = DiffusionConfig(**filtered_config_dict)

        model = DiffusionLLM(config)
        model.load_state_dict(checkpoint["model_state_dict"])
        model.to(device)

        # Fresh optimizer; hyperparameters are restored via its state dict.
        optimizer = torch.optim.AdamW(model.parameters())
        optimizer.load_state_dict(checkpoint["optimizer_state_dict"])

        return model, optimizer

    except ModelSaveError:
        # Already a domain error (missing file / missing config): re-raise
        # as-is instead of double-wrapping the message.
        raise
    except Exception as e:
        logger.exception("Failed to load model")
        raise ModelSaveError(f"Failed to load model: {str(e)}") from e