From 15183a9695b10d2c67454f46aa1651b0f335c05e Mon Sep 17 00:00:00 2001 From: Philip Wiese Date: Fri, 12 Dec 2025 15:49:22 +0100 Subject: [PATCH 1/9] Improve _ReferenceBuffer debugging --- Deeploy/DeeployTypes.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/Deeploy/DeeployTypes.py b/Deeploy/DeeployTypes.py index d9d768fabc..8022d8ac93 100644 --- a/Deeploy/DeeployTypes.py +++ b/Deeploy/DeeployTypes.py @@ -504,6 +504,15 @@ def _bufferRepresentation(self) -> Dict: repr['offset'] = self._offset return repr + def __str__(self) -> str: + if hasattr(self, "_type"): + return f'VariableBuffer: name: {self.name}, type: {self._type}, reference: {self._referenceName}+{self._offset}' + + return f'VariableBuffer: name: {self.name}, reference: {self._referenceName}+{self._offset}' + + def __repr__(self) -> str: + return self.__str__() + class NetworkContext(): """The global context of the compiler. This object holds all the typing inferred in the type-checking passes within the respective buffers. It holds all hoisted transient buffers, struct buffers, and global definitions. The context is the source of truth for all code generation in the backend. 
From c8cb1cdac8f9317b7fea4ee311c689e456f6fdb0 Mon Sep 17 00:00:00 2001 From: Philip Wiese Date: Fri, 12 Dec 2025 15:49:01 +0100 Subject: [PATCH 2/9] Add length check for offset --- .../CodeTransformationPasses/MemoryAllocation.py | 12 ++++++------ Deeploy/DeeployTypes.py | 6 ++++++ .../NetworkDeployers/MemoryLevelDeployer.py | 2 +- 3 files changed, 13 insertions(+), 7 deletions(-) diff --git a/Deeploy/CommonExtensions/CodeTransformationPasses/MemoryAllocation.py b/Deeploy/CommonExtensions/CodeTransformationPasses/MemoryAllocation.py index f10d333502..163e99c086 100644 --- a/Deeploy/CommonExtensions/CodeTransformationPasses/MemoryAllocation.py +++ b/Deeploy/CommonExtensions/CodeTransformationPasses/MemoryAllocation.py @@ -125,9 +125,9 @@ def apply(self, memoryLevel = "None" if not hasattr(buffer, "_memoryLevel") else buffer._memoryLevel if memoryLevel not in ctxt._dynamicSize: - ctxt._dynamicSize[memoryLevel] = int(buffer.sizeInBytes()) + ctxt._dynamicSize[memoryLevel] = int(buffer.sizeInBytes) else: - ctxt._dynamicSize[memoryLevel] += int(buffer.sizeInBytes()) + ctxt._dynamicSize[memoryLevel] += int(buffer.sizeInBytes) executionBlock.addLeft(buffer.allocTemplate, buffer._bufferRepresentation()) @@ -146,7 +146,7 @@ def apply(self, if memoryLevel not in ctxt._dynamicSize: ctxt._dynamicSize[memoryLevel] = 0 else: - ctxt._dynamicSize[memoryLevel] -= int(buffer.sizeInBytes()) + ctxt._dynamicSize[memoryLevel] -= int(buffer.sizeInBytes) executionBlock.addRight(buffer.deallocTemplate, buffer._bufferRepresentation()) return ctxt, executionBlock @@ -178,9 +178,9 @@ def apply(self, memoryLevel = "None" if not hasattr(buffer, "_memoryLevel") else buffer._memoryLevel if memoryLevel not in ctxt._dynamicSize: - ctxt._dynamicSize[memoryLevel] = int(buffer.sizeInBytes()) + ctxt._dynamicSize[memoryLevel] = int(buffer.sizeInBytes) else: - ctxt._dynamicSize[memoryLevel] += int(buffer.sizeInBytes()) + ctxt._dynamicSize[memoryLevel] += int(buffer.sizeInBytes) buffer._live = True @@ 
-197,7 +197,7 @@ def apply(self, if memoryLevel not in ctxt._dynamicSize: ctxt._dynamicSize[memoryLevel] = 0 else: - ctxt._dynamicSize[memoryLevel] -= int(buffer.sizeInBytes()) + ctxt._dynamicSize[memoryLevel] -= int(buffer.sizeInBytes) buffer._live = False diff --git a/Deeploy/DeeployTypes.py b/Deeploy/DeeployTypes.py index 8022d8ac93..4d643d03be 100644 --- a/Deeploy/DeeployTypes.py +++ b/Deeploy/DeeployTypes.py @@ -348,6 +348,7 @@ def has_live_aliases(self, ctxt: NetworkContext) -> bool: queue |= buffNext.aliases - visited return live + @property def sizeInBytes(self) -> int: """Returns the size of this VariableBuffer in bytes @@ -386,6 +387,11 @@ def __str__(self) -> str: def __repr__(self) -> str: return f'TransientBuffer: name: {self.name}, size: {self.size}' + @classmethod + def fromVariableBuffer(cls, buffer: VariableBuffer): + ret = cls(name = buffer.name, size = buffer.sizeInBytes) + + @property def sizeInBytes(self) -> int: return int(self.size) diff --git a/Deeploy/MemoryLevelExtension/NetworkDeployers/MemoryLevelDeployer.py b/Deeploy/MemoryLevelExtension/NetworkDeployers/MemoryLevelDeployer.py index 2599f9e819..3e0e5659ce 100644 --- a/Deeploy/MemoryLevelExtension/NetworkDeployers/MemoryLevelDeployer.py +++ b/Deeploy/MemoryLevelExtension/NetworkDeployers/MemoryLevelDeployer.py @@ -89,7 +89,7 @@ def _printMemorySummary(self): # We do not count structs for now, since they are not properly modeled if isinstance(_buffer, ConstantBuffer) and getattr(_buffer, "_deploy", False): if (hasattr(_buffer, "_memoryLevel") and _buffer._memoryLevel == level) or level in ("None", None): - staticSize += _buffer.sizeInBytes() + staticSize += _buffer.sizeInBytes total = staticSize + dynamicSize memLevels = self.Platform.memoryHierarchy.memoryLevels From 0c973b56645ef632f8c60e9132277443a374b55f Mon Sep 17 00:00:00 2001 From: Philip Wiese Date: Thu, 5 Feb 2026 12:56:50 +0100 Subject: [PATCH 3/9] Add Many Docsstrings --- .../CodeTransformationPasses/Closure.py | 250 +++++ 
.../CycleMeasurement.py | 55 ++ .../IntrospectiveCodeTransformation.py | 157 ++- .../MemoryAllocation.py | 171 ++++ .../CodeTransformationPasses/PrintInputs.py | 487 ++++++++++ Deeploy/CommonExtensions/DataTypes.py | 36 + .../OptimizationPasses/Matchers.py | 433 +++++++++ Deeploy/DeeployTypes.py | 15 +- Deeploy/TilingExtension/TilerExtension.py | 904 +++++++++++++++++- Deeploy/TilingExtension/TilingCodegen.py | 585 ++++++++++++ 10 files changed, 3082 insertions(+), 11 deletions(-) diff --git a/Deeploy/CommonExtensions/CodeTransformationPasses/Closure.py b/Deeploy/CommonExtensions/CodeTransformationPasses/Closure.py index 70a91fd0ce..a7579b85a0 100644 --- a/Deeploy/CommonExtensions/CodeTransformationPasses/Closure.py +++ b/Deeploy/CommonExtensions/CodeTransformationPasses/Closure.py @@ -53,19 +53,76 @@ class ClosureExecutionBlock(ExecutionBlock): + """ + Execution block wrapper for closure-based code generation. + + This class extends ExecutionBlock to support closure-based code generation + patterns, where functions are wrapped in closures with argument structures. + It maintains a reference to the base execution block that contains the + actual code to be wrapped. + + Notes + ----- + This class is used in the closure generation process to maintain the + relationship between the closure wrapper and the original execution block. + """ def __init__(self, nodeTemplate = None, closureBlock: Optional[ExecutionBlock] = None): + """ + Initialize a ClosureExecutionBlock. + + Parameters + ---------- + nodeTemplate : NodeTemplate, optional + The node template for this execution block. Default is None. + closureBlock : ExecutionBlock, optional + The execution block to be wrapped in a closure. Default is None. + """ super().__init__(nodeTemplate) self.closureBlock = closureBlock @property def baseBlock(self): + """ + Get the base execution block, unwrapping nested closures. 
+ + Recursively unwraps ClosureExecutionBlock instances to find the + underlying base execution block that contains the actual code. + + Returns + ------- + ExecutionBlock + The base execution block without closure wrappers. + + Notes + ----- + This property handles nested closures by recursively calling + baseBlock until a non-ClosureExecutionBlock is found. + """ if isinstance(self.closureBlock, ClosureExecutionBlock): return self.closureBlock.baseBlock return self.closureBlock class ClosureGeneration(CodeTransformationPass, IntrospectiveCodeTransformationMixIn): + """ + Code transformation pass for generating function closures. + + This class transforms execution blocks into closure-based code patterns + where functions are wrapped with argument structures. It generates the + necessary struct definitions, closure functions, and call sites to + enable closure-based execution patterns in generated code. + + + Notes + ----- + The closure generation process involves: + 1. Analyzing the execution block to identify dynamic references + 2. Creating a struct type to hold closure arguments + 3. Generating the closure function definition + 4. Replacing the original call with a closure call + 5. Optionally generating argument writeback code + """ closureStructArgType: Dict[str, Type[Union[Pointer, Immediate, Struct]]] closureStructArgs: Dict[str, Union[Pointer, Immediate, Struct]] @@ -75,6 +132,22 @@ def __init__(self, closureSuffix = "_closure", writeback: bool = True, generateStruct: bool = True): + """ + Initialize the ClosureGeneration transformation pass. + + Parameters + ---------- + closureCallTemplate : NodeTemplate, optional + Template for generating closure function calls. Default is the + global _closureCallTemplate. + closureSuffix : str, optional + Suffix to append to closure function names. Default is "_closure". + writeback : bool, optional + Whether to generate writeback code for closure arguments. + Default is True. 
+ generateStruct : bool, optional + Whether to generate argument structure definitions. Default is True. + """ super().__init__() self.closureSuffix = closureSuffix self.closureTemplate = _closureTemplate @@ -86,6 +159,31 @@ def __init__(self, # Don't override this def _generateClosureStruct(self, ctxt: NetworkContext, executionBlock: ExecutionBlock): + """ + Generate the closure argument structure. + + Analyzes the execution block to identify dynamic references and creates + a struct type to hold all closure arguments. This struct will be used + to pass arguments to the closure function. + + Parameters + ---------- + ctxt : NetworkContext + The network context containing buffer information. + executionBlock : ExecutionBlock + The execution block to analyze for dynamic references. + + Notes + ----- + This method populates the following instance attributes: + - closureStructArgType: The struct class type for closure arguments + - closureStructArgs: The struct instance with argument mappings + + The method handles different buffer types: + - TransientBuffer: Mapped to void pointers + - StructBuffer: Excluded from closure arguments + - Other buffers: Use their native types + """ # Add closure struct info to operatorRepresentation closureStructArgsType: Dict[str, Type[Union[Pointer, Immediate, Struct]]] = {} @@ -108,6 +206,31 @@ def _generateClosureStruct(self, ctxt: NetworkContext, executionBlock: Execution # Don't override this def _generateClosureCtxt(self, ctxt: NetworkContext, nodeName: str) -> NetworkContext: + """ + Generate closure context and global definitions. + + Creates the closure function definition and struct type definition, + then hoists them to the global scope. This includes generating + the actual closure function code and the argument struct typedef. + + Parameters + ---------- + ctxt : NetworkContext + The network context to modify with global definitions. + nodeName : str + The name of the node for tracking dependencies. 
+ + Returns + ------- + NetworkContext + The modified network context with closure definitions added. + + Notes + ----- + This method generates and hoists the following global definitions: + - Closure argument struct typedef + - Closure function definition with argument casting and optional writeback + """ ret = ctxt.hoistStruct(self.closureStructArgs, self.closureName + "_args", self.closureStructArgType) ctxt.lookup(ret)._users.append(nodeName) @@ -133,6 +256,36 @@ def _generateClosureCtxt(self, ctxt: NetworkContext, nodeName: str) -> NetworkCo # Don't override this def _generateClosureCall(self, ctxt: NetworkContext, executionBlock: ExecutionBlock, nodeName: str) -> Tuple[NetworkContext, ExecutionBlock]: + """ + Generate the closure call and replace the original execution block. + + Creates a new ClosureExecutionBlock that wraps the original execution + with closure call code. This includes the closure function call and + optional argument writeback code. + + Parameters + ---------- + ctxt : NetworkContext + The network context for code generation. + executionBlock : ExecutionBlock + The original execution block to wrap with closure calls. + nodeName : str + The name of the node for struct generation. + + Returns + ------- + Tuple[NetworkContext, ExecutionBlock] + A tuple containing: + - The modified network context + - The new ClosureExecutionBlock with closure calls + + Notes + ----- + This method replaces the original function call with: + 1. A closure function call (added to the left) + 2. Optional argument writeback code (added to the right if enabled) + 3. Optional argument struct generation + """ allArgs = { "closureName": self.closureName, @@ -158,6 +311,41 @@ def apply(self, executionBlock: ExecutionBlock, name: str, verbose: CodeGenVerbosity = _NoVerbosity) -> Tuple[NetworkContext, ExecutionBlock]: + """ + Apply the closure generation transformation. 
+ + Transforms the given execution block into a closure-based pattern + by generating the necessary struct, closure function, and call site. + This is the main entry point for the closure transformation. + + Parameters + ---------- + ctxt : NetworkContext + The network context containing buffer and type information. + executionBlock : ExecutionBlock + The execution block to transform into a closure pattern. + name : str + The base name for generating closure-related identifiers. + verbose : CodeGenVerbosity, optional + The verbosity level for code generation. Default is _NoVerbosity. + + Returns + ------- + Tuple[NetworkContext, ExecutionBlock] + A tuple containing: + - The modified network context with closure definitions + - The new ClosureExecutionBlock with closure call patterns + + Notes + ----- + The transformation process includes: + 1. Generating a unique closure name with the specified suffix + 2. Capturing the original function call code + 3. Creating the closure argument struct + 4. Generating the closure function definition in global scope + 5. Replacing the original call with a closure call pattern + """ + # Prepend underscore to avoid name issues when beginning with problematic characters (like numbers) self.closureName = "_" + name + self.closureSuffix self.functionCall = executionBlock.generate(ctxt) @@ -168,6 +356,23 @@ def apply(self, class MemoryAwareClosureGeneration(ClosureGeneration): + """ + Memory-aware closure generation for multi-level memory hierarchies. + + This class extends ClosureGeneration to handle memory-aware closure + generation where only certain memory levels are included in the closure + arguments. It filters buffers based on their memory level, including + only those that belong to specific memory regions in the hierarchy. + + Notes + ----- + This class is useful for multi-level memory systems where different + memory levels have different access patterns and only certain levels + should be passed as closure arguments. 
Buffers are included if they: + - Have no memory level annotation + - Belong to the start region + - Do not belong to the end region (are in higher levels) + """ def __init__(self, closureCallTemplate: NodeTemplate = _closureCallTemplate, @@ -176,12 +381,57 @@ def __init__(self, generateStruct: bool = True, startRegion: str = "L2", endRegion: str = "L1"): + """ + Initialize the MemoryAwareClosureGeneration transformation pass. + + Parameters + ---------- + closureCallTemplate : NodeTemplate, optional + Template for generating closure function calls. Default is the + global _closureCallTemplate. + closureSuffix : str, optional + Suffix to append to closure function names. Default is "_closure". + writeback : bool, optional + Whether to generate writeback code for closure arguments. + Default is True. + generateStruct : bool, optional + Whether to generate argument structure definitions. Default is True. + startRegion : str, optional + The starting memory region to include in closures. Default is "L2". + endRegion : str, optional + The ending memory region to include in closures. Default is "L1". + """ super().__init__(closureCallTemplate, closureSuffix, writeback, generateStruct) self.startRegion = startRegion self.endRegion = endRegion # Don't override this def _generateClosureStruct(self, ctxt: NetworkContext, executionBlock: ExecutionBlock): + """ + Generate memory-aware closure argument structure. + + Overrides the base class method to implement memory-level filtering. + Only includes buffers that belong to appropriate memory levels based + on the configured start and end regions. + + Parameters + ---------- + ctxt : NetworkContext + The network context containing buffer information. + executionBlock : ExecutionBlock + The execution block to analyze for dynamic references. 
+ + Notes + ----- + This method filters dynamic references based on memory levels: + - Includes buffers with no memory level annotation + - Includes buffers from the start region + - Includes buffers not from the end region (higher memory levels) + + The filtering logic ensures that only relevant buffers are passed + as closure arguments, reducing memory transfer overhead in + multi-level memory hierarchies. + """ # Add closure struct info to operatorRepresentation closureStructArgsType = {} diff --git a/Deeploy/CommonExtensions/CodeTransformationPasses/CycleMeasurement.py b/Deeploy/CommonExtensions/CodeTransformationPasses/CycleMeasurement.py index 42f5d57b1a..cabef767dd 100644 --- a/Deeploy/CommonExtensions/CodeTransformationPasses/CycleMeasurement.py +++ b/Deeploy/CommonExtensions/CodeTransformationPasses/CycleMeasurement.py @@ -9,12 +9,67 @@ class ProfilingCodeGeneration(CodeTransformationPass): + """ + Code transformation pass for inserting cycle measurement profiling code. + + This class extends CodeTransformationPass to automatically insert profiling + code around execution blocks. It adds cycle counting instrumentation before + and after the target code, enabling performance measurement and analysis + of individual operations during runtime. + + The generated profiling code uses a `getCycles()` function to measure + execution time and prints the results to stdout. This is useful for + performance analysis, optimization, and debugging of neural network + operations. + + Notes + ----- + This transformation requires that the target platform provides a + `getCycles()` function that returns the current cycle count as a uint32_t. + The transformation also assumes printf functionality is available for + output formatting. + + The profiling code is non-intrusive and can be easily enabled or disabled + by including or excluding this transformation pass from the compilation + pipeline. 
+ """ def apply(self, ctxt: NetworkContext, executionBlock: ExecutionBlock, name: str, verbose: CodeGenVerbosity = _NoVerbosity) -> Tuple[NetworkContext, ExecutionBlock]: + """ + Apply cycle measurement profiling to an execution block. + + Wraps the given execution block with cycle counting code that measures + and reports the execution time. The profiling code is added before + (left) and after (right) the original execution block. + + Parameters + ---------- + ctxt : NetworkContext + The network context for code generation. This parameter is passed + through unchanged as cycle measurement doesn't modify the context. + executionBlock : ExecutionBlock + The execution block to instrument with cycle measurement code. + The original block remains unchanged, with profiling code added + around it. + name : str + The name of the operation being profiled. This name is used to + generate unique variable names and is included in the output + message for identification. + verbose : CodeGenVerbosity, optional + The verbosity level for code generation. Default is _NoVerbosity. + This parameter is not used by the cycle measurement transformation. 
+ + Returns + ------- + Tuple[NetworkContext, ExecutionBlock] + A tuple containing: + - The unchanged network context + - The modified execution block with profiling code added + """ executionBlock.addLeft(NodeTemplate(""" uint32_t ${op}_cycles = getCycles(); """), {"op": name}) diff --git a/Deeploy/CommonExtensions/CodeTransformationPasses/IntrospectiveCodeTransformation.py b/Deeploy/CommonExtensions/CodeTransformationPasses/IntrospectiveCodeTransformation.py index 7e682b2644..f8ca7b4049 100644 --- a/Deeploy/CommonExtensions/CodeTransformationPasses/IntrospectiveCodeTransformation.py +++ b/Deeploy/CommonExtensions/CodeTransformationPasses/IntrospectiveCodeTransformation.py @@ -17,15 +17,62 @@ class IntrospectiveCodeTransformationMixIn(): + """A mix-in class providing introspective code transformation capabilities for template-based code generation. + + This class enables analysis and manipulation of template code by parsing it into an abstract syntax tree (AST), + allowing for dynamic transformations such as variable indexing, dereferencing, and extraction of dynamic references. + It is designed to work with template objects and their parse trees, supporting advanced code introspection and + modification tasks commonly required in code generation frameworks. + + Key Features + ------------ + - Parse template source code into a tree structure for introspection. + - Programmatically index or dereference variables within templates. + - Extract dynamic references (e.g., buffers, tensors) used in code blocks. + - Support for unrolling struct references and distinguishing between local/global context. + - Efficient caching of parse trees for repeated template analysis. + + Intended Usage + -------------- + This mix-in is intended to be used with classes that manage code templates, enabling them to inspect and transform + template code at runtime. 
It is particularly useful in scenarios where code generation must adapt dynamically to + context or user input, such as in neural network frameworks or domain-specific languages. + """ parseTreeDict: Dict[int, TemplateNode] = {} @staticmethod def _generateParseTree(template: Template) -> TemplateNode: + """Generate the parse tree for the given template. + + Parameters + ---------- + template : Template + The template to parse. + + Returns + ------- + TemplateNode + The root node of the parse tree. + """ return Lexer(template._source).parse() @staticmethod def _reconstructCode(template: Template, node: TemplateNode) -> Template: + """Reconstruct the template from the parse tree. + + Parameters + ---------- + template : Template + The template to modify. + node : TemplateNode + The parse tree node to use. + + Returns + ------- + Template + The modified template. + """ lexer = Lexer(template._source) source = codegen.compile( node, @@ -43,6 +90,8 @@ def _reconstructCode(template: Template, node: TemplateNode) -> Template: ) module = types.ModuleType(template.module_id) code = compile(source, template.module_id, "exec") + + # Execute the compiled code in the module's namespace exec(code, module.__dict__, module.__dict__) template._code = code @@ -52,6 +101,22 @@ def _reconstructCode(template: Template, node: TemplateNode) -> Template: @staticmethod def _indexPointer(parseTree: TemplateNode, ptrName: str, index: str) -> TemplateNode: + """Index a pointer in the parse tree. + + Parameters + ---------- + parseTree : TemplateNode + The parse tree to modify. + ptrName : str + The name of the pointer to index. + index : str + The index to use. + + Returns + ------- + TemplateNode + The modified parse tree. 
+ """ indexes = [i for i, node in enumerate(parseTree.nodes) if isinstance(node, Expression) and node.text == ptrName] for offset, idx in enumerate(indexes): @@ -66,6 +131,19 @@ def _indexPointer(parseTree: TemplateNode, ptrName: str, index: str) -> Template @staticmethod def indexVars(template: Template, varNames: List[str], index: str) -> None: + """Index the specified variables in the given template. + + Modifies the template in place by indexing the specified variable names. + + Parameters + ---------- + template : Template + The template to modify. + varNames : List[str] + The variable names to index. + index : str + The index to use. + """ if len(varNames) == 0: return parseTree = IntrospectiveCodeTransformationMixIn._generateParseTree(template) @@ -75,6 +153,20 @@ def indexVars(template: Template, varNames: List[str], index: str) -> None: @staticmethod def _dereferencePointer(parseTree: TemplateNode, ptrName: str) -> TemplateNode: + """Dereference a pointer in the parse tree. + + Parameters + ---------- + parseTree : TemplateNode + The parse tree to modify. + ptrName : str + The name of the pointer to dereference. + + Returns + ------- + TemplateNode + The modified parse tree with dereferenced pointers. + """ indexes = [i for i, node in enumerate(parseTree.nodes) if isinstance(node, Expression) and node.text == ptrName] for offset, idx in enumerate(indexes): @@ -85,6 +177,18 @@ def _dereferencePointer(parseTree: TemplateNode, ptrName: str) -> TemplateNode: @staticmethod def dereferenceVars(template: Template, varNames: List[str]) -> None: + """Dereference the specified variables in the given template. + + This function modifies the provided template in place by dereferencing + the variables listed in `varNames`. The template is modified in place. + + Parameters + ---------- + template : Template + The template object to be modified. + varNames : list of str + List of variable names to dereference within the template. 
+ """ if len(varNames) == 0: return parseTree = IntrospectiveCodeTransformationMixIn._generateParseTree(template) @@ -97,6 +201,24 @@ def extractDynamicReferences(self, executionBlock: ExecutionBlock = None, unrollStructs = False, includeGlobalReferences = False): + """Extract all dynamic references from the given execution block. + + Parameters + ---------- + ctxt : NetworkContext + The network context. + executionBlock : ExecutionBlock, optional + The execution block. + unrollStructs : bool, optional + Whether to unroll structs. + includeGlobalReferences : bool, optional + Whether to include global references. + + Returns + ------- + List[str] + A list of dynamic references. + """ makoDynamicReferences = [] for codeSnippet in executionBlock.codeSnippets: @@ -113,7 +235,20 @@ def extractDynamicReferences(self, @staticmethod def _fixCtxtOrdering(ctxt: NetworkContext, nameList: List[str]) -> List[str]: - + """Fix the ordering of context names based on their appearance in the context. + + Parameters + ---------- + ctxt : NetworkContext + The network context. + nameList : List[str] + The list of context names to order. + + Returns + ------- + List[str] + The ordered list of context names. + """ orderList = [*ctxt.globalObjects.keys(), *ctxt.localObjects.keys()] _nameList = sorted(nameList.copy(), key = lambda key: orderList.index(key)) @@ -125,6 +260,26 @@ def _extractDynamicExpressions(self, template: Template, unrollStructs = False, includeGlobalReferences = False): + """Extract dynamic expressions from the given template. + + Parameters + ---------- + ctxt : NetworkContext + The network context. + operatorRepresentation : OperatorRepresentation + The operator representation mapping expressions to their representations. + template : Template + The template to extract expressions from. + unrollStructs : bool, optional + Whether to recursively unroll struct references. Defaults to False. 
+ includeGlobalReferences : bool, optional + Whether to include global references in the result. Defaults to False. + + Returns + ------- + List[str] + A list of dynamic expressions, including local (and optionally global) references. + """ codeHash = hash(template._source) if codeHash in self.parseTreeDict.keys(): diff --git a/Deeploy/CommonExtensions/CodeTransformationPasses/MemoryAllocation.py b/Deeploy/CommonExtensions/CodeTransformationPasses/MemoryAllocation.py index 163e99c086..609a179c7b 100644 --- a/Deeploy/CommonExtensions/CodeTransformationPasses/MemoryAllocation.py +++ b/Deeploy/CommonExtensions/CodeTransformationPasses/MemoryAllocation.py @@ -13,8 +13,18 @@ class _ArgStructAllocateTemplate(NodeTemplate): + """Template for allocating an argument struct.""" def __init__(self, templateStr: str, bufferName: str): + """Initialize the argument struct allocation template. + + Parameters + ---------- + templateStr : str + The template string. + bufferName : str + The name of the buffer. + """ super().__init__(templateStr) self.bufferName = bufferName @@ -25,6 +35,7 @@ def __init__(self, templateStr: str, bufferName: str): class ArgumentStructGeneration(CodeTransformationPass, IntrospectiveCodeTransformationMixIn): + """A code transformation pass that generates a struct for function arguments.""" def __init__(self): super().__init__() @@ -34,6 +45,27 @@ def apply(self, executionBlock: ExecutionBlock, name: str, verbose: CodeGenVerbosity = _NoVerbosity) -> Tuple[NetworkContext, ExecutionBlock]: + """Apply the argument struct generation transformation. + + This transformation generates a struct for the function arguments. It allocates + memory for the struct and initializes its fields. + + Parameters + ---------- + ctxt : NetworkContext + The network context. + executionBlock : ExecutionBlock + The execution block. + name : str + The name of the argument. + verbose : CodeGenVerbosity, optional + The verbosity level for code generation. 
+ + Returns + ------- + Tuple[NetworkContext, ExecutionBlock] + The transformed network context and execution block. + """ references = self.extractDynamicReferences(ctxt, executionBlock, True) buffers = [ctxt.lookup(key) for key in references] @@ -55,8 +87,19 @@ def apply(self, class MemoryManagementGeneration(CodeTransformationPass, IntrospectiveCodeTransformationMixIn): + """A code transformation pass that manages memory allocation and deallocation for buffers. + + This pass is responsible for ensuring that memory is allocated for buffers when they are needed and deallocated when + they are no longer in use.""" def __init__(self, memoryLevelRegex: Optional[str] = None): + """Initialize the memory management generation pass. + + Parameters + ---------- + memoryLevelRegex : str, optional + A regular expression to match memory levels. + """ super().__init__() if memoryLevelRegex is not None: self.regex = re.compile(memoryLevelRegex) @@ -64,6 +107,18 @@ def __init__(self, memoryLevelRegex: Optional[str] = None): self.regex = None def is_memory_level(self, buffer: VariableBuffer) -> bool: + """Check if the given buffer is a memory level buffer. + + Parameters + ---------- + buffer : VariableBuffer + The buffer to check. + + Returns + ------- + bool + True if the buffer is a memory level buffer, False otherwise. + """ if self.regex is None: return not hasattr(buffer, "_memoryLevel") else: @@ -71,19 +126,76 @@ def is_memory_level(self, buffer: VariableBuffer) -> bool: @staticmethod def is_final_input(buffer: VariableBuffer, nodeName: str) -> bool: + """Check if the given buffer is a final input buffer. + + Parameters + ---------- + buffer : VariableBuffer + The buffer to check. + nodeName : str + The name of the node to check against. + + Returns + ------- + bool + True if the buffer is a final input buffer, False otherwise. 
+ """ return not isinstance(buffer, (StructBuffer, TransientBuffer)) and \ len(buffer._users) > 0 and nodeName == buffer._users[-1] @staticmethod def is_output(buffer: VariableBuffer, nodeName: str) -> bool: + """Check if the given buffer is an output buffer. + + """ return not isinstance(buffer, (StructBuffer, TransientBuffer)) and nodeName not in buffer._users @staticmethod def is_transient(buffer: VariableBuffer, nodeName: str) -> bool: + """Check if the given buffer is a transient buffer. + + Parameters + ---------- + buffer : VariableBuffer + The buffer to check. + nodeName : str + The name of the node to check against. + + Returns + ------- + bool + True if the buffer is a transient buffer, False otherwise. + """ return isinstance(buffer, TransientBuffer) and nodeName in buffer._users @staticmethod def topologicallySortBuffers(buffers: List[VariableBuffer]) -> List[VariableBuffer]: + """ + Topologically sorts a list of VariableBuffer objects based on their reference dependencies. + + This method iteratively identifies buffers that are not referenced by any other buffer in the list, + adding them to the sorted result. Buffers that reference others (via _ReferenceBuffer and _referenceName) + are deferred until their dependencies are resolved. The process continues until all buffers are sorted, + or a circular reference is detected (which raises an assertion error). + + The first buffers in the sorted list are those that do not have any dependencies, while the last buffers + are those that are only referenced by others. + + Raises + ------ + AssertionError + If a circular reference is detected among the buffers, preventing a valid topological sort. + + Parameters + ---------- + buffers : List[VariableBuffer] + The list of buffers to sort. + + Returns + ------- + List[VariableBuffer] + The topologically sorted list of buffers. 
+ """ sortedBuffers = [] unsortedBufferNames = [buff.name for buff in buffers] lastLen = len(unsortedBufferNames) @@ -107,6 +219,30 @@ def apply(self, executionBlock: ExecutionBlock, name: str, verbose: CodeGenVerbosity = _NoVerbosity) -> Tuple[NetworkContext, ExecutionBlock]: + """Apply the memory management generation transformation. + + This function is responsible for analyzing the memory usage of the given execution block + and generating the necessary memory allocation and deallocation commands. It also takes care + of managing the lifetimes of the buffers involved and ensuring that they are properly released + when no longer needed. + + Parameters + ---------- + ctxt : NetworkContext + The network context to use. + executionBlock : ExecutionBlock + The execution block to analyze. + name : str + The name of the node to check against. + verbose : CodeGenVerbosity, optional + The verbosity level for code generation. + + Returns + ------- + Tuple[NetworkContext, ExecutionBlock] + The updated network context and execution block. + """ + references = self.extractDynamicReferences(ctxt, executionBlock, unrollStructs = True, @@ -119,6 +255,7 @@ def apply(self, inputs = [buff for buff in memoryLevelBuffers if self.is_final_input(buff, name)] # We have to allocate the output buffers, unless they are global + # Topological sorting is necessary to ensure that we allocate reference buffers before their dependents for buffer in reversed(self.topologicallySortBuffers(outputs + transients)): assert buffer._live == False, f"Tried to allocate already live buffer {buffer.name}" buffer._live = True @@ -153,8 +290,21 @@ def apply(self, class MemoryPassthroughGeneration(MemoryManagementGeneration): + """A code transformation pass that implements a 'passthrough' memory management strategy. + + In the context of code generation and memory management, 'passthrough' means that this pass does not + perform any actual allocation or deallocation of memory buffers. 
Instead, it simply marks buffers as + live or dead based on their usage, without modifying the underlying memory state and eventually generating + code that reflects these changes. + """ def __init__(self, memoryHierarchyRegex: Optional[str] = None): + """Initialize the memory management passthrough pass. + + Args: + memoryHierarchyRegex (Optional[str], optional): A regex pattern to match memory hierarchy. + Defaults to None. + """ super().__init__(memoryHierarchyRegex) def apply(self, @@ -162,6 +312,27 @@ def apply(self, executionBlock: ExecutionBlock, name: str, verbose: CodeGenVerbosity = _NoVerbosity) -> Tuple[NetworkContext, ExecutionBlock]: + """Apply the memory management passthrough transformation. + + This function marks buffers as live or dead based on their usage, without performing any actual + memory allocation or deallocation. + + Parameters + ---------- + ctxt : NetworkContext + The network context. + executionBlock : ExecutionBlock + The execution block. + name : str + The name of the buffer. + verbose : CodeGenVerbosity, optional + The verbosity level for code generation. Defaults to _NoVerbosity. + + Returns + ------- + Tuple[NetworkContext, ExecutionBlock] + The updated network context and execution block. + """ references = self.extractDynamicReferences(ctxt, executionBlock, unrollStructs = True, diff --git a/Deeploy/CommonExtensions/CodeTransformationPasses/PrintInputs.py b/Deeploy/CommonExtensions/CodeTransformationPasses/PrintInputs.py index 300c5d2ad9..32d249093b 100644 --- a/Deeploy/CommonExtensions/CodeTransformationPasses/PrintInputs.py +++ b/Deeploy/CommonExtensions/CodeTransformationPasses/PrintInputs.py @@ -37,8 +37,62 @@ class PrintInputGeneration(CodeTransformationPass, IntrospectiveCodeTransformationMixIn): + """ + Code transformation pass for generating debug print statements for input tensors. 
+ + This class extends CodeTransformationPass to automatically insert debug + printing code that displays the contents of input tensors before operation + execution. It's useful for debugging, verification, and analysis of neural + network operations by showing the actual data values being processed. + + The generated print statements include tensor metadata (name, type, shape, + memory address) and formatted tensor contents with proper indexing for + multi-dimensional arrays. + + Notes + ----- + This transformation only processes tensors that are actual inputs to the + operation (not transient, constant, or struct buffers) and that have the + operation in their user list. The printing is added before the operation + execution. + + The generated C code uses nested loops to iterate through all tensor + dimensions and prints values with appropriate formatting based on the + data type (integer vs floating-point). + """ def _getRepDict(self, ctxt: NetworkContext, ref: str, name: str): + """ + Create representation dictionary for a tensor reference. + + Analyzes a tensor reference to determine if it should be printed and + creates the necessary template variables for code generation. + + Parameters + ---------- + ctxt : NetworkContext + The network context containing buffer information. + ref : str + The reference name of the tensor to analyze. + name : str + The name of the operation for filtering tensors. + + Returns + ------- + dict or None + A dictionary containing template variables if the tensor should + be printed, None otherwise. 
The dictionary includes: + - bufferName: The name of the buffer reference + - bufferType: The data type of the buffer + - bufferShape: The shape/dimensions of the buffer + - nodeName: The operation name + + Notes + ----- + Returns None for: + - TransientBuffer, ConstantBuffer, or StructBuffer instances + - Tensors that don't have the operation in their user list + """ _buf = ctxt.lookup(ref) refbuf = _buf @@ -58,6 +112,39 @@ def apply(self, executionBlock: ExecutionBlock, name: str, verbose: CodeGenVerbosity = _NoVerbosity) -> Tuple[NetworkContext, ExecutionBlock]: + """ + Apply input tensor printing transformation to an execution block. + + Analyzes all dynamic references in the execution block and adds debug + print statements for input tensors before the operation execution. + + Parameters + ---------- + ctxt : NetworkContext + The network context containing buffer and type information. + executionBlock : ExecutionBlock + The execution block to instrument with input printing code. + name : str + The name of the operation being instrumented, used for filtering + which tensors are considered inputs. + verbose : CodeGenVerbosity, optional + The verbosity level for code generation. Default is _NoVerbosity. + + Returns + ------- + Tuple[NetworkContext, ExecutionBlock] + A tuple containing: + - The unchanged network context + - The modified execution block with input print statements added + + Notes + ----- + The transformation: + 1. Extracts all dynamic references from the execution block + 2. Filters for tensors that are inputs to this operation + 3. Adds debug print statements before the operation execution + 4. Generates formatted output showing tensor metadata and contents + """ references = self.extractDynamicReferences(ctxt, executionBlock, @@ -73,8 +160,48 @@ def apply(self, class MemoryAwareGeneration(): + """ + Base class for memory-aware debug printing transformations. 
+ + This class provides memory hierarchy filtering functionality for debug + printing transformations. It allows selective printing of tensors based + on their memory level assignments, enabling focused debugging of specific + memory regions in multi-level memory architectures. + + Parameters + ---------- + memoryHierarchyRegex : str, optional + A regular expression pattern to match against buffer memory levels. + If None, only buffers without memory level annotations are included. + + Attributes + ---------- + regex : re.Pattern or None + Compiled regular expression for memory level matching, or None if + no filtering is applied. + + Notes + ----- + This class is designed to be used as a mixin with specific printing + transformation classes. It provides the `_matchesRegex` method for + filtering buffers based on their memory level assignments. + + The regex-based filtering enables fine-grained control over which + memory levels are included in debug output, which is crucial for + debugging complex memory hierarchies in embedded neural network + deployments. + """ def __init__(self, memoryHierarchyRegex: Optional[str] = None): + """ + Initialize the MemoryAwareGeneration base class. + + Parameters + ---------- + memoryHierarchyRegex : str, optional + A regular expression pattern to match against buffer memory levels. + If None, only buffers without memory level annotations are included. + """ super().__init__() if memoryHierarchyRegex is not None: self.regex = re.compile(memoryHierarchyRegex) @@ -82,6 +209,34 @@ def __init__(self, memoryHierarchyRegex: Optional[str] = None): self.regex = None def _matchesRegex(self, ctxt: NetworkContext, key: str) -> bool: + """ + Check if a buffer matches the memory hierarchy regex pattern. + + Determines whether a buffer should be included in debug output based + on its memory level assignment and the configured regex pattern. + + Parameters + ---------- + ctxt : NetworkContext + The network context containing buffer information. 
+ key : str + The buffer reference key to check. + + Returns + ------- + bool + True if the buffer matches the criteria and should be included + in debug output, False otherwise. + + Notes + ----- + Matching logic: + - If no regex is configured: matches buffers without memory level + - If regex is configured: matches buffers whose memory level + matches the regex pattern + - Buffers without memory level annotations don't match when + a regex is configured + """ _buffer = ctxt.lookup(key) if self.regex is None: @@ -95,12 +250,63 @@ def _matchesRegex(self, ctxt: NetworkContext, key: str) -> bool: class MemoryAwarePrintInputGeneration(MemoryAwareGeneration, PrintInputGeneration): + """ + Memory-aware input tensor debug printing transformation. + + This class combines MemoryAwareGeneration and PrintInputGeneration to + provide selective debug printing of input tensors based on their memory + level assignments. It's particularly useful for debugging multi-level + memory architectures where you want to focus on specific memory regions. + + The class inherits filtering capabilities from MemoryAwareGeneration and + input printing logic from PrintInputGeneration, applying memory-based + filtering before generating debug print statements. + + Notes + ----- + This transformation is especially valuable in embedded neural network + deployments with complex memory hierarchies (e.g., L1/L2/L3 cache levels, + scratchpad memories, external DRAM) where debugging specific memory + regions is crucial for performance optimization and correctness verification. + """ def apply(self, ctxt: NetworkContext, executionBlock: ExecutionBlock, name: str, verbose: CodeGenVerbosity = _NoVerbosity) -> Tuple[NetworkContext, ExecutionBlock]: + """ + Apply memory-aware input tensor printing transformation. + + Filters input tensors by memory level before adding debug print + statements, enabling focused debugging of specific memory regions. 
+ + Parameters + ---------- + ctxt : NetworkContext + The network context containing buffer and type information. + executionBlock : ExecutionBlock + The execution block to instrument with filtered input printing code. + name : str + The name of the operation being instrumented. + verbose : CodeGenVerbosity, optional + The verbosity level for code generation. Default is _NoVerbosity. + + Returns + ------- + Tuple[NetworkContext, ExecutionBlock] + A tuple containing: + - The unchanged network context + - The modified execution block with filtered input print statements + + Notes + ----- + The transformation: + 1. Extracts all dynamic references from the execution block + 2. Filters references based on memory level regex matching + 3. Further filters for tensors that are inputs to this operation + 4. Adds debug print statements for qualifying tensors + """ references = self.extractDynamicReferences(ctxt, executionBlock, @@ -118,8 +324,62 @@ def apply(self, class PrintOutputGeneration(CodeTransformationPass, IntrospectiveCodeTransformationMixIn): + """ + Code transformation pass for generating debug print statements for output tensors. + + This class extends CodeTransformationPass to automatically insert debug + printing code that displays the contents of output tensors after operation + execution. It's useful for debugging, verification, and analysis of neural + network operations by showing the actual data values produced. + + The class complements PrintInputGeneration by focusing on outputs rather + than inputs, providing a complete view of data flow through operations. + + Notes + ----- + This transformation only processes tensors that are actual outputs from + the operation (not used by the current operation, but either used by + other operations or global buffers). The printing is added after the + operation execution. 
+ + Output tensors are identified by checking that the operation is NOT in + their user list, indicating the operation produces rather than consumes + the tensor. + """ def _getRepDict(self, ctxt: NetworkContext, ref: str, name: str): + """ + Create representation dictionary for an output tensor reference. + + Analyzes a tensor reference to determine if it's an output tensor + that should be printed and creates the necessary template variables. + + Parameters + ---------- + ctxt : NetworkContext + The network context containing buffer information. + ref : str + The reference name of the tensor to analyze. + name : str + The name of the operation for filtering tensors. + + Returns + ------- + dict or None + A dictionary containing template variables if the tensor should + be printed, None otherwise. The dictionary includes: + - bufferName: The name of the buffer reference + - bufferType: The data type of the buffer + - bufferShape: The shape/dimensions of the buffer + - nodeName: The operation name + + Notes + ----- + Returns None for: + - TransientBuffer, ConstantBuffer, or StructBuffer instances + - Tensors that have the operation in their user list (inputs) + - Unused local tensors (not global and no users) + """ _buf = ctxt.lookup(ref) refbuf = _buf @@ -142,6 +402,39 @@ def apply(self, executionBlock: ExecutionBlock, name: str, verbose: CodeGenVerbosity = _NoVerbosity) -> Tuple[NetworkContext, ExecutionBlock]: + """ + Apply output tensor printing transformation to an execution block. + + Analyzes all dynamic references in the execution block and adds debug + print statements for output tensors after the operation execution. + + Parameters + ---------- + ctxt : NetworkContext + The network context containing buffer and type information. + executionBlock : ExecutionBlock + The execution block to instrument with output printing code. + name : str + The name of the operation being instrumented, used for filtering + which tensors are considered outputs. 
+ verbose : CodeGenVerbosity, optional + The verbosity level for code generation. Default is _NoVerbosity. + + Returns + ------- + Tuple[NetworkContext, ExecutionBlock] + A tuple containing: + - The unchanged network context + - The modified execution block with output print statements added + + Notes + ----- + The transformation: + 1. Extracts all dynamic references from the execution block + 2. Filters for tensors that are outputs from this operation + 3. Adds debug print statements after the operation execution + 4. Generates formatted output showing tensor metadata and contents + """ references = self.extractDynamicReferences(ctxt, executionBlock, @@ -157,12 +450,63 @@ def apply(self, class MemoryAwarePrintOutputGeneration(MemoryAwareGeneration, PrintOutputGeneration): + """ + Memory-aware output tensor debug printing transformation. + + This class combines MemoryAwareGeneration and PrintOutputGeneration to + provide selective debug printing of output tensors based on their memory + level assignments. It enables focused debugging of output data in specific + memory regions within multi-level memory architectures. + + The class inherits filtering capabilities from MemoryAwareGeneration and + output printing logic from PrintOutputGeneration, applying memory-based + filtering before generating debug print statements for output tensors. + + Notes + ----- + This transformation is particularly valuable for verifying that output + data is correctly written to the intended memory levels in complex + memory hierarchies, and for debugging memory management issues in + embedded neural network deployments. + """ def apply(self, ctxt: NetworkContext, executionBlock: ExecutionBlock, name: str, verbose: CodeGenVerbosity = _NoVerbosity) -> Tuple[NetworkContext, ExecutionBlock]: + """ + Apply memory-aware output tensor printing transformation. 
+ + Filters output tensors by memory level before adding debug print + statements, enabling focused debugging of specific memory regions. + + Parameters + ---------- + ctxt : NetworkContext + The network context containing buffer and type information. + executionBlock : ExecutionBlock + The execution block to instrument with filtered output printing code. + name : str + The name of the operation being instrumented. + verbose : CodeGenVerbosity, optional + The verbosity level for code generation. Default is _NoVerbosity. + + Returns + ------- + Tuple[NetworkContext, ExecutionBlock] + A tuple containing: + - The unchanged network context + - The modified execution block with filtered output print statements + + Notes + ----- + The transformation: + 1. Extracts all dynamic references from the execution block + 2. Filters references based on memory level regex matching + 3. Further filters for tensors that are outputs from this operation + 4. Adds debug print statements for qualifying tensors after execution + """ references = self.extractDynamicReferences(ctxt, executionBlock, @@ -180,8 +524,61 @@ def apply(self, class PrintConstantGeneration(CodeTransformationPass, IntrospectiveCodeTransformationMixIn): + """ + Code transformation pass for generating debug print statements for constant tensors. + + This class extends CodeTransformationPass to automatically insert debug + printing code that displays the contents of constant tensors used by + operations. It's useful for debugging, verification, and analysis of + neural network weights, biases, and other constant parameters. + + Constant tensors represent model parameters and other immutable data + that don't change during execution. Printing these values helps verify + that the correct parameters are loaded and accessible during operation + execution. + + Notes + ----- + This transformation only processes ConstantBuffer instances that have + users (are actually referenced by operations). 
The printing is added + before the operation execution to show the constant values being used. + + This is particularly useful for debugging quantization issues, parameter + loading problems, and weight/bias verification in neural networks. + """ def _getRepDict(self, ctxt: NetworkContext, ref: str, name: str): + """ + Create representation dictionary for a constant tensor reference. + + Analyzes a tensor reference to determine if it's a constant tensor + that should be printed and creates the necessary template variables. + + Parameters + ---------- + ctxt : NetworkContext + The network context containing buffer information. + ref : str + The reference name of the tensor to analyze. + name : str + The name of the operation (used for template generation). + + Returns + ------- + dict or None + A dictionary containing template variables if the tensor should + be printed, None otherwise. The dictionary includes: + - bufferName: The name of the buffer reference + - bufferType: The data type of the buffer + - bufferShape: The shape/dimensions of the buffer + - nodeName: The operation name + + Notes + ----- + Returns None for: + - Non-ConstantBuffer instances + - Constant buffers with no users (unused constants) + """ _buf = ctxt.lookup(ref) refbuf = _buf @@ -195,6 +592,38 @@ def _getRepDict(self, ctxt: NetworkContext, ref: str, name: str): def apply(self, ctxt: NetworkContext, executionBlock: ExecutionBlock, name: str) -> Tuple[NetworkContext, ExecutionBlock]: + """ + Apply constant tensor printing transformation to an execution block. + + Analyzes all dynamic references in the execution block and adds debug + print statements for constant tensors before the operation execution. + + Parameters + ---------- + ctxt : NetworkContext + The network context containing buffer and type information. + executionBlock : ExecutionBlock + The execution block to instrument with constant printing code. + name : str + The name of the operation being instrumented. 
 + Note: unlike the other printing passes, this ``apply`` signature + does not accept a ``verbose`` argument. + + Returns + ------- + Tuple[NetworkContext, ExecutionBlock] + A tuple containing: + - The unchanged network context + - The modified execution block with constant print statements added + + Notes + ----- + The transformation: + 1. Extracts all dynamic references from the execution block + 2. Filters for constant buffers that have users + 3. Adds debug print statements before the operation execution + 4. Generates formatted output showing constant tensor metadata and values + """ references = self.extractDynamicReferences(ctxt, executionBlock, @@ -210,12 +639,70 @@ def apply(self, ctxt: NetworkContext, executionBlock: ExecutionBlock, class MemoryAwarePrintConstantGeneration(MemoryAwareGeneration, PrintConstantGeneration): + """ + Memory-aware constant tensor debug printing transformation. + + This class combines MemoryAwareGeneration and PrintConstantGeneration to + provide selective debug printing of constant tensors based on their memory + level assignments. It enables focused debugging of constant data (weights, + biases, parameters) in specific memory regions within multi-level memory + architectures. + + The class inherits filtering capabilities from MemoryAwareGeneration and + constant printing logic from PrintConstantGeneration, applying memory-based + filtering before generating debug print statements for constant tensors. + + Notes + ----- + This transformation is particularly valuable for: + - Verifying parameter placement in specific memory levels + - Debugging weight loading and quantization in embedded deployments + - Analyzing memory usage patterns for constant data + - Troubleshooting parameter access issues in complex memory hierarchies + + It's especially useful in scenarios where different constant tensors + are placed in different memory levels for performance optimization. 
+ """ def apply(self, ctxt: NetworkContext, executionBlock: ExecutionBlock, name: str, verbose: CodeGenVerbosity = _NoVerbosity) -> Tuple[NetworkContext, ExecutionBlock]: + """ + Apply memory-aware constant tensor printing transformation. + + Filters constant tensors by memory level before adding debug print + statements, enabling focused debugging of parameters in specific + memory regions. + + Parameters + ---------- + ctxt : NetworkContext + The network context containing buffer and type information. + executionBlock : ExecutionBlock + The execution block to instrument with filtered constant printing code. + name : str + The name of the operation being instrumented. + verbose : CodeGenVerbosity, optional + The verbosity level for code generation. Default is _NoVerbosity. + This parameter is currently unused by the implementation. + + Returns + ------- + Tuple[NetworkContext, ExecutionBlock] + A tuple containing: + - The unchanged network context + - The modified execution block with filtered constant print statements + + Notes + ----- + The transformation: + 1. Extracts all dynamic references from the execution block + 2. Filters references based on memory level regex matching + 3. Further filters for constant buffers that have users + 4. 
Adds debug print statements for qualifying constant tensors + """ references = self.extractDynamicReferences(ctxt, executionBlock, diff --git a/Deeploy/CommonExtensions/DataTypes.py b/Deeploy/CommonExtensions/DataTypes.py index c05ea3b9d9..e479b92704 100644 --- a/Deeploy/CommonExtensions/DataTypes.py +++ b/Deeploy/CommonExtensions/DataTypes.py @@ -10,54 +10,63 @@ class int8_t(IntegerImmediate): + """8-bit signed integer type.""" typeName = "int8_t" typeWidth = 8 signed = True class int16_t(IntegerImmediate): + """16-bit signed integer type.""" typeName = "int16_t" typeWidth = 16 signed = True class int32_t(IntegerImmediate): + """32-bit signed integer type.""" typeName = "int32_t" typeWidth = 32 signed = True class int64_t(IntegerImmediate): + """64-bit signed integer type.""" typeName = "int64_t" typeWidth = 64 signed = True class uint8_t(IntegerImmediate): + """8-bit unsigned integer type.""" typeName = "uint8_t" typeWidth = 8 signed = False class uint16_t(IntegerImmediate): + """16-bit unsigned integer type.""" typeName = "uint16_t" typeWidth = 16 signed = False class uint32_t(IntegerImmediate): + """32-bit unsigned integer type.""" typeName = "uint32_t" typeWidth = 32 signed = False class uint64_t(IntegerImmediate): + """64-bit unsigned integer type.""" typeName = "uint64_t" typeWidth = 64 signed = False class bfloat16_t(FloatImmediate): + """16-bit bfloat float type with 7-bit mantissa and 8-bit exponent.""" typeName = "bfloat16_t" typeWidth = 16 typeMantissa = 7 @@ -65,6 +74,7 @@ class bfloat16_t(FloatImmediate): class float16_t(FloatImmediate): + """16-bit float type with 10-bit mantissa and 5-bit exponent.""" typeName = "float16_t" typeWidth = 16 typeMantissa = 10 @@ -72,6 +82,7 @@ class float16_t(FloatImmediate): class float32_t(FloatImmediate): + """32-bit float type with 23-bit mantissa and 8-bit exponent.""" typeName = "float32_t" typeWidth = 32 typeMantissa = 23 @@ -79,6 +90,7 @@ class float32_t(FloatImmediate): class float64_t(FloatImmediate): + 
 """64-bit float type with 52-bit mantissa and 11-bit exponent.""" typeName = "float64_t" typeWidth = 64 typeMantissa = 52 @@ -96,6 +108,18 @@ class float64_t(FloatImmediate): def minimalIntegerType(value: Union[int, Iterable[int], npt.NDArray]) -> Type[IntegerImmediate]: + """Returns the minimal integer type that can represent all values in the given input. + + Parameters + ---------- + value : Union[int, Iterable[int], npt.NDArray] + The integer value or values to analyze. + + Returns + ------- + Type[IntegerImmediate] + The minimal integer type that can represent all values. + """ # Sort data types by typeWidth and signedness (unsigned types go first) sorted_types = sorted( IntegerDataTypes, @@ -110,6 +134,18 @@ def minimalIntegerType(value: Union[int, Iterable[int], npt.NDArray]) -> Type[In def minimalFloatType(value: Union[float, Iterable[float], npt.NDArray]) -> Type[FloatImmediate]: + """Returns the minimal float type that can represent all values in the given input. + + Parameters + ---------- + value : Union[float, Iterable[float], npt.NDArray] + The float value or values to analyze. + + Returns + ------- + Type[FloatImmediate] + The minimal float type that can represent all values. + """ # Sort data types by typeWidth sorted_types = sorted( FloatDataTypes, diff --git a/Deeploy/CommonExtensions/OptimizationPasses/Matchers.py b/Deeploy/CommonExtensions/OptimizationPasses/Matchers.py index cec95ec134..9153bbda9f 100644 --- a/Deeploy/CommonExtensions/OptimizationPasses/Matchers.py +++ b/Deeploy/CommonExtensions/OptimizationPasses/Matchers.py @@ -9,17 +9,100 @@ class Match(NamedTuple): + """ + Represents a successful pattern match in a computational graph. + + This named tuple encapsulates the result of matching a pattern graph + against a larger computational graph. It contains both the anchor node + (starting point of the match) and a complete mapping between pattern + nodes and their corresponding matched nodes in the target graph. 
+ + Attributes + ---------- + anchor : gs.Node + The node in the target graph that serves as the starting point + for the pattern match. This is typically the first node that + matched the pattern and from which the full match was discovered. + nodes_map : Dict[str, gs.Node] + A dictionary mapping pattern node names to their corresponding + matched nodes in the target graph. The keys are pattern node names + and the values are the actual matched nodes from the target graph. + + Notes + ----- + This class is used by pattern matching algorithms to represent successful + matches. The nodes_map provides a complete correspondence between the + pattern structure and the matched subgraph, enabling transformations + and optimizations to be applied to the matched regions. + """ anchor: gs.Node nodes_map: Dict[str, gs.Node] class SubgraphMatcher: + """ + Base class for pattern matching in computational graphs. + + This class provides the foundation for matching pattern graphs against + larger computational graphs. It supports both exact string matching and + regular expression matching for operation types, enabling flexible + pattern recognition for graph optimization and transformation. + + The matcher identifies non-overlapping instances of a pattern within + a target graph, returning Match objects that can be used for subsequent + transformations or analysis. + + Notes + ----- + This is an abstract base class that defines the interface for pattern + matching. Concrete implementations must override the abstract methods + `_valid_pattern` and `_nodes_map_from_anchor` to define specific + matching algorithms. + + The matching process ensures non-overlapping matches, meaning each + node in the target graph can only participate in at most one match. + """ def __init__(self, regex_op: bool = False): + """ + Initialize the SubgraphMatcher. + + Parameters + ---------- + regex_op : bool, optional + Whether to use regular expression matching for operation types. 
+ Default is False for exact string matching. + """ # operation matching policy self.regex_op = regex_op def is_op_match(self, patternNode: gs.Node, graphNode: gs.Node): + """ + Check if a pattern node operation matches a graph node operation. + + Compares the operation types of two nodes according to the configured + matching policy (regex or exact match). + + Parameters + ---------- + patternNode : gs.Node + The pattern node whose operation type serves as the match criterion. + graphNode : gs.Node + The target graph node to check for a match. + + Returns + ------- + bool + True if the operations match according to the configured policy, + False otherwise. + + Notes + ----- + When regex_op is True, the pattern node's operation is treated as a + regular expression pattern and matched against the graph node's + operation using `re.fullmatch`. When False, exact string equality + is used. + """ if self.regex_op: return re.fullmatch(patternNode.op, graphNode.op) is not None else: @@ -27,13 +110,84 @@ def is_op_match(self, patternNode: gs.Node, graphNode: gs.Node): # Override this def _valid_pattern(self, pattern: gs.Graph) -> None: + """ + Validate that a pattern graph meets the requirements for matching. + + This abstract method should be overridden by subclasses to implement + pattern validation logic specific to their matching algorithm. + + Parameters + ---------- + pattern : gs.Graph + The pattern graph to validate. + + Raises + ------ + AssertionError + If the pattern does not meet the required constraints. + + Notes + ----- + This method is called before attempting to match a pattern and should + verify that the pattern has the correct structure for the specific + matching algorithm being used. + """ _ = pattern # Override this def _nodes_map_from_anchor(self, anchor: gs.Node, pattern: gs.Graph) -> Optional[Dict[str, gs.Node]]: + """ + Attempt to match a pattern starting from an anchor node. 
+ + This abstract method should be overridden by subclasses to implement + the core matching logic specific to their algorithm. + + Parameters + ---------- + anchor : gs.Node + The potential starting node for matching the pattern. + pattern : gs.Graph + The pattern graph to match. + + Returns + ------- + Optional[Dict[str, gs.Node]] + A dictionary mapping pattern node names to matched graph nodes + if the pattern matches starting from the anchor, None otherwise. + + Notes + ----- + This method contains the core pattern matching algorithm and should + return a complete mapping from pattern nodes to graph nodes if a + valid match is found. + """ _, _ = anchor, pattern def _match_from_anchor(self, anchor: gs.Node, pattern: gs.Graph) -> Optional[Match]: + """ + Attempt to create a complete match starting from an anchor node. + + Uses the subclass-specific matching algorithm to find a node mapping + and validates that the mapping covers all nodes in the pattern. + + Parameters + ---------- + anchor : gs.Node + The potential starting node for pattern matching. + pattern : gs.Graph + The pattern graph to match against. + + Returns + ------- + Optional[Match] + A Match object containing the anchor and complete node mapping + if successful, None if the pattern doesn't match from this anchor. + + Notes + ----- + This method ensures that a valid match covers all nodes in the + pattern graph before considering it successful. + """ nodes_map = self._nodes_map_from_anchor(anchor, pattern) if nodes_map is not None and len(nodes_map.keys()) == len(pattern.nodes): @@ -42,6 +196,37 @@ def _match_from_anchor(self, anchor: gs.Node, pattern: gs.Graph) -> Optional[Mat return None def match(self, graph: gs.Graph, pattern: gs.Graph): + """ + Find all non-overlapping matches of a pattern in a target graph. + + Systematically searches the target graph for instances of the pattern, + ensuring that each node participates in at most one match to avoid + conflicts during transformations. 
+ + Parameters + ---------- + graph : gs.Graph + The target graph to search for pattern matches. + pattern : gs.Graph + The pattern graph to find instances of. + + Returns + ------- + List[Match] + A list of Match objects representing all non-overlapping instances + of the pattern found in the target graph. + + Notes + ----- + The algorithm: + 1. Validates the pattern using the subclass-specific validation + 2. Iterates through all nodes in the target graph as potential anchors + 3. Attempts to match the pattern from each anchor + 4. Collects only non-overlapping matches to avoid conflicts + + Non-overlapping means that if a node is part of one match, it cannot + be part of any other match in the returned list. + """ self._valid_pattern(pattern) # Return a list of non-overlapping matches of pattern @@ -65,19 +250,107 @@ def is_overlap(match: Match): class NonBranchingMatcher(SubgraphMatcher): + """ + Pattern matcher for sequential computational graphs without branching. + + This matcher is optimized for patterns that form a simple chain of operations + without splits or merges in the computational flow. It uses a recursive + algorithm to follow the linear path of operations. + + The matching algorithm follows edges from the anchor node to build a complete + mapping between pattern nodes and graph nodes, verifying operation types + and attributes at each step. + Notes + ----- + This matcher is efficient for linear operation sequences such as: + - Conv -> BatchNorm -> ReLU chains + - Linear -> Dropout -> Activation sequences + - Simple preprocessing pipelines + + The algorithm assumes that each node in the pattern has at most one + output connection to the next node in the sequence. + """ + # simplified matcher which matches call_module ops more reasonably def __init__(self, regex_op: bool = False): + """ + Initialize the non-branching pattern matcher. + + Parameters + ---------- + regex_op : bool, optional + Enable regex-based operation type matching. 
Default is False. + """ # This checking is sufficient - iff the graph is acyclic and connected (checked by parser) # and every node has one output, the graph is sequential super().__init__(regex_op) def _valid_pattern(self, pattern: gs.Graph): + """ + Validate that the pattern is suitable for non-branching matching. + + Ensures that the pattern graph forms a simple sequential chain + without branching or multiple outputs. + + Parameters + ---------- + pattern : gs.Graph + The pattern graph to validate. + + Raises + ------ + AssertionError + If the pattern has more than one output or any node has + multiple outputs (indicating branching). + + Notes + ----- + Valid patterns for non-branching matching must satisfy: + 1. Exactly one graph output + 2. Each node has exactly one output (no branching) + 3. Forms a simple chain of operations + """ assert len(pattern.outputs) == 1, "Found more than one output" for node in pattern.nodes: assert len(node.outputs) == 1, "Graph needs to be purely sequential!" def _match_nodes_recursive(self, pn: gs.Node, gn: gs.Node, pattern_length: int, nodes_map: dict) -> Optional[Dict[str, gs.Node]]: + """ + Recursively match nodes in a sequential pattern. + + Follows the linear chain of operations from the current nodes, + building a complete mapping between pattern and graph nodes. + + Parameters + ---------- + pn : gs.Node + Current node in the pattern graph. + gn : gs.Node + Current node in the target graph. + pattern_length : int + Total number of nodes in the pattern (for termination). + nodes_map : dict + Accumulated mapping from pattern node names to graph nodes. + + Returns + ------- + Optional[Dict[str, gs.Node]] + Complete node mapping if the pattern matches from this point, + None if the pattern doesn't match. + + Notes + ----- + The algorithm: + 1. Verifies that current nodes are compatible (type and attributes) + 2. Adds the current mapping to nodes_map + 3. If pattern is complete, returns the mapping + 4. 
Otherwise, recursively matches the next nodes in the sequence + + This simplified approach works because we've already validated + that the pattern is purely sequential. + """ + # as we do sequential traversal, the first step (checking if nodes # already traversed) of the original _match_nodes function can be # discarded @@ -119,20 +392,150 @@ def attributes_are_equal(pn: gs.Node, gn: gs.Node) -> bool: return self._match_nodes_recursive(pn.o(), gn.o(), pattern_length - 1, nodes_map) def _nodes_map_from_anchor(self, anchor: gs.Node, pattern: gs.Graph) -> Optional[Dict[str, gs.Node]]: + """ + Create a complete node mapping starting from an anchor node. + + Initiates the recursive matching process from the first node in the + pattern and the provided anchor node in the target graph. + + Parameters + ---------- + anchor : gs.Node + The starting node in the target graph. + pattern : gs.Graph + The pattern graph to match. + + Returns + ------- + Optional[Dict[str, gs.Node]] + A complete mapping from pattern node names to graph nodes + if the pattern matches starting from the anchor, None otherwise. + + Notes + ----- + This method selects the first node from the pattern as the pattern + anchor and delegates to the recursive matching algorithm. For + sequential patterns, the choice of pattern anchor doesn't affect + the result since there's only one valid traversal order. + """ pattern_anchor = next(iter(pattern.nodes)) return self._match_nodes_recursive(pattern_anchor, anchor, len(pattern.nodes), {}) class BranchingMatcher(SubgraphMatcher): + """ + Pattern matcher for computational graphs with branching and merging. + + This matcher handles complex patterns that contain splits, merges, and + other non-sequential structures. It uses a more sophisticated + algorithm that can traverse graphs in both forward and reverse directions + to handle branching patterns. 
+ + The matching algorithm explores multiple paths through the graph and + can handle patterns with: + - Multiple inputs/outputs per node + - Fan-out (one node feeding multiple nodes) + - Fan-in (multiple nodes feeding one node) + - Complex DAG structures + + Parameters + ---------- + regex_op : bool, optional + If True, enables regex-based operation type matching instead of + exact string matching. Default is False. + + Notes + ----- + This matcher is suitable for complex patterns such as: + - ResNet skip connections + - Attention mechanisms with multiple branches + - Feature pyramid networks + - Any pattern with non-linear control flow + + The algorithm is more computationally intensive than NonBranchingMatcher + but provides full generality for arbitrary DAG patterns. + """ + # simplified matcher which matches call_module ops more reasonably def __init__(self, regex_op: bool = False): + """ + Initialize the branching pattern matcher. + + Parameters + ---------- + regex_op : bool, optional + Enable regex-based operation type matching. Default is False. + """ super().__init__(regex_op) def _valid_pattern(self, pattern: gs.Graph): + """ + Validate that the pattern is suitable for branching matching. + + Ensures that the pattern has exactly one output, but allows for + complex internal structure with branching and merging. + + Parameters + ---------- + pattern : gs.Graph + The pattern graph to validate. + + Raises + ------ + AssertionError + If the pattern has more than one output. 
+ + Notes + ----- + Unlike NonBranchingMatcher, this validator only checks for a single + output but allows arbitrary internal complexity including: + - Nodes with multiple inputs (fan-in) + - Nodes with multiple outputs (fan-out) + - Complex DAG structures + """ assert len(pattern.outputs) == 1, "Found more than one output" def _match_nodes_recursive(self, pn: gs.Node, gn: gs.Node, nodes_map: dict, direction: Literal["Forward", "Reverse"]) -> Optional[Dict]: + """ + Recursively match nodes in a branching pattern. + + Explores the graph in the specified direction, handling both forward + traversal (following outputs) and reverse traversal (following inputs) + to match complex branching patterns. + + Parameters + ---------- + pn : gs.Node + Current node in the pattern graph. + gn : gs.Node + Current node in the target graph. + nodes_map : dict + Accumulated mapping from pattern node names to graph nodes. + direction : Literal["Forward", "Reverse"] + Direction of graph traversal - "Forward" follows outputs, + "Reverse" follows inputs. + + Returns + ------- + Optional[Dict] + Updated node mapping if the pattern continues to match, + None if the pattern doesn't match from this point. + + Raises + ------ + AssertionError + If direction is not "Forward" or "Reverse". + + Notes + ----- + The algorithm: + 1. Validates that current nodes are compatible + 2. Adds the current mapping if not already present + 3. Recursively explores neighbors in the specified direction + 4. Handles both fan-out and fan-in scenarios + 5. Terminates when all pattern nodes are matched + """ assert direction in ["Forward", "Reverse"], f"'{direction}' is not a valid matching direction!" # Check if nodes are identical @@ -232,5 +635,35 @@ def attributes_are_equal(pn: gs.Node, gn: gs.Node): assert False, "This statement should never be reached!" 
def _nodes_map_from_anchor(self, anchor: gs.Node, pattern: gs.Graph) -> Optional[Dict[str, gs.Node]]: + """ + Create a complete node mapping starting from an anchor node. + + Initiates the recursive branching matching process from the first node + in the pattern and the provided anchor node in the target graph. + + Parameters + ---------- + anchor : gs.Node + The starting node in the target graph. + pattern : gs.Graph + The pattern graph to match. + + Returns + ------- + Optional[Dict[str, gs.Node]] + A complete mapping from pattern node names to graph nodes + if the pattern matches starting from the anchor, None otherwise. + + Notes + ----- + This method: + 1. Selects the first node from the pattern as the pattern anchor + 2. Initiates forward traversal from the anchor nodes + 3. Uses the full branching matching algorithm to handle complex patterns + + The forward direction is used initially, but the recursive algorithm + may switch to reverse direction as needed to properly explore + branching structures. + """ pattern_anchor = next(iter(pattern.nodes)) return self._match_nodes_recursive(pattern_anchor, anchor, {}, 'Forward') diff --git a/Deeploy/DeeployTypes.py b/Deeploy/DeeployTypes.py index 4d643d03be..f5962718f0 100644 --- a/Deeploy/DeeployTypes.py +++ b/Deeploy/DeeployTypes.py @@ -521,7 +521,9 @@ def __repr__(self) -> str: class NetworkContext(): - """The global context of the compiler. This object holds all the typing inferred in the type-checking passes within the respective buffers. It holds all hoisted transient buffers, struct buffers, and global definitions. The context is the source of truth for all code generation in the backend. + """The global context of the compiler. This object holds all the typing inferred in the type-checking passes within + the respective buffers. It holds all hoisted transient buffers, struct buffers, and global definitions. + The context is the source of truth for all code generation in the backend. 
""" def __init__(self, @@ -559,8 +561,8 @@ def dealiasBuffer(self, name: str) -> str: Raises ------ Exception - Raises an Exception if aliases are circular - + Raises an Exception if aliases are circular, i.e. there + is no underlying VariableBuffer """ seenAliases: Set[str] = set() alias = self.lookup(name) @@ -586,8 +588,8 @@ def unravelReference(self, ref: VariableBuffer) -> VariableBuffer: Raises ------ Exception - Raises an Exception if references are circular - + Raises an Exception if references are circular, i.e. there + is no underlying VariableBuffer """ seenRefs = set() while isinstance(ref, _ReferenceBuffer): @@ -937,7 +939,7 @@ def hoistReference(self, reference : VariableBuffer Referenced VariableBuffer shape: Tuple[int, ...] - Shape of the _ReferenceBuffer + Shape of the reference offset: Union[int, str, VariableBuffer] Offset from the reference override_type: Optional[Type[BaseType]] @@ -947,7 +949,6 @@ def hoistReference(self, ------- _ReferenceBuffer Returns the newly registered _ReferenceBuffer - """ ref = _ReferenceBuffer(name, reference, shape, offset) if override_type is not None: diff --git a/Deeploy/TilingExtension/TilerExtension.py b/Deeploy/TilingExtension/TilerExtension.py index 9b48d9456c..d932b22740 100644 --- a/Deeploy/TilingExtension/TilerExtension.py +++ b/Deeploy/TilingExtension/TilerExtension.py @@ -44,6 +44,51 @@ class Tiler(): + """Tiler for a computation graphs with memory-aware optimization. + + The Tiler class provides functionality for tiling operations to fit within + memory constraints of target hardware platforms. It performs memory allocation, constraint + solving, and scheduling to optimize execution within hierarchical memory systems. + + Parameters + ---------- + memoryHierarchy : MemoryHierarchy + The memory hierarchy specification defining available memory levels and their capacities. + + Attributes + ---------- + arenaName : str + Name prefix for memory arena buffers. 
+ memorySchedulerClass : Type[MemoryScheduler] + Class type for memory scheduler instances. + memoryHierarchy : MemoryHierarchy + The memory hierarchy configuration. + tilerModel : Optional[TilerModel] + The constraint solver model for tiling optimization. + innerMemoryScheduler : MemoryScheduler + Scheduler for inner memory level allocation. + outerMemoryScheduler : MemoryScheduler + Scheduler for outer memory level allocation. + symbolicMemoryConstraints : Optional[List[PatternMemoryConstraints]] + Symbolic memory constraints for the tiling problem. + visualizeMemoryAlloc : bool + Flag to enable memory allocation visualization. + memoryAllocStrategy : {"TetrisRandom", "TetrisCo-Opt", "MiniMalloc"} + Strategy for memory allocation. + searchStrategy : {"min", "max", "random-max"} + Search strategy for constraint solving. + + Examples + -------- + >>> L3 = MemoryLevel(name = "L3", neighbourNames = ["L2"], size = 1024000) + >>> L2 = MemoryLevel(name = "L2", neighbourNames = ["L3", "L1"], size = 512000) + >>> L1 = MemoryLevel(name = "L1", neighbourNames = ["L2"], size = 128000) + >>> memoryHierarchy = MemoryHierarchy([L3, L2, L1]) + >>> memoryHierarchy.setDefaultMemoryLevel("L3") + >>> tiler = Tiler(hierarchy) + >>> tiler.memoryAllocStrategy = "MiniMalloc" + >>> solution = tiler.computeTilingSchedule(context) + """ arenaName = "MEMORYARENA" memorySchedulerClass: Type[MemoryScheduler] = MemoryScheduler @@ -53,6 +98,13 @@ class Tiler(): # Initialize with the list of TemplateTCFbinding def __init__(self, memoryHierarchy: MemoryHierarchy, testName: Optional[str] = None, workDir: Optional[str] = None): + """Initialize the Tiler with a memory hierarchy. + + Parameters + ---------- + memoryHierarchy : MemoryHierarchy + The memory hierarchy specification defining available memory levels. 
+ """ self.memoryHierarchy = memoryHierarchy self.tilerModel: Optional[TilerModel] = None @@ -85,10 +137,39 @@ def __init__(self, memoryHierarchy: MemoryHierarchy, testName: Optional[str] = N @property def worstCaseBufferSize(self): + """Get the worst-case buffer sizes for each memory level. + + Returns + ------- + Dict[str, int] + Dictionary mapping memory level names to their worst-case buffer sizes in bytes. + """ return self._worstCaseBufferSize def plotMemoryAlloc(self, memoryMap: Dict[str, List[List[MemoryBlock]]], ctxt: NetworkContext, deeployStateDir: str, memoryHierarchy: MemoryHierarchy): + """Generate interactive visualization of memory allocation patterns. + + Creates an HTML file with Plotly visualizations showing memory allocation + over time for each memory level in the hierarchy. + + Parameters + ---------- + memoryMap : Dict[str, List[List[MemoryBlock]]] + Memory allocation map containing blocks for each memory level and time step. + ctxt : NetworkContext + Network context containing buffer information. + deeployStateDir : str + Directory path where the visualization HTML file will be saved. + memoryHierarchy : MemoryHierarchy + Memory hierarchy configuration for the visualization. + + Notes + ----- + Generates a file named 'memory_alloc.html' in the specified directory. + Each memory level is visualized as a separate subplot showing buffer + lifetimes and address space usage. + """ os.makedirs(os.path.abspath(deeployStateDir), exist_ok = True) memoryAllocPlotPath = os.path.abspath(os.path.join(deeployStateDir, f"memory_alloc.html")) @@ -177,6 +258,29 @@ def plotSingleMemoryLevel(memoryLevel: MemoryLevel): def _convertCtxtToStaticSchedule(self, ctxt: NetworkContext, memoryMap: Dict[str, List[List[MemoryBlock]]]) -> NetworkContext: + """Convert network context to use static memory allocation. + + Transforms the network context to use statically allocated memory arenas + based on the computed memory map. 
Updates buffer allocation templates + to reference specific offsets within memory arenas. + + Parameters + ---------- + ctxt : NetworkContext + The network context to be updated. + memoryMap : Dict[str, List[List[MemoryBlock]]] + Memory allocation map containing blocks for each memory level. + + Returns + ------- + NetworkContext + Updated network context with static memory allocation. + + Notes + ----- + Creates memory arena buffers for each memory level and updates + individual buffer allocation templates to use offsets within these arenas. + """ maxAddr: Dict[str, int] = {} @@ -254,6 +358,41 @@ def _convertCtxtToStaticSchedule(self, ctxt: NetworkContext, return ctxt def minimalloc(self, memoryMap, ctxt, nodeMemoryConstraint, capacity: int, memoryLevel: str): + """Perform memory allocation using the MiniMalloc external tool. + + Interfaces with the external MiniMalloc memory allocator to compute + optimal memory allocation for the given memory blocks and constraints. + + Parameters + ---------- + memoryMap : List[MemoryBlock] + List of memory blocks to be allocated. + ctxt : NetworkContext + Network context containing buffer information. + nodeMemoryConstraint : Optional[NodeMemoryConstraint] + Memory constraints for the current node, if available. + capacity : int + Total memory capacity available for allocation. + memoryLevel : str + Name of the memory level being allocated. + + Returns + ------- + List[MemoryBlock] + Updated memory blocks with assigned address spaces. + + Raises + ------ + KeyError + If MINIMALLOC_INSTALL_DIR environment variable is not set. + subprocess.CalledProcessError + If the MiniMalloc tool fails to execute successfully. + + Notes + ----- + Requires the MiniMalloc tool to be installed and the MINIMALLOC_INSTALL_DIR + environment variable to be set to the installation directory. 
+ """ with open(f"{self._minimalloc_input}.csv", mode = "w", newline = "") as file: writer = csv.writer(file, lineterminator = "\n") @@ -312,6 +451,31 @@ def minimalloc(self, memoryMap, ctxt, nodeMemoryConstraint, capacity: int, memor return memoryMap def computeTilingSchedule(self, ctxt: NetworkContext) -> TilingSolution: + """Compute the optimal tiling schedule for the network. + + Solves the constraint optimization problem to find the best tiling + solution that satisfies memory and computational constraints. + + Parameters + ---------- + ctxt : NetworkContext + Network context containing the computational graph and constraints. + + Returns + ------- + TilingSolution + The computed tiling solution with memory constraints for each pattern. + + Raises + ------ + AssertionError + If the tiler model or symbolic memory constraints are not initialized. + + Notes + ----- + This method requires that setupModel() has been called previously to + initialize the constraint model and symbolic memory constraints. + """ assert self.tilerModel is not None and self.symbolicMemoryConstraints is not None, "Set up the model before trying to compute a schedule!" collector = self.tilerModel.trySolveModel() tilingSolution = self._getTilingSolution(self.tilerModel, ctxt, collector, self.symbolicMemoryConstraints) @@ -323,6 +487,29 @@ def computeTilingSchedule(self, ctxt: NetworkContext) -> TilingSolution: return tilingSolution def computeMemoryMap(self, ctxt: NetworkContext, tilingSolution: TilingSolution) -> MemoryMap: + """Compute memory allocation map from the tiling solution. + + Generates a concrete memory allocation map that assigns specific + memory addresses to each buffer based on the tiling solution. + + Parameters + ---------- + ctxt : NetworkContext + Network context containing buffer information. + tilingSolution : TilingSolution + The computed tiling solution. 
+ + Returns + ------- + MemoryMap + Dictionary mapping memory level names to lists of memory blocks + for each time step. + + Notes + ----- + The memory allocation strategy (TetrisRandom, TetrisCo-Opt, or MiniMalloc) + determines how the actual memory addresses are assigned. + """ memoryMap = {} for key in self.innerMemoryScheduler.memoryMap.keys(): @@ -348,6 +535,30 @@ def computeMemoryMap(self, ctxt: NetworkContext, tilingSolution: TilingSolution) def annotateMemoryLevel(self, ctxt: NetworkContext, tilingSolution: TilingSolution, memoryMap: Dict) -> NetworkContext: + """Annotate memory constraints with actual address space allocations. + + Updates the memory constraints in the tiling solution with the actual + address spaces computed during memory allocation. + + Parameters + ---------- + ctxt : NetworkContext + Network context containing buffer information. + tilingSolution : TilingSolution + The tiling solution to be annotated. + memoryMap : Dict[str, List[List[MemoryBlock]]] + Memory allocation map with assigned address spaces. + + Returns + ------- + NetworkContext + Updated network context (returned for consistency). + + Notes + ----- + This method modifies the tiling solution in-place by adding address + space information to memory constraints. + """ for idx, pattern in enumerate(tilingSolution): for nodeIdx, nodeConstraint in enumerate(pattern.nodeConstraints): for tensorConstraint in nodeConstraint.tensorMemoryConstraints.values(): @@ -373,6 +584,32 @@ def annotateMemoryLevel(self, ctxt: NetworkContext, tilingSolution: TilingSoluti def setupModel(self, ctxt: NetworkContext, schedule: Schedule, layerBinding: OrderedDict[str, ONNXLayer], targetMemoryLevelMapping: TargetMemoryLevelMapping) -> NetworkContext: + """Set up the constraint optimization model for tiling. + + Initializes the tiler model with geometric constraints, memory constraints, + and optimization objectives based on the network schedule and layer bindings. 
+ + Parameters + ---------- + ctxt : NetworkContext + Network context containing the computational graph. + schedule : Schedule + Execution schedule defining the order of operations. + layerBinding : OrderedDict[str, ONNXLayer] + Mapping from node names to their layer implementations. + targetMemoryLevelMapping : TargetMemoryLevelMapping + Mapping defining which memory levels to use for each tensor. + + Returns + ------- + NetworkContext + The network context (returned for consistency). + + Notes + ----- + This method must be called before computeTilingSchedule() to initialize + the constraint model and symbolic memory constraints. + """ wrapSchedule: List[SubGraph] = [] for entry in schedule: @@ -396,6 +633,37 @@ def setupModel(self, ctxt: NetworkContext, schedule: Schedule, layerBinding: Ord # SCHEREMO: Return a integer factor or IntVar variable for the multi Buffer coefficient given the tiling path, hop and tensorName. def multiBufferStrategy(self, tilerModel: TilerModel, ctxt: NetworkContext, pattern: SubGraph, path: List[str], hop: str, tensorName: str) -> Union[int, IntVar]: + """Determine multi-buffering coefficient for a tensor in the tiling strategy. + + Computes the buffering factor (e.g., double buffering = 2) for a given tensor + based on its type and usage pattern in the computation graph. This coefficient + is used to determine how many copies of the tensor should be kept in memory. + + Parameters + ---------- + tilerModel : TilerModel, (unused) + The constraint solver model. + ctxt : NetworkContext + Network context containing buffer information. + pattern : SubGraph, (unused) + The computation pattern being analyzed. + path : List[str], (unused) + Memory hierarchy path for the tensor. + hop : str, (unused) + Current memory level in the path. + tensorName : str + Name of the tensor to analyze. + + Returns + ------- + Union[int, IntVar] + Buffering coefficient (typically 1 for transient buffers, 2 for others). 
+ + Notes + ----- + The multi-buffering strategy helps overlap computation with data movement + by maintaining multiple copies of buffers at different memory levels. + """ varBuffer = ctxt.lookup(tensorName) @@ -426,6 +694,30 @@ def multiBufferStrategy(self, tilerModel: TilerModel, ctxt: NetworkContext, patt def propagateIOBufferStrategy(self, tileConstraintPattern: PatternMemoryConstraints, pattern: SubGraph, ctxt: NetworkContext) -> PatternMemoryConstraints: + """Propagate I/O buffer strategy across the tiling pattern. + + Implements static n-tuple buffering strategy by propagating border tensor + constraints across all steps in the tiling pattern. + + Parameters + ---------- + tileConstraintPattern : PatternMemoryConstraints + Memory constraints for the tiling pattern. + pattern : SubGraph + The computation subgraph being tiled. + ctxt : NetworkContext + Network context containing buffer information. + + Returns + ------- + PatternMemoryConstraints + Updated pattern memory constraints with propagated I/O buffer strategy. + + Notes + ----- + This method ensures that border tensors (inputs/outputs of the pattern) + maintain consistent memory allocation across all computation steps. + """ borderTensorStep = NodeMemoryConstraint() for patternStep in tileConstraintPattern.nodeConstraints: @@ -438,6 +730,37 @@ def propagateIOBufferStrategy(self, tileConstraintPattern: PatternMemoryConstrai def _resolveTensorMemoryConstraint(self, tilerModel: TilerModel, ctxt: NetworkContext, collector: SolutionCollector, tensorConstraint: TensorMemoryConstraint) -> TensorMemoryConstraint: + """Resolve symbolic tensor memory constraints to concrete values. + + Converts symbolic variables in tensor memory constraints to their + concrete values from the solver solution. + + Parameters + ---------- + tilerModel : TilerModel + The constraint solver model with the solution. + ctxt : NetworkContext + Network context containing buffer information. 
+ collector : SolutionCollector + Solution collector from the constraint solver. + tensorConstraint : TensorMemoryConstraint + Symbolic tensor memory constraint to resolve. + + Returns + ------- + TensorMemoryConstraint + Tensor memory constraint with resolved concrete values. + + Raises + ------ + AssertionError + If the tiler model is not initialized. + + Notes + ----- + This method extracts the actual buffer sizes and shapes from the + solved constraint model and creates concrete memory constraints. + """ assert self.tilerModel is not None, "Can't resolve tensor memory constraints, tilerModel is None!" tensorName = tensorConstraint.tensorName @@ -472,6 +795,32 @@ def _resolveTensorMemoryConstraint(self, tilerModel: TilerModel, ctxt: NetworkCo def _getTilingSolution(self, tilerModel: TilerModel, ctxt: NetworkContext, collector: SolutionCollector, allConstraints: List[PatternMemoryConstraints]) -> List[PatternMemoryConstraints]: + """Extract tiling solution from the solved constraint model. + + Processes all pattern memory constraints and resolves symbolic variables + to create a concrete tiling solution. + + Parameters + ---------- + tilerModel : TilerModel + The solved constraint model. + ctxt : NetworkContext + Network context containing buffer information. + collector : SolutionCollector + Solution collector from the constraint solver. + allConstraints : List[PatternMemoryConstraints] + List of all symbolic pattern memory constraints. + + Returns + ------- + List[PatternMemoryConstraints] + Resolved tiling solution with concrete memory constraints. + + Notes + ----- + Only constraints that require resolution (multi-level or transient buffers) + are processed. Global single-level buffers are skipped. 
+ """ retList = [] @@ -502,6 +851,29 @@ def _checkResolve(ctxt, tensorName, tensorConstraint): def _setupTensorDimensionProducts(self, tilerModel: TilerModel, ctxt: NetworkContext, schedule: List[SubGraph]) -> TilerModel: + """Set up tensor dimension product variables in the tiler model. + + Adds variables representing the number of elements in each tensor + to the constraint model for each pattern in the schedule. + + Parameters + ---------- + tilerModel : TilerModel + The constraint model to update. + ctxt : NetworkContext + Network context containing buffer information. + schedule : List[SubGraph] + List of computation patterns in the schedule. + + Returns + ------- + TilerModel + Updated tiler model with tensor dimension variables. + + Notes + ----- + Only processes tensors that are marked for deployment in the context. + """ for idx, pattern in enumerate(schedule): subGraph = gs.Graph(nodes = pattern) @@ -517,6 +889,33 @@ def _setupTensorDimensionProducts(self, tilerModel: TilerModel, ctxt: NetworkCon def _setupGeometricConstraints(self, tilerModel: TilerModel, ctxt: NetworkContext, schedule: List[SubGraph], layerBinding: OrderedDict[str, ONNXLayer]) -> TilerModel: + """Set up geometric constraints for each layer in the schedule. + + Adds geometric and policy constraints from each layer's tile constraint + specification to the tiler model. + + Parameters + ---------- + tilerModel : TilerModel + The constraint model to update. + ctxt : NetworkContext + Network context containing buffer information. + schedule : List[SubGraph] + List of computation patterns in the schedule. + layerBinding : OrderedDict[str, ONNXLayer] + Mapping from node names to their layer implementations. + + Returns + ------- + TilerModel + Updated tiler model with geometric constraints. + + Notes + ----- + Each pattern is treated as a decoupled sub-problem with respect to + geometric constraints. Dimension variables are regenerated for each + tensor using the copyIdx mechanism. 
+ """ # SCHEREMO: Each pattern is a decoupled sub-problem w.r.t the geometric constraints. # We need to regenerate dimension variables for each tensor @@ -542,6 +941,30 @@ def _setupGeometricConstraints(self, tilerModel: TilerModel, ctxt: NetworkContex return tilerModel def _setupHeuristics(self, tilerModel: TilerModel, ctxt: NetworkContext, schedule: List[SubGraph]) -> TilerModel: + """Set up optimization heuristics for the tiler model. + + Adds optimization objectives to maximize memory usage efficiency + for each pattern in the schedule. + + Parameters + ---------- + tilerModel : TilerModel + The constraint model to update. + ctxt : NetworkContext + Network context containing buffer information. + schedule : List[SubGraph] + List of computation patterns in the schedule. + + Returns + ------- + TilerModel + Updated tiler model with optimization objectives. + + Notes + ----- + Creates pattern-level memory size variables and adds maximization + objectives to encourage efficient memory utilization. + """ for idx, pattern in enumerate(schedule): @@ -581,6 +1004,34 @@ def _setupMemoryConstraints( self, tilerModel: TilerModel, ctxt: NetworkContext, schedule: List[SubGraph], layerBinding: OrderedDict[str, ONNXLayer], targetMemoryLevelMapping: TargetMemoryLevelMapping) -> Tuple[TilerModel, List[PatternMemoryConstraints]]: + """Set up memory constraints for the tiling optimization. + + Generates memory constraints for both inner and outer memory levels, + considering the memory hierarchy and scheduling requirements. + + Parameters + ---------- + tilerModel : TilerModel + The constraint model to update. + ctxt : NetworkContext + Network context containing buffer information. + schedule : List[SubGraph] + List of computation patterns in the schedule. + layerBinding : OrderedDict[str, ONNXLayer] + Mapping from node names to their layer implementations. + targetMemoryLevelMapping : TargetMemoryLevelMapping + Mapping defining which memory levels to use for each tensor. 
+ + Returns + ------- + Tuple[TilerModel, List[PatternMemoryConstraints]] + Updated tiler model and list of all memory constraints. + + Notes + ----- + Sets up both outer (inter-pattern) and inner (intra-pattern) memory + constraints, considering the chosen memory allocation strategy. + """ allMemoryConstraints = self._generateAllMemoryConstraints(tilerModel, ctxt, schedule, layerBinding, targetMemoryLevelMapping) @@ -621,6 +1072,34 @@ def _generateAllMemoryConstraints( self, tilerModel: TilerModel, ctxt: NetworkContext, schedule: List[SubGraph], layerBinding: OrderedDict[str, ONNXLayer], targetMemoryLevelMapping: TargetMemoryLevelMapping) -> List[PatternMemoryConstraints]: + """Generate all memory constraints combining dynamic and constant tensors. + + Creates comprehensive memory constraints by combining dynamic tensor + constraints with constant tensor constraints for each pattern. + + Parameters + ---------- + tilerModel : TilerModel + The constraint model. + ctxt : NetworkContext + Network context containing buffer information. + schedule : List[SubGraph] + List of computation patterns in the schedule. + layerBinding : OrderedDict[str, ONNXLayer] + Mapping from node names to their layer implementations. + targetMemoryLevelMapping : TargetMemoryLevelMapping + Mapping defining which memory levels to use for each tensor. + + Returns + ------- + List[PatternMemoryConstraints] + Complete list of memory constraints for all patterns. + + Notes + ----- + Combines results from _generateMemoryConstraints to create the complete + constraint set including both variable and constant buffers. 
+ """ dynamicTensorConstraints, constantTensorConstraints = self._generateMemoryConstraints( tilerModel, ctxt, schedule, layerBinding, targetMemoryLevelMapping) @@ -641,6 +1120,39 @@ def _generateMemoryConstraints( self, tilerModel: TilerModel, ctxt: NetworkContext, schedule: List[SubGraph], layerBinding: OrderedDict[str, ONNXLayer], targetMemoryLevelMapping: TargetMemoryLevelMapping ) -> Tuple[List[PatternMemoryConstraints], NodeMemoryConstraint]: + """Generate memory constraints for variable and constant buffers. + + Creates detailed memory constraints including outer/inner variable + buffer constraints, tiled tensor constraints, and constant buffer + constraints. + + Parameters + ---------- + tilerModel : TilerModel + The constraint model. + ctxt : NetworkContext + Network context containing buffer information. + schedule : List[SubGraph] + List of computation patterns in the schedule. + layerBinding : OrderedDict[str, ONNXLayer] + Mapping from node names to their layer implementations. + targetMemoryLevelMapping : TargetMemoryLevelMapping + Mapping defining which memory levels to use for each tensor. + + Returns + ------- + Tuple[List[PatternMemoryConstraints], NodeMemoryConstraint] + Tuple containing: + - List of pattern memory constraints for dynamic tensors + - Node memory constraint for constant buffers + + Notes + ----- + Generates three levels of constraints: + 1. First-level: global buffers + higher-level tensors + 2. Tiled tensor constraints with double buffering + 3. In-place tensor constraints for unkilled tensors + """ # SCHEREMO: Construct non-double-buffered constraints of local variable buffers @@ -703,6 +1215,38 @@ def _generateMemoryConstraints( def _generateTilePath(self, tilerModel: TilerModel, ctxt: NetworkContext, tensorMemoryConstraint: TensorMemoryConstraint, pattern: SubGraph) -> TensorMemoryConstraint: + """Generate tiling path for a tensor across memory hierarchy levels. 
+ + Creates memory constraints for a tensor that needs to move between + different levels of the memory hierarchy, including multi-buffering. + + Parameters + ---------- + tilerModel : TilerModel + The constraint model. + ctxt : NetworkContext + Network context containing buffer information. + tensorMemoryConstraint : TensorMemoryConstraint + Original tensor memory constraint with multiple levels. + pattern : SubGraph + The computation pattern using this tensor. + + Returns + ------- + TensorMemoryConstraint + Updated tensor memory constraint with complete tiling path. + + Raises + ------ + AssertionError + If the tensor constraint doesn't have exactly 2 memory levels, + or if the multi-buffer factor is invalid. + + Notes + ----- + Uses breadth-first search to find the path between memory levels + and applies multi-buffering strategy at each intermediate level. + """ assert len(tensorMemoryConstraint.memoryConstraints.keys() ) == 2, "Can't generate a tile path for more than one hierarchy level!" @@ -736,6 +1280,34 @@ def _generateTilePath(self, tilerModel: TilerModel, ctxt: NetworkContext, def _generateIntermediateTilingSteps(self, tilerModel: TilerModel, ctxt: NetworkContext, sourceStep: NodeMemoryConstraint, destinationStep: NodeMemoryConstraint, pattern: SubGraph) -> NodeMemoryConstraint: + """Generate intermediate tiling steps between source and destination constraints. + + Creates tiling constraints for tensors that need to move between different + memory levels within a computation pattern. + + Parameters + ---------- + tilerModel : TilerModel + The constraint model. + ctxt : NetworkContext + Network context containing buffer information. + sourceStep : NodeMemoryConstraint + Memory constraints for the source step. + destinationStep : NodeMemoryConstraint + Memory constraints for the destination step. + pattern : SubGraph + The computation pattern being analyzed. + + Returns + ------- + NodeMemoryConstraint + Memory constraints for intermediate tiling steps. 
+ + Notes + ----- + Identifies tensors that require tiling (those with multiple memory + constraints) and generates appropriate tiling paths for them. + """ tileConstraintStep = NodeMemoryConstraint() mergedStep = sourceStep + destinationStep @@ -755,6 +1327,39 @@ def _generateTilePathConstraints(self, tilerModel: TilerModel, ctxt: NetworkCont sourceConstraints: List[PatternMemoryConstraints], destinationConstraints: List[PatternMemoryConstraints], schedule: List[SubGraph]) -> List[PatternMemoryConstraints]: + """Generate tiling path constraints for all patterns in the schedule. + + Creates comprehensive tiling constraints by combining source and destination + constraints for each pattern and applying I/O buffer strategies. + + Parameters + ---------- + tilerModel : TilerModel + The constraint model. + ctxt : NetworkContext + Network context containing buffer information. + sourceConstraints : List[PatternMemoryConstraints] + Source memory constraints for each pattern. + destinationConstraints : List[PatternMemoryConstraints] + Destination memory constraints for each pattern. + schedule : List[SubGraph] + List of computation patterns in the schedule. + + Returns + ------- + List[PatternMemoryConstraints] + Complete tiling path constraints for all patterns. + + Raises + ------ + AssertionError + If source pattern constraints are not single-step. + + Notes + ----- + Assumes source patterns are constant and single-step since they + represent tensors that are live throughout the pattern execution. + """ tileConstraints = [] @@ -781,6 +1386,26 @@ def _generateTilePathConstraints(self, tilerModel: TilerModel, ctxt: NetworkCont return tileConstraints def _generateBufferConstraints(self, ctxt: NetworkContext) -> NodeMemoryConstraint: + """Generate memory constraints for constant global buffers. + + Creates memory constraints for all constant buffers that are marked + for deployment in the network context. 
+ + Parameters + ---------- + ctxt : NetworkContext + Network context containing buffer information. + + Returns + ------- + NodeMemoryConstraint + Memory constraints for all constant global buffers. + + Notes + ----- + Only processes constant buffers with _deploy flag set to True. + Each buffer is treated as an input tensor in the constraints. + """ constantGlobalConstraint: NodeMemoryConstraint = NodeMemoryConstraint() constantGlobalBuffers = [ @@ -805,6 +1430,37 @@ def _generateVariableBufferConstraints( self, tilerModel: TilerModel, ctxt: NetworkContext, schedule: List[SubGraph], layerBinding: OrderedDict[str, ONNXLayer], targetMemoryLevelMapping: TargetMemoryLevelMapping ) -> Tuple[List[PatternMemoryConstraints], List[PatternMemoryConstraints]]: + """Generate memory constraints for variable buffers using flow analysis. + + Performs liveness analysis on the computation graph to determine + memory requirements for variable buffers at different points in execution. + + Parameters + ---------- + tilerModel : TilerModel + The constraint model. + ctxt : NetworkContext + Network context containing buffer information. + schedule : List[SubGraph] + List of computation patterns in the schedule. + layerBinding : OrderedDict[str, ONNXLayer] + Mapping from node names to their layer implementations. + targetMemoryLevelMapping : TargetMemoryLevelMapping + Mapping defining which memory levels to use for each tensor. + + Returns + ------- + Tuple[List[PatternMemoryConstraints], List[PatternMemoryConstraints]] + Tuple containing: + - Outer memory constraints (inter-pattern) + - Inner memory constraints (intra-pattern) + + Notes + ----- + Uses graph flow analysis to compute liveness information and generates + both outer (pattern-level) and inner (step-level) memory constraints. + Includes transient buffer constraints for each computation step. 
+ """ def deltaFlow( patternFlow: List[GenericFlowState[TensorMemLevelTuple]]) -> GenericFlowState[TensorMemLevelTuple]: @@ -877,6 +1533,35 @@ def deltaFlow( def _generatePatternStepTransientBufferConstraints( self, tilerModel: TilerModel, ctxt: NetworkContext, layerBinding: OrderedDict[str, ONNXLayer], step: gs.Node, targetMemoryLevelMapping: TargetMemoryLevelMapping) -> NodeMemoryConstraint: + """Generate memory constraints for transient buffers in a pattern step. + + Computes memory requirements for temporary buffers needed during + the execution of a single computation step. + + Parameters + ---------- + tilerModel : TilerModel + The constraint model. + ctxt : NetworkContext + Network context containing buffer information. + layerBinding : OrderedDict[str, ONNXLayer] + Mapping from node names to their layer implementations. + step : gs.Node + The computation node being analyzed. + targetMemoryLevelMapping : TargetMemoryLevelMapping + Mapping defining which memory levels to use for each tensor. + + Returns + ------- + NodeMemoryConstraint + Memory constraints for transient buffers in this step. + + Notes + ----- + Transient buffers are assumed to be allocated in the same memory + level as the main input of the computation step. Buffer sizes are + computed using the layer template's transient buffer size calculation. + """ patternStepTransientBufferSizes = NodeMemoryConstraint() @@ -907,6 +1592,26 @@ def _generatePatternStepTransientBufferConstraints( return patternStepTransientBufferSizes def assertLayerWiseTiling(self, schedule: List[List[gs.Node]]) -> bool: + """Assert that the schedule uses layer-wise tiling (one node per pattern). + + Verifies that each pattern in the schedule contains exactly one node, + which is required for certain memory allocation strategies. + + Parameters + ---------- + schedule : List[List[gs.Node]] + The execution schedule to validate. + + Returns + ------- + bool + True if all patterns contain exactly one node, False otherwise. 
+ + Notes + ----- + Layer-wise tiling is required when using the MiniMalloc memory + allocation strategy. + """ for pattern in schedule: if len(pattern) > 1: return False @@ -914,12 +1619,55 @@ def assertLayerWiseTiling(self, schedule: List[List[gs.Node]]) -> bool: return True def assertUniformMemoryLevelAllocation(self, ctxt: NetworkContext, defaultMemoryLevel: str) -> bool: + """Assert that all local buffers are allocated to the default memory level. + + Verifies that all local buffers in the network context are assigned + to the specified default memory level. + + Parameters + ---------- + ctxt : NetworkContext + Network context containing buffer information. + defaultMemoryLevel : str + Name of the default memory level to check against. + + Returns + ------- + bool + True if all local buffers use the default memory level, False otherwise. + + Notes + ----- + Uniform memory level allocation is required when using the MiniMalloc + memory allocation strategy. + """ for buffer in ctxt.localObjects.values(): if buffer._memoryLevel != defaultMemoryLevel: return False return True def testTilingSolutionCorrectness(self, tilingSolution: TilingSolution) -> None: + """Test the correctness of a computed tiling solution. + + Validates that buffer sizes in the tiling solution are properly + aligned according to memory alignment requirements. + + Parameters + ---------- + tilingSolution : TilingSolution + The tiling solution to validate. + + Raises + ------ + AssertionError + If any buffer is not properly aligned or if multi-buffer + coefficients are not integers. + + Notes + ----- + Checks that all allocated buffers meet the byte alignment requirements + specified in MemoryScheduler.byteAlignment. 
+ """ # LMACAN: Assert buffer sizes are word aligned as per comment in MemoryScheduler.py:MemoryScheduler._buildCostVector() byteAlignment = MemoryScheduler.byteAlignment for patternMemoryConstraint in tilingSolution: @@ -934,6 +1682,32 @@ def testTilingSolutionCorrectness(self, tilingSolution: TilingSolution) -> None: def testMemoryMapCorrectness(self, memoryMap: Dict[str, List[List[MemoryBlock]]], graph: gs.Graph, schedule: Schedule) -> None: + """Test the correctness of a computed memory map. + + Validates that the memory map correctly represents buffer lifetimes + and ensures all required buffers are alive when needed. + + Parameters + ---------- + memoryMap : Dict[str, List[List[MemoryBlock]]] + The memory map to validate. + graph : gs.Graph + The computation graph. + schedule : Schedule + The execution schedule. + + Raises + ------ + AssertionError + If output buffers are not alive until the end, input buffers + are not alive at the beginning, or required buffers are not + alive during computation steps. + + Notes + ----- + Performs comprehensive validation of buffer lifetimes to ensure + the memory map is consistent with the computation requirements. + """ memoryBlockMap = { memoryBlock.name: memoryBlock for levelMemoryMap in memoryMap.values() for memoryBlock in levelMemoryMap[-1] @@ -960,12 +1734,48 @@ def testMemoryMapCorrectness(self, memoryMap: Dict[str, List[List[MemoryBlock]]] class TilerDeployerWrapper(NetworkDeployerWrapper): + """Wrapper for network deployers that adds tiling capabilities. + + Extends NetworkDeployerWrapper to provide automatic tiling and memory + management for neural network deployment on memory-constrained hardware. + + Parameters + ---------- + deployer : Union[MemoryLevelAwareDeployer, MemoryDeployerWrapper] + The base deployer to wrap with tiling capabilities. + tilerCls : Type[Tiler], optional + The tiler class to use, by default Tiler. 
+ + Attributes + ---------- + tiler : Tiler + The tiler instance used for memory optimization. + + Raises + ------ + AssertionError + If the platform is not a MemoryPlatform or MemoryPlatformWrapper. + + Notes + ----- + The wrapper automatically handles tiling setup, constraint solving, + and memory allocation during the binding process. + """ def __init__(self, deployer: Union[MemoryLevelAwareDeployer, MemoryDeployerWrapper], tilerCls: Type[Tiler] = Tiler, testName: Optional[str] = None, workDir: Optional[str] = None): + """Initialize the tiler deployer wrapper. + + Parameters + ---------- + deployer : Union[MemoryLevelAwareDeployer, MemoryDeployerWrapper] + The base deployer to wrap. + tilerCls : Type[Tiler], optional + The tiler class to instantiate, by default Tiler. + """ super().__init__(deployer) assert isinstance(self.Platform, (MemoryPlatform, MemoryPlatformWrapper)), \ f"Platform should be a MemoryPlatform or MemoryPlatformWrapper! Got {type(self.Platform).__name__}" @@ -973,9 +1783,56 @@ def __init__(self, @property def worstCaseBufferSize(self): + """Get the worst-case buffer sizes including inputs and outputs. + + Computes the total worst-case memory requirements including + both tiled buffers and input/output buffers. + + Returns + ------- + Dict[str, int] + Dictionary mapping memory level names to their total worst-case + buffer sizes in bytes. + + Notes + ----- + Extends the tiler's worst-case buffer size calculation by adding + the memory requirements of input and output buffers. + """ return self.tiler.worstCaseBufferSize def tile(self, tilingSolution: Optional[TilingSolution] = None, memoryMap: Optional[MemoryMap] = None): + """Perform tiling and memory allocation for the network. + + Executes the complete tiling process including constraint setup, + optimization, memory allocation, and code generation updates. 
+ + Parameters + ---------- + tilingSolution : Optional[TilingSolution], optional + Pre-computed tiling solution to use instead of computing one. + If None, the solution will be computed automatically. + memoryMap : Optional[MemoryMap], optional + Pre-computed memory map to use instead of computing one. + If None, the memory map will be computed automatically. + + Raises + ------ + AssertionError + If only one of tilingSolution or memoryMap is provided, + if MiniMalloc is used with non-layer-wise tiling, + or if tensors are not uniformly allocated when using MiniMalloc. + + Notes + ----- + When using MiniMalloc memory allocation strategy, additional + constraints apply: + - Only layer-wise execution is supported + - All tensors must be in the default memory level + + The method performs validation of the computed solutions and + updates the execution blocks with tiling information. + """ assert (tilingSolution is None and memoryMap is None) or (tilingSolution is not None and memoryMap is not None), \ "You need to provide both the manual tilingSolution and the memoryMap to override tiling." @@ -1022,6 +1879,21 @@ def tile(self, tilingSolution: Optional[TilingSolution] = None, memoryMap: Optio # SCHEREMO: Code generation STUB def bind(self): + """Bind the network with automatic tiling. + + Performs the complete binding process including layer binding + and automatic tiling optimization. + + Returns + ------- + bool + True if binding was successful, False otherwise. + + Notes + ----- + Calls the parent bind() method first, then performs tiling + if the initial binding was successful. + """ if not super().bind(): return False @@ -1046,9 +1918,35 @@ def _printMemorySummary(self): def TilingReadyNodeBindings(nodeBindings: List[NodeBinding], tileConstraint: TileConstraint) -> List[NodeBinding]: - ''' - Apply the TillingReadyNodeTemplate to the template of each NodeBinding. - ''' + """Apply tiling constraints to a list of node bindings. 
+ + Creates deep copies of the provided node bindings and attaches the + specified tile constraint to each binding's template. + + Parameters + ---------- + nodeBindings : List[NodeBinding] + List of node bindings to make tiling-ready. + tileConstraint : TileConstraint + The tile constraint to attach to each binding. + + Returns + ------- + List[NodeBinding] + List of node bindings with tiling constraints attached. + + Notes + ----- + The function creates deep copies to avoid modifying the original + node bindings. Each template in the copied bindings gets the + tileConstraint attribute set. + + Examples + -------- + >>> bindings = [binding1, binding2, binding3] + >>> constraint = MyTileConstraint() + >>> tiling_bindings = TilingReadyNodeBindings(bindings, constraint) + """ nodeBindingsCopy = copy.deepcopy(nodeBindings) #.copy() for binding in nodeBindingsCopy: binding.template.tileConstraint = tileConstraint diff --git a/Deeploy/TilingExtension/TilingCodegen.py b/Deeploy/TilingExtension/TilingCodegen.py index 0974fa337b..945aaa971d 100644 --- a/Deeploy/TilingExtension/TilingCodegen.py +++ b/Deeploy/TilingExtension/TilingCodegen.py @@ -16,18 +16,94 @@ @dataclass class MemoryTransfer(): + """ + Represents a memory transfer operation between two memory levels. + + This dataclass encapsulates the source and destination memory constraints + for a memory transfer operation in the tiling system, defining where data + is transferred from and to in the memory hierarchy. + + Attributes + ---------- + source : MemoryConstraint + The source memory constraint defining the memory level data is + transferred from. + destination : MemoryConstraint + The destination memory constraint defining the memory level data is + transferred to. + + Notes + ----- + This class is used in conjunction with memory hierarchies to define + data movement patterns during tiled neural network execution. 
+ """ source: MemoryConstraint destination: MemoryConstraint @dataclass class HyperRectangle(): + """ + Represents a multi-dimensional rectangular region in tensor space. + + A HyperRectangle defines a rectangular tile or region within a + multi-dimensional tensor, specified by its position (offset) and + dimensions (size) in each axis. This is fundamental for tiled + processing of tensors where operations are performed on smaller + rectangular chunks. + + Attributes + ---------- + offset : Tuple[int, ...] + Position of the hyperrectangle in feature map space. Each element + represents the starting index along the corresponding dimension. + dims : Tuple[int, ...] + Size of the hyperrectangle along each dimension. Each element + represents the extent of the rectangle in the corresponding dimension. + + Parameters + ---------- + offset : Tuple[int, ...] + Starting position of the rectangle in multi-dimensional space. + dims : Tuple[int, ...] + Dimensions/size of the rectangle in multi-dimensional space. + + Raises + ------ + AssertionError + If the offset and dims tuples have different lengths. + + Notes + ----- + The offset and dims must have the same rank (number of dimensions). + This ensures the hyperrectangle is well-defined in the tensor space. + + Examples + -------- + >>> rect = HyperRectangle((0, 5), (10, 15)) + >>> # Creates a 2D rectangle starting at (0,5) with size 10x15 + """ # position of the hyperrectangle in feature map space offset: Tuple[int, ...] # size of the hyperrectangle dims: Tuple[int, ...] def __init__(self, offset: Tuple[int, ...], dims: Tuple[int, ...]): + """ + Initialize a HyperRectangle with given offset and dimensions. + + Parameters + ---------- + offset : Tuple[int, ...] + Starting position of the rectangle in multi-dimensional space. + dims : Tuple[int, ...] + Dimensions/size of the rectangle in multi-dimensional space. + + Raises + ------ + AssertionError + If offset and dims have mismatching dimensions. 
+ """ assert len(offset) == len( dims), f"HyperRectangle offset and dims for mismatching dimensions {offset} and {dims}" @@ -37,10 +113,58 @@ def __init__(self, offset: Tuple[int, ...], dims: Tuple[int, ...]): @dataclass class AbsoluteHyperRectangle: + """ + Represents a HyperRectangle with an absolute offset in memory space. + + This class combines a HyperRectangle with an absolute memory offset, + providing both the logical tensor coordinates and the physical memory + location. This is useful for tracking tiles that have been positioned + in specific memory locations during tiling operations. + + Attributes + ---------- + rectangle : HyperRectangle + The hyperrectangle defining the logical tensor region. + absoluteOffset : Tuple[int, ...] + The absolute offset in memory space where this rectangle is located. + + Parameters + ---------- + rectangle : HyperRectangle + The hyperrectangle to associate with the absolute offset. + absoluteOffset : Tuple[int, ...] + The absolute position in memory space. + + Raises + ------ + AssertionError + If the absoluteOffset and rectangle.offset have mismatching dimensions. + + Notes + ----- + The absoluteOffset must have the same dimensionality as the rectangle's + offset to ensure consistent coordinate mapping between logical and physical + memory spaces. + """ rectangle: HyperRectangle absoluteOffset: Tuple[int, ...] def __init__(self, rectangle: HyperRectangle, absoluteOffset: Tuple[int, ...]): + """ + Initialize an AbsoluteHyperRectangle with rectangle and absolute offset. + + Parameters + ---------- + rectangle : HyperRectangle + The hyperrectangle defining the logical tensor region. + absoluteOffset : Tuple[int, ...] + The absolute position in memory space. + + Raises + ------ + AssertionError + If absoluteOffset and rectangle.offset have mismatching dimensions. 
+ """ assert len(absoluteOffset) == len( rectangle.offset ), f"AsoluteHyperRectangle's absoluteOffset and rectangle's offset for mismatching dimensions {absoluteOffset} and {rectangle.offset}" @@ -51,6 +175,46 @@ def __init__(self, rectangle: HyperRectangle, absoluteOffset: Tuple[int, ...]): @dataclass class TilingSchedule(): + """ + Represents a complete schedule for tiled execution of neural network operations. + + A TilingSchedule defines how data should be loaded, processed, and stored + during tiled execution. It specifies the memory offsets for input and output + tensors, as well as the hyperrectangles that define which regions of data + are processed in each tiling step. + + Attributes + ---------- + inputBaseOffsets : Dict[str, List[int]] + Dictionary mapping tensor names to lists of base memory offsets for + input tiles. Each list should have length equal to the number of tiles. + outputBaseOffsets : Dict[str, List[int]] + Dictionary mapping tensor names to lists of base memory offsets for + output tiles. Each list should have length equal to the number of tiles. + inputLoadSchedule : List[Dict[str, HyperRectangle]] + List of dictionaries, one per tile, mapping tensor names to the + hyperrectangles that should be loaded as input for that tile. + outputLoadSchedule : List[Dict[str, HyperRectangle]] + List of dictionaries, one per tile, mapping tensor names to the + hyperrectangles that should be stored as output for that tile. + + Parameters + ---------- + inputBaseOffsets : Dict[str, List[int]] + Input tensor base offsets for each tile. + outputBaseOffsets : Dict[str, List[int]] + Output tensor base offsets for each tile. + inputLoadSchedule : List[Dict[str, HyperRectangle]] + Input loading schedule for each tile. + outputLoadSchedule : List[Dict[str, HyperRectangle]] + Output storing schedule for each tile. + + Notes + ----- + The lengths of inputLoadSchedule and outputLoadSchedule should typically + be equal, representing the same number of tiles. 
Each schedule step + corresponds to processing one tile of the operation. + """ # the places to store input tiles # Should have length numTiles inputBaseOffsets: Dict[str, List[int]] @@ -70,6 +234,27 @@ class TilingSchedule(): def __init__(self, inputBaseOffsets: Dict[str, List[int]], outputBaseOffsets: Dict[str, List[int]], inputLoadSchedule: List[Dict[str, HyperRectangle]], outputLoadSchedule: List[Dict[str, HyperRectangle]]): + """ + Initialize a TilingSchedule with specified offsets and load schedules. + + Parameters + ---------- + inputBaseOffsets : Dict[str, List[int]] + Input tensor base offsets for each tile. + outputBaseOffsets : Dict[str, List[int]] + Output tensor base offsets for each tile. + inputLoadSchedule : List[Dict[str, HyperRectangle]] + Input loading schedule for each tile. + outputLoadSchedule : List[Dict[str, HyperRectangle]] + Output storing schedule for each tile. + + Raises + ------ + AssertionError + If any key from inputBaseOffsets is missing from a schedule step + in inputLoadSchedule, or if any key from outputBaseOffsets is + missing from a schedule step in outputLoadSchedule. + """ # assert len(inputLoadSchedule) == len(outputLoadSchedule), "Didn't get equal amount of input and output tiles!" @@ -100,6 +285,30 @@ def __repr__(self) -> str: return outStr def __add__(self, other: TilingSchedule) -> TilingSchedule: + """ + Concatenate two TilingSchedule objects. + + Combines this tiling schedule with another by concatenating their + load schedules while maintaining the same base offsets. This is + useful for creating composite tiling schedules from multiple stages. + + Parameters + ---------- + other : TilingSchedule + The other TilingSchedule to concatenate with this one. + + Returns + ------- + TilingSchedule + A new TilingSchedule containing the concatenated load schedules + from both input schedules. 
+ + Raises + ------ + AssertionError + If the other object is not a TilingSchedule, or if the tensor + keys don't match between the two schedules. + """ assert isinstance(other, TilingSchedule), f"Other {other} is not a TilingSchedule" @@ -124,10 +333,60 @@ def __add__(self, other: TilingSchedule) -> TilingSchedule: @dataclass class VariableReplacementScheme(): + """ + Defines how variables should be replaced with tile-specific values. + + This class manages the replacement of scalar variables with arrays of + tile-specific values during tiled execution. It tracks both the per-tile + replacement values and the corresponding data types for each variable. + + Attributes + ---------- + perTileReplacements : Dict[str, List] + Dictionary mapping variable names to lists of replacement values, + one value per tile. Each list should have length equal to the + number of tiles. + replacementTypes : Dict[str, Type[Pointer]] + Dictionary mapping variable names to their corresponding pointer + types for the replacement arrays. + + Parameters + ---------- + perTileReplacements : Dict[str, List] + Per-tile replacement values for each variable. + replacementTypes : Dict[str, Type[Pointer]] + Type information for each replacement variable. + + Raises + ------ + AssertionError + If the keys in perTileReplacements and replacementTypes don't match + exactly, or if they have different numbers of entries. + + Notes + ----- + This scheme is used to replace compile-time constants with runtime + arrays during tiled execution, enabling different values for each tile. + """ perTileReplacements: Dict[str, List] replacementTypes: Dict[str, Type[Pointer]] def __init__(self, perTileReplacements: Dict[str, List], replacementTypes: Dict[str, Type[Pointer]]): + """ + Initialize a VariableReplacementScheme with replacements and types. + + Parameters + ---------- + perTileReplacements : Dict[str, List] + Per-tile replacement values for each variable. 
+ replacementTypes : Dict[str, Type[Pointer]] + Type information for each replacement variable. + + Raises + ------ + AssertionError + If the keys don't match exactly or have different counts. + """ assert len(perTileReplacements.keys()) == len( replacementTypes.keys()), "Exactly all replacements must have one type" @@ -138,6 +397,29 @@ def __init__(self, perTileReplacements: Dict[str, List], replacementTypes: Dict[ self.replacementTypes = replacementTypes def __add__(self, other: VariableReplacementScheme) -> VariableReplacementScheme: + """ + Concatenate two VariableReplacementScheme objects. + + Combines this replacement scheme with another by concatenating their + per-tile replacement lists. This is useful for merging replacement + schemes from multiple tiling stages. + + Parameters + ---------- + other : VariableReplacementScheme + The other VariableReplacementScheme to concatenate with this one. + + Returns + ------- + VariableReplacementScheme + A new VariableReplacementScheme with concatenated replacement lists. + + Raises + ------ + AssertionError + If the other object is not a VariableReplacementScheme, or if + the variable keys don't match between the two schemes. + """ assert isinstance(other, VariableReplacementScheme), f"Other {other} is not a VariableReplacementScheme" @@ -161,6 +443,33 @@ def __add__(self, other: VariableReplacementScheme) -> VariableReplacementScheme def minimizeVariableReplacement( scheme: VariableReplacementScheme, operatorRepresentation: OperatorRepresentation) -> Tuple[VariableReplacementScheme, Dict]: + """ + Optimize a variable replacement scheme by eliminating constant replacements. + + Analyzes the replacement scheme and removes variables that have the same + value across all tiles, directly setting them in the operator representation + instead. This optimization reduces memory usage and improves performance. 
+ + Parameters + ---------- + scheme : VariableReplacementScheme + The original variable replacement scheme to optimize. + operatorRepresentation : OperatorRepresentation + The operator representation that will be updated with constant values. + + Returns + ------- + Tuple[VariableReplacementScheme, Dict] + A tuple containing: + - The minimized VariableReplacementScheme with only non-constant variables + - A dictionary of updates to apply to the operator representation + + Notes + ----- + Variables with identical values across all tiles are considered constants + and are removed from the replacement scheme. Their single value is set + directly in the operator representation. + """ newPerTileRep = {} newRepTypes = {} @@ -175,6 +484,48 @@ def minimizeVariableReplacement( def minimizeRectangle(rect: HyperRectangle, referenceShape: Sequence[int]) -> Tuple[HyperRectangle, Tuple[int, ...]]: + """ + Minimize a hyperrectangle by collapsing dimensions where possible. + + Reduces the dimensionality of a hyperrectangle by merging consecutive + dimensions where the rectangle spans the entire reference shape. This + optimization is useful for memory transfers and reduces complexity. + + Parameters + ---------- + rect : HyperRectangle + The hyperrectangle to minimize. + referenceShape : Sequence[int] + The shape of the reference tensor that the rectangle is within. + + Returns + ------- + Tuple[HyperRectangle, Tuple[int, ...]] + A tuple containing: + - The minimized HyperRectangle with collapsed dimensions + - The corresponding minimized reference shape + + Raises + ------ + AssertionError + If the rectangle offset is non-zero when dimensions match the + reference shape (indicating the rectangle spans the full dimension). + + Notes + ----- + Dimensions are collapsed from right to left. When a rectangle dimension + equals the reference dimension and has zero offset, it can be merged + with adjacent dimensions to reduce the overall rank. 
+ + Example + ------- + >>> rect = HyperRectangle((0, 0), (2, 2)) + >>> minimizeRectangle(rect, (4, 4)) + (HyperRectangle(offset=(0, 0), dims=(2, 2)), (2, 4)) + >>> rect = HyperRectangle((0, 0), (2, 2)) + >>> minimizeRectangle(rect, (4, 2)) + (HyperRectangle(offset=(0,), dims=(4,)), (8,)) + """ minRectShape: List[int] = [] minRectOffset: List[int] = [] minReferenceShape: List[int] = [] @@ -200,6 +551,37 @@ def minimizeRectangle(rect: HyperRectangle, referenceShape: Sequence[int]) -> Tu def padShape(shape: Tuple[int, ...], rank: int) -> Tuple[int, ...]: + """ + Pad a shape tuple to a target rank by prepending ones. + + Extends a shape tuple to a higher dimensionality by adding leading + dimensions of size 1. This is useful for broadcasting operations + and ensuring consistent tensor ranks. + + Parameters + ---------- + shape : Tuple[int, ...] + The original shape tuple to pad. + rank : int + The target rank (number of dimensions) for the padded shape. + + Returns + ------- + Tuple[int, ...] + The padded shape tuple with leading dimensions of size 1. + + Raises + ------ + AssertionError + If the target rank is smaller than the current shape's rank. + + Examples + -------- + >>> padShape((3, 4), 4) + (1, 1, 3, 4) + >>> padShape((5,), 3) + (1, 1, 5) + """ assert rank >= len( shape), f"Cannot pad to rank smaller then shape's. Received rank: {rank}, shape rank: {len(shape)}" ret = tuple([1] * (rank - len(shape))) + shape @@ -208,6 +590,37 @@ def padShape(shape: Tuple[int, ...], rank: int) -> Tuple[int, ...]: def padOffset(offset: Tuple[int, ...], rank: int) -> Tuple[int, ...]: + """ + Pad an offset tuple to a target rank by prepending zeros. + + Extends an offset tuple to a higher dimensionality by adding leading + offset values of 0. This ensures offset tuples match the rank of + their corresponding shapes. + + Parameters + ---------- + offset : Tuple[int, ...] + The original offset tuple to pad. 
+ rank : int + The target rank (number of dimensions) for the padded offset. + + Returns + ------- + Tuple[int, ...] + The padded offset tuple with leading zeros. + + Raises + ------ + AssertionError + If the target rank is smaller than the current offset's rank. + + Examples + -------- + >>> padOffset((2, 3), 4) + (0, 0, 2, 3) + >>> padOffset((5,), 3) + (0, 0, 5) + """ assert rank >= len( offset), f"Cannot pad to rank smaller then offset's. Received rank: {rank}, offset rank: {len(offset)}" ret = tuple([0] * (rank - len(offset))) + offset @@ -216,6 +629,39 @@ def padOffset(offset: Tuple[int, ...], rank: int) -> Tuple[int, ...]: def padStride(stride: Tuple[int, ...], rank: int, paddingStride: int) -> Tuple[int, ...]: + """ + Pad a stride tuple to a target rank by prepending a specified stride value. + + Extends a stride tuple to a higher dimensionality by adding leading + stride values. This is useful for maintaining consistent stride + calculations across different tensor ranks. + + Parameters + ---------- + stride : Tuple[int, ...] + The original stride tuple to pad. + rank : int + The target rank (number of dimensions) for the padded stride. + paddingStride : int + The stride value to use for padding (prepended dimensions). + + Returns + ------- + Tuple[int, ...] + The padded stride tuple with leading padding stride values. + + Raises + ------ + AssertionError + If the target rank is smaller than the current stride's rank. + + Examples + -------- + >>> padStride((4, 1), 4, 16) + (16, 16, 4, 1) + >>> padStride((1,), 3, 8) + (8, 8, 1) + """ assert rank >= len( stride), f"Cannot pad to rank smaller then stride's. Received rank: {rank}, stride rank: {len(stride)}" ret = tuple([paddingStride] * (rank - len(stride))) + stride @@ -224,6 +670,36 @@ def padStride(stride: Tuple[int, ...], rank: int, paddingStride: int) -> Tuple[i def stridesFromShape(shape: Sequence[int]) -> Tuple[int, ...]: + """ + Calculate memory strides from a tensor shape. 
+ + Computes the stride values for each dimension of a tensor based on its + shape. Strides represent the number of elements to skip in memory when + moving one position along each dimension. + + Parameters + ---------- + shape : Sequence[int] + The shape of the tensor as a sequence of dimension sizes. + + Returns + ------- + Tuple[int, ...] + The stride values for each dimension, where the last dimension + has stride 1 and earlier dimensions have progressively larger strides. + + Notes + ----- + Strides are computed assuming row-major (C-style) memory layout. + The stride for dimension i is the product of all dimensions after i. + + Examples + -------- + >>> stridesFromShape([2, 3, 4]) + (12, 4, 1) + >>> stridesFromShape([5, 6]) + (6, 1) + """ strides = [1] * len(shape) for idx, dim in enumerate(reversed(shape[1:])): strides[idx + 1] = strides[idx] * dim @@ -231,18 +707,114 @@ def stridesFromShape(shape: Sequence[int]) -> Tuple[int, ...]: def calculateFlatOffset(offsets: Sequence[int], strides: Sequence[int]) -> int: + """ + Calculate the flat memory offset from multi-dimensional coordinates. + + Converts multi-dimensional tensor coordinates (offsets) to a single + flat memory offset using the provided stride information. This is + essential for translating tensor indices to memory addresses. + + Parameters + ---------- + offsets : Sequence[int] + The multi-dimensional coordinates/offsets in each dimension. + strides : Sequence[int] + The stride values for each dimension. + + Returns + ------- + int + The flat memory offset corresponding to the multi-dimensional position. + + Raises + ------ + AssertionError + If offsets and strides have different numbers of dimensions. + + Notes + ----- + The flat offset is computed as the sum of (offset[i] * stride[i]) + for all dimensions i. 
+ + Examples + -------- + >>> calculateFlatOffset([1, 2, 3], [12, 4, 1]) + 23 + >>> calculateFlatOffset([0, 1], [6, 1]) + 1 + """ assert len(offsets) == len(strides), \ f"Offsets and strides have to have the same number of dimensions. Length offsets: {len(offsets)}, strides: {len(strides)}" return sum(offset * stride for offset, stride in zip(offsets, strides)) def calculateFlatOffsetInBytes(tile: HyperRectangle, referenceBuffer: VariableBuffer) -> int: + """ + Calculate the flat memory offset in bytes for a hyperrectangle tile. + + Computes the byte offset in memory for the starting position of a + hyperrectangle tile within a reference buffer. This accounts for + both the multi-dimensional positioning and the data type size. + + Parameters + ---------- + tile : HyperRectangle + The hyperrectangle tile whose offset should be calculated. + referenceBuffer : VariableBuffer + The reference buffer containing the tile, used for shape and type info. + + Returns + ------- + int + The flat memory offset in bytes from the buffer start to the tile start. + + Notes + ----- + The calculation combines multi-dimensional offset computation with + data type width to produce a byte-level memory offset. + """ return int( calculateFlatOffset(tile.offset, stridesFromShape(referenceBuffer.shape)) * (referenceBuffer._type.referencedType.typeWidth // 8)) def computeTileHyperRectangles(memoryTransfer: MemoryTransfer) -> List[HyperRectangle]: + """ + Compute hyperrectangle tiles for a memory transfer operation. + + Generates a list of hyperrectangle tiles that partition the source tensor + into smaller chunks that fit within the destination memory constraints. + This is fundamental for tiled execution where large tensors are processed + in smaller, memory-efficient pieces. + + Parameters + ---------- + memoryTransfer : MemoryTransfer + The memory transfer operation defining source and destination constraints. 
+ + Returns + ------- + List[HyperRectangle] + A list of hyperrectangle tiles that cover the entire source tensor, + each fitting within the destination memory constraints. + + Raises + ------ + AssertionError + If source or destination shapes are undefined, if they have different + numbers of dimensions, or if any destination dimension is larger than + the corresponding source dimension. + + Notes + ----- + The tiling algorithm generates non-overlapping tiles that completely + cover the source tensor. Each tile is sized to fit within the destination + memory constraints, with edge tiles potentially being smaller to fit + exactly within the source tensor boundaries. + + The tiles are generated in row-major order, iterating through dimensions + from outermost to innermost. + """ assert memoryTransfer.source.shape is not None, "Source transfer shape cannot be undefined!" assert memoryTransfer.destination.shape is not None, "Destination transfer shape cannot be undefined!" @@ -256,6 +828,19 @@ def computeTileHyperRectangles(memoryTransfer: MemoryTransfer) -> List[HyperRect assert dimSizeSmall <= dimSizeLarge, f"smallShape[{dimIdx}] should not be bigger then largeShape[{dimIdx}]. ({dimSizeSmall} > {dimSizeLarge})" def nextTileIndex(tileIndexEnd: List[int]) -> Generator[List[int]]: + """ + Generate tile indices in row-major order. + + Parameters + ---------- + tileIndexEnd : List[int] + The end index for each dimension (exclusive). + + Yields + ------ + List[int] + Successive tile indices covering the entire index space. 
+ """ tileCount = np.prod(tileIndexEnd) tileIndex = [0] * len(tileIndexEnd) for _ in range(tileCount): From d22248d3f0a55dc401728a932f5b49a2ac3cbc78 Mon Sep 17 00:00:00 2001 From: Philip Wiese Date: Thu, 5 Feb 2026 12:57:47 +0100 Subject: [PATCH 4/9] Fix potential lost function parameter --- Deeploy/DeeployTypes.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Deeploy/DeeployTypes.py b/Deeploy/DeeployTypes.py index f5962718f0..9059b23897 100644 --- a/Deeploy/DeeployTypes.py +++ b/Deeploy/DeeployTypes.py @@ -531,11 +531,11 @@ def __init__(self, constantBuffer: Type[ConstantBuffer], structBuffer: Type[StructBuffer], transientBuffer: Type[TransientBuffer], - globalObjects = {}, - localObjects = {}, + globalObjects: Optional[OrderedDict] = None, + localObjects: Optional[OrderedDict] = None, name: str = 'DeeployNetwork'): - self.globalObjects = OrderedDict() - self.localObjects = OrderedDict() + self.globalObjects = globalObjects if globalObjects is not None else OrderedDict() + self.localObjects = localObjects if localObjects is not None else OrderedDict() self.VariableBuffer = variableBuffer self.ConstantBuffer = constantBuffer self.StructBuffer = structBuffer From 9b51f9217381340f2b10462a348a5d1fd9724821 Mon Sep 17 00:00:00 2001 From: Philip Wiese Date: Sat, 13 Sep 2025 08:34:16 +0200 Subject: [PATCH 5/9] Move MemoryAware Passes to correct place --- .../CodeTransformationPasses/Closure.py | 109 ------ .../CodeTransformationPasses/PrintInputs.py | 329 +---------------- .../CodeTransformationPasses/Closure.py | 135 +++++++ .../CodeTransformationPasses/PrintInputs.py | 348 ++++++++++++++++++ .../CodeTransformationPasses/__init__.py | 24 ++ .../MemoryLevelAnnotationPasses.py | 40 +- .../MemoryLevelAnnotationPasses.py | 64 ++++ .../Neureka/OptimizationPasses/__init__.py | 24 ++ Deeploy/Targets/PULPOpen/Bindings.py | 3 +- Deeploy/Targets/Snitch/Bindings.py | 3 +- DeeployTest/testMVP.py | 3 +- .../testPrintInputOutputTransformation.py | 5 +- 12 
files changed, 613 insertions(+), 474 deletions(-) create mode 100644 Deeploy/MemoryLevelExtension/CodeTransformationPasses/Closure.py create mode 100644 Deeploy/MemoryLevelExtension/CodeTransformationPasses/PrintInputs.py create mode 100644 Deeploy/MemoryLevelExtension/CodeTransformationPasses/__init__.py create mode 100644 Deeploy/Targets/Neureka/OptimizationPasses/MemoryLevelAnnotationPasses.py create mode 100644 Deeploy/Targets/Neureka/OptimizationPasses/__init__.py diff --git a/Deeploy/CommonExtensions/CodeTransformationPasses/Closure.py b/Deeploy/CommonExtensions/CodeTransformationPasses/Closure.py index a7579b85a0..0b36ad7b4f 100644 --- a/Deeploy/CommonExtensions/CodeTransformationPasses/Closure.py +++ b/Deeploy/CommonExtensions/CodeTransformationPasses/Closure.py @@ -353,112 +353,3 @@ def apply(self, ctxt = self._generateClosureCtxt(ctxt, name) ctxt, executionBlock = self._generateClosureCall(ctxt, executionBlock, name) return ctxt, executionBlock - - -class MemoryAwareClosureGeneration(ClosureGeneration): - """ - Memory-aware closure generation for multi-level memory hierarchies. - - This class extends ClosureGeneration to handle memory-aware closure - generation where only certain memory levels are included in the closure - arguments. It filters buffers based on their memory level, including - only those that belong to specific memory regions in the hierarchy. - - Notes - ----- - This class is useful for multi-level memory systems where different - memory levels have different access patterns and only certain levels - should be passed as closure arguments. 
Buffers are included if they: - - Have no memory level annotation - - Belong to the start region - - Do not belong to the end region (are in higher levels) - """ - - def __init__(self, - closureCallTemplate: NodeTemplate = _closureCallTemplate, - closureSuffix = "_closure", - writeback: bool = True, - generateStruct: bool = True, - startRegion: str = "L2", - endRegion: str = "L1"): - """ - Initialize the MemoryAwareClosureGeneration transformation pass. - - Parameters - ---------- - closureCallTemplate : NodeTemplate, optional - Template for generating closure function calls. Default is the - global _closureCallTemplate. - closureSuffix : str, optional - Suffix to append to closure function names. Default is "_closure". - writeback : bool, optional - Whether to generate writeback code for closure arguments. - Default is True. - generateStruct : bool, optional - Whether to generate argument structure definitions. Default is True. - startRegion : str, optional - The starting memory region to include in closures. Default is "L2". - endRegion : str, optional - The ending memory region to include in closures. Default is "L1". - """ - super().__init__(closureCallTemplate, closureSuffix, writeback, generateStruct) - self.startRegion = startRegion - self.endRegion = endRegion - - # Don't override this - def _generateClosureStruct(self, ctxt: NetworkContext, executionBlock: ExecutionBlock): - """ - Generate memory-aware closure argument structure. - - Overrides the base class method to implement memory-level filtering. - Only includes buffers that belong to appropriate memory levels based - on the configured start and end regions. - - Parameters - ---------- - ctxt : NetworkContext - The network context containing buffer information. - executionBlock : ExecutionBlock - The execution block to analyze for dynamic references. 
- - Notes - ----- - This method filters dynamic references based on memory levels: - - Includes buffers with no memory level annotation - - Includes buffers from the start region - - Includes buffers not from the end region (higher memory levels) - - The filtering logic ensures that only relevant buffers are passed - as closure arguments, reducing memory transfer overhead in - multi-level memory hierarchies. - """ - - # Add closure struct info to operatorRepresentation - closureStructArgsType = {} - closureStruct = {} - makoDynamicReferences = self.extractDynamicReferences(ctxt, executionBlock, unrollStructs = True) - - filteredMakoDynamicReferences = [] - - for ref in makoDynamicReferences: - buf = ctxt.lookup(ref) - if not hasattr(buf, "_memoryLevel") or buf._memoryLevel is None: - filteredMakoDynamicReferences.append(ref) - continue - - if buf._memoryLevel == self.startRegion or buf._memoryLevel != self.endRegion: - filteredMakoDynamicReferences.append(ref) - - for arg in list(dict.fromkeys(filteredMakoDynamicReferences)): - ref = ctxt.lookup(arg) - if isinstance(ref, TransientBuffer): - closureStructArgsType[ctxt._mangle(arg)] = PointerClass(VoidType) - elif not isinstance(ref, StructBuffer): - closureStructArgsType[ctxt._mangle(arg)] = ref._type - - if not isinstance(ref, StructBuffer): - closureStruct[ctxt._mangle(arg)] = arg - - structClass = StructClass(self.closureName + "_args_t", closureStructArgsType) - self.closureStructArgType = structClass - self.closureStructArgs = self.closureStructArgType(closureStruct, ctxt) diff --git a/Deeploy/CommonExtensions/CodeTransformationPasses/PrintInputs.py b/Deeploy/CommonExtensions/CodeTransformationPasses/PrintInputs.py index 32d249093b..87b12e5a7d 100644 --- a/Deeploy/CommonExtensions/CodeTransformationPasses/PrintInputs.py +++ b/Deeploy/CommonExtensions/CodeTransformationPasses/PrintInputs.py @@ -2,8 +2,7 @@ # # SPDX-License-Identifier: Apache-2.0 -import re -from typing import Optional, Tuple +from typing import 
Tuple from Deeploy.CommonExtensions.CodeTransformationPasses.IntrospectiveCodeTransformation import \ IntrospectiveCodeTransformationMixIn @@ -159,170 +158,6 @@ def apply(self, return ctxt, executionBlock -class MemoryAwareGeneration(): - """ - Base class for memory-aware debug printing transformations. - - This class provides memory hierarchy filtering functionality for debug - printing transformations. It allows selective printing of tensors based - on their memory level assignments, enabling focused debugging of specific - memory regions in multi-level memory architectures. - - Parameters - ---------- - memoryHierarchyRegex : str, optional - A regular expression pattern to match against buffer memory levels. - If None, only buffers without memory level annotations are included. - - Attributes - ---------- - regex : re.Pattern or None - Compiled regular expression for memory level matching, or None if - no filtering is applied. - - Notes - ----- - This class is designed to be used as a mixin with specific printing - transformation classes. It provides the `_matchesRegex` method for - filtering buffers based on their memory level assignments. - - The regex-based filtering enables fine-grained control over which - memory levels are included in debug output, which is crucial for - debugging complex memory hierarchies in embedded neural network - deployments. - """ - - def __init__(self, memoryHierarchyRegex: Optional[str] = None): - """ - Initialize the MemoryAwareGeneration base class. - - Parameters - ---------- - memoryHierarchyRegex : str, optional - A regular expression pattern to match against buffer memory levels. - If None, only buffers without memory level annotations are included. - """ - super().__init__() - if memoryHierarchyRegex is not None: - self.regex = re.compile(memoryHierarchyRegex) - else: - self.regex = None - - def _matchesRegex(self, ctxt: NetworkContext, key: str) -> bool: - """ - Check if a buffer matches the memory hierarchy regex pattern. 
- - Determines whether a buffer should be included in debug output based - on its memory level assignment and the configured regex pattern. - - Parameters - ---------- - ctxt : NetworkContext - The network context containing buffer information. - key : str - The buffer reference key to check. - - Returns - ------- - bool - True if the buffer matches the criteria and should be included - in debug output, False otherwise. - - Notes - ----- - Matching logic: - - If no regex is configured: matches buffers without memory level - - If regex is configured: matches buffers whose memory level - matches the regex pattern - - Buffers without memory level annotations don't match when - a regex is configured - """ - _buffer = ctxt.lookup(key) - - if self.regex is None: - return not hasattr(_buffer, "_memoryLevel") - - if not hasattr(_buffer, "_memoryLevel"): - return False - - ret = self.regex.findall(ctxt.lookup(key)._memoryLevel) - return ret != [] - - -class MemoryAwarePrintInputGeneration(MemoryAwareGeneration, PrintInputGeneration): - """ - Memory-aware input tensor debug printing transformation. - - This class combines MemoryAwareGeneration and PrintInputGeneration to - provide selective debug printing of input tensors based on their memory - level assignments. It's particularly useful for debugging multi-level - memory architectures where you want to focus on specific memory regions. - - The class inherits filtering capabilities from MemoryAwareGeneration and - input printing logic from PrintInputGeneration, applying memory-based - filtering before generating debug print statements. - - Notes - ----- - This transformation is especially valuable in embedded neural network - deployments with complex memory hierarchies (e.g., L1/L2/L3 cache levels, - scratchpad memories, external DRAM) where debugging specific memory - regions is crucial for performance optimization and correctness verification. 
- """ - - def apply(self, - ctxt: NetworkContext, - executionBlock: ExecutionBlock, - name: str, - verbose: CodeGenVerbosity = _NoVerbosity) -> Tuple[NetworkContext, ExecutionBlock]: - """ - Apply memory-aware input tensor printing transformation. - - Filters input tensors by memory level before adding debug print - statements, enabling focused debugging of specific memory regions. - - Parameters - ---------- - ctxt : NetworkContext - The network context containing buffer and type information. - executionBlock : ExecutionBlock - The execution block to instrument with filtered input printing code. - name : str - The name of the operation being instrumented. - verbose : CodeGenVerbosity, optional - The verbosity level for code generation. Default is _NoVerbosity. - - Returns - ------- - Tuple[NetworkContext, ExecutionBlock] - A tuple containing: - - The unchanged network context - - The modified execution block with filtered input print statements - - Notes - ----- - The transformation: - 1. Extracts all dynamic references from the execution block - 2. Filters references based on memory level regex matching - 3. Further filters for tensors that are inputs to this operation - 4. Adds debug print statements for qualifying tensors - """ - - references = self.extractDynamicReferences(ctxt, - executionBlock, - unrollStructs = True, - includeGlobalReferences = True) - - filteredReferences = [ref for ref in references if self._matchesRegex(ctxt, ref)] - - for ref in filteredReferences: - refDict = self._getRepDict(ctxt, ref, name) - if refDict is not None: - executionBlock.addLeft(_DebugPrintTemplate, refDict) - - return ctxt, executionBlock - - class PrintOutputGeneration(CodeTransformationPass, IntrospectiveCodeTransformationMixIn): """ Code transformation pass for generating debug print statements for output tensors. 
@@ -449,80 +284,6 @@ def apply(self, return ctxt, executionBlock -class MemoryAwarePrintOutputGeneration(MemoryAwareGeneration, PrintOutputGeneration): - """ - Memory-aware output tensor debug printing transformation. - - This class combines MemoryAwareGeneration and PrintOutputGeneration to - provide selective debug printing of output tensors based on their memory - level assignments. It enables focused debugging of output data in specific - memory regions within multi-level memory architectures. - - The class inherits filtering capabilities from MemoryAwareGeneration and - output printing logic from PrintOutputGeneration, applying memory-based - filtering before generating debug print statements for output tensors. - - Notes - ----- - This transformation is particularly valuable for verifying that output - data is correctly written to the intended memory levels in complex - memory hierarchies, and for debugging memory management issues in - embedded neural network deployments. - """ - - def apply(self, - ctxt: NetworkContext, - executionBlock: ExecutionBlock, - name: str, - verbose: CodeGenVerbosity = _NoVerbosity) -> Tuple[NetworkContext, ExecutionBlock]: - """ - Apply memory-aware output tensor printing transformation. - - Filters output tensors by memory level before adding debug print - statements, enabling focused debugging of specific memory regions. - - Parameters - ---------- - ctxt : NetworkContext - The network context containing buffer and type information. - executionBlock : ExecutionBlock - The execution block to instrument with filtered output printing code. - name : str - The name of the operation being instrumented. - verbose : CodeGenVerbosity, optional - The verbosity level for code generation. Default is _NoVerbosity. 
- - Returns - ------- - Tuple[NetworkContext, ExecutionBlock] - A tuple containing: - - The unchanged network context - - The modified execution block with filtered output print statements - - Notes - ----- - The transformation: - 1. Extracts all dynamic references from the execution block - 2. Filters references based on memory level regex matching - 3. Further filters for tensors that are outputs from this operation - 4. Adds debug print statements for qualifying tensors after execution - """ - - references = self.extractDynamicReferences(ctxt, - executionBlock, - unrollStructs = True, - includeGlobalReferences = True) - - filteredReferences = [ref for ref in references if self._matchesRegex(ctxt, ref)] - - for ref in filteredReferences: - refDict = self._getRepDict(ctxt, ref, name) - if refDict is not None: - executionBlock.addRight(_DebugPrintTemplate, refDict) - - return ctxt, executionBlock - - class PrintConstantGeneration(CodeTransformationPass, IntrospectiveCodeTransformationMixIn): """ Code transformation pass for generating debug print statements for constant tensors. @@ -590,8 +351,11 @@ def _getRepDict(self, ctxt: NetworkContext, ref: str, name: str): return {"bufferName": refbuf.name, "bufferType": _buf._type, "bufferShape": _buf.shape, "nodeName": name} - def apply(self, ctxt: NetworkContext, executionBlock: ExecutionBlock, - name: str) -> Tuple[NetworkContext, ExecutionBlock]: + def apply(self, + ctxt: NetworkContext, + executionBlock: ExecutionBlock, + name: str, + verbose: CodeGenVerbosity = _NoVerbosity) -> Tuple[NetworkContext, ExecutionBlock]: """ Apply constant tensor printing transformation to an execution block. @@ -636,84 +400,3 @@ def apply(self, ctxt: NetworkContext, executionBlock: ExecutionBlock, executionBlock.addLeft(_DebugPrintTemplate, rep) return ctxt, executionBlock - - -class MemoryAwarePrintConstantGeneration(MemoryAwareGeneration, PrintConstantGeneration): - """ - Memory-aware constant tensor debug printing transformation. 
- - This class combines MemoryAwareGeneration and PrintConstantGeneration to - provide selective debug printing of constant tensors based on their memory - level assignments. It enables focused debugging of constant data (weights, - biases, parameters) in specific memory regions within multi-level memory - architectures. - - The class inherits filtering capabilities from MemoryAwareGeneration and - constant printing logic from PrintConstantGeneration, applying memory-based - filtering before generating debug print statements for constant tensors. - - Notes - ----- - This transformation is particularly valuable for: - - Verifying parameter placement in specific memory levels - - Debugging weight loading and quantization in embedded deployments - - Analyzing memory usage patterns for constant data - - Troubleshooting parameter access issues in complex memory hierarchies - - It's especially useful in scenarios where different constant tensors - are placed in different memory levels for performance optimization. - """ - - def apply(self, - ctxt: NetworkContext, - executionBlock: ExecutionBlock, - name: str, - verbose: CodeGenVerbosity = _NoVerbosity) -> Tuple[NetworkContext, ExecutionBlock]: - """ - Apply memory-aware constant tensor printing transformation. - - Filters constant tensors by memory level before adding debug print - statements, enabling focused debugging of parameters in specific - memory regions. - - Parameters - ---------- - ctxt : NetworkContext - The network context containing buffer and type information. - executionBlock : ExecutionBlock - The execution block to instrument with filtered constant printing code. - name : str - The name of the operation being instrumented. - verbose : CodeGenVerbosity, optional - The verbosity level for code generation. Default is _NoVerbosity. - This parameter is currently unused by the implementation. 
- - Returns - ------- - Tuple[NetworkContext, ExecutionBlock] - A tuple containing: - - The unchanged network context - - The modified execution block with filtered constant print statements - - Notes - ----- - The transformation: - 1. Extracts all dynamic references from the execution block - 2. Filters references based on memory level regex matching - 3. Further filters for constant buffers that have users - 4. Adds debug print statements for qualifying constant tensors - """ - - references = self.extractDynamicReferences(ctxt, - executionBlock, - unrollStructs = True, - includeGlobalReferences = True) - - filteredReferences = [ref for ref in references if self._matchesRegex(ctxt, ref)] - - for ref in filteredReferences: - refDict = self._getRepDict(ctxt, ref, name) - if refDict is not None: - executionBlock.addLeft(_DebugPrintTemplate, refDict) - - return ctxt, executionBlock diff --git a/Deeploy/MemoryLevelExtension/CodeTransformationPasses/Closure.py b/Deeploy/MemoryLevelExtension/CodeTransformationPasses/Closure.py new file mode 100644 index 0000000000..ac219c6bae --- /dev/null +++ b/Deeploy/MemoryLevelExtension/CodeTransformationPasses/Closure.py @@ -0,0 +1,135 @@ +# ---------------------------------------------------------------------- +# +# File: Closure.py +# +# Copyright (C) 2023, ETH Zurich and University of Bologna. +# +# Author: Moritz Scherer, ETH Zurich +# +# ---------------------------------------------------------------------- +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +from Deeploy.AbstractDataTypes import PointerClass, StructClass, VoidType +from Deeploy.CommonExtensions.CodeTransformationPasses.Closure import ClosureGeneration, _closureCallTemplate +from Deeploy.DeeployTypes import ExecutionBlock, NetworkContext, NodeTemplate, StructBuffer, TransientBuffer + + +class MemoryAwareClosureGeneration(ClosureGeneration): + """ + Memory-aware closure generation for multi-level memory hierarchies. + + This class extends ClosureGeneration to handle memory-aware closure + generation where only certain memory levels are included in the closure + arguments. It filters buffers based on their memory level, including + only those that belong to specific memory regions in the hierarchy. + + Notes + ----- + This class is useful for multi-level memory systems where different + memory levels have different access patterns and only certain levels + should be passed as closure arguments. Buffers are included if they: + - Have no memory level annotation + - Belong to the start region + - Do not belong to the end region (are in higher levels) + """ + + def __init__(self, + closureCallTemplate: NodeTemplate = _closureCallTemplate, + closureSuffix = "_closure", + writeback: bool = True, + generateStruct: bool = True, + startRegion: str = "L2", + endRegion: str = "L1"): + """ + Initialize the MemoryAwareClosureGeneration transformation pass. + + Parameters + ---------- + closureCallTemplate : NodeTemplate, optional + Template for generating closure function calls. Default is the + global _closureCallTemplate. + closureSuffix : str, optional + Suffix to append to closure function names. Default is "_closure". + writeback : bool, optional + Whether to generate writeback code for closure arguments. + Default is True. + generateStruct : bool, optional + Whether to generate argument structure definitions. Default is True. 
+ startRegion : str, optional + The starting memory region to include in closures. Default is "L2". + endRegion : str, optional + The ending memory region to include in closures. Default is "L1". + """ + super().__init__(closureCallTemplate, closureSuffix, writeback, generateStruct) + self.startRegion = startRegion + self.endRegion = endRegion + + # Don't override this + def _generateClosureStruct(self, ctxt: NetworkContext, executionBlock: ExecutionBlock): + """ + Generate memory-aware closure argument structure. + + Overrides the base class method to implement memory-level filtering. + Only includes buffers that belong to appropriate memory levels based + on the configured start and end regions. + + Parameters + ---------- + ctxt : NetworkContext + The network context containing buffer information. + executionBlock : ExecutionBlock + The execution block to analyze for dynamic references. + + Notes + ----- + This method filters dynamic references based on memory levels: + - Includes buffers with no memory level annotation + - Includes buffers from the start region + - Includes buffers not from the end region (higher memory levels) + + The filtering logic ensures that only relevant buffers are passed + as closure arguments, reducing memory transfer overhead in + multi-level memory hierarchies. 
+ """ + + # Add closure struct info to operatorRepresentation + closureStructArgsType = {} + closureStruct = {} + makoDynamicReferences = self.extractDynamicReferences(ctxt, executionBlock, unrollStructs = True) + + filteredMakoDynamicReferences = [] + + for ref in makoDynamicReferences: + buf = ctxt.lookup(ref) + if not hasattr(buf, "_memoryLevel") or buf._memoryLevel is None: + filteredMakoDynamicReferences.append(ref) + continue + + if buf._memoryLevel == self.startRegion or buf._memoryLevel != self.endRegion: + filteredMakoDynamicReferences.append(ref) + + for arg in list(dict.fromkeys(filteredMakoDynamicReferences)): + ref = ctxt.lookup(arg) + if isinstance(ref, TransientBuffer): + closureStructArgsType[ctxt._mangle(arg)] = PointerClass(VoidType) + elif not isinstance(ref, StructBuffer): + closureStructArgsType[ctxt._mangle(arg)] = ref._type + + if not isinstance(ref, StructBuffer): + closureStruct[ctxt._mangle(arg)] = arg + + structClass = StructClass(self.closureName + "_args_t", closureStructArgsType) + self.closureStructArgType = structClass + self.closureStructArgs = self.closureStructArgType(closureStruct, ctxt) diff --git a/Deeploy/MemoryLevelExtension/CodeTransformationPasses/PrintInputs.py b/Deeploy/MemoryLevelExtension/CodeTransformationPasses/PrintInputs.py new file mode 100644 index 0000000000..ccd459d6c7 --- /dev/null +++ b/Deeploy/MemoryLevelExtension/CodeTransformationPasses/PrintInputs.py @@ -0,0 +1,348 @@ +# ---------------------------------------------------------------------- +# +# File: PrintInput.py +# +# Copyright (C) 2023, ETH Zurich and University of Bologna. +# +# Author: Moritz Scherer, ETH Zurich +# +# ---------------------------------------------------------------------- +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import re +from typing import Optional, Tuple + +from Deeploy.CommonExtensions.CodeTransformationPasses.PrintInputs import PrintConstantGeneration, \ + PrintInputGeneration, PrintOutputGeneration, _DebugPrintTemplate +from Deeploy.DeeployTypes import CodeGenVerbosity, ExecutionBlock, NetworkContext, _NoVerbosity + + +class _MemoryAwareGeneration(): + """ + Base class for memory-aware debug printing transformations. + + This class provides memory hierarchy filtering functionality for debug + printing transformations. It allows selective printing of tensors based + on their memory level assignments, enabling focused debugging of specific + memory regions in multi-level memory architectures. + + Parameters + ---------- + memoryHierarchyRegex : str, optional + A regular expression pattern to match against buffer memory levels. + If None, only buffers without memory level annotations are included. + + Attributes + ---------- + regex : re.Pattern or None + Compiled regular expression for memory level matching, or None if + no filtering is applied. + + Notes + ----- + This class is designed to be used as a mixin with specific printing + transformation classes. It provides the `_matchesRegex` method for + filtering buffers based on their memory level assignments. + + The regex-based filtering enables fine-grained control over which + memory levels are included in debug output, which is crucial for + debugging complex memory hierarchies in embedded neural network + deployments. 
+ """ + + def __init__(self, memoryHierarchyRegex: Optional[str] = None): + """ + Initialize the MemoryAwareGeneration base class. + + Parameters + ---------- + memoryHierarchyRegex : str, optional + A regular expression pattern to match against buffer memory levels. + If None, only buffers without memory level annotations are included. + """ + super().__init__() + if memoryHierarchyRegex is not None: + self.regex = re.compile(memoryHierarchyRegex) + else: + self.regex = None + + def _matchesRegex(self, ctxt: NetworkContext, key: str) -> bool: + """ + Check if a buffer matches the memory hierarchy regex pattern. + + Determines whether a buffer should be included in debug output based + on its memory level assignment and the configured regex pattern. + + Parameters + ---------- + ctxt : NetworkContext + The network context containing buffer information. + key : str + The buffer reference key to check. + + Returns + ------- + bool + True if the buffer matches the criteria and should be included + in debug output, False otherwise. + + Notes + ----- + Matching logic: + - If no regex is configured: matches buffers without memory level + - If regex is configured: matches buffers whose memory level + matches the regex pattern + - Buffers without memory level annotations don't match when + a regex is configured + """ + _buffer = ctxt.lookup(key) + + if self.regex is None: + return not hasattr(_buffer, "_memoryLevel") + + if not hasattr(_buffer, "_memoryLevel"): + return False + + ret = self.regex.findall(ctxt.lookup(key)._memoryLevel) + return ret != [] + + +class MemoryAwarePrintInputGeneration(_MemoryAwareGeneration, PrintInputGeneration): + """ + Memory-aware input tensor debug printing transformation. + + This class combines MemoryAwareGeneration and PrintInputGeneration to + provide selective debug printing of input tensors based on their memory + level assignments. 
It's particularly useful for debugging multi-level + memory architectures where you want to focus on specific memory regions. + + The class inherits filtering capabilities from MemoryAwareGeneration and + input printing logic from PrintInputGeneration, applying memory-based + filtering before generating debug print statements. + + Notes + ----- + This transformation is especially valuable in embedded neural network + deployments with complex memory hierarchies (e.g., L1/L2/L3 cache levels, + scratchpad memories, external DRAM) where debugging specific memory + regions is crucial for performance optimization and correctness verification. + """ + + def apply(self, + ctxt: NetworkContext, + executionBlock: ExecutionBlock, + name: str, + verbose: CodeGenVerbosity = _NoVerbosity) -> Tuple[NetworkContext, ExecutionBlock]: + """ + Apply memory-aware input tensor printing transformation. + + Filters input tensors by memory level before adding debug print + statements, enabling focused debugging of specific memory regions. + + Parameters + ---------- + ctxt : NetworkContext + The network context containing buffer and type information. + executionBlock : ExecutionBlock + The execution block to instrument with filtered input printing code. + name : str + The name of the operation being instrumented. + verbose : CodeGenVerbosity, optional + The verbosity level for code generation. Default is _NoVerbosity. + + Returns + ------- + Tuple[NetworkContext, ExecutionBlock] + A tuple containing: + - The unchanged network context + - The modified execution block with filtered input print statements + + Notes + ----- + The transformation: + 1. Extracts all dynamic references from the execution block + 2. Filters references based on memory level regex matching + 3. Further filters for tensors that are inputs to this operation + 4. 
Adds debug print statements for qualifying tensors + """ + + references = self.extractDynamicReferences(ctxt, + executionBlock, + unrollStructs = True, + includeGlobalReferences = True) + + filteredReferences = [ref for ref in references if self._matchesRegex(ctxt, ref)] + + for ref in filteredReferences: + refDict = self._getRepDict(ctxt, ref, name) + if refDict is not None: + executionBlock.addLeft(_DebugPrintTemplate, refDict) + + return ctxt, executionBlock + + +class MemoryAwarePrintOutputGeneration(_MemoryAwareGeneration, PrintOutputGeneration): + """ + Memory-aware output tensor debug printing transformation. + + This class combines MemoryAwareGeneration and PrintOutputGeneration to + provide selective debug printing of output tensors based on their memory + level assignments. It enables focused debugging of output data in specific + memory regions within multi-level memory architectures. + + The class inherits filtering capabilities from MemoryAwareGeneration and + output printing logic from PrintOutputGeneration, applying memory-based + filtering before generating debug print statements for output tensors. + + Notes + ----- + This transformation is particularly valuable for verifying that output + data is correctly written to the intended memory levels in complex + memory hierarchies, and for debugging memory management issues in + embedded neural network deployments. + """ + + def apply(self, + ctxt: NetworkContext, + executionBlock: ExecutionBlock, + name: str, + verbose: CodeGenVerbosity = _NoVerbosity) -> Tuple[NetworkContext, ExecutionBlock]: + """ + Apply memory-aware output tensor printing transformation. + + Filters output tensors by memory level before adding debug print + statements, enabling focused debugging of specific memory regions. + + Parameters + ---------- + ctxt : NetworkContext + The network context containing buffer and type information. 
+ executionBlock : ExecutionBlock + The execution block to instrument with filtered output printing code. + name : str + The name of the operation being instrumented. + verbose : CodeGenVerbosity, optional + The verbosity level for code generation. Default is _NoVerbosity. + + Returns + ------- + Tuple[NetworkContext, ExecutionBlock] + A tuple containing: + - The unchanged network context + - The modified execution block with filtered output print statements + + Notes + ----- + The transformation: + 1. Extracts all dynamic references from the execution block + 2. Filters references based on memory level regex matching + 3. Further filters for tensors that are outputs from this operation + 4. Adds debug print statements for qualifying tensors after execution + """ + + references = self.extractDynamicReferences(ctxt, + executionBlock, + unrollStructs = True, + includeGlobalReferences = True) + + filteredReferences = [ref for ref in references if self._matchesRegex(ctxt, ref)] + + for ref in filteredReferences: + refDict = self._getRepDict(ctxt, ref, name) + if refDict is not None: + executionBlock.addRight(_DebugPrintTemplate, refDict) + + return ctxt, executionBlock + + +class MemoryAwarePrintConstantGeneration(_MemoryAwareGeneration, PrintConstantGeneration): + """ + Memory-aware constant tensor debug printing transformation. + + This class combines MemoryAwareGeneration and PrintConstantGeneration to + provide selective debug printing of constant tensors based on their memory + level assignments. It enables focused debugging of constant data (weights, + biases, parameters) in specific memory regions within multi-level memory + architectures. + + The class inherits filtering capabilities from MemoryAwareGeneration and + constant printing logic from PrintConstantGeneration, applying memory-based + filtering before generating debug print statements for constant tensors. 
+ + Notes + ----- + This transformation is particularly valuable for: + - Verifying parameter placement in specific memory levels + - Debugging weight loading and quantization in embedded deployments + - Analyzing memory usage patterns for constant data + - Troubleshooting parameter access issues in complex memory hierarchies + + It's especially useful in scenarios where different constant tensors + are placed in different memory levels for performance optimization. + """ + + def apply(self, + ctxt: NetworkContext, + executionBlock: ExecutionBlock, + name: str, + verbose: CodeGenVerbosity = _NoVerbosity) -> Tuple[NetworkContext, ExecutionBlock]: + """ + Apply memory-aware constant tensor printing transformation. + + Filters constant tensors by memory level before adding debug print + statements, enabling focused debugging of parameters in specific + memory regions. + + Parameters + ---------- + ctxt : NetworkContext + The network context containing buffer and type information. + executionBlock : ExecutionBlock + The execution block to instrument with filtered constant printing code. + name : str + The name of the operation being instrumented. + verbose : CodeGenVerbosity, optional + The verbosity level for code generation. Default is _NoVerbosity. + This parameter is currently unused by the implementation. + + Returns + ------- + Tuple[NetworkContext, ExecutionBlock] + A tuple containing: + - The unchanged network context + - The modified execution block with filtered constant print statements + + Notes + ----- + The transformation: + 1. Extracts all dynamic references from the execution block + 2. Filters references based on memory level regex matching + 3. Further filters for constant buffers that have users + 4. 
Adds debug print statements for qualifying constant tensors + """ + + references = self.extractDynamicReferences(ctxt, + executionBlock, + unrollStructs = True, + includeGlobalReferences = True) + + filteredReferences = [ref for ref in references if self._matchesRegex(ctxt, ref)] + + for ref in filteredReferences: + refDict = self._getRepDict(ctxt, ref, name) + if refDict is not None: + executionBlock.addLeft(_DebugPrintTemplate, refDict) + + return ctxt, executionBlock diff --git a/Deeploy/MemoryLevelExtension/CodeTransformationPasses/__init__.py b/Deeploy/MemoryLevelExtension/CodeTransformationPasses/__init__.py new file mode 100644 index 0000000000..272a010986 --- /dev/null +++ b/Deeploy/MemoryLevelExtension/CodeTransformationPasses/__init__.py @@ -0,0 +1,24 @@ +# ---------------------------------------------------------------------- +# +# File: __init__.py +# +# Copyright (C) 2025, ETH Zurich and University of Bologna. +# +# Author: Philip Wiese, ETH Zurich +# +# ---------------------------------------------------------------------- +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from . 
import * diff --git a/Deeploy/MemoryLevelExtension/OptimizationPasses/MemoryLevelAnnotationPasses.py b/Deeploy/MemoryLevelExtension/OptimizationPasses/MemoryLevelAnnotationPasses.py index 775f5cbfc5..7fc8e485d6 100644 --- a/Deeploy/MemoryLevelExtension/OptimizationPasses/MemoryLevelAnnotationPasses.py +++ b/Deeploy/MemoryLevelExtension/OptimizationPasses/MemoryLevelAnnotationPasses.py @@ -4,12 +4,11 @@ from typing import List, Tuple -import numpy as np import onnx_graphsurgeon as gs from Deeploy.CommonExtensions.OptimizationPasses.PassClasses import SequentialPass -from Deeploy.DeeployTypes import ConstantBuffer, NetworkContext, VariableBuffer -from Deeploy.MemoryLevelExtension.MemoryLevels import MemoryHierarchy, MemoryLevel +from Deeploy.DeeployTypes import NetworkContext, VariableBuffer +from Deeploy.MemoryLevelExtension.MemoryLevels import MemoryHierarchy class AnnotateDefaultMemoryLevel(SequentialPass): @@ -47,37 +46,4 @@ def globalBuffers(tensors: List[gs.Tensor]) -> List[VariableBuffer]: for _buffer in buffers: _buffer._memoryLevel = self.ioLevel - return ctxt, graph - - -class AnnotateNeurekaWeightMemoryLevel(SequentialPass): - - def __init__(self, neurekaEngineName: str, weightMemoryLevel: MemoryLevel): - self._weightMemoryLevel = weightMemoryLevel - self.neurekaEngineName = neurekaEngineName - super().__init__() - - def apply(self, ctxt: NetworkContext, graph: gs.Graph) -> Tuple[NetworkContext, gs.Graph]: - - def _neurekaWeightBufferSize(buffer: ConstantBuffer) -> int: - return int(np.prod(buffer.shape)) # Weights are encoded as bytes so no need to check for typeWidth - - weightMemoryOccupation = 0 - - # Current weight memory occupation - for buffer in {**ctxt.globalObjects, **ctxt.localObjects}.values(): - if hasattr(buffer, "_memoryLevel") and buffer._memoryLevel == self._weightMemoryLevel.name: - weightMemoryOccupation += _neurekaWeightBufferSize(buffer) - - neurekaNodes = [node for node in graph.nodes if node.attrs["engine"] == self.neurekaEngineName] 
- for node in neurekaNodes: - if node.op in ["Conv", "RequantizedConv"]: - - if not (ctxt.is_local(node.inputs[1].name) or ctxt.is_global(node.inputs[1].name)): - continue - - buffer = ctxt.lookup(node.inputs[1].name) - if weightMemoryOccupation + _neurekaWeightBufferSize(buffer) < self._weightMemoryLevel.size: - buffer._memoryLevel = self._weightMemoryLevel.name - weightMemoryOccupation += _neurekaWeightBufferSize(buffer) - return ctxt, graph + return ctxt, graph \ No newline at end of file diff --git a/Deeploy/Targets/Neureka/OptimizationPasses/MemoryLevelAnnotationPasses.py b/Deeploy/Targets/Neureka/OptimizationPasses/MemoryLevelAnnotationPasses.py new file mode 100644 index 0000000000..3f1c440b75 --- /dev/null +++ b/Deeploy/Targets/Neureka/OptimizationPasses/MemoryLevelAnnotationPasses.py @@ -0,0 +1,64 @@ +# ---------------------------------------------------------------------- +# +# File: MemoryLevelAnnotationPasses.py +# +# Copyright (C) 2023, ETH Zurich and University of Bologna. +# +# Author: Victor Jung, ETH Zurich +# +# ---------------------------------------------------------------------- +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from typing import Tuple + +import numpy as np +import onnx_graphsurgeon as gs + +from Deeploy.CommonExtensions.OptimizationPasses.PassClasses import SequentialPass +from Deeploy.DeeployTypes import ConstantBuffer, NetworkContext +from Deeploy.MemoryLevelExtension.MemoryLevels import MemoryLevel + + +class AnnotateNeurekaWeightMemoryLevel(SequentialPass): + + def __init__(self, neurekaEngineName: str, weightMemoryLevel: MemoryLevel): + self._weightMemoryLevel = weightMemoryLevel + self.neurekaEngineName = neurekaEngineName + super().__init__() + + def apply(self, ctxt: NetworkContext, graph: gs.Graph) -> Tuple[NetworkContext, gs.Graph]: + + def _neurekaWeightBufferSize(buffer: ConstantBuffer) -> int: + return int(np.prod(buffer.shape)) # Weights are encoded as bytes so no need to check for typeWidth + + weightMemoryOccupation = 0 + + # Current weight memory occupation + for buffer in {**ctxt.globalObjects, **ctxt.localObjects}.values(): + if hasattr(buffer, "_memoryLevel") and buffer._memoryLevel == self._weightMemoryLevel.name: + weightMemoryOccupation += _neurekaWeightBufferSize(buffer) + + neurekaNodes = [node for node in graph.nodes if node.attrs["engine"] == self.neurekaEngineName] + for node in neurekaNodes: + if node.op in ["Conv", "RequantizedConv"]: + + if not (ctxt.is_local(node.inputs[1].name) or ctxt.is_global(node.inputs[1].name)): + continue + + buffer = ctxt.lookup(node.inputs[1].name) + if weightMemoryOccupation + _neurekaWeightBufferSize(buffer) < self._weightMemoryLevel.size: + buffer._memoryLevel = self._weightMemoryLevel.name + weightMemoryOccupation += _neurekaWeightBufferSize(buffer) + return ctxt, graph diff --git a/Deeploy/Targets/Neureka/OptimizationPasses/__init__.py b/Deeploy/Targets/Neureka/OptimizationPasses/__init__.py new file mode 100644 index 0000000000..d0d2573ff1 --- /dev/null +++ b/Deeploy/Targets/Neureka/OptimizationPasses/__init__.py @@ -0,0 +1,24 @@ +# ---------------------------------------------------------------------- 
+# +# File: __init__.py +# +# Copyright (C) 2024, ETH Zurich and University of Bologna. +# +# Author: Moritz Scherer, ETH Zurich +# +# ---------------------------------------------------------------------- +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from . import * diff --git a/Deeploy/Targets/PULPOpen/Bindings.py b/Deeploy/Targets/PULPOpen/Bindings.py index 84ee2420e3..5d7b02ae62 100644 --- a/Deeploy/Targets/PULPOpen/Bindings.py +++ b/Deeploy/Targets/PULPOpen/Bindings.py @@ -6,7 +6,7 @@ from functools import partial from Deeploy.AbstractDataTypes import PointerClass -from Deeploy.CommonExtensions.CodeTransformationPasses.Closure import ClosureGeneration, MemoryAwareClosureGeneration +from Deeploy.CommonExtensions.CodeTransformationPasses.Closure import ClosureGeneration from Deeploy.CommonExtensions.CodeTransformationPasses.MemoryAllocation import ArgumentStructGeneration, \ MemoryManagementGeneration, MemoryPassthroughGeneration from Deeploy.CommonExtensions.DataTypes import FloatDataTypes, IntegerDataTypes, SignedIntegerDataTypes, float32_t, \ @@ -14,6 +14,7 @@ from Deeploy.DeeployTypes import CodeTransformation, NodeBinding, NodeTemplate from Deeploy.FutureExtension.Bindings.AutoFutureBinding import AutoFutureBinding from Deeploy.FutureExtension.CodeTransformationPasses.FutureCodeTransformation import FutureGeneration +from Deeploy.MemoryLevelExtension.CodeTransformationPasses.Closure import MemoryAwareClosureGeneration from 
Deeploy.Targets.Generic.Templates import AddTemplate, ConcatTemplate, DequantTemplate, FloatReduceSumTemplate, \ GatherTemplate, QuantTemplate, RQSiGELUTemplate, SliceTemplate, iHardswishTemplate from Deeploy.Targets.Generic.TypeCheckers import AddChecker, ConcatChecker, ConvChecker, DequantChecker, \ diff --git a/Deeploy/Targets/Snitch/Bindings.py b/Deeploy/Targets/Snitch/Bindings.py index 25b150b553..946461b984 100644 --- a/Deeploy/Targets/Snitch/Bindings.py +++ b/Deeploy/Targets/Snitch/Bindings.py @@ -5,12 +5,13 @@ from functools import partial from Deeploy.AbstractDataTypes import PointerClass -from Deeploy.CommonExtensions.CodeTransformationPasses.Closure import ClosureGeneration, MemoryAwareClosureGeneration +from Deeploy.CommonExtensions.CodeTransformationPasses.Closure import ClosureGeneration from Deeploy.CommonExtensions.CodeTransformationPasses.MemoryAllocation import ArgumentStructGeneration, \ MemoryManagementGeneration from Deeploy.CommonExtensions.DataTypes import float32_t, int8_t, int32_t, uint8_t from Deeploy.DeeployTypes import CodeTransformation, NodeBinding from Deeploy.FutureExtension.CodeTransformationPasses.FutureCodeTransformation import FutureGeneration +from Deeploy.MemoryLevelExtension.CodeTransformationPasses.Closure import MemoryAwareClosureGeneration from Deeploy.Targets.Generic.Templates import iNoNormTemplate from Deeploy.Targets.Generic.TypeCheckers import AddChecker, GEMMChecker, RQAddChecker, SoftmaxChecker, iNoNormChecker from Deeploy.Targets.Snitch.CodeTransformationPasses import SnitchClusterTiling, SnitchCoreFilterPass, \ diff --git a/DeeployTest/testMVP.py b/DeeployTest/testMVP.py index 01216984af..69e04343ff 100644 --- a/DeeployTest/testMVP.py +++ b/DeeployTest/testMVP.py @@ -26,7 +26,8 @@ from Deeploy.MemoryLevelExtension.MemoryLevels import MemoryHierarchy, MemoryLevel from Deeploy.MemoryLevelExtension.NetworkDeployers.MemoryLevelDeployer import MemoryDeployerWrapper from 
Deeploy.MemoryLevelExtension.OptimizationPasses.MemoryLevelAnnotationPasses import AnnotateDefaultMemoryLevel, \ - AnnotateIOMemoryLevel, AnnotateNeurekaWeightMemoryLevel + AnnotateIOMemoryLevel +from Deeploy.Targets.Neureka.OptimizationPasses.MemoryLevelAnnotationPasses import AnnotateNeurekaWeightMemoryLevel from Deeploy.Targets.PULPOpen.Platform import PULPClusterEngine from Deeploy.TilingExtension.TilerExtension import TilerDeployerWrapper diff --git a/DeeployTest/testPrintInputOutputTransformation.py b/DeeployTest/testPrintInputOutputTransformation.py index c8f0ee70fe..fffaf66e03 100644 --- a/DeeployTest/testPrintInputOutputTransformation.py +++ b/DeeployTest/testPrintInputOutputTransformation.py @@ -11,8 +11,9 @@ from testUtils.testRunner import TestGeneratorArgumentParser, getPaths from testUtils.typeMapping import inferTypeAndOffset -from Deeploy.CommonExtensions.CodeTransformationPasses.PrintInputs import MemoryAwarePrintInputGeneration, \ - MemoryAwarePrintOutputGeneration, PrintInputGeneration, PrintOutputGeneration +from Deeploy.CommonExtensions.CodeTransformationPasses.PrintInputs import PrintInputGeneration, PrintOutputGeneration +from Deeploy.MemoryLevelExtension.CodeTransformationPasses.PrintInputs import MemoryAwarePrintInputGeneration, \ + MemoryAwarePrintOutputGeneration from Deeploy.MemoryLevelExtension.MemoryLevels import MemoryHierarchy, MemoryLevel from Deeploy.MemoryLevelExtension.NetworkDeployers.MemoryLevelDeployer import MemoryPlatform From 22c17eabd5e97fd878d26ceb6eeca92bbac3415f Mon Sep 17 00:00:00 2001 From: Philip Wiese Date: Thu, 5 Feb 2026 10:56:31 +0100 Subject: [PATCH 6/9] Changes from rebasing --- Deeploy/TilingExtension/TilerExtension.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/Deeploy/TilingExtension/TilerExtension.py b/Deeploy/TilingExtension/TilerExtension.py index d932b22740..2186d4d4c4 100644 --- a/Deeploy/TilingExtension/TilerExtension.py +++ b/Deeploy/TilingExtension/TilerExtension.py @@ -104,6 +104,10 @@ 
def __init__(self, memoryHierarchy: MemoryHierarchy, testName: Optional[str] = N ---------- memoryHierarchy : MemoryHierarchy The memory hierarchy specification defining available memory levels. + testName : Optional[str], optional + Optional name for the test case, used for file naming. Defaults to None. + workDir : Optional[str], optional + Optional working directory for temporary files. Defaults to None. """ self.memoryHierarchy = memoryHierarchy @@ -1775,6 +1779,10 @@ def __init__(self, The base deployer to wrap. tilerCls : Type[Tiler], optional The tiler class to instantiate, by default Tiler. + testName : Optional[str], optional + Optional name for the test case, used for file naming. Defaults to None. + workDir : Optional[str], optional + Optional working directory for temporary files. Defaults to None. """ super().__init__(deployer) assert isinstance(self.Platform, (MemoryPlatform, MemoryPlatformWrapper)), \ From 934f63c162b1171780b575fb3341e59205b61337 Mon Sep 17 00:00:00 2001 From: Philip Wiese Date: Thu, 5 Feb 2026 11:21:59 +0100 Subject: [PATCH 7/9] Update Readme --- CHANGELOG.md | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 9f4e08ba71..7929cbf8f1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,7 @@ This file contains the changelog for the Deeploy project. The changelog is divid ### List of Pull Requests +- Improve Docstring and Debugging [#160](https://github.com/pulp-platform/Deeploy/pull/160) - Extend Codeowners [#164](https://github.com/pulp-platform/Deeploy/pull/164) - Support for MaxPool1D and RQSConv1D for PULPOpen [#146](https://github.com/pulp-platform/Deeploy/pull/146) - Use Pre-Commit in CI [#159](https://github.com/pulp-platform/Deeploy/pull/159) @@ -12,10 +13,16 @@ This file contains the changelog for the Deeploy project. 
The changelog is divid - Update CLI interface Across Project, Fix Tutorial, and Remove Legacy Test [#157](https://github.com/pulp-platform/Deeploy/pull/157) ### Added +- Add many missing docstrings +- Add `__repr__()` function for `_ReferenceBuffer` class - Add integer MaxPool1D for Generic platform and RQSConv1D support for PULPOpen, with corresponding kernel tests. - Added GAP9 Platform Support: Deployer, Bindings, Templates, Tiler, DMA (L3Dma/MchanDma), target library, CI workflows ### Changed +- Move `MemoryAwareClosureGeneration` pass to `MemoryLevelExtension` +- Move `MemoryAwarePrint*` passes to `MemoryLevelExtension` +- Make `sizeInBytes` a class property instead of a function +- Move `AnnotateNeurekaWeightMemoryLevel` to `Neureka` specific folder - Switch CI to use pre-commit for linting - Update `pulp-nnx` and `pulp-nn-mixed` submodules to their latest versions - PULP-NN moved to TargetLibraries third-party folder From 19d1a9b56d3db51566b79181dd69c18bbe4694fe Mon Sep 17 00:00:00 2001 From: Philip Wiese Date: Thu, 12 Feb 2026 20:55:41 +0100 Subject: [PATCH 8/9] Fix CodeRabbit Feedback --- Deeploy/CommonExtensions/DataTypes.py | 4 ++-- Deeploy/DeeployTypes.py | 4 ---- .../CodeTransformationPasses/PrintInputs.py | 2 +- 3 files changed, 3 insertions(+), 7 deletions(-) diff --git a/Deeploy/CommonExtensions/DataTypes.py b/Deeploy/CommonExtensions/DataTypes.py index e479b92704..cdb8a8eb13 100644 --- a/Deeploy/CommonExtensions/DataTypes.py +++ b/Deeploy/CommonExtensions/DataTypes.py @@ -112,7 +112,7 @@ def minimalIntegerType(value: Union[int, Iterable[int], npt.NDArray]) -> Type[In Parameters ---------- - values : Union[int, Iterable[int] + value : Union[int, Iterable[int], npt.NDArray] The list of integer values to analyze. 
Returns @@ -138,7 +138,7 @@ def minimalFloatType(value: Union[float, Iterable[float], npt.NDArray]) -> Type[ Parameters ---------- - values : Union[float, Iterable[float] + value : Union[float, Iterable[float], npt.NDArray] The list of float values to analyze. Returns diff --git a/Deeploy/DeeployTypes.py b/Deeploy/DeeployTypes.py index 9059b23897..4dc1819191 100644 --- a/Deeploy/DeeployTypes.py +++ b/Deeploy/DeeployTypes.py @@ -387,10 +387,6 @@ def __str__(self) -> str: def __repr__(self) -> str: return f'TransientBuffer: name: {self.name}, size: {self.size}' - @classmethod - def fromVariableBuffer(cls, buffer: VariableBuffer): - ret = cls(name = buffer.name, size = buffer.sizeInBytes) - @property def sizeInBytes(self) -> int: return int(self.size) diff --git a/Deeploy/MemoryLevelExtension/CodeTransformationPasses/PrintInputs.py b/Deeploy/MemoryLevelExtension/CodeTransformationPasses/PrintInputs.py index ccd459d6c7..015cd1e558 100644 --- a/Deeploy/MemoryLevelExtension/CodeTransformationPasses/PrintInputs.py +++ b/Deeploy/MemoryLevelExtension/CodeTransformationPasses/PrintInputs.py @@ -1,6 +1,6 @@ # ---------------------------------------------------------------------- # -# File: PrintInput.py +# File: PrintInputs.py # # Copyright (C) 2023, ETH Zurich and University of Bologna. 
# From a2968a2bb337aabab620077a4d2613b76d82ab5f Mon Sep 17 00:00:00 2001 From: Philip Wiese Date: Mon, 16 Feb 2026 14:27:23 +0100 Subject: [PATCH 9/9] Implement feedback from Calin --- Deeploy/CommonExtensions/DataTypes.py | 2 +- Deeploy/Logging.py | 24 +------------------ .../CodeTransformationPasses/Closure.py | 21 +--------------- .../CodeTransformationPasses/PrintInputs.py | 21 +--------------- .../CodeTransformationPasses/__init__.py | 21 +--------------- .../MemoryLevelAnnotationPasses.py | 21 +--------------- .../Neureka/OptimizationPasses/__init__.py | 21 +--------------- .../PULPOpen/Templates/ReshapeTemplate.py | 23 +----------------- 8 files changed, 8 insertions(+), 146 deletions(-) diff --git a/Deeploy/CommonExtensions/DataTypes.py b/Deeploy/CommonExtensions/DataTypes.py index cdb8a8eb13..ebd344f934 100644 --- a/Deeploy/CommonExtensions/DataTypes.py +++ b/Deeploy/CommonExtensions/DataTypes.py @@ -90,7 +90,7 @@ class float32_t(FloatImmediate): class float64_t(FloatImmediate): -    """64-bit float type with 11-bit mantissa and 52-bit exponent.""" +    """64-bit float type with 52-bit mantissa and 11-bit exponent.""" typeName = "float64_t" typeWidth = 64 typeMantissa = 52 diff --git a/Deeploy/Logging.py b/Deeploy/Logging.py index 2220e0351c..2bbdec000d 100644 --- a/Deeploy/Logging.py +++ b/Deeploy/Logging.py @@ -1,28 +1,6 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna # -# File: Logging.py -# -# Last edited: 22.08.2025 -# -# Copyright (C) 2025, ETH Zurich and University of Bologna. -# -# Author: -# - Philip Wiese, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License.
-# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. # Setup logging import logging diff --git a/Deeploy/MemoryLevelExtension/CodeTransformationPasses/Closure.py b/Deeploy/MemoryLevelExtension/CodeTransformationPasses/Closure.py index ac219c6bae..ffcd5cc206 100644 --- a/Deeploy/MemoryLevelExtension/CodeTransformationPasses/Closure.py +++ b/Deeploy/MemoryLevelExtension/CodeTransformationPasses/Closure.py @@ -1,25 +1,6 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2023 ETH Zurich and University of Bologna # -# File: Closure.py -# -# Copyright (C) 2023, ETH Zurich and University of Bologna. -# -# Author: Moritz Scherer, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
from Deeploy.AbstractDataTypes import PointerClass, StructClass, VoidType from Deeploy.CommonExtensions.CodeTransformationPasses.Closure import ClosureGeneration, _closureCallTemplate diff --git a/Deeploy/MemoryLevelExtension/CodeTransformationPasses/PrintInputs.py b/Deeploy/MemoryLevelExtension/CodeTransformationPasses/PrintInputs.py index 015cd1e558..f8deae6e4d 100644 --- a/Deeploy/MemoryLevelExtension/CodeTransformationPasses/PrintInputs.py +++ b/Deeploy/MemoryLevelExtension/CodeTransformationPasses/PrintInputs.py @@ -1,25 +1,6 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2023 ETH Zurich and University of Bologna # -# File: PrintInputs.py -# -# Copyright (C) 2023, ETH Zurich and University of Bologna. -# -# Author: Moritz Scherer, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
import re from typing import Optional, Tuple diff --git a/Deeploy/MemoryLevelExtension/CodeTransformationPasses/__init__.py b/Deeploy/MemoryLevelExtension/CodeTransformationPasses/__init__.py index 272a010986..2f43657417 100644 --- a/Deeploy/MemoryLevelExtension/CodeTransformationPasses/__init__.py +++ b/Deeploy/MemoryLevelExtension/CodeTransformationPasses/__init__.py @@ -1,24 +1,5 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2021 ETH Zurich and University of Bologna # -# File: __init__.py -# -# Copyright (C) 2025, ETH Zurich and University of Bologna. -# -# Author: Philip Wiese, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. from . import * diff --git a/Deeploy/Targets/Neureka/OptimizationPasses/MemoryLevelAnnotationPasses.py b/Deeploy/Targets/Neureka/OptimizationPasses/MemoryLevelAnnotationPasses.py index 3f1c440b75..4a1bc95b2d 100644 --- a/Deeploy/Targets/Neureka/OptimizationPasses/MemoryLevelAnnotationPasses.py +++ b/Deeploy/Targets/Neureka/OptimizationPasses/MemoryLevelAnnotationPasses.py @@ -1,25 +1,6 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2023 ETH Zurich and University of Bologna # -# File: MemoryLevelAnnotationPasses.py -# -# Copyright (C) 2023, ETH Zurich and University of Bologna. 
-# -# Author: Victor Jung, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. from typing import Tuple diff --git a/Deeploy/Targets/Neureka/OptimizationPasses/__init__.py b/Deeploy/Targets/Neureka/OptimizationPasses/__init__.py index d0d2573ff1..be436b64a3 100644 --- a/Deeploy/Targets/Neureka/OptimizationPasses/__init__.py +++ b/Deeploy/Targets/Neureka/OptimizationPasses/__init__.py @@ -1,24 +1,5 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna # -# File: __init__.py -# -# Copyright (C) 2024, ETH Zurich and University of Bologna. -# -# Author: Moritz Scherer, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. from . 
import * diff --git a/Deeploy/Targets/PULPOpen/Templates/ReshapeTemplate.py b/Deeploy/Targets/PULPOpen/Templates/ReshapeTemplate.py index a795a555ed..c37fad2ee7 100644 --- a/Deeploy/Targets/PULPOpen/Templates/ReshapeTemplate.py +++ b/Deeploy/Targets/PULPOpen/Templates/ReshapeTemplate.py @@ -1,27 +1,6 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2021 ETH Zurich and University of Bologna # -# File: ReshapeTemplate.py -# -# Last edited: 16.12.2021 -# -# Copyright (C) 2021, ETH Zurich and University of Bologna. -# -# Author: Moritz Scherer, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. from typing import Dict, List, Tuple