diff --git a/.vscode/launch.json b/.vscode/launch.json index c9238e502b..82f5390f5f 100644 --- a/.vscode/launch.json +++ b/.vscode/launch.json @@ -11,9 +11,18 @@ "program": "generateNetwork.py", "console": "integratedTerminal", "cwd": "${workspaceFolder}/DeeployTest", + "env": { + "PYTHONPATH": "${workspaceFolder}" + }, + "subProcess": true, "justMyCode": false, - "args": - "-p${input:platformUntiled} -t${input:model} ${input:additionalArgsUntiled}" + "args": [ + "-p", + "${input:platformUntiled}", + "-t", + "${input:model}", + "${input:additionalArgsUntiled}" + ] }, { "name": "Deeploy Generate Tiled", @@ -22,9 +31,18 @@ "program": "testMVP.py", "console": "integratedTerminal", "cwd": "${workspaceFolder}/DeeployTest", + "env": { + "PYTHONPATH": "${workspaceFolder}" + }, + "subProcess": true, "justMyCode": false, - "args": - "-p${input:platformTiled} -t${input:model} ${input:additionalArgsTiled}" + "args": [ + "-p", + "${input:platformTiled}", + "-t", + "${input:model}", + "${input:additionalArgsTiled}" + ] } ], "inputs": [ @@ -85,7 +103,7 @@ "id": "additionalArgsTiled", "type": "promptString", "description": "Additional Arguments", - "default": "-v --doublebuffer" + "default": "--doublebuffer" } ] -} \ No newline at end of file +} diff --git a/Deeploy/CommonExtensions/CodeTransformationPasses/MemoryAllocation.py b/Deeploy/CommonExtensions/CodeTransformationPasses/MemoryAllocation.py index 609a179c7b..5ca16d2e1f 100644 --- a/Deeploy/CommonExtensions/CodeTransformationPasses/MemoryAllocation.py +++ b/Deeploy/CommonExtensions/CodeTransformationPasses/MemoryAllocation.py @@ -275,7 +275,8 @@ def apply(self, ctxt._maxDynamicSize[levels] = max(ctxt._maxDynamicSize.get(levels, 0), ctxt._dynamicSize[levels]) for buffer in inputs + transients: - assert buffer._live == True, f"Tried to deallocate already dead buffer {buffer.name}" + if buffer._live == False: + continue buffer._live = False # Don't deallocate if it's an alias of a live buffer if not buffer.has_live_aliases(ctxt): @@ -362,8 +363,8 @@ def apply(self, ctxt._maxDynamicSize[levels] = max(ctxt._maxDynamicSize.get(levels, 0), ctxt._dynamicSize[levels]) for buffer in inputs + transients: - assert buffer._live == True, f"Tried to deallocate already dead buffer {buffer.name}" - + if buffer._live == False: + continue memoryLevel = "None" if not hasattr(buffer, "_memoryLevel") else buffer._memoryLevel if memoryLevel not in ctxt._dynamicSize: ctxt._dynamicSize[memoryLevel] = 0 diff --git a/Deeploy/DeeployTypes.py b/Deeploy/DeeployTypes.py index de5a66aae9..858f0e1cdc 100644 --- a/Deeploy/DeeployTypes.py +++ b/Deeploy/DeeployTypes.py @@ -63,6 +63,37 @@ class CodeGenVerbosity: _backendPostParsingFilename = 'backend_post_parsing' _backendPostBindingFilename = 'backend_post_binding' + +def _deeployTypeToNpType(ty: Type[BaseType]): + + def _broadcastInteger(ty: Type[IntegerImmediate]): + if ty.signed: + return np.dtype(getattr(np, "int" + str(ty.typeWidth))) + else: + return np.dtype(getattr(np, "uint" + str(ty.typeWidth))) + + def _broadcastFloat(ty: Type[FloatImmediate]): + if ty.typeWidth == 16: + return np.dtype(np.float16) + if ty.typeWidth == 32: + return np.dtype(np.float32) + if ty.typeWidth == 64: + return np.dtype(np.float64) + return np.dtype(np.float32) + + if issubclass(ty, Pointer) and hasattr(ty, "referencedType"): + if issubclass(ty.referencedType, IntegerImmediate): + return _broadcastInteger(ty.referencedType) + if issubclass(ty.referencedType, FloatImmediate): + return _broadcastFloat(ty.referencedType) + elif issubclass(ty, IntegerImmediate): + return _broadcastInteger(ty) + elif issubclass(ty, FloatImmediate): + return _broadcastFloat(ty) + + return None + + _ctxtExtension = '.pkl' _graphExtension = '.onnx' _dataExtension = '.data' @@ -415,7 +446,12 @@ def __eq__(self, other): def _valueString(self) -> str: values = list(self.values.reshape(-1)) if self._type.typeName == 'float32_t*': - strValues = [f'{value}f' for value in values] + strValues = [] + for value in values: + literal = f"{float(value):.9g}" + if "e" not in literal and "." not in literal: + literal += ".0" + strValues.append(literal + "f") elif self._type.typeName == 'int8_t*': strValues = [f'{int(value)}' for value in values] else: @@ -977,8 +1013,6 @@ def hoistConstant(self, Returns the name of the newly registed ConstantBuffer """ - assert len(constant.outputs) <= 1, f"Constant {constant.name} has more than one output" - name = name if name is not None else constant.name # LMACAN: The shape needs to be copied into a tuple for pickling to work. Don't ask me why.. @@ -2027,25 +2061,7 @@ def parse(self, ctxt: NetworkContext, default_channels_first: bool) -> Tuple[Net return ctxt, False def _broadcastToNpType(self, ty: Type[BaseType]): - - def _broadcastInteger(ty: Type[IntegerImmediate]): - if ty.signed: - return np.dtype(getattr(np, "int" + str(ty.typeWidth))) - else: - return np.dtype(getattr(np, "uint" + str(ty.typeWidth))) - - def _broadcastFloat(ty: Type[FloatImmediate]): - return np.dtype(getattr(np, "double")) - - if issubclass(ty, Pointer) and hasattr(ty, "referencedType"): - if issubclass(ty.referencedType, IntegerImmediate): - return _broadcastInteger(ty.referencedType) - elif issubclass(ty, IntegerImmediate): - return _broadcastInteger(ty) - elif issubclass(ty, FloatImmediate): - return _broadcastFloat(ty) - - return None + return _deeployTypeToNpType(ty) def typeCheck(self, ctxt: NetworkContext) -> Tuple[NetworkContext, bool]: """Invokes the mapper's typeCheck method @@ -2106,8 +2122,9 @@ def bind(self, ctxt: NetworkContext) -> Tuple[NetworkContext, bool]: elif ctxt.is_global(node.name): npType = self._broadcastToNpType(ctxt.globalObjects[node.name]._type) if isinstance(ctxt.globalObjects[node.name], ConstantBuffer): - if isinstance(node, gs.Constant): + if isinstance(node, gs.Constant) and npType is not None: node.values = node.values.astype(npType) + node.export_dtype = npType else: node.shape = ctxt.globalObjects[node.name].shape if npType is not None: @@ -2856,7 +2873,17 @@ def generateInferenceInitializationCode(self) -> str: name = node.name node.name = self.ctxt._mangle(node.name) - callStack += node.init() + + if ("TILING_CODEGEN" not in node.name and isinstance(node, VariableBuffer) and hasattr(node, "_type") + and issubclass(node._type, Pointer)): + # Local inference buffers are late-bound by the generated layer code. Initializing them to NULL keeps + # clang from flagging false-positive uninitialized reads on paths where the assignment is emitted in a + # separate closure, and marking them unused avoids noise for scratch buffers that are reserved + # generically but optimized away for a specific layer instance. + typeName = node._instance.typeName if hasattr(node, "_instance") else node._type.typeName + callStack += f"{typeName} {node.name} __attribute__((unused)) = NULL;\n" + else: + callStack += node.init() node.name = name return callStack @@ -3121,6 +3148,15 @@ def _exportGraph(self, folderPath, fileName): # VJUNG: ONNX-Graphsurgeon needs tensors to be in their export types constTensors = [tensor for tensor in self.graph.tensors().values() if isinstance(tensor, gs.Constant)] for tensor in constTensors: + if tensor.name in self.ctxt.globalObjects: + ctxtTensor = self.ctxt.globalObjects[tensor.name] + if isinstance(ctxtTensor, ConstantBuffer) and hasattr(ctxtTensor, "_type"): + npType = _deeployTypeToNpType(ctxtTensor._type) + if npType is not None: + tensor.values = tensor.values.astype(npType) + tensor.export_dtype = npType + continue + if tensor.dtype != tensor.export_dtype: tensor.values = tensor.values.astype(tensor.export_dtype) diff --git a/Deeploy/EngineExtension/OptimizationPasses/TopologyOptimizationPasses/EngineColoringPasses.py b/Deeploy/EngineExtension/OptimizationPasses/TopologyOptimizationPasses/EngineColoringPasses.py index 82b7d1fde4..fb4a647634 100644 --- a/Deeploy/EngineExtension/OptimizationPasses/TopologyOptimizationPasses/EngineColoringPasses.py +++ b/Deeploy/EngineExtension/OptimizationPasses/TopologyOptimizationPasses/EngineColoringPasses.py @@ -36,6 +36,8 @@ def apply(self, graph: gs.Graph) -> Tuple[gs.Graph]: engine = self.engineMapper.mapNodeToEngine(node, graph) if engine is not None: node.attrs["engine"] = engine.name + if hasattr(engine, "n_cores"): + node.attrs["n_cores"] = engine.n_cores return graph diff --git a/Deeploy/Targets/GAP9/DMA/L3Dma.py b/Deeploy/Targets/GAP9/DMA/L3Dma.py index adbf161328..bc04a00457 100644 --- a/Deeploy/Targets/GAP9/DMA/L3Dma.py +++ b/Deeploy/Targets/GAP9/DMA/L3Dma.py @@ -29,7 +29,7 @@ class GAP9L3Dma(AsyncDma): _transferTemplates = { 2: NodeTemplate( - "pi_cl_ram_copy_2d(get_ram_ptr(), ${ext}, ${loc}, ${transfer_size}, ${stride}, ${length}, ${ext2loc}, &${future});" + "pi_cl_ram_copy_2d(get_ram_ptr(), (uint32_t) ${ext}, ${loc}, ${transfer_size}, ${stride}, ${length}, ${ext2loc}, &${future});" ) } _waitingStrategy = PerTensorWaitingStrategy(GAP9L3DmaFuture) diff --git a/Deeploy/Targets/Generic/Bindings.py b/Deeploy/Targets/Generic/Bindings.py index 308b179aef..807f3ba5cd 100644 --- a/Deeploy/Targets/Generic/Bindings.py +++ b/Deeploy/Targets/Generic/Bindings.py @@ -15,16 +15,16 @@ ConvTransposeTemplate, DebugPrintTemplate, DequantTemplate, DummyTemplate, DWConvTemplate, FloatAddTemplate, \ FloatConvTemplate, FloatDivTemplate, FloatDWConvTemplate, FloatGELUTemplate, FloatGemmTemplate, \ FloatLayernormTemplate, FloatMatMulTemplate, FloatMaxPoolTemplate, FloatMulTemplate, FloatPadTemplate, \ - FloatPowTemplate, FloatReduceMeanTemplate, FloatReluTemplate, FloatSoftmaxTemplate, FloatSqrtTemplate, \ - GatherTemplate, GemmTemplate, IntegerDivTemplate, ITAMaxTemplate, ITAPartialMaxTemplate, MatMulTemplate, \ - MaxPoolTemplate, MulTemplate, PadTemplate, QuantTemplate, ReduceMeanTemplate, ReduceSumTemplate, \ + FloatPowTemplate, FloatReduceLogSumExpTemplate, FloatReduceMeanTemplate, FloatReluTemplate, FloatSoftmaxTemplate, \ + FloatSqrtTemplate, GatherTemplate, GemmTemplate, IntegerDivTemplate, ITAMaxTemplate, ITAPartialMaxTemplate, \ + MatMulTemplate, MaxPoolTemplate, MulTemplate, PadTemplate, QuantTemplate, ReduceMeanTemplate, ReduceSumTemplate, \ RequantShiftTemplate, ReshapeTemplate, RQIntegerDivTemplate, RQSiGELUTemplate, SliceTemplate, TransposeTemplate, \ iGELUTemplate, iLayernormTemplate, iRMSNormTemplate, iSoftmaxTemplate from Deeploy.Targets.Generic.TypeCheckers import AddChecker, BatchNormChecker, ConcatChecker, ConvChecker, \ DebugPrintChecker, DequantChecker, DivChecker, DummyChecker, GatherChecker, GELUChecker, GEMMChecker, \ - LayerNormChecker, MatMulChecker, MaxPoolChecker, MulChecker, PadChecker, QuantChecker, ReduceMeanChecker, \ - ReduceSumChecker, ReluChecker, RequantShiftChecker, ReshapeChecker, RQIntegerDivChecker, SliceChecker, \ - SoftmaxChecker, TransposeChecker + LayerNormChecker, MatMulChecker, MaxPoolChecker, MulChecker, PadChecker, QuantChecker, ReduceLogSumExpChecker, \ + ReduceMeanChecker, ReduceSumChecker, ReluChecker, RequantShiftChecker, ReshapeChecker, RQIntegerDivChecker, \ + SliceChecker, SoftmaxChecker, TransposeChecker BasicTransformer = CodeTransformation([ArgumentStructGeneration(), MemoryManagementGeneration(), FutureGeneration()]) @@ -227,6 +227,11 @@ BasicTransformer) for type in SignedIntegerDataTypes ] +BasicReduceLogSumExpBindings = [ + NodeBinding(ReduceLogSumExpChecker([PointerClass(float32_t)], [PointerClass(float32_t)]), + FloatReduceLogSumExpTemplate.referenceTemplate, BasicTransformer) +] + BasicReluBinding = NodeBinding(ReluChecker([PointerClass(float32_t)], [PointerClass(float32_t)]), FloatReluTemplate.referenceTemplate, BasicTransformer) @@ -286,6 +291,9 @@ BasicConcatBindings = [ NodeBinding(ConcatChecker([PointerClass(type), PointerClass(type)], [PointerClass(type)]), ConcatTemplate.referenceTemplate, BasicTransformer) for type in IntegerDataTypes +] + [ + NodeBinding(ConcatChecker([PointerClass(float32_t), PointerClass(float32_t)], [PointerClass(float32_t)]), + ConcatTemplate.referenceTemplate, BasicTransformer) ] BasicQuantBindings = [ @@ -312,18 +320,34 @@ for type in FloatDataTypes ] -BasicConvTransposeBindings = [ +BasicConvTranspose1DBindings = [ + NodeBinding( + ConvChecker( + [PointerClass(type), PointerClass(type), PointerClass(type)], # input, weight, bias + [PointerClass(type)]), + ConvTransposeTemplate.reference1DTemplate, + BasicTransformer) for type in FloatDataTypes +] + [ + NodeBinding( + ConvChecker( + [PointerClass(type), PointerClass(type)], # input, weight + [PointerClass(type)]), + ConvTransposeTemplate.reference1DTemplate, + BasicTransformer) for type in FloatDataTypes +] + +BasicConvTranspose2DBindings = [ NodeBinding( ConvChecker( [PointerClass(type), PointerClass(type), PointerClass(type)], # input, weight, bias [PointerClass(type)]), - ConvTransposeTemplate.referenceTemplate, + ConvTransposeTemplate.reference2DTemplate, BasicTransformer) for type in FloatDataTypes ] + [ NodeBinding( ConvChecker( [PointerClass(type), PointerClass(type)], # input, weight [PointerClass(type)]), - ConvTransposeTemplate.referenceTemplate, + ConvTransposeTemplate.reference2DTemplate, BasicTransformer) for type in FloatDataTypes ] diff --git a/Deeploy/Targets/Generic/Layers.py b/Deeploy/Targets/Generic/Layers.py index cc733937cc..2d011e21f9 100644 --- a/Deeploy/Targets/Generic/Layers.py +++ b/Deeploy/Targets/Generic/Layers.py @@ -340,6 +340,12 @@ def computeShapes(self, inputShapes: Shape, outputShapes: Shape, operatorReprese if inputShapes[1] == () or inputShapes[1] == []: inputShapes[1] = (1,) + # Scalars and singletons should broadcast to the tensor operand, + # not shrink the tensor shape to (1,). + if tuple(inputShapes[1]) == (1,): + inputShapes[1] = inputShapes[0] + return (inputShapes, outputShapes) + if len(inputShapes[0]) > len(inputShapes[1]): inputShapes[1] = inputShapes[0] else: @@ -438,6 +444,27 @@ def computeShapes(self, inputShapes: Shape, outputShapes: Shape, operatorReprese return (inputShapes, outputShapes) +class ReduceLogSumExpLayer(ONNXLayer): + + def __init__(self, maps: List[NodeMapper]): + super().__init__(maps) + + def computeShapes(self, inputShapes: Shape, outputShapes: Shape, operatorRepresentation, + channels_first) -> Tuple[Shape, Shape]: + axis = operatorRepresentation['axes'][0] + inputShape = list(copy.deepcopy(inputShapes[0])) + + if operatorRepresentation['keepdims']: + outputShape = inputShape + outputShape[axis] = 1 + else: + outputShape = inputShape[:axis] + inputShape[axis + 1:] + if len(outputShape) == 0: + outputShape = [1] + + return (inputShapes, [outputShape]) + + class ReluLayer(ONNXLayer): def __init__(self, maps: List[NodeMapper]): diff --git a/Deeploy/Targets/Generic/Parsers.py b/Deeploy/Targets/Generic/Parsers.py index ad787d9e4b..3b69f9e526 100644 --- a/Deeploy/Targets/Generic/Parsers.py +++ b/Deeploy/Targets/Generic/Parsers.py @@ -6,6 +6,7 @@ from typing import Tuple import numpy as np +import onnx import onnx_graphsurgeon as gs from Deeploy.DeeployTypes import ConstantBuffer, NetworkContext, NodeParser, VariableBuffer @@ -334,25 +335,99 @@ class PadParser(NodeParser): def __init__(self): super().__init__() + def _evaluate_constant_tensor(self, tensor: gs.Tensor): + + if isinstance(tensor, gs.Constant): + return np.asarray(tensor.values) + + if not hasattr(tensor, "inputs") or len(tensor.inputs) != 1: + return None + + node = tensor.inputs[0] + input_values = [] + for input_tensor in node.inputs: + value = self._evaluate_constant_tensor(input_tensor) + if value is None: + return None + input_values.append(value) + + if node.op == "Constant": + value = node.attrs.get("value") + if value is None: + return None + return np.asarray(value.values) + + if node.op == "Cast": + cast_dtype = onnx.helper.tensor_dtype_to_np_dtype(node.attrs["to"]) + return input_values[0].astype(cast_dtype) + + if node.op == "Reshape": + return np.reshape(input_values[0], input_values[1].astype(np.int64).tolist()) + + if node.op == "Concat": + return np.concatenate(input_values, axis = node.attrs["axis"]) + + if node.op == "Transpose": + return np.transpose(input_values[0], axes = node.attrs["perm"]) + + if node.op == "ConstantOfShape": + fill_value = node.attrs.get("value") + if fill_value is None: + scalar = np.array(0, dtype = np.float32) + else: + scalar = np.asarray(fill_value.values).reshape(-1)[0] + return np.full(input_values[0].astype(np.int64).tolist(), scalar, dtype = np.asarray(scalar).dtype) + + if node.op == "Slice": + data = input_values[0] + starts = input_values[1].astype(np.int64).reshape(-1) + ends = input_values[2].astype(np.int64).reshape(-1) + axes = input_values[3].astype(np.int64).reshape(-1) if len(input_values) >= 4 else np.arange(len(starts)) + steps = input_values[4].astype(np.int64).reshape(-1) if len(input_values) >= 5 else np.ones( + len(starts), dtype = np.int64) + + slices = [slice(None)] * data.ndim + for start, end, axis, step in zip(starts, ends, axes, steps): + slices[int(axis)] = slice(int(start), int(end), int(step)) + + return data[tuple(slices)] + + return None + def parseNode(self, node: gs.Node) -> bool: - ret = all([ - 'mode' in node.attrs, 'pads' in node.attrs, 'value' in node.attrs, - len(node.inputs) == 1, - len(node.outputs) == 1 - ]) + ret = all(['mode' in node.attrs, len(node.outputs) == 1]) if ret: self.operatorRepresentation['mode'] = node.attrs['mode'] + self.operatorRepresentation['value'] = 0 - try: - self.operatorRepresentation['pads'] = [int(p) for p in node.attrs['pads']] - except Exception as e: - self.operatorRepresentation['pads'] = node.attrs['pads'] + if 'pads' in node.attrs and len(node.inputs) == 1: + try: + self.operatorRepresentation['pads'] = [int(p) for p in node.attrs['pads']] + except Exception: + self.operatorRepresentation['pads'] = node.attrs['pads'] - self.operatorRepresentation['value'] = node.attrs['value'] + if 'value' in node.attrs: + self.operatorRepresentation['value'] = node.attrs['value'] + return True - return ret + if len(node.inputs) in (2, 3): + pads = self._evaluate_constant_tensor(node.inputs[1]) + if pads is None: + return False + + self.operatorRepresentation['pads'] = [int(p) for p in np.asarray(pads).reshape(-1)] + + if len(node.inputs) == 3: + value = self._evaluate_constant_tensor(node.inputs[2]) + if value is None: + return False + self.operatorRepresentation['value'] = np.asarray(value).reshape(-1)[0].item() + + return True + + return False def parseNodeCtxt(self, ctxt: NetworkContext, @@ -366,6 +441,16 @@ def parseNodeCtxt(self, self.operatorRepresentation['data_in_size'] = np.prod(data_in.shape) self.operatorRepresentation['data_out_size'] = np.prod(data_out.shape) + # Keep optional constant Pad inputs visible to the tiler. + # The template uses the decoded scalar/list values, but the tiled + # scheduler still tracks the original constant tensors as node inputs. + if len(node.inputs) >= 2: + pads_tensor = ctxt.lookup(node.inputs[1].name) + self.operatorRepresentation['pads_tensor'] = pads_tensor.name + if len(node.inputs) >= 3: + value_tensor = ctxt.lookup(node.inputs[2].name) + self.operatorRepresentation['value_tensor'] = value_tensor.name + return ctxt, True @@ -623,6 +708,118 @@ def parseNodeCtxt(self, return newCtxt, ret +class ReduceLogSumExpParser(NodeParser): + + def __init__(self): + super().__init__() + + def parseNode(self, node: gs.Node) -> bool: + if len(node.inputs) < 1 or len(node.inputs) > 2 or len(node.outputs) != 1: + return False + + if 'keepdims' not in node.attrs: + return False + + self.operatorRepresentation['keepdims'] = int(node.attrs['keepdims']) + if len(node.inputs[0].shape) == 0: + return False + + if len(node.inputs) == 2: + if not isinstance(node.inputs[1], gs.Constant): + return False + axes = np.array(node.inputs[1].values, dtype = np.int64) + else: + if 'axes' not in node.attrs: + return False + axes = node.attrs['axes'] + if isinstance(axes, int): + axes = [axes] + axes = np.array(axes, dtype = np.int64) + + normalized_axes = [] + rank = len(node.inputs[0].shape) + for axis in axes: + normalized_axis = int(axis) + if normalized_axis < 0: + normalized_axis += rank + normalized_axes.append(normalized_axis) + + if len(normalized_axes) != 1: + return False + + axis = normalized_axes[0] + if axis < 0 or axis >= rank: + return False + + self.operatorRepresentation['axes'] = np.array([axis], dtype = np.int64) + + if self.operatorRepresentation['keepdims']: + output_shape = list(node.inputs[0].shape) + output_shape[axis] = 1 + else: + output_shape = list(node.inputs[0].shape[:axis]) + list(node.inputs[0].shape[axis + 1:]) + if len(output_shape) == 0: + output_shape = [1] + + node.outputs[0].shape = output_shape + return True + + def parseNodeCtxt(self, + ctxt: NetworkContext, + node: gs.Node, + channels_first: bool = True) -> Tuple[NetworkContext, bool]: + + data_in = ctxt.lookup(node.inputs[0].name) + data_out = ctxt.lookup(node.outputs[0].name) + + if len(node.inputs) == 2: + axes_buffer = ctxt.lookup(node.inputs[1].name) + axes_buffer._live = False + axes_buffer._deploy = False + axes = np.array(axes_buffer.values, dtype = np.int64) + else: + axes = np.array(self.operatorRepresentation['axes'], dtype = np.int64) + + normalized_axes = [] + for axis in axes: + normalized_axis = int(axis) + if normalized_axis < 0: + normalized_axis += len(data_in.shape) + normalized_axes.append(normalized_axis) + + if len(normalized_axes) != 1: + return ctxt, False + + axis = normalized_axes[0] + if axis < 0 or axis >= len(data_in.shape): + return ctxt, False + + outer_size = int(np.prod(data_in.shape[:axis])) if axis > 0 else 1 + inner_size = int(np.prod(data_in.shape[axis + 1:])) if axis + 1 < len(data_in.shape) else 1 + if self.operatorRepresentation['keepdims']: + output_shape = list(data_in.shape) + output_shape[axis] = 1 + else: + output_shape = list(data_in.shape[:axis]) + list(data_in.shape[axis + 1:]) + if len(output_shape) == 0: + output_shape = [1] + + data_out.shape = output_shape + node.outputs[0].shape = output_shape + + self.operatorRepresentation['data_in'] = data_in.name + self.operatorRepresentation['data_out'] = data_out.name + self.operatorRepresentation['data_in_shape'] = data_in.shape + self.operatorRepresentation['data_out_shape'] = output_shape + self.operatorRepresentation['size'] = int(np.prod(data_in.shape)) + self.operatorRepresentation['axes'] = np.array([axis], dtype = np.int64) + self.operatorRepresentation['axisLength'] = data_in.shape[axis] + self.operatorRepresentation['outerSize'] = outer_size + self.operatorRepresentation['innerSize'] = inner_size + + return ctxt, True + + class SoftmaxParser(NodeParser): def __init__(self): @@ -2703,14 +2900,11 @@ def __init__(self): super().__init__() def parseNode(self, node: gs.Node) -> bool: - # Verify the attributes (epsilon is mandatory, momentum and training_mode are optional) - if 'epsilon' not in node.attrs: - return False # Common Inputs: 5 (X, scale, B, mean, var) if len(node.inputs) < 5: return False - # Save the attributes, default values are provided if not present + # Save attributes (ONNX defaults when attributes are omitted) self.operatorRepresentation['epsilon'] = node.attrs.get('epsilon', 1e-5) self.operatorRepresentation['momentum'] = node.attrs.get('momentum', 0.9) self.operatorRepresentation['training_mode'] = node.attrs.get('training_mode', 0) @@ -2728,10 +2922,27 @@ def parseNodeCtxt(self, ctxt, node: gs.Node, channels_first: bool = True): self.operatorRepresentation[outputs[0]] = ctxt.lookup(node.outputs[0].name).name input_shape = ctxt.lookup(node.inputs[0].name).shape - # Save input shape information + param_shape = ctxt.lookup(node.inputs[1].name).shape + + if len(input_shape) >= 2 and len(param_shape) >= 1: + channel_count = param_shape[0] + channels_first_match = input_shape[1] == channel_count + channels_last_match = input_shape[-1] == channel_count + + # Prefer explicit evidence from the BN parameter length when the + # default layout would otherwise be ambiguous for mixed-layout graphs. + if channels_first_match != channels_last_match: + channels_first = channels_first_match + + # BatchNorm runs on flattened [N, C, L] for channels-first and [N, L, C] + # for channels-last, where L covers all spatial positions. self.operatorRepresentation['batch_size'] = input_shape[0] - self.operatorRepresentation['channel_size'] = input_shape[1] - self.operatorRepresentation['window_size'] = input_shape[2] + self.operatorRepresentation['channel_size'] = input_shape[1] if channels_first else input_shape[-1] + if channels_first: + self.operatorRepresentation['window_size'] = int(np.prod(input_shape[2:])) + else: + self.operatorRepresentation['window_size'] = int(np.prod(input_shape[1:-1])) + self.operatorRepresentation['channels_first'] = int(channels_first) return ctxt, True @@ -2783,8 +2994,8 @@ def parseNodeCtxt(self, ctxt: NetworkContext, node: gs.Node, channels_first: boo stride_x, stride_y = 1, 1 if "strides" in node.attrs: - stride_y = node.attrs["strides"][0] - stride_x = node.attrs["strides"][1] if len(node.attrs["strides"]) > 1 else stride_y + stride_x = node.attrs["strides"][0] + stride_y = node.attrs["strides"][1] if len(node.attrs["strides"]) > 1 else stride_x self.operatorRepresentation["stride_y"] = stride_y self.operatorRepresentation["stride_x"] = stride_x @@ -2865,6 +3076,80 @@ def parseNodeCtxt(self, return ctxt, False +class ConvTranspose2DParser(ConvTransposeParser): + + def __init__(self): + super().__init__() + + def parseNode(self, node: gs.Node) -> bool: + # 2D ConvTranspose expects 4D input/output and 4D weight + wellFormed = super().parseNode(node) + ret = False + if wellFormed: + ret = all([ + # Make sure strides are 2D + len(node.attrs['strides']) == 2, + len(node.attrs['pads']) == 4, + len(node.attrs['dilations']) == 2, + ]) + if ret: + + self.operatorRepresentation['kernel_shape'] = node.attrs['kernel_shape'] + self.operatorRepresentation['dim_kernel_x'] = int(self.operatorRepresentation['kernel_shape'][0]) + self.operatorRepresentation['dim_kernel_y'] = int(self.operatorRepresentation['kernel_shape'][1]) + self.operatorRepresentation['dilation_x'] = int(self.operatorRepresentation['dilations'][0]) + self.operatorRepresentation['dilation_y'] = int(self.operatorRepresentation['dilations'][1]) + self.operatorRepresentation['padding_x'] = int(self.operatorRepresentation['pads'][0]) + self.operatorRepresentation['padding_y'] = int(self.operatorRepresentation['pads'][1]) + self.operatorRepresentation['stride_x'] = int(self.operatorRepresentation['strides'][0]) + self.operatorRepresentation['stride_y'] = int(self.operatorRepresentation['strides'][1]) + + return ret + + def parseNodeCtxt(self, + ctxt: NetworkContext, + node: gs.Node, + channels_first: bool = True) -> Tuple[NetworkContext, bool]: + + newCtxt, ret = super().parseNodeCtxt(ctxt, node, channels_first) + + if ret: + data_in = newCtxt.lookup(node.inputs[0].name) + data_out = newCtxt.lookup(node.outputs[0].name) + in_shape = data_in.shape + out_shape = data_out.shape + weight = newCtxt.lookup(node.inputs[1].name) + + if len(in_shape) != 4 or len(out_shape) != 4 or len(weight.shape) != 4: + return ctxt, False + + self.operatorRepresentation['batch'] = in_shape[0] + + if channels_first: + self.operatorRepresentation['ch_im_in'] = in_shape[1] + self.operatorRepresentation['dim_im_in_x'] = in_shape[2] + self.operatorRepresentation['dim_im_in_y'] = in_shape[3] + self.operatorRepresentation['ch_im_out'] = out_shape[1] + self.operatorRepresentation['dim_im_out_x'] = out_shape[2] + self.operatorRepresentation['dim_im_out_y'] = out_shape[3] + else: + self.operatorRepresentation['ch_im_in'] = in_shape[3] + self.operatorRepresentation['dim_im_in_x'] = in_shape[1] + self.operatorRepresentation['dim_im_in_y'] = in_shape[2] + self.operatorRepresentation['ch_im_out'] = out_shape[3] + self.operatorRepresentation['dim_im_out_x'] = out_shape[1] + self.operatorRepresentation['dim_im_out_y'] = out_shape[2] + + self.operatorRepresentation["batchOffsetIn"] = (self.operatorRepresentation["ch_im_in"] * + self.operatorRepresentation["dim_im_in_x"] * + self.operatorRepresentation["dim_im_in_y"]) + self.operatorRepresentation["batchOffsetOut"] = (self.operatorRepresentation["ch_im_out"] * + self.operatorRepresentation["dim_im_out_x"] * + self.operatorRepresentation["dim_im_out_y"]) + return newCtxt, True + return ctxt, False + + class SqrtParser(NodeParser): def __init__(self): diff --git a/Deeploy/Targets/Generic/Platform.py b/Deeploy/Targets/Generic/Platform.py index e05e897270..c0066bd78c 100644 --- a/Deeploy/Targets/Generic/Platform.py +++ b/Deeploy/Targets/Generic/Platform.py @@ -7,26 +7,27 @@ from Deeploy.DeeployTypes import ConstantBuffer, DeploymentEngine, DeploymentPlatform, NodeMapper, NodeTemplate, \ StructBuffer, TopologyOptimizer, TransientBuffer, VariableBuffer from Deeploy.Targets.Generic.Bindings import BasicAddBindings, BasicBatchNormBindings, BasicConcatBindings, \ - BasicConv1DBindings, BasicConv2DBindings, BasicConvTransposeBindings, BasicDebugPrintBindings, \ - BasicDequantBindings, BasicDivBindings, BasicDWConv1DBinding, BasicDWConv2DBindings, BasicGatherBindings, \ - BasicGELUBindings, BasicGEMMBindings, BasicITAPartialSoftmaxBinding, BasicITASoftmaxBinding, \ + BasicConv1DBindings, BasicConv2DBindings, BasicConvTranspose1DBindings, BasicConvTranspose2DBindings, \ + BasicDebugPrintBindings, BasicDequantBindings, BasicDivBindings, BasicDWConv1DBinding, BasicDWConv2DBindings, \ + BasicGatherBindings, BasicGELUBindings, BasicGEMMBindings, BasicITAPartialSoftmaxBinding, BasicITASoftmaxBinding, \ BasicLayerNormBindings, BasicMatMulBindings, BasicMaxPool1DBindings, BasicMaxPool2DBindings, BasicMulBindings, \ - BasicPad1DBindings, BasicPad2DBindings, BasicPowBindings, BasicQuantBindings, BasicReduceMeanBindings, \ - BasicReduceSumBindings, BasicReluBinding, BasicReshapeBindings, BasicRQIntegerDivBinding, BasicRQSBindings, \ - BasicRQSGELUBinding, BasicSliceBindings, BasicSoftmaxBindings, BasicSqrtBindings, BasicTransposeBindings, \ - DummyBinding + BasicPad1DBindings, BasicPad2DBindings, BasicPowBindings, BasicQuantBindings, BasicReduceLogSumExpBindings, \ + BasicReduceMeanBindings, BasicReduceSumBindings, BasicReluBinding, BasicReshapeBindings, BasicRQIntegerDivBinding, \ + BasicRQSBindings, BasicRQSGELUBinding, BasicSliceBindings, BasicSoftmaxBindings, BasicSqrtBindings, \ + BasicTransposeBindings, DummyBinding from Deeploy.Targets.Generic.Layers import AddLayer, BatchNormalizationLayer, ConcatLayer, ConvLayer, \ ConvTransposeLayer, DebugPrintLayer, DequantLayer, DivLayer, GatherLayer, GELULayer, GEMMLayer, ITAMaxLayer, \ - LayerNormLayer, MatMulLayer, MaxPoolLayer, MulLayer, PadLayer, PowLayer, QuantLayer, ReduceMeanLayer, \ - ReduceSumLayer, ReluLayer, RequantShiftLayer, ReshapeLayer, RQIntegerDivLayer, RQSiGELULayer, SliceLayer, \ - SoftmaxLayer, SqrtLayer, TransposeLayer + LayerNormLayer, MatMulLayer, MaxPoolLayer, MulLayer, PadLayer, PowLayer, QuantLayer, ReduceLogSumExpLayer, \ + ReduceMeanLayer, ReduceSumLayer, ReluLayer, RequantShiftLayer, ReshapeLayer, RQIntegerDivLayer, RQSiGELULayer, \ + SliceLayer, SoftmaxLayer, SqrtLayer, TransposeLayer from Deeploy.Targets.Generic.Parsers import AddParser, BatchNormParser, ConcatParser, ConvTranspose1DParser, \ - DebugParser, DequantParser, DivParser, DummyParser, FlattenParser, GatherParser, GELUParser, GenericConv1DParser, \ - GenericConv2DParser, GenericDWConv1DParser, GenericDWConv2DParser, GenericGEMMParser, GenericMaxPool2DParser, \ - IntegerDivParser, ITAMaxParser, ITAPartialMaxParser, LayerNormParser, MatMulParser, MaxPool1DParser, MulParser, \ - Pad1DParser, Pad2DParser, PowParser, QuantParser, ReduceMeanParser, ReduceSumParser, ReluParser, \ - RequantShiftParser, ReshapeParser, RQIntegerDivParser, RQSiGELUParser, SliceParser, SoftmaxParser, SqrtParser, \ - TransposeParser, UnsqueezeParser, iLayerNormParser, iSoftmaxParser + ConvTranspose2DParser, DebugParser, DequantParser, DivParser, DummyParser, FlattenParser, GatherParser, \ + GELUParser, GenericConv1DParser, GenericConv2DParser, GenericDWConv1DParser, GenericDWConv2DParser, \ + GenericGEMMParser, GenericMaxPool2DParser, IntegerDivParser, ITAMaxParser, ITAPartialMaxParser, LayerNormParser, \ + MatMulParser, MaxPool1DParser, MulParser, Pad1DParser, Pad2DParser, PowParser, QuantParser, ReduceLogSumExpParser, \ + ReduceMeanParser, ReduceSumParser, ReluParser, RequantShiftParser, ReshapeParser, RQIntegerDivParser, \ + RQSiGELUParser, SliceParser, SoftmaxParser, SqrtParser, TransposeParser, UnsqueezeParser, iLayerNormParser, \ + iSoftmaxParser from Deeploy.Targets.Generic.Templates import AllocateTemplate, FreeTemplate from Deeploy.Targets.Generic.TopologyOptimizationPasses.Passes import DequantPatternPass, ExtractPaddingFromConvPass, \ ExtractPaddingFromPoolPass, MatMulAddMergePass, MergeConstAddAndRequantPass, QuantPatternPass, \ @@ -58,6 +59,7 @@ Pad1DMapper = NodeMapper(Pad1DParser(), BasicPad1DBindings) Pad2DMapper = NodeMapper(Pad2DParser(), BasicPad2DBindings) ReduceMeanMapper = NodeMapper(ReduceMeanParser(), BasicReduceMeanBindings) +ReduceLogSumExpMapper = NodeMapper(ReduceLogSumExpParser(), BasicReduceLogSumExpBindings) ReduceSumMapper = NodeMapper(ReduceSumParser(), BasicReduceSumBindings) ReluMapper = NodeMapper(ReluParser(), [BasicReluBinding]) RequantShiftMapper = NodeMapper(RequantShiftParser(), BasicRQSBindings) @@ -71,7 +73,8 @@ QuantMapper = NodeMapper(QuantParser(), BasicQuantBindings) DequantMapper = NodeMapper(DequantParser(), BasicDequantBindings) BatchNormalizationMapper = NodeMapper(BatchNormParser(), BasicBatchNormBindings) -ConvTransposeMapper = NodeMapper(ConvTranspose1DParser(), BasicConvTransposeBindings) +ConvTranspose1DMapper = NodeMapper(ConvTranspose1DParser(), BasicConvTranspose1DBindings) +ConvTranspose2DMapper = NodeMapper(ConvTranspose2DParser(), BasicConvTranspose2DBindings) SliceMapper = NodeMapper(SliceParser(), BasicSliceBindings) # Dummy nodes are intended for development purposes only! @@ -105,6 +108,7 @@ 'Sqrt': SqrtLayer([SqrtMapper]), 'Pad': PadLayer([Pad1DMapper, Pad2DMapper]), 'ReduceMean': ReduceMeanLayer([ReduceMeanMapper]), + 'ReduceLogSumExp': ReduceLogSumExpLayer([ReduceLogSumExpMapper]), 'ReduceSum': ReduceSumLayer([ReduceSumMapper]), 'Relu': ReluLayer([ReluMapper]), 'RequantizediGELU': RQSiGELULayer([RQGELUMapper]), @@ -118,7 +122,7 @@ 'Quant': QuantLayer([QuantMapper]), 'Dequant': DequantLayer([DequantMapper]), 'BatchNormalization': BatchNormalizationLayer([BatchNormalizationMapper]), - 'ConvTranspose': ConvTransposeLayer([ConvTransposeMapper]) + 'ConvTranspose': ConvTransposeLayer([ConvTranspose1DMapper, ConvTranspose2DMapper]) # # For example, you can use the DummpyMapper, in case you want to test # # deployment or optimizations with GlobalAveragePool nodes but did not yet # # implement the corresponding kernel diff --git a/Deeploy/Targets/Generic/Templates/BatchNormalizationTemplate.py b/Deeploy/Targets/Generic/Templates/BatchNormalizationTemplate.py index 5377c91ca0..0233a36c96 100644 --- a/Deeploy/Targets/Generic/Templates/BatchNormalizationTemplate.py +++ b/Deeploy/Targets/Generic/Templates/BatchNormalizationTemplate.py @@ -9,7 +9,7 @@ BEGIN_SINGLE_CORE BatchNorm_fp32( ${data_in}, ${scale}, ${bias}, ${mean}, ${variance}, - ${data_out}, ${batch_size}, ${channel_size}, ${window_size} + ${data_out}, ${batch_size}, ${channel_size}, ${window_size}, ${epsilon}, ${channels_first} ); END_SINGLE_CORE """) diff --git a/Deeploy/Targets/Generic/Templates/ConvTransposeTemplate.py b/Deeploy/Targets/Generic/Templates/ConvTransposeTemplate.py index 9bf864c91f..73bb130f78 100644 --- a/Deeploy/Targets/Generic/Templates/ConvTransposeTemplate.py +++ b/Deeploy/Targets/Generic/Templates/ConvTransposeTemplate.py @@ -4,8 +4,9 @@ from Deeploy.DeeployTypes import NodeTemplate -referenceTemplate = NodeTemplate(""" +reference1DTemplate = NodeTemplate(""" <% +bias_ptr = bias if 'bias' in locals() else "NULL" batchOffsetIn = ch_im_in * dim_im_in_y batchOffsetOut = ch_im_out * dim_im_out_y %> @@ -20,7 +21,7 @@ ref_${data_out}_${data_in}, ${ch_im_in}, ${dim_im_in_y}, ${weight}, ${ch_im_out}, ${dim_kernel_y}, ${stride_y}, - ${bias}, ${has_bias}, + ${bias_ptr}, ${has_bias}, ref_${data_out}_${data_out}, ${dim_im_out_y} ); @@ -29,3 +30,30 @@ } END_SINGLE_CORE """) + +reference2DTemplate = NodeTemplate(""" +<% +bias_ptr = bias if 'bias' in locals() else "NULL" +batchOffsetIn = ch_im_in * dim_im_in_x * dim_im_in_y +batchOffsetOut = ch_im_out * dim_im_out_x * dim_im_out_y +%> + +// 2D Transposed Conv (Name: ${nodeName}, Op: ${nodeOp}) +BEGIN_SINGLE_CORE + ${data_in_type.typeName} ref_${data_out}_${data_in} = ${data_in}; + ${data_out_type.typeName} ref_${data_out}_${data_out} = ${data_out}; + + for (uint32_t n=0; n<${batch}; ++n) { + ConvTranspose2d_fp32( + ref_${data_out}_${data_in}, ${ch_im_in}, ${dim_im_in_x}, ${dim_im_in_y}, + ${weight}, ${ch_im_out}, ${dim_kernel_x}, ${dim_kernel_y}, + ${stride_x}, ${stride_y}, + ${bias_ptr}, ${has_bias}, + ref_${data_out}_${data_out}, ${dim_im_out_x}, ${dim_im_out_y} + ); + + ref_${data_out}_${data_in} += ${batchOffsetIn}; + ref_${data_out}_${data_out} += ${batchOffsetOut}; + } +END_SINGLE_CORE +""") diff --git a/Deeploy/Targets/Generic/Templates/FloatMulTemplate.py b/Deeploy/Targets/Generic/Templates/FloatMulTemplate.py index 3c8c2da501..0df2e901a0 100644 --- a/Deeploy/Targets/Generic/Templates/FloatMulTemplate.py +++ b/Deeploy/Targets/Generic/Templates/FloatMulTemplate.py @@ -8,7 +8,7 @@ // Float Mul (Name: ${nodeName}, Op: ${nodeOp}) BEGIN_SINGLE_CORE for (uint32_t i=0;i<${size};i++){ - ${C}[i] = ${A}[i] * ${B}[0]; + ${C}[i] = ${A}[i] * ${B}[${0 if sizeB == 1 else 'i'}]; } END_SINGLE_CORE """) diff --git a/Deeploy/Targets/Generic/Templates/FloatReduceLogSumExpTemplate.py b/Deeploy/Targets/Generic/Templates/FloatReduceLogSumExpTemplate.py new file mode 100644 index 0000000000..98d04cfaf6 --- /dev/null +++ b/Deeploy/Targets/Generic/Templates/FloatReduceLogSumExpTemplate.py @@ -0,0 +1,10 @@ +# SPDX-FileCopyrightText: 2026 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +from Deeploy.DeeployTypes import NodeTemplate + +referenceTemplate = NodeTemplate(""" +// ReduceLogSumExp (Name: ${nodeName}, Op: ${nodeOp}) +SINGLE_CORE ReduceLogSumExp_fp32_fp32(${data_in}, ${data_out}, ${outerSize}, ${axisLength}, ${innerSize}); +""") diff --git a/Deeploy/Targets/Generic/TileConstraints/UntiledTileConstraint.py b/Deeploy/Targets/Generic/TileConstraints/UntiledTileConstraint.py index 091cb55a41..0e24d51635 100644 --- a/Deeploy/Targets/Generic/TileConstraints/UntiledTileConstraint.py +++ b/Deeploy/Targets/Generic/TileConstraints/UntiledTileConstraint.py @@ -10,11 +10,23 @@ from Deeploy.TilingExtension.MemoryConstraints import NodeMemoryConstraint from Deeploy.TilingExtension.TileConstraint import TileConstraint from Deeploy.TilingExtension.TilerModel import TilerModel -from Deeploy.TilingExtension.TilingCodegen import AbsoluteHyperRectangle, TilingSchedule, VariableReplacementScheme +from Deeploy.TilingExtension.TilingCodegen import AbsoluteHyperRectangle, HyperRectangle, TilingSchedule, \ + VariableReplacementScheme class UntiledTileConstraint(TileConstraint): + @staticmethod + def _normalizedShape(shape) -> Tuple[int, ...]: + if isinstance(shape, int): + return (shape,) + + normalized = tuple(shape) + if len(normalized) == 0: + return (1,) + + return normalized + @staticmethod def addGeometricalConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: NetworkContext) -> TilerModel: @@ -35,7 +47,8 @@ def addGeometricalConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: Netw tilerModel.addTensorDimToModel(ctxt, tensorName) - for idx, shapeDim in enumerate(_buffer.shape): + shape = [_buffer.shape] if isinstance(_buffer.shape, int) else _buffer.shape + for idx, shapeDim in enumerate(shape): tilerModel.addConstraint(tilerModel.getTensorDimVar(tensorName = tensorName, dimIdx = idx) == shapeDim) return tilerModel @@ -53,22 +66,55 @@ def serializeTilingSolution( cls, tilingSolution: NodeMemoryConstraint, absoluteOutputCubes: List[AbsoluteHyperRectangle], targetMemLevel: str, ctxt: NetworkContext, operatorRepresentation: OperatorRepresentation) -> Tuple[VariableReplacementScheme, TilingSchedule]: - outputCubes = [cube.rectangle for cube in absoluteOutputCubes] - - schedule = TilingSchedule({}, {}, [], []) repScheme = VariableReplacementScheme({}, {}) + inputLoadSchedule: List[Dict[str, HyperRectangle]] = [{}] + outputLoadSchedule: List[Dict[str, HyperRectangle]] = [{}] + inputBaseOffsets: Dict[str, List[int]] = {} + outputBaseOffsets: Dict[str, List[int]] = {} - for key, value in tilingSolution.tensorMemoryConstraints.items(): + addrNames: List[str] = [] + for key, value in operatorRepresentation.items(): + if not isinstance(value, str): + continue + + if value not in tilingSolution.tensorMemoryConstraints: + continue - assert len(value.memoryConstraints.keys()) == 1, f"{cls} should be untiled, but {value} is tiled!" + _buffer = ctxt.lookup(value) + if isinstance(_buffer, TransientBuffer): + continue - memKey = list(value.memoryConstraints.keys())[0] - memValue = value.memoryConstraints[memKey] + addrNames.append(key) + for key, value in tilingSolution.tensorMemoryConstraints.items(): _buffer = ctxt.lookup(key) if isinstance(_buffer, TransientBuffer): continue - assert memValue.shape == tuple(_buffer.shape) + fullShape = cls._normalizedShape(_buffer.shape) + memoryConstraints = list(value.memoryConstraints.values()) + + # Untiled tensors may still be materialized on multiple memory levels when the same + # full-shape buffer is replicated along the memory path (for example, an untiled Slice + # fed by an L3-backed producer). This is not true tiling and should remain acceptable + # here. We only reject the case where different memory levels carry different shapes. + assert all(cls._normalizedShape(memValue.shape) == fullShape for memValue in memoryConstraints), \ + f"{cls} should be untiled, but {value} carries multiple shapes across memory levels!" + + if len(addrNames) > 0: + inputBaseOffsets, outputBaseOffsets = cls.extractBaseAddr(tilingSolution, targetMemLevel, + operatorRepresentation, addrNames) + + for addrName in inputBaseOffsets: + buffer = ctxt.lookup(operatorRepresentation[addrName]) + shape = cls._normalizedShape(buffer.shape) + inputLoadSchedule[0][addrName] = HyperRectangle((0,) * len(shape), shape) + + for addrName in outputBaseOffsets: + buffer = ctxt.lookup(operatorRepresentation[addrName]) + shape = cls._normalizedShape(buffer.shape) + outputLoadSchedule[0][addrName] = HyperRectangle((0,) * len(shape), shape) + + schedule = TilingSchedule(inputBaseOffsets, outputBaseOffsets, inputLoadSchedule, outputLoadSchedule) return repScheme, schedule diff --git a/Deeploy/Targets/Generic/TopologyOptimizationPasses/Passes.py b/Deeploy/Targets/Generic/TopologyOptimizationPasses/Passes.py index 146bcf699e..4841187abe 100644 --- a/Deeploy/Targets/Generic/TopologyOptimizationPasses/Passes.py +++ b/Deeploy/Targets/Generic/TopologyOptimizationPasses/Passes.py @@ -478,7 +478,12 @@ def _merge_matmul_add_fun(graph: gs.Graph, match: Match, name: str): matched_nodes = [m for k, m in match.nodes_map.items()] gemm = matched_nodes[0] add = matched_nodes[1] - _bias = add.inputs[0] if isinstance(add.inputs[0], gs.Constant) else add.inputs[1] + + constant_inputs = [inp for inp in add.inputs if isinstance(inp, gs.Constant)] + if len(constant_inputs) != 1: + return graph + + _bias = constant_inputs[0] _inputs = gemm.inputs + [_bias] _outputs = add.outputs diff --git a/Deeploy/Targets/Generic/TypeCheckers.py b/Deeploy/Targets/Generic/TypeCheckers.py index c2c8d436f8..6834ed8975 100644 --- a/Deeploy/Targets/Generic/TypeCheckers.py +++ b/Deeploy/Targets/Generic/TypeCheckers.py @@ -341,6 +341,20 @@ def _inferSignedness(self, inputs: List[VariableBuffer], return [False] +class ReduceLogSumExpChecker(SignPropTypeChecker): + + def __init__(self, input_types: Sequence[Type[Pointer]], output_types: Sequence[Type[Pointer]]): + super().__init__(input_types, output_types) + + def _inferNumLevels(self, inputs: List[VariableBuffer], + operatorRepresentation: OperatorRepresentation) -> List[int]: + return [2**(self.input_types[0].referencedType.typeWidth)] + + def _inferSignedness(self, inputs: List[VariableBuffer], + operatorRepresentation: OperatorRepresentation) -> List[bool]: + return [True] + + class ReluChecker(SignPropTypeChecker): def __init__(self, input_types: Sequence[Type[Pointer]], output_types: Sequence[Type[Pointer]]): diff --git a/Deeploy/Targets/Neureka/Engine.py b/Deeploy/Targets/Neureka/Engine.py index 2585b1a688..04233b31ce 100644 --- a/Deeploy/Targets/Neureka/Engine.py +++ b/Deeploy/Targets/Neureka/Engine.py @@ -4,6 +4,7 @@ from typing import List +import numpy as np import onnx_graphsurgeon as gs from Deeploy.DeeployTypes import DeploymentEngine, NodeMapper @@ -76,7 +77,18 @@ def isDWConv(self, node) -> bool: node.attrs['group'] != 1 and \ (node.attrs['strides'] == [1, 1] or self.enableStrides) + @staticmethod + def _isIntegerTensor(tensor: gs.Tensor) -> bool: + dtype = getattr(tensor, "dtype", None) + return dtype is not None and np.issubdtype(np.dtype(dtype), np.integer) + + def _hasSupportedTensorTypes(self, node: gs.Node) -> bool: + tensors = list(node.inputs) + list(node.outputs) + return all(self._isIntegerTensor(tensor) for tensor in tensors) + def canExecute(self, node: gs.Node) -> bool: + if not self._hasSupportedTensorTypes(node): + return False if self.enable3x3: return self.isPWConv(node) or self.isDWConv(node) or self.isDenseConv(node) else: diff --git a/Deeploy/Targets/Neureka/Platform.py b/Deeploy/Targets/Neureka/Platform.py index e83f7f20f4..af8762c595 100644 --- a/Deeploy/Targets/Neureka/Platform.py +++ b/Deeploy/Targets/Neureka/Platform.py @@ -6,6 +6,7 @@ import onnx_graphsurgeon as gs +from Deeploy.AbstractDataTypes import PointerClass, VoidType from Deeploy.CommonExtensions.OptimizationPasses.TopologyOptimizationPasses.LoweringOptimizationPasses import \ RequantizedGemmToPwPass from Deeploy.DeeployTypes import ConstantBuffer, NetworkContext, NodeTemplate, TopologyOptimizer @@ -27,6 +28,12 @@ class NeurekaConstantBuffer(ConstantBuffer): allocTemplate = NodeTemplate("") deallocTemplate = NodeTemplate("") + def __init__(self, name: str = '', shape = [1], values = [0]): + super().__init__(name, shape, values) + # Some Neureka lowering paths inspect global constants before type inference + # has populated them with the final pointer type. + self._type = PointerClass(VoidType) + def _bufferRepresentation(self): operatorRepresentation = super()._bufferRepresentation() operatorRepresentation["_memoryLevel"] = getattr(self, "_memoryLevel", None) diff --git a/Deeploy/Targets/PULPOpen/Bindings.py b/Deeploy/Targets/PULPOpen/Bindings.py index 2c78978e23..b62c5986e1 100644 --- a/Deeploy/Targets/PULPOpen/Bindings.py +++ b/Deeploy/Targets/PULPOpen/Bindings.py @@ -14,13 +14,14 @@ from Deeploy.DeeployTypes import CodeTransformation, NodeBinding, NodeTemplate from Deeploy.FutureExtension.Bindings.AutoFutureBinding import AutoFutureBinding from Deeploy.FutureExtension.CodeTransformationPasses.FutureCodeTransformation import FutureGeneration -from Deeploy.MemoryLevelExtension.CodeTransformationPasses.Closure import MemoryAwareClosureGeneration -from Deeploy.Targets.Generic.Templates import AddTemplate, ConcatTemplate, DequantTemplate, FloatReduceSumTemplate, \ - GatherTemplate, QuantTemplate, RQSiGELUTemplate, SliceTemplate, iHardswishTemplate -from Deeploy.Targets.Generic.TypeCheckers import AddChecker, ConcatChecker, ConvChecker, DequantChecker, \ - GatherChecker, GELUChecker, GEMMChecker, HardswishChecker, LayerNormChecker, MatMulChecker, MulChecker, \ - QuantChecker, ReduceMeanChecker, ReluChecker, ReshapeChecker, RQAddChecker, RQHardswishChecker, SGDChecker, \ - SliceChecker, SoftmaxChecker, SoftmaxCrossEntropyLossChecker, TransposeChecker +from Deeploy.Targets.Generic.Templates import AddTemplate, BatchNormalizationTemplate, ConcatTemplate, \ + DequantTemplate, FloatPadTemplate, FloatReduceLogSumExpTemplate, FloatReduceSumTemplate, GatherTemplate, \ + PadTemplate, QuantTemplate, RQSiGELUTemplate, SliceTemplate, iHardswishTemplate +from Deeploy.Targets.Generic.TypeCheckers import AddChecker, BatchNormChecker, ConcatChecker, ConvChecker, \ + DequantChecker, GatherChecker, GELUChecker, GEMMChecker, HardswishChecker, LayerNormChecker, MatMulChecker, \ + MulChecker, PadChecker, QuantChecker, ReduceLogSumExpChecker, ReduceMeanChecker, ReluChecker, ReshapeChecker, \ + RQAddChecker, RQHardswishChecker, SGDChecker, SliceChecker, SoftmaxChecker, SoftmaxCrossEntropyLossChecker, \ + TransposeChecker from Deeploy.Targets.PULPOpen.CodeTransformationPasses.PULPClusterSynch import PULPSynchCoresPass from Deeploy.Targets.PULPOpen.CodeTransformationPasses.PULPClusterTiling import PULPClusterTiling from Deeploy.Targets.PULPOpen.CodeTransformationPasses.PULPL3Tiling import PULPL3Tiling @@ -29,12 +30,13 @@ from Deeploy.Targets.PULPOpen.DataTypes import PULPDMAFuture from Deeploy.Targets.PULPOpen.DMA.L3Dma import l3DmaHack from Deeploy.Targets.PULPOpen.DMA.MchanDma import MchanDma -from Deeploy.Targets.PULPOpen.Templates import ConvTemplate, DMASliceTemplate, FloatAddTemplate, FloatConvTemplate, \ - FloatGELUTemplate, FloatGemmTemplate, FloatLayernormTemplate, FloatMatMulTemplate, FloatMaxPoolTemplate, \ - FloatMulTemplate, FloatReduceMeanTemplate, FloatReluTemplate, FloatSoftmaxTemplate, GEMMTemplate, \ - MatrixVectorTemplate, MaxPoolTemplate, MulTemplate, ReduceMeanTemplate, RequantShiftTemplate, ReshapeTemplate, \ - RQAddTemplate, RQSiHardswishTemplate, SGDTemplate, SoftmaxCrossEntropyLossTemplate, TallGEMMTemplate, \ - TransposeTemplate, UniformRequantShiftTemplate, iRMSNormTemplate, iSoftmaxTemplate +from Deeploy.Targets.PULPOpen.Templates import ConvTemplate, ConvTransposeTemplate, DMASliceTemplate, \ + FloatAddTemplate, FloatConvTemplate, FloatGELUTemplate, FloatGemmTemplate, FloatLayernormTemplate, \ + FloatMatMulTemplate, FloatMaxPoolTemplate, FloatMulTemplate, FloatReduceMeanTemplate, FloatReluTemplate, \ + FloatSoftmaxTemplate, GEMMTemplate, MatrixVectorTemplate, MaxPoolTemplate, MulTemplate, ReduceMeanTemplate, \ + RequantShiftTemplate, ReshapeTemplate, RQAddTemplate, RQSiHardswishTemplate, SGDTemplate, \ + SoftmaxCrossEntropyLossTemplate, TallGEMMTemplate, TransposeTemplate, UniformRequantShiftTemplate, \ + iRMSNormTemplate, iSoftmaxTemplate from Deeploy.Targets.PULPOpen.TypeCheckers import PULPConvChecker, PULPLinearChecker, PULPMaxPoolChecker, \ PULPRequantShiftChecker from Deeploy.TilingExtension.CodeTransformationPasses.TilingVariableReplacement import TilingVariableReplacement, \ @@ -176,6 +178,35 @@ ReshapeTemplate.referenceTemplate, SkipTransformer) for type in IntegerDataTypes + FloatDataTypes ] +PULPBatchNormBindings = [ + NodeBinding( + BatchNormChecker( + [PointerClass(type), + PointerClass(type), + PointerClass(type), + PointerClass(type), + PointerClass(type)], [PointerClass(type)]), BatchNormalizationTemplate.referenceTemplate, ForkTransformer) + for type in FloatDataTypes +] + +PULPPad1DBindings = [ + NodeBinding(PadChecker([PointerClass(type)], [PointerClass(type)]), PadTemplate.reference1DTemplate, + ForkTransformer) for type in IntegerDataTypes +] + [ + NodeBinding(PadChecker([PointerClass(type)], [PointerClass(type)]), FloatPadTemplate.reference1DTemplate, + ForkTransformer) for type in FloatDataTypes +] + +PULPPad2DBindings = [ + NodeBinding(PadChecker([PointerClass(type)], [PointerClass(type)]), PadTemplate.reference2DTemplate, + ForkTransformer) for type in IntegerDataTypes +] + [ + NodeBinding( + PadChecker([PointerClass(float32_t), PointerClass(float32_t), + PointerClass(float32_t)], [PointerClass(float32_t)]), FloatPadTemplate.reference2DTemplate, + ForkTransformer) +] + PULPRQAddBindings = [ NodeBinding(RQAddChecker([PointerClass(_type), PointerClass(_type2)], [PointerClass(_type3)]), RQAddTemplate.referenceTemplate, ForkTransformer) @@ -249,6 +280,15 @@ ForkTransformer) for float_type in FloatDataTypes ] +PULPFloatConvTranspose2DBindings = [ + NodeBinding(ConvChecker( + [PointerClass(type), PointerClass(type), PointerClass(type)], [PointerClass(type)]), + ConvTransposeTemplate.reference2DTemplate, ForkTransformer) for type in FloatDataTypes +] + [ + NodeBinding(ConvChecker([PointerClass(type), PointerClass(type)], [PointerClass(type)]), + ConvTransposeTemplate.reference2DTemplate, ForkTransformer) for type in FloatDataTypes +] + PULPRQSMatrixVecBindings = [ NodeBinding( PULPLinearChecker([PointerClass(type1), @@ -322,6 +362,11 @@ FloatReduceSumTemplate.referenceTemplate, ClusterTransformer) ] +PULPReduceLogSumExpBindings = [ + NodeBinding(ReduceLogSumExpChecker([PointerClass(float32_t)], [PointerClass(float32_t)]), + FloatReduceLogSumExpTemplate.referenceTemplate, ForkTransformer) +] + PULPUniformRQSBindings = [ NodeBinding( PULPRequantShiftChecker([PointerClass(type), PointerClass(int32_t), @@ -382,6 +427,9 @@ PULPConcatBindings = [ NodeBinding(ConcatChecker([PointerClass(type), PointerClass(type)], [PointerClass(type)]), ConcatTemplate.referenceTemplate, ClusterTransformer) for type in IntegerDataTypes +] + [ + NodeBinding(ConcatChecker([PointerClass(float32_t), PointerClass(float32_t)], [PointerClass(float32_t)]), + ConcatTemplate.referenceTemplate, ClusterTransformer) ] PULPiRMSNormBindings = [ diff --git a/Deeploy/Targets/PULPOpen/DMA/L3Dma.py b/Deeploy/Targets/PULPOpen/DMA/L3Dma.py index 6c2aa30811..375ad75958 100644 --- a/Deeploy/Targets/PULPOpen/DMA/L3Dma.py +++ b/Deeploy/Targets/PULPOpen/DMA/L3Dma.py @@ -29,7 +29,7 @@ class L3Dma(AsyncDma): _transferTemplates = { 2: NodeTemplate( - "pi_cl_ram_copy_2d(get_ram_ptr(), ${ext}, ${loc}, ${transfer_size}, ${stride}, ${length}, ${ext2loc}, &${future});" + "pi_cl_ram_copy_2d(get_ram_ptr(), (uint32_t) ${ext}, ${loc}, ${transfer_size}, ${stride}, ${length}, ${ext2loc}, &${future});" ) } _waitingStrategy = PerTensorWaitingStrategy(L3DmaFuture) diff --git a/Deeploy/Targets/PULPOpen/Parsers.py b/Deeploy/Targets/PULPOpen/Parsers.py index 5c5951eaba..b71b0aead6 100644 --- a/Deeploy/Targets/PULPOpen/Parsers.py +++ b/Deeploy/Targets/PULPOpen/Parsers.py @@ -8,8 +8,8 @@ import onnx_graphsurgeon as gs from Deeploy.DeeployTypes import NetworkContext -from Deeploy.Targets.Generic.Parsers import Conv2DParser, GEMMParser, ReduceMeanParser, RQSConv1DParser, \ - RQSConv2DParser, RQSParserInterface +from Deeploy.Targets.Generic.Parsers import Conv2DParser, ConvTranspose2DParser, GEMMParser, ReduceMeanParser, \ + RQSConv1DParser, RQSConv2DParser, RQSParserInterface class PULPConv2DParser(RQSConv2DParser): @@ -90,6 +90,8 @@ def parseNode(self, node: gs.Node) -> (bool): self.operatorRepresentation['padding_x_left'] = int(self.operatorRepresentation['pads'][1]) self.operatorRepresentation['padding_y_bottom'] = int(self.operatorRepresentation['pads'][2]) self.operatorRepresentation['padding_x_right'] = int(self.operatorRepresentation['pads'][3]) + if "n_cores" in node.attrs: + self.operatorRepresentation["n_cores"] = int(node.attrs["n_cores"]) return ret return False @@ -148,6 +150,8 @@ def parseNode(self, node: gs.Node) -> (bool): self.operatorRepresentation['padding_x_left'] = int(self.operatorRepresentation['pads'][1]) self.operatorRepresentation['padding_y_bottom'] = int(self.operatorRepresentation['pads'][2]) self.operatorRepresentation['padding_x_right'] = int(self.operatorRepresentation['pads'][3]) + if "n_cores" in node.attrs: + self.operatorRepresentation["n_cores"] = int(node.attrs["n_cores"]) return ret return False @@ -182,6 +186,51 @@ def parseNodeCtxt(self, return ctxt, False +class PULPConvTranspose2DParser(ConvTranspose2DParser): + + def __init__(self): + super().__init__() + + def parseNode(self, node: gs.Node) -> bool: + wellFormed = super().parseNode(node) + + if wellFormed: + # ConvTranspose kernels on PULP/Siracusa are emitted in CHW layout. + # This must be visible before broadcast() runs, otherwise output shapes + # are reinterpreted as NHWC and the last dimension gets clobbered. + self.operatorRepresentation['channels_first'] = True + self.operatorRepresentation['padding_y_top'] = int(self.operatorRepresentation['pads'][0]) + self.operatorRepresentation['padding_x_left'] = int(self.operatorRepresentation['pads'][1]) + self.operatorRepresentation['padding_y_bottom'] = int(self.operatorRepresentation['pads'][2]) + self.operatorRepresentation['padding_x_right'] = int(self.operatorRepresentation['pads'][3]) + + return wellFormed + + def parseNodeCtxt(self, + ctxt: NetworkContext, + node: gs.Node, + channels_first: bool = True) -> Tuple[NetworkContext, bool]: + if node.attrs.get("channels_first", True) == False: + return ctxt, False + + newCtxt, ret = super().parseNodeCtxt(ctxt, node, True) + + if ret: + self.operatorRepresentation['data_in'] = newCtxt.lookup(node.inputs[0].name).name + self.operatorRepresentation['weight'] = newCtxt.lookup(node.inputs[1].name).name + + if len(node.inputs) == 2: + self.operatorRepresentation["has_bias"] = "false" + self.operatorRepresentation["bias"] = "NULL" + else: + self.operatorRepresentation["has_bias"] = "true" + self.operatorRepresentation["bias"] = newCtxt.lookup(node.inputs[2].name).name + + return newCtxt, True + + return ctxt, False + + class PULPDWConv1DParser(RQSConv1DParser): def __init__(self, noBiasHoisting = True): diff --git a/Deeploy/Targets/PULPOpen/Platform.py b/Deeploy/Targets/PULPOpen/Platform.py index f13e6451fb..fe0fdd737f 100644 --- a/Deeploy/Targets/PULPOpen/Platform.py +++ b/Deeploy/Targets/PULPOpen/Platform.py @@ -11,20 +11,20 @@ NodeTemplate, StructBuffer, TopologyOptimizer, TransientBuffer, VariableBuffer from Deeploy.MemoryLevelExtension.MemoryLevels import MemoryHierarchy, MemoryLevel from Deeploy.MemoryLevelExtension.NetworkDeployers.MemoryLevelDeployer import MemoryPlatform, MemoryPlatformWrapper -from Deeploy.Targets.Generic.Bindings import BasicGEMMBindings, BasicPad1DBindings, BasicPad2DBindings, \ - BasicRQIntegerDivBinding -from Deeploy.Targets.Generic.Layers import AddLayer, ConcatLayer, ConvLayer, GatherLayer, GELUGradLayer, GELULayer, \ - GEMMLayer, LayerNormGradLayer, LayerNormLayer, MatMulLayer, MaxPoolLayer, MulLayer, PadLayer, QuantLayer, \ - ReduceMeanLayer, ReduceSumLayer, ReluLayer, RequantShiftLayer, ReshapeLayer, RQIntegerDivLayer, RQSiGELULayer, \ - RQSiHardswishLayer, SGDLayer, SliceLayer, SoftmaxCrossEntropyLossGradLayer, SoftmaxCrossEntropyLossLayer, \ - SoftmaxGradLayer, SoftmaxLayer, TransposeLayer, iHardswishLayer, iRMSNormLayer -from Deeploy.Targets.Generic.Parsers import AddParser, ConcatParser, DequantParser, FlattenParser, GatherParser, \ - GELUGradParser, GELUParser, GEMMParser, LayerNormGradParser, LayerNormParser, MatMulParser, MaxPool1DParser, \ - MaxPool2DParser, MulParser, Pad1DParser, Pad2DParser, QuantParser, ReduceSumParser, ReluParser, \ - RequantShiftParser, ReshapeParser, RQAddParser, RQIntegerDivParser, RQSiGELUParser, RQSiHardswishParser, \ - SGDParser, SliceParser, SoftmaxCrossEntropyLossGradParser, SoftmaxCrossEntropyLossParser, SoftmaxGradParser, \ - SoftmaxParser, TransposeParser, UniformRequantShiftParser, UnsqueezeParser, iHardswishParser, iRMSNormParser, \ - iSoftmaxParser +from Deeploy.Targets.Generic.Bindings import BasicGEMMBindings, BasicRQIntegerDivBinding +from Deeploy.Targets.Generic.Layers import AddLayer, BatchNormalizationLayer, ConcatLayer, ConvLayer, \ + ConvTransposeLayer, GatherLayer, GELUGradLayer, GELULayer, GEMMLayer, LayerNormGradLayer, LayerNormLayer, \ + MatMulLayer, MaxPoolLayer, MulLayer, PadLayer, QuantLayer, ReduceLogSumExpLayer, ReduceMeanLayer, ReduceSumLayer, \ + ReluLayer, RequantShiftLayer, ReshapeLayer, RQIntegerDivLayer, RQSiGELULayer, RQSiHardswishLayer, SGDLayer, \ + SliceLayer, SoftmaxCrossEntropyLossGradLayer, SoftmaxCrossEntropyLossLayer, SoftmaxGradLayer, SoftmaxLayer, \ + TransposeLayer, iHardswishLayer, iRMSNormLayer +from Deeploy.Targets.Generic.Parsers import AddParser, BatchNormParser, ConcatParser, DequantParser, FlattenParser, \ + GatherParser, GELUGradParser, GELUParser, GEMMParser, LayerNormGradParser, LayerNormParser, MatMulParser, \ + MaxPool1DParser, MaxPool2DParser, MulParser, Pad1DParser, Pad2DParser, QuantParser, ReduceLogSumExpParser, \ + ReduceSumParser, ReluParser, RequantShiftParser, ReshapeParser, RQAddParser, RQIntegerDivParser, RQSiGELUParser, \ + RQSiHardswishParser, SGDParser, SliceParser, SoftmaxCrossEntropyLossGradParser, SoftmaxCrossEntropyLossParser, \ + SoftmaxGradParser, SoftmaxParser, TransposeParser, UniformRequantShiftParser, UnsqueezeParser, iHardswishParser, \ + iRMSNormParser, iSoftmaxParser from Deeploy.Targets.Generic.Templates import AllocateTemplate as BasicAllocateTemplate from Deeploy.Targets.Generic.TopologyOptimizationPasses.Passes import DequantPatternPass, IntegerDivRequantMergePass, \ MergeConstAddAndRequantPass, MergeTrueIntegerDivRequantShiftPass, QuantPatternPass, RQSSplitPass, \ @@ -32,24 +32,25 @@ from Deeploy.Targets.PULPOpen.Bindings import BasicDequantBindings, BasicQuantBindings, PULPDMASliceBindings, \ PULPDWConv1DBinding from Deeploy.Targets.PULPOpen.Layers import PULPRQSConvLayer, PULPRQSGEMMLayer -from Deeploy.Targets.PULPOpen.Parsers import PULPConv1DParser, PULPConv2DParser, PULPDWConv1DParser, \ - PULPDWConv2DParser, PULPFPConv2DParser, PULPFPDWConv2DParser, PULPGEMMParser, PULPMatrixVecParser, \ - PULPReduceMeanParser, PULPTallGEMMParser +from Deeploy.Targets.PULPOpen.Parsers import PULPConv1DParser, PULPConv2DParser, PULPConvTranspose2DParser, \ + PULPDWConv1DParser, PULPDWConv2DParser, PULPFPConv2DParser, PULPFPDWConv2DParser, PULPGEMMParser, \ + PULPMatrixVecParser, PULPReduceMeanParser, PULPTallGEMMParser from Deeploy.Targets.PULPOpen.Templates import AllocateTemplate, FreeTemplate -from Deeploy.Targets.PULPOpen.Tiler import PULPAddTilingReadyBindings, PULPConcatTilingReadyBindings, \ - PULPConv2DTilingReadyBindings, PULPDWConv2DTilingReadyBindings, PULPFlattenTilingReadyBindings, \ - PULPFPGELUGradTilingReadyBindings, PULPFPGELUTilingReadyBindings, PULPFPGEMMTilingReadyBindings, \ - PULPGatherTilingReadyBindings, PULPiHardswishTilingReadyBindings, PULPiRMSNormTilingReadyBindings, \ - PULPiRQSGELUTilingReadyBindings, PULPLayernormGradTilingReadyBindings, PULPLayernormTilingReadyBindings, \ - PULPMatMulTilingReadyBindings, PULPMaxPool1DTilingReadyBindings, PULPMaxPool2DTilingReadyBindings, \ - PULPMulTilingReadyBindings, PULPReduceMeanTilingReadyBindings, PULPReduceSumTilingReadyBindings, \ - PULPReluTilingReadyBindings, PULPRQAddTilingReadyBindings, PULPRQSConv1DTilingReadyBindings, \ - PULPRQSConv2DTilingReadyBindings, PULPRQSDWConv2DTilingReadyBindings, PULPRQSGEMMTilingReadyBindings, \ - PULPRQSiHardswishTilingReadyBindings, PULPRQSMatrixVecTilingReadyBindings, PULPRQSTallGEMMTilingReadyBindings, \ - PULPRQSTilingReadyBindings, PULPSGDTilingReadyBindings, PULPSliceTilingReadyBindings, \ - PULPSoftmaxCrossEntropyGradTilingReadyBindings, PULPSoftmaxCrossEntropyTilingReadyBindings, \ - PULPSoftmaxGradTilingReadyBindings, PULPSoftmaxTilingReadyBindings, PULPTransposeTilingReadyBindings, \ - PULPUniformRQSTilingReadyBindings +from Deeploy.Targets.PULPOpen.Tiler import PULPAddTilingReadyBindings, PULPBatchNormTilingReadyBindings, \ + PULPConcatTilingReadyBindings, PULPConv2DTilingReadyBindings, PULPConvTranspose2DTilingReadyBindings, \ + PULPDWConv2DTilingReadyBindings, PULPFlattenTilingReadyBindings, PULPFPGELUGradTilingReadyBindings, \ + PULPFPGELUTilingReadyBindings, PULPFPGEMMTilingReadyBindings, PULPGatherTilingReadyBindings, \ + PULPiHardswishTilingReadyBindings, PULPiRMSNormTilingReadyBindings, PULPiRQSGELUTilingReadyBindings, \ + PULPLayernormGradTilingReadyBindings, PULPLayernormTilingReadyBindings, PULPMatMulTilingReadyBindings, \ + PULPMaxPool1DTilingReadyBindings, PULPMaxPool2DTilingReadyBindings, PULPMulTilingReadyBindings, \ + PULPPad1DTilingReadyBindings, PULPPad2DTilingReadyBindings, PULPReduceLogSumExpTilingReadyBindings, \ + PULPReduceMeanTilingReadyBindings, PULPReduceSumTilingReadyBindings, PULPReluTilingReadyBindings, \ + PULPRQAddTilingReadyBindings, PULPRQSConv1DTilingReadyBindings, PULPRQSConv2DTilingReadyBindings, \ + PULPRQSDWConv2DTilingReadyBindings, PULPRQSGEMMTilingReadyBindings, PULPRQSiHardswishTilingReadyBindings, \ + PULPRQSMatrixVecTilingReadyBindings, PULPRQSTallGEMMTilingReadyBindings, PULPRQSTilingReadyBindings, \ + PULPSGDTilingReadyBindings, PULPSliceTilingReadyBindings, PULPSoftmaxCrossEntropyGradTilingReadyBindings, \ + PULPSoftmaxCrossEntropyTilingReadyBindings, PULPSoftmaxGradTilingReadyBindings, PULPSoftmaxTilingReadyBindings, \ + PULPTransposeTilingReadyBindings, PULPUniformRQSTilingReadyBindings from Deeploy.Targets.PULPOpen.TopologyOptimizationPasses.Passes import PULPAddRequantMergePass, \ PULPConvRequantMergePass, PULPGEMMRequantMergePass, PULPMatMulRequantMergePass @@ -60,8 +61,8 @@ GELUGradMapper = NodeMapper(GELUGradParser(), PULPFPGELUGradTilingReadyBindings) GatherMapper = NodeMapper(GatherParser(), PULPGatherTilingReadyBindings) MulMapper = NodeMapper(MulParser(), PULPMulTilingReadyBindings) -Pad1DMapper = NodeMapper(Pad1DParser(), BasicPad1DBindings) -Pad2DMapper = NodeMapper(Pad2DParser(), BasicPad2DBindings) +Pad1DMapper = NodeMapper(Pad1DParser(), PULPPad1DTilingReadyBindings) +Pad2DMapper = NodeMapper(Pad2DParser(), PULPPad2DTilingReadyBindings) ReshapeMapper = NodeMapper(ReshapeParser(), PULPFlattenTilingReadyBindings) TransposeMapper = NodeMapper(TransposeParser(), PULPTransposeTilingReadyBindings) UnsqueezeMapper = NodeMapper(UnsqueezeParser(), PULPFlattenTilingReadyBindings) @@ -70,6 +71,7 @@ UniformRequantShiftMapper = NodeMapper(UniformRequantShiftParser(), PULPUniformRQSTilingReadyBindings) ReduceMeanMapper = NodeMapper(PULPReduceMeanParser(), PULPReduceMeanTilingReadyBindings) +ReduceLogSumExpMapper = NodeMapper(ReduceLogSumExpParser(), PULPReduceLogSumExpTilingReadyBindings) ReduceSumMapper = NodeMapper(ReduceSumParser(), PULPReduceSumTilingReadyBindings) MatMulMapper = NodeMapper(MatMulParser(), PULPMatMulTilingReadyBindings) RQIntegerDivMapper = NodeMapper(RQIntegerDivParser(), [BasicRQIntegerDivBinding]) @@ -81,6 +83,7 @@ Conv2DMapper = NodeMapper(PULPConv2DParser(), PULPRQSConv2DTilingReadyBindings) FPDWConv2DMapper = NodeMapper(PULPFPDWConv2DParser(), PULPDWConv2DTilingReadyBindings) DWConv2DMapper = NodeMapper(PULPDWConv2DParser(), PULPRQSDWConv2DTilingReadyBindings) +ConvTranspose2DMapper = NodeMapper(PULPConvTranspose2DParser(), PULPConvTranspose2DTilingReadyBindings) GEMMMapper = NodeMapper(PULPGEMMParser(), PULPRQSGEMMTilingReadyBindings) FloatGEMMMapper = NodeMapper(GEMMParser(), PULPFPGEMMTilingReadyBindings) MatrixVecMapper = NodeMapper(PULPMatrixVecParser(), PULPRQSMatrixVecTilingReadyBindings) @@ -111,8 +114,10 @@ QuantMapper = NodeMapper(QuantParser(), BasicQuantBindings) DequantMapper = NodeMapper(DequantParser(), BasicDequantBindings) GEMMDequantMapper = NodeMapper(PULPGEMMParser(), BasicGEMMBindings) +BatchNormalizationMapper = NodeMapper(BatchNormParser(), PULPBatchNormTilingReadyBindings) PULPMapping = { 'Conv': ConvLayer([FPConv2DMapper, FPDWConv2DMapper]), + 'ConvTranspose': ConvTransposeLayer([ConvTranspose2DMapper]), 'RequantizedConv': PULPRQSConvLayer([Conv2DMapper, DWConv2DMapper, Conv1DMapper, DWConv1DMapper]), 'RequantizedGemm': PULPRQSGEMMLayer([MatrixVecMapper, TallGEMMMapper, GEMMMapper]), 'Gemm': GEMMLayer([FloatGEMMMapper, GEMMDequantMapper]), @@ -128,6 +133,7 @@ 'iSoftmax': SoftmaxLayer([Softmax_int8_Mapper]), 'Softmax': SoftmaxLayer([SoftmaxMapper]), 'ReduceMean': ReduceMeanLayer([ReduceMeanMapper]), + 'ReduceLogSumExp': ReduceLogSumExpLayer([ReduceLogSumExpMapper]), 'ReduceSum': ReduceSumLayer([ReduceSumMapper]), 'RequantShift': RequantShiftLayer([UniformRequantShiftMapper, RequantShiftMapper]), 'Add': AddLayer([AddMapper]), @@ -148,6 +154,7 @@ 'RequantizediHardswish': RQSiHardswishLayer([RQSiHardswishMapper]), 'Quant': QuantLayer([QuantMapper]), 'Dequant': QuantLayer([DequantMapper]), + 'BatchNormalization': BatchNormalizationLayer([BatchNormalizationMapper]), 'SoftmaxGrad': SoftmaxGradLayer([SoftmaxGradMapper]), 'SoftmaxCrossEntropyLoss': SoftmaxCrossEntropyLossLayer([SoftmaxCrossEntropyLossMapper]), 'SoftmaxCrossEntropyLossGrad': SoftmaxCrossEntropyLossGradLayer([SoftmaxCrossEntropyLossGradMapper]), diff --git a/Deeploy/Targets/PULPOpen/Templates/ConvTransposeTemplate.py b/Deeploy/Targets/PULPOpen/Templates/ConvTransposeTemplate.py new file mode 100644 index 0000000000..5ef8d5855e --- /dev/null +++ b/Deeploy/Targets/PULPOpen/Templates/ConvTransposeTemplate.py @@ -0,0 +1,36 @@ +# SPDX-FileCopyrightText: 2026 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +from Deeploy.DeeployTypes import NodeTemplate + +reference2DTemplate = NodeTemplate(""" +<% +batchOffsetIn = ch_im_in * dim_im_in_x * dim_im_in_y +batchOffsetOut = ch_im_out * dim_im_out_x * dim_im_out_y +%> + +// 2D FP ConvTranspose CHW on PULPOpen/Siracusa (Name: ${nodeName}, Op: ${nodeOp}) +${data_in_type.typeName} ref_${data_out}_${data_in} = ${data_in}; +${data_out_type.typeName} ref_${data_out}_${data_out} = ${data_out}; + +for (uint32_t n=0; n<${batch}; ++n) { + PULP_ConvTranspose2d_fp32_fp32_fp32_CHW( + ref_${data_out}_${data_in}, + ${ch_im_in}, ${dim_im_in_x}, ${dim_im_in_y}, + ${weight}, + ${ch_im_out}, ${group}, + ${dim_kernel_x}, ${dim_kernel_y}, + ${stride_x}, ${stride_y}, + ${dilation_x}, ${dilation_y}, + ${padding_y_top}, ${padding_y_bottom}, + ${padding_x_left}, ${padding_x_right}, + ${bias}, ${has_bias}, + ref_${data_out}_${data_out}, + ${dim_im_out_x}, ${dim_im_out_y} + ); + + ref_${data_out}_${data_in} += ${batchOffsetIn}; + ref_${data_out}_${data_out} += ${batchOffsetOut}; +} +""") diff --git a/Deeploy/Targets/PULPOpen/Templates/FloatConvTemplate.py b/Deeploy/Targets/PULPOpen/Templates/FloatConvTemplate.py index bfa893db94..bd4f108ab3 100644 --- a/Deeploy/Targets/PULPOpen/Templates/FloatConvTemplate.py +++ b/Deeploy/Targets/PULPOpen/Templates/FloatConvTemplate.py @@ -18,10 +18,12 @@ def __init__(self, templateStr): def computeTransientBuffersSize( ctxt: NetworkContext, operatorRepresentation: OperatorRepresentation) -> List[Tuple[str, Union[int, IntVar]]]: + n_cores = int(operatorRepresentation["n_cores"]) + # Conservative fallback used during Autoencoder2D debug: + # n_cores = max(int(operatorRepresentation.get("n_cores", 8)), 8) # Memory allocation for the im2col buffer can be dynamic, based on the number of cores. - im2col_dim = (operatorRepresentation["weight_type"].typeWidth // - 8) * operatorRepresentation["n_cores"] * operatorRepresentation[ - 'ch_im_in'] * operatorRepresentation['dim_kernel_x'] * operatorRepresentation['dim_kernel_y'] + im2col_dim = (operatorRepresentation["weight_type"].typeWidth // 8) * n_cores * operatorRepresentation[ + 'ch_im_in'] * operatorRepresentation['dim_kernel_x'] * operatorRepresentation['dim_kernel_y'] im2col_name = operatorRepresentation['nodeName'] + "_buffer" @@ -48,9 +50,12 @@ def computeTransientBuffersSize( ctxt: NetworkContext, operatorRepresentation: OperatorRepresentation) -> List[Tuple[str, Union[int, IntVar]]]: + n_cores = int(operatorRepresentation["n_cores"]) + # Conservative fallback used during Autoencoder2D debug: + # n_cores = max(int(operatorRepresentation.get("n_cores", 8)), 8) # Memory allocation for the im2col buffer can be dynamic, based on the number of cores. - im2col_dim = (operatorRepresentation["weight_type"].typeWidth // 8) * operatorRepresentation[ - "n_cores"] * operatorRepresentation['dim_kernel_x'] * operatorRepresentation['dim_kernel_y'] + im2col_dim = (operatorRepresentation["weight_type"].typeWidth // + 8) * n_cores * operatorRepresentation['dim_kernel_x'] * operatorRepresentation['dim_kernel_y'] im2col_name = operatorRepresentation['nodeName'] + "_buffer" diff --git a/Deeploy/Targets/PULPOpen/Templates/FloatMulTemplate.py b/Deeploy/Targets/PULPOpen/Templates/FloatMulTemplate.py index ced6c3cbcf..1d1d6a70bc 100644 --- a/Deeploy/Targets/PULPOpen/Templates/FloatMulTemplate.py +++ b/Deeploy/Targets/PULPOpen/Templates/FloatMulTemplate.py @@ -14,18 +14,33 @@ uint32_t ${nodeName}_end = MIN(${nodeName}_start + ${nodeName}_chunk, (uint32_t) ${size}); if (${nodeName}_start < ${nodeName}_end) { +% if sizeB == 1: float32_t ${nodeName}_scalar = ${B}[0]; +% endif uint32_t ${nodeName}_unroll_end = ${nodeName}_start + ((${nodeName}_end - ${nodeName}_start) / 6) * 6; for (uint32_t i = ${nodeName}_start; i < ${nodeName}_unroll_end; i += 6) { +% if sizeB == 1: ${C}[i + 0] = ${A}[i + 0] * ${nodeName}_scalar; ${C}[i + 1] = ${A}[i + 1] * ${nodeName}_scalar; ${C}[i + 2] = ${A}[i + 2] * ${nodeName}_scalar; ${C}[i + 3] = ${A}[i + 3] * ${nodeName}_scalar; ${C}[i + 4] = ${A}[i + 4] * ${nodeName}_scalar; ${C}[i + 5] = ${A}[i + 5] * ${nodeName}_scalar; +% else: + ${C}[i + 0] = ${A}[i + 0] * ${B}[i + 0]; + ${C}[i + 1] = ${A}[i + 1] * ${B}[i + 1]; + ${C}[i + 2] = ${A}[i + 2] * ${B}[i + 2]; + ${C}[i + 3] = ${A}[i + 3] * ${B}[i + 3]; + ${C}[i + 4] = ${A}[i + 4] * ${B}[i + 4]; + ${C}[i + 5] = ${A}[i + 5] * ${B}[i + 5]; +% endif } for (uint32_t i = ${nodeName}_unroll_end; i < ${nodeName}_end; i++) { +% if sizeB == 1: ${C}[i] = ${A}[i] * ${nodeName}_scalar; +% else: + ${C}[i] = ${A}[i] * ${B}[i]; +% endif } } -""") \ No newline at end of file +""") diff --git a/Deeploy/Targets/PULPOpen/TileConstraints/ConvTileConstraint.py b/Deeploy/Targets/PULPOpen/TileConstraints/ConvTileConstraint.py index ae9ca96b3c..bd6e5dc290 100644 --- a/Deeploy/Targets/PULPOpen/TileConstraints/ConvTileConstraint.py +++ b/Deeploy/Targets/PULPOpen/TileConstraints/ConvTileConstraint.py @@ -1060,4 +1060,4 @@ def serializeTilingSolution( tilingSchedule = TilingSchedule(inputBaseOffsets, outputBaseOffsets, inputLoadSchedule, outputLoadSchedule) variableReplacementSchedule = VariableReplacementScheme(replacements, replacementTypes) - return variableReplacementSchedule, tilingSchedule \ No newline at end of file + return variableReplacementSchedule, tilingSchedule diff --git a/Deeploy/Targets/PULPOpen/TileConstraints/ConvTransposeTileConstraint.py b/Deeploy/Targets/PULPOpen/TileConstraints/ConvTransposeTileConstraint.py new file mode 100644 index 0000000000..b3b6756c41 --- /dev/null +++ b/Deeploy/Targets/PULPOpen/TileConstraints/ConvTransposeTileConstraint.py @@ -0,0 +1,189 @@ +# SPDX-FileCopyrightText: 2026 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +from typing import Dict, List, Tuple + +from Deeploy.AbstractDataTypes import PointerClass +from Deeploy.CommonExtensions.DataTypes import uint8_t, uint16_t +from Deeploy.DeeployTypes import NetworkContext, OperatorRepresentation +from Deeploy.TilingExtension.MemoryConstraints import NodeMemoryConstraint +from Deeploy.TilingExtension.TileConstraint import TileConstraint +from Deeploy.TilingExtension.TilerModel import TilerModel +from Deeploy.TilingExtension.TilingCodegen import AbsoluteHyperRectangle, HyperRectangle, TilingSchedule, \ + VariableReplacementScheme + + +class ConvTranspose2DTileConstraint(TileConstraint): + + @staticmethod + def addGeometricalConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: NetworkContext) -> TilerModel: + # Register all tensors that participate in the tiled operator. The tiler will + # create symbolic variables for each tensor dimension and solve over them. + inputBufferName = parseDict['data_in'] + weightBufferName = parseDict['weight'] + outputBufferName = parseDict['data_out'] + + for bufferName in [inputBufferName, weightBufferName, outputBufferName]: + tilerModel.addTensorDimToModel(ctxt, bufferName) + + # Symbolic dimensions for NCHW input and output, and [Cin, Cout/group, Kh, Kw] weights. + inputBatchVar = tilerModel.getTensorDimVar(tensorName = inputBufferName, dimIdx = 0) + inputChannelVar = tilerModel.getTensorDimVar(tensorName = inputBufferName, dimIdx = 1) + inputHeightVar = tilerModel.getTensorDimVar(tensorName = inputBufferName, dimIdx = 2) + inputWidthVar = tilerModel.getTensorDimVar(tensorName = inputBufferName, dimIdx = 3) + + weightInChannelVar = tilerModel.getTensorDimVar(tensorName = weightBufferName, dimIdx = 0) + weightOutChannelVar = tilerModel.getTensorDimVar(tensorName = weightBufferName, dimIdx = 1) + weightHeightVar = tilerModel.getTensorDimVar(tensorName = weightBufferName, dimIdx = 2) + weightWidthVar = tilerModel.getTensorDimVar(tensorName = weightBufferName, dimIdx = 3) + + outputBatchVar = tilerModel.getTensorDimVar(tensorName = outputBufferName, dimIdx = 0) + outputChannelVar = tilerModel.getTensorDimVar(tensorName = outputBufferName, dimIdx = 1) + outputHeightVar = tilerModel.getTensorDimVar(tensorName = outputBufferName, dimIdx = 2) + outputWidthVar = tilerModel.getTensorDimVar(tensorName = outputBufferName, dimIdx = 3) + + # Geometrical constraints: batch is preserved, the weight Cin must match input Cin, + # and in this first implementation H/W are not tiled, so they stay equal to the + # full tensor shapes known from the context. + tilerModel.addConstraint(outputBatchVar == inputBatchVar) + tilerModel.addConstraint(weightInChannelVar == inputChannelVar) + tilerModel.addConstraint(outputHeightVar == ctxt.lookup(outputBufferName).shape[2]) + tilerModel.addConstraint(outputWidthVar == ctxt.lookup(outputBufferName).shape[3]) + tilerModel.addConstraint(inputHeightVar == ctxt.lookup(inputBufferName).shape[2]) + tilerModel.addConstraint(inputWidthVar == ctxt.lookup(inputBufferName).shape[3]) + tilerModel.addConstraint(weightHeightVar == ctxt.lookup(weightBufferName).shape[2]) + tilerModel.addConstraint(weightWidthVar == ctxt.lookup(weightBufferName).shape[3]) + + if parseDict['group'] == 1: + # For the regular case, the only tiled dimension is Cout, so the output tile + # channel count must match the weight tile channel count. + tilerModel.addConstraint(outputChannelVar == weightOutChannelVar) + else: + # Grouped ConvTranspose is kept conservative for now: no real Cout slicing is + # enforced on the weight tensor because the grouped layout needs dedicated handling. + tilerModel.addConstraint(outputChannelVar == ctxt.lookup(outputBufferName).shape[1]) + tilerModel.addConstraint(weightOutChannelVar == ctxt.lookup(weightBufferName).shape[1]) + + if parseDict.get('has_bias', "false") == "true": + # Bias follows the tiled output channels one-to-one. + biasBufferName = parseDict['bias'] + tilerModel.addTensorDimToModel(ctxt, biasBufferName) + biasChannelVar = tilerModel.getTensorDimVar(tensorName = biasBufferName, dimIdx = 0) + tilerModel.addConstraint(biasChannelVar == outputChannelVar) + + return tilerModel + + @staticmethod + def addPolicyConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: NetworkContext) -> TilerModel: + inputBufferName = parseDict['data_in'] + outputBufferName = parseDict['data_out'] + + inputShape = ctxt.lookup(inputBufferName).shape + outputShape = ctxt.lookup(outputBufferName).shape + + inputBatchVar = tilerModel.getTensorDimVar(tensorName = inputBufferName, dimIdx = 0) + inputChannelVar = tilerModel.getTensorDimVar(tensorName = inputBufferName, dimIdx = 1) + outputBatchVar = tilerModel.getTensorDimVar(tensorName = outputBufferName, dimIdx = 0) + outputChannelVar = tilerModel.getTensorDimVar(tensorName = outputBufferName, dimIdx = 1) + + # Policy constraints: keep the full input resident and tile only along output + # channels. This is the simplest correct strategy for ConvTranspose because it + # avoids the inverse spatial dependency problem on H/W tiles. + tilerModel.addConstraint(inputBatchVar == inputShape[0]) + tilerModel.addConstraint(inputChannelVar == inputShape[1]) + tilerModel.addConstraint(outputBatchVar == outputShape[0]) + + if parseDict['group'] == 1 and parseDict["ch_im_out"] >= 8: + # Do not create tiny channel tiles unless necessary. This reduces overhead + # from DMA/codegen and keeps the kernel granularity reasonable. + tilerModel.addMinTileSizeConstraint(parseDict, 'ch_im_out', outputChannelVar, 8) + + return tilerModel + + @classmethod + def serializeTilingSolution( + cls, tilingSolution: NodeMemoryConstraint, absoluteOutputCubes: List[AbsoluteHyperRectangle], + targetMemLevel: str, ctxt: NetworkContext, + operatorRepresentation: OperatorRepresentation) -> Tuple[VariableReplacementScheme, TilingSchedule]: + # The solver gives us output tiles. For each output tile we must reconstruct the + # exact input/weight/bias tiles that must be transferred into the target memory. + outputCubes = [cube.rectangle for cube in absoluteOutputCubes] + + addrNames = ['data_in', 'weight', 'data_out'] + has_bias = operatorRepresentation.get('has_bias', "false") == "true" + if has_bias: + addrNames.append('bias') + + # Resolve base addresses in the target memory level (typically L1) for all tensors + # used by this tiled operator. + inputBaseOffsets, outputBaseOffsets = cls.extractBaseAddr(tilingSolution, targetMemLevel, + operatorRepresentation, addrNames) + + inputShape = ctxt.lookup(operatorRepresentation['data_in']).shape + weightShape = ctxt.lookup(operatorRepresentation['weight']).shape + + inputInCubes = [] + inputWeightCubes = [] + inputBiasCubes = [] + + replacements = { + "ch_im_out": [], + "batch": [], + } + replacementTypes = { + "ch_im_out": PointerClass(uint16_t), + "batch": PointerClass(uint8_t), + } + + for cube in outputCubes: + # Output tiles are NCHW. Since we only tile along C_out, only the channel + # offset/size are relevant here. + _, cOffset, _, _ = cube.offset + nSize, cSize, _, _ = cube.dims + + # Input is kept whole for every tile in this first implementation. + inputInCubes.append(HyperRectangle((0, 0, 0, 0), tuple(inputShape))) + + if operatorRepresentation['group'] == 1: + # Slice the weight tensor along the output-channel axis so the kernel sees + # only the filters needed for the current output tile. + weightOffset = (0, cOffset, 0, 0) + weightDims = (weightShape[0], cSize, weightShape[2], weightShape[3]) + else: + # Conservative grouped fallback: use full weights until grouped slicing is + # implemented explicitly for ConvTranspose tiling. + weightOffset = (0, 0, 0, 0) + weightDims = tuple(weightShape) + + inputWeightCubes.append(HyperRectangle(weightOffset, weightDims)) + + if has_bias: + if operatorRepresentation['group'] == 1: + # Bias is sliced exactly like the output channels. + inputBiasCubes.append(HyperRectangle((cOffset,), (cSize,))) + else: + inputBiasCubes.append(HyperRectangle((0,), (ctxt.lookup(operatorRepresentation['bias']).shape[0],))) + + # These replacements are injected into the template so each kernel invocation + # uses the tile-local shape instead of the global layer shape. + replacements["ch_im_out"].append(cSize) + replacements["batch"].append(nSize) + + inputLoadSchedule = [] + outputLoadSchedule = [] + + # Build the per-tile DMA/load schedule: which tensor cubes must be present before + # computing each output cube. + if has_bias: + for inCube, weightCube, biasCube in zip(inputInCubes, inputWeightCubes, inputBiasCubes): + inputLoadSchedule.append({"data_in": inCube, "weight": weightCube, "bias": biasCube}) + else: + for inCube, weightCube in zip(inputInCubes, inputWeightCubes): + inputLoadSchedule.append({"data_in": inCube, "weight": weightCube}) + + for out in outputCubes: + outputLoadSchedule.append({"data_out": out}) + + schedule = TilingSchedule(inputBaseOffsets, outputBaseOffsets, inputLoadSchedule, outputLoadSchedule) + return VariableReplacementScheme(replacements, replacementTypes), schedule diff --git a/Deeploy/Targets/PULPOpen/TileConstraints/MatMulTileConstraint.py b/Deeploy/Targets/PULPOpen/TileConstraints/MatMulTileConstraint.py index ee7e448be6..fb55dd536d 100644 --- a/Deeploy/Targets/PULPOpen/TileConstraints/MatMulTileConstraint.py +++ b/Deeploy/Targets/PULPOpen/TileConstraints/MatMulTileConstraint.py @@ -6,7 +6,7 @@ from typing import Dict, List, Tuple from Deeploy.AbstractDataTypes import PointerClass -from Deeploy.CommonExtensions.DataTypes import int8_t +from Deeploy.CommonExtensions.DataTypes import uint16_t from Deeploy.DeeployTypes import NetworkContext, OperatorRepresentation from Deeploy.TilingExtension.MemoryConstraints import NodeMemoryConstraint from Deeploy.TilingExtension.TileConstraint import TileConstraint @@ -209,10 +209,10 @@ def serializeTilingSolution( replacements["N"] = [NSize] * len(outputCubes) replacementTypes = { - "M": PointerClass(int8_t), - "N": PointerClass(int8_t), - "O": PointerClass(int8_t), - "batch": PointerClass(int8_t) + "M": PointerClass(uint16_t), + "N": PointerClass(uint16_t), + "O": PointerClass(uint16_t), + "batch": PointerClass(uint16_t) } # Update load schedule lists diff --git a/Deeploy/Targets/PULPOpen/TileConstraints/PadTileConstraint.py b/Deeploy/Targets/PULPOpen/TileConstraints/PadTileConstraint.py new file mode 100644 index 0000000000..c7cce27d06 --- /dev/null +++ b/Deeploy/Targets/PULPOpen/TileConstraints/PadTileConstraint.py @@ -0,0 +1,215 @@ +# SPDX-FileCopyrightText: 2026 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +from typing import Dict, List, Tuple + +import numpy as np + +from Deeploy.AbstractDataTypes import PointerClass +from Deeploy.CommonExtensions.DataTypes import uint16_t +from Deeploy.DeeployTypes import NetworkContext, OperatorRepresentation +from Deeploy.TilingExtension.MemoryConstraints import NodeMemoryConstraint +from Deeploy.TilingExtension.TileConstraint import TileConstraint +from Deeploy.TilingExtension.TilerModel import TilerModel +from Deeploy.TilingExtension.TilingCodegen import AbsoluteHyperRectangle, HyperRectangle, TilingSchedule, \ + VariableReplacementScheme + + +class Pad2DTileConstraint(TileConstraint): + + @staticmethod + def addGeometricalConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: NetworkContext) -> TilerModel: + inputBufferName = parseDict['data_in'] + outputBufferName = parseDict['data_out'] + + buffersOfInterest = [inputBufferName, outputBufferName] + if 'pads_tensor' in parseDict: + buffersOfInterest.append(parseDict['pads_tensor']) + if 'value_tensor' in parseDict: + buffersOfInterest.append(parseDict['value_tensor']) + + for bufferName in buffersOfInterest: + tilerModel.addTensorDimToModel(ctxt, bufferName) + + channels_first = bool(parseDict.get('channels_first', 1)) + + if channels_first: + batchDim = 0 + channelDim = 1 + heightDim = 2 + widthDim = 3 + else: + batchDim = 0 + heightDim = 1 + widthDim = 2 + channelDim = 3 + + inputBatchVar = tilerModel.getTensorDimVar(inputBufferName, batchDim) + inputHeightVar = tilerModel.getTensorDimVar(inputBufferName, heightDim) + inputWidthVar = tilerModel.getTensorDimVar(inputBufferName, widthDim) + inputChannelVar = tilerModel.getTensorDimVar(inputBufferName, channelDim) + + outputBatchVar = tilerModel.getTensorDimVar(outputBufferName, batchDim) + outputHeightVar = tilerModel.getTensorDimVar(outputBufferName, heightDim) + outputWidthVar = tilerModel.getTensorDimVar(outputBufferName, widthDim) + outputChannelVar = tilerModel.getTensorDimVar(outputBufferName, channelDim) + + tilerModel.addConstraint(outputBatchVar == inputBatchVar) + tilerModel.addConstraint(outputWidthVar == inputWidthVar + 2 * parseDict['pad_x']) + tilerModel.addConstraint(outputChannelVar == inputChannelVar) + + # Height tiles may include top/bottom padding only at the boundary tiles. + tilerModel.addConstraint(outputHeightVar >= inputHeightVar) + tilerModel.addConstraint(outputHeightVar <= inputHeightVar + 2 * parseDict['pad_y']) + + return tilerModel + + @staticmethod + def addPolicyConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: NetworkContext) -> TilerModel: + inputBufferName = parseDict['data_in'] + outputBufferName = parseDict['data_out'] + + channels_first = bool(parseDict.get('channels_first', 1)) + + if channels_first: + batchDim = 0 + channelDim = 1 + heightDim = 2 + widthDim = 3 + else: + batchDim = 0 + heightDim = 1 + widthDim = 2 + channelDim = 3 + + inputBuffer = ctxt.lookup(inputBufferName) + outputBuffer = ctxt.lookup(outputBufferName) + + # Keep batch, width and channels whole to preserve contiguous NHWC row copies. + tilerModel.addConstraint(tilerModel.getTensorDimVar(inputBufferName, batchDim) == inputBuffer.shape[batchDim]) + tilerModel.addConstraint(tilerModel.getTensorDimVar(outputBufferName, batchDim) == outputBuffer.shape[batchDim]) + tilerModel.addConstraint(tilerModel.getTensorDimVar(inputBufferName, widthDim) == inputBuffer.shape[widthDim]) + tilerModel.addConstraint(tilerModel.getTensorDimVar(outputBufferName, widthDim) == outputBuffer.shape[widthDim]) + tilerModel.addConstraint( + tilerModel.getTensorDimVar(inputBufferName, channelDim) == inputBuffer.shape[channelDim]) + tilerModel.addConstraint( + tilerModel.getTensorDimVar(outputBufferName, channelDim) == outputBuffer.shape[channelDim]) + + # Avoid pure-padding tiles and avoid tail tiles with a single padding row. + outputHeightVar = tilerModel.getTensorDimVar(outputBufferName, heightDim) + tilerModel.addConstraint(outputHeightVar >= (parseDict['pad_y'] + 1)) + + return tilerModel + + @staticmethod + def _computeInputCube(outputCube: HyperRectangle, parseDict: Dict, + outputShape: Tuple[int, ...]) -> Tuple[HyperRectangle, int]: + channels_first = bool(parseDict.get('channels_first', 1)) + + if channels_first: + batchOffset, _, outputHOffset, _ = outputCube.offset + batchSize, _, outputHSize, _ = outputCube.dims + inputChannels = parseDict['dim_im_in_ch'] + inputWidth = parseDict['dim_im_in_y'] + else: + batchOffset, outputHOffset, _, _ = outputCube.offset + batchSize, outputHSize, _, _ = outputCube.dims + inputChannels = parseDict['dim_im_in_ch'] + inputWidth = parseDict['dim_im_in_y'] + + padTop = parseDict['pad_y'] + inputHeightTotal = parseDict['dim_im_in_x'] + outputHeightTotal = outputShape[2] if channels_first else outputShape[1] + + dataStart = padTop + dataEnd = padTop + inputHeightTotal + tileEnd = outputHOffset + outputHSize + + localPadTop = max(dataStart - outputHOffset, 0) + localPadBottom = max(tileEnd - dataEnd, 0) + + inputStart = max(outputHOffset, dataStart) - dataStart + inputEnd = min(tileEnd, dataEnd) - dataStart + inputHeight = max(inputEnd - inputStart, 1) + + if channels_first: + inCube = HyperRectangle((batchOffset, 0, inputStart, 0), + (batchSize, inputChannels, inputHeight, inputWidth)) + else: + inCube = HyperRectangle((batchOffset, inputStart, 0, 0), + (batchSize, inputHeight, inputWidth, inputChannels)) + + _ = localPadBottom # Bottom padding is encoded by the output height itself. + return inCube, localPadTop + + @classmethod + def serializeTilingSolution( + cls, tilingSolution: NodeMemoryConstraint, absoluteOutputCubes: List[AbsoluteHyperRectangle], + targetMemLevel: str, ctxt: NetworkContext, + operatorRepresentation: OperatorRepresentation) -> Tuple[VariableReplacementScheme, TilingSchedule]: + outputCubes = [cube.rectangle for cube in absoluteOutputCubes] + + addrNames = ['data_in', 'data_out'] + inputBaseOffsets, outputBaseOffsets = cls.extractBaseAddr(tilingSolution, targetMemLevel, + operatorRepresentation, addrNames) + + replacements: Dict[str, List[int]] = { + "batch": [], + "dim_im_in_x": [], + "dim_im_in_y": [], + "dim_im_in_ch": [], + "dim_im_out_x": [], + "dim_im_out_y": [], + "dim_im_out_ch": [], + "pad_y": [], + "data_out_size": [], + } + + replacementTypes = { + "batch": PointerClass(uint16_t), + "dim_im_in_x": PointerClass(uint16_t), + "dim_im_in_y": PointerClass(uint16_t), + "dim_im_in_ch": PointerClass(uint16_t), + "dim_im_out_x": PointerClass(uint16_t), + "dim_im_out_y": PointerClass(uint16_t), + "dim_im_out_ch": PointerClass(uint16_t), + "pad_y": PointerClass(uint16_t), + "data_out_size": PointerClass(uint16_t), + } + + outputShape = tuple(ctxt.lookup(operatorRepresentation['data_out']).shape) + channels_first = bool(operatorRepresentation.get('channels_first', 1)) + + inputLoadSchedule = [] + outputLoadSchedule = [] + + for outCube in outputCubes: + inCube, localPadTop = cls._computeInputCube(outCube, operatorRepresentation, outputShape) + + replacements["batch"].append(outCube.dims[0]) + replacements["pad_y"].append(localPadTop) + replacements["data_out_size"].append(int(np.prod(outCube.dims))) + + if channels_first: + replacements["dim_im_in_ch"].append(inCube.dims[1]) + replacements["dim_im_in_x"].append(inCube.dims[2]) + replacements["dim_im_in_y"].append(inCube.dims[3]) + replacements["dim_im_out_ch"].append(outCube.dims[1]) + replacements["dim_im_out_x"].append(outCube.dims[2]) + replacements["dim_im_out_y"].append(outCube.dims[3]) + else: + replacements["dim_im_in_x"].append(inCube.dims[1]) + replacements["dim_im_in_y"].append(inCube.dims[2]) + replacements["dim_im_in_ch"].append(inCube.dims[3]) + replacements["dim_im_out_x"].append(outCube.dims[1]) + replacements["dim_im_out_y"].append(outCube.dims[2]) + replacements["dim_im_out_ch"].append(outCube.dims[3]) + + inputLoadSchedule.append({"data_in": inCube}) + outputLoadSchedule.append({"data_out": outCube}) + + tilingSchedule = TilingSchedule(inputBaseOffsets, outputBaseOffsets, inputLoadSchedule, outputLoadSchedule) + variableReplacementSchedule = VariableReplacementScheme(replacements, replacementTypes) + + return variableReplacementSchedule, tilingSchedule diff --git a/Deeploy/Targets/PULPOpen/TileConstraints/SliceConstraint.py b/Deeploy/Targets/PULPOpen/TileConstraints/SliceConstraint.py index 5309300659..4a1993c45e 100644 --- a/Deeploy/Targets/PULPOpen/TileConstraints/SliceConstraint.py +++ b/Deeploy/Targets/PULPOpen/TileConstraints/SliceConstraint.py @@ -19,6 +19,12 @@ class SliceTileConstraint(TileConstraint): + @staticmethod + def _resolveImmediate(value, ctxt: NetworkContext): + if isinstance(value, str): + return np.asarray(ctxt.lookup(value).values).astype(np.int64).reshape(-1) + return np.asarray(value).astype(np.int64).reshape(-1) + @staticmethod def addGeometricalConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: NetworkContext) -> TilerModel: @@ -31,8 +37,8 @@ def addGeometricalConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: Netw inputShape = parseDict['data_in_shape'] # Get other necessary information - sliceAxes = parseDict['axes'] - sliceSteps = parseDict['steps'] + sliceAxes = SliceTileConstraint._resolveImmediate(parseDict['axes'], ctxt) + sliceSteps = SliceTileConstraint._resolveImmediate(parseDict['steps'], ctxt) # ===== ADD I/O DIMENSIONS TO THE MODEL AS VARIABLES ===== for bufferName in [inputBufferName, outputBufferName]: @@ -83,10 +89,14 @@ def computeInputCubeFromOutputCube(outputCube: AbsoluteHyperRectangle, parseDict in_cube_offset = list(outputCube.offset).copy() # Iterate through the sliced axes - for idx, ax in enumerate(parseDict['axes']): + sliceAxes = np.asarray(parseDict['axes']).astype(np.int64).reshape(-1) + sliceStarts = np.asarray(parseDict['starts']).astype(np.int64).reshape(-1) + sliceSteps = np.asarray(parseDict['steps']).astype(np.int64).reshape(-1) + + for idx, ax in enumerate(sliceAxes): # Get current sliced ax parameters - start = parseDict['starts'][idx] - step = parseDict['steps'][idx] + start = sliceStarts[idx] + step = sliceSteps[idx] # Compute input cube parameters for the current axis in_cube_dims[ax] = (outputCube.dims[ax] - 1) * step + 1 @@ -149,12 +159,18 @@ def serializeTilingSolution( outputLoadSchedule = [] for out_cube in outputCubes: + immediateOperatorRepresentation = operatorRepresentation.copy() + immediateOperatorRepresentation['axes'] = cls._resolveImmediate(operatorRepresentation['axes'], ctxt) + immediateOperatorRepresentation['starts'] = cls._resolveImmediate(operatorRepresentation['starts'], ctxt) + immediateOperatorRepresentation['steps'] = cls._resolveImmediate(operatorRepresentation['steps'], ctxt) + # Compute input cube - in_cube = SliceTileConstraint.computeInputCubeFromOutputCube(out_cube, parseDict = operatorRepresentation) + in_cube = SliceTileConstraint.computeInputCubeFromOutputCube(out_cube, + parseDict = immediateOperatorRepresentation) # Compute new ends for replacement new_ends = list() - for ax in operatorRepresentation['axes']: + for ax in immediateOperatorRepresentation['axes']: new_ends.append(in_cube.offset[ax] + in_cube.dims[ax]) # Append replacement elements diff --git a/Deeploy/Targets/PULPOpen/Tiler.py b/Deeploy/Targets/PULPOpen/Tiler.py index 901106459e..75856b0314 100644 --- a/Deeploy/Targets/PULPOpen/Tiler.py +++ b/Deeploy/Targets/PULPOpen/Tiler.py @@ -14,10 +14,12 @@ from Deeploy.Targets.Generic.TileConstraints.RQSiHardswishTileConstraint import RQSiHardswishTileConstraint from Deeploy.Targets.Generic.TileConstraints.TransposeTileConstraint import TransposeTileConstraint from Deeploy.Targets.Generic.TileConstraints.UnaryTileConstraint import UnaryTileConstraint -from Deeploy.Targets.PULPOpen.Bindings import PULPAddBindings, PULPConcatBindings, PULPFloatConv2DBindings, \ - PULPFloatDWConv2DBindings, PULPFloatGELUBinding, PULPFloatGELUGradBinding, PULPFloatGEMMBindings, \ - PULPGatherBindings, PULPiHardswishBindings, PULPiRMSNormBindings, PULPiRQSGELUBindings, PULPLayernormBinding, \ - PULPLayernormGradBinding, PULPMatMulBindings, PULPMaxPool1DBindings, PULPMaxPool2DBindings, PULPMulBindings, \ +from Deeploy.Targets.Generic.TileConstraints.UntiledTileConstraint import UntiledTileConstraint +from Deeploy.Targets.PULPOpen.Bindings import PULPAddBindings, PULPBatchNormBindings, PULPConcatBindings, \ + PULPFloatConv2DBindings, PULPFloatConvTranspose2DBindings, PULPFloatDWConv2DBindings, PULPFloatGELUBinding, \ + PULPFloatGELUGradBinding, PULPFloatGEMMBindings, PULPGatherBindings, PULPiHardswishBindings, PULPiRMSNormBindings, \ + PULPiRQSGELUBindings, PULPLayernormBinding, PULPLayernormGradBinding, PULPMatMulBindings, PULPMaxPool1DBindings, \ + PULPMaxPool2DBindings, PULPMulBindings, PULPPad1DBindings, PULPPad2DBindings, PULPReduceLogSumExpBindings, \ PULPReduceMeanBindings, PULPReduceSumBindings, PULPReluBinding, PULPReshapeBindings, PULPRQAddBindings, \ PULPRQSBindings, PULPRQSConv1DBindings, PULPRQSConv2DBindings, PULPRQSDWConv2DBindings, PULPRQSGEMMBindings, \ PULPRQSiHardswishBindings, PULPRQSMatrixVecBindings, PULPRQSTallGEMMBindings, PULPSGDBindings, PULPSliceBindings, \ @@ -25,6 +27,7 @@ PULPSoftmaxGradBindings, PULPTransposeBindings, PULPUniformRQSBindings from Deeploy.Targets.PULPOpen.TileConstraints.ConvTileConstraint import Conv2DTileConstraint, RQConv1DTileConstraint, \ RQConv2DTileConstraint +from Deeploy.Targets.PULPOpen.TileConstraints.ConvTransposeTileConstraint import ConvTranspose2DTileConstraint from Deeploy.Targets.PULPOpen.TileConstraints.DWConvTileConstraint import DWConv2DTileConstraint, \ RQDWConv2DTileConstraint from Deeploy.Targets.PULPOpen.TileConstraints.GatherTileConstraint import GatherTileConstraint @@ -40,7 +43,6 @@ from Deeploy.Targets.PULPOpen.TileConstraints.ReduceSumTileConstraint import ReduceSumTileConstraint from Deeploy.Targets.PULPOpen.TileConstraints.RequantShiftTileConstraint import RequantShiftTileConstraint from Deeploy.Targets.PULPOpen.TileConstraints.SGDTileConstraint import SGDTileConstraint -from Deeploy.Targets.PULPOpen.TileConstraints.SliceConstraint import SliceTileConstraint from Deeploy.Targets.PULPOpen.TileConstraints.SoftmaxCrossEntropyTileConstraint import \ SoftmaxCrossEntropyGradTileConstraint, SoftmaxCrossEntropyTileConstraint from Deeploy.TilingExtension.TilerExtension import TilingReadyNodeBindings @@ -57,6 +59,12 @@ PULPConv2DTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = PULPFloatConv2DBindings, tileConstraint = Conv2DTileConstraint()) +PULPConv2DUntiledTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = PULPFloatConv2DBindings, + tileConstraint = UntiledTileConstraint()) + +PULPConvTranspose2DTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = PULPFloatConvTranspose2DBindings, + tileConstraint = ConvTranspose2DTileConstraint()) + PULPDWConv2DTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = PULPFloatDWConv2DBindings, tileConstraint = DWConv2DTileConstraint()) @@ -95,6 +103,9 @@ PULPMaxPool2DTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = PULPMaxPool2DBindings, tileConstraint = MaxPoolCTileConstraint()) +PULPMaxPool2DUntiledTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = PULPMaxPool2DBindings, + tileConstraint = UntiledTileConstraint()) + PULPRQSTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = PULPRQSBindings, tileConstraint = RequantShiftTileConstraint()) @@ -152,11 +163,25 @@ PULPReduceSumTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = PULPReduceSumBindings, tileConstraint = ReduceSumTileConstraint()) +_PULPReduceLogSumExpBindings = copy.deepcopy(PULPReduceLogSumExpBindings) + +PULPReduceLogSumExpTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = _PULPReduceLogSumExpBindings, + tileConstraint = UntiledTileConstraint()) + PULPSGDTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = PULPSGDBindings, tileConstraint = SGDTileConstraint()) PULPSliceTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = PULPSliceBindings, - tileConstraint = SliceTileConstraint()) + tileConstraint = UntiledTileConstraint()) PULPReduceMeanTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = PULPReduceMeanBindings, tileConstraint = ReduceMeanTileConstraint()) + +PULPBatchNormTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = PULPBatchNormBindings, + tileConstraint = UntiledTileConstraint()) + +PULPPad1DTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = PULPPad1DBindings, + tileConstraint = UntiledTileConstraint()) + +PULPPad2DTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = PULPPad2DBindings, + tileConstraint = UntiledTileConstraint()) diff --git a/Deeploy/TilingExtension/CodeTransformationPasses/TilingCodeGeneration.py b/Deeploy/TilingExtension/CodeTransformationPasses/TilingCodeGeneration.py index 51f87534ea..1cbd64bbe7 100644 --- a/Deeploy/TilingExtension/CodeTransformationPasses/TilingCodeGeneration.py +++ b/Deeploy/TilingExtension/CodeTransformationPasses/TilingCodeGeneration.py @@ -187,6 +187,7 @@ def _legalizeTransfers(self, transfers: List[HyperRectangle], outerShape: Tuple[ if isFinalMemoryLevel: minimizedTransfers = [] + fallbackToSharedOuterShape = False for rect in transfers: paddedRect = HyperRectangle(padOffset(rect.offset, commonRank), padShape(rect.dims, commonRank)) minRect, newMinOuterShape = minimizeRectangle(paddedRect, outerShape) @@ -194,13 +195,26 @@ def _legalizeTransfers(self, transfers: List[HyperRectangle], outerShape: Tuple[ minOuterShape = newMinOuterShape else: if minOuterShape != newMinOuterShape: - rectStr = "\n".join(str(trans) for trans in transfers[:transfers.index(rect)]) - raise RuntimeError(f"""Currently support a single minimal outer shape. -Old minOuterShape: {minOuterShape} vs. new minOuterShape {newMinOuterShape}. -New minOuterShape produced by outerDims: {outerShape} and rect: {rect}. -Old minOuterShape produced by outerDims: {outerShape} and rects: -{rectStr}""") + # Keep a shared outer shape when different tiles of the same external tensor + # minimize differently. This shows up for L3->L2 transposed DMA on residual + # tiles: a full-width tile may collapse to 1D, while the tail tile keeps a + # higher-rank shape. Falling back to the common padded shape is conservative + # but preserves the existing tiling structure and avoids introducing + # layer-specific hacks here. + # + # A better long-term fix would be earlier in tiling propagation: if an + # intermediate layer such as a Transpose fits entirely in L2, keep it whole + # there instead of inheriting a downstream residual tiling split. That would + # minimize L3 traffic rather than only making DMA legalization robust. + fallbackToSharedOuterShape = True + break minimizedTransfers.append(minRect) + if fallbackToSharedOuterShape: + minimizedTransfers = [ + HyperRectangle(padOffset(rect.offset, commonRank), padShape(rect.dims, commonRank)) + for rect in transfers + ] + minOuterShape = outerShape else: minimizedTransfers = [HyperRectangle((0,), (int(np.prod(rect.dims)),)) for rect in transfers] minOuterShape = (int(np.prod(outerShape)),) diff --git a/Deeploy/TilingExtension/CodeTransformationPasses/TilingHoistingMixIn.py b/Deeploy/TilingExtension/CodeTransformationPasses/TilingHoistingMixIn.py index 8a0c1b9b54..bbcbc512cc 100644 --- a/Deeploy/TilingExtension/CodeTransformationPasses/TilingHoistingMixIn.py +++ b/Deeploy/TilingExtension/CodeTransformationPasses/TilingHoistingMixIn.py @@ -70,7 +70,21 @@ def _hoistReference(self, shape: Tuple[int, ...] = (1,), offset: Union[int, str, VariableBuffer] = 0, override_type: Optional[Type[BaseType]] = None) -> _ReferenceBuffer: - ref = ctxt.hoistReference(self.prefix + name, reference, shape, offset, override_type) + refName = self.prefix + name + if ctxt.is_local(refName): + ref = ctxt.lookup(refName) + assert isinstance(ref, _ReferenceBuffer) + assert ref._referenceName == reference.name + assert tuple(ref.shape) == tuple(shape) + expectedOffset = offset.name if isinstance(offset, VariableBuffer) else offset + assert ref._offset == expectedOffset + if override_type is not None: + assert ref._type == PointerClass(override_type) + else: + assert ref._type == reference._type + return ref + + ref = ctxt.hoistReference(refName, reference, shape, offset, override_type) ref._memoryLevel = self.memory return ref diff --git a/DeeployTest/Platforms/Generic/main.c b/DeeployTest/Platforms/Generic/main.c index e2b0449fb5..20fb980510 100644 --- a/DeeployTest/Platforms/Generic/main.c +++ b/DeeployTest/Platforms/Generic/main.c @@ -25,6 +25,23 @@ int main() { printf("Running network...\r\n"); RunNetwork(0, 1); + const char *dump_outputs = getenv("DEEPLOY_DUMP_OUTPUTS"); + if (dump_outputs && dump_outputs[0] != '\0') { + for (uint32_t buf = 0; buf < DeeployNetwork_num_outputs; buf++) { + uint32_t count = DeeployNetwork_outputs_bytes[buf] / sizeof(OUTPUTTYPE); + printf("OUTPUT %u %u\r\n", buf, count); + for (uint32_t i = 0; i < count; i++) { + OUTPUTTYPE actual = ((OUTPUTTYPE *)DeeployNetwork_outputs[buf])[i]; +#if ISOUTPUTFLOAT == 1 + printf("%.9g\r\n", (double)actual); +#else + printf("%d\r\n", actual); +#endif + } + } + return 0; + } + int32_t tot_err = 0; uint32_t tot = 0; OUTPUTTYPE diff; @@ -62,4 +79,4 @@ int main() { printf("Errors: %d out of %d \r\n", tot_err, tot); return tot_err; -} \ No newline at end of file +} diff --git a/DeeployTest/Platforms/PULPOpen/src/deeploytest.c b/DeeployTest/Platforms/PULPOpen/src/deeploytest.c index 11d889e48d..1252edd518 100644 --- a/DeeployTest/Platforms/PULPOpen/src/deeploytest.c +++ b/DeeployTest/Platforms/PULPOpen/src/deeploytest.c @@ -4,6 +4,8 @@ * SPDX-License-Identifier: Apache-2.0 */ +#include + #include "CycleCounter.h" #include "Network.h" #include "dory_mem.h" @@ -13,6 +15,8 @@ #define MAINSTACKSIZE 8000 #define SLAVESTACKSIZE 3800 +#define FLOAT_ABS_TOL 1e-4f +#define FLOAT_REL_TOL 1e-5f struct pi_device cluster_dev; @@ -40,8 +44,15 @@ void CompareFloatOnCluster(void *args) { float expected_val = expected[i]; float actual_val = actual[i]; float diff = expected_val - actual_val; + float abs_diff = fabsf(diff); + float scale = fabsf(expected_val); + float abs_actual = fabsf(actual_val); + if (abs_actual > scale) { + scale = abs_actual; + } + float tolerance = FLOAT_ABS_TOL + FLOAT_REL_TOL * scale; - if ((diff < -1e-4) || (diff > 1e-4) || isnan(diff)) { + if ((abs_diff > tolerance) || isnan(diff)) { local_err_count += 1; printf("Expected: %10.6f ", expected_val); @@ -125,39 +136,38 @@ int main(void) { compbuf = DeeployNetwork_outputs[buf]; } - if (ISOUTPUTFLOAT) { - float_error_count = 0; - float_compare_args.expected = testOutputVector[buf]; - float_compare_args.actual = compbuf; - float_compare_args.num_elements = - DeeployNetwork_outputs_bytes[buf] / sizeof(float); - float_compare_args.output_buf_index = buf; - float_compare_args.err_count = &float_error_count; - - pi_cluster_task(&cluster_task, CompareFloatOnCluster, - &float_compare_args); - cluster_task.stack_size = MAINSTACKSIZE; - cluster_task.slave_stack_size = SLAVESTACKSIZE; - pi_cluster_send_task_to_cl(&cluster_dev, &cluster_task); - - tot_err += float_error_count; - } else { - - for (uint32_t i = 0; - i < DeeployNetwork_outputs_bytes[buf] / sizeof(OUTPUTTYPE); i++) { - OUTPUTTYPE expected = ((OUTPUTTYPE *)testOutputVector[buf])[i]; - OUTPUTTYPE actual = ((OUTPUTTYPE *)compbuf)[i]; - int32_t error = expected - actual; - OUTPUTTYPE diff = (OUTPUTTYPE)(error < 0 ? -error : error); - - if (diff) { - tot_err += 1; - printf("Expected: %4d ", expected); - printf("Actual: %4d ", actual); - printf("Diff: %4d at Index %12u in Output %u\r\n", diff, i, buf); - } +#if ISOUTPUTFLOAT == 1 + float_error_count = 0; + float_compare_args.expected = testOutputVector[buf]; + float_compare_args.actual = compbuf; + float_compare_args.num_elements = + DeeployNetwork_outputs_bytes[buf] / sizeof(float); + float_compare_args.output_buf_index = buf; + float_compare_args.err_count = &float_error_count; + + pi_cluster_task(&cluster_task, CompareFloatOnCluster, &float_compare_args); + cluster_task.stack_size = MAINSTACKSIZE; + cluster_task.slave_stack_size = SLAVESTACKSIZE; + pi_cluster_send_task_to_cl(&cluster_dev, &cluster_task); + + tot_err += float_error_count; +#else + + for (uint32_t i = 0; + i < DeeployNetwork_outputs_bytes[buf] / sizeof(OUTPUTTYPE); i++) { + OUTPUTTYPE expected = ((OUTPUTTYPE *)testOutputVector[buf])[i]; + OUTPUTTYPE actual = ((OUTPUTTYPE *)compbuf)[i]; + int32_t error = expected - actual; + OUTPUTTYPE diff = (OUTPUTTYPE)(error < 0 ? -error : error); + + if (diff) { + tot_err += 1; + printf("Expected: %4d ", expected); + printf("Actual: %4d ", actual); + printf("Diff: %4d at Index %12u in Output %u\r\n", diff, i, buf); } } +#endif if ((uint32_t)DeeployNetwork_outputs[buf] < 0x1000000) { pi_l2_free(compbuf, (int)DeeployNetwork_outputs_bytes[buf]); } @@ -167,4 +177,4 @@ int main(void) { printf("Errors: %u out of %u \r\n", tot_err, tot_tested); return (int)tot_err; -} \ No newline at end of file +} diff --git a/DeeployTest/Platforms/Siracusa/src/deeploytest.c b/DeeployTest/Platforms/Siracusa/src/deeploytest.c index 11d889e48d..1252edd518 100644 --- a/DeeployTest/Platforms/Siracusa/src/deeploytest.c +++ b/DeeployTest/Platforms/Siracusa/src/deeploytest.c @@ -4,6 +4,8 @@ * SPDX-License-Identifier: Apache-2.0 */ +#include + #include "CycleCounter.h" #include "Network.h" #include "dory_mem.h" @@ -13,6 +15,8 @@ #define MAINSTACKSIZE 8000 #define SLAVESTACKSIZE 3800 +#define FLOAT_ABS_TOL 1e-4f +#define FLOAT_REL_TOL 1e-5f struct pi_device cluster_dev; @@ -40,8 +44,15 @@ void CompareFloatOnCluster(void *args) { float expected_val = expected[i]; float actual_val = actual[i]; float diff = expected_val - actual_val; + float abs_diff = fabsf(diff); + float scale = fabsf(expected_val); + float abs_actual = fabsf(actual_val); + if (abs_actual > scale) { + scale = abs_actual; + } + float tolerance = FLOAT_ABS_TOL + FLOAT_REL_TOL * scale; - if ((diff < -1e-4) || (diff > 1e-4) || isnan(diff)) { + if ((abs_diff > tolerance) || isnan(diff)) { local_err_count += 1; printf("Expected: %10.6f ", expected_val); @@ -125,39 +136,38 @@ int main(void) { compbuf = DeeployNetwork_outputs[buf]; } - if (ISOUTPUTFLOAT) { - float_error_count = 0; - float_compare_args.expected = testOutputVector[buf]; - float_compare_args.actual = compbuf; - float_compare_args.num_elements = - DeeployNetwork_outputs_bytes[buf] / sizeof(float); - float_compare_args.output_buf_index = buf; - float_compare_args.err_count = &float_error_count; - - pi_cluster_task(&cluster_task, CompareFloatOnCluster, - &float_compare_args); - cluster_task.stack_size = MAINSTACKSIZE; - cluster_task.slave_stack_size = SLAVESTACKSIZE; - pi_cluster_send_task_to_cl(&cluster_dev, &cluster_task); - - tot_err += float_error_count; - } else { - - for (uint32_t i = 0; - i < DeeployNetwork_outputs_bytes[buf] / sizeof(OUTPUTTYPE); i++) { - OUTPUTTYPE expected = ((OUTPUTTYPE *)testOutputVector[buf])[i]; - OUTPUTTYPE actual = ((OUTPUTTYPE *)compbuf)[i]; - int32_t error = expected - actual; - OUTPUTTYPE diff = (OUTPUTTYPE)(error < 0 ? -error : error); - - if (diff) { - tot_err += 1; - printf("Expected: %4d ", expected); - printf("Actual: %4d ", actual); - printf("Diff: %4d at Index %12u in Output %u\r\n", diff, i, buf); - } +#if ISOUTPUTFLOAT == 1 + float_error_count = 0; + float_compare_args.expected = testOutputVector[buf]; + float_compare_args.actual = compbuf; + float_compare_args.num_elements = + DeeployNetwork_outputs_bytes[buf] / sizeof(float); + float_compare_args.output_buf_index = buf; + float_compare_args.err_count = &float_error_count; + + pi_cluster_task(&cluster_task, CompareFloatOnCluster, &float_compare_args); + cluster_task.stack_size = MAINSTACKSIZE; + cluster_task.slave_stack_size = SLAVESTACKSIZE; + pi_cluster_send_task_to_cl(&cluster_dev, &cluster_task); + + tot_err += float_error_count; +#else + + for (uint32_t i = 0; + i < DeeployNetwork_outputs_bytes[buf] / sizeof(OUTPUTTYPE); i++) { + OUTPUTTYPE expected = ((OUTPUTTYPE *)testOutputVector[buf])[i]; + OUTPUTTYPE actual = ((OUTPUTTYPE *)compbuf)[i]; + int32_t error = expected - actual; + OUTPUTTYPE diff = (OUTPUTTYPE)(error < 0 ? -error : error); + + if (diff) { + tot_err += 1; + printf("Expected: %4d ", expected); + printf("Actual: %4d ", actual); + printf("Diff: %4d at Index %12u in Output %u\r\n", diff, i, buf); } } +#endif if ((uint32_t)DeeployNetwork_outputs[buf] < 0x1000000) { pi_l2_free(compbuf, (int)DeeployNetwork_outputs_bytes[buf]); } @@ -167,4 +177,4 @@ int main(void) { printf("Errors: %u out of %u \r\n", tot_err, tot_tested); return (int)tot_err; -} \ No newline at end of file +} diff --git a/DeeployTest/Tests/Kernels/FP32/Conv2D/inputs.npz b/DeeployTest/Tests/Kernels/FP32/Conv2D/inputs.npz new file mode 100644 index 0000000000..54a2d380c8 Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/Conv2D/inputs.npz differ diff --git a/DeeployTest/Tests/Kernels/FP32/Conv2D/network.onnx b/DeeployTest/Tests/Kernels/FP32/Conv2D/network.onnx new file mode 100644 index 0000000000..87846a20cd --- /dev/null +++ b/DeeployTest/Tests/Kernels/FP32/Conv2D/network.onnx @@ -0,0 +1,26 @@ +pytorch2.5.1: + +input + conv.weight + conv.biasoutput +/conv/Conv"Conv* + dilations@@* +group* + kernel_shape@@* +pads@@@@* +strides@@ +main_graph*=B conv.weightJ$V#(7>dwV3{Nw=iػQ>T*B conv.biasJf=Z +input + + + + + +b +output + + + + + +B \ No newline at end of file diff --git a/DeeployTest/Tests/Kernels/FP32/Conv2D/outputs.npz b/DeeployTest/Tests/Kernels/FP32/Conv2D/outputs.npz new file mode 100644 index 0000000000..af275c1945 Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/Conv2D/outputs.npz differ diff --git a/DeeployTest/Tests/Kernels/FP32/ConvTranspose2D/inputs.npz b/DeeployTest/Tests/Kernels/FP32/ConvTranspose2D/inputs.npz new file mode 100644 index 0000000000..ec6380bb46 Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/ConvTranspose2D/inputs.npz differ diff --git a/DeeployTest/Tests/Kernels/FP32/ConvTranspose2D/network.onnx b/DeeployTest/Tests/Kernels/FP32/ConvTranspose2D/network.onnx new file mode 100644 index 0000000000..3c7e61866d Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/ConvTranspose2D/network.onnx differ diff --git a/DeeployTest/Tests/Kernels/FP32/ConvTranspose2D/outputs.npz b/DeeployTest/Tests/Kernels/FP32/ConvTranspose2D/outputs.npz new file mode 100644 index 0000000000..229a14c1ca Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/ConvTranspose2D/outputs.npz differ diff --git a/DeeployTest/Tests/Models/Autoencoder2D/TEST_REPORT.md b/DeeployTest/Tests/Models/Autoencoder2D/TEST_REPORT.md new file mode 100644 index 0000000000..dc6c8a789f --- /dev/null +++ b/DeeployTest/Tests/Models/Autoencoder2D/TEST_REPORT.md @@ -0,0 +1,280 @@ +# Test Report - Autoencoder2D + +Questo file raccoglie i risultati dei test eseguiti durante il debug e le osservazioni tecniche. +Aggiornare questo report ad ogni nuovo test. + +Nota organizzativa: +- dal 2026-05-04 questo report e' mantenuto in `Tests/Models/Autoencoder2D/TEST_REPORT.md` (prima era stato salvato per errore sotto `Autoencoder2D_GMM`). + +## Sessione corrente (2026-05-04) + +### 1) Generic - Autoencoder2D (tentativo iniziale) +- Comando: + - `cd /workspaces/Deeploy/DeeployTest` + - `python deeployRunner_generic.py -t Tests/Models/Autoencoder2D` +- Esito: `FAILED` +- Errore principale: + - parsing fallito su nodo `encoderlayer1paddingPad` + - `Did not find adequate mapping for graph ... Candidates: ['Pad1DParser', 'Pad2DParser']` + +### 2) Generic - Autoencoder2D (retry) +- Comando: + - `cd /workspaces/Deeploy/DeeployTest` + - `python deeployRunner_generic.py -t Tests/Models/Autoencoder2D` +- Esito: `PASSED` +- Risultato: + - `Errors: 0 out of 160` +- Note: + - warning di compilazione non bloccanti (unused vars / conversioni implicite). + +### 3) Siracusa no-tiling con simulazione +- Comando: + - `cd /workspaces/Deeploy/DeeployTest` + - `python deeployRunner_siracusa.py -t Tests/Models/Autoencoder2D` +- Esito: `FAILED` +- Stato: + - build completata + - errore in fase gvsoc/runtime (`Invalid fetch request`), con fallimento target `gvsoc_Autoencoder2D`. + +### 4) Siracusa no-tiling senza simulazione (build-only) +- Comando: + - `cd /workspaces/Deeploy/DeeployTest` + - `python deeployRunner_siracusa.py -t Tests/Models/Autoencoder2D --skipsim` +- Esito: `PASSED` +- Note: + - confermato che `--skipsim` evita l'esecuzione della simulazione e valida solo generate/build. + +### 5) Siracusa tiled senza simulazione (default L2) +- Comando: + - `cd /workspaces/Deeploy/DeeployTest` + - `python deeployRunner_tiled_siracusa.py -t Tests/Models/Autoencoder2D --skipsim` +- Esito: `FAILED` +- Errore principale: + - tiling/memory allocation fallita + - `Memory allocator failed ... L2 with capacity of -409076 bytes` + - `minimalloc` invocato con capacity negativa. + +### 6) Siracusa tiled senza simulazione (default L3, L2 aumentata) +- Comando: + - `cd /workspaces/Deeploy/DeeployTest` + - `python deeployRunner_tiled_siracusa.py -t Tests/Models/Autoencoder2D --skipsim --defaultMemLevel=L3 --l2=3000000` +- Esito: `PASSED` +- Note: + - configurazione stabile per generate/build tiled su Autoencoder2D. + +### 7) Siracusa no-tiling con simulazione (retry) +- Comando: + - `cd /workspaces/Deeploy/DeeployTest` + - `python deeployRunner_siracusa.py -t Tests/Models/Autoencoder2D` +- Esito: `FAILED` +- Stato: + - build completata + - errore in fase post-build gvsoc: `Error copying file ... build_master/*.bin ... gvsoc_workdir/` + - durante l'esecuzione compare ancora `Invalid fetch request` su PE cluster. + +### 8) Siracusa tiled con simulazione (default L3, L2 aumentata) +- Comando: + - `cd /workspaces/Deeploy/DeeployTest` + - `python deeployRunner_tiled_siracusa.py -t Tests/Models/Autoencoder2D --defaultMemLevel=L3 --l2=3000000` +- Esito: `FAILED` +- Stato: + - build completata + - in simulazione il network gira fino al confronto finale + - mismatch numerico completo: `Errors: 160 out of 160` + - runtime riportato: `12910631 cycles` + +### 9) testMVP Siracusa_w_neureka con `--doublebuffer` +- Comando: + - `cd /workspaces/Deeploy/DeeployTest` + - `python testMVP.py -p Siracusa_w_neureka -t /workspaces/Deeploy/DeeployTest/Tests/Models/Autoencoder2D --doublebuffer` +- Esito: `FAILED` +- Errore principale: + - binding/tiling fallisce prima della simulazione + - `RuntimeError: ERROR: Some geometrical constraints are infeasible` + - stack principale in: + - `Deeploy/TilingExtension/TilerModel.py:269` (`debugConstraints`) + - `Deeploy/TilingExtension/TilerModel.py:358` (`trySolveModel`) + - `Deeploy/TilingExtension/TilerExtension.py:316` (`computeTilingSchedule`) +- Note tecniche: + - senza `--doublebuffer` lo stesso comando non fallisce in geometria, ma dopo in allocazione memoria (`minimalloc`, capacity L2 negativa). + - questo indica che il blocco introdotto da `--doublebuffer` rende il sistema di vincoli geometrici infeasible. + +### 10) Verifica path di import Python (run reale vs codice locale) +- Comando: + - `cd /workspaces/Deeploy/DeeployTest` + - `python -c "import Deeploy; print(Deeploy.__file__)"` +- Esito: + - import di default da `/app/Deeploy/Deeploy/__init__.py` (non da `/workspaces/Deeploy/...`). +- Verifica aggiuntiva: + - forzando `PYTHONPATH=/workspaces/Deeploy` il traceback punta al sorgente locale ma l'errore geometrico resta identico. +- Conclusione: + - il problema non dipende da una mismatch di package, ma da vincoli realmente infeasible nel flusso `--doublebuffer`. + +## Sessione corrente (2026-05-05) + +Obiettivo: +- riprendere il debug dopo i microblocchi `Autoencoder2D_MicroBlocks`; +- confermare i target richiesti: + - Generic + - Siracusa tiled con memoria standard L3 + - Siracusa tiled con Neureka +- capire perche' `Encoder_mini` era stato risolto ma `Autoencoder2D` completo falliva ancora. + +### 11) Generic - Autoencoder2D dopo fix ConvTranspose +- Comando: + - `cd /workspaces/Deeploy/DeeployTest` + - `PYTHONPATH=/workspaces/Deeploy python deeployRunner_generic.py -t Tests/Models/Autoencoder2D` +- Esito: `PASSED` +- Risultato: + - `Errors: 0 out of 160` +- Nota: + - il backend Generic non era piu' il problema per il modello completo. + +### 12) Siracusa tiled L3 - Autoencoder2D prima del fix Conv FP +- Comando: + - `cd /workspaces/Deeploy/DeeployTest` + - `PYTHONPATH=/workspaces/Deeploy python deeployRunner_tiled_siracusa.py -t Tests/Models/Autoencoder2D --defaultMemLevel=L3 --l2=3000000` +- Esito: `FAILED` +- Risultato: + - mismatch numerico completo: `Errors: 160 out of 160` + - runtime osservato: circa `12924041 cycles` +- Nota: + - il modello compilava ed eseguiva, ma l'output finale era completamente errato. + +### 13) Siracusa tiled L3 + Neureka - Autoencoder2D prima del fix Conv FP +- Comando: + - `cd /workspaces/Deeploy/DeeployTest` + - `PYTHONPATH=/workspaces/Deeploy python deeployRunner_tiled_siracusa_w_neureka.py -t Tests/Models/Autoencoder2D --defaultMemLevel=L3 --l2=3000000` +- Esito: `FAILED` +- Risultato: + - mismatch numerico completo: `Errors: 160 out of 160` + - runtime osservato: circa `12961863 cycles` +- Nota: + - il fallimento era coerente con Siracusa tiled senza Neureka. + +### 14) Diagnostica per output intermedi +- Sono stati creati test ONNX intermedi in `/tmp/deeploy_diag_auto2d` e `/tmp/deeploy_diag_auto2d_enc`. +- Punti principali generati: + - encoder Conv/Pool/ReLU/Flatten/Gemm + - decoder Gemm/ConvTranspose/Conv/BatchNorm/last Conv/Slice +- Osservazione iniziale: + - `auto_enc_linear` falliva in Siracusa tiled, ma questo non indicava un bug GEMM. + - Risalendo la catena, `auto_relu_enc2` era gia' errato. + - Risalendo ancora, `auto_conv_enc1` falliva con `Errors: 6400 out of 6400`. +- Conclusione: + - la divergenza nasceva gia' dalla primissima Conv FP dell'encoder. + - i layer lineari dei microblocchi erano corretti; nel modello completo ricevevano input gia' corrotto. + +### 15) Causa trovata: overlap L1 tra bias e buffer im2col +- Nel C generato per `auto_conv_enc1`: + - il kernel chiamato era `PULP_Conv2d_Im2Col_fp32_fp32_fp32_HWC`; + - la bias era allocata in L1 a un offset che ricadeva dentro l'area usata dal buffer transient `im2col`; + - il kernel parallelizza sul numero di core e usa una porzione di `im2col` per ogni core. +- Causa tecnica: + - `PULP2DFloatConvIm2ColTemplate.computeTransientBuffersSize(...)` dimensionava il transient buffer usando `operatorRepresentation["n_cores"]`; + - il valore `operatorRepresentation["n_cores"]` arrivava da `generateNetwork.py --cores`; + - `deeployRunner.py` passava `args.cores` a CMake come `-DNUM_CORES=...`, ma non lo propagava anche agli argomenti di generazione; + - quindi `generateNetwork.py` usava il suo default `--cores=1`, mentre il C compilato/eseguito usava `NUM_CORES=8`; + - quindi veniva riservato spazio solo per 1 core, ma il kernel ne usava 8, sovrascrivendo la bias. +- Effetto: + - la prima Conv produceva output sbagliato; + - tutto il resto della rete divergeva, inclusi i GEMM successivi. + +### 16) Verifica diagnostica dopo fix Conv FP +- Comando: + - `cd /workspaces/Deeploy/DeeployTest` + - `PYTHONPATH=/workspaces/Deeploy python deeployRunner_tiled_siracusa.py -t /tmp/deeploy_diag_auto2d_enc/auto_conv_enc1 --defaultMemLevel=L3 --l2=3000000` +- Esito: `PASSED` +- Risultato: + - `Errors: 0 out of 6400` + - runtime: `583115 cycles` +- Verifica aggiuntiva: + - con `--cores=1`, il runner passa sia `-DNUM_CORES=1` a CMake sia `--cores=1` a `generateNetwork.py`; + - `auto_conv_enc1` passa anche a 1 core con `Errors: 0 out of 6400`; + - runtime osservato a 1 core: `1648612 cycles`. + +### 17) Generic - Autoencoder2D finale +- Comando: + - `cd /workspaces/Deeploy/DeeployTest` + - `PYTHONPATH=/workspaces/Deeploy python deeployRunner_generic.py -t Tests/Models/Autoencoder2D` +- Esito: `PASSED` +- Risultato: + - `Errors: 0 out of 160` + +### 18) Siracusa tiled L3 - Autoencoder2D finale +- Comando: + - `cd /workspaces/Deeploy/DeeployTest` + - `PYTHONPATH=/workspaces/Deeploy python deeployRunner_tiled_siracusa.py -t Tests/Models/Autoencoder2D --defaultMemLevel=L3 --l2=3000000` +- Esito: `PASSED` +- Risultato: + - `Errors: 0 out of 160` + - runtime: `14048962 cycles` + +### 19) Siracusa tiled L3 + Neureka - Autoencoder2D finale +- Comando: + - `cd /workspaces/Deeploy/DeeployTest` + - `PYTHONPATH=/workspaces/Deeploy python deeployRunner_tiled_siracusa_w_neureka.py -t Tests/Models/Autoencoder2D --defaultMemLevel=L3 --l2=3000000` +- Esito: `PASSED` +- Risultato: + - `Errors: 0 out of 160` + - runtime: `14061183 cycles` + +## Cambiamenti al codice (solo fix efficaci) + +### Fix ConvTranspose stride parsing +- File: + - `Deeploy/Targets/Generic/Parsers.py` +- Modifica: + - nel parser comune `ConvTransposeParser`, `stride_x` e `stride_y` erano assegnati invertiti. + - Corretto in modo che: + - `stride_x = node.attrs["strides"][0]` + - `stride_y = node.attrs["strides"][1]` se presente, altrimenti uguale a `stride_x` +- Impatto: + - risolve il caso progressivo dei microblocchi dove una Conv dopo `ConvTranspose` produceva layout/risultati errati. + - `Encoder_mini` passa su Generic, Siracusa tiled L3 e Siracusa tiled L3 + Neureka. + +### Fix dimensionamento im2col per Conv FP PULP tiled +- File: + - `DeeployTest/testUtils/deeployRunner.py` + - `Deeploy/Targets/PULPOpen/Templates/FloatConvTemplate.py` +- Modifica: + - il runner propaga ora `--cores=...` anche ai generation args, non solo a CMake: + - `--cores=` oppure `--cores=` + - `PULP2DFloatConvIm2ColTemplate.computeTransientBuffersSize(...)` + - `PULP2DFloatDWConvIm2ColTemplate.computeTransientBuffersSize(...)` + - il numero di core usato per dimensionare il transient buffer viene letto direttamente da `operatorRepresentation["n_cores"]`, ora coerente con `NUM_CORES`. + - la patch conservativa usata durante il debug resta commentata nel template: + - `# n_cores = max(int(operatorRepresentation.get("n_cores", 8)), 8)` +- Impatto: + - evita overlap in L1 tra `im2col` e bias/altre tile; + - mantiene corretto anche il caso reale a 1 core senza allocare inutilmente per 8 core; + - risolve `Autoencoder2D` completo su Siracusa tiled L3 e Siracusa tiled L3 + Neureka. + +## Discussione memoria (verificata su codice generato) + +### Domanda +I dati usati spesso (es. filtri convolutivi) vengono ricaricati da L3 ogni volta o rimangono in livelli intermedi? + +### Evidenza osservata +Nel codice generato (`DeeployTest/TEST_SIRACUSA/Tests/Models/Autoencoder2D/Network.c`): +- i pesi vengono inizialmente caricati in L3 (`cl_ram_malloc` + `load_file_to_ram(...)`) +- nelle closure tiled vengono copiati da L3 a L2 (`pi_cl_ram_copy_2d(... weight_ExternalToLocal ...)`) +- nel loop interno vengono trasferiti da L2 a L1 per il compute (`mchan_transfer_1d(... weight_ref ...)`) + +### Conclusione pratica +- L3 funziona da backing store capiente. +- L2 è staging/intermedio. +- L1 è il livello vicino al compute dove i tile vengono processati. +- Quindi non è un pattern "usa e rimetti sempre in L3 ad ogni micro-step"; c'è buffering e riuso nei livelli intermedi secondo la strategia di tiling. + +## Comandi di riferimento usati +- `python deeployRunner_generic.py -t Tests/Models/Autoencoder2D` +- `python deeployRunner_siracusa.py -t Tests/Models/Autoencoder2D` +- `python deeployRunner_siracusa.py -t Tests/Models/Autoencoder2D --skipsim` +- `python deeployRunner_tiled_siracusa.py -t Tests/Models/Autoencoder2D --skipsim` +- `python deeployRunner_tiled_siracusa.py -t Tests/Models/Autoencoder2D --skipsim --defaultMemLevel=L3 --l2=3000000` +- `python deeployRunner_tiled_siracusa.py -t Tests/Models/Autoencoder2D --defaultMemLevel=L3 --l2=3000000` +- `PYTHONPATH=/workspaces/Deeploy python deeployRunner_generic.py -t Tests/Models/Autoencoder2D` +- `PYTHONPATH=/workspaces/Deeploy python deeployRunner_tiled_siracusa.py -t Tests/Models/Autoencoder2D --defaultMemLevel=L3 --l2=3000000` +- `PYTHONPATH=/workspaces/Deeploy python deeployRunner_tiled_siracusa_w_neureka.py -t Tests/Models/Autoencoder2D --defaultMemLevel=L3 --l2=3000000` +- `PYTHONPATH=/workspaces/Deeploy python deeployRunner_tiled_siracusa.py -t /tmp/deeploy_diag_auto2d_enc/auto_conv_enc1 --defaultMemLevel=L3 --l2=3000000` diff --git a/DeeployTest/Tests/Models/Autoencoder2D/inputs.npz b/DeeployTest/Tests/Models/Autoencoder2D/inputs.npz new file mode 100644 index 0000000000..fe562600e8 Binary files /dev/null and b/DeeployTest/Tests/Models/Autoencoder2D/inputs.npz differ diff --git a/DeeployTest/Tests/Models/Autoencoder2D/network.onnx b/DeeployTest/Tests/Models/Autoencoder2D/network.onnx new file mode 100644 index 0000000000..2d5cc658d2 Binary files /dev/null and b/DeeployTest/Tests/Models/Autoencoder2D/network.onnx differ diff --git a/DeeployTest/Tests/Models/Autoencoder2D/outputs.npz b/DeeployTest/Tests/Models/Autoencoder2D/outputs.npz new file mode 100644 index 0000000000..79dcb5faeb Binary files /dev/null and b/DeeployTest/Tests/Models/Autoencoder2D/outputs.npz differ diff --git a/DeeployTest/Tests/Models/Autoencoder2D_GMM/README.md b/DeeployTest/Tests/Models/Autoencoder2D_GMM/README.md new file mode 100644 index 0000000000..6b84f4897f --- /dev/null +++ b/DeeployTest/Tests/Models/Autoencoder2D_GMM/README.md @@ -0,0 +1,114 @@ +# Autoencoder2D_GMM + +Questa cartella contiene il modello `Autoencoder2D_GMM` e i file necessari per usarlo con Deeploy e convertirlo in codice C per il target `Generic`. + +## Contenuto della cartella + +- `network.onnx` + Modello ONNX finale. E' il modello che viene dato in ingresso a Deeploy per la generazione del codice C. + +- `inputs.npz` + Input di test del modello. Vengono usati per eseguire il network durante la validazione. + +- `outputs.npz` + Output attesi del modello. Vengono usati come riferimento per verificare che il codice C prodotto da Deeploy dia il risultato corretto. + +- `README.md` + Questo file. + +## Cosa rappresenta il modello + +`Autoencoder2D_GMM` e' un modello composto da due sottosistemi principali: + +- un ramo `Autoencoder2D`, che produce l'output di ricostruzione +- un ramo `GMM`, integrato nello stesso grafo, che produce un secondo output + +Il modello ha quindi due output finali: + +- `reconstruction` +- `gmm_output` + +## Dove finisce il codice C generato + +Quando esegui Deeploy sul target `Generic`, i file C generati per questo modello vengono scritti in: + +- [Network.c](/workspaces/Deeploy/DeeployTest/TEST_GENERIC/Tests/Models/Autoencoder2D_GMM/Network.c) +- [Network.h](/workspaces/Deeploy/DeeployTest/TEST_GENERIC/Tests/Models/Autoencoder2D_GMM/Network.h) +- [testinputs.h](/workspaces/Deeploy/DeeployTest/TEST_GENERIC/Tests/Models/Autoencoder2D_GMM/testinputs.h) +- [testoutputs.h](/workspaces/Deeploy/DeeployTest/TEST_GENERIC/Tests/Models/Autoencoder2D_GMM/testoutputs.h) + +Questi file hanno il seguente ruolo: + +- `Network.c` + Contiene il codice C generato da Deeploy per eseguire l'inferenza del modello. + +- `Network.h` + Espone le funzioni principali del network e i puntatori ai buffer di input e output. + +- `testinputs.h` + Contiene gli input del file `inputs.npz` convertiti in array C. + +- `testoutputs.h` + Contiene gli output del file `outputs.npz` convertiti in array C. + +## Main di esecuzione + +Il programma che esegue il network generato e confronta i risultati si trova qui: + +- [main.c](/workspaces/Deeploy/DeeployTest/Platforms/Generic/main.c) + +Questo file: + +- inizializza il network +- copia gli input nei buffer del modello +- esegue `RunNetwork` +- confronta gli output reali con quelli di riferimento + +## Librerie C del target Generic + +Le implementazioni dei kernel C usati dal network generato si trovano in: + +- `/workspaces/Deeploy/TargetLibraries/Generic/src` + +Alcuni file utili da conoscere sono: + +- [MatMul_fp32.c](/workspaces/Deeploy/TargetLibraries/Generic/src/MatMul_fp32.c) +- [Gemm_fp32.c](/workspaces/Deeploy/TargetLibraries/Generic/src/Gemm_fp32.c) +- [Convolution_fp32.c](/workspaces/Deeploy/TargetLibraries/Generic/src/Convolution_fp32.c) +- [ConvTranspose2d_fp32.c](/workspaces/Deeploy/TargetLibraries/Generic/src/ConvTranspose2d_fp32.c) +- [ReduceLogSumExp_fp32.c](/workspaces/Deeploy/TargetLibraries/Generic/src/ReduceLogSumExp_fp32.c) +- [Div_fp32.c](/workspaces/Deeploy/TargetLibraries/Generic/src/Div_fp32.c) +- [Div_s32.c](/workspaces/Deeploy/TargetLibraries/Generic/src/Div_s32.c) + +In pratica: + +- `Network.c` descrive il tuo modello convertito +- le librerie in `TargetLibraries/Generic/src` implementano le operazioni numeriche chiamate da `Network.c` + +## Flusso di lavoro + +Il flusso standard e': + +1. prepari `network.onnx`, `inputs.npz` e `outputs.npz` +2. Deeploy genera il codice C del modello +3. il target `Generic` compila il codice generato insieme alle librerie C di supporto +4. `main.c` esegue il modello e verifica i risultati + +## Comando per fare tutto + +Per generare, compilare ed eseguire il modello sul target `Generic`: + +```bash +cd /workspaces/Deeploy/DeeployTest +python deeployRunner_generic.py -t Tests/Models/Autoencoder2D_GMM -v +``` + +## Nota importante + +Se modifichi uno di questi file: + +- `network.onnx` +- `inputs.npz` +- `outputs.npz` + +devi rigenerare il codice Deeploy prima di ricompilare, altrimenti i file C generati potrebbero non essere piu' allineati con il contenuto del modello o dei dati di test. diff --git a/DeeployTest/Tests/Models/Autoencoder2D_GMM/TEST_REPORT.md b/DeeployTest/Tests/Models/Autoencoder2D_GMM/TEST_REPORT.md new file mode 100644 index 0000000000..af8b5501d9 --- /dev/null +++ b/DeeployTest/Tests/Models/Autoencoder2D_GMM/TEST_REPORT.md @@ -0,0 +1,760 @@ +# Test Report - Autoencoder2D_GMM + +Questo file raccoglie i risultati dei test eseguiti durante il debug di `Autoencoder2D_GMM`, +le cause individuate e le modifiche applicate. + +Data sessione: 2026-05-06 + +## Obiettivo + +Portare `Autoencoder2D_GMM` sui target PULP/Siracusa, procedendo in ordine: + +1. Siracusa non-tiled +2. Siracusa tiled con memoria standard L3 +3. Siracusa tiled con Neureka + +Il modello contiene due rami: + +- `Autoencoder2D`, con output `reconstruction` +- testa `GMM`, con output `gmm_output` + +Il ramo GMM introduce operatori che erano supportati nel target `Generic`, ma non ancora +nel mapping PULP/Siracusa. + +## Stato iniziale + +Il modello `Autoencoder2D_GMM` funzionava sul target `Generic`, ma non su Siracusa. + +Comando di riferimento Generic: + +```bash +cd /workspaces/Deeploy/DeeployTest +PYTHONPATH=/workspaces/Deeploy python deeployRunner_generic.py -t Tests/Models/Autoencoder2D_GMM +``` + +## 1) Siracusa non-tiled - primo tentativo + +Comando: + +```bash +cd /workspaces/Deeploy/DeeployTest +PYTHONPATH=/workspaces/Deeploy python deeployRunner_siracusa.py -t Tests/Models/Autoencoder2D_GMM +``` + +Esito: `FAILED` + +Errore principale: + +```text +RuntimeError: No mapping found for node gmm_modelgmmReduceLogSumExp with op type ReduceLogSumExp +``` + +### Causa + +`ReduceLogSumExp` era implementato nel target `Generic`, ma non era registrato nella +piattaforma PULP/Siracusa. + +Implementazioni Generic gia' presenti: + +- `Deeploy/Targets/Generic/Parsers.py` + - `ReduceLogSumExpParser` +- `Deeploy/Targets/Generic/Bindings.py` + - `BasicReduceLogSumExpBindings` +- `Deeploy/Targets/Generic/Templates/FloatReduceLogSumExpTemplate.py` +- `TargetLibraries/Generic/src/ReduceLogSumExp_fp32.c` +- `TargetLibraries/Generic/inc/kernel/ReduceLogSumExp.h` + +### Modifica applicata + +File modificato: + +- `Deeploy/Targets/PULPOpen/Platform.py` + +Sono stati importati e registrati: + +- `ReduceLogSumExpLayer` +- `ReduceLogSumExpParser` +- mapping `ReduceLogSumExp` + +Modifica concettuale: + +```python +ReduceLogSumExpMapper = NodeMapper(ReduceLogSumExpParser(), PULPReduceLogSumExpTilingReadyBindings) + +PULPMapping = { + ... + 'ReduceLogSumExp': ReduceLogSumExpLayer([ReduceLogSumExpMapper]), + ... +} +``` + +### Perche' + +Senza questa entry il parser PULP non aveva nessun mapper disponibile per il nodo ONNX +`ReduceLogSumExp`, quindi il grafo veniva rifiutato prima della generazione del codice. + +## 2) Siracusa non-tiled - secondo blocco + +Dopo l'aggiunta di `ReduceLogSumExp`, il parsing e' avanzato fino al nodo `Concat`. + +Esito: `FAILED` + +Errore principale: + +```text +PARSING FAILED - Backtracking exhausted at root! +Deepest successful exploration: Layer 13 'gmm_modelConcat' +Deepest layer available mappers: ['ConcatParser'] +RuntimeError: Did not find adequate mapping for graph! +``` + +Nodo ONNX coinvolto: + +```text +Concat /gmm_model/Concat ['/flatten/Flatten_output_0', 'onnx::Concat_154'] +``` + +### Causa + +Il binding `Concat` PULP supportava solo tipi interi: + +```python +PULPConcatBindings = [ + NodeBinding(... for type in IntegerDataTypes) +] +``` + +Il target `Generic`, invece, aveva gia' anche il caso `float32_t`. + +### Modifica applicata + +File modificato: + +- `Deeploy/Targets/PULPOpen/Bindings.py` + +Aggiunto binding `Concat` FP32: + +```python +PULPConcatBindings = [ + NodeBinding(ConcatChecker([PointerClass(type), PointerClass(type)], [PointerClass(type)]), + ConcatTemplate.referenceTemplate, ClusterTransformer) for type in IntegerDataTypes +] + [ + NodeBinding(ConcatChecker([PointerClass(float32_t), PointerClass(float32_t)], [PointerClass(float32_t)]), + ConcatTemplate.referenceTemplate, ClusterTransformer) +] +``` + +### Perche' + +La testa GMM concatena tensori `float32_t`. Il template `ConcatTemplate` era gia' riutilizzabile, +ma mancava il type binding PULP per FP32. + +## 3) Siracusa non-tiled - limite memoria L2 + +Dopo i fix precedenti, il modello non-tiled arriva a generare e compilare `Network.c`, +ma fallisce al link. + +Esito: `FAILED` + +Errore principale: + +```text +ld.lld: error: section '.l2_data' will not fit in region 'L2': overflowed by 89932 bytes +``` + +### Causa + +Il modello completo `Autoencoder2D_GMM` non entra nella L2 standard in configurazione +Siracusa non-tiled. + +Il runner non-tiled `deeployRunner_siracusa.py` non espone `--l2`, quindi non e' possibile +fare lo stesso override memoria usato nel flusso tiled. + +### Conclusione + +Siracusa non-tiled non e' bloccato da layer mancanti dopo le patch, ma da memoria L2 +insufficiente. Il target rilevante per il modello completo resta il flusso tiled con L3. + +## 4) Siracusa tiled L3 - primo tentativo + +Comando: + +```bash +cd /workspaces/Deeploy/DeeployTest +PYTHONPATH=/workspaces/Deeploy python deeployRunner_tiled_siracusa.py -t Tests/Models/Autoencoder2D_GMM --defaultMemLevel=L3 --l2=3000000 +``` + +Esito: `FAILED` + +Errore principale: + +```text +AttributeError: 'NodeTemplate' object has no attribute 'tileConstraint' +``` + +### Causa + +Il primo mapping di `ReduceLogSumExp` riusava il binding Generic puro. Quel binding usa +un `NodeTemplate` senza `tileConstraint`, quindi non puo' essere attraversato dal tiler PULP. + +### Modifica applicata + +File modificati: + +- `Deeploy/Targets/PULPOpen/Bindings.py` +- `Deeploy/Targets/PULPOpen/Tiler.py` +- `Deeploy/Targets/PULPOpen/Platform.py` + +In `Bindings.py` e' stato aggiunto un binding PULP vero per `ReduceLogSumExp`: + +```python +PULPReduceLogSumExpBindings = [ + NodeBinding(ReduceLogSumExpChecker([PointerClass(float32_t)], [PointerClass(float32_t)]), + FloatReduceLogSumExpTemplate.referenceTemplate, ForkTransformer) +] +``` + +In `Tiler.py` e' stato aggiunto il binding tiling-ready: + +```python +_PULPReduceLogSumExpBindings = copy.deepcopy(PULPReduceLogSumExpBindings) + +PULPReduceLogSumExpTilingReadyBindings = TilingReadyNodeBindings( + nodeBindings = _PULPReduceLogSumExpBindings, + tileConstraint = UntiledTileConstraint() +) +``` + +In `Platform.py`, il mapper PULP e' stato collegato a `PULPReduceLogSumExpTilingReadyBindings`. + +### Perche' + +`ReduceLogSumExp` e' una riduzione. Per questo step non e' stata inventata una nuova +tilizzazione matematica del kernel. La scelta conservativa e' stata renderlo compatibile +con il flusso tiled tramite `UntiledTileConstraint`, come gia' fatto per altri operatori +che devono restare atomici ma vivere in una rete tiled. + +Il passaggio da binding Generic a binding PULP era necessario anche per usare il transformer +memory-aware corretto. Con il binding Generic l'output del nodo non veniva allocato +correttamente nel flusso PULP, causando poi un accesso invalido nel kernel C. + +### Nota sulla scelta `UntiledTileConstraint` + +Il kernel numerico usato per `ReduceLogSumExp` e' ancora quello Generic: + +- `TargetLibraries/Generic/src/ReduceLogSumExp_fp32.c` +- `Deeploy/Targets/Generic/Templates/FloatReduceLogSumExpTemplate.py` + +La patch non introduce quindi una implementazione PULP ottimizzata della riduzione. +Introduce invece un'integrazione PULP/Siracusa memory-aware, in modo che il nodo possa +essere inserito correttamente in una rete tiled L3. + +La forma numerica stabile di `ReduceLogSumExp` e': + +```text +m = max(x_i) +out = log(sum(exp(x_i - m))) + m +``` + +Questa operazione ha una dipendenza globale lungo l'asse ridotto. Se l'asse della riduzione +venisse tagliato ingenuamente in tile indipendenti, ogni tile vedrebbe solo una parte dei +valori, calcolerebbe un massimo locale e una somma locale, e il risultato finale sarebbe +numericamente sbagliato. + +Un tiling corretto richiederebbe un kernel/constraint dedicato multi-pass: + +1. calcolo dei massimi locali per ogni tile; +2. riduzione dei massimi locali in un massimo globale; +3. calcolo delle somme parziali `sum(exp(x_i - max_globale))`; +4. riduzione delle somme parziali; +5. calcolo finale `log(sum_globale) + max_globale`. + +Per questo debug e' stato scelto `UntiledTileConstraint`: il nodo resta atomico, ma viene +gestito correttamente dal flusso tiled e puo' convivere con gli altri layer in L3. + +### Ha senso un kernel C PULP dedicato? + +Si', ma non era necessario per sbloccare questo modello. + +Ha senso implementarlo se: + +- `ReduceLogSumExp` diventa un collo di bottiglia di runtime; +- l'input della riduzione diventa troppo grande per essere tenuto atomico nel livello di + memoria scelto; +- serve sfruttare parallelismo sui core PULP per riduzioni lunghe; +- si vuole supportare davvero il tiling lungo l'asse ridotto. + +Nel caso corrente `Autoencoder2D_GMM` usa `ReduceLogSumExp` su una testa GMM piccola +rispetto al resto del modello. La soluzione conservativa e' quindi preferibile: meno codice +nuovo, minore rischio numerico, e test finali corretti su Siracusa tiled e Siracusa + Neureka. + +## 5) Siracusa tiled L3 - reference duplicata nel self-Mul + +Dopo il fix di `ReduceLogSumExp`, il tiler proseguiva ma falliva durante la codegen. + +Esito: `FAILED` + +Errore principale: + +```text +KeyError: 'Buffername TILING_CODEGEN_L1_gmm_modelgmmMul_gmm_modelAdd_output_0_tensor_ref was already in the local context!' +``` + +Nodo ONNX coinvolto: + +```text +Mul /gmm_model/gmm/Mul ['/gmm_model/Add_output_0', '/gmm_model/Add_output_0'] +``` + +### Causa + +Il nodo e' un self-Mul: stesso tensore usato come entrambi gli ingressi, cioe': + +```text +Mul(x, x) +``` + +La codegen tiled crea reference locali basate sul nome del buffer esterno. Con due ingressi +uguali, tentava di creare due volte la stessa reference nello stesso `NetworkContext`. + +### Modifica applicata + +File modificato: + +- `Deeploy/TilingExtension/CodeTransformationPasses/TilingHoistingMixIn.py` + +La funzione `_hoistReference(...)` ora riusa una reference locale gia' esistente se: + +- il nome coincide +- il buffer referenziato coincide +- shape e offset coincidono +- il tipo coincide + +Modifica concettuale: + +```python +refName = self.prefix + name +if ctxt.is_local(refName): + ref = ctxt.lookup(refName) + assert isinstance(ref, _ReferenceBuffer) + assert ref._referenceName == reference.name + assert tuple(ref.shape) == tuple(shape) + expectedOffset = offset.name if isinstance(offset, VariableBuffer) else offset + assert ref._offset == expectedOffset + ... + return ref + +ref = ctxt.hoistReference(refName, reference, shape, offset, override_type) +``` + +### Perche' + +Due ingressi dello stesso nodo possono puntare allo stesso tensore ONNX. In quel caso +duplicare la reference non serve ed e' sbagliato: va riusata la stessa reference locale. + +## 6) Siracusa tiled L3 - errore numerico su `gmm_output` + +Dopo la correzione della reference duplicata, il modello generava, compilava ed eseguiva, +ma falliva sul confronto finale. + +Esito: `FAILED` + +Risultato: + +```text +Expected: 113.107033 Actual: -39.171692 Diff: 152.278717 at Index 0 in Output 1 +Errors: 1 out of 241 +``` + +Osservazione: + +- `reconstruction` era corretta +- solo `gmm_output` era errato + +### Causa + +Il template PULP FP32 di `Mul` assumeva sempre che il secondo input fosse uno scalare: + +```c +float32_t scalar = B[0]; +C[i] = A[i] * scalar; +``` + +Questo e' corretto per `Mul(vettore, scalare)`, ma non per `Mul(vettore, vettore)`. + +Nel grafo GMM il nodo: + +```text +Mul('/gmm_model/Add_output_0', '/gmm_model/Add_output_0') +``` + +deve calcolare: + +```text +x * x +``` + +Il template PULP calcolava invece: + +```text +x * x[0] +``` + +Questo corrompeva la testa GMM prima di `MatMul`, `Add` e `ReduceLogSumExp`. + +Il template Generic era gia' corretto e distingueva: + +```python +B[0] se sizeB == 1 +B[i] altrimenti +``` + +### Modifica applicata + +File modificato: + +- `Deeploy/Targets/PULPOpen/Templates/FloatMulTemplate.py` + +Il template PULP ora mantiene l'ottimizzazione scalare solo quando `sizeB == 1`, +altrimenti usa moltiplicazione elemento-per-elemento. + +Modifica concettuale: + +```mako +% if sizeB == 1: + float32_t ${nodeName}_scalar = ${B}[0]; +% endif + +... + +% if sizeB == 1: + ${C}[i] = ${A}[i] * ${nodeName}_scalar; +% else: + ${C}[i] = ${A}[i] * ${B}[i]; +% endif +``` + +### Perche' + +`Mul` ONNX supporta il caso vettore-vettore. La testa GMM lo usa esplicitamente per +calcolare un termine quadratico. Il target PULP era piu' restrittivo del target Generic +e produceva codice numericamente sbagliato. + +## 7) Siracusa tiled L3 - risultato finale + +Comando: + +```bash +cd /workspaces/Deeploy/DeeployTest +PYTHONPATH=/workspaces/Deeploy python deeployRunner_tiled_siracusa.py -t Tests/Models/Autoencoder2D_GMM --defaultMemLevel=L3 --l2=3000000 +``` + +Esito: `PASSED` + +Risultato: + +```text +Errors: 0 out of 241 +Runtime: 15934289 cycles +``` + +## 8) Siracusa tiled L3 + Neureka - risultato finale + +Comando: + +```bash +cd /workspaces/Deeploy/DeeployTest +PYTHONPATH=/workspaces/Deeploy python deeployRunner_tiled_siracusa_w_neureka.py -t Tests/Models/Autoencoder2D_GMM --defaultMemLevel=L3 --l2=3000000 +``` + +Esito: `PASSED` + +Risultato: + +```text +Errors: 0 out of 241 +Runtime: 15868032 cycles +``` + +## Modifiche esatte applicate + +### `Deeploy/Targets/PULPOpen/Platform.py` + +- aggiunto import di `ReduceLogSumExpLayer` +- aggiunto import di `ReduceLogSumExpParser` +- aggiunto import di `PULPReduceLogSumExpTilingReadyBindings` +- aggiunto: + +```python +ReduceLogSumExpMapper = NodeMapper(ReduceLogSumExpParser(), PULPReduceLogSumExpTilingReadyBindings) +``` + +- aggiunta entry: + +```python +'ReduceLogSumExp': ReduceLogSumExpLayer([ReduceLogSumExpMapper]) +``` + +Motivo: + +- rendere visibile `ReduceLogSumExp` alla piattaforma PULP/Siracusa. + +### `Deeploy/Targets/PULPOpen/Bindings.py` + +- aggiunto import di `FloatReduceLogSumExpTemplate` +- aggiunto import di `ReduceLogSumExpChecker` +- aggiunto `PULPReduceLogSumExpBindings` +- aggiunto binding `Concat` FP32 a `PULPConcatBindings` + +Motivo: + +- usare `ReduceLogSumExp` nel flusso PULP con transformer memory-aware +- supportare `Concat` FP32 nella testa GMM + +### `Deeploy/Targets/PULPOpen/Tiler.py` + +- aggiunto import di `PULPReduceLogSumExpBindings` +- aggiunto: + +```python +PULPReduceLogSumExpTilingReadyBindings = TilingReadyNodeBindings( + nodeBindings = _PULPReduceLogSumExpBindings, + tileConstraint = UntiledTileConstraint() +) +``` + +Motivo: + +- permettere al tiler di attraversare `ReduceLogSumExp` senza spezzarlo in tile non supportati. + +### `Deeploy/TilingExtension/CodeTransformationPasses/TilingHoistingMixIn.py` + +- `_hoistReference(...)` ora riusa una reference locale gia' esistente quando e' equivalente. + +Motivo: + +- gestire nodi con lo stesso tensore usato piu' volte in input, come `Mul(x, x)`. + +### `Deeploy/Targets/PULPOpen/Templates/FloatMulTemplate.py` + +- corretto il template FP32 PULP: + - `sizeB == 1`: usa `B[0]` come scalare + - `sizeB != 1`: usa `B[i]` + +Motivo: + +- supportare correttamente `Mul` elemento-per-elemento oltre al caso scalare. + +## Verifiche finali + +Verifica sintattica Python: + +```bash +python -m py_compile \ + Deeploy/Targets/PULPOpen/Bindings.py \ + Deeploy/Targets/PULPOpen/Platform.py \ + Deeploy/Targets/PULPOpen/Tiler.py \ + Deeploy/Targets/PULPOpen/Templates/FloatMulTemplate.py \ + Deeploy/TilingExtension/CodeTransformationPasses/TilingHoistingMixIn.py +``` + +Esito: `PASSED` + +Test finali: + +- Siracusa tiled L3: `PASSED`, `Errors: 0 out of 241` +- Siracusa tiled L3 + Neureka: `PASSED`, `Errors: 0 out of 241` + +## Nota finale + +Il target Siracusa non-tiled non e' stato portato a `PASSED` per limite di memoria L2, +non per mancanza di layer dopo le patch. Il percorso funzionante per `Autoencoder2D_GMM` +e' quello tiled con `--defaultMemLevel=L3 --l2=3000000`. + +## Aggiornamento modello e tolleranza FP (2026-05-06) + +Dopo un aggiornamento di: + +- `Tests/Models/Autoencoder2D_GMM/network.onnx` +- `Tests/Models/Autoencoder2D_GMM/inputs.npz` +- `Tests/Models/Autoencoder2D_GMM/outputs.npz` + +il numero di output verificati e' passato da `241` a `161`. + +### Verifica Generic + +Comando: + +```bash +cd /workspaces/Deeploy/DeeployTest +PYTHONPATH=/workspaces/Deeploy python deeployRunner_generic.py -t Tests/Models/Autoencoder2D_GMM +``` + +Esito: `PASSED` + +Risultato: + +```text +Errors: 0 out of 161 +``` + +### Regressione apparente su Siracusa tiled + +Comando: + +```bash +cd /workspaces/Deeploy/DeeployTest +PYTHONPATH=/workspaces/Deeploy python deeployRunner_tiled_siracusa.py -t Tests/Models/Autoencoder2D_GMM --defaultMemLevel=L3 --l2=3000000 +``` + +Esito iniziale: `FAILED` + +Errore osservato: + +```text +Expected: 619.625488 Actual: 619.625610 Diff: -0.000122 at Index 0 in Output 1 +Errors: 1 out of 161 +``` + +### Causa + +Il modello e i dati erano coerenti, infatti il target `Generic` passava. + +L'errore Siracusa era dovuto al criterio di confronto FP nel test runtime: + +```c +if ((diff < -1e-4) || (diff > 1e-4) || isnan(diff)) +``` + +Questa e' una tolleranza solo assoluta. Con il nuovo modello `gmm_output` vale circa `620`, +quindi una differenza assoluta di `1.22e-4` corrisponde a un errore relativo di circa +`2e-7`. Questo e' compatibile con differenze normali tra implementazioni FP di `expf/logf` +e con l'esecuzione su target PULP. + +### Modifica applicata + +File modificati: + +- `DeeployTest/Platforms/Siracusa/src/deeploytest.c` +- `DeeployTest/Platforms/PULPOpen/src/deeploytest.c` + +La comparazione float ora usa tolleranza assoluta piu' relativa: + +```c +#define FLOAT_ABS_TOL 1e-4f +#define FLOAT_REL_TOL 1e-5f + +float abs_diff = fabsf(diff); +float scale = fabsf(expected_val); +float abs_actual = fabsf(actual_val); +if (abs_actual > scale) { + scale = abs_actual; +} +float tolerance = FLOAT_ABS_TOL + FLOAT_REL_TOL * scale; + +if ((abs_diff > tolerance) || isnan(diff)) { + ... +} +``` + +### Perche' + +La tolleranza assoluta resta utile per valori piccoli. La tolleranza relativa evita invece +di segnare come errore uno scarto numerico molto piccolo rispetto alla scala del valore. +Questo e' particolarmente rilevante per la testa GMM, dove `ReduceLogSumExp` usa `expf` +e `logf`. + +### Verifiche dopo la modifica + +Siracusa tiled L3: + +```text +Errors: 0 out of 161 +Runtime: 15914263 cycles +``` + +Siracusa tiled L3 + Neureka: + +```text +Errors: 0 out of 161 +Runtime: 15945410 cycles +``` + +## Pulizia warning MatMul tiled (2026-05-06) + +Durante la compilazione Siracusa tiled del modello aggiornato comparivano warning del tipo: + +```text +warning: implicit conversion changes signedness: 'int8_t' to 'unsigned int' +``` + +Il warning era generato nel `Network.c` tilizzato per il nodo `gmm_modelMatMul`. +La dimensione tilizzata `O` veniva materializzata come `int8_t *O_ref` e poi usata in: + +- pointer arithmetic; +- chiamata a `PULP_MatMul_fp32_fp32_fp32_unroll1x7`, che si aspetta dimensioni unsigned. + +### Causa + +In `Deeploy/Targets/PULPOpen/TileConstraints/MatMulTileConstraint.py` i replacement +tilizzati di `MatMul` erano tipizzati come `int8_t`: + +```python +replacementTypes = { + "M": PointerClass(int8_t), + "N": PointerClass(int8_t), + "O": PointerClass(int8_t), + "batch": PointerClass(int8_t) +} +``` + +Queste grandezze sono dimensioni di tile, quindi non possono essere negative. +Il tipo signed non era semanticamente necessario. + +### Modifica applicata + +File modificato: + +- `Deeploy/Targets/PULPOpen/TileConstraints/MatMulTileConstraint.py` + +Il tipo dei replacement e' stato allineato a `GEMMTileConstraint`, usando `uint16_t`: + +```python +replacementTypes = { + "M": PointerClass(uint16_t), + "N": PointerClass(uint16_t), + "O": PointerClass(uint16_t), + "batch": PointerClass(uint16_t) +} +``` + +### Perche' + +`M`, `N`, `O` e `batch` sono dimensioni non negative. Usare `uint16_t` elimina i warning +signed-to-unsigned e rende il codice generato piu' coerente con le API dei kernel PULP. + +### Verifica + +Comando: + +```bash +cd /workspaces/Deeploy/DeeployTest +PYTHONPATH=/workspaces/Deeploy python deeployRunner_tiled_siracusa.py -t Tests/Models/Autoencoder2D_GMM --defaultMemLevel=L3 --l2=3000000 +``` + +Esito: + +```text +Errors: 0 out of 161 +Runtime: 15946335 cycles +``` + +I warning `sign-conversion` nel `Network.c` generato sono spariti. + +Restano warning non legati al codice Deeploy generato: + +- `clang-15: warning: argument unused during compilation: '-nostartfiles'` +- `llvm-objdump: warning: failed to find source ... newlib ...` + +Questi provengono dal toolchain/debug info e non indicano una regressione numerica o di +generazione del modello. diff --git a/DeeployTest/Tests/Models/Autoencoder2D_GMM/inputs.npz b/DeeployTest/Tests/Models/Autoencoder2D_GMM/inputs.npz new file mode 100644 index 0000000000..40e2d5e9f2 Binary files /dev/null and b/DeeployTest/Tests/Models/Autoencoder2D_GMM/inputs.npz differ diff --git a/DeeployTest/Tests/Models/Autoencoder2D_GMM/network.onnx b/DeeployTest/Tests/Models/Autoencoder2D_GMM/network.onnx new file mode 100644 index 0000000000..12800274dd Binary files /dev/null and b/DeeployTest/Tests/Models/Autoencoder2D_GMM/network.onnx differ diff --git a/DeeployTest/Tests/Models/Autoencoder2D_GMM/outputs.npz b/DeeployTest/Tests/Models/Autoencoder2D_GMM/outputs.npz new file mode 100644 index 0000000000..a774bae2e7 Binary files /dev/null and b/DeeployTest/Tests/Models/Autoencoder2D_GMM/outputs.npz differ diff --git a/DeeployTest/Tests/Models/Autoencoder2D_MicroBlocks/EncoderPCMR/inputs.npz b/DeeployTest/Tests/Models/Autoencoder2D_MicroBlocks/EncoderPCMR/inputs.npz new file mode 100644 index 0000000000..feccff239c Binary files /dev/null and b/DeeployTest/Tests/Models/Autoencoder2D_MicroBlocks/EncoderPCMR/inputs.npz differ diff --git a/DeeployTest/Tests/Models/Autoencoder2D_MicroBlocks/EncoderPCMR/network.onnx b/DeeployTest/Tests/Models/Autoencoder2D_MicroBlocks/EncoderPCMR/network.onnx new file mode 100644 index 0000000000..1841e24570 Binary files /dev/null and b/DeeployTest/Tests/Models/Autoencoder2D_MicroBlocks/EncoderPCMR/network.onnx differ diff --git a/DeeployTest/Tests/Models/Autoencoder2D_MicroBlocks/EncoderPCMR/outputs.npz b/DeeployTest/Tests/Models/Autoencoder2D_MicroBlocks/EncoderPCMR/outputs.npz new file mode 100644 index 0000000000..281bb9c67f Binary files /dev/null and b/DeeployTest/Tests/Models/Autoencoder2D_MicroBlocks/EncoderPCMR/outputs.npz differ diff --git a/DeeployTest/Tests/Models/Autoencoder2D_MicroBlocks/EncoderPCMRLRes/inputs.npz b/DeeployTest/Tests/Models/Autoencoder2D_MicroBlocks/EncoderPCMRLRes/inputs.npz new file mode 100644 index 0000000000..bf9d7a9807 Binary files /dev/null and b/DeeployTest/Tests/Models/Autoencoder2D_MicroBlocks/EncoderPCMRLRes/inputs.npz differ diff --git a/DeeployTest/Tests/Models/Autoencoder2D_MicroBlocks/EncoderPCMRLRes/network.onnx b/DeeployTest/Tests/Models/Autoencoder2D_MicroBlocks/EncoderPCMRLRes/network.onnx new file mode 100644 index 0000000000..bc5a018acc Binary files /dev/null and b/DeeployTest/Tests/Models/Autoencoder2D_MicroBlocks/EncoderPCMRLRes/network.onnx differ diff --git a/DeeployTest/Tests/Models/Autoencoder2D_MicroBlocks/EncoderPCMRLRes/outputs.npz b/DeeployTest/Tests/Models/Autoencoder2D_MicroBlocks/EncoderPCMRLRes/outputs.npz new file mode 100644 index 0000000000..34a6da1215 Binary files /dev/null and b/DeeployTest/Tests/Models/Autoencoder2D_MicroBlocks/EncoderPCMRLRes/outputs.npz differ diff --git a/DeeployTest/Tests/Models/Autoencoder2D_MicroBlocks/EncoderPCMRLTra/inputs.npz b/DeeployTest/Tests/Models/Autoencoder2D_MicroBlocks/EncoderPCMRLTra/inputs.npz new file mode 100644 index 0000000000..fc5c4f1431 Binary files /dev/null and b/DeeployTest/Tests/Models/Autoencoder2D_MicroBlocks/EncoderPCMRLTra/inputs.npz differ diff --git a/DeeployTest/Tests/Models/Autoencoder2D_MicroBlocks/EncoderPCMRLTra/network.onnx b/DeeployTest/Tests/Models/Autoencoder2D_MicroBlocks/EncoderPCMRLTra/network.onnx new file mode 100644 index 0000000000..44db2d394f Binary files /dev/null and b/DeeployTest/Tests/Models/Autoencoder2D_MicroBlocks/EncoderPCMRLTra/network.onnx differ diff --git a/DeeployTest/Tests/Models/Autoencoder2D_MicroBlocks/EncoderPCMRLTra/outputs.npz b/DeeployTest/Tests/Models/Autoencoder2D_MicroBlocks/EncoderPCMRLTra/outputs.npz new file mode 100644 index 0000000000..5e144188df Binary files /dev/null and b/DeeployTest/Tests/Models/Autoencoder2D_MicroBlocks/EncoderPCMRLTra/outputs.npz differ diff --git a/DeeployTest/Tests/Models/Autoencoder2D_MicroBlocks/EncoderPCMRLin/inputs.npz b/DeeployTest/Tests/Models/Autoencoder2D_MicroBlocks/EncoderPCMRLin/inputs.npz new file mode 100644 index 0000000000..e35f7271e5 Binary files /dev/null and b/DeeployTest/Tests/Models/Autoencoder2D_MicroBlocks/EncoderPCMRLin/inputs.npz differ diff --git a/DeeployTest/Tests/Models/Autoencoder2D_MicroBlocks/EncoderPCMRLin/network.onnx b/DeeployTest/Tests/Models/Autoencoder2D_MicroBlocks/EncoderPCMRLin/network.onnx new file mode 100644 index 0000000000..edc86b2c6a Binary files /dev/null and b/DeeployTest/Tests/Models/Autoencoder2D_MicroBlocks/EncoderPCMRLin/network.onnx differ diff --git a/DeeployTest/Tests/Models/Autoencoder2D_MicroBlocks/EncoderPCMRLin/outputs.npz b/DeeployTest/Tests/Models/Autoencoder2D_MicroBlocks/EncoderPCMRLin/outputs.npz new file mode 100644 index 0000000000..00b8a73177 Binary files /dev/null and b/DeeployTest/Tests/Models/Autoencoder2D_MicroBlocks/EncoderPCMRLin/outputs.npz differ diff --git a/DeeployTest/Tests/Models/Autoencoder2D_MicroBlocks/Encoder_mini/inputs.npz b/DeeployTest/Tests/Models/Autoencoder2D_MicroBlocks/Encoder_mini/inputs.npz new file mode 100644 index 0000000000..2eaecc5698 Binary files /dev/null and b/DeeployTest/Tests/Models/Autoencoder2D_MicroBlocks/Encoder_mini/inputs.npz differ diff --git a/DeeployTest/Tests/Models/Autoencoder2D_MicroBlocks/Encoder_mini/network.onnx b/DeeployTest/Tests/Models/Autoencoder2D_MicroBlocks/Encoder_mini/network.onnx new file mode 100644 index 0000000000..5cef598d23 Binary files /dev/null and b/DeeployTest/Tests/Models/Autoencoder2D_MicroBlocks/Encoder_mini/network.onnx differ diff --git a/DeeployTest/Tests/Models/Autoencoder2D_MicroBlocks/Encoder_mini/outputs.npz b/DeeployTest/Tests/Models/Autoencoder2D_MicroBlocks/Encoder_mini/outputs.npz new file mode 100644 index 0000000000..74e3b6eccb Binary files /dev/null and b/DeeployTest/Tests/Models/Autoencoder2D_MicroBlocks/Encoder_mini/outputs.npz differ diff --git a/DeeployTest/Tests/Models/Autoencoder2D_MicroBlocks/EnoderCMR/inputs.npz b/DeeployTest/Tests/Models/Autoencoder2D_MicroBlocks/EnoderCMR/inputs.npz new file mode 100644 index 0000000000..54a2d380c8 Binary files /dev/null and b/DeeployTest/Tests/Models/Autoencoder2D_MicroBlocks/EnoderCMR/inputs.npz differ diff --git a/DeeployTest/Tests/Models/Autoencoder2D_MicroBlocks/EnoderCMR/network.onnx b/DeeployTest/Tests/Models/Autoencoder2D_MicroBlocks/EnoderCMR/network.onnx new file mode 100644 index 0000000000..3b485d2c21 Binary files /dev/null and b/DeeployTest/Tests/Models/Autoencoder2D_MicroBlocks/EnoderCMR/network.onnx differ diff --git a/DeeployTest/Tests/Models/Autoencoder2D_MicroBlocks/EnoderCMR/outputs.npz b/DeeployTest/Tests/Models/Autoencoder2D_MicroBlocks/EnoderCMR/outputs.npz new file mode 100644 index 0000000000..77bcc55404 Binary files /dev/null and b/DeeployTest/Tests/Models/Autoencoder2D_MicroBlocks/EnoderCMR/outputs.npz differ diff --git a/DeeployTest/Tests/Models/GMM/inputs.npz b/DeeployTest/Tests/Models/GMM/inputs.npz new file mode 100644 index 0000000000..170ef9d86e Binary files /dev/null and b/DeeployTest/Tests/Models/GMM/inputs.npz differ diff --git a/DeeployTest/Tests/Models/GMM/network.onnx b/DeeployTest/Tests/Models/GMM/network.onnx new file mode 100644 index 0000000000..83b97d257b Binary files /dev/null and b/DeeployTest/Tests/Models/GMM/network.onnx differ diff --git a/DeeployTest/Tests/Models/GMM/outputs.npz b/DeeployTest/Tests/Models/GMM/outputs.npz new file mode 100644 index 0000000000..8793bae03e Binary files /dev/null and b/DeeployTest/Tests/Models/GMM/outputs.npz differ diff --git a/DeeployTest/testUtils/deeployRunner.py b/DeeployTest/testUtils/deeployRunner.py index 71b056e9df..b7d2488608 100644 --- a/DeeployTest/testUtils/deeployRunner.py +++ b/DeeployTest/testUtils/deeployRunner.py @@ -220,6 +220,10 @@ def create_config_from_args(args: argparse.Namespace, gen_args_list.extend(args.input_offset_map) if tiling: + if hasattr(args, 'cores'): + gen_args_list.append(f"--cores={args.cores}") + elif hasattr(args, 'num_cores'): + gen_args_list.append(f"--cores={args.num_cores}") if hasattr(args, 'defaultMemLevel') and args.defaultMemLevel: gen_args_list.append(f"--defaultMemLevel={args.defaultMemLevel}") if hasattr(args, 'doublebuffer') and args.doublebuffer: diff --git a/DeeployTest/test_siracusa_config.py b/DeeployTest/test_siracusa_config.py index 8fa105d9f4..ca3a78aa16 100644 --- a/DeeployTest/test_siracusa_config.py +++ b/DeeployTest/test_siracusa_config.py @@ -19,6 +19,7 @@ "Kernels/FP32/Conv/Regular_2D_Bias", "Kernels/FP32/Conv/Regular_2D_NoBias", "Kernels/FP32/Conv/Regular_2D_ZeroValuedBias", + "Kernels/FP32/ConvTranspose2D", "Kernels/FP32/GEMM/Regular", "Kernels/FP32/MatMul", "Kernels/FP32/MaxPool/Regular_2D", diff --git a/DeeployTest/test_siracusa_tiled_config.py b/DeeployTest/test_siracusa_tiled_config.py index a687d9a489..f85cd888bc 100644 --- a/DeeployTest/test_siracusa_tiled_config.py +++ b/DeeployTest/test_siracusa_tiled_config.py @@ -19,6 +19,7 @@ "Kernels/FP32/Conv/Regular_2D_Bias": [6600], "Kernels/FP32/Conv/Regular_2D_NoBias": [1600], "Kernels/FP32/Conv/Regular_2D_ZeroValuedBias": [6600], + "Kernels/FP32/ConvTranspose2D": [2000], "Kernels/FP32/GEMM/Regular": [8000], "Kernels/FP32/MatMul": [2000], "Kernels/FP32/MaxPool/Regular_2D": [2000], @@ -66,6 +67,7 @@ "Kernels/FP32/Conv/Regular_2D_Bias": [8800], "Kernels/FP32/Conv/Regular_2D_NoBias": [2000], "Kernels/FP32/Conv/Regular_2D_ZeroValuedBias": [8800], + "Kernels/FP32/ConvTranspose2D": [4000], "Kernels/FP32/GEMM/Regular": [8000], "Kernels/FP32/MatMul": [5000], "Kernels/FP32/MaxPool/Regular_2D": [5000], diff --git a/TargetLibraries/Generic/inc/DeeployBasicMath.h b/TargetLibraries/Generic/inc/DeeployBasicMath.h index 22081701a3..2d8aff54c0 100644 --- a/TargetLibraries/Generic/inc/DeeployBasicMath.h +++ b/TargetLibraries/Generic/inc/DeeployBasicMath.h @@ -34,6 +34,7 @@ #include "kernel/BatchNorm.h" #include "kernel/ConvTranspose1d_fp32.h" +#include "kernel/ConvTranspose2d_fp32.h" #include "kernel/Convolution.h" #include "kernel/DWConvolution.h" #include "kernel/Div.h" @@ -48,6 +49,7 @@ #include "kernel/RQDiv.h" #include "kernel/RQGELU.h" #include "kernel/RQHardswish.h" +#include "kernel/ReduceLogSumExp.h" #include "kernel/Relu.h" #include "kernel/RequantShift.h" #include "kernel/Softmax.h" diff --git a/TargetLibraries/Generic/inc/kernel/BatchNorm.h b/TargetLibraries/Generic/inc/kernel/BatchNorm.h index 72703f5fe2..e7af458f96 100644 --- a/TargetLibraries/Generic/inc/kernel/BatchNorm.h +++ b/TargetLibraries/Generic/inc/kernel/BatchNorm.h @@ -11,6 +11,6 @@ void BatchNorm_fp32(const float32_t *input, const float32_t *gamma, const float32_t *beta, const float32_t *mean, const float32_t *var, float32_t *output, int N, int C, - int L); + int L, float epsilon, int channels_first); #endif // BATCHNORM_H diff --git a/TargetLibraries/Generic/inc/kernel/ConvTranspose2d_fp32.h b/TargetLibraries/Generic/inc/kernel/ConvTranspose2d_fp32.h new file mode 100644 index 0000000000..67a62ef4e5 --- /dev/null +++ b/TargetLibraries/Generic/inc/kernel/ConvTranspose2d_fp32.h @@ -0,0 +1,18 @@ +// SPDX-FileCopyrightText: 2026 ETH Zurich and University of Bologna +// +// SPDX-License-Identifier: Apache-2.0 + +#ifndef CONV_TRANSPOSE2D_FP32_H +#define CONV_TRANSPOSE2D_FP32_H + +#include +#include + +void ConvTranspose2d_fp32(const float32_t *input, uint32_t C_in, uint32_t H_in, + uint32_t W_in, const float32_t *weight, + uint32_t C_out, uint32_t K_h, uint32_t K_w, + uint32_t stride_h, uint32_t stride_w, + const float32_t *bias, bool has_bias, + float32_t *output, uint32_t H_out, uint32_t W_out); + +#endif // CONV_TRANSPOSE2D_FP32_H diff --git a/TargetLibraries/Generic/inc/kernel/ReduceLogSumExp.h b/TargetLibraries/Generic/inc/kernel/ReduceLogSumExp.h new file mode 100644 index 0000000000..3e40d5cd2c --- /dev/null +++ b/TargetLibraries/Generic/inc/kernel/ReduceLogSumExp.h @@ -0,0 +1,16 @@ +/* + * SPDX-FileCopyrightText: 2026 ETH Zurich and University of Bologna + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#ifndef __DEEPLOY_BASIC_MATH_REDUCELOGSUMEXP_KERNEL_HEADER_ +#define __DEEPLOY_BASIC_MATH_REDUCELOGSUMEXP_KERNEL_HEADER_ + +#include "DeeployBasicMath.h" + +void ReduceLogSumExp_fp32_fp32(float32_t *input, float32_t *output, + uint32_t outer_size, uint32_t axis_length, + uint32_t inner_size); + +#endif // __DEEPLOY_BASIC_MATH_REDUCELOGSUMEXP_KERNEL_HEADER_ diff --git a/TargetLibraries/Generic/src/BatchNorm_fp32.c b/TargetLibraries/Generic/src/BatchNorm_fp32.c index 1e94d63dbb..fd0e40e79d 100644 --- a/TargetLibraries/Generic/src/BatchNorm_fp32.c +++ b/TargetLibraries/Generic/src/BatchNorm_fp32.c @@ -8,8 +8,7 @@ void BatchNorm_fp32(const float32_t *input, const float32_t *gamma, const float32_t *beta, const float32_t *mean, const float32_t *var, float32_t *output, int N, int C, - int L) { - const float epsilon = 1e-5f; + int L, float epsilon, int channels_first) { #pragma omp parallel for for (int c = 0; c < C; ++c) { float32_t c_mean = mean[c]; @@ -19,7 +18,12 @@ void BatchNorm_fp32(const float32_t *input, const float32_t *gamma, float32_t denom = sqrtf(c_var + epsilon); for (int n = 0; n < N; ++n) { for (int l = 0; l < L; ++l) { - int index = n * C * L + c * L + l; + int index; + if (channels_first) { + index = n * C * L + c * L + l; + } else { + index = n * C * L + l * C + c; + } float32_t x = input[index]; float32_t norm = (x - c_mean) / denom; output[index] = c_gamma * norm + c_beta; diff --git a/TargetLibraries/Generic/src/ConvTranspose2d_fp32.c b/TargetLibraries/Generic/src/ConvTranspose2d_fp32.c new file mode 100644 index 0000000000..51efc078c7 --- /dev/null +++ b/TargetLibraries/Generic/src/ConvTranspose2d_fp32.c @@ -0,0 +1,60 @@ +// SPDX-FileCopyrightText: 2026 ETH Zurich and University of Bologna +// +// SPDX-License-Identifier: Apache-2.0 + +#include "DeeployBasicMath.h" + +void ConvTranspose2d_fp32(const float32_t *input, uint32_t C_in, uint32_t H_in, + uint32_t W_in, const float32_t *weight, + uint32_t C_out, uint32_t K_h, uint32_t K_w, + uint32_t stride_h, uint32_t stride_w, + const float32_t *bias, bool has_bias, + float32_t *output, uint32_t H_out, uint32_t W_out) { + /* + input: [C_in, H_in, W_in] + weight: [C_in, C_out, K_h, K_w] + output: [C_out, H_out, W_out] + bias: [C_out] optionally + */ + + for (uint32_t c = 0; c < C_out; ++c) { + for (uint32_t h = 0; h < H_out; ++h) { + for (uint32_t w = 0; w < W_out; ++w) { + output[(c * H_out + h) * W_out + w] = 0.0f; + } + } + } + + for (uint32_t cout = 0; cout < C_out; ++cout) { + for (uint32_t cin = 0; cin < C_in; ++cin) { + for (uint32_t h_in = 0; h_in < H_in; ++h_in) { + for (uint32_t w_in = 0; w_in < W_in; ++w_in) { + float32_t val = input[(cin * H_in + h_in) * W_in + w_in]; + for (uint32_t kh = 0; kh < K_h; ++kh) { + uint32_t h_out = h_in * stride_h + kh; + if (h_out >= H_out) { + continue; + } + for (uint32_t kw = 0; kw < K_w; ++kw) { + uint32_t w_out = w_in * stride_w + kw; + if (w_out >= W_out) { + continue; + } + float32_t wgt = + weight[((cin * C_out + cout) * K_h + kh) * K_w + kw]; + output[(cout * H_out + h_out) * W_out + w_out] += val * wgt; + } + } + } + } + } + + if (has_bias) { + for (uint32_t h = 0; h < H_out; ++h) { + for (uint32_t w = 0; w < W_out; ++w) { + output[(cout * H_out + h) * W_out + w] += bias[cout]; + } + } + } + } +} diff --git a/TargetLibraries/Generic/src/ReduceLogSumExp_fp32.c b/TargetLibraries/Generic/src/ReduceLogSumExp_fp32.c new file mode 100644 index 0000000000..85ae40a60d --- /dev/null +++ b/TargetLibraries/Generic/src/ReduceLogSumExp_fp32.c @@ -0,0 +1,36 @@ +/* + * SPDX-FileCopyrightText: 2026 ETH Zurich and University of Bologna + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "DeeployBasicMath.h" +#include + +void ReduceLogSumExp_fp32_fp32(float32_t *input, float32_t *output, + uint32_t outer_size, uint32_t axis_length, + uint32_t inner_size) { + + for (uint32_t outer_idx = 0; outer_idx < outer_size; ++outer_idx) { + for (uint32_t inner_idx = 0; inner_idx < inner_size; ++inner_idx) { + float32_t max_val = -INFINITY; + + for (uint32_t axis_idx = 0; axis_idx < axis_length; ++axis_idx) { + uint32_t input_idx = + (outer_idx * axis_length + axis_idx) * inner_size + inner_idx; + if (input[input_idx] > max_val) { + max_val = input[input_idx]; + } + } + + float32_t sum_exp = 0.0f; + for (uint32_t axis_idx = 0; axis_idx < axis_length; ++axis_idx) { + uint32_t input_idx = + (outer_idx * axis_length + axis_idx) * inner_size + inner_idx; + sum_exp += expf(input[input_idx] - max_val); + } + + output[outer_idx * inner_size + inner_idx] = logf(sum_exp) + max_val; + } + } +} diff --git a/TargetLibraries/PULPOpen/inc/DeeployPULPMath.h b/TargetLibraries/PULPOpen/inc/DeeployPULPMath.h index f6e8308c97..4800e6912d 100644 --- a/TargetLibraries/PULPOpen/inc/DeeployPULPMath.h +++ b/TargetLibraries/PULPOpen/inc/DeeployPULPMath.h @@ -29,9 +29,11 @@ #include "kernel/Matmul.h" #include "kernel/MaxPool.h" #include "kernel/RQiHardswish.h" +#include "kernel/Relu.h" #include "kernel/RequantShift.h" #include "kernel/Softmax.h" #include "kernel/UniformRequantShift.h" +#include "kernel/gemm.h" #include "kernel/gemv.h" #include "kernel/iRMSnorm.h" diff --git a/TargetLibraries/PULPOpen/inc/kernel/Conv.h b/TargetLibraries/PULPOpen/inc/kernel/Conv.h index 3ebab54a0b..b5f0e57ea8 100644 --- a/TargetLibraries/PULPOpen/inc/kernel/Conv.h +++ b/TargetLibraries/PULPOpen/inc/kernel/Conv.h @@ -35,4 +35,14 @@ void PULP_DW_Conv2d_Im2Col_fp32_fp32_fp32_HWC( uint32_t pad_left, uint32_t pad_right, float32_t *__restrict__ pContextBuffer); -#endif // __DEEPLOY_MATH_CONV_KERNEL_HEADER_ \ No newline at end of file +void PULP_ConvTranspose2d_fp32_fp32_fp32_CHW( + const float32_t *__restrict__ pSrcA, uint32_t C_in, uint32_t H_in, + uint32_t W_in, const float32_t *__restrict__ pSrcB, uint32_t C_out, + uint32_t groups, uint32_t K_h, uint32_t K_w, uint32_t stride_h, + uint32_t stride_w, uint32_t dilation_h, uint32_t dilation_w, + uint32_t pad_top, uint32_t pad_bottom, uint32_t pad_left, + uint32_t pad_right, const float32_t *__restrict__ pSrcBias, + const bool has_bias, float32_t *__restrict__ pDstC, uint32_t H_out, + uint32_t W_out); + +#endif // __DEEPLOY_MATH_CONV_KERNEL_HEADER_ diff --git a/TargetLibraries/PULPOpen/inc/kernel/MaxPool.h b/TargetLibraries/PULPOpen/inc/kernel/MaxPool.h index b37487439f..021cb5f9b9 100644 --- a/TargetLibraries/PULPOpen/inc/kernel/MaxPool.h +++ b/TargetLibraries/PULPOpen/inc/kernel/MaxPool.h @@ -10,10 +10,10 @@ #include "DeeployPULPMath.h" void PULP_MaxPool2d_fp32_fp32_HWC(const float32_t *__restrict__ pSrcA, - uint32_t W, uint32_t H, uint32_t C, + uint32_t H, uint32_t W, uint32_t C, uint32_t Q, uint32_t P, uint32_t SQ, uint32_t SP, float32_t *__restrict__ pDstC, uint32_t pad_top, uint32_t pad_bottom, uint32_t pad_left, uint32_t pad_right); -#endif // __DEEPLOY_MATH_MAXPOOL_KERNEL_HEADER_ \ No newline at end of file +#endif // __DEEPLOY_MATH_MAXPOOL_KERNEL_HEADER_ diff --git a/TargetLibraries/PULPOpen/src/ConvTranspose_fp32.c b/TargetLibraries/PULPOpen/src/ConvTranspose_fp32.c new file mode 100644 index 0000000000..8e46708273 --- /dev/null +++ b/TargetLibraries/PULPOpen/src/ConvTranspose_fp32.c @@ -0,0 +1,91 @@ +/* + * SPDX-FileCopyrightText: 2026 ETH Zurich and University of Bologna + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "DeeployPULPMath.h" +#include "pmsis.h" + +__attribute__((noinline, optnone)) void PULP_ConvTranspose2d_fp32_fp32_fp32_CHW( + const float32_t *__restrict__ pSrcA, uint32_t C_in, uint32_t H_in, + uint32_t W_in, const float32_t *__restrict__ pSrcB, uint32_t C_out, + uint32_t groups, uint32_t K_h, uint32_t K_w, uint32_t stride_h, + uint32_t stride_w, uint32_t dilation_h, uint32_t dilation_w, + uint32_t pad_top, uint32_t pad_bottom, uint32_t pad_left, + uint32_t pad_right, const float32_t *__restrict__ pSrcBias, + const bool has_bias, float32_t *__restrict__ pDstC, uint32_t H_out, + uint32_t W_out) { + + (void)pad_bottom; + (void)pad_right; + + int8_t core_id = pi_core_id(); + int8_t log2Core = LOG2(NUM_CORES); + + uint16_t ch_out_chunk = + (C_out >> log2Core) + ((C_out & (NUM_CORES - 1)) != 0); + uint16_t ch_out_start = MIN(ch_out_chunk * core_id, C_out); + uint16_t ch_out_stop = MIN(ch_out_start + ch_out_chunk, C_out); + uint16_t ch_out_count = ch_out_stop - ch_out_start; + + if (ch_out_count == 0) { + return; + } + + uint32_t output_plane = H_out * W_out; + + for (uint32_t cout = ch_out_start; cout < ch_out_stop; ++cout) { + float32_t init = has_bias ? pSrcBias[cout] : 0.0f; + float32_t *out_ptr = pDstC + cout * output_plane; + for (uint32_t idx = 0; idx < output_plane; ++idx) { + out_ptr[idx] = init; + } + } + + uint32_t ch_in_per_group = C_in / groups; + uint32_t ch_out_per_group = C_out / groups; + + for (uint32_t cout = ch_out_start; cout < ch_out_stop; ++cout) { + uint32_t group_idx = cout / ch_out_per_group; + uint32_t cout_in_group = cout % ch_out_per_group; + + for (uint32_t cin_in_group = 0; cin_in_group < ch_in_per_group; + ++cin_in_group) { + uint32_t cin = group_idx * ch_in_per_group + cin_in_group; + + for (uint32_t h_in = 0; h_in < H_in; ++h_in) { + for (uint32_t w_in = 0; w_in < W_in; ++w_in) { + float32_t val = pSrcA[(cin * H_in + h_in) * W_in + w_in]; + + for (uint32_t kh = 0; kh < K_h; ++kh) { + int32_t h_out = + (int32_t)(h_in * stride_h + kh * dilation_h) - (int32_t)pad_top; + + if (h_out < 0 || h_out >= (int32_t)H_out) { + continue; + } + + for (uint32_t kw = 0; kw < K_w; ++kw) { + int32_t w_out = (int32_t)(w_in * stride_w + kw * dilation_w) - + (int32_t)pad_left; + + if (w_out < 0 || w_out >= (int32_t)W_out) { + continue; + } + + uint32_t weight_idx = + (((cin * ch_out_per_group + cout_in_group) * K_h + kh) * + K_w) + + kw; + uint32_t out_idx = + (cout * H_out + (uint32_t)h_out) * W_out + (uint32_t)w_out; + + pDstC[out_idx] += val * pSrcB[weight_idx]; + } + } + } + } + } + } +} diff --git a/TargetLibraries/PULPOpen/src/MaxPool.c b/TargetLibraries/PULPOpen/src/MaxPool.c index 3b630b97cc..5ae8a2adb9 100644 --- a/TargetLibraries/PULPOpen/src/MaxPool.c +++ b/TargetLibraries/PULPOpen/src/MaxPool.c @@ -8,7 +8,7 @@ #include "pmsis.h" void PULP_MaxPool2d_fp32_fp32_HWC(const float32_t *__restrict__ pSrcA, - uint32_t W, uint32_t H, uint32_t C, + uint32_t H, uint32_t W, uint32_t C, uint32_t Q, uint32_t P, uint32_t SQ, uint32_t SP, float32_t *__restrict__ pDstC, uint32_t pad_top, uint32_t pad_bottom, @@ -60,4 +60,4 @@ void PULP_MaxPool2d_fp32_fp32_HWC(const float32_t *__restrict__ pSrcA, } } } -} \ No newline at end of file +} diff --git a/cmake/Util.cmake b/cmake/Util.cmake index 1e54dc680b..fc678864e8 100644 --- a/cmake/Util.cmake +++ b/cmake/Util.cmake @@ -4,10 +4,13 @@ macro(add_deeploy_library name) add_library(${ARGV}) - add_custom_command( - TARGET ${name} - POST_BUILD - COMMAND ${CMAKE_OBJDUMP} -dhS $ > $.s) + list(FIND ARGV "STATIC" __deeploy_static_idx) + if(__deeploy_static_idx EQUAL -1) + add_custom_command( + TARGET ${name} + POST_BUILD + COMMAND ${CMAKE_OBJDUMP} -dhS $ > $.s) + endif() endmacro() macro(add_deeploy_executable name)