diff --git a/.github/workflows/ci-platform-snitch-tiled.yml b/.github/workflows/ci-platform-snitch-tiled.yml index 5390d8ad16..4ecd45fba9 100644 --- a/.github/workflows/ci-platform-snitch-tiled.yml +++ b/.github/workflows/ci-platform-snitch-tiled.yml @@ -35,4 +35,4 @@ jobs: with: runner: ${{ needs.select-env.outputs.runner }} docker-image: ${{ needs.select-env.outputs.image }} - pytest-marker: "kernels and singlebuffer and l2" + pytest-marker: "(kernels or models) and singlebuffer and l2" diff --git a/.github/workflows/ci-platform-snitch.yml b/.github/workflows/ci-platform-snitch.yml index c1ae694148..470d43efad 100644 --- a/.github/workflows/ci-platform-snitch.yml +++ b/.github/workflows/ci-platform-snitch.yml @@ -35,4 +35,12 @@ jobs: with: runner: ${{ needs.select-env.outputs.runner }} docker-image: ${{ needs.select-env.outputs.image }} - pytest-marker: "kernels" + pytest-marker: kernels + + snitch-models: + needs: select-env + uses: ./.github/workflows/_runner-snitch.yml + with: + runner: ${{ needs.select-env.outputs.runner }} + docker-image: ${{ needs.select-env.outputs.image }} + pytest-marker: models diff --git a/.yamllint b/.yamllint index ca8d1f606b..8156f0b8e2 100644 --- a/.yamllint +++ b/.yamllint @@ -31,3 +31,5 @@ ignore: - "**/toolchain/" # Ignore all files in .git - "**/.git/**" + # Ignore all files in .venv + - "**/.venv/" diff --git a/Deeploy/DeeployTypes.py b/Deeploy/DeeployTypes.py index d9d768fabc..9367d9406b 100644 --- a/Deeploy/DeeployTypes.py +++ b/Deeploy/DeeployTypes.py @@ -3107,7 +3107,7 @@ def _exportGraph(self, folderPath, fileName): # VJUNG: ONNX-Graphsurgeon needs tensors to be in their export types constTensors = [tensor for tensor in self.graph.tensors().values() if isinstance(tensor, gs.Constant)] for tensor in constTensors: - if tensor.dtype != tensor.export_dtype: + if hasattr(tensor, 'export_dtype') and tensor.dtype != tensor.export_dtype: tensor.values = tensor.values.astype(tensor.export_dtype) model = gs.export_onnx(self.graph) 
diff --git a/Deeploy/Targets/Generic/Bindings.py b/Deeploy/Targets/Generic/Bindings.py index 308b179aef..c40c812048 100644 --- a/Deeploy/Targets/Generic/Bindings.py +++ b/Deeploy/Targets/Generic/Bindings.py @@ -286,6 +286,9 @@ BasicConcatBindings = [ NodeBinding(ConcatChecker([PointerClass(type), PointerClass(type)], [PointerClass(type)]), ConcatTemplate.referenceTemplate, BasicTransformer) for type in IntegerDataTypes +] + [ + NodeBinding(ConcatChecker([PointerClass(float32_t), PointerClass(float32_t)], [PointerClass(float32_t)]), + ConcatTemplate.referenceTemplate, BasicTransformer) ] BasicQuantBindings = [ diff --git a/Deeploy/Targets/Generic/Layers.py b/Deeploy/Targets/Generic/Layers.py index cc733937cc..dcc0273300 100644 --- a/Deeploy/Targets/Generic/Layers.py +++ b/Deeploy/Targets/Generic/Layers.py @@ -709,3 +709,52 @@ def computeOps(self): numPx = opRep['dim_im_out_x'] return numPx * opsPerPx + + +class RMSNormLayer(ONNXLayer): + """Layer support for the ONNX RMSNormalization operator. + + Supported opset: 23 + + It is computed as follows: + - XSquared = Mul(X, X) + - XSquaredMean = ReduceMean(XSquared) + - MeanSquareEpsilon = Add(XSquaredMean, epsilon) + - RMS = Sqrt(MeanSquareEpsilon) + - Normalized = Div(X, RMS) + - Y = Mul(Normalized, Scale) + + For more details, this is the official ONNX documentation: + https://onnx.ai/onnx/operators/onnx__RMSNormalization.html#rmsnormalization-23 + """ + + def __init__(self, maps: List[NodeMapper]): + super().__init__(maps) + + def computeOps(self): + inputSize = self.mapper.parser.operatorRepresentation['inputSize'] + NormalizedAxesSize = self.mapper.parser.operatorRepresentation['NormalizedAxesSize'] + scale = self.mapper.parser.operatorRepresentation['scale'] + + # a. XSquared = Mul(X, X) => inputSize ops + # b. XSquaredMean = ReduceMean(XSquared) + # => inputSize ops (additions) + (inputSize - NormalizedAxesSize) ops (divisions) + # c. 
MeanSquareEpsilon = Add(XSquaredMean, epsilon) => (inputSize - NormalizedAxesSize) ops + # d. RMS = Sqrt(MeanSquareEpsilon) => (inputSize - NormalizedAxesSize) ops + # e. Normalized = Div(X, RMS) => inputSize ops + # f. Y = Mul(Normalized, Scale) => 0 if all(Scale == 1.0), else inputSize ops + scale_ops = 0 if (scale == 1.0).all() else inputSize + ops = 6 * inputSize - 3 * NormalizedAxesSize + scale_ops + return ops + + +class HardSwishLayer(ONNXLayer): + + def __init__(self, maps: List[NodeMapper]): + super().__init__(maps) + + def computeOps(self): + # HardSwish(x) = x * clip(x/6 + 0.5, 0, 1) + # Operations: div + add + clip + mul + size = self.mapper.parser.operatorRepresentation['size'] + return size * 4 diff --git a/Deeploy/Targets/Generic/Parsers.py b/Deeploy/Targets/Generic/Parsers.py index ad787d9e4b..e4e98976e4 100644 --- a/Deeploy/Targets/Generic/Parsers.py +++ b/Deeploy/Targets/Generic/Parsers.py @@ -11,6 +11,37 @@ from Deeploy.DeeployTypes import ConstantBuffer, NetworkContext, NodeParser, VariableBuffer +def compute_broadcast_strides(shape1, shape2, out_shape): + """Compute strides for ONNX/NumPy-style broadcasting. + + Pads both input shapes from the left to match the output ndim, + then computes strides where broadcast dimensions (size 1) get stride 0. 
+ + Example: + shape1=[8,8,8], shape2=[8] + -> strides1=[64,8,1], strides2=[0,0,1] + """ + ndim = len(out_shape) + + pad1 = [1] * (ndim - len(shape1)) + shape1 + pad2 = [1] * (ndim - len(shape2)) + shape2 + + def _calc_strides(padded_shape, out_shape): + strides = [] + stride = 1 + for i in range(ndim - 1, -1, -1): + if padded_shape[i] == 1 and out_shape[i] > 1: + strides.insert(0, 0) + else: + strides.insert(0, stride) + stride *= padded_shape[i] if padded_shape[i] > 1 else 1 + return strides + + strides1 = _calc_strides(pad1, out_shape) + strides2 = _calc_strides(pad2, out_shape) + return strides1, strides2 + + class ConcatParser(NodeParser): def __init__(self): @@ -55,6 +86,10 @@ def parseNode(self, node: gs.Node) -> (bool): self.operatorRepresentation['n_levels'] = int(node.attrs['n_levels']) self.operatorRepresentation['log2D'] = int(math.log2(node.attrs['D'])) + stash_type = node.attrs.get('stash_type', 1) + if stash_type != 1: + raise ValueError(f"iRMSNorm: only stash_type=1 (FP32) is supported, got {stash_type}") + return ret def parseNodeCtxt(self, @@ -70,8 +105,19 @@ def parseNodeCtxt(self, for idx, outputNode in enumerate(node.outputs): self.operatorRepresentation[outputs[idx]] = ctxt.lookup(outputNode.name).name - self.operatorRepresentation['size'] = np.prod(ctxt.lookup(node.inputs[0].name).shape) - self.operatorRepresentation['lastDimLength'] = ctxt.lookup(node.inputs[0].name).shape[-1] + input_shape = list(ctxt.lookup(node.inputs[0].name).shape) + + axis = node.attrs.get('axis', -1) + if axis < 0: + axis = len(input_shape) + axis + + self.operatorRepresentation['inputSize'] = int(np.prod(input_shape)) + self.operatorRepresentation['NormalizedAxesSize'] = int(np.prod(input_shape[axis:])) + self.operatorRepresentation['scale'] = node.inputs[1].values + + # Keep old keys for C template compatibility + self.operatorRepresentation['size'] = int(np.prod(input_shape)) + self.operatorRepresentation['lastDimLength'] = int(input_shape[-1]) return ctxt, True @@ 
-471,23 +517,37 @@ def __init__(self): super().__init__() def parseNode(self, node: gs.Node) -> bool: - ret = all([len(node.inputs) == 2, len(node.outputs) == 1]) - return ret def parseNodeCtxt(self, ctxt: NetworkContext, node: gs.Node, channels_first: bool = True) -> Tuple[NetworkContext, bool]: - data_in_1 = ctxt.lookup(node.inputs[0].name) data_in_2 = ctxt.lookup(node.inputs[1].name) data_out = ctxt.lookup(node.outputs[0].name) + self.operatorRepresentation['data_in_1'] = data_in_1.name self.operatorRepresentation['data_in_2'] = data_in_2.name self.operatorRepresentation['data_out'] = data_out.name - self.operatorRepresentation['size'] = np.prod(data_in_1.shape) + self.operatorRepresentation['size'] = np.prod(data_out.shape) + + # Check if broadcasting is needed + shape1 = list(data_in_1.shape) + shape2 = list(data_in_2.shape) + out_shape = list(data_out.shape) + + need_broadcast = (shape1 != out_shape) or (shape2 != out_shape) + self.operatorRepresentation['need_broadcast'] = need_broadcast + + if need_broadcast: + strides1, strides2 = compute_broadcast_strides(shape1, shape2, out_shape) + + self.operatorRepresentation['ndim'] = len(out_shape) + self.operatorRepresentation['strides1'] = strides1 + self.operatorRepresentation['strides2'] = strides2 + self.operatorRepresentation['out_shape'] = out_shape return ctxt, True @@ -2096,15 +2156,15 @@ def parseNodeCtxt(self, node: gs.Node, channels_first: bool = True) -> Tuple[NetworkContext, bool]: - inputs = ["input1", "input2"] - outputs = ["output"] + inputs = ["A", "B"] + outputs = ["C"] for idx, inputNode in enumerate(node.inputs): if idx < len(inputs): self.operatorRepresentation[inputs[idx]] = ctxt.lookup(inputNode.name).name for idx, outputNode in enumerate(node.outputs): self.operatorRepresentation[outputs[idx]] = ctxt.lookup(outputNode.name).name - self.operatorRepresentation['size'] = np.prod(ctxt.lookup(self.operatorRepresentation['input1']).shape) + self.operatorRepresentation['size'] = 
np.prod(ctxt.lookup(self.operatorRepresentation['A']).shape) return ctxt, True diff --git a/Deeploy/Targets/Generic/Templates/FloatDivTemplate.py b/Deeploy/Targets/Generic/Templates/FloatDivTemplate.py index 34236311a0..6dfb9faeab 100644 --- a/Deeploy/Targets/Generic/Templates/FloatDivTemplate.py +++ b/Deeploy/Targets/Generic/Templates/FloatDivTemplate.py @@ -6,5 +6,5 @@ referenceTemplate = NodeTemplate(""" // Division (Name: ${nodeName}, Op: ${nodeOp}) -SINGLE_CORE Div_fp${input1_type.referencedType.typeWidth}_fp${input2_type.referencedType.typeWidth}_fp${output_type.referencedType.typeWidth}(${input1}, ${input2}, ${output}, ${size}); +SINGLE_CORE Div_fp${A_type.referencedType.typeWidth}_fp${B_type.referencedType.typeWidth}_fp${C_type.referencedType.typeWidth}(${A}, ${B}, ${C}, ${size}); """) diff --git a/Deeploy/Targets/Generic/TypeCheckers.py b/Deeploy/Targets/Generic/TypeCheckers.py index c2c8d436f8..88b1b97cf8 100644 --- a/Deeploy/Targets/Generic/TypeCheckers.py +++ b/Deeploy/Targets/Generic/TypeCheckers.py @@ -6,7 +6,7 @@ import numpy as np -from Deeploy.AbstractDataTypes import Pointer +from Deeploy.AbstractDataTypes import FloatImmediate, Pointer from Deeploy.CommonExtensions.TypeCheckers.SignPropTypeChecker import SignPropTypeChecker from Deeploy.DeeployTypes import ConstantBuffer, OperatorRepresentation, VariableBuffer @@ -409,7 +409,10 @@ def __init__(self, input_types: Sequence[Type[Pointer]], output_types: Sequence[ def _inferNumLevels(self, inputs: List[VariableBuffer], operatorRepresentation: OperatorRepresentation) -> List[int]: - return [2**(4 * self.input_types[0].referencedType.typeWidth)] + input_type = self.input_types[0].referencedType + if issubclass(input_type, FloatImmediate): + return [2**(input_type.typeWidth)] + return [2**(4 * input_type.typeWidth)] def _inferSignedness(self, inputs: List[VariableBuffer], operatorRepresentation: OperatorRepresentation) -> List[bool]: @@ -610,3 +613,25 @@ def _inferNumLevels(self, inputs: 
List[VariableBuffer], def _inferSignedness(self, inputs: List[VariableBuffer], operatorRepresentation: OperatorRepresentation) -> List[bool]: return [True] + + +class RMSNormChecker(SignPropTypeChecker): + + def __init__(self, input_types: Sequence[Type[Pointer]], output_types: Sequence[Type[Pointer]]): + super().__init__(input_types, output_types) + + def _inferNumLevels(self, inputs: List[VariableBuffer], + operatorRepresentation: OperatorRepresentation) -> List[int]: + # RMSNorm: square, mean, sqrt, reciprocal, multiply + # Output precision similar to input + return [2**(self.input_types[0].referencedType.typeWidth)] + + def _inferSignedness(self, inputs: List[VariableBuffer], + operatorRepresentation: OperatorRepresentation) -> List[bool]: + # RMSNorm output can be signed (depending on input signedness) + if inputs[0]._signed: + return [True] + else: + return [False] + + diff --git a/Deeploy/Targets/Snitch/Bindings.py b/Deeploy/Targets/Snitch/Bindings.py index 25b150b553..4cb894bf93 100644 --- a/Deeploy/Targets/Snitch/Bindings.py +++ b/Deeploy/Targets/Snitch/Bindings.py @@ -7,16 +7,24 @@ from Deeploy.AbstractDataTypes import PointerClass from Deeploy.CommonExtensions.CodeTransformationPasses.Closure import ClosureGeneration, MemoryAwareClosureGeneration from Deeploy.CommonExtensions.CodeTransformationPasses.MemoryAllocation import ArgumentStructGeneration, \ - MemoryManagementGeneration + MemoryManagementGeneration, MemoryPassthroughGeneration from Deeploy.CommonExtensions.DataTypes import float32_t, int8_t, int32_t, uint8_t from Deeploy.DeeployTypes import CodeTransformation, NodeBinding from Deeploy.FutureExtension.CodeTransformationPasses.FutureCodeTransformation import FutureGeneration -from Deeploy.Targets.Generic.Templates import iNoNormTemplate -from Deeploy.Targets.Generic.TypeCheckers import AddChecker, GEMMChecker, RQAddChecker, SoftmaxChecker, iNoNormChecker +from Deeploy.Targets.Generic.Templates import ConcatTemplate, GatherTemplate, 
MatMulTemplate, iNoNormTemplate +from Deeploy.Targets.Generic.TypeCheckers import AddChecker, ConcatChecker, DivChecker, GatherChecker, GEMMChecker, \ + HardswishChecker, MatMulChecker, MulChecker, ReshapeChecker, RMSNormChecker, RQAddChecker, SoftmaxChecker, \ + TransposeChecker, iNoNormChecker from Deeploy.Targets.Snitch.CodeTransformationPasses import SnitchClusterTiling, SnitchCoreFilterPass, \ SnitchSynchCoresPass from Deeploy.Targets.Snitch.DMA.SnitchDma import SnitchDma -from Deeploy.Targets.Snitch.Templates import AddTemplate, FloatGemmTemplate, RQAddTemplate, iSoftmaxTemplate +from Deeploy.Targets.Snitch.Templates import AddTemplate, FloatGemmTemplate, FloatMatMulTemplate, \ + ReshapeTemplate, RQAddTemplate, TransposeTemplate, iSoftmaxTemplate +from Deeploy.Targets.Snitch.Templates.FloatAddTemplate import referenceTemplate as FloatAddTemplate +from Deeploy.Targets.Snitch.Templates.FloatDivTemplate import referenceTemplate as FloatDivTemplate +from Deeploy.Targets.Snitch.Templates.FloatHardSwishTemplate import referenceTemplate as FloatHardSwishTemplate +from Deeploy.Targets.Snitch.Templates.FloatMulTemplate import referenceTemplate as FloatMulTemplate +from Deeploy.Targets.Snitch.Templates.FloatRMSNormTemplate import referenceTemplate as FloatRMSNormTemplate from Deeploy.Targets.Snitch.Templates.FloatSoftmaxTemplate import FloatSoftmax_Template from Deeploy.Targets.Snitch.Templates.GemmTemplate import SnitchGemm_Template from Deeploy.Targets.Snitch.Templates.RqGemmTemplate import SnitchRqGemm_Template @@ -29,10 +37,11 @@ startRegion = "L2", endRegion = "L1") -BasicTransformer = CodeTransformation( +SkipTransformer = CodeTransformation( [SnitchSynchCoresPass(), ArgumentStructGeneration(), - MemoryManagementGeneration(), + MemoryPassthroughGeneration("L.*"), + MemoryPassthroughGeneration(), FutureGeneration()]) TiledTransformer = CodeTransformation([ @@ -45,6 +54,7 @@ ArgumentStructGeneration(), MemoryManagementGeneration("L1"), 
MemoryAwareFunctionCallClosure(writeback = False, generateStruct = True), + MemoryManagementGeneration("L2"), MemoryManagementGeneration() ]) @@ -69,7 +79,12 @@ SnitchAddBindings = [ NodeBinding(AddChecker([PointerClass(_type), PointerClass(_type)], [PointerClass(int32_t)]), AddTemplate.referenceTemplate, TiledTransformer) for _type in [int8_t] +] + [ + # fp32 support + NodeBinding(AddChecker([PointerClass(float32_t), PointerClass(float32_t)], [PointerClass(float32_t)]), + FloatAddTemplate, TiledTransformer) ] + SnitchGemmBindings = [ NodeBinding( GEMMChecker([PointerClass(int8_t), PointerClass(int8_t), @@ -90,3 +105,54 @@ PointerClass(int32_t) ], [PointerClass(int8_t)]), SnitchRqGemm_Template, TiledTransformer) ] + +SnitchRMSNormBindings = [ + NodeBinding(RMSNormChecker([PointerClass(float32_t), PointerClass(float32_t)], [PointerClass(float32_t)]), + FloatRMSNormTemplate, TiledTransformer) +] + +SnitchHardSwishBindings = [ + NodeBinding(HardswishChecker([PointerClass(float32_t)], [PointerClass(float32_t)]), FloatHardSwishTemplate, + TiledTransformer) +] + +SnitchDivBindings = [ + NodeBinding(DivChecker([PointerClass(float32_t), PointerClass(float32_t)], [PointerClass(float32_t)]), + FloatDivTemplate, TiledTransformer) +] + +SnitchMulBindings = [ + NodeBinding(MulChecker([PointerClass(float32_t), PointerClass(float32_t)], [PointerClass(float32_t)]), + FloatMulTemplate, TiledTransformer) +] + +# MatMul Bindings (Tiled) +SnitchMatMulBindings = [ + NodeBinding(MatMulChecker([PointerClass(int8_t), PointerClass(int8_t)], [PointerClass(int32_t)]), + MatMulTemplate.referenceTemplate, TiledTransformer), + NodeBinding(MatMulChecker([PointerClass(float32_t), PointerClass(float32_t)], [PointerClass(float32_t)]), + FloatMatMulTemplate.referenceTemplate, TiledTransformer) +] + +# Concat Bindings (Tiled) +SnitchConcatBindings = [ + NodeBinding(ConcatChecker([PointerClass(float32_t), PointerClass(float32_t)], [PointerClass(float32_t)]), + ConcatTemplate.referenceTemplate, 
TiledTransformer) +] + +SnitchTransposeBindings = [ + NodeBinding(TransposeChecker([PointerClass(float32_t)], [PointerClass(float32_t)]), + TransposeTemplate.referenceTemplate, TiledTransformer) +] + +# Reshape Bindings (pointer passthrough, no DMA needed) +SnitchReshapeBindings = [ + NodeBinding(ReshapeChecker([PointerClass(float32_t)], [PointerClass(float32_t)]), ReshapeTemplate.referenceTemplate, + SkipTransformer) +] + +# Gather Bindings (Tiled) +SnitchGatherBindings = [ + NodeBinding(GatherChecker([PointerClass(float32_t), PointerClass(int32_t)], [PointerClass(float32_t)]), + GatherTemplate.referenceTemplate, TiledTransformer) +] diff --git a/Deeploy/Targets/Snitch/Parsers.py b/Deeploy/Targets/Snitch/Parsers.py index 0051994686..685ead2b28 100644 --- a/Deeploy/Targets/Snitch/Parsers.py +++ b/Deeploy/Targets/Snitch/Parsers.py @@ -4,10 +4,12 @@ from typing import Tuple +import numpy as np import onnx_graphsurgeon as gs from Deeploy.DeeployTypes import NetworkContext -from Deeploy.Targets.Generic.Parsers import GEMMParser, RQGEMMParser +from Deeploy.Targets.Generic.Parsers import AddParser, DivParser, GEMMParser, MulParser, RQGEMMParser, \ + iHardswishParser, iRMSNormParser class SnitchGEMMParser(GEMMParser): @@ -72,3 +74,85 @@ def parseNodeCtxt(self, return ctxt, False return newCtxt, True + + +class SnitchRMSNormParser(iRMSNormParser): + """FP32 RMSNorm parser. Inherits parseNodeCtxt from iRMSNormParser.""" + + def parseNode(self, node: gs.Node) -> bool: + if node.op != 'RMSNorm': + return False + if len(node.inputs) != 2 or len(node.outputs) != 1: + return False + + eps = node.attrs.get('eps', node.attrs.get('epsilon', 1e-6)) + self.operatorRepresentation['eps'] = f"{float(eps):.10e}f" + + stash_type = node.attrs.get('stash_type', 1) + if stash_type != 1: + raise ValueError(f"RMSNorm: only stash_type=1 (FP32) is supported, got {stash_type}") + + return True + + +class SnitchHardSwishParser(iHardswishParser): + """FP32 HardSwish parser. 
Inherits parseNodeCtxt from iHardswishParser.""" + + def parseNode(self, node: gs.Node) -> bool: + if node.op != 'HardSwish': + return False + if len(node.inputs) != 1 or len(node.outputs) != 1: + return False + return True + + +class SnitchAddParser(AddParser): + """Inherits from Generic AddParser which already handles broadcasting.""" + + pass + + +class SnitchDivParser(DivParser): + """Inherits from Generic DivParser and adds scalar detection.""" + + def parseNodeCtxt(self, + ctxt: NetworkContext, + node: gs.Node, + channels_first: bool = True) -> Tuple[NetworkContext, bool]: + ctxt, ret = super().parseNodeCtxt(ctxt, node, channels_first) + if not ret: + return ctxt, False + + shape1 = list(ctxt.lookup(node.inputs[0].name).shape) + shape2 = list(ctxt.lookup(node.inputs[1].name).shape) + out_shape = list(ctxt.lookup(node.outputs[0].name).shape) + + self.operatorRepresentation['size'] = int(np.prod(out_shape)) + self.operatorRepresentation['input1_is_scalar'] = (np.prod(shape1) == 1) + self.operatorRepresentation['input2_is_scalar'] = (np.prod(shape2) == 1) + self.operatorRepresentation['is_scalar'] = (np.prod(shape1) == 1 or np.prod(shape2) == 1) + + return ctxt, True + + +class SnitchMulParser(MulParser): + """Inherits from Generic MulParser and adds scalar detection.""" + + def parseNodeCtxt(self, + ctxt: NetworkContext, + node: gs.Node, + channels_first: bool = True) -> Tuple[NetworkContext, bool]: + ctxt, ret = super().parseNodeCtxt(ctxt, node, channels_first) + if not ret: + return ctxt, False + + shape1 = list(ctxt.lookup(node.inputs[0].name).shape) + shape2 = list(ctxt.lookup(node.inputs[1].name).shape) + out_shape = list(ctxt.lookup(node.outputs[0].name).shape) + + self.operatorRepresentation['size'] = int(np.prod(out_shape)) + self.operatorRepresentation['input1_is_scalar'] = (np.prod(shape1) == 1) + self.operatorRepresentation['input2_is_scalar'] = (np.prod(shape2) == 1) + self.operatorRepresentation['is_scalar'] = (np.prod(shape1) == 1 or 
np.prod(shape2) == 1) + + return ctxt, True diff --git a/Deeploy/Targets/Snitch/Platform.py b/Deeploy/Targets/Snitch/Platform.py index d62d1c3802..83dc778b17 100644 --- a/Deeploy/Targets/Snitch/Platform.py +++ b/Deeploy/Targets/Snitch/Platform.py @@ -2,45 +2,59 @@ # # SPDX-License-Identifier: Apache-2.0 -from typing import List +from typing import List, Type import numpy as np +from Deeploy.AbstractDataTypes import Pointer, PointerClass, VoidType from Deeploy.DeeployTypes import ConstantBuffer, DeploymentEngine, DeploymentPlatform, NodeMapper, NodeTemplate, \ StructBuffer, TopologyOptimizer, TransientBuffer, VariableBuffer -from Deeploy.Targets.Generic.Bindings import BasicGatherBindings, BasicLayerNormBindings, BasicMatMulBindings, \ - BasicPad1DBindings, BasicPad2DBindings, BasicReshapeBindings, BasicRQIntegerDivBinding -from Deeploy.Targets.Generic.Layers import AddLayer, GatherLayer, GEMMLayer, LayerNormLayer, MatMulLayer, PadLayer, \ - ReshapeLayer, RQGEMMLayer, RQIntegerDivLayer, SoftmaxLayer, iNoNormLayer -from Deeploy.Targets.Generic.Parsers import AddParser, GatherParser, MatMulParser, Pad1DParser, Pad2DParser, \ - RQAddParser, RQIntegerDivParser, SoftmaxParser, UnsqueezeParser, iLayerNormParser, iNoNormParser, iSoftmaxParser +from Deeploy.Targets.Generic.Bindings import BasicLayerNormBindings, BasicPad1DBindings, BasicPad2DBindings, \ + BasicReshapeBindings, BasicRQIntegerDivBinding +from Deeploy.Targets.Generic.Layers import AddLayer, ConcatLayer, DivLayer, GatherLayer, GEMMLayer, HardSwishLayer, \ + LayerNormLayer, MatMulLayer, MulLayer, PadLayer, ReshapeLayer, RMSNormLayer, RQGEMMLayer, RQIntegerDivLayer, \ + SoftmaxLayer, TransposeLayer, iNoNormLayer +from Deeploy.Targets.Generic.Parsers import ConcatParser, GatherParser, MatMulParser, Pad1DParser, Pad2DParser, \ + ReshapeParser, RQAddParser, RQIntegerDivParser, SoftmaxParser, TransposeParser, UnsqueezeParser, iLayerNormParser, \ + iNoNormParser, iSoftmaxParser from Deeploy.Targets.Generic.Templates 
import AllocateTemplate as BasicAllocateTemplate from Deeploy.Targets.Generic.TopologyOptimizationPasses.Passes import AddRequantMergePass, GEMMRequantMergePass, \ IntegerDivRequantMergePass, MergeConstAddAndRequantPass, MergeTrueIntegerDivRequantShiftPass, RQSSplitPass, \ SkipEmptyConcatPass, SkipUnityRequantPass, iGELURequantMergePass, iHardswishRequantMergePass from Deeploy.Targets.PULPOpen.Platform import RQAddMapper -from Deeploy.Targets.Snitch.Parsers import SnitchGEMMParser, SnitchRQGEMMParser +from Deeploy.Targets.Snitch.Parsers import SnitchAddParser, SnitchDivParser, SnitchGEMMParser, \ + SnitchHardSwishParser, SnitchMulParser, SnitchRMSNormParser, SnitchRQGEMMParser from Deeploy.Targets.Snitch.Templates import AllocateTemplate, FreeTemplate -from Deeploy.Targets.Snitch.Tiler import SnitchAddTileReadyBindings, SnitchGemmTilingReadyBindings, \ - SnitchiNoNormTilingReadyBindings, SnitchiSoftmaxTilingReadyBindings, SnitchRQAddTilingReadyBindings, \ - SnitchRqGemmTilingReadyBindings +from Deeploy.Targets.Snitch.Tiler import SnitchAddTileReadyBindings, SnitchConcatTilingReadyBindings, \ + SnitchDivTilingReadyBindings, SnitchGatherTilingReadyBindings, SnitchGemmTilingReadyBindings, \ + SnitchHardSwishTilingReadyBindings, SnitchiNoNormTilingReadyBindings, SnitchiSoftmaxTilingReadyBindings, \ + SnitchMatMulTilingReadyBindings, SnitchMulTilingReadyBindings, SnitchReshapeTilingReadyBindings, \ + SnitchRMSNormTilingReadyBindings, SnitchRQAddTilingReadyBindings, SnitchRqGemmTilingReadyBindings, \ + SnitchTransposeTilingReadyBindings -GatherMapper = NodeMapper(GatherParser(), BasicGatherBindings) Pad1DMapper = NodeMapper(Pad1DParser(), BasicPad1DBindings) Pad2DMapper = NodeMapper(Pad2DParser(), BasicPad2DBindings) -UnsqueezeMapper = NodeMapper(UnsqueezeParser(), BasicReshapeBindings) - RQIntegerDivMapper = NodeMapper(RQIntegerDivParser(), [BasicRQIntegerDivBinding]) +iLayerNormMapper = NodeMapper(iLayerNormParser(), BasicLayerNormBindings) -MatMulMapper = 
NodeMapper(MatMulParser(), BasicMatMulBindings) +# All other mappers use TilingReadyBindings (works for both tiled and untiled) +GatherMapper = NodeMapper(GatherParser(), SnitchGatherTilingReadyBindings) +UnsqueezeMapper = NodeMapper(UnsqueezeParser(), SnitchReshapeTilingReadyBindings) +ReshapeMapper = NodeMapper(ReshapeParser(), SnitchReshapeTilingReadyBindings) +TransposeMapper = NodeMapper(TransposeParser(), SnitchTransposeTilingReadyBindings) +ConcatMapper = NodeMapper(ConcatParser(), SnitchConcatTilingReadyBindings) +MatMulMapper = NodeMapper(MatMulParser(), SnitchMatMulTilingReadyBindings) GemmMapper = NodeMapper(SnitchGEMMParser(), SnitchGemmTilingReadyBindings) RqGemmMapper = NodeMapper(SnitchRQGEMMParser(), SnitchRqGemmTilingReadyBindings) iSoftmaxMapper = NodeMapper(iSoftmaxParser(), SnitchiSoftmaxTilingReadyBindings) SoftmaxMapper = NodeMapper(SoftmaxParser(), SnitchiSoftmaxTilingReadyBindings) iNoNormMapper = NodeMapper(iNoNormParser(), SnitchiNoNormTilingReadyBindings) -iLayerNormMapper = NodeMapper(iLayerNormParser(), BasicLayerNormBindings) RQAddMapper = NodeMapper(RQAddParser(), SnitchRQAddTilingReadyBindings) -AddMapper = NodeMapper(AddParser(), SnitchAddTileReadyBindings) +AddMapper = NodeMapper(SnitchAddParser(), SnitchAddTileReadyBindings) +RMSNormMapper = NodeMapper(SnitchRMSNormParser(), SnitchRMSNormTilingReadyBindings) +HardSwishMapper = NodeMapper(SnitchHardSwishParser(), SnitchHardSwishTilingReadyBindings) +DivMapper = NodeMapper(SnitchDivParser(), SnitchDivTilingReadyBindings) +MulMapper = NodeMapper(SnitchMulParser(), SnitchMulTilingReadyBindings) SnitchMapping = { 'RQIntegerDiv': RQIntegerDivLayer([RQIntegerDivMapper]), @@ -56,13 +70,20 @@ 'iLayerNorm': LayerNormLayer([iLayerNormMapper]), 'RequantizedAdd': AddLayer([RQAddMapper]), 'Add': AddLayer([AddMapper]), + 'RMSNorm': RMSNormLayer([RMSNormMapper]), + 'HardSwish': HardSwishLayer([HardSwishMapper]), + 'Div': DivLayer([DivMapper]), + 'Mul': MulLayer([MulMapper]), + 'Reshape': 
ReshapeLayer([ReshapeMapper]), + 'Transpose': TransposeLayer([TransposeMapper]), + 'Concat': ConcatLayer([ConcatMapper]), } class SnitchVariableBuffer(VariableBuffer): initTemplate = AllocateTemplate.snitchL2InitTemplate - allocTemplate = AllocateTemplate.snitchGenericAllocate + allocTemplate = AllocateTemplate.snitchGenericGuardedAllocate deallocTemplate = FreeTemplate.snitchGenericFree def _bufferRepresentation(self): @@ -83,7 +104,7 @@ def _bufferRepresentation(self): class SnitchTransientBuffer(TransientBuffer): initTemplate = AllocateTemplate.snitchL2InitTemplate - allocTemplate = AllocateTemplate.snitchGenericAllocate + allocTemplate = AllocateTemplate.snitchGenericGuardedAllocate deallocTemplate = FreeTemplate.snitchGenericFree # allocTemplate = AllocateTemplate.snitchL2AllocateTemplate @@ -105,6 +126,12 @@ class SnitchConstantBuffer(ConstantBuffer): allocTemplate = AllocateTemplate.snitchL2GlobalAllocateTemplate deallocTemplate = FreeTemplate.snitchL2GlobalTemplate + def __init__(self, name: str = '', shape = [1], values = [0]): + super().__init__(name, shape, values) + # Initialize _type with a default value to prevent AttributeError + # The actual type will be set later via annotateType + self._type: Type[Pointer] = PointerClass(VoidType) + def _bufferRepresentation(self): operatorRepresentation = super()._bufferRepresentation() diff --git a/Deeploy/Targets/Snitch/Templates/AllocateTemplate.py b/Deeploy/Targets/Snitch/Templates/AllocateTemplate.py index 6c1d898645..49decc5157 100644 --- a/Deeploy/Targets/Snitch/Templates/AllocateTemplate.py +++ b/Deeploy/Targets/Snitch/Templates/AllocateTemplate.py @@ -55,3 +55,13 @@ // ${name} with size ${size} allocated in L2! 
% endif """) + +snitchGenericGuardedAllocate = NodeTemplate(""" +% if _memoryLevel == "L1": +if (snrt_is_dm_core()) { ${name} = (${type.typeName}) snrt_l1alloc(sizeof(${type.referencedType.typeName}) * ${size}); } +snrt_cluster_hw_barrier();\n +% else: +if (snrt_is_dm_core()) { ${name} = (${type.typeName}) snrt_l3alloc(sizeof(${type.referencedType.typeName}) * ${size}); } +snrt_cluster_hw_barrier();\n +% endif +""") diff --git a/Deeploy/Targets/Snitch/Templates/FloatAddTemplate.py b/Deeploy/Targets/Snitch/Templates/FloatAddTemplate.py new file mode 100644 index 0000000000..20b72e4d27 --- /dev/null +++ b/Deeploy/Targets/Snitch/Templates/FloatAddTemplate.py @@ -0,0 +1,45 @@ +# SPDX-FileCopyrightText: 2026 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +from typing import Dict, List, Tuple + +from Deeploy.DeeployTypes import NetworkContext, NodeTemplate, OperatorRepresentation + + +class _FloatAddTemplate(NodeTemplate): + + def alignToContext(self, ctxt: NetworkContext, + operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, Dict, List[str]]: + # Always initialize these variables to avoid Mako errors + operatorRepresentation.setdefault('need_broadcast', False) + operatorRepresentation.setdefault('ndim', 0) + operatorRepresentation.setdefault('strides1_str', '{}') + operatorRepresentation.setdefault('strides2_str', '{}') + operatorRepresentation.setdefault('out_shape_str', '{}') + + # If broadcasting is required, generate the stride array strings + if operatorRepresentation['need_broadcast']: + strides1 = operatorRepresentation['strides1'] + strides2 = operatorRepresentation['strides2'] + out_shape = operatorRepresentation['out_shape'] + operatorRepresentation['strides1_str'] = '{' + ', '.join(map(str, strides1)) + '}' + operatorRepresentation['strides2_str'] = '{' + ', '.join(map(str, strides2)) + '}' + operatorRepresentation['out_shape_str'] = '{' + ', '.join(map(str, out_shape)) + '}' + + return ctxt, 
operatorRepresentation, [] + + +referenceTemplate = _FloatAddTemplate(""" +// Snitch FP32 Add (Name: ${nodeName}, Op: ${nodeOp}) +% if need_broadcast: +{ + uint32_t strides1[${ndim}] = ${strides1_str}; + uint32_t strides2[${ndim}] = ${strides2_str}; + uint32_t out_shape[${ndim}] = ${out_shape_str}; + Add_fp32_broadcast(${data_in_1}, ${data_in_2}, ${data_out}, out_shape, strides1, strides2, ${ndim}, ${size}); +} +% else: +Add_fp32(${data_in_1}, ${data_in_2}, ${data_out}, ${size}); +% endif +""") diff --git a/Deeploy/Targets/Snitch/Templates/FloatDivTemplate.py b/Deeploy/Targets/Snitch/Templates/FloatDivTemplate.py new file mode 100644 index 0000000000..a4a7e05bd4 --- /dev/null +++ b/Deeploy/Targets/Snitch/Templates/FloatDivTemplate.py @@ -0,0 +1,49 @@ +# SPDX-FileCopyrightText: 2026 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +from typing import Dict, List, Tuple + +from mako.template import Template as MakoTemplate + +from Deeploy.DeeployTypes import NetworkContext, NodeTemplate, OperatorRepresentation + + +class FloatDivTemplate(NodeTemplate): + """Template for FP32 Div operation with dynamic template selection.""" + + def __init__(self, templateStr): + super().__init__(templateStr) + + def alignToContext(self, ctxt: NetworkContext, + operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, Dict, List[str]]: + + # Check if scalar broadcasting + is_scalar = operatorRepresentation.get('is_scalar', False) + + # IMPORTANT: Must recompile self.template (Mako Template object), + # not just assign self.templateStr. NodeTemplate.generate() uses + # the pre-compiled self.template, not self.templateStr. 
+ if is_scalar: + self.template = MakoTemplate(FloatDivScalarTemplateStr, strict_undefined = True) + else: + self.template = MakoTemplate(FloatDivTemplateStr, strict_undefined = True) + + return ctxt, operatorRepresentation, [] + + +# Template for element-wise division +FloatDivTemplateStr = r""" +Div_fp32(${A}, ${B}, ${C}, ${size}); +""" + +# Template for scalar broadcasting (optimized) +FloatDivScalarTemplateStr = r""" +{ + float32_t scalar = ${B}[0]; + Div_fp32_scalar(${A}, scalar, ${C}, ${size}); +} +""" + +# Create reference template with default (element-wise) +referenceTemplate = FloatDivTemplate(FloatDivTemplateStr) diff --git a/Deeploy/Targets/Snitch/Templates/FloatHardSwishTemplate.py b/Deeploy/Targets/Snitch/Templates/FloatHardSwishTemplate.py new file mode 100644 index 0000000000..1615282437 --- /dev/null +++ b/Deeploy/Targets/Snitch/Templates/FloatHardSwishTemplate.py @@ -0,0 +1,30 @@ +# SPDX-FileCopyrightText: 2026 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +from typing import Dict, List, Tuple + +import numpy as np + +from Deeploy.DeeployTypes import NetworkContext, NodeTemplate, OperatorRepresentation + + +class FloatHardSwishTemplate(NodeTemplate): + + def __init__(self, templateStr): + super().__init__(templateStr) + + def alignToContext(self, ctxt: NetworkContext, + operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, Dict, List[str]]: + + data_in = ctxt.lookup(operatorRepresentation["data_in"]) + operatorRepresentation["size"] = int(np.prod(data_in.shape)) + + return ctxt, operatorRepresentation, [] + + +FloatHardSwishTemplateStr = r""" +HardSwish_fp32(${data_in}, ${data_out}, ${size}); +""" + +referenceTemplate = FloatHardSwishTemplate(FloatHardSwishTemplateStr) diff --git a/Deeploy/Targets/Snitch/Templates/FloatMatMulTemplate.py b/Deeploy/Targets/Snitch/Templates/FloatMatMulTemplate.py new file mode 100644 index 0000000000..939822c944 --- /dev/null +++ 
b/Deeploy/Targets/Snitch/Templates/FloatMatMulTemplate.py @@ -0,0 +1,31 @@ +# SPDX-FileCopyrightText: 2026 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +from Deeploy.DeeployTypes import NodeTemplate + +# Multi-core MatMul: all compute cores enter, kernel handles work distribution internally. +# Framework adds snrt_is_compute_core() guard and barriers via SnitchCoreFilterPass/SnitchSynchCoresPass. +referenceTemplate = NodeTemplate(""" +// Matmul (Name: ${nodeName}, Op: ${nodeOp}) +{ + ${A_type.typeName} ref_${data_out}_${A} = ${A}; + ${B_type.typeName} ref_${data_out}_${B} = ${B}; + ${data_out_type.typeName} ref_${data_out}_${data_out} = ${data_out}; + + for(uint32_t i=0; i<${batch}; i++){ + matmul_fp32_opt( + ref_${data_out}_${A}, + ref_${data_out}_${B}, + ref_${data_out}_${data_out}, + ${M}, + ${N}, + ${O} + ); + + ref_${data_out}_${A} += ${M} * ${N}; + ref_${data_out}_${B} += ${N} * ${O}; + ref_${data_out}_${data_out} += ${M} * ${O}; + } +} +""") diff --git a/Deeploy/Targets/Snitch/Templates/FloatMulTemplate.py b/Deeploy/Targets/Snitch/Templates/FloatMulTemplate.py new file mode 100644 index 0000000000..ec2ec8595c --- /dev/null +++ b/Deeploy/Targets/Snitch/Templates/FloatMulTemplate.py @@ -0,0 +1,49 @@ +# SPDX-FileCopyrightText: 2026 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +from typing import Dict, List, Tuple + +from mako.template import Template as MakoTemplate + +from Deeploy.DeeployTypes import NetworkContext, NodeTemplate, OperatorRepresentation + + +class FloatMulTemplate(NodeTemplate): + """Template for FP32 Mul operation with dynamic template selection.""" + + def __init__(self, templateStr): + super().__init__(templateStr) + + def alignToContext(self, ctxt: NetworkContext, + operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, Dict, List[str]]: + + # Check if scalar broadcasting + is_scalar = operatorRepresentation.get('is_scalar', False) + + # IMPORTANT: 
Must recompile self.template (Mako Template object), + # not just assign self.templateStr. NodeTemplate.generate() uses + # the pre-compiled self.template, not self.templateStr. + if is_scalar: + self.template = MakoTemplate(FloatMulScalarTemplateStr, strict_undefined = True) + else: + self.template = MakoTemplate(FloatMulTemplateStr, strict_undefined = True) + + return ctxt, operatorRepresentation, [] + + +# Template for element-wise multiplication +FloatMulTemplateStr = r""" +Mul_fp32(${A}, ${B}, ${C}, ${size}); +""" + +# Template for scalar broadcasting (optimized) +FloatMulScalarTemplateStr = r""" +{ + float32_t scalar = ${B}[0]; + Mul_fp32_scalar(${A}, scalar, ${C}, ${size}); +} +""" + +# Create reference template with default (element-wise) +referenceTemplate = FloatMulTemplate(FloatMulTemplateStr) diff --git a/Deeploy/Targets/Snitch/Templates/FloatRMSNormTemplate.py b/Deeploy/Targets/Snitch/Templates/FloatRMSNormTemplate.py new file mode 100644 index 0000000000..f25bdf53c0 --- /dev/null +++ b/Deeploy/Targets/Snitch/Templates/FloatRMSNormTemplate.py @@ -0,0 +1,33 @@ +# SPDX-FileCopyrightText: 2026 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +from typing import Dict, List, Tuple + +import numpy as np + +from Deeploy.DeeployTypes import NetworkContext, NodeTemplate, OperatorRepresentation + + +class FloatRMSNormTemplate(NodeTemplate): + + def __init__(self, templateStr): + super().__init__(templateStr) + + def alignToContext(self, ctxt: NetworkContext, + operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, Dict, List[str]]: + + data_in = ctxt.lookup(operatorRepresentation["data_in"]) + input_shape = list(data_in.shape) + + operatorRepresentation["size"] = int(np.prod(input_shape)) + operatorRepresentation["lastDimLength"] = operatorRepresentation["NormalizedAxesSize"] + + return ctxt, operatorRepresentation, [] + + +FloatRMSNormTemplateStr = r""" +RMSNorm_fp32(${data_in}, ${weight}, ${data_out}, ${size}, 
${lastDimLength}, ${eps}); +""" + +referenceTemplate = FloatRMSNormTemplate(FloatRMSNormTemplateStr) diff --git a/Deeploy/Targets/Snitch/Templates/FloatSoftmaxTemplate.py b/Deeploy/Targets/Snitch/Templates/FloatSoftmaxTemplate.py index 216ff35b9a..f8ff98b8cd 100644 --- a/Deeploy/Targets/Snitch/Templates/FloatSoftmaxTemplate.py +++ b/Deeploy/Targets/Snitch/Templates/FloatSoftmaxTemplate.py @@ -2,38 +2,12 @@ # # SPDX-License-Identifier: Apache-2.0 -from typing import Dict, List, Tuple - -from Deeploy.DeeployTypes import NetworkContext, NodeTemplate, OperatorRepresentation - - -class FloatSoftmaxTemplate(NodeTemplate): - - def __init__(self, templateStr): - super().__init__(templateStr) - - def alignToContext(self, ctxt: NetworkContext, - operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, Dict, List[str]]: - - data_in = ctxt.lookup(operatorRepresentation["data_in"]) - operatorRepresentation["seq_len"] = data_in.shape[2] - operatorRepresentation["input_samples"] = data_in.shape[-1] - - operatorRepresentation["kernelName"] = "Softmax_fp32" - - return ctxt, operatorRepresentation, [] - +from Deeploy.DeeployTypes import NodeTemplate +# Multi-core Softmax: all compute cores enter, kernel parallelizes across batch dimension. +# Framework adds snrt_is_compute_core() guard and barriers via SnitchCoreFilterPass/SnitchSynchCoresPass. FloatSoftmaxTemplateStr = r""" - uint32_t batch_size = ${size} / ${lastDimLength}; - uint32_t compute_num = 1; //snrt_cluster_compute_core_num(); - int32_t ldI = compute_num * ${input_samples}; - int32_t batch_offset = ${seq_len} * ${input_samples}; - - // JUNGVI: This implementation is broken and has memory leak. 
- if (snrt_hartid() == 0){ - ${kernelName}(${data_in}, ${data_out}, ldI, batch_offset, batch_size, ${seq_len}, ${input_samples}); - } +Softmax_fp32(${data_in}, ${data_out}, ${size}, ${lastDimLength}); """ -FloatSoftmax_Template = FloatSoftmaxTemplate(FloatSoftmaxTemplateStr) +FloatSoftmax_Template = NodeTemplate(FloatSoftmaxTemplateStr) diff --git a/Deeploy/Targets/Snitch/Templates/ReshapeTemplate.py b/Deeploy/Targets/Snitch/Templates/ReshapeTemplate.py new file mode 100644 index 0000000000..0e1fdf81b1 --- /dev/null +++ b/Deeploy/Targets/Snitch/Templates/ReshapeTemplate.py @@ -0,0 +1,34 @@ +# SPDX-FileCopyrightText: 2026 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +from typing import Dict, List, Tuple + +from Deeploy.DeeployTypes import NetworkContext, OperatorRepresentation, VariableBuffer +from Deeploy.Targets.Generic.Templates.ReshapeTemplate import _ReshapeTemplate + + +class _SnitchReshapeTemplate(_ReshapeTemplate): + + def alignToContext(self, ctxt: NetworkContext, + operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, Dict, List[str]]: + + ctxt, operatorRepresentation, _ = super().alignToContext(ctxt, operatorRepresentation) + + bufferIn = ctxt.lookup(operatorRepresentation['data_in']) + assert isinstance(bufferIn, VariableBuffer) + bufferOut = ctxt.lookup(operatorRepresentation['data_out']) + assert isinstance(bufferOut, VariableBuffer) + + # Set alias so input and output share the same memory + bufferOut._alias = bufferIn.name + + return ctxt, operatorRepresentation, [] + + +# Reshape only reinterprets tensor shape without modifying data. +# Uses SkipTransformer (no DMA), consistent with PULPOpen. 
+referenceTemplate = _SnitchReshapeTemplate(""" +// Reshape (Name: ${nodeName}, Op: ${nodeOp}) +${data_out} = ${data_in}; +""") diff --git a/Deeploy/Targets/Snitch/Templates/TransposeTemplate.py b/Deeploy/Targets/Snitch/Templates/TransposeTemplate.py new file mode 100644 index 0000000000..e8e689d402 --- /dev/null +++ b/Deeploy/Targets/Snitch/Templates/TransposeTemplate.py @@ -0,0 +1,109 @@ +# SPDX-FileCopyrightText: 2026 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +from typing import Dict, List, Tuple + +from Deeploy.DeeployTypes import NetworkContext, NodeTemplate, OperatorRepresentation, _Template + +# Two-stage header: <%text>${ escapes produce ${dimLen_N} template variables +# that survive the first render and get resolved during the second render +# (by operatorRepresentation in untiled mode, or TilingVariableReplacement in tiled mode) +_tileHeader = NodeTemplate(""" +const uint32_t _core_idx = snrt_cluster_core_idx(); +const uint32_t _core_num = snrt_cluster_compute_core_num(); + +% for i in range(numDims): +uint32_t dimLen_${i} = <%text>${${dimLenPtr[i]}<%text>}; +% endfor +""") + +_tileForLoop = NodeTemplate(""" +const uint32_t _baseChunk_${i} = dimLen_${i} / _core_num; +const uint32_t _leftover_${i} = dimLen_${i} - _baseChunk_${i} * _core_num; +const uint32_t _offset_${i} = _baseChunk_${i} * _core_idx + (_core_idx < _leftover_${i} ? _core_idx : _leftover_${i}); +const uint32_t _chunk_${i} = _core_idx < _leftover_${i} ? 
_baseChunk_${i} + 1 : _baseChunk_${i}; +for(uint32_t i_${i} = _offset_${i}; i_${i} < _offset_${i} + _chunk_${i}; i_${i}++) { +""") + +_forLoop = NodeTemplate(""" +for(uint32_t i_${i} = 0; i_${i} < dimLen_${i}; i_${i}++) { +""") + + +class SnitchTransposeTemplate(NodeTemplate): + + def __init__(self, templateStr: str): + self._indirectTemplate = _Template(templateStr) + self.subTemplates = {} + self.subTemplateGenerators = {} + + def alignToContext(self, ctxt: NetworkContext, + operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, Dict, List[str]]: + shapeStr = "" + dimStr = "" + accessStr = "" + outAccessStr = "" + outShapeStr = "" + perm = operatorRepresentation['perm'] + data_in_shape = ctxt.lookup(operatorRepresentation['data_in']).shape + data_out_shape = ctxt.lookup(operatorRepresentation['data_out']).shape + + for idx, i in enumerate(perm[:-1]): + shapeStr += '[' + f"dimLen_{idx+1}" + ']' + outShapeStr += '[' + f"dimLen_{perm[idx+1]}" + ']' + + for dim in data_in_shape: + dimStr += '[' + str(dim) + ']' + + for idx, i in enumerate(perm): + accessStr += '[i_' + str(idx) + ']' + outAccessStr += '[i_' + str(i) + ']' + + fRep = operatorRepresentation.copy() + + fRep['shapeStr'] = shapeStr + fRep['outShapeStr'] = outShapeStr + fRep['outAccessStr'] = outAccessStr + fRep['dimStr'] = dimStr + fRep['accessStr'] = accessStr + fRep['data_out_shape'] = data_out_shape + + # Select the best dimension to parallelize: + # prefer dimensions >= 8 for good load balancing, otherwise pick the largest + parallelDims = [idx for idx, dim in enumerate(data_out_shape) if dim >= 8] + if len(parallelDims) > 0: + parallelDim = parallelDims[0] + else: + parallelDim = data_out_shape.index(max(data_out_shape)) + + forLoops = [] + dimLenPtrs = [] + for idx, i in enumerate(perm): + operatorRepresentation[f"dimLen_{idx}"] = data_in_shape[idx] + dimLenPtrs.append(f"dimLen_{idx}") + if idx != parallelDim: + forLoops.append(_forLoop.generate({"i": i})) + else: + 
forLoops.append(_tileForLoop.generate({"i": i})) + + fRep['forLoops'] = forLoops + fRep['tileHeader'] = _tileHeader.generate({"numDims": len(perm), "dimLenPtr": dimLenPtrs}) + fRep['parallelDim'] = parallelDim + + self.template = _Template(self._indirectTemplate.render(**fRep)) + + return ctxt, operatorRepresentation, [] + + +referenceTemplate = SnitchTransposeTemplate(""" +// Transpose ${data_in_shape} -> ${data_out_shape} (Name: ${nodeName}, Op: ${nodeOp}) +${tileHeader} +% for idx, i in enumerate(perm): +${forLoops[idx]} +% endfor +((${data_in_type.referencedType.typeName} (*)${outShapeStr})<%text>${data_out})${outAccessStr} = ((${data_in_type.referencedType.typeName} (*)${shapeStr})<%text>${data_in})${accessStr}; +% for idx, i in enumerate(perm): +} +% endfor +""") diff --git a/Deeploy/Targets/Snitch/TileConstraints/FloatScalarBOPTileConstraint.py b/Deeploy/Targets/Snitch/TileConstraints/FloatScalarBOPTileConstraint.py new file mode 100644 index 0000000000..9f51a8134e --- /dev/null +++ b/Deeploy/Targets/Snitch/TileConstraints/FloatScalarBOPTileConstraint.py @@ -0,0 +1,101 @@ +# SPDX-FileCopyrightText: 2026 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +from typing import Dict, List, Tuple + +import numpy as np + +from Deeploy.AbstractDataTypes import PointerClass +from Deeploy.CommonExtensions.DataTypes import uint16_t +from Deeploy.DeeployTypes import NetworkContext, OperatorRepresentation +from Deeploy.TilingExtension.MemoryConstraints import NodeMemoryConstraint +from Deeploy.TilingExtension.TileConstraint import TileConstraint +from Deeploy.TilingExtension.TilerModel import TilerModel +from Deeploy.TilingExtension.TilingCodegen import AbsoluteHyperRectangle, HyperRectangle, TilingSchedule, \ + VariableReplacementScheme + + +class FloatScalarBOPTileConstraint(TileConstraint): + """Tile constraint for binary operators with scalar broadcasting support. 
+ + Extends BOPTileConstraint with scalar handling: when one input has size 1, + it is loaded in full (not tiled) while the other input and output are tiled together. + Used by FP32 Div and Mul operators. + """ + + dataIn1Name = "A" + dataIn2Name = "B" + dataOutName = "C" + + @classmethod + def addGeometricalConstraint(cls, tilerModel: TilerModel, parseDict: Dict, ctxt: NetworkContext) -> TilerModel: + + inputBuffer1Name = parseDict[cls.dataIn1Name] + inputBuffer2Name = parseDict[cls.dataIn2Name] + outputBufferName = parseDict[cls.dataOutName] + + tilerModel.addTensorDimToModel(ctxt, inputBuffer1Name) + tilerModel.addTensorDimToModel(ctxt, inputBuffer2Name) + tilerModel.addTensorDimToModel(ctxt, outputBufferName) + + input1Shape = list(ctxt.lookup(inputBuffer1Name).shape) + input2Shape = list(ctxt.lookup(inputBuffer2Name).shape) + + is_scalar = (np.prod(input2Shape) == 1) + + if is_scalar: + # Scalar: tile A and C together, B stays fixed + for dim in range(len(input1Shape)): + in1Var = tilerModel.getTensorDimVar(tensorName = inputBuffer1Name, dimIdx = dim) + outVar = tilerModel.getTensorDimVar(tensorName = outputBufferName, dimIdx = dim) + tilerModel.addConstraint(in1Var == outVar) + for dim in range(len(input2Shape)): + in2Var = tilerModel.getTensorDimVar(tensorName = inputBuffer2Name, dimIdx = dim) + tilerModel.addConstraint(in2Var == input2Shape[dim]) + else: + # Element-wise: all three tensors tiled identically + for dim in range(len(input1Shape)): + in1Var = tilerModel.getTensorDimVar(tensorName = inputBuffer1Name, dimIdx = dim) + in2Var = tilerModel.getTensorDimVar(tensorName = inputBuffer2Name, dimIdx = dim) + outVar = tilerModel.getTensorDimVar(tensorName = outputBufferName, dimIdx = dim) + tilerModel.addConstraint(in1Var == in2Var) + tilerModel.addConstraint(in1Var == outVar) + + return tilerModel + + @classmethod + def serializeTilingSolution( + cls, tilingSolution: NodeMemoryConstraint, absoluteOutputCubes: List[AbsoluteHyperRectangle], + targetMemLevel: 
str, ctxt: NetworkContext, + operatorRepresentation: OperatorRepresentation) -> Tuple[VariableReplacementScheme, TilingSchedule]: + outputCubes = [cube.rectangle for cube in absoluteOutputCubes] + + addrNames = [cls.dataIn1Name, cls.dataIn2Name, cls.dataOutName] + inputBaseOffsets, outputBaseOffsets = cls.extractBaseAddr(tilingSolution, targetMemLevel, + operatorRepresentation, addrNames) + + replacements = {"size": []} + replacementTypes = {"size": PointerClass(uint16_t)} + + input2Shape = list(ctxt.lookup(operatorRepresentation[cls.dataIn2Name]).shape) + is_scalar = (np.prod(input2Shape) == 1) + + inputLoadSchedule = [] + outputLoadSchedule = [] + + for cube in outputCubes: + replacements["size"].append(np.prod(cube.dims)) + if is_scalar: + in2Cube = HyperRectangle(tuple([0] * len(input2Shape)), tuple(input2Shape)) + inputLoadSchedule.append({cls.dataIn1Name: cube, cls.dataIn2Name: in2Cube}) + else: + inputLoadSchedule.append({cls.dataIn1Name: cube, cls.dataIn2Name: cube}) + + for out in outputCubes: + outputLoadSchedule.append({cls.dataOutName: out}) + + tilingSchedule = TilingSchedule(inputBaseOffsets, outputBaseOffsets, inputLoadSchedule, outputLoadSchedule) + variableReplacementSchedule = VariableReplacementScheme(replacements, replacementTypes) + + return variableReplacementSchedule, tilingSchedule diff --git a/Deeploy/Targets/Snitch/TileConstraints/__init__.py b/Deeploy/Targets/Snitch/TileConstraints/__init__.py index 947a6fd82a..1fcabd7305 100644 --- a/Deeploy/Targets/Snitch/TileConstraints/__init__.py +++ b/Deeploy/Targets/Snitch/TileConstraints/__init__.py @@ -3,5 +3,8 @@ # SPDX-License-Identifier: Apache-2.0 from . 
import * +from .FloatScalarBOPTileConstraint import * +from .GemmTileConstraint import * from .iNoNormTileConstraint import * from .iSoftmaxTileConstraint import * +from .RqGemmTileConstraint import * diff --git a/Deeploy/Targets/Snitch/Tiler.py b/Deeploy/Targets/Snitch/Tiler.py index 475a425779..c8c340d6b8 100644 --- a/Deeploy/Targets/Snitch/Tiler.py +++ b/Deeploy/Targets/Snitch/Tiler.py @@ -3,11 +3,19 @@ # SPDX-License-Identifier: Apache-2.0 from Deeploy.Targets.Generic.TileConstraints.AddTileConstraint import AddTileConstraint -from Deeploy.Targets.Snitch.Bindings import SnitchAddBindings, SnitchGemmBindings, SnitchiNoNormBindings, \ - SnitchiSoftmaxBindings, SnitchRQAddBindings, SnitchRqGemmBindings -from Deeploy.Targets.Snitch.TileConstraints import iNoNormTileConstraint, iSoftmaxTileConstraint -from Deeploy.Targets.Snitch.TileConstraints.GemmTileConstraint import GemmTileConstraint -from Deeploy.Targets.Snitch.TileConstraints.RqGemmTileConstraint import RqGemmTileConstraint +from Deeploy.Targets.Generic.TileConstraints.ConcatTileConstraint import ConcatTileConstraint +from Deeploy.Targets.Generic.TileConstraints.iHardswishTileConstraint import iHardswishTileConstraint +from Deeploy.Targets.Generic.TileConstraints.iRMSNormTileConstraint import iRMSNormTileConstraint +from Deeploy.Targets.Generic.TileConstraints.NOPTileConstraint import NOPTileConstraint +from Deeploy.Targets.Generic.TileConstraints.TransposeTileConstraint import TransposeTileConstraint +from Deeploy.Targets.PULPOpen.TileConstraints.GatherTileConstraint import GatherTileConstraint +from Deeploy.Targets.PULPOpen.TileConstraints.MatMulTileConstraint import MatMulTileConstraint +from Deeploy.Targets.Snitch.Bindings import SnitchAddBindings, SnitchConcatBindings, SnitchDivBindings, \ + SnitchGatherBindings, SnitchGemmBindings, SnitchHardSwishBindings, SnitchiNoNormBindings, SnitchiSoftmaxBindings, \ + SnitchMatMulBindings, SnitchMulBindings, SnitchReshapeBindings, SnitchRMSNormBindings, 
SnitchRQAddBindings, \ + SnitchRqGemmBindings, SnitchTransposeBindings +from Deeploy.Targets.Snitch.TileConstraints import FloatScalarBOPTileConstraint, GemmTileConstraint, \ + iNoNormTileConstraint, iSoftmaxTileConstraint, RqGemmTileConstraint from Deeploy.TilingExtension.TilerExtension import TilingReadyNodeBindings SnitchiSoftmaxTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = SnitchiSoftmaxBindings, @@ -23,3 +31,30 @@ SnitchAddTileReadyBindings = TilingReadyNodeBindings(nodeBindings = SnitchAddBindings, tileConstraint = AddTileConstraint()) + +SnitchRMSNormTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = SnitchRMSNormBindings, + tileConstraint = iRMSNormTileConstraint()) + +SnitchHardSwishTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = SnitchHardSwishBindings, + tileConstraint = iHardswishTileConstraint()) + +SnitchDivTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = SnitchDivBindings, + tileConstraint = FloatScalarBOPTileConstraint()) + +SnitchMulTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = SnitchMulBindings, + tileConstraint = FloatScalarBOPTileConstraint()) + +SnitchMatMulTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = SnitchMatMulBindings, + tileConstraint = MatMulTileConstraint()) + +SnitchConcatTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = SnitchConcatBindings, + tileConstraint = ConcatTileConstraint()) + +SnitchTransposeTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = SnitchTransposeBindings, + tileConstraint = TransposeTileConstraint()) + +SnitchReshapeTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = SnitchReshapeBindings, + tileConstraint = NOPTileConstraint()) + +SnitchGatherTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = SnitchGatherBindings, + tileConstraint = GatherTileConstraint()) diff --git a/DeeployTest/Platforms/Snitch/main.c b/DeeployTest/Platforms/Snitch/main.c index a7251f3844..cff0e0b20a 100644 --- 
a/DeeployTest/Platforms/Snitch/main.c +++ b/DeeployTest/Platforms/Snitch/main.c @@ -25,20 +25,22 @@ int main(void) { uint32_t const num_compute_cores = snrt_global_compute_core_num(); #endif + // All cores call InitNetwork: allocations inside are DM-core guarded + // with barriers, so all cores must participate for barrier balance. +#ifndef NOPRINT + if (snrt_is_dm_core()) { + printf("Initializing...\r\n"); + } +#endif + InitNetwork(core_id, 1); + if (snrt_is_dm_core()) { #ifndef CI printf("Network running on %d of %d compute cores (+%d DM cores) on %d " "clusters\r\n", num_compute_cores, snrt_global_compute_core_num(), snrt_cluster_num() * snrt_cluster_dm_core_num(), snrt_cluster_num()); -#endif - -#ifndef NOPRINT - printf("Initializing...\r\n"); -#endif - InitNetwork(core_id, 1); -#ifndef CI for (uint32_t buf = 0; buf < DeeployNetwork_num_inputs; buf++) { printf("testInputVector%d @ %p\r\n", buf, testInputVector[buf]); printf("DeeployNetwork_input_%d @ %p and %u elements\r\n", buf, diff --git a/DeeployTest/Tests/Kernels/FP32/Hardswish/inputs.npz b/DeeployTest/Tests/Kernels/FP32/Hardswish/inputs.npz new file mode 100644 index 0000000000..eec4cee600 Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/Hardswish/inputs.npz differ diff --git a/DeeployTest/Tests/Kernels/FP32/Hardswish/network.onnx b/DeeployTest/Tests/Kernels/FP32/Hardswish/network.onnx new file mode 100644 index 0000000000..7a146e5541 --- /dev/null +++ b/DeeployTest/Tests/Kernels/FP32/Hardswish/network.onnx @@ -0,0 +1,14 @@ + +hardswish_test_fp32: +* +inputoutputHardSwish_node" HardSwishhardswish_graph_fp32Z +input + + + +€b +output + + + +€B \ No newline at end of file diff --git a/DeeployTest/Tests/Kernels/FP32/Hardswish/outputs.npz b/DeeployTest/Tests/Kernels/FP32/Hardswish/outputs.npz new file mode 100644 index 0000000000..074c937f5b Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/Hardswish/outputs.npz differ diff --git a/DeeployTest/Tests/Kernels/FP32/RMSNorm/inputs.npz 
b/DeeployTest/Tests/Kernels/FP32/RMSNorm/separate_ops/inputs.npz similarity index 100% rename from DeeployTest/Tests/Kernels/FP32/RMSNorm/inputs.npz rename to DeeployTest/Tests/Kernels/FP32/RMSNorm/separate_ops/inputs.npz diff --git a/DeeployTest/Tests/Kernels/FP32/RMSNorm/network.onnx b/DeeployTest/Tests/Kernels/FP32/RMSNorm/separate_ops/network.onnx similarity index 100% rename from DeeployTest/Tests/Kernels/FP32/RMSNorm/network.onnx rename to DeeployTest/Tests/Kernels/FP32/RMSNorm/separate_ops/network.onnx diff --git a/DeeployTest/Tests/Kernels/FP32/RMSNorm/outputs.npz b/DeeployTest/Tests/Kernels/FP32/RMSNorm/separate_ops/outputs.npz similarity index 100% rename from DeeployTest/Tests/Kernels/FP32/RMSNorm/outputs.npz rename to DeeployTest/Tests/Kernels/FP32/RMSNorm/separate_ops/outputs.npz diff --git a/DeeployTest/Tests/Kernels/FP32/RMSNorm/single_fused_op/inputs.npz b/DeeployTest/Tests/Kernels/FP32/RMSNorm/single_fused_op/inputs.npz new file mode 100644 index 0000000000..9d14ca82f7 Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/RMSNorm/single_fused_op/inputs.npz differ diff --git a/DeeployTest/Tests/Kernels/FP32/RMSNorm/single_fused_op/network.onnx b/DeeployTest/Tests/Kernels/FP32/RMSNorm/single_fused_op/network.onnx new file mode 100644 index 0000000000..25a7a9b683 Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/RMSNorm/single_fused_op/network.onnx differ diff --git a/DeeployTest/Tests/Kernels/FP32/RMSNorm/single_fused_op/outputs.npz b/DeeployTest/Tests/Kernels/FP32/RMSNorm/single_fused_op/outputs.npz new file mode 100644 index 0000000000..6167f74042 Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/RMSNorm/single_fused_op/outputs.npz differ diff --git a/DeeployTest/Tests/Models/microLlama/FP32/microLlama1/activations.npz b/DeeployTest/Tests/Models/microLlama/FP32/microLlama1/activations.npz new file mode 100644 index 0000000000..cbc68f9387 Binary files /dev/null and 
b/DeeployTest/Tests/Models/microLlama/FP32/microLlama1/activations.npz differ diff --git a/DeeployTest/Tests/Models/microLlama/FP32/microLlama1/inputs.npz b/DeeployTest/Tests/Models/microLlama/FP32/microLlama1/inputs.npz new file mode 100644 index 0000000000..d8cfc58075 Binary files /dev/null and b/DeeployTest/Tests/Models/microLlama/FP32/microLlama1/inputs.npz differ diff --git a/DeeployTest/Tests/Models/microLlama/FP32/microLlama1/network.onnx b/DeeployTest/Tests/Models/microLlama/FP32/microLlama1/network.onnx new file mode 100644 index 0000000000..a076676c4a Binary files /dev/null and b/DeeployTest/Tests/Models/microLlama/FP32/microLlama1/network.onnx differ diff --git a/DeeployTest/Tests/Models/microLlama/FP32/microLlama1/outputs.npz b/DeeployTest/Tests/Models/microLlama/FP32/microLlama1/outputs.npz new file mode 100644 index 0000000000..d6dc22736f Binary files /dev/null and b/DeeployTest/Tests/Models/microLlama/FP32/microLlama1/outputs.npz differ diff --git a/DeeployTest/Tests/Models/microLlama/microLlama1/activations.npz b/DeeployTest/Tests/Models/microLlama/INT8/microLlama1/activations.npz similarity index 100% rename from DeeployTest/Tests/Models/microLlama/microLlama1/activations.npz rename to DeeployTest/Tests/Models/microLlama/INT8/microLlama1/activations.npz diff --git a/DeeployTest/Tests/Models/microLlama/microLlama1/inputs.npz b/DeeployTest/Tests/Models/microLlama/INT8/microLlama1/inputs.npz similarity index 100% rename from DeeployTest/Tests/Models/microLlama/microLlama1/inputs.npz rename to DeeployTest/Tests/Models/microLlama/INT8/microLlama1/inputs.npz diff --git a/DeeployTest/Tests/Models/microLlama/microLlama1/network.onnx b/DeeployTest/Tests/Models/microLlama/INT8/microLlama1/network.onnx similarity index 100% rename from DeeployTest/Tests/Models/microLlama/microLlama1/network.onnx rename to DeeployTest/Tests/Models/microLlama/INT8/microLlama1/network.onnx diff --git a/DeeployTest/Tests/Models/microLlama/microLlama1/outputs.npz 
b/DeeployTest/Tests/Models/microLlama/INT8/microLlama1/outputs.npz similarity index 100% rename from DeeployTest/Tests/Models/microLlama/microLlama1/outputs.npz rename to DeeployTest/Tests/Models/microLlama/INT8/microLlama1/outputs.npz diff --git a/DeeployTest/Tests/Models/microLlama/microLlama128/activations.npz b/DeeployTest/Tests/Models/microLlama/INT8/microLlama128/activations.npz similarity index 100% rename from DeeployTest/Tests/Models/microLlama/microLlama128/activations.npz rename to DeeployTest/Tests/Models/microLlama/INT8/microLlama128/activations.npz diff --git a/DeeployTest/Tests/Models/microLlama/microLlama128/inputs.npz b/DeeployTest/Tests/Models/microLlama/INT8/microLlama128/inputs.npz similarity index 100% rename from DeeployTest/Tests/Models/microLlama/microLlama128/inputs.npz rename to DeeployTest/Tests/Models/microLlama/INT8/microLlama128/inputs.npz diff --git a/DeeployTest/Tests/Models/microLlama/microLlama128/network.onnx b/DeeployTest/Tests/Models/microLlama/INT8/microLlama128/network.onnx similarity index 100% rename from DeeployTest/Tests/Models/microLlama/microLlama128/network.onnx rename to DeeployTest/Tests/Models/microLlama/INT8/microLlama128/network.onnx diff --git a/DeeployTest/Tests/Models/microLlama/microLlama128/outputs.npz b/DeeployTest/Tests/Models/microLlama/INT8/microLlama128/outputs.npz similarity index 100% rename from DeeployTest/Tests/Models/microLlama/microLlama128/outputs.npz rename to DeeployTest/Tests/Models/microLlama/INT8/microLlama128/outputs.npz diff --git a/DeeployTest/Tests/Models/microLlama/microLlama16/activations.npz b/DeeployTest/Tests/Models/microLlama/INT8/microLlama16/activations.npz similarity index 100% rename from DeeployTest/Tests/Models/microLlama/microLlama16/activations.npz rename to DeeployTest/Tests/Models/microLlama/INT8/microLlama16/activations.npz diff --git a/DeeployTest/Tests/Models/microLlama/microLlama16/inputs.npz b/DeeployTest/Tests/Models/microLlama/INT8/microLlama16/inputs.npz similarity 
index 100% rename from DeeployTest/Tests/Models/microLlama/microLlama16/inputs.npz rename to DeeployTest/Tests/Models/microLlama/INT8/microLlama16/inputs.npz diff --git a/DeeployTest/Tests/Models/microLlama/microLlama16/network.onnx b/DeeployTest/Tests/Models/microLlama/INT8/microLlama16/network.onnx similarity index 100% rename from DeeployTest/Tests/Models/microLlama/microLlama16/network.onnx rename to DeeployTest/Tests/Models/microLlama/INT8/microLlama16/network.onnx diff --git a/DeeployTest/Tests/Models/microLlama/microLlama16/outputs.npz b/DeeployTest/Tests/Models/microLlama/INT8/microLlama16/outputs.npz similarity index 100% rename from DeeployTest/Tests/Models/microLlama/microLlama16/outputs.npz rename to DeeployTest/Tests/Models/microLlama/INT8/microLlama16/outputs.npz diff --git a/DeeployTest/Tests/Models/microLlama/microLlama16_parallel/activations.npz b/DeeployTest/Tests/Models/microLlama/INT8/microLlama16_parallel/activations.npz similarity index 100% rename from DeeployTest/Tests/Models/microLlama/microLlama16_parallel/activations.npz rename to DeeployTest/Tests/Models/microLlama/INT8/microLlama16_parallel/activations.npz diff --git a/DeeployTest/Tests/Models/microLlama/microLlama16_parallel/inputs.npz b/DeeployTest/Tests/Models/microLlama/INT8/microLlama16_parallel/inputs.npz similarity index 100% rename from DeeployTest/Tests/Models/microLlama/microLlama16_parallel/inputs.npz rename to DeeployTest/Tests/Models/microLlama/INT8/microLlama16_parallel/inputs.npz diff --git a/DeeployTest/Tests/Models/microLlama/microLlama16_parallel/network.onnx b/DeeployTest/Tests/Models/microLlama/INT8/microLlama16_parallel/network.onnx similarity index 100% rename from DeeployTest/Tests/Models/microLlama/microLlama16_parallel/network.onnx rename to DeeployTest/Tests/Models/microLlama/INT8/microLlama16_parallel/network.onnx diff --git a/DeeployTest/Tests/Models/microLlama/microLlama16_parallel/outputs.npz 
b/DeeployTest/Tests/Models/microLlama/INT8/microLlama16_parallel/outputs.npz similarity index 100% rename from DeeployTest/Tests/Models/microLlama/microLlama16_parallel/outputs.npz rename to DeeployTest/Tests/Models/microLlama/INT8/microLlama16_parallel/outputs.npz diff --git a/DeeployTest/Tests/Models/microLlama/microLlama1_parallel/activations.npz b/DeeployTest/Tests/Models/microLlama/INT8/microLlama1_parallel/activations.npz similarity index 100% rename from DeeployTest/Tests/Models/microLlama/microLlama1_parallel/activations.npz rename to DeeployTest/Tests/Models/microLlama/INT8/microLlama1_parallel/activations.npz diff --git a/DeeployTest/Tests/Models/microLlama/microLlama1_parallel/inputs.npz b/DeeployTest/Tests/Models/microLlama/INT8/microLlama1_parallel/inputs.npz similarity index 100% rename from DeeployTest/Tests/Models/microLlama/microLlama1_parallel/inputs.npz rename to DeeployTest/Tests/Models/microLlama/INT8/microLlama1_parallel/inputs.npz diff --git a/DeeployTest/Tests/Models/microLlama/microLlama1_parallel/network.onnx b/DeeployTest/Tests/Models/microLlama/INT8/microLlama1_parallel/network.onnx similarity index 100% rename from DeeployTest/Tests/Models/microLlama/microLlama1_parallel/network.onnx rename to DeeployTest/Tests/Models/microLlama/INT8/microLlama1_parallel/network.onnx diff --git a/DeeployTest/Tests/Models/microLlama/microLlama1_parallel/outputs.npz b/DeeployTest/Tests/Models/microLlama/INT8/microLlama1_parallel/outputs.npz similarity index 100% rename from DeeployTest/Tests/Models/microLlama/microLlama1_parallel/outputs.npz rename to DeeployTest/Tests/Models/microLlama/INT8/microLlama1_parallel/outputs.npz diff --git a/DeeployTest/Tests/Models/microLlama/microLlama2/activations.npz b/DeeployTest/Tests/Models/microLlama/INT8/microLlama2/activations.npz similarity index 100% rename from DeeployTest/Tests/Models/microLlama/microLlama2/activations.npz rename to DeeployTest/Tests/Models/microLlama/INT8/microLlama2/activations.npz diff --git 
a/DeeployTest/Tests/Models/microLlama/microLlama2/inputs.npz b/DeeployTest/Tests/Models/microLlama/INT8/microLlama2/inputs.npz similarity index 100% rename from DeeployTest/Tests/Models/microLlama/microLlama2/inputs.npz rename to DeeployTest/Tests/Models/microLlama/INT8/microLlama2/inputs.npz diff --git a/DeeployTest/Tests/Models/microLlama/microLlama2/network.onnx b/DeeployTest/Tests/Models/microLlama/INT8/microLlama2/network.onnx similarity index 100% rename from DeeployTest/Tests/Models/microLlama/microLlama2/network.onnx rename to DeeployTest/Tests/Models/microLlama/INT8/microLlama2/network.onnx diff --git a/DeeployTest/Tests/Models/microLlama/microLlama2/outputs.npz b/DeeployTest/Tests/Models/microLlama/INT8/microLlama2/outputs.npz similarity index 100% rename from DeeployTest/Tests/Models/microLlama/microLlama2/outputs.npz rename to DeeployTest/Tests/Models/microLlama/INT8/microLlama2/outputs.npz diff --git a/DeeployTest/Tests/Models/microLlama/microLlama256/activations.npz b/DeeployTest/Tests/Models/microLlama/INT8/microLlama256/activations.npz similarity index 100% rename from DeeployTest/Tests/Models/microLlama/microLlama256/activations.npz rename to DeeployTest/Tests/Models/microLlama/INT8/microLlama256/activations.npz diff --git a/DeeployTest/Tests/Models/microLlama/microLlama256/inputs.npz b/DeeployTest/Tests/Models/microLlama/INT8/microLlama256/inputs.npz similarity index 100% rename from DeeployTest/Tests/Models/microLlama/microLlama256/inputs.npz rename to DeeployTest/Tests/Models/microLlama/INT8/microLlama256/inputs.npz diff --git a/DeeployTest/Tests/Models/microLlama/microLlama256/network.onnx b/DeeployTest/Tests/Models/microLlama/INT8/microLlama256/network.onnx similarity index 100% rename from DeeployTest/Tests/Models/microLlama/microLlama256/network.onnx rename to DeeployTest/Tests/Models/microLlama/INT8/microLlama256/network.onnx diff --git a/DeeployTest/Tests/Models/microLlama/microLlama256/outputs.npz 
b/DeeployTest/Tests/Models/microLlama/INT8/microLlama256/outputs.npz similarity index 100% rename from DeeployTest/Tests/Models/microLlama/microLlama256/outputs.npz rename to DeeployTest/Tests/Models/microLlama/INT8/microLlama256/outputs.npz diff --git a/DeeployTest/Tests/Models/microLlama/microLlama2_parallel/activations.npz b/DeeployTest/Tests/Models/microLlama/INT8/microLlama2_parallel/activations.npz similarity index 100% rename from DeeployTest/Tests/Models/microLlama/microLlama2_parallel/activations.npz rename to DeeployTest/Tests/Models/microLlama/INT8/microLlama2_parallel/activations.npz diff --git a/DeeployTest/Tests/Models/microLlama/microLlama2_parallel/inputs.npz b/DeeployTest/Tests/Models/microLlama/INT8/microLlama2_parallel/inputs.npz similarity index 100% rename from DeeployTest/Tests/Models/microLlama/microLlama2_parallel/inputs.npz rename to DeeployTest/Tests/Models/microLlama/INT8/microLlama2_parallel/inputs.npz diff --git a/DeeployTest/Tests/Models/microLlama/microLlama2_parallel/network.onnx b/DeeployTest/Tests/Models/microLlama/INT8/microLlama2_parallel/network.onnx similarity index 100% rename from DeeployTest/Tests/Models/microLlama/microLlama2_parallel/network.onnx rename to DeeployTest/Tests/Models/microLlama/INT8/microLlama2_parallel/network.onnx diff --git a/DeeployTest/Tests/Models/microLlama/microLlama2_parallel/outputs.npz b/DeeployTest/Tests/Models/microLlama/INT8/microLlama2_parallel/outputs.npz similarity index 100% rename from DeeployTest/Tests/Models/microLlama/microLlama2_parallel/outputs.npz rename to DeeployTest/Tests/Models/microLlama/INT8/microLlama2_parallel/outputs.npz diff --git a/DeeployTest/Tests/Models/microLlama/microLlama32/activations.npz b/DeeployTest/Tests/Models/microLlama/INT8/microLlama32/activations.npz similarity index 100% rename from DeeployTest/Tests/Models/microLlama/microLlama32/activations.npz rename to DeeployTest/Tests/Models/microLlama/INT8/microLlama32/activations.npz diff --git 
a/DeeployTest/Tests/Models/microLlama/microLlama32/inputs.npz b/DeeployTest/Tests/Models/microLlama/INT8/microLlama32/inputs.npz similarity index 100% rename from DeeployTest/Tests/Models/microLlama/microLlama32/inputs.npz rename to DeeployTest/Tests/Models/microLlama/INT8/microLlama32/inputs.npz diff --git a/DeeployTest/Tests/Models/microLlama/microLlama32/network.onnx b/DeeployTest/Tests/Models/microLlama/INT8/microLlama32/network.onnx similarity index 100% rename from DeeployTest/Tests/Models/microLlama/microLlama32/network.onnx rename to DeeployTest/Tests/Models/microLlama/INT8/microLlama32/network.onnx diff --git a/DeeployTest/Tests/Models/microLlama/microLlama32/outputs.npz b/DeeployTest/Tests/Models/microLlama/INT8/microLlama32/outputs.npz similarity index 100% rename from DeeployTest/Tests/Models/microLlama/microLlama32/outputs.npz rename to DeeployTest/Tests/Models/microLlama/INT8/microLlama32/outputs.npz diff --git a/DeeployTest/Tests/Models/microLlama/microLlama32_parallel/activations.npz b/DeeployTest/Tests/Models/microLlama/INT8/microLlama32_parallel/activations.npz similarity index 100% rename from DeeployTest/Tests/Models/microLlama/microLlama32_parallel/activations.npz rename to DeeployTest/Tests/Models/microLlama/INT8/microLlama32_parallel/activations.npz diff --git a/DeeployTest/Tests/Models/microLlama/microLlama32_parallel/inputs.npz b/DeeployTest/Tests/Models/microLlama/INT8/microLlama32_parallel/inputs.npz similarity index 100% rename from DeeployTest/Tests/Models/microLlama/microLlama32_parallel/inputs.npz rename to DeeployTest/Tests/Models/microLlama/INT8/microLlama32_parallel/inputs.npz diff --git a/DeeployTest/Tests/Models/microLlama/microLlama32_parallel/network.onnx b/DeeployTest/Tests/Models/microLlama/INT8/microLlama32_parallel/network.onnx similarity index 100% rename from DeeployTest/Tests/Models/microLlama/microLlama32_parallel/network.onnx rename to DeeployTest/Tests/Models/microLlama/INT8/microLlama32_parallel/network.onnx diff 
--git a/DeeployTest/Tests/Models/microLlama/microLlama32_parallel/outputs.npz b/DeeployTest/Tests/Models/microLlama/INT8/microLlama32_parallel/outputs.npz similarity index 100% rename from DeeployTest/Tests/Models/microLlama/microLlama32_parallel/outputs.npz rename to DeeployTest/Tests/Models/microLlama/INT8/microLlama32_parallel/outputs.npz diff --git a/DeeployTest/Tests/Models/microLlama/microLlama4/activations.npz b/DeeployTest/Tests/Models/microLlama/INT8/microLlama4/activations.npz similarity index 100% rename from DeeployTest/Tests/Models/microLlama/microLlama4/activations.npz rename to DeeployTest/Tests/Models/microLlama/INT8/microLlama4/activations.npz diff --git a/DeeployTest/Tests/Models/microLlama/microLlama4/inputs.npz b/DeeployTest/Tests/Models/microLlama/INT8/microLlama4/inputs.npz similarity index 100% rename from DeeployTest/Tests/Models/microLlama/microLlama4/inputs.npz rename to DeeployTest/Tests/Models/microLlama/INT8/microLlama4/inputs.npz diff --git a/DeeployTest/Tests/Models/microLlama/microLlama4/network.onnx b/DeeployTest/Tests/Models/microLlama/INT8/microLlama4/network.onnx similarity index 100% rename from DeeployTest/Tests/Models/microLlama/microLlama4/network.onnx rename to DeeployTest/Tests/Models/microLlama/INT8/microLlama4/network.onnx diff --git a/DeeployTest/Tests/Models/microLlama/microLlama4/outputs.npz b/DeeployTest/Tests/Models/microLlama/INT8/microLlama4/outputs.npz similarity index 100% rename from DeeployTest/Tests/Models/microLlama/microLlama4/outputs.npz rename to DeeployTest/Tests/Models/microLlama/INT8/microLlama4/outputs.npz diff --git a/DeeployTest/Tests/Models/microLlama/microLlama4_parallel/activations.npz b/DeeployTest/Tests/Models/microLlama/INT8/microLlama4_parallel/activations.npz similarity index 100% rename from DeeployTest/Tests/Models/microLlama/microLlama4_parallel/activations.npz rename to DeeployTest/Tests/Models/microLlama/INT8/microLlama4_parallel/activations.npz diff --git 
a/DeeployTest/Tests/Models/microLlama/microLlama4_parallel/inputs.npz b/DeeployTest/Tests/Models/microLlama/INT8/microLlama4_parallel/inputs.npz similarity index 100% rename from DeeployTest/Tests/Models/microLlama/microLlama4_parallel/inputs.npz rename to DeeployTest/Tests/Models/microLlama/INT8/microLlama4_parallel/inputs.npz diff --git a/DeeployTest/Tests/Models/microLlama/microLlama4_parallel/network.onnx b/DeeployTest/Tests/Models/microLlama/INT8/microLlama4_parallel/network.onnx similarity index 100% rename from DeeployTest/Tests/Models/microLlama/microLlama4_parallel/network.onnx rename to DeeployTest/Tests/Models/microLlama/INT8/microLlama4_parallel/network.onnx diff --git a/DeeployTest/Tests/Models/microLlama/microLlama4_parallel/outputs.npz b/DeeployTest/Tests/Models/microLlama/INT8/microLlama4_parallel/outputs.npz similarity index 100% rename from DeeployTest/Tests/Models/microLlama/microLlama4_parallel/outputs.npz rename to DeeployTest/Tests/Models/microLlama/INT8/microLlama4_parallel/outputs.npz diff --git a/DeeployTest/Tests/Models/microLlama/microLlama64/activations.npz b/DeeployTest/Tests/Models/microLlama/INT8/microLlama64/activations.npz similarity index 100% rename from DeeployTest/Tests/Models/microLlama/microLlama64/activations.npz rename to DeeployTest/Tests/Models/microLlama/INT8/microLlama64/activations.npz diff --git a/DeeployTest/Tests/Models/microLlama/microLlama64/inputs.npz b/DeeployTest/Tests/Models/microLlama/INT8/microLlama64/inputs.npz similarity index 100% rename from DeeployTest/Tests/Models/microLlama/microLlama64/inputs.npz rename to DeeployTest/Tests/Models/microLlama/INT8/microLlama64/inputs.npz diff --git a/DeeployTest/Tests/Models/microLlama/microLlama64/network.onnx b/DeeployTest/Tests/Models/microLlama/INT8/microLlama64/network.onnx similarity index 100% rename from DeeployTest/Tests/Models/microLlama/microLlama64/network.onnx rename to DeeployTest/Tests/Models/microLlama/INT8/microLlama64/network.onnx diff --git 
a/DeeployTest/Tests/Models/microLlama/microLlama64/outputs.npz b/DeeployTest/Tests/Models/microLlama/INT8/microLlama64/outputs.npz similarity index 100% rename from DeeployTest/Tests/Models/microLlama/microLlama64/outputs.npz rename to DeeployTest/Tests/Models/microLlama/INT8/microLlama64/outputs.npz diff --git a/DeeployTest/Tests/Models/microLlama/microLlama64_parallel/activations.npz b/DeeployTest/Tests/Models/microLlama/INT8/microLlama64_parallel/activations.npz similarity index 100% rename from DeeployTest/Tests/Models/microLlama/microLlama64_parallel/activations.npz rename to DeeployTest/Tests/Models/microLlama/INT8/microLlama64_parallel/activations.npz diff --git a/DeeployTest/Tests/Models/microLlama/microLlama64_parallel/inputs.npz b/DeeployTest/Tests/Models/microLlama/INT8/microLlama64_parallel/inputs.npz similarity index 100% rename from DeeployTest/Tests/Models/microLlama/microLlama64_parallel/inputs.npz rename to DeeployTest/Tests/Models/microLlama/INT8/microLlama64_parallel/inputs.npz diff --git a/DeeployTest/Tests/Models/microLlama/microLlama64_parallel/network.onnx b/DeeployTest/Tests/Models/microLlama/INT8/microLlama64_parallel/network.onnx similarity index 100% rename from DeeployTest/Tests/Models/microLlama/microLlama64_parallel/network.onnx rename to DeeployTest/Tests/Models/microLlama/INT8/microLlama64_parallel/network.onnx diff --git a/DeeployTest/Tests/Models/microLlama/microLlama64_parallel/outputs.npz b/DeeployTest/Tests/Models/microLlama/INT8/microLlama64_parallel/outputs.npz similarity index 100% rename from DeeployTest/Tests/Models/microLlama/microLlama64_parallel/outputs.npz rename to DeeployTest/Tests/Models/microLlama/INT8/microLlama64_parallel/outputs.npz diff --git a/DeeployTest/Tests/Models/microLlama/microLlama8/activations.npz b/DeeployTest/Tests/Models/microLlama/INT8/microLlama8/activations.npz similarity index 100% rename from DeeployTest/Tests/Models/microLlama/microLlama8/activations.npz rename to 
DeeployTest/Tests/Models/microLlama/INT8/microLlama8/activations.npz diff --git a/DeeployTest/Tests/Models/microLlama/microLlama8/inputs.npz b/DeeployTest/Tests/Models/microLlama/INT8/microLlama8/inputs.npz similarity index 100% rename from DeeployTest/Tests/Models/microLlama/microLlama8/inputs.npz rename to DeeployTest/Tests/Models/microLlama/INT8/microLlama8/inputs.npz diff --git a/DeeployTest/Tests/Models/microLlama/microLlama8/network.onnx b/DeeployTest/Tests/Models/microLlama/INT8/microLlama8/network.onnx similarity index 100% rename from DeeployTest/Tests/Models/microLlama/microLlama8/network.onnx rename to DeeployTest/Tests/Models/microLlama/INT8/microLlama8/network.onnx diff --git a/DeeployTest/Tests/Models/microLlama/microLlama8/outputs.npz b/DeeployTest/Tests/Models/microLlama/INT8/microLlama8/outputs.npz similarity index 100% rename from DeeployTest/Tests/Models/microLlama/microLlama8/outputs.npz rename to DeeployTest/Tests/Models/microLlama/INT8/microLlama8/outputs.npz diff --git a/DeeployTest/Tests/Models/microLlama/microLlama8_parallel/activations.npz b/DeeployTest/Tests/Models/microLlama/INT8/microLlama8_parallel/activations.npz similarity index 100% rename from DeeployTest/Tests/Models/microLlama/microLlama8_parallel/activations.npz rename to DeeployTest/Tests/Models/microLlama/INT8/microLlama8_parallel/activations.npz diff --git a/DeeployTest/Tests/Models/microLlama/microLlama8_parallel/inputs.npz b/DeeployTest/Tests/Models/microLlama/INT8/microLlama8_parallel/inputs.npz similarity index 100% rename from DeeployTest/Tests/Models/microLlama/microLlama8_parallel/inputs.npz rename to DeeployTest/Tests/Models/microLlama/INT8/microLlama8_parallel/inputs.npz diff --git a/DeeployTest/Tests/Models/microLlama/microLlama8_parallel/network.onnx b/DeeployTest/Tests/Models/microLlama/INT8/microLlama8_parallel/network.onnx similarity index 100% rename from DeeployTest/Tests/Models/microLlama/microLlama8_parallel/network.onnx rename to 
DeeployTest/Tests/Models/microLlama/INT8/microLlama8_parallel/network.onnx diff --git a/DeeployTest/Tests/Models/microLlama/microLlama8_parallel/outputs.npz b/DeeployTest/Tests/Models/microLlama/INT8/microLlama8_parallel/outputs.npz similarity index 100% rename from DeeployTest/Tests/Models/microLlama/microLlama8_parallel/outputs.npz rename to DeeployTest/Tests/Models/microLlama/INT8/microLlama8_parallel/outputs.npz diff --git a/DeeployTest/generateNetwork.py b/DeeployTest/generateNetwork.py index f029be7361..bc067955fd 100644 --- a/DeeployTest/generateNetwork.py +++ b/DeeployTest/generateNetwork.py @@ -73,7 +73,8 @@ def generateNetwork(args): test_inputs, test_outputs, graph = generateDebugConfig(inputs, outputs, activations, graph) else: - # Load as float64 and infer types later + # Load as float64 for uniform handling, but preserve original dtypes for type inference + test_input_original_dtypes = [inputs[x].dtype for x in inputs.files] test_inputs = [inputs[x].reshape(-1).astype(np.float64) for x in inputs.files] test_outputs = [outputs[x].reshape(-1).astype(np.float64) for x in outputs.files] @@ -122,7 +123,8 @@ def generateNetwork(args): _type = PointerClass(_type) else: - _type, offset = inferTypeAndOffset(values, signProp) + original_dtype = test_input_original_dtypes[index] if index < len(test_input_original_dtypes) else None + _type, offset = inferTypeAndOffset(values, signProp, original_dtype = original_dtype) inputTypes[f"input_{index}"] = _type inputOffsets[f"input_{index}"] = offset diff --git a/DeeployTest/testMVP.py b/DeeployTest/testMVP.py index 01216984af..0f0c7ccdfc 100644 --- a/DeeployTest/testMVP.py +++ b/DeeployTest/testMVP.py @@ -68,7 +68,8 @@ def setupDeployer(graph: gs.Graph, memoryHierarchy: MemoryHierarchy, defaultTarg inputs = np.load(f'{args.dir}/inputs.npz') tensors = graph.tensors() - # Load as int64 and infer types later + # Load as float64 for uniform handling, but preserve original dtypes for type inference + 
test_input_original_dtypes = [inputs[x].dtype for x in inputs.files] test_inputs = [inputs[x].reshape(-1).astype(np.float64) for x in inputs.files] platform, signProp = mapPlatform(args.platform) @@ -83,7 +84,8 @@ def setupDeployer(graph: gs.Graph, memoryHierarchy: MemoryHierarchy, defaultTarg cluster.n_cores = args.cores for index, num in enumerate(test_inputs): - _type, offset = inferTypeAndOffset(num, signProp) + original_dtype = test_input_original_dtypes[index] if index < len(test_input_original_dtypes) else None + _type, offset = inferTypeAndOffset(num, signProp, original_dtype = original_dtype) inputTypes[f"input_{index}"] = _type inputOffsets[f"input_{index}"] = offset @@ -241,7 +243,8 @@ def setupDeployer(graph: gs.Graph, memoryHierarchy: MemoryHierarchy, defaultTarg if args.debug: test_inputs, test_outputs, graph = generateDebugConfig(inputs, outputs, activations, graph) else: - # Load as int64 and infer types later + # Load as float64 for uniform handling, but preserve original dtypes for type inference + test_input_original_dtypes = [inputs[x].dtype for x in inputs.files] test_inputs = [inputs[x].reshape(-1).astype(np.float64) for x in inputs.files] test_outputs = [outputs[x].reshape(-1).astype(np.float64) for x in outputs.files] @@ -280,7 +283,8 @@ def setupDeployer(graph: gs.Graph, memoryHierarchy: MemoryHierarchy, defaultTarg log.debug(f"Deployer: {deployer}") for index, num in enumerate(test_inputs): - _type, offset = inferTypeAndOffset(num, signProp) + original_dtype = test_input_original_dtypes[index] if index < len(test_input_original_dtypes) else None + _type, offset = inferTypeAndOffset(num, signProp, original_dtype = original_dtype) inputTypes[f"input_{index}"] = _type inputOffsets[f"input_{index}"] = offset diff --git a/DeeployTest/testUtils/codeGenerate.py b/DeeployTest/testUtils/codeGenerate.py index 39a44d9442..1397dcbec3 100644 --- a/DeeployTest/testUtils/codeGenerate.py +++ b/DeeployTest/testUtils/codeGenerate.py @@ -10,6 +10,7 @@ from 
Deeploy.DeeployTypes import CodeGenVerbosity, ConstantBuffer, NetworkDeployer, VariableBuffer from Deeploy.Targets.MemPool.Platform import MemPoolPlatform from Deeploy.Targets.PULPOpen.Platform import MemoryPULPPlatform, MemoryPULPPlatformWrapper, PULPPlatform +from Deeploy.Targets.Snitch.Platform import SnitchPlatform _TEXT_ALIGN = 30 @@ -163,7 +164,8 @@ def generateTestNetworkImplementation(deployer: NetworkDeployer, verbosityCfg: C retStr += deployer.generateGlobalDefinitionCode() # WIESEP: Mempool assigns section attributes to intermediate buffers to allow . - if isinstance(deployer.Platform, MemPoolPlatform): + # Snitch also needs file-scope declarations for multi-core buffer sharing. + if isinstance(deployer.Platform, (MemPoolPlatform, SnitchPlatform)): retStr += deployer.generateInferenceInitializationCode() retStr += """ void RunNetwork(__attribute__((unused)) uint32_t core_id, __attribute__((unused)) uint32_t numThreads){ diff --git a/DeeployTest/testUtils/typeMapping.py b/DeeployTest/testUtils/typeMapping.py index 232fd1e274..202dcac801 100644 --- a/DeeployTest/testUtils/typeMapping.py +++ b/DeeployTest/testUtils/typeMapping.py @@ -42,12 +42,20 @@ def isInteger(x: npt.NDArray) -> bool: return np.abs((x.astype(int) - x)).max() <= 0.001 -def inferMinimalType(values: np.ndarray, default: Type[BaseType] = int8_t) -> Type[BaseType]: +def inferMinimalType(values: np.ndarray, + default: Type[BaseType] = int8_t, + original_dtype: np.dtype = None) -> Type[BaseType]: # WIESEP: We cannot do type inference for empty arrays. 
if np.prod(values.shape) == 0: print(f"Warning: Empty input array for type inference for {values}!") return default + # For all-zero arrays, use original dtype to distinguish int vs float + if np.all(values == 0) and original_dtype is not None: + if np.issubdtype(original_dtype, np.floating): + return minimalFloatType(values) + return minimalIntegerType(values) + if isInteger(values): return minimalIntegerType(values) else: @@ -67,7 +75,9 @@ def signPropTypeAndOffset(_type: Type[IntegerImmediate]) -> Tuple[Type[IntegerIm return signedType, 2**(signedType.typeWidth - 1) -def inferTypeAndOffset(values: np.ndarray, signProp: bool = False) -> Tuple[Type[Pointer], int]: +def inferTypeAndOffset(values: np.ndarray, + signProp: bool = False, + original_dtype: np.dtype = None) -> Tuple[Type[Pointer], int]: """Infers the data type of the provided input array. Parameters @@ -77,13 +87,17 @@ def inferTypeAndOffset(values: np.ndarray, signProp: bool = False) -> Tuple[Type signProp : bool Whether to consider signedness when inferring the data type. + + original_dtype : np.dtype, optional + Original numpy dtype before float64 cast, used to resolve all-zero ambiguity. 
+ Returns ------- Tuple[Type[BaseType], int] The inferred type and offset """ - _type = inferMinimalType(values) + _type = inferMinimalType(values, original_dtype = original_dtype) if signProp and issubclass(_type, IntegerImmediate): _type, offset = signPropTypeAndOffset(_type) diff --git a/DeeployTest/test_generic_config.py b/DeeployTest/test_generic_config.py index b0d8c659ca..e9e8480dd1 100644 --- a/DeeployTest/test_generic_config.py +++ b/DeeployTest/test_generic_config.py @@ -21,7 +21,7 @@ "Kernels/FP32/MaxPool/Regular_2D", "Kernels/FP32/Mul", "Kernels/FP32/LayerNorm", - "Kernels/FP32/RMSNorm", + "Kernels/FP32/RMSNorm/separate_ops", "Kernels/FP32/Pow/Scalar", "Kernels/FP32/Pow/Vector", "Kernels/FP32/ReduceMean/KeepDims/Add_ReduceMean", diff --git a/DeeployTest/test_platforms.py b/DeeployTest/test_platforms.py index 6d9f3cfcd7..12f698008c 100644 --- a/DeeployTest/test_platforms.py +++ b/DeeployTest/test_platforms.py @@ -39,6 +39,7 @@ from test_snitch_config import KERNEL_TESTS as SNITCH_KERNEL_TESTS from test_snitch_config import MODEL_TESTS as SNITCH_MODEL_TESTS from test_snitch_tiled_config import L2_SINGLEBUFFER_KERNELS as SNITCH_L2_SINGLEBUFFER_KERNELS +from test_snitch_tiled_config import L2_SINGLEBUFFER_MODELS as SNITCH_L2_SINGLEBUFFER_MODELS from test_softhier_config import DEFAULT_NUM_CLUSTERS as SOFTHIER_DEFAULT_NUM_CLUSTERS from test_softhier_config import KERNEL_TESTS as SOFTHIER_KERNEL_TESTS from test_softhier_config import MODEL_TESTS as SOFTHIER_MODEL_TESTS @@ -536,6 +537,25 @@ def test_snitch_kernels(test_name, deeploy_test_dir, toolchain, toolchain_dir, c run_and_assert_test(test_name, config, skipgen, skipsim) +@pytest.mark.snitch +@pytest.mark.models +@pytest.mark.parametrize("test_name", SNITCH_MODEL_TESTS, ids = SNITCH_MODEL_TESTS) +def test_snitch_models(test_name, deeploy_test_dir, toolchain, toolchain_dir, cmake_args, skipgen, skipsim) -> None: + platform_config = PLATFORM_CONFIGS["snitch"] + snitch_cmake_args = cmake_args + 
[f"NUM_CORES={platform_config['default_num_cores']}"] + config = create_test_config( + test_name = test_name, + platform = platform_config["platform"], + simulator = platform_config["simulator"], + deeploy_test_dir = deeploy_test_dir, + toolchain = toolchain, + toolchain_dir = toolchain_dir, + cmake_args = snitch_cmake_args, + tiling = False, + ) + run_and_assert_test(test_name, config, skipgen, skipsim) + + @pytest.mark.snitch_tiled @pytest.mark.kernels @pytest.mark.singlebuffer @@ -569,6 +589,37 @@ def test_snitch_tiled_kernels_l2_singlebuffer(test_params, deeploy_test_dir, too run_and_assert_test(test_name, config, skipgen, skipsim) +@pytest.mark.snitch_tiled +@pytest.mark.models +@pytest.mark.singlebuffer +@pytest.mark.l2 +@pytest.mark.parametrize( + "test_params", + generate_test_params(SNITCH_L2_SINGLEBUFFER_MODELS, "L2-singlebuffer"), + ids = param_id, +) +def test_snitch_tiled_models_l2_singlebuffer(test_params, deeploy_test_dir, toolchain, toolchain_dir, cmake_args, + skipgen, skipsim) -> None: + test_name, l1, config_name = test_params + snitch_cmake_args = cmake_args + [f"NUM_CORES={SNITCH_DEFAULT_NUM_CORES}"] + config = create_test_config( + test_name = test_name, + platform = "Snitch", + simulator = "gvsoc", + deeploy_test_dir = deeploy_test_dir, + toolchain = toolchain, + toolchain_dir = toolchain_dir, + cmake_args = snitch_cmake_args, + tiling = True, + cores = SNITCH_DEFAULT_NUM_CORES, + l1 = l1, + l2 = 4000000, + default_mem_level = "L2", + double_buffer = False, + ) + run_and_assert_test(test_name, config, skipgen, skipsim) + + @pytest.mark.siracusa_neureka_tiled @pytest.mark.kernels @pytest.mark.singlebuffer diff --git a/DeeployTest/test_siracusa_neureka_tiled_config.py b/DeeployTest/test_siracusa_neureka_tiled_config.py index 68bd3dd96e..f3fe687064 100644 --- a/DeeployTest/test_siracusa_neureka_tiled_config.py +++ b/DeeployTest/test_siracusa_neureka_tiled_config.py @@ -31,7 +31,7 @@ "Models/miniMobileNet": [2000], "Kernels/Integer/Attention": 
[2500], "Models/Transformer": [15000], - "Models/microLlama/microLlama1": [10000], + "Models/microLlama/INT8/microLlama1": [10000], } # L3 double-buffer model tests @@ -53,5 +53,5 @@ L3_DOUBLEBUFFER_MODELS_WMEM = { "Models/miniMobileNet": [2000], "Kernels/Integer/Attention": [3500], - "Models/microLlama/microLlama1": [10000], + "Models/microLlama/INT8/microLlama1": [10000], } diff --git a/DeeployTest/test_siracusa_tiled_config.py b/DeeployTest/test_siracusa_tiled_config.py index a687d9a489..7ceaeab9c3 100644 --- a/DeeployTest/test_siracusa_tiled_config.py +++ b/DeeployTest/test_siracusa_tiled_config.py @@ -106,9 +106,9 @@ "Models/miniMobileNet": [60000, 12000, 6000, 3000], "Models/miniMobileNetv2": [60000, 16000, 12000, 8000], "Kernels/Integer/Attention": [60000, 10000, 5000], - "Models/microLlama/microLlama1": [60000, 10000, 5000], - "Models/microLlama/microLlama8": [60000, 10000, 5000], - "Models/microLlama/microLlama8_parallel": [60000, 10000, 5000], + "Models/microLlama/INT8/microLlama1": [60000, 10000, 5000], + "Models/microLlama/INT8/microLlama8": [60000, 10000, 5000], + "Models/microLlama/INT8/microLlama8_parallel": [60000, 10000, 5000], "Models/MLPerf/KeywordSpotting": [64000], "Models/MLPerf/ImageClassification": [64000], "Models/MLPerf/AnomalyDetection": [64000], @@ -121,9 +121,9 @@ "Models/miniMobileNet": [60000, 24000, 12000, 6000], "Models/miniMobileNetv2": [60000, 32000, 24000, 16000], "Kernels/Integer/Attention": [60000, 20000, 10000, 5000], - "Models/microLlama/microLlama1": [60000, 20000, 10000], - "Models/microLlama/microLlama8": [60000, 20000, 10000], - "Models/microLlama/microLlama8_parallel": [60000, 20000, 10000], + "Models/microLlama/INT8/microLlama1": [60000, 20000, 10000], + "Models/microLlama/INT8/microLlama8": [60000, 20000, 10000], + "Models/microLlama/INT8/microLlama8_parallel": [60000, 20000, 10000], "Models/MLPerf/KeywordSpotting": [128000], "Models/MLPerf/ImageClassification": [128000], "Models/MLPerf/AnomalyDetection": [128000], @@ 
-137,7 +137,7 @@ "Models/miniMobileNetv2": [60000, 16000, 12000, 8000], "Kernels/Integer/Attention": [60000, 10000, 5000, 2500], "Models/Transformer": [60000, 30000, 15000], - "Models/microLlama/microLlama1": [60000, 10000, 5000], + "Models/microLlama/INT8/microLlama1": [60000, 10000, 5000], "Models/CCT/FP32/CCT_2_32_32_128": [128000], "Models/CCT_Train/CCT2_FT2": [128000], "Models/TinyViT/Demo": [4000], @@ -149,9 +149,9 @@ "Models/miniMobileNetv2": [60000, 32000, 24000, 16000], "Kernels/Integer/Attention": [60000, 20000, 10000, 5000], "Models/Transformer": [60000, 30000, 15000], - "Models/microLlama/microLlama1": [60000, 20000, 10000], - "Models/microLlama/microLlama8": [60000, 20000, 10000], - "Models/microLlama/microLlama8_parallel": [60000, 20000, 10000], + "Models/microLlama/INT8/microLlama1": [60000, 20000, 10000], + "Models/microLlama/INT8/microLlama8": [60000, 20000, 10000], + "Models/microLlama/INT8/microLlama8_parallel": [60000, 20000, 10000], "Models/CCT/FP32/CCT_2_32_32_128": [128000], "Models/CCT_Train/CCT2_FT2": [128000], "Models/TinyViT/Demo": [4000], diff --git a/DeeployTest/test_snitch_config.py b/DeeployTest/test_snitch_config.py index f51b2ede23..1a16cb8425 100644 --- a/DeeployTest/test_snitch_config.py +++ b/DeeployTest/test_snitch_config.py @@ -9,6 +9,12 @@ DEFAULT_NUM_CORES = 9 KERNEL_TESTS = [ + "Kernels/FP32/Add/Regular", + "Kernels/FP32/Div", + "Kernels/FP32/Hardswish", + "Kernels/FP32/MatMul", + "Kernels/FP32/Mul", + "Kernels/FP32/RMSNorm/single_fused_op", "Kernels/FP32/Softmax/Regular", "Kernels/Integer/Add/Large", "Kernels/Integer/Add/Regular", @@ -21,4 +27,6 @@ "Kernels/Integer/GEMM/TransB_RQ", ] -MODEL_TESTS = [] +MODEL_TESTS = [ + "Models/microLlama/FP32/microLlama1", +] diff --git a/DeeployTest/test_snitch_tiled_config.py b/DeeployTest/test_snitch_tiled_config.py index 3f81239fce..d40521f8d4 100644 --- a/DeeployTest/test_snitch_tiled_config.py +++ b/DeeployTest/test_snitch_tiled_config.py @@ -11,17 +11,23 @@ # L2 single-buffer tests 
with different L1 sizes # Format: {test_name: [L1_sizes]} L2_SINGLEBUFFER_KERNELS = { - "Kernels/Integer/Add/Large": [5000, 10000], - "Kernels/Integer/Softmax/Large": [5000, 10000], + "Kernels/FP32/Div": [2000, 5000, 10000], + "Kernels/FP32/Hardswish": [2000, 5000, 10000], + "Kernels/FP32/Mul": [2000, 5000, 10000], + "Kernels/FP32/RMSNorm/single_fused_op": [2000, 5000, 10000], "Kernels/FP32/Softmax/Regular": [2000, 5000, 10000], "Kernels/FP32/GEMM/Regular": [2000, 5000, 10000], "Kernels/FP32/GEMM/TransB": [2000, 5000, 10000], + "Kernels/Integer/Add/Large": [5000, 10000], + "Kernels/Integer/Softmax/Large": [5000, 10000], "Kernels/Integer/iNoNorm": [5000, 10000], "Kernels/Integer/Add/Regular_RQ": [5000, 10000], "Kernels/Integer/GEMM/Regular_RQPerRow": [2000, 5000], } -L2_SINGLEBUFFER_MODELS = {} +L2_SINGLEBUFFER_MODELS = { + "Models/microLlama/FP32/microLlama1": [10000, 20000], +} # Currently no double-buffer configurations in CI L2_DOUBLEBUFFER_KERNELS = {} diff --git a/TargetLibraries/Generic/inc/macros.h b/TargetLibraries/Generic/inc/macros.h index d97cfecb7c..0b5a0e51fb 100644 --- a/TargetLibraries/Generic/inc/macros.h +++ b/TargetLibraries/Generic/inc/macros.h @@ -7,22 +7,28 @@ #ifndef __DEEPLOY_BASIC_MATH_MACROS_HEADER_ #define __DEEPLOY_BASIC_MATH_MACROS_HEADER_ +#ifndef MAX #define MAX(a, b) \ ({ \ __typeof__(a) _a = (a); \ __typeof__(b) _b = (b); \ _a > _b ? _a : _b; \ }) +#endif +#ifndef MIN #define MIN(a, b) \ ({ \ __typeof__(a) _a = (a); \ __typeof__(b) _b = (b); \ _a < _b ? _a : _b; \ }) +#endif +#ifndef CLAMP #define CLAMP(x, low, high) \ (((x) > (high)) ? (high) : (((x) < (low)) ? 
(low) : (x))) +#endif #define inf 1.0f / 0.0f diff --git a/TargetLibraries/Snitch/inc/DeeploySnitchMath.h b/TargetLibraries/Snitch/inc/DeeploySnitchMath.h index e44d3c20c6..9cc873f04b 100644 --- a/TargetLibraries/Snitch/inc/DeeploySnitchMath.h +++ b/TargetLibraries/Snitch/inc/DeeploySnitchMath.h @@ -12,9 +12,9 @@ #include #include -#define BEGIN_SINGLE_CORE if (core_id == 0) { +#define BEGIN_SINGLE_CORE if (snrt_cluster_core_idx() == 0) { #define END_SINGLE_CORE } -#define SINGLE_CORE if (core_id == 0) +#define SINGLE_CORE if (snrt_cluster_core_idx() == 0) #include "CycleCounter.h" #include "macros.h" @@ -23,8 +23,14 @@ #include "snrt.h" +#include "kernel/Add.h" +#include "kernel/Div.h" #include "kernel/Gemm.h" +#include "kernel/Gemm_fp32.h" +#include "kernel/HardSwish.h" #include "kernel/MatMul.h" +#include "kernel/Mul.h" +#include "kernel/RMSNrom.h" #include "kernel/RQGemm.h" #include "kernel/RQMatMul.h" #include "kernel/Softmax.h" diff --git a/TargetLibraries/Snitch/inc/kernel/Add.h b/TargetLibraries/Snitch/inc/kernel/Add.h new file mode 100644 index 0000000000..90881e55fd --- /dev/null +++ b/TargetLibraries/Snitch/inc/kernel/Add.h @@ -0,0 +1,18 @@ +/* + * SPDX-FileCopyrightText: 2026 ETH Zurich and University of Bologna + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#ifndef __DEEPLOY_MATH_ADD_KERNEL_HEADER_ +#define __DEEPLOY_MATH_ADD_KERNEL_HEADER_ + +#include "DeeploySnitchMath.h" + +void Add_fp32(float32_t *pIn1, float32_t *pIn2, float32_t *pOut, uint32_t size); + +void Add_fp32_broadcast(float32_t *pIn1, float32_t *pIn2, float32_t *pOut, + uint32_t *out_shape, uint32_t *strides1, + uint32_t *strides2, uint32_t ndim, uint32_t size); + +#endif // __DEEPLOY_MATH_ADD_KERNEL_HEADER_ diff --git a/TargetLibraries/Snitch/inc/kernel/Div.h b/TargetLibraries/Snitch/inc/kernel/Div.h new file mode 100644 index 0000000000..e9b257a634 --- /dev/null +++ b/TargetLibraries/Snitch/inc/kernel/Div.h @@ -0,0 +1,44 @@ +/* + * SPDX-FileCopyrightText: 2026 ETH Zurich and 
University of Bologna + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#ifndef __DEEPLOY_MATH_DIV_FP32_KERNEL_HEADER_ +#define __DEEPLOY_MATH_DIV_FP32_KERNEL_HEADER_ + +#include "DeeploySnitchMath.h" + +/* + * Element-wise Division (FP32) + * + * Computes: output[i] = input1[i] / input2[i] + * + * input1: Numerator tensor (float32) + * input2: Denominator tensor (float32) + * output: Output tensor (same shape as input1) + * size: Total number of elements + * + * multi-core = yes + * parallelization = element-wise + */ +void Div_fp32(float32_t *input1, float32_t *input2, float32_t *output, + uint32_t size); + +/* + * Element-wise Division with scalar broadcasting (FP32) + * + * Computes: output[i] = input1[i] / scalar + * + * input1: Numerator tensor (float32) + * scalar: Scalar denominator (float32) + * output: Output tensor (same shape as input1) + * size: Total number of elements + * + * multi-core = yes + * parallelization = element-wise + */ +void Div_fp32_scalar(float32_t *input1, float32_t scalar, float32_t *output, + uint32_t size); + +#endif // __DEEPLOY_MATH_DIV_FP32_KERNEL_HEADER_ diff --git a/TargetLibraries/Snitch/inc/kernel/HardSwish.h b/TargetLibraries/Snitch/inc/kernel/HardSwish.h new file mode 100644 index 0000000000..a0cfdaac12 --- /dev/null +++ b/TargetLibraries/Snitch/inc/kernel/HardSwish.h @@ -0,0 +1,34 @@ +/* + * SPDX-FileCopyrightText: 2026 ETH Zurich and University of Bologna + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#ifndef __DEEPLOY_MATH_HARDSWISH_KERNEL_HEADER_ +#define __DEEPLOY_MATH_HARDSWISH_KERNEL_HEADER_ + +#include "DeeploySnitchMath.h" + +/* + * HardSwish Activation Function + * + * Computes: HardSwish(x) = x * clip(x/6 + 0.5, 0, 1) + * + * Piecewise form: + * - When x <= -3: output = 0 + * - When -3 < x < 3: output = x * (x/6 + 0.5) + * - When x >= 3: output = x + * + * This is a computationally efficient approximation of Swish/SiLU activation + * commonly used in mobile neural networks and transformer models. 
+ * + * data_in: Input tensor (FP32) + * data_out: Output tensor (FP32, same shape as input) + * size: Total number of elements + * + * multi-core = yes + * parallelization = element-wise + */ +void HardSwish_fp32(float32_t *data_in, float32_t *data_out, uint32_t size); + +#endif // __DEEPLOY_MATH_HARDSWISH_KERNEL_HEADER_ diff --git a/TargetLibraries/Snitch/inc/kernel/MatMul.h b/TargetLibraries/Snitch/inc/kernel/MatMul.h index d4b9ba71ca..66e02c2d90 100644 --- a/TargetLibraries/Snitch/inc/kernel/MatMul.h +++ b/TargetLibraries/Snitch/inc/kernel/MatMul.h @@ -137,4 +137,21 @@ void MatMul_unrolled_2x2_parallel_s32_rv32im(int32_t const *__restrict__ pSrcA, uint32_t M, uint32_t N, uint32_t P); +/******************************************************************************/ +/* Matrix Multiplication (FP32, multi-core) */ +/******************************************************************************/ + +/* + * Matrix multiplication ---------------------------------- + * kernel = matmul_fp32_opt + * data type = 32-bit float + * multi-core = yes (splits M rows across compute cores internally) + * unrolling = 8 columns + * cleanup = yes + */ +void matmul_fp32_opt(const float32_t *__restrict__ pSrcA, + const float32_t *__restrict__ pSrcB, + float32_t *__restrict__ pDstY, uint32_t M, uint32_t N, + uint32_t O); + #endif //__DEEPLOY_MATH_MATMUL_KERNEL_HEADER_ diff --git a/TargetLibraries/Snitch/inc/kernel/Mul.h b/TargetLibraries/Snitch/inc/kernel/Mul.h new file mode 100644 index 0000000000..d851e2e3bf --- /dev/null +++ b/TargetLibraries/Snitch/inc/kernel/Mul.h @@ -0,0 +1,44 @@ +/* + * SPDX-FileCopyrightText: 2026 ETH Zurich and University of Bologna + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#ifndef __DEEPLOY_MATH_MUL_FP32_KERNEL_HEADER_ +#define __DEEPLOY_MATH_MUL_FP32_KERNEL_HEADER_ + +#include "DeeploySnitchMath.h" + +/* + * Element-wise Multiplication (FP32) + * + * Computes: output[i] = input1[i] * input2[i] + * + * input1: First input tensor (float32) + * 
input2: Second input tensor (float32) + * output: Output tensor (same shape as input1) + * size: Total number of elements + * + * multi-core = yes + * parallelization = element-wise + */ +void Mul_fp32(float32_t *input1, float32_t *input2, float32_t *output, + uint32_t size); + +/* + * Element-wise Multiplication with scalar broadcasting (FP32) + * + * Computes: output[i] = input1[i] * scalar + * + * input1: Input tensor (float32) + * scalar: Scalar multiplier (float32) + * output: Output tensor (same shape as input1) + * size: Total number of elements + * + * multi-core = yes + * parallelization = element-wise + */ +void Mul_fp32_scalar(float32_t *input1, float32_t scalar, float32_t *output, + uint32_t size); + +#endif // __DEEPLOY_MATH_MUL_FP32_KERNEL_HEADER_ diff --git a/TargetLibraries/Snitch/inc/kernel/RMSNrom.h b/TargetLibraries/Snitch/inc/kernel/RMSNrom.h new file mode 100644 index 0000000000..16e25cd38c --- /dev/null +++ b/TargetLibraries/Snitch/inc/kernel/RMSNrom.h @@ -0,0 +1,31 @@ +/* + * SPDX-FileCopyrightText: 2026 ETH Zurich and University of Bologna + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#ifndef __DEEPLOY_MATH_RMSNORM_KERNEL_HEADER_ +#define __DEEPLOY_MATH_RMSNORM_KERNEL_HEADER_ + +#include "DeeploySnitchMath.h" + +/* + * RMS Normalization (Root Mean Square Normalization) + * + * Computes: output[i] = (input[i] / rms) * weight[i] + * where rms = sqrt(mean(input^2) + eps) + * + * data_in: Input tensor [batch, seq, hidden] or flattened [size] + * weight: Weight tensor [hidden_dim] + * data_out: Output tensor (same shape as input) + * size: Total number of elements (batch * seq * hidden) + * lastDimLength: Hidden dimension size + * eps: Epsilon for numerical stability (typically 1e-6) + * + * multi-core = yes + * parallelization = vector-wise (across batch * sequence) + */ +void RMSNorm_fp32(float32_t *data_in, float32_t *weight, float32_t *data_out, + uint32_t size, uint32_t lastDimLength, float32_t eps); + +#endif // 
__DEEPLOY_MATH_RMSNORM_KERNEL_HEADER_ diff --git a/TargetLibraries/Snitch/inc/kernel/Softmax.h b/TargetLibraries/Snitch/inc/kernel/Softmax.h index c2d7596e7a..8e9d191053 100644 --- a/TargetLibraries/Snitch/inc/kernel/Softmax.h +++ b/TargetLibraries/Snitch/inc/kernel/Softmax.h @@ -9,8 +9,7 @@ #include "DeeploySnitchMath.h" -void softmax_fp32(float *input, float *output, int32_t ldI, - int32_t batch_offset, int32_t batch_size, int32_t seq_len, - int32_t input_samples); +void Softmax_fp32(float32_t *input, float32_t *output, uint32_t size, + uint32_t lastDimLength); -#endif // #define __DEEPLOY_MATH_SOFTMAX_KERNEL_HEADER_ \ No newline at end of file +#endif // #define __DEEPLOY_MATH_SOFTMAX_KERNEL_HEADER_ diff --git a/TargetLibraries/Snitch/src/Add_fp32.c b/TargetLibraries/Snitch/src/Add_fp32.c new file mode 100644 index 0000000000..785ff0c2b3 --- /dev/null +++ b/TargetLibraries/Snitch/src/Add_fp32.c @@ -0,0 +1,69 @@ +/* + * SPDX-FileCopyrightText: 2026 ETH Zurich and University of Bologna + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "DeeploySnitchMath.h" + +void Add_fp32(float32_t *pIn1, float32_t *pIn2, float32_t *pOut, + uint32_t size) { + + uint32_t core_id = snrt_global_compute_core_idx(); + uint32_t numThreads = snrt_global_compute_core_num(); + + uint32_t chunkSize = size / numThreads; + uint32_t remainder = size % numThreads; + + uint32_t start, num_elements; + if (core_id < remainder) { + num_elements = chunkSize + 1; + start = core_id * num_elements; + } else { + num_elements = chunkSize; + start = core_id * chunkSize + remainder; + } + + uint32_t end = start + num_elements; + + for (uint32_t i = start; i < end; i++) { + pOut[i] = pIn1[i] + pIn2[i]; + } +} + +void Add_fp32_broadcast(float32_t *pIn1, float32_t *pIn2, float32_t *pOut, + uint32_t *out_shape, uint32_t *strides1, + uint32_t *strides2, uint32_t ndim, uint32_t size) { + + uint32_t core_id = snrt_global_compute_core_idx(); + uint32_t numThreads = snrt_global_compute_core_num(); + + 
uint32_t chunkSize = size / numThreads; + uint32_t remainder = size % numThreads; + + uint32_t start, num_elements; + if (core_id < remainder) { + num_elements = chunkSize + 1; + start = core_id * num_elements; + } else { + num_elements = chunkSize; + start = core_id * chunkSize + remainder; + } + + uint32_t end = start + num_elements; + + for (uint32_t i = start; i < end; i++) { + uint32_t idx1 = 0; + uint32_t idx2 = 0; + uint32_t tmp = i; + + for (int32_t d = ndim - 1; d >= 0; d--) { + uint32_t coord = tmp % out_shape[d]; + tmp /= out_shape[d]; + idx1 += coord * strides1[d]; + idx2 += coord * strides2[d]; + } + + pOut[i] = pIn1[idx1] + pIn2[idx2]; + } +} diff --git a/TargetLibraries/Snitch/src/Div_fp32.c b/TargetLibraries/Snitch/src/Div_fp32.c new file mode 100644 index 0000000000..07c3d3c5d4 --- /dev/null +++ b/TargetLibraries/Snitch/src/Div_fp32.c @@ -0,0 +1,89 @@ +/* + * SPDX-FileCopyrightText: 2026 ETH Zurich and University of Bologna + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "DeeploySnitchMath.h" + +/* + * Element-wise Division (FP32) + * + * Computes: output[i] = input1[i] / input2[i] + * + * Supports ONNX broadcasting rules: + * - If input2 is scalar (size=1): divides all elements of input1 by input2[0] + * - If both have same size: element-wise division + * + * input1: Numerator tensor (float32) + * input2: Denominator tensor (float32) + * output: Output tensor (same shape as input1) + * size: Total number of elements in input1 + * + * multi-core = yes + * parallelization = element-wise across input1 + */ +void Div_fp32(float32_t *input1, float32_t *input2, float32_t *output, + uint32_t size) { + + uint32_t core_id = snrt_global_compute_core_idx(); + uint32_t numThreads = snrt_global_compute_core_num(); + + // Parallelize across elements + uint32_t elements_per_core = size / numThreads; + uint32_t remainder = size % numThreads; + + uint32_t start_elem, num_elems; + if (core_id < remainder) { + num_elems = elements_per_core + 1; + 
start_elem = core_id * num_elems; + } else { + num_elems = elements_per_core; + start_elem = core_id * elements_per_core + remainder; + } + + // Check if input2 is a scalar (size=1, broadcasted) + // Note: This assumes the parser has set input2_size correctly + // For now, we assume element-wise division (same size) + for (uint32_t i = start_elem; i < start_elem + num_elems; i++) { + output[i] = input1[i] / input2[i]; + } +} + +/* + * Element-wise Division with scalar broadcasting (FP32) + * + * Computes: output[i] = input1[i] / scalar + * + * input1: Numerator tensor (float32) + * scalar: Scalar denominator (float32) + * output: Output tensor (same shape as input1) + * size: Total number of elements in input1 + * + * multi-core = yes + * parallelization = element-wise + */ +void Div_fp32_scalar(float32_t *input1, float32_t scalar, float32_t *output, + uint32_t size) { + + uint32_t core_id = snrt_global_compute_core_idx(); + uint32_t numThreads = snrt_global_compute_core_num(); + + uint32_t elements_per_core = size / numThreads; + uint32_t remainder = size % numThreads; + + uint32_t start_elem, num_elems; + if (core_id < remainder) { + num_elems = elements_per_core + 1; + start_elem = core_id * num_elems; + } else { + num_elems = elements_per_core; + start_elem = core_id * elements_per_core + remainder; + } + + float32_t inv_scalar = 1.0f / scalar; // Compute inverse once + + for (uint32_t i = start_elem; i < start_elem + num_elems; i++) { + output[i] = input1[i] * inv_scalar; + } +} diff --git a/TargetLibraries/Snitch/src/HardSwish.c b/TargetLibraries/Snitch/src/HardSwish.c new file mode 100644 index 0000000000..b7e9679c64 --- /dev/null +++ b/TargetLibraries/Snitch/src/HardSwish.c @@ -0,0 +1,46 @@ +/* + * SPDX-FileCopyrightText: 2026 ETH Zurich and University of Bologna + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "DeeploySnitchMath.h" + +void HardSwish_fp32(float32_t *data_in, float32_t *data_out, uint32_t size) { + + uint32_t core_id = 
snrt_global_compute_core_idx(); + uint32_t numThreads = snrt_global_compute_core_num(); + + // Parallelize by dividing work across cores + uint32_t chunk_size = size / numThreads; + uint32_t remainder = size % numThreads; + + uint32_t start, end; + if (core_id < remainder) { + chunk_size += 1; + start = core_id * chunk_size; + } else { + start = core_id * chunk_size + remainder; + } + end = start + chunk_size; + + // HardSwish(x) = x * clip(x/6 + 0.5, 0, 1) + // Piecewise: + // x <= -3: output = 0 + // -3 < x < 3: output = x * (x/6 + 0.5) + // x >= 3: output = x + + for (uint32_t i = start; i < end; i++) { + float32_t x = data_in[i]; + float32_t clip_val = x / 6.0f + 0.5f; + + // Clamp to [0, 1] + if (clip_val < 0.0f) { + clip_val = 0.0f; + } else if (clip_val > 1.0f) { + clip_val = 1.0f; + } + + data_out[i] = x * clip_val; + } +} diff --git a/TargetLibraries/Snitch/src/MatMul_fp32.c b/TargetLibraries/Snitch/src/MatMul_fp32.c new file mode 100644 index 0000000000..337b735072 --- /dev/null +++ b/TargetLibraries/Snitch/src/MatMul_fp32.c @@ -0,0 +1,86 @@ +/* + * SPDX-FileCopyrightText: 2026 ETH Zurich and University of Bologna + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "DeeploySnitchMath.h" + +/* + * Multi-core FP32 matrix multiplication (scalar, no SSR) + * + * Computes: Y = A * B + * A is M x N, B is N x O, Y is M x O + * All matrices in row-major layout. + * + * Splits M rows across compute cores internally. + * Uses a distinct function name to avoid being shadowed by + * the Generic single-core MatMul_fp32_fp32_fp32 (link order). 
+ */ +void matmul_fp32_opt(const float32_t *__restrict__ pSrcA, + const float32_t *__restrict__ pSrcB, + float32_t *__restrict__ pDstY, uint32_t M, uint32_t N, + uint32_t O) { + + uint32_t core_id = snrt_global_compute_core_idx(); + uint32_t numThreads = snrt_global_compute_core_num(); + + uint32_t rows_per_core = M / numThreads; + uint32_t remainder = M % numThreads; + + uint32_t start_row, num_rows; + if (core_id < remainder) { + num_rows = rows_per_core + 1; + start_row = core_id * num_rows; + } else { + num_rows = rows_per_core; + start_row = core_id * rows_per_core + remainder; + } + + const uint32_t unroll = 8; + uint32_t O_block = O - (O % unroll); + + for (uint32_t i = start_row; i < start_row + num_rows; i++) { + uint32_t j; + for (j = 0; j < O_block; j += unroll) { + float32_t c0 = 0.0f; + float32_t c1 = 0.0f; + float32_t c2 = 0.0f; + float32_t c3 = 0.0f; + float32_t c4 = 0.0f; + float32_t c5 = 0.0f; + float32_t c6 = 0.0f; + float32_t c7 = 0.0f; + + for (uint32_t k = 0; k < N; k++) { + float32_t a = pSrcA[i * N + k]; + c0 += a * pSrcB[k * O + j + 0]; + c1 += a * pSrcB[k * O + j + 1]; + c2 += a * pSrcB[k * O + j + 2]; + c3 += a * pSrcB[k * O + j + 3]; + c4 += a * pSrcB[k * O + j + 4]; + c5 += a * pSrcB[k * O + j + 5]; + c6 += a * pSrcB[k * O + j + 6]; + c7 += a * pSrcB[k * O + j + 7]; + } + + pDstY[i * O + j + 0] = c0; + pDstY[i * O + j + 1] = c1; + pDstY[i * O + j + 2] = c2; + pDstY[i * O + j + 3] = c3; + pDstY[i * O + j + 4] = c4; + pDstY[i * O + j + 5] = c5; + pDstY[i * O + j + 6] = c6; + pDstY[i * O + j + 7] = c7; + } + + // Cleanup for remaining columns + for (; j < O; j++) { + float32_t sum = 0.0f; + for (uint32_t k = 0; k < N; k++) { + sum += pSrcA[i * N + k] * pSrcB[k * O + j]; + } + pDstY[i * O + j] = sum; + } + } +} diff --git a/TargetLibraries/Snitch/src/Mul_fp32.c b/TargetLibraries/Snitch/src/Mul_fp32.c new file mode 100644 index 0000000000..80d6bc9b33 --- /dev/null +++ b/TargetLibraries/Snitch/src/Mul_fp32.c @@ -0,0 +1,86 @@ +/* + * 
SPDX-FileCopyrightText: 2026 ETH Zurich and University of Bologna + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "DeeploySnitchMath.h" + +/* + * Element-wise Multiplication (FP32) + * + * Computes: output[i] = input1[i] * input2[i] + * + * Supports ONNX broadcasting rules: + * - If input2 is scalar (size=1): multiplies all elements of input1 by + * input2[0] + * - If both have same size: element-wise multiplication + * + * input1: First input tensor (float32) + * input2: Second input tensor (float32) + * output: Output tensor (same shape as input1) + * size: Total number of elements in input1 + * + * multi-core = yes + * parallelization = element-wise across input1 + */ +void Mul_fp32(float32_t *input1, float32_t *input2, float32_t *output, + uint32_t size) { + + uint32_t core_id = snrt_global_compute_core_idx(); + uint32_t numThreads = snrt_global_compute_core_num(); + + // Parallelize across elements + uint32_t elements_per_core = size / numThreads; + uint32_t remainder = size % numThreads; + + uint32_t start_elem, num_elems; + if (core_id < remainder) { + num_elems = elements_per_core + 1; + start_elem = core_id * num_elems; + } else { + num_elems = elements_per_core; + start_elem = core_id * elements_per_core + remainder; + } + + // Element-wise multiplication + for (uint32_t i = start_elem; i < start_elem + num_elems; i++) { + output[i] = input1[i] * input2[i]; + } +} + +/* + * Element-wise Multiplication with scalar broadcasting (FP32) + * + * Computes: output[i] = input1[i] * scalar + * + * input1: Input tensor (float32) + * scalar: Scalar multiplier (float32) + * output: Output tensor (same shape as input1) + * size: Total number of elements in input1 + * + * multi-core = yes + * parallelization = element-wise + */ +void Mul_fp32_scalar(float32_t *input1, float32_t scalar, float32_t *output, + uint32_t size) { + + uint32_t core_id = snrt_global_compute_core_idx(); + uint32_t numThreads = snrt_global_compute_core_num(); + + uint32_t 
elements_per_core = size / numThreads; + uint32_t remainder = size % numThreads; + + uint32_t start_elem, num_elems; + if (core_id < remainder) { + num_elems = elements_per_core + 1; + start_elem = core_id * num_elems; + } else { + num_elems = elements_per_core; + start_elem = core_id * elements_per_core + remainder; + } + + for (uint32_t i = start_elem; i < start_elem + num_elems; i++) { + output[i] = input1[i] * scalar; + } +} diff --git a/TargetLibraries/Snitch/src/RMSNrom_fp32.c b/TargetLibraries/Snitch/src/RMSNrom_fp32.c new file mode 100644 index 0000000000..9c615ce923 --- /dev/null +++ b/TargetLibraries/Snitch/src/RMSNrom_fp32.c @@ -0,0 +1,50 @@ +/* + * SPDX-FileCopyrightText: 2026 ETH Zurich and University of Bologna + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "DeeploySnitchMath.h" +#include <math.h> + +void RMSNorm_fp32(float32_t *data_in, float32_t *weight, float32_t *data_out, + uint32_t size, uint32_t lastDimLength, float32_t eps) { + + uint32_t core_id = snrt_global_compute_core_idx(); + uint32_t numThreads = snrt_global_compute_core_num(); + + uint32_t num_vectors = size / lastDimLength; + + // Parallelize across vectors (batch * sequence dimension) + uint32_t vectors_per_core = num_vectors / numThreads; + uint32_t remainder = num_vectors % numThreads; + + uint32_t start_vec, num_vecs; + if (core_id < remainder) { + num_vecs = vectors_per_core + 1; + start_vec = core_id * num_vecs; + } else { + num_vecs = vectors_per_core; + start_vec = core_id * vectors_per_core + remainder; + } + + for (uint32_t v = start_vec; v < start_vec + num_vecs; v++) { + float32_t *in_ptr = data_in + v * lastDimLength; + float32_t *out_ptr = data_out + v * lastDimLength; + + // Compute sum of squares + float32_t sum_sq = 0.0f; + for (uint32_t i = 0; i < lastDimLength; i++) { + sum_sq += in_ptr[i] * in_ptr[i]; + } + + // Compute RMS with epsilon + float32_t rms = sqrtf(sum_sq / (float32_t)lastDimLength + eps); + float32_t inv_rms = 1.0f / rms; + + // Apply
normalization and weight + for (uint32_t i = 0; i < lastDimLength; i++) { + out_ptr[i] = in_ptr[i] * inv_rms * weight[i]; + } + } +} diff --git a/TargetLibraries/Snitch/src/Softmax_fp32.c b/TargetLibraries/Snitch/src/Softmax_fp32.c index b8abb27845..31795fe304 100644 --- a/TargetLibraries/Snitch/src/Softmax_fp32.c +++ b/TargetLibraries/Snitch/src/Softmax_fp32.c @@ -5,34 +5,63 @@ */ #include "DeeploySnitchMath.h" +#include <math.h> -void Softmax_fp32(float32_t *input, float32_t *output, int32_t ldI, - int32_t batch_offset, int32_t batch_size, int32_t seq_len, - int32_t input_samples) { - - float32_t max_core = 0.0; // max value of the current core - float32_t sum = 0.0; // sum of the exp values of the current core - int32_t compute_id = snrt_global_compute_core_idx(); - int32_t row_offset = compute_id * input_samples; - for (int32_t b = 0; b < batch_size; b++) { - for (int32_t s = 0; s < seq_len; s++) { - max_core = -INFINITY; - sum = 0.0; - for (int32_t i = 0; i < input_samples; i++) { - if (input[row_offset + b * batch_offset + s * ldI + i] > max_core) { - max_core = input[row_offset + b * batch_offset + s * ldI + i]; - } - } - // compute the shifted value of the current row - for (int32_t i = 0; i < input_samples; i++) { - output[row_offset + b * batch_offset + s * ldI + i] = - expf(input[row_offset + b * batch_offset + s * ldI + i] - max_core); - sum += output[row_offset + b * batch_offset + s * ldI + i]; - } - // compute the softmax value of the current row - for (int32_t i = 0; i < input_samples; i++) { - output[row_offset + b * batch_offset + s * ldI + i] /= sum; - } +/* + * Multi-core FP32 Softmax + * + * Computes softmax along the last dimension: + * output[b][i] = exp(input[b][i] - max) / sum(exp(input[b][j] - max)) + * + * Parallelizes across the batch dimension (size / lastDimLength rows).
+ * + * input: Input tensor (float32) + * output: Output tensor (float32) + * size: Total number of elements + * lastDimLength: Length of the last dimension (softmax axis) + */ +void Softmax_fp32(float32_t *input, float32_t *output, uint32_t size, + uint32_t lastDimLength) { + + uint32_t core_id = snrt_global_compute_core_idx(); + uint32_t numThreads = snrt_global_compute_core_num(); + + uint32_t num_rows = size / lastDimLength; + + uint32_t rows_per_core = num_rows / numThreads; + uint32_t remainder = num_rows % numThreads; + + uint32_t start_row, num_rows_this_core; + if (core_id < remainder) { + num_rows_this_core = rows_per_core + 1; + start_row = core_id * num_rows_this_core; + } else { + num_rows_this_core = rows_per_core; + start_row = core_id * rows_per_core + remainder; + } + + for (uint32_t r = start_row; r < start_row + num_rows_this_core; r++) { + float32_t *in_row = input + r * lastDimLength; + float32_t *out_row = output + r * lastDimLength; + + // Find max for numerical stability + float32_t max_val = -INFINITY; + for (uint32_t i = 0; i < lastDimLength; i++) { + if (in_row[i] > max_val) + max_val = in_row[i]; + } + + // Compute exp and sum + float32_t sum = 0.0f; + for (uint32_t i = 0; i < lastDimLength; i++) { + out_row[i] = expf(in_row[i] - max_val); + sum += out_row[i]; + } + + // Normalize + float32_t inv_sum = 1.0f / sum; + for (uint32_t i = 0; i < lastDimLength; i++) { + out_row[i] *= inv_sum; } } }