From 6a33c48679f38d92659156dff9f2bc35e2aea817 Mon Sep 17 00:00:00 2001 From: Diyou Shen Date: Fri, 24 Apr 2026 10:37:16 +0200 Subject: [PATCH 01/13] [SW] Fix a data generation problem: If only has cpu version of pytorch, the data generation of 16b and 8b will not function. --- hw/system/spatz_cluster/Makefile | 14 ++ sw/spatzBenchmarks/gemv/script/gen_data.py | 21 +- .../hp-fmatmul/script/gen_data.py | 233 +++++++----------- .../sdotp-bp-fmatmul/script/gen_data.py | 204 ++++++--------- .../sdotp-hp-fmatmul/script/gen_data.py | 196 ++++++--------- .../widening-bp-fmatmul/script/gen_data.py | 204 ++++++--------- .../widening-hp-fmatmul/script/gen_data.py | 196 ++++++--------- 7 files changed, 415 insertions(+), 653 deletions(-) diff --git a/hw/system/spatz_cluster/Makefile b/hw/system/spatz_cluster/Makefile index 3fa7bd52..eb87cc4e 100644 --- a/hw/system/spatz_cluster/Makefile +++ b/hw/system/spatz_cluster/Makefile @@ -133,6 +133,20 @@ spatz.gendata: fi \ done +.PHONY: spatz.cleandata +spatz.cleandata: + @for benchmark_dir in $(ROOT)/sw/spatzBenchmarks/*/; do \ + data_dir="$$benchmark_dir/data"; \ + if [ -d "$$data_dir" ]; then \ + data_count=$$(find "$$data_dir" -name 'data*.h' -type f 2>/dev/null | wc -l); \ + if [ "$$data_count" -gt 0 ]; then \ + echo "Cleaning $$data_count data file(s) from $$data_dir"; \ + rm -f "$$data_dir"/data*.h; \ + fi \ + fi \ + done + @echo "All benchmark data cleaned." 
+ ############# # Verilator # ############# diff --git a/sw/spatzBenchmarks/gemv/script/gen_data.py b/sw/spatzBenchmarks/gemv/script/gen_data.py index 177733ef..f33d5154 100644 --- a/sw/spatzBenchmarks/gemv/script/gen_data.py +++ b/sw/spatzBenchmarks/gemv/script/gen_data.py @@ -23,7 +23,8 @@ def array_to_cstr(a, fmt=float): if isinstance(a, np.ndarray): a = list(a.flat) elif isinstance(a, torch.Tensor): - a = a.numpy().flatten().tolist() + # Universal Fix: Cast to float32 before sending to NumPy/C-string to avoid formatting errors + a = a.float().numpy().flatten().tolist() else: a = list(a) for i, el in enumerate(a): @@ -121,9 +122,11 @@ def rand_data_generator(shape, prec, alt=False): return torch.randn(shape, requires_grad=False, dtype=torch.float32), {} elif prec == 16: if alt: - return torch.randn(shape, requires_grad=False, dtype=torch.bfloat16), {} + # Universal Fix: Generate FP32, cast to BF16 + return torch.randn(shape, requires_grad=False, dtype=torch.float32).to(torch.bfloat16), {} else: - return torch.randn(shape, requires_grad=False, dtype=torch.float16), {} + # Universal Fix: Generate FP32, cast to FP16 + return torch.randn(shape, requires_grad=False, dtype=torch.float32).to(torch.float16), {} elif prec == 8: sign = torch.randint( 0, 2, shape, requires_grad=False, dtype=torch.uint8 @@ -142,15 +145,8 @@ def rand_data_generator(shape, prec, alt=False): def gemv(a, b): - # PyTorch doesn't support matmul for float16 on CPU, so convert to float32 - original_dtype = a.dtype - if original_dtype == torch.float16: - a = a.float() - b = b.float() - result = torch.matmul(a, b) - if original_dtype == torch.float16: - result = result.half() - return result + # Universal Fix: One-liner upcast and downcast + return torch.matmul(a.float(), b.float()).to(a.dtype) def main(): @@ -198,3 +194,4 @@ def main(): if __name__ == "__main__": main() + \ No newline at end of file diff --git a/sw/spatzBenchmarks/hp-fmatmul/script/gen_data.py 
b/sw/spatzBenchmarks/hp-fmatmul/script/gen_data.py index 0cd18994..f33c12b1 100755 --- a/sw/spatzBenchmarks/hp-fmatmul/script/gen_data.py +++ b/sw/spatzBenchmarks/hp-fmatmul/script/gen_data.py @@ -42,7 +42,6 @@ def array_to_cstr(a, fmt=float): def emit_header_file(layer_type: str, **kwargs): - file_path = pathlib.Path(__file__).parent.parent / "data" file_path.mkdir(parents=True, exist_ok=True) emit_str = ( @@ -96,7 +95,7 @@ def emit_conv2d_layer(name="conv2d", **kwargs): layer_str += f'static double {name}_result[{oh}][{ow}][{co}] __attribute__((section(".data")));\n\n' layer_str += ( f"static double {name}_checksum[{oh}][{ow}] = " - + array_to_cstr(torch.sum(ofmap, dim=-1)) + + array_to_cstr(torch.sum(ofmap.float(), dim=-1)) # Safe sum for checksum + ";\n\n\n" ) layer_str += ( @@ -119,7 +118,6 @@ def emit_conv2d_layer(name="conv2d", **kwargs): def emit_linear_layer(input, weights, ofmap): - layer_str = "" return layer_str @@ -168,7 +166,7 @@ def emit_GEMM_layer(name="gemm", **kwargs): ) layer_str += ( f"static const {dtype} {name}_result[{m}*{n}] = " - + array_to_cstr(result) + + array_to_cstr(torch.sum(result.float(), dim=-1)) + ";\n\n\n" ) else: @@ -192,7 +190,6 @@ def emit_GEMM_layer(name="gemm", **kwargs): def emit_batchnorm_layer(name="batchnorm", **kwargs): - ifmap = kwargs["ifmap"] ofmap = kwargs["ofmap"] beta = kwargs["beta"] @@ -215,7 +212,7 @@ def emit_batchnorm_layer(name="batchnorm", **kwargs): layer_str += f'static double {name}_result[{oh}][{ow}][{co}] __attribute__((section(".data")));\n\n' layer_str += ( f"static double {name}_checksum[{oh}][{ow}] = " - + array_to_cstr(torch.sum(ofmap, dim=-1)) + + array_to_cstr(torch.sum(ofmap.float(), dim=-1)) + ";\n\n\n" ) layer_str += ( @@ -239,7 +236,6 @@ def emit_batchnorm_layer(name="batchnorm", **kwargs): def emit_maxpool_layer(name="maxpool", **kwargs): - ifmap = kwargs["ifmap"] ofmap = kwargs["ofmap"] k = kwargs["kernel_size"] @@ -263,7 +259,7 @@ def emit_maxpool_layer(name="maxpool", **kwargs): 
layer_str += f'static double {name}_result[{oh}][{ow}][{co}] __attribute__((section(".data")));\n\n' layer_str += ( f"static double {name}_checksum[{oh}][{ow}] = " - + array_to_cstr(torch.sum(ofmap, dim=-1)) + + array_to_cstr(torch.sum(ofmap.float(), dim=-1)) + ";\n\n\n" ) layer_str += ( @@ -281,7 +277,6 @@ def emit_maxpool_layer(name="maxpool", **kwargs): def emit_fusedconv(name="fusedconv", **kwargs): - ifmap = kwargs["ifmap"] kernel = kwargs["kernel"] bn_k = kwargs["bn_k"] @@ -363,70 +358,78 @@ def rand_data_generator(shape, prec, alt=False): return torch.randn(shape, requires_grad=False, dtype=torch.float32), {} elif prec == 16: if alt: - return torch.randn(shape, requires_grad=False, dtype=torch.bfloat16), {} + # Universal Fix: Generate FP32, cast to BF16 + return torch.randn(shape, requires_grad=False, dtype=torch.float32).to(torch.bfloat16), {} else: - return ( - torch.randn( - shape, requires_grad=False, dtype=torch.float16, device=device - ), - {}, - ) + # Universal Fix: Generate FP32, cast to FP16 + return torch.randn(shape, requires_grad=False, dtype=torch.float32, device=device).to(torch.float16), {} elif prec == 8: - sign = torch.randint( - 0, 2, shape, requires_grad=False, dtype=torch.uint8 - ) # -1 or 1 - exponent = torch.randint( - 0, 16, shape, requires_grad=False, dtype=torch.uint8 - ) # < 0b01111 - mantissa = torch.randint( - 0, 4, shape, requires_grad=False, dtype=torch.uint8 - ) # can be arbitrary + sign = torch.randint(0, 2, shape, requires_grad=False, dtype=torch.uint8) + exponent = torch.randint(0, 16, shape, requires_grad=False, dtype=torch.uint8) + mantissa = torch.randint(0, 4, shape, requires_grad=False, dtype=torch.uint8) bits = {"sign": sign, "exponent": exponent, "mantissa": mantissa} - # TODO: not actually correct return ((-1.0) ** sign.double()) * (2.0 ** (exponent.double() - 15.0)) * ( 1.0 + mantissa.double() / (2**2) ), bits def conv2d(ifmap, weights, padding=1, stride=1): + # Universal Fix: Upcast for CPU Math + orig_dtype = 
ifmap.dtype + ifmap, weights = ifmap.float(), weights.float() + n, ci, ih, iw = ifmap.shape co, _, fh, fw = weights.shape conv2d = nn.Conv2d(ci, co, (fh, fw), padding=((fh - 1) // 2, (fw - 1) // 2)) conv2d.weight = nn.Parameter(weights, requires_grad=False) - conv2d.bias = nn.Parameter( - torch.zeros_like(conv2d.bias, dtype=weights.dtype), requires_grad=False - ) + # Ensure bias natively generates in float32 + conv2d.bias = nn.Parameter(torch.zeros_like(conv2d.bias, dtype=torch.float32), requires_grad=False) + ofmap = conv2d(ifmap) - return ofmap + # Universal Fix: Downcast back + return ofmap.to(orig_dtype) def max_pooling(ifmap, kernel): + orig_dtype = ifmap.dtype + ifmap = ifmap.float() + n, ci, ih, iw = ifmap.shape max_pool = nn.MaxPool2d(kernel_size=kernel) ofmap = max_pool(ifmap) - return ofmap + return ofmap.to(orig_dtype) def batchnorm(ifmap): + orig_dtype = ifmap.dtype + ifmap = ifmap.float() + n, ci, ih, iw = ifmap.shape bn = torch.nn.BatchNorm2d(ci) bn.weight.requires_grad = False bn.bias.requires_grad = False - running_mean = torch.randn_like(bn.running_mean, requires_grad=False) - running_var = torch.rand_like(bn.running_var, requires_grad=False) + running_mean = torch.randn_like(bn.running_mean, requires_grad=False, dtype=torch.float32) + running_var = torch.rand_like(bn.running_var, requires_grad=False, dtype=torch.float32) + gamma = bn.weight / torch.sqrt(running_var + bn.eps) beta = bn.bias - running_mean * bn.weight / torch.sqrt(running_var + bn.eps) ofmap = ifmap * gamma.unsqueeze(-1).unsqueeze(-1) + beta.unsqueeze(-1).unsqueeze(-1) - return ofmap, gamma, beta + return ofmap.to(orig_dtype), gamma.to(orig_dtype), beta.to(orig_dtype) def fused_conv( ifmap, weights, bn_k, bn_l, padding, stride, bn, relu, accumulate, depthwise ): + # Universal Fix: Upcast EVERYTHING before starting the math + orig_dtype = ifmap.dtype + ifmap = ifmap.float() + weights = weights.float() + bn_k = bn_k.float() + bn_l = bn_l.float() ih, iw, ci = ifmap.shape if not 
depthwise: @@ -440,14 +443,13 @@ def fused_conv( iw + padding["padding_x_left"] + padding["padding_x_right"], ci, requires_grad=False, - dtype=ifmap.dtype, + dtype=ifmap.dtype, # Safely uses float32 ) ifmap_padded[ padding["padding_y_top"] : ih + padding["padding_y_top"], padding["padding_x_left"] : iw + padding["padding_x_left"], ] = ifmap - # Don't cover undefined behaviour when there are steps without a complete kernel window if (ifmap_padded.shape[0] - (fh - 1) - 1) % stride["stride_y"] != 0: print("Warning: rounding h output dimension") if (ifmap_padded.shape[1] - (fw - 1) - 1) % stride["stride_x"] != 0: @@ -458,16 +460,16 @@ def fused_conv( (ifmap_padded.shape[1] - (fw - 1) - 1) // stride["stride_x"] + 1, co, ) + if accumulate: - ofmap_before = torch.randn_like(ofmap, requires_grad=False) + ofmap_before = torch.randn_like(ofmap, requires_grad=False, dtype=torch.float32) else: - ofmap_before = torch.zeros_like(ofmap, requires_grad=False) + ofmap_before = torch.zeros_like(ofmap, requires_grad=False, dtype=torch.float32) if verbose: print(ifmap.shape, ifmap_padded.shape, ofmap.shape) if depthwise: - # depthwise Conv2d for h in range(0, ifmap_padded.shape[0] - (fh - 1), stride["stride_y"]): for w in range(0, ifmap_padded.shape[1] - (fw - 1), stride["stride_x"]): for c in range(co): @@ -478,7 +480,6 @@ def fused_conv( weights[:, :, c].flatten(), ) else: - # Conv2d for h in range(0, ifmap_padded.shape[0] - (fh - 1), stride["stride_y"]): for w in range(0, ifmap_padded.shape[1] - (fw - 1), stride["stride_x"]): for c in range(co): @@ -491,29 +492,22 @@ def fused_conv( ofmap += ofmap_before - # BatchNorm if bn: ofmap = ofmap * bn_k + bn_l - # ReLU if relu: ofmap = torch.nn.functional.relu(ofmap) - return ofmap, ofmap_before, ifmap_padded + # Universal Fix: Downcast back + return ofmap.to(orig_dtype), ofmap_before.to(orig_dtype), ifmap_padded.to(orig_dtype) def main(): - parser = argparse.ArgumentParser(description="Generate data for kernels") parser.add_argument( - 
"-c", - "--cfg", - type=pathlib.Path, - required=True, - help="Select param config file kernel", + "-c", "--cfg", type=pathlib.Path, required=True, help="Select param config file kernel", ) parser.add_argument("-v", "--verbose", action="store_true", help="Set verbose") - args = parser.parse_args() global verbose @@ -532,31 +526,18 @@ def main(): dtype = torch.float32 if param["kernel"] == "Conv2d": + # Generate safely in float32, then cast ifmap = torch.randn( - 1, - param["channels"]["in"], - param["input_dim"]["height"], - param["input_dim"]["width"], - requires_grad=False, - dtype=dtype, - ) + 1, param["channels"]["in"], param["input_dim"]["height"], param["input_dim"]["width"], + requires_grad=False, dtype=torch.float32, + ).to(dtype) weights = torch.randn( - param["channels"]["out"], - param["channels"]["in"], - param["filter"]["height"], - param["filter"]["width"], - requires_grad=False, - dtype=dtype, - ) + param["channels"]["out"], param["channels"]["in"], param["filter"]["height"], param["filter"]["width"], + requires_grad=False, dtype=torch.float32, + ).to(dtype) - ofmap = conv2d( - ifmap, - weights, - padding=param["filter"]["padding"], - stride=param["filter"]["stride"], - ) + ofmap = conv2d(ifmap, weights, padding=param["filter"]["padding"], stride=param["filter"]["stride"]) - # convert from CHW to HWC format ifmap = ifmap.permute(0, 2, 3, 1) ofmap = ofmap.permute(0, 2, 3, 1) weights = weights.permute(0, 2, 3, 1) @@ -568,7 +549,8 @@ def main(): mat_B, bits_B = rand_data_generator((param["K"], param["N"]), param["prec"]) mat_C, bits_C = rand_data_generator((param["M"], param["N"]), param["prec"]) - result = torch.matmul(mat_A, mat_B) + # Upcast for CPU Math + result = torch.matmul(mat_A.float(), mat_B.float()).to(dtype) if param["transpose_A"]: mat_A = mat_A.T @@ -576,38 +558,22 @@ def main(): mat_B = mat_B.T kwargs = { - "A": mat_A, - "B": mat_B, - "C": mat_C, - "result": result, - "M": param["M"], - "N": param["N"], - "K": param["K"], - "ta": 
param["transpose_A"], - "tb": param["transpose_B"], - "alpha": param["alpha"], - "prec": param["prec"], - "expand": param["expand"], - "bits_A": bits_A, - "bits_B": bits_B, - "bits_C": bits_C, + "A": mat_A, "B": mat_B, "C": mat_C, "result": result, + "M": param["M"], "N": param["N"], "K": param["K"], + "ta": param["transpose_A"], "tb": param["transpose_B"], "alpha": param["alpha"], + "prec": param["prec"], "expand": param["expand"], + "bits_A": bits_A, "bits_B": bits_B, "bits_C": bits_C, } - emit_header_file("GEMM", **kwargs) elif param["kernel"] == "BatchNorm": ifmap = torch.randn( - 1, - param["channels"]["in"], - param["input_dim"]["height"], - param["input_dim"]["width"], - requires_grad=False, - dtype=dtype, - ) + 1, param["channels"]["in"], param["input_dim"]["height"], param["input_dim"]["width"], + requires_grad=False, dtype=torch.float32, + ).to(dtype) ofmap, gamma, beta = batchnorm(ifmap) - # convert from CHW to HWC format ifmap = ifmap.permute(0, 2, 3, 1) ofmap = ofmap.permute(0, 2, 3, 1) @@ -616,17 +582,12 @@ def main(): elif param["kernel"] == "MaxPool": ifmap = torch.randn( - 1, - param["channels"]["in"], - param["input_dim"]["height"], - param["input_dim"]["width"], - requires_grad=False, - dtype=dtype, - ) + 1, param["channels"]["in"], param["input_dim"]["height"], param["input_dim"]["width"], + requires_grad=False, dtype=torch.float32, + ).to(dtype) ofmap = max_pooling(ifmap, param["kernel_size"]) - # convert from CHW to HWC format ifmap = ifmap.permute(0, 2, 3, 1) ofmap = ofmap.permute(0, 2, 3, 1) @@ -635,44 +596,28 @@ def main(): elif param["kernel"] == "FusedConv": ifmap = torch.randn( - param["dim_in_y"], - param["dim_in_x"], - param["ch_in"], - requires_grad=False, - dtype=dtype, - ) + param["dim_in_y"], param["dim_in_x"], param["ch_in"], + requires_grad=False, dtype=torch.float32, + ).to(dtype) + if not param["depthwise"]: kernel = torch.randn( - param["ch_out"], - param["dim_kernel_y"], - param["dim_kernel_x"], - param["ch_in"], - 
requires_grad=False, - dtype=dtype, - ) + param["ch_out"], param["dim_kernel_y"], param["dim_kernel_x"], param["ch_in"], + requires_grad=False, dtype=torch.float32, + ).to(dtype) else: kernel = torch.randn( - param["dim_kernel_y"], - param["dim_kernel_x"], - param["ch_in"], - requires_grad=False, - dtype=dtype, - ) + param["dim_kernel_y"], param["dim_kernel_x"], param["ch_in"], + requires_grad=False, dtype=torch.float32, + ).to(dtype) - bn_k = torch.randn(param["ch_out"], requires_grad=False) - bn_l = torch.randn(param["ch_out"], requires_grad=False) + bn_k = torch.randn(param["ch_out"], requires_grad=False, dtype=torch.float32).to(dtype) + bn_l = torch.randn(param["ch_out"], requires_grad=False, dtype=torch.float32).to(dtype) ofmap, ofmap_before, ifmap_padded = fused_conv( - ifmap, - kernel, - bn_k, - bn_l, - param["padding"], - param["stride"], - param["flags"]["flag_batch_norm"], - param["flags"]["flag_relu"], - not param["flags"]["flag_y_accumulate_start"], - param["depthwise"], + ifmap, kernel, bn_k, bn_l, param["padding"], param["stride"], + param["flags"]["flag_batch_norm"], param["flags"]["flag_relu"], + not param["flags"]["flag_y_accumulate_start"], param["depthwise"], ) if param["chw_layer"]: @@ -681,25 +626,15 @@ def main(): kernel = kernel.permute(0, 3, 1, 2) kwargs = { - "ifmap": ifmap, - "ifmap_padded": ifmap_padded, - "ofmap": ofmap, - "ofmap_before": ofmap_before, - "kernel": kernel, - "bn_k": bn_k, - "bn_l": bn_l, - "padding": param["padding"], - "stride": param["stride"], - "prec": param["prec"], - "flags": param["flags"], - "depthwise": param["depthwise"], - "chw_layer": param["chw_layer"], + "ifmap": ifmap, "ifmap_padded": ifmap_padded, "ofmap": ofmap, + "ofmap_before": ofmap_before, "kernel": kernel, "bn_k": bn_k, "bn_l": bn_l, + "padding": param["padding"], "stride": param["stride"], "prec": param["prec"], + "flags": param["flags"], "depthwise": param["depthwise"], "chw_layer": param["chw_layer"], } emit_header_file("FusedConv", **kwargs) 
else: print("No valid kernel selected") - if __name__ == "__main__": main() diff --git a/sw/spatzBenchmarks/sdotp-bp-fmatmul/script/gen_data.py b/sw/spatzBenchmarks/sdotp-bp-fmatmul/script/gen_data.py index 9918713e..0c500558 100755 --- a/sw/spatzBenchmarks/sdotp-bp-fmatmul/script/gen_data.py +++ b/sw/spatzBenchmarks/sdotp-bp-fmatmul/script/gen_data.py @@ -42,7 +42,6 @@ def array_to_cstr(a, fmt=float): def emit_header_file(layer_type: str, **kwargs): - file_path = pathlib.Path(__file__).parent.parent / "data" file_path.mkdir(parents=True, exist_ok=True) emit_str = ( @@ -96,7 +95,7 @@ def emit_conv2d_layer(name="conv2d", **kwargs): layer_str += f'static double {name}_result[{oh}][{ow}][{co}] __attribute__((section(".data")));\n\n' layer_str += ( f"static double {name}_checksum[{oh}][{ow}] = " - + array_to_cstr(torch.sum(ofmap, dim=-1)) + + array_to_cstr(torch.sum(ofmap.float(), dim=-1)) # Safe sum for checksum + ";\n\n\n" ) layer_str += ( @@ -119,7 +118,6 @@ def emit_conv2d_layer(name="conv2d", **kwargs): def emit_linear_layer(input, weights, ofmap): - layer_str = "" return layer_str @@ -167,23 +165,23 @@ def emit_GEMM_layer(name="gemm", **kwargs): + ";\n\n\n" ) layer_str += ( - f"static const {dtype} {name}_checksum[{m}] = " - + array_to_cstr(torch.sum(result, dim=-1)) + f"static const {dtype} {name}_result[{m}*{n}] = " + + array_to_cstr(torch.sum(result.float(), dim=-1)) + ";\n\n\n" ) else: layer_str += ( - f"static {dtype} {name}_A_dram [{m}*{k}] = " + f"static {dtype} {name}_A_dram [{m}][{k}] = " + array_to_cstr(kwargs["bits_A"], fmt="char") + ";\n\n\n" ) layer_str += ( - f"static {dtype} {name}_B_dram [{k}*{n}] = " + f"static {dtype} {name}_B_dram [{k}][{n}] = " + array_to_cstr(kwargs["bits_B"], fmt="char") + ";\n\n\n" ) layer_str += ( - f"static {dtype} {name}_C_dram [{m}*{n}] = " + f"static {dtype} {name}_C_dram [{m}][{n}] = " + array_to_cstr(kwargs["bits_C"], fmt="char") + ";\n\n\n" ) @@ -192,7 +190,6 @@ def emit_GEMM_layer(name="gemm", **kwargs): def 
emit_batchnorm_layer(name="batchnorm", **kwargs): - ifmap = kwargs["ifmap"] ofmap = kwargs["ofmap"] beta = kwargs["beta"] @@ -215,7 +212,7 @@ def emit_batchnorm_layer(name="batchnorm", **kwargs): layer_str += f'static double {name}_result[{oh}][{ow}][{co}] __attribute__((section(".data")));\n\n' layer_str += ( f"static double {name}_checksum[{oh}][{ow}] = " - + array_to_cstr(torch.sum(ofmap, dim=-1)) + + array_to_cstr(torch.sum(ofmap.float(), dim=-1)) + ";\n\n\n" ) layer_str += ( @@ -239,7 +236,6 @@ def emit_batchnorm_layer(name="batchnorm", **kwargs): def emit_maxpool_layer(name="maxpool", **kwargs): - ifmap = kwargs["ifmap"] ofmap = kwargs["ofmap"] k = kwargs["kernel_size"] @@ -263,7 +259,7 @@ def emit_maxpool_layer(name="maxpool", **kwargs): layer_str += f'static double {name}_result[{oh}][{ow}][{co}] __attribute__((section(".data")));\n\n' layer_str += ( f"static double {name}_checksum[{oh}][{ow}] = " - + array_to_cstr(torch.sum(ofmap, dim=-1)) + + array_to_cstr(torch.sum(ofmap.float(), dim=-1)) + ";\n\n\n" ) layer_str += ( @@ -281,7 +277,6 @@ def emit_maxpool_layer(name="maxpool", **kwargs): def emit_fusedconv(name="fusedconv", **kwargs): - ifmap = kwargs["ifmap"] kernel = kwargs["kernel"] bn_k = kwargs["bn_k"] @@ -363,70 +358,78 @@ def rand_data_generator(shape, prec, alt=False): return torch.randn(shape, requires_grad=False, dtype=torch.float32), {} elif prec == 16: if alt: - return torch.randn(shape, requires_grad=False, dtype=torch.bfloat16), {} + # Universal Fix: Generate FP32, cast to BF16 + return torch.randn(shape, requires_grad=False, dtype=torch.float32).to(torch.bfloat16), {} else: - return ( - torch.randn( - shape, requires_grad=False, dtype=torch.float16, device=device - ), - {}, - ) + # Universal Fix: Generate FP32, cast to FP16 + return torch.randn(shape, requires_grad=False, dtype=torch.float32, device=device).to(torch.float16), {} elif prec == 8: - sign = torch.randint( - 0, 2, shape, requires_grad=False, dtype=torch.uint8 - ) # -1 or 1 - 
exponent = torch.randint( - 0, 16, shape, requires_grad=False, dtype=torch.uint8 - ) # < 0b01111 - mantissa = torch.randint( - 0, 4, shape, requires_grad=False, dtype=torch.uint8 - ) # can be arbitrary + sign = torch.randint(0, 2, shape, requires_grad=False, dtype=torch.uint8) + exponent = torch.randint(0, 16, shape, requires_grad=False, dtype=torch.uint8) + mantissa = torch.randint(0, 4, shape, requires_grad=False, dtype=torch.uint8) bits = {"sign": sign, "exponent": exponent, "mantissa": mantissa} - # TODO: not actually correct return ((-1.0) ** sign.double()) * (2.0 ** (exponent.double() - 15.0)) * ( 1.0 + mantissa.double() / (2**2) ), bits def conv2d(ifmap, weights, padding=1, stride=1): + # Universal Fix: Upcast for CPU Math + orig_dtype = ifmap.dtype + ifmap, weights = ifmap.float(), weights.float() + n, ci, ih, iw = ifmap.shape co, _, fh, fw = weights.shape conv2d = nn.Conv2d(ci, co, (fh, fw), padding=((fh - 1) // 2, (fw - 1) // 2)) conv2d.weight = nn.Parameter(weights, requires_grad=False) - conv2d.bias = nn.Parameter( - torch.zeros_like(conv2d.bias, dtype=weights.dtype), requires_grad=False - ) + # Ensure bias natively generates in float32 + conv2d.bias = nn.Parameter(torch.zeros_like(conv2d.bias, dtype=torch.float32), requires_grad=False) + ofmap = conv2d(ifmap) - return ofmap + # Universal Fix: Downcast back + return ofmap.to(orig_dtype) def max_pooling(ifmap, kernel): + orig_dtype = ifmap.dtype + ifmap = ifmap.float() + n, ci, ih, iw = ifmap.shape max_pool = nn.MaxPool2d(kernel_size=kernel) ofmap = max_pool(ifmap) - return ofmap + return ofmap.to(orig_dtype) def batchnorm(ifmap): + orig_dtype = ifmap.dtype + ifmap = ifmap.float() + n, ci, ih, iw = ifmap.shape bn = torch.nn.BatchNorm2d(ci) bn.weight.requires_grad = False bn.bias.requires_grad = False - running_mean = torch.randn_like(bn.running_mean, requires_grad=False) - running_var = torch.rand_like(bn.running_var, requires_grad=False) + running_mean = torch.randn_like(bn.running_mean, 
requires_grad=False, dtype=torch.float32) + running_var = torch.rand_like(bn.running_var, requires_grad=False, dtype=torch.float32) + gamma = bn.weight / torch.sqrt(running_var + bn.eps) beta = bn.bias - running_mean * bn.weight / torch.sqrt(running_var + bn.eps) ofmap = ifmap * gamma.unsqueeze(-1).unsqueeze(-1) + beta.unsqueeze(-1).unsqueeze(-1) - return ofmap, gamma, beta + return ofmap.to(orig_dtype), gamma.to(orig_dtype), beta.to(orig_dtype) def fused_conv( ifmap, weights, bn_k, bn_l, padding, stride, bn, relu, accumulate, depthwise ): + # Universal Fix: Upcast EVERYTHING before starting the math + orig_dtype = ifmap.dtype + ifmap = ifmap.float() + weights = weights.float() + bn_k = bn_k.float() + bn_l = bn_l.float() ih, iw, ci = ifmap.shape if not depthwise: @@ -440,14 +443,13 @@ def fused_conv( iw + padding["padding_x_left"] + padding["padding_x_right"], ci, requires_grad=False, - dtype=ifmap.dtype, + dtype=ifmap.dtype, # Safely uses float32 ) ifmap_padded[ padding["padding_y_top"] : ih + padding["padding_y_top"], padding["padding_x_left"] : iw + padding["padding_x_left"], ] = ifmap - # Don't cover undefined behaviour when there are steps without a complete kernel window if (ifmap_padded.shape[0] - (fh - 1) - 1) % stride["stride_y"] != 0: print("Warning: rounding h output dimension") if (ifmap_padded.shape[1] - (fw - 1) - 1) % stride["stride_x"] != 0: @@ -458,16 +460,16 @@ def fused_conv( (ifmap_padded.shape[1] - (fw - 1) - 1) // stride["stride_x"] + 1, co, ) + if accumulate: - ofmap_before = torch.randn_like(ofmap, requires_grad=False) + ofmap_before = torch.randn_like(ofmap, requires_grad=False, dtype=torch.float32) else: - ofmap_before = torch.zeros_like(ofmap, requires_grad=False) + ofmap_before = torch.zeros_like(ofmap, requires_grad=False, dtype=torch.float32) if verbose: print(ifmap.shape, ifmap_padded.shape, ofmap.shape) if depthwise: - # depthwise Conv2d for h in range(0, ifmap_padded.shape[0] - (fh - 1), stride["stride_y"]): for w in range(0, 
ifmap_padded.shape[1] - (fw - 1), stride["stride_x"]): for c in range(co): @@ -478,7 +480,6 @@ def fused_conv( weights[:, :, c].flatten(), ) else: - # Conv2d for h in range(0, ifmap_padded.shape[0] - (fh - 1), stride["stride_y"]): for w in range(0, ifmap_padded.shape[1] - (fw - 1), stride["stride_x"]): for c in range(co): @@ -491,29 +492,22 @@ def fused_conv( ofmap += ofmap_before - # BatchNorm if bn: ofmap = ofmap * bn_k + bn_l - # ReLU if relu: ofmap = torch.nn.functional.relu(ofmap) - return ofmap, ofmap_before, ifmap_padded + # Universal Fix: Downcast back + return ofmap.to(orig_dtype), ofmap_before.to(orig_dtype), ifmap_padded.to(orig_dtype) def main(): - parser = argparse.ArgumentParser(description="Generate data for kernels") parser.add_argument( - "-c", - "--cfg", - type=pathlib.Path, - required=True, - help="Select param config file kernel", + "-c", "--cfg", type=pathlib.Path, required=True, help="Select param config file kernel", ) parser.add_argument("-v", "--verbose", action="store_true", help="Set verbose") - args = parser.parse_args() global verbose @@ -532,31 +526,18 @@ def main(): dtype = torch.float32 if param["kernel"] == "Conv2d": + # Generate safely in float32, then cast ifmap = torch.randn( - 1, - param["channels"]["in"], - param["input_dim"]["height"], - param["input_dim"]["width"], - requires_grad=False, - dtype=dtype, - ) + 1, param["channels"]["in"], param["input_dim"]["height"], param["input_dim"]["width"], + requires_grad=False, dtype=torch.float32, + ).to(dtype) weights = torch.randn( - param["channels"]["out"], - param["channels"]["in"], - param["filter"]["height"], - param["filter"]["width"], - requires_grad=False, - dtype=dtype, - ) + param["channels"]["out"], param["channels"]["in"], param["filter"]["height"], param["filter"]["width"], + requires_grad=False, dtype=torch.float32, + ).to(dtype) - ofmap = conv2d( - ifmap, - weights, - padding=param["filter"]["padding"], - stride=param["filter"]["stride"], - ) + ofmap = conv2d(ifmap, 
weights, padding=param["filter"]["padding"], stride=param["filter"]["stride"]) - # convert from CHW to HWC format ifmap = ifmap.permute(0, 2, 3, 1) ofmap = ofmap.permute(0, 2, 3, 1) weights = weights.permute(0, 2, 3, 1) @@ -568,7 +549,8 @@ def main(): mat_B, bits_B = rand_data_generator((param["K"], param["N"]), param["prec"]) mat_C, bits_C = rand_data_generator((param["M"], param["N"]), param["prec"]) - result = torch.matmul(mat_A, mat_B) + # Upcast for CPU Math + result = torch.matmul(mat_A.float(), mat_B.float()).to(dtype) if param["transpose_A"]: mat_A = mat_A.T @@ -592,22 +574,16 @@ def main(): "bits_B": bits_B, "bits_C": bits_C, } - emit_header_file("GEMM", **kwargs) elif param["kernel"] == "BatchNorm": ifmap = torch.randn( - 1, - param["channels"]["in"], - param["input_dim"]["height"], - param["input_dim"]["width"], - requires_grad=False, - dtype=dtype, - ) + 1, param["channels"]["in"], param["input_dim"]["height"], param["input_dim"]["width"], + requires_grad=False, dtype=torch.float32, + ).to(dtype) ofmap, gamma, beta = batchnorm(ifmap) - # convert from CHW to HWC format ifmap = ifmap.permute(0, 2, 3, 1) ofmap = ofmap.permute(0, 2, 3, 1) @@ -616,17 +592,12 @@ def main(): elif param["kernel"] == "MaxPool": ifmap = torch.randn( - 1, - param["channels"]["in"], - param["input_dim"]["height"], - param["input_dim"]["width"], - requires_grad=False, - dtype=dtype, - ) + 1, param["channels"]["in"], param["input_dim"]["height"], param["input_dim"]["width"], + requires_grad=False, dtype=torch.float32, + ).to(dtype) ofmap = max_pooling(ifmap, param["kernel_size"]) - # convert from CHW to HWC format ifmap = ifmap.permute(0, 2, 3, 1) ofmap = ofmap.permute(0, 2, 3, 1) @@ -635,44 +606,28 @@ def main(): elif param["kernel"] == "FusedConv": ifmap = torch.randn( - param["dim_in_y"], - param["dim_in_x"], - param["ch_in"], - requires_grad=False, - dtype=dtype, - ) + param["dim_in_y"], param["dim_in_x"], param["ch_in"], + requires_grad=False, dtype=torch.float32, + ).to(dtype) 
+ if not param["depthwise"]: kernel = torch.randn( - param["ch_out"], - param["dim_kernel_y"], - param["dim_kernel_x"], - param["ch_in"], - requires_grad=False, - dtype=dtype, - ) + param["ch_out"], param["dim_kernel_y"], param["dim_kernel_x"], param["ch_in"], + requires_grad=False, dtype=torch.float32, + ).to(dtype) else: kernel = torch.randn( - param["dim_kernel_y"], - param["dim_kernel_x"], - param["ch_in"], - requires_grad=False, - dtype=dtype, - ) + param["dim_kernel_y"], param["dim_kernel_x"], param["ch_in"], + requires_grad=False, dtype=torch.float32, + ).to(dtype) - bn_k = torch.randn(param["ch_out"], requires_grad=False) - bn_l = torch.randn(param["ch_out"], requires_grad=False) + bn_k = torch.randn(param["ch_out"], requires_grad=False, dtype=torch.float32).to(dtype) + bn_l = torch.randn(param["ch_out"], requires_grad=False, dtype=torch.float32).to(dtype) ofmap, ofmap_before, ifmap_padded = fused_conv( - ifmap, - kernel, - bn_k, - bn_l, - param["padding"], - param["stride"], - param["flags"]["flag_batch_norm"], - param["flags"]["flag_relu"], - not param["flags"]["flag_y_accumulate_start"], - param["depthwise"], + ifmap, kernel, bn_k, bn_l, param["padding"], param["stride"], + param["flags"]["flag_batch_norm"], param["flags"]["flag_relu"], + not param["flags"]["flag_y_accumulate_start"], param["depthwise"], ) if param["chw_layer"]: @@ -700,6 +655,5 @@ def main(): else: print("No valid kernel selected") - if __name__ == "__main__": main() diff --git a/sw/spatzBenchmarks/sdotp-hp-fmatmul/script/gen_data.py b/sw/spatzBenchmarks/sdotp-hp-fmatmul/script/gen_data.py index 0cd18994..0c500558 100755 --- a/sw/spatzBenchmarks/sdotp-hp-fmatmul/script/gen_data.py +++ b/sw/spatzBenchmarks/sdotp-hp-fmatmul/script/gen_data.py @@ -42,7 +42,6 @@ def array_to_cstr(a, fmt=float): def emit_header_file(layer_type: str, **kwargs): - file_path = pathlib.Path(__file__).parent.parent / "data" file_path.mkdir(parents=True, exist_ok=True) emit_str = ( @@ -96,7 +95,7 @@ def 
emit_conv2d_layer(name="conv2d", **kwargs): layer_str += f'static double {name}_result[{oh}][{ow}][{co}] __attribute__((section(".data")));\n\n' layer_str += ( f"static double {name}_checksum[{oh}][{ow}] = " - + array_to_cstr(torch.sum(ofmap, dim=-1)) + + array_to_cstr(torch.sum(ofmap.float(), dim=-1)) # Safe sum for checksum + ";\n\n\n" ) layer_str += ( @@ -119,7 +118,6 @@ def emit_conv2d_layer(name="conv2d", **kwargs): def emit_linear_layer(input, weights, ofmap): - layer_str = "" return layer_str @@ -168,7 +166,7 @@ def emit_GEMM_layer(name="gemm", **kwargs): ) layer_str += ( f"static const {dtype} {name}_result[{m}*{n}] = " - + array_to_cstr(result) + + array_to_cstr(torch.sum(result.float(), dim=-1)) + ";\n\n\n" ) else: @@ -192,7 +190,6 @@ def emit_GEMM_layer(name="gemm", **kwargs): def emit_batchnorm_layer(name="batchnorm", **kwargs): - ifmap = kwargs["ifmap"] ofmap = kwargs["ofmap"] beta = kwargs["beta"] @@ -215,7 +212,7 @@ def emit_batchnorm_layer(name="batchnorm", **kwargs): layer_str += f'static double {name}_result[{oh}][{ow}][{co}] __attribute__((section(".data")));\n\n' layer_str += ( f"static double {name}_checksum[{oh}][{ow}] = " - + array_to_cstr(torch.sum(ofmap, dim=-1)) + + array_to_cstr(torch.sum(ofmap.float(), dim=-1)) + ";\n\n\n" ) layer_str += ( @@ -239,7 +236,6 @@ def emit_batchnorm_layer(name="batchnorm", **kwargs): def emit_maxpool_layer(name="maxpool", **kwargs): - ifmap = kwargs["ifmap"] ofmap = kwargs["ofmap"] k = kwargs["kernel_size"] @@ -263,7 +259,7 @@ def emit_maxpool_layer(name="maxpool", **kwargs): layer_str += f'static double {name}_result[{oh}][{ow}][{co}] __attribute__((section(".data")));\n\n' layer_str += ( f"static double {name}_checksum[{oh}][{ow}] = " - + array_to_cstr(torch.sum(ofmap, dim=-1)) + + array_to_cstr(torch.sum(ofmap.float(), dim=-1)) + ";\n\n\n" ) layer_str += ( @@ -281,7 +277,6 @@ def emit_maxpool_layer(name="maxpool", **kwargs): def emit_fusedconv(name="fusedconv", **kwargs): - ifmap = kwargs["ifmap"] kernel = 
kwargs["kernel"] bn_k = kwargs["bn_k"] @@ -363,70 +358,78 @@ def rand_data_generator(shape, prec, alt=False): return torch.randn(shape, requires_grad=False, dtype=torch.float32), {} elif prec == 16: if alt: - return torch.randn(shape, requires_grad=False, dtype=torch.bfloat16), {} + # Universal Fix: Generate FP32, cast to BF16 + return torch.randn(shape, requires_grad=False, dtype=torch.float32).to(torch.bfloat16), {} else: - return ( - torch.randn( - shape, requires_grad=False, dtype=torch.float16, device=device - ), - {}, - ) + # Universal Fix: Generate FP32, cast to FP16 + return torch.randn(shape, requires_grad=False, dtype=torch.float32, device=device).to(torch.float16), {} elif prec == 8: - sign = torch.randint( - 0, 2, shape, requires_grad=False, dtype=torch.uint8 - ) # -1 or 1 - exponent = torch.randint( - 0, 16, shape, requires_grad=False, dtype=torch.uint8 - ) # < 0b01111 - mantissa = torch.randint( - 0, 4, shape, requires_grad=False, dtype=torch.uint8 - ) # can be arbitrary + sign = torch.randint(0, 2, shape, requires_grad=False, dtype=torch.uint8) + exponent = torch.randint(0, 16, shape, requires_grad=False, dtype=torch.uint8) + mantissa = torch.randint(0, 4, shape, requires_grad=False, dtype=torch.uint8) bits = {"sign": sign, "exponent": exponent, "mantissa": mantissa} - # TODO: not actually correct return ((-1.0) ** sign.double()) * (2.0 ** (exponent.double() - 15.0)) * ( 1.0 + mantissa.double() / (2**2) ), bits def conv2d(ifmap, weights, padding=1, stride=1): + # Universal Fix: Upcast for CPU Math + orig_dtype = ifmap.dtype + ifmap, weights = ifmap.float(), weights.float() + n, ci, ih, iw = ifmap.shape co, _, fh, fw = weights.shape conv2d = nn.Conv2d(ci, co, (fh, fw), padding=((fh - 1) // 2, (fw - 1) // 2)) conv2d.weight = nn.Parameter(weights, requires_grad=False) - conv2d.bias = nn.Parameter( - torch.zeros_like(conv2d.bias, dtype=weights.dtype), requires_grad=False - ) + # Ensure bias natively generates in float32 + conv2d.bias = 
nn.Parameter(torch.zeros_like(conv2d.bias, dtype=torch.float32), requires_grad=False) + ofmap = conv2d(ifmap) - return ofmap + # Universal Fix: Downcast back + return ofmap.to(orig_dtype) def max_pooling(ifmap, kernel): + orig_dtype = ifmap.dtype + ifmap = ifmap.float() + n, ci, ih, iw = ifmap.shape max_pool = nn.MaxPool2d(kernel_size=kernel) ofmap = max_pool(ifmap) - return ofmap + return ofmap.to(orig_dtype) def batchnorm(ifmap): + orig_dtype = ifmap.dtype + ifmap = ifmap.float() + n, ci, ih, iw = ifmap.shape bn = torch.nn.BatchNorm2d(ci) bn.weight.requires_grad = False bn.bias.requires_grad = False - running_mean = torch.randn_like(bn.running_mean, requires_grad=False) - running_var = torch.rand_like(bn.running_var, requires_grad=False) + running_mean = torch.randn_like(bn.running_mean, requires_grad=False, dtype=torch.float32) + running_var = torch.rand_like(bn.running_var, requires_grad=False, dtype=torch.float32) + gamma = bn.weight / torch.sqrt(running_var + bn.eps) beta = bn.bias - running_mean * bn.weight / torch.sqrt(running_var + bn.eps) ofmap = ifmap * gamma.unsqueeze(-1).unsqueeze(-1) + beta.unsqueeze(-1).unsqueeze(-1) - return ofmap, gamma, beta + return ofmap.to(orig_dtype), gamma.to(orig_dtype), beta.to(orig_dtype) def fused_conv( ifmap, weights, bn_k, bn_l, padding, stride, bn, relu, accumulate, depthwise ): + # Universal Fix: Upcast EVERYTHING before starting the math + orig_dtype = ifmap.dtype + ifmap = ifmap.float() + weights = weights.float() + bn_k = bn_k.float() + bn_l = bn_l.float() ih, iw, ci = ifmap.shape if not depthwise: @@ -440,14 +443,13 @@ def fused_conv( iw + padding["padding_x_left"] + padding["padding_x_right"], ci, requires_grad=False, - dtype=ifmap.dtype, + dtype=ifmap.dtype, # Safely uses float32 ) ifmap_padded[ padding["padding_y_top"] : ih + padding["padding_y_top"], padding["padding_x_left"] : iw + padding["padding_x_left"], ] = ifmap - # Don't cover undefined behaviour when there are steps without a complete kernel window if 
(ifmap_padded.shape[0] - (fh - 1) - 1) % stride["stride_y"] != 0: print("Warning: rounding h output dimension") if (ifmap_padded.shape[1] - (fw - 1) - 1) % stride["stride_x"] != 0: @@ -458,16 +460,16 @@ def fused_conv( (ifmap_padded.shape[1] - (fw - 1) - 1) // stride["stride_x"] + 1, co, ) + if accumulate: - ofmap_before = torch.randn_like(ofmap, requires_grad=False) + ofmap_before = torch.randn_like(ofmap, requires_grad=False, dtype=torch.float32) else: - ofmap_before = torch.zeros_like(ofmap, requires_grad=False) + ofmap_before = torch.zeros_like(ofmap, requires_grad=False, dtype=torch.float32) if verbose: print(ifmap.shape, ifmap_padded.shape, ofmap.shape) if depthwise: - # depthwise Conv2d for h in range(0, ifmap_padded.shape[0] - (fh - 1), stride["stride_y"]): for w in range(0, ifmap_padded.shape[1] - (fw - 1), stride["stride_x"]): for c in range(co): @@ -478,7 +480,6 @@ def fused_conv( weights[:, :, c].flatten(), ) else: - # Conv2d for h in range(0, ifmap_padded.shape[0] - (fh - 1), stride["stride_y"]): for w in range(0, ifmap_padded.shape[1] - (fw - 1), stride["stride_x"]): for c in range(co): @@ -491,29 +492,22 @@ def fused_conv( ofmap += ofmap_before - # BatchNorm if bn: ofmap = ofmap * bn_k + bn_l - # ReLU if relu: ofmap = torch.nn.functional.relu(ofmap) - return ofmap, ofmap_before, ifmap_padded + # Universal Fix: Downcast back + return ofmap.to(orig_dtype), ofmap_before.to(orig_dtype), ifmap_padded.to(orig_dtype) def main(): - parser = argparse.ArgumentParser(description="Generate data for kernels") parser.add_argument( - "-c", - "--cfg", - type=pathlib.Path, - required=True, - help="Select param config file kernel", + "-c", "--cfg", type=pathlib.Path, required=True, help="Select param config file kernel", ) parser.add_argument("-v", "--verbose", action="store_true", help="Set verbose") - args = parser.parse_args() global verbose @@ -532,31 +526,18 @@ def main(): dtype = torch.float32 if param["kernel"] == "Conv2d": + # Generate safely in float32, then 
cast ifmap = torch.randn( - 1, - param["channels"]["in"], - param["input_dim"]["height"], - param["input_dim"]["width"], - requires_grad=False, - dtype=dtype, - ) + 1, param["channels"]["in"], param["input_dim"]["height"], param["input_dim"]["width"], + requires_grad=False, dtype=torch.float32, + ).to(dtype) weights = torch.randn( - param["channels"]["out"], - param["channels"]["in"], - param["filter"]["height"], - param["filter"]["width"], - requires_grad=False, - dtype=dtype, - ) + param["channels"]["out"], param["channels"]["in"], param["filter"]["height"], param["filter"]["width"], + requires_grad=False, dtype=torch.float32, + ).to(dtype) - ofmap = conv2d( - ifmap, - weights, - padding=param["filter"]["padding"], - stride=param["filter"]["stride"], - ) + ofmap = conv2d(ifmap, weights, padding=param["filter"]["padding"], stride=param["filter"]["stride"]) - # convert from CHW to HWC format ifmap = ifmap.permute(0, 2, 3, 1) ofmap = ofmap.permute(0, 2, 3, 1) weights = weights.permute(0, 2, 3, 1) @@ -568,7 +549,8 @@ def main(): mat_B, bits_B = rand_data_generator((param["K"], param["N"]), param["prec"]) mat_C, bits_C = rand_data_generator((param["M"], param["N"]), param["prec"]) - result = torch.matmul(mat_A, mat_B) + # Upcast for CPU Math + result = torch.matmul(mat_A.float(), mat_B.float()).to(dtype) if param["transpose_A"]: mat_A = mat_A.T @@ -592,22 +574,16 @@ def main(): "bits_B": bits_B, "bits_C": bits_C, } - emit_header_file("GEMM", **kwargs) elif param["kernel"] == "BatchNorm": ifmap = torch.randn( - 1, - param["channels"]["in"], - param["input_dim"]["height"], - param["input_dim"]["width"], - requires_grad=False, - dtype=dtype, - ) + 1, param["channels"]["in"], param["input_dim"]["height"], param["input_dim"]["width"], + requires_grad=False, dtype=torch.float32, + ).to(dtype) ofmap, gamma, beta = batchnorm(ifmap) - # convert from CHW to HWC format ifmap = ifmap.permute(0, 2, 3, 1) ofmap = ofmap.permute(0, 2, 3, 1) @@ -616,17 +592,12 @@ def main(): elif 
param["kernel"] == "MaxPool": ifmap = torch.randn( - 1, - param["channels"]["in"], - param["input_dim"]["height"], - param["input_dim"]["width"], - requires_grad=False, - dtype=dtype, - ) + 1, param["channels"]["in"], param["input_dim"]["height"], param["input_dim"]["width"], + requires_grad=False, dtype=torch.float32, + ).to(dtype) ofmap = max_pooling(ifmap, param["kernel_size"]) - # convert from CHW to HWC format ifmap = ifmap.permute(0, 2, 3, 1) ofmap = ofmap.permute(0, 2, 3, 1) @@ -635,44 +606,28 @@ def main(): elif param["kernel"] == "FusedConv": ifmap = torch.randn( - param["dim_in_y"], - param["dim_in_x"], - param["ch_in"], - requires_grad=False, - dtype=dtype, - ) + param["dim_in_y"], param["dim_in_x"], param["ch_in"], + requires_grad=False, dtype=torch.float32, + ).to(dtype) + if not param["depthwise"]: kernel = torch.randn( - param["ch_out"], - param["dim_kernel_y"], - param["dim_kernel_x"], - param["ch_in"], - requires_grad=False, - dtype=dtype, - ) + param["ch_out"], param["dim_kernel_y"], param["dim_kernel_x"], param["ch_in"], + requires_grad=False, dtype=torch.float32, + ).to(dtype) else: kernel = torch.randn( - param["dim_kernel_y"], - param["dim_kernel_x"], - param["ch_in"], - requires_grad=False, - dtype=dtype, - ) + param["dim_kernel_y"], param["dim_kernel_x"], param["ch_in"], + requires_grad=False, dtype=torch.float32, + ).to(dtype) - bn_k = torch.randn(param["ch_out"], requires_grad=False) - bn_l = torch.randn(param["ch_out"], requires_grad=False) + bn_k = torch.randn(param["ch_out"], requires_grad=False, dtype=torch.float32).to(dtype) + bn_l = torch.randn(param["ch_out"], requires_grad=False, dtype=torch.float32).to(dtype) ofmap, ofmap_before, ifmap_padded = fused_conv( - ifmap, - kernel, - bn_k, - bn_l, - param["padding"], - param["stride"], - param["flags"]["flag_batch_norm"], - param["flags"]["flag_relu"], - not param["flags"]["flag_y_accumulate_start"], - param["depthwise"], + ifmap, kernel, bn_k, bn_l, param["padding"], param["stride"], + 
param["flags"]["flag_batch_norm"], param["flags"]["flag_relu"], + not param["flags"]["flag_y_accumulate_start"], param["depthwise"], ) if param["chw_layer"]: @@ -700,6 +655,5 @@ def main(): else: print("No valid kernel selected") - if __name__ == "__main__": main() diff --git a/sw/spatzBenchmarks/widening-bp-fmatmul/script/gen_data.py b/sw/spatzBenchmarks/widening-bp-fmatmul/script/gen_data.py index 9918713e..0c500558 100755 --- a/sw/spatzBenchmarks/widening-bp-fmatmul/script/gen_data.py +++ b/sw/spatzBenchmarks/widening-bp-fmatmul/script/gen_data.py @@ -42,7 +42,6 @@ def array_to_cstr(a, fmt=float): def emit_header_file(layer_type: str, **kwargs): - file_path = pathlib.Path(__file__).parent.parent / "data" file_path.mkdir(parents=True, exist_ok=True) emit_str = ( @@ -96,7 +95,7 @@ def emit_conv2d_layer(name="conv2d", **kwargs): layer_str += f'static double {name}_result[{oh}][{ow}][{co}] __attribute__((section(".data")));\n\n' layer_str += ( f"static double {name}_checksum[{oh}][{ow}] = " - + array_to_cstr(torch.sum(ofmap, dim=-1)) + + array_to_cstr(torch.sum(ofmap.float(), dim=-1)) # Safe sum for checksum + ";\n\n\n" ) layer_str += ( @@ -119,7 +118,6 @@ def emit_conv2d_layer(name="conv2d", **kwargs): def emit_linear_layer(input, weights, ofmap): - layer_str = "" return layer_str @@ -167,23 +165,23 @@ def emit_GEMM_layer(name="gemm", **kwargs): + ";\n\n\n" ) layer_str += ( - f"static const {dtype} {name}_checksum[{m}] = " - + array_to_cstr(torch.sum(result, dim=-1)) + f"static const {dtype} {name}_result[{m}*{n}] = " + + array_to_cstr(torch.sum(result.float(), dim=-1)) + ";\n\n\n" ) else: layer_str += ( - f"static {dtype} {name}_A_dram [{m}*{k}] = " + f"static {dtype} {name}_A_dram [{m}][{k}] = " + array_to_cstr(kwargs["bits_A"], fmt="char") + ";\n\n\n" ) layer_str += ( - f"static {dtype} {name}_B_dram [{k}*{n}] = " + f"static {dtype} {name}_B_dram [{k}][{n}] = " + array_to_cstr(kwargs["bits_B"], fmt="char") + ";\n\n\n" ) layer_str += ( - f"static {dtype} 
{name}_C_dram [{m}*{n}] = " + f"static {dtype} {name}_C_dram [{m}][{n}] = " + array_to_cstr(kwargs["bits_C"], fmt="char") + ";\n\n\n" ) @@ -192,7 +190,6 @@ def emit_GEMM_layer(name="gemm", **kwargs): def emit_batchnorm_layer(name="batchnorm", **kwargs): - ifmap = kwargs["ifmap"] ofmap = kwargs["ofmap"] beta = kwargs["beta"] @@ -215,7 +212,7 @@ def emit_batchnorm_layer(name="batchnorm", **kwargs): layer_str += f'static double {name}_result[{oh}][{ow}][{co}] __attribute__((section(".data")));\n\n' layer_str += ( f"static double {name}_checksum[{oh}][{ow}] = " - + array_to_cstr(torch.sum(ofmap, dim=-1)) + + array_to_cstr(torch.sum(ofmap.float(), dim=-1)) + ";\n\n\n" ) layer_str += ( @@ -239,7 +236,6 @@ def emit_batchnorm_layer(name="batchnorm", **kwargs): def emit_maxpool_layer(name="maxpool", **kwargs): - ifmap = kwargs["ifmap"] ofmap = kwargs["ofmap"] k = kwargs["kernel_size"] @@ -263,7 +259,7 @@ def emit_maxpool_layer(name="maxpool", **kwargs): layer_str += f'static double {name}_result[{oh}][{ow}][{co}] __attribute__((section(".data")));\n\n' layer_str += ( f"static double {name}_checksum[{oh}][{ow}] = " - + array_to_cstr(torch.sum(ofmap, dim=-1)) + + array_to_cstr(torch.sum(ofmap.float(), dim=-1)) + ";\n\n\n" ) layer_str += ( @@ -281,7 +277,6 @@ def emit_maxpool_layer(name="maxpool", **kwargs): def emit_fusedconv(name="fusedconv", **kwargs): - ifmap = kwargs["ifmap"] kernel = kwargs["kernel"] bn_k = kwargs["bn_k"] @@ -363,70 +358,78 @@ def rand_data_generator(shape, prec, alt=False): return torch.randn(shape, requires_grad=False, dtype=torch.float32), {} elif prec == 16: if alt: - return torch.randn(shape, requires_grad=False, dtype=torch.bfloat16), {} + # Universal Fix: Generate FP32, cast to BF16 + return torch.randn(shape, requires_grad=False, dtype=torch.float32).to(torch.bfloat16), {} else: - return ( - torch.randn( - shape, requires_grad=False, dtype=torch.float16, device=device - ), - {}, - ) + # Universal Fix: Generate FP32, cast to FP16 + return 
torch.randn(shape, requires_grad=False, dtype=torch.float32, device=device).to(torch.float16), {} elif prec == 8: - sign = torch.randint( - 0, 2, shape, requires_grad=False, dtype=torch.uint8 - ) # -1 or 1 - exponent = torch.randint( - 0, 16, shape, requires_grad=False, dtype=torch.uint8 - ) # < 0b01111 - mantissa = torch.randint( - 0, 4, shape, requires_grad=False, dtype=torch.uint8 - ) # can be arbitrary + sign = torch.randint(0, 2, shape, requires_grad=False, dtype=torch.uint8) + exponent = torch.randint(0, 16, shape, requires_grad=False, dtype=torch.uint8) + mantissa = torch.randint(0, 4, shape, requires_grad=False, dtype=torch.uint8) bits = {"sign": sign, "exponent": exponent, "mantissa": mantissa} - # TODO: not actually correct return ((-1.0) ** sign.double()) * (2.0 ** (exponent.double() - 15.0)) * ( 1.0 + mantissa.double() / (2**2) ), bits def conv2d(ifmap, weights, padding=1, stride=1): + # Universal Fix: Upcast for CPU Math + orig_dtype = ifmap.dtype + ifmap, weights = ifmap.float(), weights.float() + n, ci, ih, iw = ifmap.shape co, _, fh, fw = weights.shape conv2d = nn.Conv2d(ci, co, (fh, fw), padding=((fh - 1) // 2, (fw - 1) // 2)) conv2d.weight = nn.Parameter(weights, requires_grad=False) - conv2d.bias = nn.Parameter( - torch.zeros_like(conv2d.bias, dtype=weights.dtype), requires_grad=False - ) + # Ensure bias natively generates in float32 + conv2d.bias = nn.Parameter(torch.zeros_like(conv2d.bias, dtype=torch.float32), requires_grad=False) + ofmap = conv2d(ifmap) - return ofmap + # Universal Fix: Downcast back + return ofmap.to(orig_dtype) def max_pooling(ifmap, kernel): + orig_dtype = ifmap.dtype + ifmap = ifmap.float() + n, ci, ih, iw = ifmap.shape max_pool = nn.MaxPool2d(kernel_size=kernel) ofmap = max_pool(ifmap) - return ofmap + return ofmap.to(orig_dtype) def batchnorm(ifmap): + orig_dtype = ifmap.dtype + ifmap = ifmap.float() + n, ci, ih, iw = ifmap.shape bn = torch.nn.BatchNorm2d(ci) bn.weight.requires_grad = False bn.bias.requires_grad = False 
- running_mean = torch.randn_like(bn.running_mean, requires_grad=False) - running_var = torch.rand_like(bn.running_var, requires_grad=False) + running_mean = torch.randn_like(bn.running_mean, requires_grad=False, dtype=torch.float32) + running_var = torch.rand_like(bn.running_var, requires_grad=False, dtype=torch.float32) + gamma = bn.weight / torch.sqrt(running_var + bn.eps) beta = bn.bias - running_mean * bn.weight / torch.sqrt(running_var + bn.eps) ofmap = ifmap * gamma.unsqueeze(-1).unsqueeze(-1) + beta.unsqueeze(-1).unsqueeze(-1) - return ofmap, gamma, beta + return ofmap.to(orig_dtype), gamma.to(orig_dtype), beta.to(orig_dtype) def fused_conv( ifmap, weights, bn_k, bn_l, padding, stride, bn, relu, accumulate, depthwise ): + # Universal Fix: Upcast EVERYTHING before starting the math + orig_dtype = ifmap.dtype + ifmap = ifmap.float() + weights = weights.float() + bn_k = bn_k.float() + bn_l = bn_l.float() ih, iw, ci = ifmap.shape if not depthwise: @@ -440,14 +443,13 @@ def fused_conv( iw + padding["padding_x_left"] + padding["padding_x_right"], ci, requires_grad=False, - dtype=ifmap.dtype, + dtype=ifmap.dtype, # Safely uses float32 ) ifmap_padded[ padding["padding_y_top"] : ih + padding["padding_y_top"], padding["padding_x_left"] : iw + padding["padding_x_left"], ] = ifmap - # Don't cover undefined behaviour when there are steps without a complete kernel window if (ifmap_padded.shape[0] - (fh - 1) - 1) % stride["stride_y"] != 0: print("Warning: rounding h output dimension") if (ifmap_padded.shape[1] - (fw - 1) - 1) % stride["stride_x"] != 0: @@ -458,16 +460,16 @@ def fused_conv( (ifmap_padded.shape[1] - (fw - 1) - 1) // stride["stride_x"] + 1, co, ) + if accumulate: - ofmap_before = torch.randn_like(ofmap, requires_grad=False) + ofmap_before = torch.randn_like(ofmap, requires_grad=False, dtype=torch.float32) else: - ofmap_before = torch.zeros_like(ofmap, requires_grad=False) + ofmap_before = torch.zeros_like(ofmap, requires_grad=False, dtype=torch.float32) if 
verbose: print(ifmap.shape, ifmap_padded.shape, ofmap.shape) if depthwise: - # depthwise Conv2d for h in range(0, ifmap_padded.shape[0] - (fh - 1), stride["stride_y"]): for w in range(0, ifmap_padded.shape[1] - (fw - 1), stride["stride_x"]): for c in range(co): @@ -478,7 +480,6 @@ def fused_conv( weights[:, :, c].flatten(), ) else: - # Conv2d for h in range(0, ifmap_padded.shape[0] - (fh - 1), stride["stride_y"]): for w in range(0, ifmap_padded.shape[1] - (fw - 1), stride["stride_x"]): for c in range(co): @@ -491,29 +492,22 @@ def fused_conv( ofmap += ofmap_before - # BatchNorm if bn: ofmap = ofmap * bn_k + bn_l - # ReLU if relu: ofmap = torch.nn.functional.relu(ofmap) - return ofmap, ofmap_before, ifmap_padded + # Universal Fix: Downcast back + return ofmap.to(orig_dtype), ofmap_before.to(orig_dtype), ifmap_padded.to(orig_dtype) def main(): - parser = argparse.ArgumentParser(description="Generate data for kernels") parser.add_argument( - "-c", - "--cfg", - type=pathlib.Path, - required=True, - help="Select param config file kernel", + "-c", "--cfg", type=pathlib.Path, required=True, help="Select param config file kernel", ) parser.add_argument("-v", "--verbose", action="store_true", help="Set verbose") - args = parser.parse_args() global verbose @@ -532,31 +526,18 @@ def main(): dtype = torch.float32 if param["kernel"] == "Conv2d": + # Generate safely in float32, then cast ifmap = torch.randn( - 1, - param["channels"]["in"], - param["input_dim"]["height"], - param["input_dim"]["width"], - requires_grad=False, - dtype=dtype, - ) + 1, param["channels"]["in"], param["input_dim"]["height"], param["input_dim"]["width"], + requires_grad=False, dtype=torch.float32, + ).to(dtype) weights = torch.randn( - param["channels"]["out"], - param["channels"]["in"], - param["filter"]["height"], - param["filter"]["width"], - requires_grad=False, - dtype=dtype, - ) + param["channels"]["out"], param["channels"]["in"], param["filter"]["height"], param["filter"]["width"], + 
requires_grad=False, dtype=torch.float32, + ).to(dtype) - ofmap = conv2d( - ifmap, - weights, - padding=param["filter"]["padding"], - stride=param["filter"]["stride"], - ) + ofmap = conv2d(ifmap, weights, padding=param["filter"]["padding"], stride=param["filter"]["stride"]) - # convert from CHW to HWC format ifmap = ifmap.permute(0, 2, 3, 1) ofmap = ofmap.permute(0, 2, 3, 1) weights = weights.permute(0, 2, 3, 1) @@ -568,7 +549,8 @@ def main(): mat_B, bits_B = rand_data_generator((param["K"], param["N"]), param["prec"]) mat_C, bits_C = rand_data_generator((param["M"], param["N"]), param["prec"]) - result = torch.matmul(mat_A, mat_B) + # Upcast for CPU Math + result = torch.matmul(mat_A.float(), mat_B.float()).to(dtype) if param["transpose_A"]: mat_A = mat_A.T @@ -592,22 +574,16 @@ def main(): "bits_B": bits_B, "bits_C": bits_C, } - emit_header_file("GEMM", **kwargs) elif param["kernel"] == "BatchNorm": ifmap = torch.randn( - 1, - param["channels"]["in"], - param["input_dim"]["height"], - param["input_dim"]["width"], - requires_grad=False, - dtype=dtype, - ) + 1, param["channels"]["in"], param["input_dim"]["height"], param["input_dim"]["width"], + requires_grad=False, dtype=torch.float32, + ).to(dtype) ofmap, gamma, beta = batchnorm(ifmap) - # convert from CHW to HWC format ifmap = ifmap.permute(0, 2, 3, 1) ofmap = ofmap.permute(0, 2, 3, 1) @@ -616,17 +592,12 @@ def main(): elif param["kernel"] == "MaxPool": ifmap = torch.randn( - 1, - param["channels"]["in"], - param["input_dim"]["height"], - param["input_dim"]["width"], - requires_grad=False, - dtype=dtype, - ) + 1, param["channels"]["in"], param["input_dim"]["height"], param["input_dim"]["width"], + requires_grad=False, dtype=torch.float32, + ).to(dtype) ofmap = max_pooling(ifmap, param["kernel_size"]) - # convert from CHW to HWC format ifmap = ifmap.permute(0, 2, 3, 1) ofmap = ofmap.permute(0, 2, 3, 1) @@ -635,44 +606,28 @@ def main(): elif param["kernel"] == "FusedConv": ifmap = torch.randn( - param["dim_in_y"], 
- param["dim_in_x"], - param["ch_in"], - requires_grad=False, - dtype=dtype, - ) + param["dim_in_y"], param["dim_in_x"], param["ch_in"], + requires_grad=False, dtype=torch.float32, + ).to(dtype) + if not param["depthwise"]: kernel = torch.randn( - param["ch_out"], - param["dim_kernel_y"], - param["dim_kernel_x"], - param["ch_in"], - requires_grad=False, - dtype=dtype, - ) + param["ch_out"], param["dim_kernel_y"], param["dim_kernel_x"], param["ch_in"], + requires_grad=False, dtype=torch.float32, + ).to(dtype) else: kernel = torch.randn( - param["dim_kernel_y"], - param["dim_kernel_x"], - param["ch_in"], - requires_grad=False, - dtype=dtype, - ) + param["dim_kernel_y"], param["dim_kernel_x"], param["ch_in"], + requires_grad=False, dtype=torch.float32, + ).to(dtype) - bn_k = torch.randn(param["ch_out"], requires_grad=False) - bn_l = torch.randn(param["ch_out"], requires_grad=False) + bn_k = torch.randn(param["ch_out"], requires_grad=False, dtype=torch.float32).to(dtype) + bn_l = torch.randn(param["ch_out"], requires_grad=False, dtype=torch.float32).to(dtype) ofmap, ofmap_before, ifmap_padded = fused_conv( - ifmap, - kernel, - bn_k, - bn_l, - param["padding"], - param["stride"], - param["flags"]["flag_batch_norm"], - param["flags"]["flag_relu"], - not param["flags"]["flag_y_accumulate_start"], - param["depthwise"], + ifmap, kernel, bn_k, bn_l, param["padding"], param["stride"], + param["flags"]["flag_batch_norm"], param["flags"]["flag_relu"], + not param["flags"]["flag_y_accumulate_start"], param["depthwise"], ) if param["chw_layer"]: @@ -700,6 +655,5 @@ def main(): else: print("No valid kernel selected") - if __name__ == "__main__": main() diff --git a/sw/spatzBenchmarks/widening-hp-fmatmul/script/gen_data.py b/sw/spatzBenchmarks/widening-hp-fmatmul/script/gen_data.py index 0cd18994..0c500558 100755 --- a/sw/spatzBenchmarks/widening-hp-fmatmul/script/gen_data.py +++ b/sw/spatzBenchmarks/widening-hp-fmatmul/script/gen_data.py @@ -42,7 +42,6 @@ def array_to_cstr(a, 
fmt=float): def emit_header_file(layer_type: str, **kwargs): - file_path = pathlib.Path(__file__).parent.parent / "data" file_path.mkdir(parents=True, exist_ok=True) emit_str = ( @@ -96,7 +95,7 @@ def emit_conv2d_layer(name="conv2d", **kwargs): layer_str += f'static double {name}_result[{oh}][{ow}][{co}] __attribute__((section(".data")));\n\n' layer_str += ( f"static double {name}_checksum[{oh}][{ow}] = " - + array_to_cstr(torch.sum(ofmap, dim=-1)) + + array_to_cstr(torch.sum(ofmap.float(), dim=-1)) # Safe sum for checksum + ";\n\n\n" ) layer_str += ( @@ -119,7 +118,6 @@ def emit_conv2d_layer(name="conv2d", **kwargs): def emit_linear_layer(input, weights, ofmap): - layer_str = "" return layer_str @@ -168,7 +166,7 @@ def emit_GEMM_layer(name="gemm", **kwargs): ) layer_str += ( f"static const {dtype} {name}_result[{m}*{n}] = " - + array_to_cstr(result) + + array_to_cstr(torch.sum(result.float(), dim=-1)) + ";\n\n\n" ) else: @@ -192,7 +190,6 @@ def emit_GEMM_layer(name="gemm", **kwargs): def emit_batchnorm_layer(name="batchnorm", **kwargs): - ifmap = kwargs["ifmap"] ofmap = kwargs["ofmap"] beta = kwargs["beta"] @@ -215,7 +212,7 @@ def emit_batchnorm_layer(name="batchnorm", **kwargs): layer_str += f'static double {name}_result[{oh}][{ow}][{co}] __attribute__((section(".data")));\n\n' layer_str += ( f"static double {name}_checksum[{oh}][{ow}] = " - + array_to_cstr(torch.sum(ofmap, dim=-1)) + + array_to_cstr(torch.sum(ofmap.float(), dim=-1)) + ";\n\n\n" ) layer_str += ( @@ -239,7 +236,6 @@ def emit_batchnorm_layer(name="batchnorm", **kwargs): def emit_maxpool_layer(name="maxpool", **kwargs): - ifmap = kwargs["ifmap"] ofmap = kwargs["ofmap"] k = kwargs["kernel_size"] @@ -263,7 +259,7 @@ def emit_maxpool_layer(name="maxpool", **kwargs): layer_str += f'static double {name}_result[{oh}][{ow}][{co}] __attribute__((section(".data")));\n\n' layer_str += ( f"static double {name}_checksum[{oh}][{ow}] = " - + array_to_cstr(torch.sum(ofmap, dim=-1)) + + 
array_to_cstr(torch.sum(ofmap.float(), dim=-1)) + ";\n\n\n" ) layer_str += ( @@ -281,7 +277,6 @@ def emit_maxpool_layer(name="maxpool", **kwargs): def emit_fusedconv(name="fusedconv", **kwargs): - ifmap = kwargs["ifmap"] kernel = kwargs["kernel"] bn_k = kwargs["bn_k"] @@ -363,70 +358,78 @@ def rand_data_generator(shape, prec, alt=False): return torch.randn(shape, requires_grad=False, dtype=torch.float32), {} elif prec == 16: if alt: - return torch.randn(shape, requires_grad=False, dtype=torch.bfloat16), {} + # Universal Fix: Generate FP32, cast to BF16 + return torch.randn(shape, requires_grad=False, dtype=torch.float32).to(torch.bfloat16), {} else: - return ( - torch.randn( - shape, requires_grad=False, dtype=torch.float16, device=device - ), - {}, - ) + # Universal Fix: Generate FP32, cast to FP16 + return torch.randn(shape, requires_grad=False, dtype=torch.float32, device=device).to(torch.float16), {} elif prec == 8: - sign = torch.randint( - 0, 2, shape, requires_grad=False, dtype=torch.uint8 - ) # -1 or 1 - exponent = torch.randint( - 0, 16, shape, requires_grad=False, dtype=torch.uint8 - ) # < 0b01111 - mantissa = torch.randint( - 0, 4, shape, requires_grad=False, dtype=torch.uint8 - ) # can be arbitrary + sign = torch.randint(0, 2, shape, requires_grad=False, dtype=torch.uint8) + exponent = torch.randint(0, 16, shape, requires_grad=False, dtype=torch.uint8) + mantissa = torch.randint(0, 4, shape, requires_grad=False, dtype=torch.uint8) bits = {"sign": sign, "exponent": exponent, "mantissa": mantissa} - # TODO: not actually correct return ((-1.0) ** sign.double()) * (2.0 ** (exponent.double() - 15.0)) * ( 1.0 + mantissa.double() / (2**2) ), bits def conv2d(ifmap, weights, padding=1, stride=1): + # Universal Fix: Upcast for CPU Math + orig_dtype = ifmap.dtype + ifmap, weights = ifmap.float(), weights.float() + n, ci, ih, iw = ifmap.shape co, _, fh, fw = weights.shape conv2d = nn.Conv2d(ci, co, (fh, fw), padding=((fh - 1) // 2, (fw - 1) // 2)) conv2d.weight = 
nn.Parameter(weights, requires_grad=False) - conv2d.bias = nn.Parameter( - torch.zeros_like(conv2d.bias, dtype=weights.dtype), requires_grad=False - ) + # Ensure bias natively generates in float32 + conv2d.bias = nn.Parameter(torch.zeros_like(conv2d.bias, dtype=torch.float32), requires_grad=False) + ofmap = conv2d(ifmap) - return ofmap + # Universal Fix: Downcast back + return ofmap.to(orig_dtype) def max_pooling(ifmap, kernel): + orig_dtype = ifmap.dtype + ifmap = ifmap.float() + n, ci, ih, iw = ifmap.shape max_pool = nn.MaxPool2d(kernel_size=kernel) ofmap = max_pool(ifmap) - return ofmap + return ofmap.to(orig_dtype) def batchnorm(ifmap): + orig_dtype = ifmap.dtype + ifmap = ifmap.float() + n, ci, ih, iw = ifmap.shape bn = torch.nn.BatchNorm2d(ci) bn.weight.requires_grad = False bn.bias.requires_grad = False - running_mean = torch.randn_like(bn.running_mean, requires_grad=False) - running_var = torch.rand_like(bn.running_var, requires_grad=False) + running_mean = torch.randn_like(bn.running_mean, requires_grad=False, dtype=torch.float32) + running_var = torch.rand_like(bn.running_var, requires_grad=False, dtype=torch.float32) + gamma = bn.weight / torch.sqrt(running_var + bn.eps) beta = bn.bias - running_mean * bn.weight / torch.sqrt(running_var + bn.eps) ofmap = ifmap * gamma.unsqueeze(-1).unsqueeze(-1) + beta.unsqueeze(-1).unsqueeze(-1) - return ofmap, gamma, beta + return ofmap.to(orig_dtype), gamma.to(orig_dtype), beta.to(orig_dtype) def fused_conv( ifmap, weights, bn_k, bn_l, padding, stride, bn, relu, accumulate, depthwise ): + # Universal Fix: Upcast EVERYTHING before starting the math + orig_dtype = ifmap.dtype + ifmap = ifmap.float() + weights = weights.float() + bn_k = bn_k.float() + bn_l = bn_l.float() ih, iw, ci = ifmap.shape if not depthwise: @@ -440,14 +443,13 @@ def fused_conv( iw + padding["padding_x_left"] + padding["padding_x_right"], ci, requires_grad=False, - dtype=ifmap.dtype, + dtype=ifmap.dtype, # Safely uses float32 ) ifmap_padded[ 
padding["padding_y_top"] : ih + padding["padding_y_top"], padding["padding_x_left"] : iw + padding["padding_x_left"], ] = ifmap - # Don't cover undefined behaviour when there are steps without a complete kernel window if (ifmap_padded.shape[0] - (fh - 1) - 1) % stride["stride_y"] != 0: print("Warning: rounding h output dimension") if (ifmap_padded.shape[1] - (fw - 1) - 1) % stride["stride_x"] != 0: @@ -458,16 +460,16 @@ def fused_conv( (ifmap_padded.shape[1] - (fw - 1) - 1) // stride["stride_x"] + 1, co, ) + if accumulate: - ofmap_before = torch.randn_like(ofmap, requires_grad=False) + ofmap_before = torch.randn_like(ofmap, requires_grad=False, dtype=torch.float32) else: - ofmap_before = torch.zeros_like(ofmap, requires_grad=False) + ofmap_before = torch.zeros_like(ofmap, requires_grad=False, dtype=torch.float32) if verbose: print(ifmap.shape, ifmap_padded.shape, ofmap.shape) if depthwise: - # depthwise Conv2d for h in range(0, ifmap_padded.shape[0] - (fh - 1), stride["stride_y"]): for w in range(0, ifmap_padded.shape[1] - (fw - 1), stride["stride_x"]): for c in range(co): @@ -478,7 +480,6 @@ def fused_conv( weights[:, :, c].flatten(), ) else: - # Conv2d for h in range(0, ifmap_padded.shape[0] - (fh - 1), stride["stride_y"]): for w in range(0, ifmap_padded.shape[1] - (fw - 1), stride["stride_x"]): for c in range(co): @@ -491,29 +492,22 @@ def fused_conv( ofmap += ofmap_before - # BatchNorm if bn: ofmap = ofmap * bn_k + bn_l - # ReLU if relu: ofmap = torch.nn.functional.relu(ofmap) - return ofmap, ofmap_before, ifmap_padded + # Universal Fix: Downcast back + return ofmap.to(orig_dtype), ofmap_before.to(orig_dtype), ifmap_padded.to(orig_dtype) def main(): - parser = argparse.ArgumentParser(description="Generate data for kernels") parser.add_argument( - "-c", - "--cfg", - type=pathlib.Path, - required=True, - help="Select param config file kernel", + "-c", "--cfg", type=pathlib.Path, required=True, help="Select param config file kernel", ) parser.add_argument("-v", 
"--verbose", action="store_true", help="Set verbose") - args = parser.parse_args() global verbose @@ -532,31 +526,18 @@ def main(): dtype = torch.float32 if param["kernel"] == "Conv2d": + # Generate safely in float32, then cast ifmap = torch.randn( - 1, - param["channels"]["in"], - param["input_dim"]["height"], - param["input_dim"]["width"], - requires_grad=False, - dtype=dtype, - ) + 1, param["channels"]["in"], param["input_dim"]["height"], param["input_dim"]["width"], + requires_grad=False, dtype=torch.float32, + ).to(dtype) weights = torch.randn( - param["channels"]["out"], - param["channels"]["in"], - param["filter"]["height"], - param["filter"]["width"], - requires_grad=False, - dtype=dtype, - ) + param["channels"]["out"], param["channels"]["in"], param["filter"]["height"], param["filter"]["width"], + requires_grad=False, dtype=torch.float32, + ).to(dtype) - ofmap = conv2d( - ifmap, - weights, - padding=param["filter"]["padding"], - stride=param["filter"]["stride"], - ) + ofmap = conv2d(ifmap, weights, padding=param["filter"]["padding"], stride=param["filter"]["stride"]) - # convert from CHW to HWC format ifmap = ifmap.permute(0, 2, 3, 1) ofmap = ofmap.permute(0, 2, 3, 1) weights = weights.permute(0, 2, 3, 1) @@ -568,7 +549,8 @@ def main(): mat_B, bits_B = rand_data_generator((param["K"], param["N"]), param["prec"]) mat_C, bits_C = rand_data_generator((param["M"], param["N"]), param["prec"]) - result = torch.matmul(mat_A, mat_B) + # Upcast for CPU Math + result = torch.matmul(mat_A.float(), mat_B.float()).to(dtype) if param["transpose_A"]: mat_A = mat_A.T @@ -592,22 +574,16 @@ def main(): "bits_B": bits_B, "bits_C": bits_C, } - emit_header_file("GEMM", **kwargs) elif param["kernel"] == "BatchNorm": ifmap = torch.randn( - 1, - param["channels"]["in"], - param["input_dim"]["height"], - param["input_dim"]["width"], - requires_grad=False, - dtype=dtype, - ) + 1, param["channels"]["in"], param["input_dim"]["height"], param["input_dim"]["width"], + 
requires_grad=False, dtype=torch.float32, + ).to(dtype) ofmap, gamma, beta = batchnorm(ifmap) - # convert from CHW to HWC format ifmap = ifmap.permute(0, 2, 3, 1) ofmap = ofmap.permute(0, 2, 3, 1) @@ -616,17 +592,12 @@ def main(): elif param["kernel"] == "MaxPool": ifmap = torch.randn( - 1, - param["channels"]["in"], - param["input_dim"]["height"], - param["input_dim"]["width"], - requires_grad=False, - dtype=dtype, - ) + 1, param["channels"]["in"], param["input_dim"]["height"], param["input_dim"]["width"], + requires_grad=False, dtype=torch.float32, + ).to(dtype) ofmap = max_pooling(ifmap, param["kernel_size"]) - # convert from CHW to HWC format ifmap = ifmap.permute(0, 2, 3, 1) ofmap = ofmap.permute(0, 2, 3, 1) @@ -635,44 +606,28 @@ def main(): elif param["kernel"] == "FusedConv": ifmap = torch.randn( - param["dim_in_y"], - param["dim_in_x"], - param["ch_in"], - requires_grad=False, - dtype=dtype, - ) + param["dim_in_y"], param["dim_in_x"], param["ch_in"], + requires_grad=False, dtype=torch.float32, + ).to(dtype) + if not param["depthwise"]: kernel = torch.randn( - param["ch_out"], - param["dim_kernel_y"], - param["dim_kernel_x"], - param["ch_in"], - requires_grad=False, - dtype=dtype, - ) + param["ch_out"], param["dim_kernel_y"], param["dim_kernel_x"], param["ch_in"], + requires_grad=False, dtype=torch.float32, + ).to(dtype) else: kernel = torch.randn( - param["dim_kernel_y"], - param["dim_kernel_x"], - param["ch_in"], - requires_grad=False, - dtype=dtype, - ) + param["dim_kernel_y"], param["dim_kernel_x"], param["ch_in"], + requires_grad=False, dtype=torch.float32, + ).to(dtype) - bn_k = torch.randn(param["ch_out"], requires_grad=False) - bn_l = torch.randn(param["ch_out"], requires_grad=False) + bn_k = torch.randn(param["ch_out"], requires_grad=False, dtype=torch.float32).to(dtype) + bn_l = torch.randn(param["ch_out"], requires_grad=False, dtype=torch.float32).to(dtype) ofmap, ofmap_before, ifmap_padded = fused_conv( - ifmap, - kernel, - bn_k, - bn_l, - 
param["padding"], - param["stride"], - param["flags"]["flag_batch_norm"], - param["flags"]["flag_relu"], - not param["flags"]["flag_y_accumulate_start"], - param["depthwise"], + ifmap, kernel, bn_k, bn_l, param["padding"], param["stride"], + param["flags"]["flag_batch_norm"], param["flags"]["flag_relu"], + not param["flags"]["flag_y_accumulate_start"], param["depthwise"], ) if param["chw_layer"]: @@ -700,6 +655,5 @@ def main(): else: print("No valid kernel selected") - if __name__ == "__main__": main() From e629968fda2a15f9eaff1316e4f71d221deba672 Mon Sep 17 00:00:00 2001 From: Diyou Shen Date: Fri, 24 Apr 2026 10:38:16 +0200 Subject: [PATCH 02/13] [SW] Add sparse-attention gemv kernel. Kernel includes two parts: 1. non-zero element finding; 2. calculation on non-zeros. --- sw/spatzBenchmarks/CMakeLists.txt | 3 + sw/spatzBenchmarks/sa-gemv/data/layer.h | 16 + sw/spatzBenchmarks/sa-gemv/kernel/sa-gemv.c | 165 ++++++++ sw/spatzBenchmarks/sa-gemv/kernel/sa-gemv.h | 26 ++ sw/spatzBenchmarks/sa-gemv/main.c | 393 ++++++++++++++++++ sw/spatzBenchmarks/sa-gemv/script/gen_data.py | 224 ++++++++++ .../script/sa_gemv_128_4096_512_16.json | 16 + .../sa-gemv/script/sa_gemv_256_128_16_64.json | 16 + 8 files changed, 859 insertions(+) create mode 100644 sw/spatzBenchmarks/sa-gemv/data/layer.h create mode 100644 sw/spatzBenchmarks/sa-gemv/kernel/sa-gemv.c create mode 100644 sw/spatzBenchmarks/sa-gemv/kernel/sa-gemv.h create mode 100644 sw/spatzBenchmarks/sa-gemv/main.c create mode 100644 sw/spatzBenchmarks/sa-gemv/script/gen_data.py create mode 100644 sw/spatzBenchmarks/sa-gemv/script/sa_gemv_128_4096_512_16.json create mode 100644 sw/spatzBenchmarks/sa-gemv/script/sa_gemv_256_128_16_64.json diff --git a/sw/spatzBenchmarks/CMakeLists.txt b/sw/spatzBenchmarks/CMakeLists.txt index 70f72e5c..7c7d5170 100644 --- a/sw/spatzBenchmarks/CMakeLists.txt +++ b/sw/spatzBenchmarks/CMakeLists.txt @@ -110,6 +110,7 @@ if (ELEN EQUAL 64) add_spatz_test_threeParam(dp-fmatmul dp-fmatmul/main.c 64 64 
64 ) add_spatz_test_twoParam_type(dp-gemv gemv/main.c 64 128 64) + add_spatz_test_threeParam_type(dp-sa-gemv sa-gemv/main.c 256 128 16 64) add_spatz_test_oneParam(dp-faxpy dp-faxpy/main.c 256) add_spatz_test_oneParam(dp-faxpy dp-faxpy/main.c 1024) @@ -121,6 +122,7 @@ if (ELEN EQUAL 64) add_spatz_test_threeParam(dp-fconv2d dp-fconv2d/main.c 64 64 7) add_spatz_test_twoParam(dp-fft dp-fft/main.c 128 2) + endif() add_spatz_test_threeParam(sp-fmatmul sp-fmatmul/main.c 64 64 64 ) @@ -150,6 +152,7 @@ add_spatz_test_threeParam(sdotp-bp-fmatmul sdotp-bp-fmatmul/main.c 128 256 128) add_spatz_test_twoParam_type(sp-gemv gemv/main.c 128 128 32) add_spatz_test_twoParam_type(hp-gemv gemv/main.c 256 128 16) +add_spatz_test_threeParam_type(hp-sa-gemv sa-gemv/main.c 128 4096 512 16) add_spatz_test_twoParam(sp-fft sp-fft/main.c 256 2) add_spatz_test_twoParam(sp-fft sp-fft/main.c 512 2) diff --git a/sw/spatzBenchmarks/sa-gemv/data/layer.h b/sw/spatzBenchmarks/sa-gemv/data/layer.h new file mode 100644 index 00000000..0a07ad53 --- /dev/null +++ b/sw/spatzBenchmarks/sa-gemv/data/layer.h @@ -0,0 +1,16 @@ +// Copyright 2025 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include + +typedef enum { FP64 = 8, FP32 = 4, FP16 = 2, FP8 = 1 } precision_t; + +typedef struct gemv_layer_struct { + uint32_t M; + uint32_t N; + + precision_t dtype; +} gemv_layer; diff --git a/sw/spatzBenchmarks/sa-gemv/kernel/sa-gemv.c b/sw/spatzBenchmarks/sa-gemv/kernel/sa-gemv.c new file mode 100644 index 00000000..4e641041 --- /dev/null +++ b/sw/spatzBenchmarks/sa-gemv/kernel/sa-gemv.c @@ -0,0 +1,165 @@ +// Copyright 2025 ETH Zurich and University of Bologna. +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Author: Navaneeth Kunhi Purayil, ETH Zurich +// Author: Diyou Shen, ETH Zurich + +#include "sa-gemv.h" + +void gemv_v64b_m4(double *a, double* b, double* c, int M, int M_core, int N) { + unsigned int vl, avl = M_core; + double *a_, *a_start = a; + double *c_ = c; + + do { + a_ = a_start; + double *b_ = b; + asm volatile("vsetvli %0, %1, e64, m4, ta, ma" : "=r"(vl) : "r"(avl)); + + // 1. CLEAR ACCUMULATORS for every new vl block (0 encodes to +0.0 float) + asm volatile("vmv.v.i v4, 0"); + asm volatile("vmv.v.i v12, 0"); + + int col = 0; + for (; col < N - 1; col += 2) { + asm volatile("vle64.v v0, (%0)" ::"r"(a_)); + a_ += M; + asm volatile("vfmacc.vf v4, %0, v0" ::"f"(*b_)); + b_++; + + asm volatile("vle64.v v8, (%0)" ::"r"(a_)); + a_ += M; + asm volatile("vfmacc.vf v12, %0, v8" ::"f"(*b_)); + b_++; + } + + // 2. HANDLE ODD N BOUNDARY + if (col < N) { + asm volatile("vle64.v v0, (%0)" ::"r"(a_)); + a_ += M; + asm volatile("vfmacc.vf v4, %0, v0" ::"f"(*b_)); + } + + asm volatile("vfadd.vv v4, v4, v12"); + + // 3. 
ACCUMULATE INTO MEMORY C (Load -> Add -> Store) + asm volatile("vle64.v v16, (%0)" ::"r"(c_)); + asm volatile("vfadd.vv v4, v4, v16"); + asm volatile("vse64.v v4, (%0)" ::"r"(c_)); + + avl -= vl; + c_ += vl; + a_start += vl; + } while (avl > 0); +} + +void gemv_v32b_m4(float *a, float* b, float* c, int M, int M_core, int N) { + unsigned int vl, avl = M_core; + float *a_, *a_start = a; + float *c_ = c; + + do { + a_ = a_start; + float *b_ = b; + asm volatile("vsetvli %0, %1, e32, m4, ta, ma" : "=r"(vl) : "r"(avl)); + + asm volatile("vmv.v.i v4, 0"); + asm volatile("vmv.v.i v12, 0"); + + int col = 0; + for (; col < N - 1; col += 2) { + asm volatile("vle32.v v0, (%0)" ::"r"(a_)); + a_ += M; + asm volatile("vfmacc.vf v4, %0, v0" ::"f"(*b_)); + b_++; + + asm volatile("vle32.v v8, (%0)" ::"r"(a_)); + a_ += M; + asm volatile("vfmacc.vf v12, %0, v8" ::"f"(*b_)); + b_++; + } + + if (col < N) { + asm volatile("vle32.v v0, (%0)" ::"r"(a_)); + a_ += M; + asm volatile("vfmacc.vf v4, %0, v0" ::"f"(*b_)); + } + + asm volatile("vfadd.vv v4, v4, v12"); + + asm volatile("vle32.v v16, (%0)" ::"r"(c_)); + asm volatile("vfadd.vv v4, v4, v16"); + // v12 was already folded into v4 by the vfadd above, so v4 holds
+ // the complete partial sums for this block; v12 must not be the
+ // source of the write-back. Store the accumulated result into C
+ // from the correct accumulator register:
+ asm volatile("vse32.v v4, (%0)" ::"r"(c_));
+ + avl -= vl; + c_ += vl; + a_start += vl; + } while (avl > 0); +} + +void gemv_v16b_m4(__fp16 *a, __fp16* b, __fp16* c, int M, int M_core, int N) { + unsigned int vl, avl = M_core; + __fp16 *a_, *a_start = a; + __fp16 *c_ = c; + + do { + a_ = a_start; + __fp16 *b_ = b; + asm volatile("vsetvli %0, %1, e16, m4, ta, ma" : "=r"(vl) : "r"(avl)); + + asm volatile("vmv.v.i v4, 0"); + asm volatile("vmv.v.i v12, 0"); + + int col = 0; + for (; col < N - 1; col += 2) { + asm volatile("vle16.v v0, (%0)" ::"r"(a_)); + a_ += M; + asm volatile("vle16.v v8, (%0)" ::"r"(a_)); + a_ += M; + + float t0, t1; + asm volatile("flh %[t], 0(%[b])" : [t] "=f"(t0) : [b] "r"(b_)); + asm volatile("vfmacc.vf v4, %0, v0" ::"f"(t0)); + b_++; + + asm volatile("flh %[t], 0(%[b])" : [t] "=f"(t1) : [b] "r"(b_)); + asm volatile("vfmacc.vf v12, %0, v8" ::"f"(t1)); + b_++; + } + + if (col < N) { + asm volatile("vle16.v v0, (%0)" ::"r"(a_)); + a_ += M; + float t0; + asm volatile("flh %[t], 0(%[b])" : [t] "=f"(t0) : [b] "r"(b_)); + asm volatile("vfmacc.vf v4, %0, v0" ::"f"(t0)); + } + + asm volatile("vfadd.vv v4, v4, v12"); + + asm volatile("vle16.v v16, (%0)" ::"r"(c_)); + asm volatile("vfadd.vv v4, v4, v16"); + asm volatile("vse16.v v4, (%0)" ::"r"(c_)); + + avl -= vl; + c_ += vl; + a_start += vl; + } while (avl > 0); +} diff --git a/sw/spatzBenchmarks/sa-gemv/kernel/sa-gemv.h b/sw/spatzBenchmarks/sa-gemv/kernel/sa-gemv.h new file mode 100644 index 00000000..1bf377e1 --- /dev/null +++ b/sw/spatzBenchmarks/sa-gemv/kernel/sa-gemv.h @@ -0,0 +1,26 @@ +// Copyright 2025 ETH Zurich and University of Bologna. +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Author: Navaneeth Kunhi Purayil, ETH Zurich + +#ifndef _GEMV_H +#define _GEMV_H + +void gemv_v64b_m4(double *a, double* b, double* c, int M, int M_core, int N); +void gemv_v32b_m4(float *a, float* b, float* c, int M, int M_core, int N); +void gemv_v16b_m4(__fp16 *a, __fp16* b, __fp16* c, int M, int M_core, int N); + +#endif diff --git a/sw/spatzBenchmarks/sa-gemv/main.c b/sw/spatzBenchmarks/sa-gemv/main.c new file mode 100644 index 00000000..e046c7df --- /dev/null +++ b/sw/spatzBenchmarks/sa-gemv/main.c @@ -0,0 +1,393 @@ +// Copyright 2025 ETH Zurich and University of Bologna. +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +// Author: Diyou Shen, ETH Zurich +// Author: Navaneeth Kunhi Purayil, ETH Zurich + +#include +#include +#include +#include + +#include DATAHEADER +#include "kernel/sa-gemv.c" + +#if (PREC == 64) +#define T double +#elif (PREC == 32) +#define T float +#elif (PREC == 16) +#define T __fp16 +#else +#define T double +#endif + +// Debugging defines +// #define DEBUG_NZ +// #define DEBUG_NZ_IDX +// #define DEBUG_GEMV_PreLD +// #define DEBUG_GEMV_DB + +T *vec_buf0; +T *vec_buf1; +uint16_t *dense_idx; +T *dense_vec; +T *mat_buf0; +T *mat_buf1; +T *result; + +static inline int fp_check(const T *a, const T *b) { + const T threshold = 0.001; + + // Absolute value + double comp = (double)*a - (double)*b; + if (comp < 0) + comp = -comp; + + return comp > threshold; +} + +int main() { + const unsigned int num_cores = snrt_cluster_core_num(); + const unsigned int cid = snrt_cluster_core_idx(); + + // Reset timer + unsigned int timer = (unsigned int)-1; + unsigned int timer_best = (unsigned int)-1; + unsigned int timer_nz = (unsigned int)-1; + const unsigned int m_core = gemv_l.M / num_cores; + // Size (in KiB) of L1 SPM, used to calculate tiling window + const unsigned int spm_size = 128; + + // For Sparse Attention GEMV, we need several steps + // 1. Find all non-zeros + // 2. Calculate the GEMV + // What to be double buffered? + // 1. 2 chunks of sparse vector + densed vector (output) + densed idx + // 2. 
2 chunks of matrix + densed vector (output) + densed idx + + // Sizes of each part we need + const uint32_t mat_size = sizeof(T) * gemv_l.M * tot_nz_dram; + const uint32_t row_size = sizeof(T) * gemv_l.M; + const uint32_t vec_size = sizeof(T) * gemv_l.N; + const uint32_t dense_vec_size = sizeof(T) * tot_nz_dram; + const uint32_t dense_idx_size = sizeof(uint16_t) * tot_nz_dram; + const uint32_t result_size = sizeof(T) * gemv_l.M; + + // leave 8 KiB for Stack + const uint32_t l1_size = (spm_size - 8) * 1024; + const uint32_t fixed_alloc_size = dense_vec_size + dense_idx_size + result_size; + + // --- BOUNDS CHECK 1: Do the fixed arrays fit in L1? --- + if (fixed_alloc_size >= l1_size) { + if (cid == 0) { + printf("FATAL: L1 Memory Overflow! Fixed arrays require %u bytes, but only %u bytes available.\n", + fixed_alloc_size, l1_size); + } + snrt_cluster_hw_barrier(); + return -1; // Exit gracefully + } + + const uint32_t l1_for_chunk = l1_size - fixed_alloc_size; + + // How many whole rows (or columns) can fit in half the L1 chunk space? + const uint32_t num_row_mat = (l1_for_chunk / 2) / row_size; + + // --- BOUNDS CHECK 2: Can we double buffer at least 1 row? --- + if (num_row_mat < 1) { + if (cid == 0) { + printf("FATAL: L1 Memory Overflow! Cannot fit at least 2 rows for double buffering. " + "Chunk space left: %u bytes, Row size: %u bytes.\n", + l1_for_chunk, row_size); + } + snrt_cluster_hw_barrier(); + return -1; // Exit gracefully + } + + // Always strictly split the available memory in half for double-buffering + const uint32_t vec_chunk_size = l1_for_chunk / 2; + const uint32_t num_vec_chunk = (l1_for_chunk > vec_size) ? 
1 : ((vec_size + vec_chunk_size - 1) / vec_chunk_size); + + // Recalculate exact chunk size based on whole rows + const uint32_t mat_chunk_size = num_row_mat * row_size; + + // Number of chunks based on the total non-zeros we need to process + const uint32_t num_mat_chunk = (tot_nz_dram + num_row_mat - 1) / num_row_mat; + + // Number of elements in each chunk + const uint32_t vec_chunk_len = vec_chunk_size / sizeof(T); + + + // Memory Allocation + if (cid == 0) { + result = (T *)snrt_l1alloc(result_size); + dense_vec = (T *)snrt_l1alloc(dense_vec_size); + vec_buf0 = (T *)snrt_l1alloc(l1_for_chunk); + dense_idx = (uint16_t *)snrt_l1alloc(dense_idx_size); + + // Offset by half of the size if needed by double buffering + vec_buf1 = vec_buf0 + vec_chunk_len; + + mat_buf0 = vec_buf0; + mat_buf1 = vec_buf1; + } + + // MUST zero out the memory accumulator! + if (cid == 0) { + for (unsigned int i = 0; i < gemv_l.M; i++) { + result[i] = 0.0; + } + } + snrt_cluster_hw_barrier(); + + if (cid == 0) + start_kernel(); + + timer = benchmark_get_cycle(); + + // Calculate internal pointers + T *vec_ptr = vec_buf0; + T *vec_db_ptr = vec_buf1; + + // Task 1: Find out the non-zeros + if (cid == 0) { + #ifdef DEBUG_NZ + printf("NZ-Calc PreLD\n"); + printf("DMA SRC:%p, TGT:%p, SIZE:%u\n", vec_ptr, gemv_vec_dram, vec_chunk_size); + #endif + snrt_dma_start_1d(vec_ptr, gemv_vec_dram, vec_chunk_size); + snrt_dma_wait_all(); + } + + uint32_t nz_count = 0; + + + if (cid == 0) { + for (unsigned int i = 0; i < num_vec_chunk; ++i) { + // Step 1.1: preload the next chunk if not the end + // Make sure the previous load completes + snrt_dma_wait_all(); + // Double buffer to search the next non-zero + uint32_t next_bytes = (vec_size - (i + 1) * vec_chunk_size < vec_chunk_size) + ? 
(vec_size - (i + 1) * vec_chunk_size) + : vec_chunk_size; + + if (i < num_vec_chunk - 1) { + #ifdef DEBUG_NZ + printf("NZ-Calc DB Iter%u\n", i); + printf("DMA SRC:%p, TGT:%p, SIZE:%u\n", + gemv_vec_dram + (i + 1) * vec_chunk_len, + vec_db_ptr, + next_bytes); + #endif + snrt_dma_start_1d(vec_db_ptr, + gemv_vec_dram + (i + 1) * vec_chunk_len, + next_bytes); // Use exact bytes + } + + for (unsigned int j = 0; j < vec_chunk_len; ++j) { + if ((double) vec_ptr[j] != 0.0) { + dense_vec[nz_count] = vec_ptr[j]; + dense_idx[nz_count] = i * vec_chunk_len + j; + nz_count++; + } + + if (nz_count == tot_nz_dram) + break; + } + + if (nz_count == tot_nz_dram) + break; + + if (i % 2 == 0) { + // pointer exchange + vec_ptr = vec_buf1; + vec_db_ptr = vec_buf0; + } else { + vec_ptr = vec_buf0; + vec_db_ptr = vec_buf1; + } + } + } + + snrt_cluster_hw_barrier(); + + #ifdef DEBUG_NZ + if (cid == 0) + printf("Non-Zero Calc Complete\n"); + #endif + + #ifdef DEBUG_NZ_IDX + if (cid == 0) { + for (uint32_t i = 0; i < tot_nz_dram; i++) { + printf("IDX[%u]=%u\n", i, dense_idx[i]); + } + } + #endif + + timer_nz = benchmark_get_cycle() - timer_nz; + timer = benchmark_get_cycle(); + + + // Task 2: GEMV calculation + // Calculate internal pointers + T *mat_ptr = mat_buf0; + T *mat_db_ptr = mat_buf1; + T *result_core = result + m_core * cid; + uint16_t *idx_ptr = dense_idx; // Corrected pointer type + + if (cid == 0) { + // Determine how many rows are actually active for this very first chunk + uint32_t active_rows = (tot_nz_dram < num_row_mat) ? 
tot_nz_dram : num_row_mat; + + #ifdef DEBUG_GEMV_PreLD + printf("GEMV PreLD\n"); + printf("Active Rows:%u\n", active_rows); + #endif + + for (unsigned int i = 0; i < active_rows; i++) { + #ifdef DEBUG + printf("Row:%u, SRC:%p, TGT:%p, SIZE:%u\n", + i, + gemv_mat_dram + (size_t)(*idx_ptr) * gemv_l.M, + mat_ptr + i * gemv_l.M, + row_size); + #endif + snrt_dma_start_1d(mat_ptr + i * gemv_l.M, // Pack linearly into L1 + gemv_mat_dram + (size_t)(*idx_ptr) * gemv_l.M, // Source from DRAM + row_size); + idx_ptr++; + } + } + + snrt_cluster_hw_barrier(); + + #ifdef DEBUG_GEMV_PreLD + if (cid == 0) + printf("GEMV PreLD Complete\n"); + #endif + + #ifdef DEBUG_GEMV_DB + if (cid == 0) + printf("Tot Chunks %u\n", num_mat_chunk); + #endif + + for (unsigned int chunk_idx = 0; chunk_idx < num_mat_chunk; chunk_idx++) { + // Wait for the CURRENT chunk to finish loading + if (cid == 0) { + snrt_dma_wait_all(); + } + snrt_cluster_hw_barrier(); + + // Determine bounds for the NEXT chunk (for background DMA) + uint32_t next_chunk_start = (chunk_idx + 1) * num_row_mat; + uint32_t next_active_rows = 0; + + if (next_chunk_start < tot_nz_dram) { + next_active_rows = (tot_nz_dram - next_chunk_start < num_row_mat) + ? (tot_nz_dram - next_chunk_start) + : num_row_mat; + } + + #ifdef DEBUG_GEMV_DB + if (cid == 0) + printf("Chunk%u, DB Rows%u\n", chunk_idx, next_active_rows); + #endif + + // Load NEXT chunk in the background + if (cid == 0 && next_active_rows > 0) { + for (unsigned int i = 0; i < next_active_rows; i++) { + #ifdef DEBUG_GEMV_DB + printf("Ptr:%p, Row:%u, SRC:%p, TGT:%p, SIZE:%u\n", + idx_ptr, + i, + gemv_mat_dram + (size_t)(*idx_ptr) * gemv_l.M, + mat_db_ptr + i * gemv_l.M, + row_size); + #endif + snrt_dma_start_1d(mat_db_ptr + i * gemv_l.M, + gemv_mat_dram + (size_t)(*idx_ptr) * gemv_l.M, + row_size); + idx_ptr++; + } + } + + // Calculate active rows for the CURRENT compute phase + uint32_t curr_active_rows = (tot_nz_dram - chunk_idx * num_row_mat < num_row_mat) + ? 
(tot_nz_dram - chunk_idx * num_row_mat) + : num_row_mat; + + // Calculate GEMV on the current chunk + T *current_dense_vec = dense_vec + chunk_idx * num_row_mat; + + // Offset the matrix pointer by m_core * cid so each core reads its correct rows + T *mat_core_ptr = mat_ptr + m_core * cid; + + #if (PREC == 64) + gemv_v64b_m4(mat_core_ptr, current_dense_vec, result_core, gemv_l.M, m_core, curr_active_rows); + #elif (PREC == 32) + gemv_v32b_m4(mat_core_ptr, current_dense_vec, result_core, gemv_l.M, m_core, curr_active_rows); + #else + gemv_v16b_m4(mat_core_ptr, current_dense_vec, result_core, gemv_l.M, m_core, curr_active_rows); + #endif + + + // Swap pointers for the next iteration + T *temp = mat_ptr; + mat_ptr = mat_db_ptr; + mat_db_ptr = temp; + } + + snrt_cluster_hw_barrier(); + + timer = benchmark_get_cycle() - timer; + + if (cid == 0) + stop_kernel(); + + // Result Checking + if (cid == 0) { + // Checking + for (unsigned int i = 0; i < gemv_l.M; i++) { + if (fp_check(&result[i], &gemv_result[i])) { + printf("Error: ID: %i Result = %f, Golden = %f\n", i, result[i], gemv_result[i]); + } + } + } + + snrt_cluster_hw_barrier(); + + + // Check and display results + // Assume 2 core 4 fpu configuration + if (cid == 0) { + // Flops per cycle + long unsigned int performance = 1000 * 2 * gemv_l.M * tot_nz_dram / timer; + // Ideal perf = MACC * NCore * Nfpu * Prec adjustment + long unsigned int utilization = + performance / (2 * num_cores * 4 * 8 / sizeof(T)); + + printf("\n----- (%d x %d) x (%d x 1) sa-gemv -----\n", gemv_l.M, gemv_l.N, tot_nz_dram); + printf("The NZ finding takes %u cycles.\n", timer_nz); + printf("The GEMV execution took %u cycles.\n", timer); + printf("The performance is %ld OP/1000cycle (%ld%%o utilization).\n", + performance, utilization); + } + + // Wait for core 0 to finish displaying results + snrt_cluster_hw_barrier(); + return 0; +} diff --git a/sw/spatzBenchmarks/sa-gemv/script/gen_data.py b/sw/spatzBenchmarks/sa-gemv/script/gen_data.py new 
file mode 100644 index 00000000..4daa6689 --- /dev/null +++ b/sw/spatzBenchmarks/sa-gemv/script/gen_data.py @@ -0,0 +1,224 @@ +#!/usr/bin/env python3 +# Copyright 2022 ETH Zurich and University of Bologna. +# Licensed under the Apache License, Version 2.0, see LICENSE for details. +# SPDX-License-Identifier: Apache-2.0 + +# Author: Matheus Cavalcante + +import numpy as np +import torch +import argparse +import pathlib +import hjson + +np.random.seed(42) +torch.manual_seed(42) + +global verbose + + +def array_to_cstr(a, fmt=float): + out = "{\n" + if fmt == float: + if isinstance(a, np.ndarray): + a = a.flat + if isinstance(a, torch.Tensor): + a = a.numpy().flat + for el in a: + out += "\t{},\n".format(el) + else: + for sign, exp, mant in zip( + a["sign"].numpy().flat, + a["exponent"].numpy().flat, + a["mantissa"].numpy().flat, + ): + value = sign * 2**7 + exp * 2**2 + mant + out += "0x{:02x},\n".format(value) + out = out[:-2] + "}" + return out + + +def emit_header_file(layer_type: str, **kwargs): + file_path = pathlib.Path(__file__).parent.parent / "data" + emit_str = ( + "// Copyright 2025 ETH Zurich and University of Bologna.\n" + + "// Licensed under the Apache License, Version 2.0, see LICENSE for details.\n" + + "// SPDX-License-Identifier: Apache-2.0\n\n" + + "// This file was generated automatically.\n\n" + ) + + file = file_path / ("data_" + str(kwargs["M"]) + "_" + str(kwargs["N"]) + "_" + str(kwargs["tot_nz"]) + "_" + str(kwargs["prec"]) + ".h") + emit_str += emit_gemv_layer(**kwargs) + with file.open("w") as f: + f.write(emit_str) + + +def emit_gemv_layer(name="gemv", **kwargs): + mat_A = kwargs["A"] + vec_B = kwargs["B"] + result = kwargs["result"] + + m = kwargs["M"] + n = kwargs["N"] + tot_nz = kwargs["tot_nz"] + + layer_str = "" + layer_str += '#include "layer.h"\n\n' + layer_str += f"const gemv_layer {name}_l = {{\n" + layer_str += f"\t.M = {m},\n" + layer_str += f"\t.N = {n},\n" + layer_str += f'\t.dtype = FP{kwargs["prec"]}' + layer_str += 
"};\n\n" + + # Export the total non-zeros directly so the kernel can use it + layer_str += f"const uint32_t tot_nz_dram = {tot_nz};\n\n" + + ctypes = {"64": "double", "32": "float", "16": "__fp16", "8": "char"} + + dtype = ctypes[str(kwargs["prec"])] + if dtype != "char": + layer_str += ( + f'static {dtype} {name}_mat_dram[{m}*{n}] __attribute__((section(".data"))) = ' + + array_to_cstr(mat_A) + + ";\n\n" + ) + layer_str += ( + f'static {dtype} {name}_vec_dram[{n}] __attribute__((section(".data"))) = ' + + array_to_cstr(vec_B) + + ";\n\n" + ) + layer_str += ( + f'static {dtype} {name}_result[{m}] __attribute__((section(".data"))) = ' + + array_to_cstr(result) + + ";\n" + ) + # Assuming you have variables like M (output size) and tot_nz (number of non-zeros) + layer_str += f'// Auto-generated buffers for Cache Mode\n' + layer_str += f'static uint16_t dense_idx_dram[{tot_nz}] __attribute__((section(".data"))) = {{0}};\n' + layer_str += f'static {dtype} dense_vec_dram[{tot_nz}] __attribute__((section(".data"))) = {{0.0}};\n' + layer_str += f'static {dtype} result_buf_dram[{m}] __attribute__((section(".data"))) = {{0.0}};\n' + else: + layer_str += ( + f"static {dtype} {name}_mat_dram[{m}*{n}] = " + + array_to_cstr(kwargs["bits_A"], fmt="char") + + ";\n\n\n" + ) + layer_str += ( + f"static {dtype} {name}_vec_dram[{n}] = " + + array_to_cstr(kwargs["bits_B"], fmt="char") + + ";\n\n\n" + ) + layer_str += ( + f"static {dtype} {name}_result[{m}] = " + + array_to_cstr(kwargs["result"], fmt="char") + + ";\n\n\n" + ) + layer_str += ( + f"static {dtype} {name}_result_buf_dram[{m}] =" + + array_to_cstr(kwargs["result"], fmt="char") + + ";\n\n\n" + ) + + return layer_str + + +def rand_data_generator(shape, prec, alt=False): + if prec == 64: + return torch.randn(shape, requires_grad=False, dtype=torch.float64), {} + elif prec == 32: + return torch.randn(shape, requires_grad=False, dtype=torch.float32), {} + elif prec == 16: + if alt: + # Generate in FP32, cast to BF16 + return 
torch.randn(shape, requires_grad=False, dtype=torch.float32).to(torch.bfloat16), {} + else: + # Generate in FP32, cast to FP16 + return torch.randn(shape, requires_grad=False, dtype=torch.float32).to(torch.float16), {} + elif prec == 8: + sign = torch.randint( + 0, 2, shape, requires_grad=False, dtype=torch.uint8 + ) # -1 or 1 + exponent = torch.randint( + 0, 16, shape, requires_grad=False, dtype=torch.uint8 + ) # < 0b01111 + mantissa = torch.randint( + 0, 4, shape, requires_grad=False, dtype=torch.uint8 + ) # can be arbitrary + bits = {"sign": sign, "exponent": exponent, "mantissa": mantissa} + # TODO: not actually correct + return ((-1.0) ** sign.double()) * (2.0 ** (exponent.double() - 15.0)) * ( + 1.0 + mantissa.double() / (2**2) + ), bits + + +def gemv(a, b): + print(a.shape, b.shape) + # Upcast to float32 for CPU math, then downcast back to the original dtype + return torch.matmul(a.float(), b.float()).to(a.dtype) + +def main(): + + parser = argparse.ArgumentParser(description="Generate data for kernels") + parser.add_argument( + "-c", + "--cfg", + type=pathlib.Path, + required=True, + help="Select param config file kernel", + ) + parser.add_argument("-v", "--verbose", action="store_true", help="Set verbose") + + args = parser.parse_args() + + global verbose + verbose = args.verbose + + with args.cfg.open() as f: + param = hjson.loads(f.read()) + + # Read tot_nz from the hjson file + tot_nz = param["tot_nz"] + + mat_A, bits_A = rand_data_generator((param["M"], param["N"]), param["prec"]) + vec_B, bits_B = rand_data_generator((param["N"], 1), param["prec"]) + + # --- Sparsity Logic --- + # Randomly select `tot_nz` indices to keep, set the rest to 0.0 + nz_indices = torch.randperm(param["N"])[:tot_nz] + mask = torch.zeros((param["N"], 1), dtype=torch.bool) + mask[nz_indices, 0] = True + + # Temporarily upcast to float32 for the masking math, then cast back + vec_B = (vec_B.float() * mask).to(vec_B.dtype) + + # Also zero out the raw bits if using 8-bit precision 
to maintain parity + if bool(bits_B): + for k in bits_B.keys(): + # Apply the mask, ensuring the shape matches the 1D bits array format + bits_B[k] = bits_B[k] * mask.squeeze().byte() + # ---------------------- + + # Calculate result using the now-sparse vector + result = gemv(mat_A, vec_B) + + # Store A in col major format + mat_A = mat_A.T + + kwargs = { + "A": mat_A, + "B": vec_B, + "result": result, + "M": param["M"], + "N": param["N"], + "tot_nz": tot_nz, # Pass the new parameter down + "prec": param["prec"], + "expand": param["expand"], + "bits_A": bits_A, + "bits_B": bits_B, + } + + emit_header_file("gemv", **kwargs) + + +if __name__ == "__main__": + main() diff --git a/sw/spatzBenchmarks/sa-gemv/script/sa_gemv_128_4096_512_16.json b/sw/spatzBenchmarks/sa-gemv/script/sa_gemv_128_4096_512_16.json new file mode 100644 index 00000000..e14c70e4 --- /dev/null +++ b/sw/spatzBenchmarks/sa-gemv/script/sa_gemv_128_4096_512_16.json @@ -0,0 +1,16 @@ +// Copyright 2020 ETH Zurich and University of Bologna. +// Solderpad Hardware License, Version 0.51, see LICENSE for details. +// SPDX-License-Identifier: SHL-0.51 + +// Parameters for a GEMV + +{ + kernel: "GEMV" + M: 128, + N: 4096, + tot_nz: 512 + transpose_A: false, + transpose_B: false, + prec: 16, + expand: 0 +} diff --git a/sw/spatzBenchmarks/sa-gemv/script/sa_gemv_256_128_16_64.json b/sw/spatzBenchmarks/sa-gemv/script/sa_gemv_256_128_16_64.json new file mode 100644 index 00000000..8942de89 --- /dev/null +++ b/sw/spatzBenchmarks/sa-gemv/script/sa_gemv_256_128_16_64.json @@ -0,0 +1,16 @@ +// Copyright 2020 ETH Zurich and University of Bologna. +// Solderpad Hardware License, Version 0.51, see LICENSE for details. 
+// SPDX-License-Identifier: SHL-0.51 + +// Parameters for a GEMV + +{ + kernel: "GEMV" + M: 256, + N: 128, + tot_nz: 16 + transpose_A: false, + transpose_B: false, + prec: 64, + expand: 0 +} From 69cdd8fd93b672850c9453764279e1e00b4d4cd2 Mon Sep 17 00:00:00 2001 From: Diyou Shen Date: Fri, 24 Apr 2026 10:50:45 +0200 Subject: [PATCH 03/13] [SW] Fix trailing whitespace . --- sw/spatzBenchmarks/sa-gemv/main.c | 26 +++++++++---------- sw/spatzBenchmarks/sa-gemv/script/gen_data.py | 2 +- 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/sw/spatzBenchmarks/sa-gemv/main.c b/sw/spatzBenchmarks/sa-gemv/main.c index e046c7df..d591806e 100644 --- a/sw/spatzBenchmarks/sa-gemv/main.c +++ b/sw/spatzBenchmarks/sa-gemv/main.c @@ -86,7 +86,7 @@ int main() { const uint32_t dense_vec_size = sizeof(T) * tot_nz_dram; const uint32_t dense_idx_size = sizeof(uint16_t) * tot_nz_dram; const uint32_t result_size = sizeof(T) * gemv_l.M; - + // leave 8 KiB for Stack const uint32_t l1_size = (spm_size - 8) * 1024; const uint32_t fixed_alloc_size = dense_vec_size + dense_idx_size + result_size; @@ -94,7 +94,7 @@ int main() { // --- BOUNDS CHECK 1: Do the fixed arrays fit in L1? --- if (fixed_alloc_size >= l1_size) { if (cid == 0) { - printf("FATAL: L1 Memory Overflow! Fixed arrays require %u bytes, but only %u bytes available.\n", + printf("FATAL: L1 Memory Overflow! Fixed arrays require %u bytes, but only %u bytes available.\n", fixed_alloc_size, l1_size); } snrt_cluster_hw_barrier(); @@ -110,7 +110,7 @@ int main() { if (num_row_mat < 1) { if (cid == 0) { printf("FATAL: L1 Memory Overflow! Cannot fit at least 2 rows for double buffering. 
" - "Chunk space left: %u bytes, Row size: %u bytes.\n", + "Chunk space left: %u bytes, Row size: %u bytes.\n", l1_for_chunk, row_size); } snrt_cluster_hw_barrier(); @@ -181,8 +181,8 @@ int main() { // Make sure the previous load completes snrt_dma_wait_all(); // Double buffer to search the next non-zero - uint32_t next_bytes = (vec_size - (i + 1) * vec_chunk_size < vec_chunk_size) - ? (vec_size - (i + 1) * vec_chunk_size) + uint32_t next_bytes = (vec_size - (i + 1) * vec_chunk_size < vec_chunk_size) + ? (vec_size - (i + 1) * vec_chunk_size) : vec_chunk_size; if (i < num_vec_chunk - 1) { @@ -257,7 +257,7 @@ int main() { printf("GEMV PreLD\n"); printf("Active Rows:%u\n", active_rows); #endif - + for (unsigned int i = 0; i < active_rows; i++) { #ifdef DEBUG printf("Row:%u, SRC:%p, TGT:%p, SIZE:%u\n", @@ -295,10 +295,10 @@ int main() { // Determine bounds for the NEXT chunk (for background DMA) uint32_t next_chunk_start = (chunk_idx + 1) * num_row_mat; uint32_t next_active_rows = 0; - + if (next_chunk_start < tot_nz_dram) { - next_active_rows = (tot_nz_dram - next_chunk_start < num_row_mat) - ? (tot_nz_dram - next_chunk_start) + next_active_rows = (tot_nz_dram - next_chunk_start < num_row_mat) + ? (tot_nz_dram - next_chunk_start) : num_row_mat; } @@ -318,16 +318,16 @@ int main() { mat_db_ptr + i * gemv_l.M, row_size); #endif - snrt_dma_start_1d(mat_db_ptr + i * gemv_l.M, - gemv_mat_dram + (size_t)(*idx_ptr) * gemv_l.M, + snrt_dma_start_1d(mat_db_ptr + i * gemv_l.M, + gemv_mat_dram + (size_t)(*idx_ptr) * gemv_l.M, row_size); idx_ptr++; } } // Calculate active rows for the CURRENT compute phase - uint32_t curr_active_rows = (tot_nz_dram - chunk_idx * num_row_mat < num_row_mat) - ? (tot_nz_dram - chunk_idx * num_row_mat) + uint32_t curr_active_rows = (tot_nz_dram - chunk_idx * num_row_mat < num_row_mat) + ? 
(tot_nz_dram - chunk_idx * num_row_mat) : num_row_mat; // Calculate GEMV on the current chunk diff --git a/sw/spatzBenchmarks/sa-gemv/script/gen_data.py b/sw/spatzBenchmarks/sa-gemv/script/gen_data.py index 4daa6689..8a3734e1 100644 --- a/sw/spatzBenchmarks/sa-gemv/script/gen_data.py +++ b/sw/spatzBenchmarks/sa-gemv/script/gen_data.py @@ -187,7 +187,7 @@ def main(): nz_indices = torch.randperm(param["N"])[:tot_nz] mask = torch.zeros((param["N"], 1), dtype=torch.bool) mask[nz_indices, 0] = True - + # Temporarily upcast to float32 for the masking math, then cast back vec_B = (vec_B.float() * mask).to(vec_B.dtype) From b8f60e702845f462b9e2004971a8ccb9bdf9b6b2 Mon Sep 17 00:00:00 2001 From: Diyou Shen Date: Fri, 24 Apr 2026 10:53:35 +0200 Subject: [PATCH 04/13] [SW] Change to use PRINTF for sa-gemv kernel. --- sw/spatzBenchmarks/sa-gemv/main.c | 40 +++++++++++++++---------------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/sw/spatzBenchmarks/sa-gemv/main.c b/sw/spatzBenchmarks/sa-gemv/main.c index d591806e..df2eb8ba 100644 --- a/sw/spatzBenchmarks/sa-gemv/main.c +++ b/sw/spatzBenchmarks/sa-gemv/main.c @@ -94,7 +94,7 @@ int main() { // --- BOUNDS CHECK 1: Do the fixed arrays fit in L1? --- if (fixed_alloc_size >= l1_size) { if (cid == 0) { - printf("FATAL: L1 Memory Overflow! Fixed arrays require %u bytes, but only %u bytes available.\n", + PRINTF("FATAL: L1 Memory Overflow! Fixed arrays require %u bytes, but only %u bytes available.\n", fixed_alloc_size, l1_size); } snrt_cluster_hw_barrier(); @@ -109,7 +109,7 @@ int main() { // --- BOUNDS CHECK 2: Can we double buffer at least 1 row? --- if (num_row_mat < 1) { if (cid == 0) { - printf("FATAL: L1 Memory Overflow! Cannot fit at least 2 rows for double buffering. " + PRINTF("FATAL: L1 Memory Overflow! Cannot fit at least 2 rows for double buffering. 
" "Chunk space left: %u bytes, Row size: %u bytes.\n", l1_for_chunk, row_size); } @@ -165,8 +165,8 @@ int main() { // Task 1: Find out the non-zeros if (cid == 0) { #ifdef DEBUG_NZ - printf("NZ-Calc PreLD\n"); - printf("DMA SRC:%p, TGT:%p, SIZE:%u\n", vec_ptr, gemv_vec_dram, vec_chunk_size); + PRINTF("NZ-Calc PreLD\n"); + PRINTF("DMA SRC:%p, TGT:%p, SIZE:%u\n", vec_ptr, gemv_vec_dram, vec_chunk_size); #endif snrt_dma_start_1d(vec_ptr, gemv_vec_dram, vec_chunk_size); snrt_dma_wait_all(); @@ -187,8 +187,8 @@ int main() { if (i < num_vec_chunk - 1) { #ifdef DEBUG_NZ - printf("NZ-Calc DB Iter%u\n", i); - printf("DMA SRC:%p, TGT:%p, SIZE:%u\n", + PRINTF("NZ-Calc DB Iter%u\n", i); + PRINTF("DMA SRC:%p, TGT:%p, SIZE:%u\n", gemv_vec_dram + (i + 1) * vec_chunk_len, vec_db_ptr, next_bytes); @@ -227,13 +227,13 @@ int main() { #ifdef DEBUG_NZ if (cid == 0) - printf("Non-Zero Calc Complete\n"); + PRINTF("Non-Zero Calc Complete\n"); #endif #ifdef DEBUG_NZ_IDX if (cid == 0) { for (uint32_t i = 0; i < tot_nz_dram; i++) { - printf("IDX[%u]=%u\n", i, dense_idx[i]); + PRINTF("IDX[%u]=%u\n", i, dense_idx[i]); } } #endif @@ -254,13 +254,13 @@ int main() { uint32_t active_rows = (tot_nz_dram < num_row_mat) ? 
tot_nz_dram : num_row_mat; #ifdef DEBUG_GEMV_PreLD - printf("GEMV PreLD\n"); - printf("Active Rows:%u\n", active_rows); + PRINTF("GEMV PreLD\n"); + PRINTF("Active Rows:%u\n", active_rows); #endif for (unsigned int i = 0; i < active_rows; i++) { #ifdef DEBUG - printf("Row:%u, SRC:%p, TGT:%p, SIZE:%u\n", + PRINTF("Row:%u, SRC:%p, TGT:%p, SIZE:%u\n", i, gemv_mat_dram + (size_t)(*idx_ptr) * gemv_l.M, mat_ptr + i * gemv_l.M, @@ -277,12 +277,12 @@ int main() { #ifdef DEBUG_GEMV_PreLD if (cid == 0) - printf("GEMV PreLD Complete\n"); + PRINTF("GEMV PreLD Complete\n"); #endif #ifdef DEBUG_GEMV_DB if (cid == 0) - printf("Tot Chunks %u\n", num_mat_chunk); + PRINTF("Tot Chunks %u\n", num_mat_chunk); #endif for (unsigned int chunk_idx = 0; chunk_idx < num_mat_chunk; chunk_idx++) { @@ -304,14 +304,14 @@ int main() { #ifdef DEBUG_GEMV_DB if (cid == 0) - printf("Chunk%u, DB Rows%u\n", chunk_idx, next_active_rows); + PRINTF("Chunk%u, DB Rows%u\n", chunk_idx, next_active_rows); #endif // Load NEXT chunk in the background if (cid == 0 && next_active_rows > 0) { for (unsigned int i = 0; i < next_active_rows; i++) { #ifdef DEBUG_GEMV_DB - printf("Ptr:%p, Row:%u, SRC:%p, TGT:%p, SIZE:%u\n", + PRINTF("Ptr:%p, Row:%u, SRC:%p, TGT:%p, SIZE:%u\n", idx_ptr, i, gemv_mat_dram + (size_t)(*idx_ptr) * gemv_l.M, @@ -363,7 +363,7 @@ int main() { // Checking for (unsigned int i = 0; i < gemv_l.M; i++) { if (fp_check(&result[i], &gemv_result[i])) { - printf("Error: ID: %i Result = %f, Golden = %f\n", i, result[i], gemv_result[i]); + PRINTF("Error: ID: %i Result = %f, Golden = %f\n", i, result[i], gemv_result[i]); } } } @@ -380,10 +380,10 @@ int main() { long unsigned int utilization = performance / (2 * num_cores * 4 * 8 / sizeof(T)); - printf("\n----- (%d x %d) x (%d x 1) sa-gemv -----\n", gemv_l.M, gemv_l.N, tot_nz_dram); - printf("The NZ finding takes %u cycles.\n", timer_nz); - printf("The GEMV execution took %u cycles.\n", timer); - printf("The performance is %ld OP/1000cycle (%ld%%o 
utilization).\n", + PRINTF("\n----- (%d x %d) x (%d x 1) sa-gemv -----\n", gemv_l.M, gemv_l.N, tot_nz_dram); + PRINTF("The NZ finding takes %u cycles.\n", timer_nz); + PRINTF("The GEMV execution took %u cycles.\n", timer); + PRINTF("The performance is %ld OP/1000cycle (%ld%%o utilization).\n", performance, utilization); } From dec39bb7a38ef9284bdff9f12aef24af03c7561a Mon Sep 17 00:00:00 2001 From: Diyou Shen Date: Fri, 24 Apr 2026 14:20:14 +0200 Subject: [PATCH 05/13] [SW] Fix a problem of gen_data.py. --- .../hp-fmatmul/script/gen_data.py | 209 +++++++++++++----- sw/spatzBenchmarks/sa-gemv/script/gen_data.py | 1 + .../sdotp-bp-fmatmul/script/gen_data.py | 172 +++++++++----- .../sdotp-hp-fmatmul/script/gen_data.py | 172 +++++++++----- .../widening-bp-fmatmul/script/gen_data.py | 172 +++++++++----- .../widening-hp-fmatmul/script/gen_data.py | 172 +++++++++----- 6 files changed, 634 insertions(+), 264 deletions(-) diff --git a/sw/spatzBenchmarks/hp-fmatmul/script/gen_data.py b/sw/spatzBenchmarks/hp-fmatmul/script/gen_data.py index f33c12b1..4da0e20d 100755 --- a/sw/spatzBenchmarks/hp-fmatmul/script/gen_data.py +++ b/sw/spatzBenchmarks/hp-fmatmul/script/gen_data.py @@ -26,6 +26,9 @@ def array_to_cstr(a, fmt=float): if isinstance(a, np.ndarray): a = a.flat if isinstance(a, torch.Tensor): + # Fallback to float32 if numpy struggles with bfloat16, otherwise leave alone + if a.dtype == torch.bfloat16: + a = a.float() a = a.cpu().numpy().flat for el in a: out += "{}, ".format(el) @@ -42,6 +45,7 @@ def array_to_cstr(a, fmt=float): def emit_header_file(layer_type: str, **kwargs): + file_path = pathlib.Path(__file__).parent.parent / "data" file_path.mkdir(parents=True, exist_ok=True) emit_str = ( @@ -95,7 +99,7 @@ def emit_conv2d_layer(name="conv2d", **kwargs): layer_str += f'static double {name}_result[{oh}][{ow}][{co}] __attribute__((section(".data")));\n\n' layer_str += ( f"static double {name}_checksum[{oh}][{ow}] = " - + array_to_cstr(torch.sum(ofmap.float(), dim=-1)) # 
Safe sum for checksum + + array_to_cstr(torch.sum(ofmap.float(), dim=-1)) + ";\n\n\n" ) layer_str += ( @@ -118,6 +122,7 @@ def emit_conv2d_layer(name="conv2d", **kwargs): def emit_linear_layer(input, weights, ofmap): + layer_str = "" return layer_str @@ -166,7 +171,7 @@ def emit_GEMM_layer(name="gemm", **kwargs): ) layer_str += ( f"static const {dtype} {name}_result[{m}*{n}] = " - + array_to_cstr(torch.sum(result.float(), dim=-1)) + + array_to_cstr(result) + ";\n\n\n" ) else: @@ -190,6 +195,7 @@ def emit_GEMM_layer(name="gemm", **kwargs): def emit_batchnorm_layer(name="batchnorm", **kwargs): + ifmap = kwargs["ifmap"] ofmap = kwargs["ofmap"] beta = kwargs["beta"] @@ -236,6 +242,7 @@ def emit_batchnorm_layer(name="batchnorm", **kwargs): def emit_maxpool_layer(name="maxpool", **kwargs): + ifmap = kwargs["ifmap"] ofmap = kwargs["ofmap"] k = kwargs["kernel_size"] @@ -277,6 +284,7 @@ def emit_maxpool_layer(name="maxpool", **kwargs): def emit_fusedconv(name="fusedconv", **kwargs): + ifmap = kwargs["ifmap"] kernel = kwargs["kernel"] bn_k = kwargs["bn_k"] @@ -358,43 +366,48 @@ def rand_data_generator(shape, prec, alt=False): return torch.randn(shape, requires_grad=False, dtype=torch.float32), {} elif prec == 16: if alt: - # Universal Fix: Generate FP32, cast to BF16 return torch.randn(shape, requires_grad=False, dtype=torch.float32).to(torch.bfloat16), {} else: - # Universal Fix: Generate FP32, cast to FP16 return torch.randn(shape, requires_grad=False, dtype=torch.float32, device=device).to(torch.float16), {} elif prec == 8: - sign = torch.randint(0, 2, shape, requires_grad=False, dtype=torch.uint8) - exponent = torch.randint(0, 16, shape, requires_grad=False, dtype=torch.uint8) - mantissa = torch.randint(0, 4, shape, requires_grad=False, dtype=torch.uint8) + sign = torch.randint( + 0, 2, shape, requires_grad=False, dtype=torch.uint8 + ) # -1 or 1 + exponent = torch.randint( + 0, 16, shape, requires_grad=False, dtype=torch.uint8 + ) # < 0b01111 + mantissa = torch.randint( 
+ 0, 4, shape, requires_grad=False, dtype=torch.uint8 + ) # can be arbitrary bits = {"sign": sign, "exponent": exponent, "mantissa": mantissa} + # TODO: not actually correct return ((-1.0) ** sign.double()) * (2.0 ** (exponent.double() - 15.0)) * ( 1.0 + mantissa.double() / (2**2) ), bits def conv2d(ifmap, weights, padding=1, stride=1): - # Universal Fix: Upcast for CPU Math orig_dtype = ifmap.dtype - ifmap, weights = ifmap.float(), weights.float() + if orig_dtype in [torch.float16, torch.bfloat16]: + ifmap, weights = ifmap.float(), weights.float() n, ci, ih, iw = ifmap.shape co, _, fh, fw = weights.shape conv2d = nn.Conv2d(ci, co, (fh, fw), padding=((fh - 1) // 2, (fw - 1) // 2)) conv2d.weight = nn.Parameter(weights, requires_grad=False) - # Ensure bias natively generates in float32 - conv2d.bias = nn.Parameter(torch.zeros_like(conv2d.bias, dtype=torch.float32), requires_grad=False) - + conv2d.bias = nn.Parameter( + torch.zeros_like(conv2d.bias, dtype=weights.dtype), requires_grad=False + ) ofmap = conv2d(ifmap) - # Universal Fix: Downcast back return ofmap.to(orig_dtype) def max_pooling(ifmap, kernel): orig_dtype = ifmap.dtype - ifmap = ifmap.float() + if orig_dtype in [torch.float16, torch.bfloat16]: + ifmap = ifmap.float() n, ci, ih, iw = ifmap.shape max_pool = nn.MaxPool2d(kernel_size=kernel) @@ -405,14 +418,16 @@ def max_pooling(ifmap, kernel): def batchnorm(ifmap): orig_dtype = ifmap.dtype - ifmap = ifmap.float() + if orig_dtype in [torch.float16, torch.bfloat16]: + ifmap = ifmap.float() n, ci, ih, iw = ifmap.shape bn = torch.nn.BatchNorm2d(ci) bn.weight.requires_grad = False bn.bias.requires_grad = False - running_mean = torch.randn_like(bn.running_mean, requires_grad=False, dtype=torch.float32) - running_var = torch.rand_like(bn.running_var, requires_grad=False, dtype=torch.float32) + + running_mean = torch.randn(bn.running_mean.shape, requires_grad=False, dtype=torch.float32).to(bn.running_mean.dtype) + running_var = torch.rand(bn.running_var.shape, 
requires_grad=False, dtype=torch.float32).to(bn.running_var.dtype) gamma = bn.weight / torch.sqrt(running_var + bn.eps) beta = bn.bias - running_mean * bn.weight / torch.sqrt(running_var + bn.eps) @@ -424,12 +439,10 @@ def batchnorm(ifmap): def fused_conv( ifmap, weights, bn_k, bn_l, padding, stride, bn, relu, accumulate, depthwise ): - # Universal Fix: Upcast EVERYTHING before starting the math orig_dtype = ifmap.dtype - ifmap = ifmap.float() - weights = weights.float() - bn_k = bn_k.float() - bn_l = bn_l.float() + is_half = orig_dtype in [torch.float16, torch.bfloat16] + if is_half: + ifmap, weights, bn_k, bn_l = ifmap.float(), weights.float(), bn_k.float(), bn_l.float() ih, iw, ci = ifmap.shape if not depthwise: @@ -443,13 +456,14 @@ def fused_conv( iw + padding["padding_x_left"] + padding["padding_x_right"], ci, requires_grad=False, - dtype=ifmap.dtype, # Safely uses float32 + dtype=ifmap.dtype, ) ifmap_padded[ padding["padding_y_top"] : ih + padding["padding_y_top"], padding["padding_x_left"] : iw + padding["padding_x_left"], ] = ifmap + # Don't cover undefined behaviour when there are steps without a complete kernel window if (ifmap_padded.shape[0] - (fh - 1) - 1) % stride["stride_y"] != 0: print("Warning: rounding h output dimension") if (ifmap_padded.shape[1] - (fw - 1) - 1) % stride["stride_x"] != 0: @@ -460,16 +474,16 @@ def fused_conv( (ifmap_padded.shape[1] - (fw - 1) - 1) // stride["stride_x"] + 1, co, ) - if accumulate: - ofmap_before = torch.randn_like(ofmap, requires_grad=False, dtype=torch.float32) + ofmap_before = torch.randn(ofmap.shape, requires_grad=False, dtype=torch.float32).to(ofmap.dtype) else: - ofmap_before = torch.zeros_like(ofmap, requires_grad=False, dtype=torch.float32) + ofmap_before = torch.zeros_like(ofmap, requires_grad=False) if verbose: print(ifmap.shape, ifmap_padded.shape, ofmap.shape) if depthwise: + # depthwise Conv2d for h in range(0, ifmap_padded.shape[0] - (fh - 1), stride["stride_y"]): for w in range(0, 
ifmap_padded.shape[1] - (fw - 1), stride["stride_x"]): for c in range(co): @@ -480,6 +494,7 @@ def fused_conv( weights[:, :, c].flatten(), ) else: + # Conv2d for h in range(0, ifmap_padded.shape[0] - (fh - 1), stride["stride_y"]): for w in range(0, ifmap_padded.shape[1] - (fw - 1), stride["stride_x"]): for c in range(co): @@ -492,22 +507,34 @@ def fused_conv( ofmap += ofmap_before + # BatchNorm if bn: ofmap = ofmap * bn_k + bn_l + # ReLU if relu: ofmap = torch.nn.functional.relu(ofmap) - # Universal Fix: Downcast back - return ofmap.to(orig_dtype), ofmap_before.to(orig_dtype), ifmap_padded.to(orig_dtype) + if is_half: + ofmap = ofmap.to(orig_dtype) + ofmap_before = ofmap_before.to(orig_dtype) + ifmap_padded = ifmap_padded.to(orig_dtype) + + return ofmap, ofmap_before, ifmap_padded def main(): + parser = argparse.ArgumentParser(description="Generate data for kernels") parser.add_argument( - "-c", "--cfg", type=pathlib.Path, required=True, help="Select param config file kernel", + "-c", + "--cfg", + type=pathlib.Path, + required=True, + help="Select param config file kernel", ) parser.add_argument("-v", "--verbose", action="store_true", help="Set verbose") + args = parser.parse_args() global verbose @@ -526,18 +553,31 @@ def main(): dtype = torch.float32 if param["kernel"] == "Conv2d": - # Generate safely in float32, then cast ifmap = torch.randn( - 1, param["channels"]["in"], param["input_dim"]["height"], param["input_dim"]["width"], - requires_grad=False, dtype=torch.float32, + 1, + param["channels"]["in"], + param["input_dim"]["height"], + param["input_dim"]["width"], + requires_grad=False, + dtype=torch.float32, ).to(dtype) weights = torch.randn( - param["channels"]["out"], param["channels"]["in"], param["filter"]["height"], param["filter"]["width"], - requires_grad=False, dtype=torch.float32, + param["channels"]["out"], + param["channels"]["in"], + param["filter"]["height"], + param["filter"]["width"], + requires_grad=False, + dtype=torch.float32, ).to(dtype) - 
ofmap = conv2d(ifmap, weights, padding=param["filter"]["padding"], stride=param["filter"]["stride"]) + ofmap = conv2d( + ifmap, + weights, + padding=param["filter"]["padding"], + stride=param["filter"]["stride"], + ) + # convert from CHW to HWC format ifmap = ifmap.permute(0, 2, 3, 1) ofmap = ofmap.permute(0, 2, 3, 1) weights = weights.permute(0, 2, 3, 1) @@ -549,8 +589,10 @@ def main(): mat_B, bits_B = rand_data_generator((param["K"], param["N"]), param["prec"]) mat_C, bits_C = rand_data_generator((param["M"], param["N"]), param["prec"]) - # Upcast for CPU Math - result = torch.matmul(mat_A.float(), mat_B.float()).to(dtype) + if mat_A.dtype in [torch.float16, torch.bfloat16]: + result = torch.matmul(mat_A.float(), mat_B.float()).to(mat_A.dtype) + else: + result = torch.matmul(mat_A, mat_B) if param["transpose_A"]: mat_A = mat_A.T @@ -558,22 +600,38 @@ def main(): mat_B = mat_B.T kwargs = { - "A": mat_A, "B": mat_B, "C": mat_C, "result": result, - "M": param["M"], "N": param["N"], "K": param["K"], - "ta": param["transpose_A"], "tb": param["transpose_B"], "alpha": param["alpha"], - "prec": param["prec"], "expand": param["expand"], - "bits_A": bits_A, "bits_B": bits_B, "bits_C": bits_C, + "A": mat_A, + "B": mat_B, + "C": mat_C, + "result": result, + "M": param["M"], + "N": param["N"], + "K": param["K"], + "ta": param["transpose_A"], + "tb": param["transpose_B"], + "alpha": param["alpha"], + "prec": param["prec"], + "expand": param["expand"], + "bits_A": bits_A, + "bits_B": bits_B, + "bits_C": bits_C, } + emit_header_file("GEMM", **kwargs) elif param["kernel"] == "BatchNorm": ifmap = torch.randn( - 1, param["channels"]["in"], param["input_dim"]["height"], param["input_dim"]["width"], - requires_grad=False, dtype=torch.float32, + 1, + param["channels"]["in"], + param["input_dim"]["height"], + param["input_dim"]["width"], + requires_grad=False, + dtype=torch.float32, ).to(dtype) ofmap, gamma, beta = batchnorm(ifmap) + # convert from CHW to HWC format ifmap = 
ifmap.permute(0, 2, 3, 1) ofmap = ofmap.permute(0, 2, 3, 1) @@ -582,12 +640,17 @@ def main(): elif param["kernel"] == "MaxPool": ifmap = torch.randn( - 1, param["channels"]["in"], param["input_dim"]["height"], param["input_dim"]["width"], - requires_grad=False, dtype=torch.float32, + 1, + param["channels"]["in"], + param["input_dim"]["height"], + param["input_dim"]["width"], + requires_grad=False, + dtype=torch.float32, ).to(dtype) ofmap = max_pooling(ifmap, param["kernel_size"]) + # convert from CHW to HWC format ifmap = ifmap.permute(0, 2, 3, 1) ofmap = ofmap.permute(0, 2, 3, 1) @@ -596,28 +659,44 @@ def main(): elif param["kernel"] == "FusedConv": ifmap = torch.randn( - param["dim_in_y"], param["dim_in_x"], param["ch_in"], - requires_grad=False, dtype=torch.float32, + param["dim_in_y"], + param["dim_in_x"], + param["ch_in"], + requires_grad=False, + dtype=torch.float32, ).to(dtype) - if not param["depthwise"]: kernel = torch.randn( - param["ch_out"], param["dim_kernel_y"], param["dim_kernel_x"], param["ch_in"], - requires_grad=False, dtype=torch.float32, + param["ch_out"], + param["dim_kernel_y"], + param["dim_kernel_x"], + param["ch_in"], + requires_grad=False, + dtype=torch.float32, ).to(dtype) else: kernel = torch.randn( - param["dim_kernel_y"], param["dim_kernel_x"], param["ch_in"], - requires_grad=False, dtype=torch.float32, + param["dim_kernel_y"], + param["dim_kernel_x"], + param["ch_in"], + requires_grad=False, + dtype=torch.float32, ).to(dtype) bn_k = torch.randn(param["ch_out"], requires_grad=False, dtype=torch.float32).to(dtype) bn_l = torch.randn(param["ch_out"], requires_grad=False, dtype=torch.float32).to(dtype) ofmap, ofmap_before, ifmap_padded = fused_conv( - ifmap, kernel, bn_k, bn_l, param["padding"], param["stride"], - param["flags"]["flag_batch_norm"], param["flags"]["flag_relu"], - not param["flags"]["flag_y_accumulate_start"], param["depthwise"], + ifmap, + kernel, + bn_k, + bn_l, + padding=param["padding"], + stride=param["stride"], + 
bn=param["flags"]["flag_batch_norm"], + relu=param["flags"]["flag_relu"], + accumulate=not param["flags"]["flag_y_accumulate_start"], + depthwise=param["depthwise"], ) if param["chw_layer"]: @@ -626,15 +705,25 @@ def main(): kernel = kernel.permute(0, 3, 1, 2) kwargs = { - "ifmap": ifmap, "ifmap_padded": ifmap_padded, "ofmap": ofmap, - "ofmap_before": ofmap_before, "kernel": kernel, "bn_k": bn_k, "bn_l": bn_l, - "padding": param["padding"], "stride": param["stride"], "prec": param["prec"], - "flags": param["flags"], "depthwise": param["depthwise"], "chw_layer": param["chw_layer"], + "ifmap": ifmap, + "ifmap_padded": ifmap_padded, + "ofmap": ofmap, + "ofmap_before": ofmap_before, + "kernel": kernel, + "bn_k": bn_k, + "bn_l": bn_l, + "padding": param["padding"], + "stride": param["stride"], + "prec": param["prec"], + "flags": param["flags"], + "depthwise": param["depthwise"], + "chw_layer": param["chw_layer"], } emit_header_file("FusedConv", **kwargs) else: print("No valid kernel selected") + if __name__ == "__main__": main() diff --git a/sw/spatzBenchmarks/sa-gemv/script/gen_data.py b/sw/spatzBenchmarks/sa-gemv/script/gen_data.py index 8a3734e1..ecc91071 100644 --- a/sw/spatzBenchmarks/sa-gemv/script/gen_data.py +++ b/sw/spatzBenchmarks/sa-gemv/script/gen_data.py @@ -156,6 +156,7 @@ def gemv(a, b): # Upcast to float32 for CPU math, then downcast back to the original dtype return torch.matmul(a.float(), b.float()).to(a.dtype) + def main(): parser = argparse.ArgumentParser(description="Generate data for kernels") diff --git a/sw/spatzBenchmarks/sdotp-bp-fmatmul/script/gen_data.py b/sw/spatzBenchmarks/sdotp-bp-fmatmul/script/gen_data.py index 0c500558..4da0e20d 100755 --- a/sw/spatzBenchmarks/sdotp-bp-fmatmul/script/gen_data.py +++ b/sw/spatzBenchmarks/sdotp-bp-fmatmul/script/gen_data.py @@ -26,6 +26,9 @@ def array_to_cstr(a, fmt=float): if isinstance(a, np.ndarray): a = a.flat if isinstance(a, torch.Tensor): + # Fallback to float32 if numpy struggles with bfloat16, 
otherwise leave alone + if a.dtype == torch.bfloat16: + a = a.float() a = a.cpu().numpy().flat for el in a: out += "{}, ".format(el) @@ -42,6 +45,7 @@ def array_to_cstr(a, fmt=float): def emit_header_file(layer_type: str, **kwargs): + file_path = pathlib.Path(__file__).parent.parent / "data" file_path.mkdir(parents=True, exist_ok=True) emit_str = ( @@ -95,7 +99,7 @@ def emit_conv2d_layer(name="conv2d", **kwargs): layer_str += f'static double {name}_result[{oh}][{ow}][{co}] __attribute__((section(".data")));\n\n' layer_str += ( f"static double {name}_checksum[{oh}][{ow}] = " - + array_to_cstr(torch.sum(ofmap.float(), dim=-1)) # Safe sum for checksum + + array_to_cstr(torch.sum(ofmap.float(), dim=-1)) + ";\n\n\n" ) layer_str += ( @@ -118,6 +122,7 @@ def emit_conv2d_layer(name="conv2d", **kwargs): def emit_linear_layer(input, weights, ofmap): + layer_str = "" return layer_str @@ -166,7 +171,7 @@ def emit_GEMM_layer(name="gemm", **kwargs): ) layer_str += ( f"static const {dtype} {name}_result[{m}*{n}] = " - + array_to_cstr(torch.sum(result.float(), dim=-1)) + + array_to_cstr(result) + ";\n\n\n" ) else: @@ -190,6 +195,7 @@ def emit_GEMM_layer(name="gemm", **kwargs): def emit_batchnorm_layer(name="batchnorm", **kwargs): + ifmap = kwargs["ifmap"] ofmap = kwargs["ofmap"] beta = kwargs["beta"] @@ -236,6 +242,7 @@ def emit_batchnorm_layer(name="batchnorm", **kwargs): def emit_maxpool_layer(name="maxpool", **kwargs): + ifmap = kwargs["ifmap"] ofmap = kwargs["ofmap"] k = kwargs["kernel_size"] @@ -277,6 +284,7 @@ def emit_maxpool_layer(name="maxpool", **kwargs): def emit_fusedconv(name="fusedconv", **kwargs): + ifmap = kwargs["ifmap"] kernel = kwargs["kernel"] bn_k = kwargs["bn_k"] @@ -358,43 +366,48 @@ def rand_data_generator(shape, prec, alt=False): return torch.randn(shape, requires_grad=False, dtype=torch.float32), {} elif prec == 16: if alt: - # Universal Fix: Generate FP32, cast to BF16 return torch.randn(shape, requires_grad=False, 
dtype=torch.float32).to(torch.bfloat16), {} else: - # Universal Fix: Generate FP32, cast to FP16 return torch.randn(shape, requires_grad=False, dtype=torch.float32, device=device).to(torch.float16), {} elif prec == 8: - sign = torch.randint(0, 2, shape, requires_grad=False, dtype=torch.uint8) - exponent = torch.randint(0, 16, shape, requires_grad=False, dtype=torch.uint8) - mantissa = torch.randint(0, 4, shape, requires_grad=False, dtype=torch.uint8) + sign = torch.randint( + 0, 2, shape, requires_grad=False, dtype=torch.uint8 + ) # -1 or 1 + exponent = torch.randint( + 0, 16, shape, requires_grad=False, dtype=torch.uint8 + ) # < 0b01111 + mantissa = torch.randint( + 0, 4, shape, requires_grad=False, dtype=torch.uint8 + ) # can be arbitrary bits = {"sign": sign, "exponent": exponent, "mantissa": mantissa} + # TODO: not actually correct return ((-1.0) ** sign.double()) * (2.0 ** (exponent.double() - 15.0)) * ( 1.0 + mantissa.double() / (2**2) ), bits def conv2d(ifmap, weights, padding=1, stride=1): - # Universal Fix: Upcast for CPU Math orig_dtype = ifmap.dtype - ifmap, weights = ifmap.float(), weights.float() + if orig_dtype in [torch.float16, torch.bfloat16]: + ifmap, weights = ifmap.float(), weights.float() n, ci, ih, iw = ifmap.shape co, _, fh, fw = weights.shape conv2d = nn.Conv2d(ci, co, (fh, fw), padding=((fh - 1) // 2, (fw - 1) // 2)) conv2d.weight = nn.Parameter(weights, requires_grad=False) - # Ensure bias natively generates in float32 - conv2d.bias = nn.Parameter(torch.zeros_like(conv2d.bias, dtype=torch.float32), requires_grad=False) - + conv2d.bias = nn.Parameter( + torch.zeros_like(conv2d.bias, dtype=weights.dtype), requires_grad=False + ) ofmap = conv2d(ifmap) - # Universal Fix: Downcast back return ofmap.to(orig_dtype) def max_pooling(ifmap, kernel): orig_dtype = ifmap.dtype - ifmap = ifmap.float() + if orig_dtype in [torch.float16, torch.bfloat16]: + ifmap = ifmap.float() n, ci, ih, iw = ifmap.shape max_pool = nn.MaxPool2d(kernel_size=kernel) @@ 
-405,14 +418,16 @@ def max_pooling(ifmap, kernel): def batchnorm(ifmap): orig_dtype = ifmap.dtype - ifmap = ifmap.float() + if orig_dtype in [torch.float16, torch.bfloat16]: + ifmap = ifmap.float() n, ci, ih, iw = ifmap.shape bn = torch.nn.BatchNorm2d(ci) bn.weight.requires_grad = False bn.bias.requires_grad = False - running_mean = torch.randn_like(bn.running_mean, requires_grad=False, dtype=torch.float32) - running_var = torch.rand_like(bn.running_var, requires_grad=False, dtype=torch.float32) + + running_mean = torch.randn(bn.running_mean.shape, requires_grad=False, dtype=torch.float32).to(bn.running_mean.dtype) + running_var = torch.rand(bn.running_var.shape, requires_grad=False, dtype=torch.float32).to(bn.running_var.dtype) gamma = bn.weight / torch.sqrt(running_var + bn.eps) beta = bn.bias - running_mean * bn.weight / torch.sqrt(running_var + bn.eps) @@ -424,12 +439,10 @@ def batchnorm(ifmap): def fused_conv( ifmap, weights, bn_k, bn_l, padding, stride, bn, relu, accumulate, depthwise ): - # Universal Fix: Upcast EVERYTHING before starting the math orig_dtype = ifmap.dtype - ifmap = ifmap.float() - weights = weights.float() - bn_k = bn_k.float() - bn_l = bn_l.float() + is_half = orig_dtype in [torch.float16, torch.bfloat16] + if is_half: + ifmap, weights, bn_k, bn_l = ifmap.float(), weights.float(), bn_k.float(), bn_l.float() ih, iw, ci = ifmap.shape if not depthwise: @@ -443,13 +456,14 @@ def fused_conv( iw + padding["padding_x_left"] + padding["padding_x_right"], ci, requires_grad=False, - dtype=ifmap.dtype, # Safely uses float32 + dtype=ifmap.dtype, ) ifmap_padded[ padding["padding_y_top"] : ih + padding["padding_y_top"], padding["padding_x_left"] : iw + padding["padding_x_left"], ] = ifmap + # Don't cover undefined behaviour when there are steps without a complete kernel window if (ifmap_padded.shape[0] - (fh - 1) - 1) % stride["stride_y"] != 0: print("Warning: rounding h output dimension") if (ifmap_padded.shape[1] - (fw - 1) - 1) % stride["stride_x"] != 
0: @@ -460,16 +474,16 @@ def fused_conv( (ifmap_padded.shape[1] - (fw - 1) - 1) // stride["stride_x"] + 1, co, ) - if accumulate: - ofmap_before = torch.randn_like(ofmap, requires_grad=False, dtype=torch.float32) + ofmap_before = torch.randn(ofmap.shape, requires_grad=False, dtype=torch.float32).to(ofmap.dtype) else: - ofmap_before = torch.zeros_like(ofmap, requires_grad=False, dtype=torch.float32) + ofmap_before = torch.zeros_like(ofmap, requires_grad=False) if verbose: print(ifmap.shape, ifmap_padded.shape, ofmap.shape) if depthwise: + # depthwise Conv2d for h in range(0, ifmap_padded.shape[0] - (fh - 1), stride["stride_y"]): for w in range(0, ifmap_padded.shape[1] - (fw - 1), stride["stride_x"]): for c in range(co): @@ -480,6 +494,7 @@ def fused_conv( weights[:, :, c].flatten(), ) else: + # Conv2d for h in range(0, ifmap_padded.shape[0] - (fh - 1), stride["stride_y"]): for w in range(0, ifmap_padded.shape[1] - (fw - 1), stride["stride_x"]): for c in range(co): @@ -492,22 +507,34 @@ def fused_conv( ofmap += ofmap_before + # BatchNorm if bn: ofmap = ofmap * bn_k + bn_l + # ReLU if relu: ofmap = torch.nn.functional.relu(ofmap) - # Universal Fix: Downcast back - return ofmap.to(orig_dtype), ofmap_before.to(orig_dtype), ifmap_padded.to(orig_dtype) + if is_half: + ofmap = ofmap.to(orig_dtype) + ofmap_before = ofmap_before.to(orig_dtype) + ifmap_padded = ifmap_padded.to(orig_dtype) + + return ofmap, ofmap_before, ifmap_padded def main(): + parser = argparse.ArgumentParser(description="Generate data for kernels") parser.add_argument( - "-c", "--cfg", type=pathlib.Path, required=True, help="Select param config file kernel", + "-c", + "--cfg", + type=pathlib.Path, + required=True, + help="Select param config file kernel", ) parser.add_argument("-v", "--verbose", action="store_true", help="Set verbose") + args = parser.parse_args() global verbose @@ -526,18 +553,31 @@ def main(): dtype = torch.float32 if param["kernel"] == "Conv2d": - # Generate safely in float32, then 
cast ifmap = torch.randn( - 1, param["channels"]["in"], param["input_dim"]["height"], param["input_dim"]["width"], - requires_grad=False, dtype=torch.float32, + 1, + param["channels"]["in"], + param["input_dim"]["height"], + param["input_dim"]["width"], + requires_grad=False, + dtype=torch.float32, ).to(dtype) weights = torch.randn( - param["channels"]["out"], param["channels"]["in"], param["filter"]["height"], param["filter"]["width"], - requires_grad=False, dtype=torch.float32, + param["channels"]["out"], + param["channels"]["in"], + param["filter"]["height"], + param["filter"]["width"], + requires_grad=False, + dtype=torch.float32, ).to(dtype) - ofmap = conv2d(ifmap, weights, padding=param["filter"]["padding"], stride=param["filter"]["stride"]) + ofmap = conv2d( + ifmap, + weights, + padding=param["filter"]["padding"], + stride=param["filter"]["stride"], + ) + # convert from CHW to HWC format ifmap = ifmap.permute(0, 2, 3, 1) ofmap = ofmap.permute(0, 2, 3, 1) weights = weights.permute(0, 2, 3, 1) @@ -549,8 +589,10 @@ def main(): mat_B, bits_B = rand_data_generator((param["K"], param["N"]), param["prec"]) mat_C, bits_C = rand_data_generator((param["M"], param["N"]), param["prec"]) - # Upcast for CPU Math - result = torch.matmul(mat_A.float(), mat_B.float()).to(dtype) + if mat_A.dtype in [torch.float16, torch.bfloat16]: + result = torch.matmul(mat_A.float(), mat_B.float()).to(mat_A.dtype) + else: + result = torch.matmul(mat_A, mat_B) if param["transpose_A"]: mat_A = mat_A.T @@ -574,16 +616,22 @@ def main(): "bits_B": bits_B, "bits_C": bits_C, } + emit_header_file("GEMM", **kwargs) elif param["kernel"] == "BatchNorm": ifmap = torch.randn( - 1, param["channels"]["in"], param["input_dim"]["height"], param["input_dim"]["width"], - requires_grad=False, dtype=torch.float32, + 1, + param["channels"]["in"], + param["input_dim"]["height"], + param["input_dim"]["width"], + requires_grad=False, + dtype=torch.float32, ).to(dtype) ofmap, gamma, beta = batchnorm(ifmap) + # 
convert from CHW to HWC format ifmap = ifmap.permute(0, 2, 3, 1) ofmap = ofmap.permute(0, 2, 3, 1) @@ -592,12 +640,17 @@ def main(): elif param["kernel"] == "MaxPool": ifmap = torch.randn( - 1, param["channels"]["in"], param["input_dim"]["height"], param["input_dim"]["width"], - requires_grad=False, dtype=torch.float32, + 1, + param["channels"]["in"], + param["input_dim"]["height"], + param["input_dim"]["width"], + requires_grad=False, + dtype=torch.float32, ).to(dtype) ofmap = max_pooling(ifmap, param["kernel_size"]) + # convert from CHW to HWC format ifmap = ifmap.permute(0, 2, 3, 1) ofmap = ofmap.permute(0, 2, 3, 1) @@ -606,28 +659,44 @@ def main(): elif param["kernel"] == "FusedConv": ifmap = torch.randn( - param["dim_in_y"], param["dim_in_x"], param["ch_in"], - requires_grad=False, dtype=torch.float32, + param["dim_in_y"], + param["dim_in_x"], + param["ch_in"], + requires_grad=False, + dtype=torch.float32, ).to(dtype) - if not param["depthwise"]: kernel = torch.randn( - param["ch_out"], param["dim_kernel_y"], param["dim_kernel_x"], param["ch_in"], - requires_grad=False, dtype=torch.float32, + param["ch_out"], + param["dim_kernel_y"], + param["dim_kernel_x"], + param["ch_in"], + requires_grad=False, + dtype=torch.float32, ).to(dtype) else: kernel = torch.randn( - param["dim_kernel_y"], param["dim_kernel_x"], param["ch_in"], - requires_grad=False, dtype=torch.float32, + param["dim_kernel_y"], + param["dim_kernel_x"], + param["ch_in"], + requires_grad=False, + dtype=torch.float32, ).to(dtype) bn_k = torch.randn(param["ch_out"], requires_grad=False, dtype=torch.float32).to(dtype) bn_l = torch.randn(param["ch_out"], requires_grad=False, dtype=torch.float32).to(dtype) ofmap, ofmap_before, ifmap_padded = fused_conv( - ifmap, kernel, bn_k, bn_l, param["padding"], param["stride"], - param["flags"]["flag_batch_norm"], param["flags"]["flag_relu"], - not param["flags"]["flag_y_accumulate_start"], param["depthwise"], + ifmap, + kernel, + bn_k, + bn_l, + 
padding=param["padding"], + stride=param["stride"], + bn=param["flags"]["flag_batch_norm"], + relu=param["flags"]["flag_relu"], + accumulate=not param["flags"]["flag_y_accumulate_start"], + depthwise=param["depthwise"], ) if param["chw_layer"]: @@ -655,5 +724,6 @@ def main(): else: print("No valid kernel selected") + if __name__ == "__main__": main() diff --git a/sw/spatzBenchmarks/sdotp-hp-fmatmul/script/gen_data.py b/sw/spatzBenchmarks/sdotp-hp-fmatmul/script/gen_data.py index 0c500558..4da0e20d 100755 --- a/sw/spatzBenchmarks/sdotp-hp-fmatmul/script/gen_data.py +++ b/sw/spatzBenchmarks/sdotp-hp-fmatmul/script/gen_data.py @@ -26,6 +26,9 @@ def array_to_cstr(a, fmt=float): if isinstance(a, np.ndarray): a = a.flat if isinstance(a, torch.Tensor): + # Fallback to float32 if numpy struggles with bfloat16, otherwise leave alone + if a.dtype == torch.bfloat16: + a = a.float() a = a.cpu().numpy().flat for el in a: out += "{}, ".format(el) @@ -42,6 +45,7 @@ def array_to_cstr(a, fmt=float): def emit_header_file(layer_type: str, **kwargs): + file_path = pathlib.Path(__file__).parent.parent / "data" file_path.mkdir(parents=True, exist_ok=True) emit_str = ( @@ -95,7 +99,7 @@ def emit_conv2d_layer(name="conv2d", **kwargs): layer_str += f'static double {name}_result[{oh}][{ow}][{co}] __attribute__((section(".data")));\n\n' layer_str += ( f"static double {name}_checksum[{oh}][{ow}] = " - + array_to_cstr(torch.sum(ofmap.float(), dim=-1)) # Safe sum for checksum + + array_to_cstr(torch.sum(ofmap.float(), dim=-1)) + ";\n\n\n" ) layer_str += ( @@ -118,6 +122,7 @@ def emit_conv2d_layer(name="conv2d", **kwargs): def emit_linear_layer(input, weights, ofmap): + layer_str = "" return layer_str @@ -166,7 +171,7 @@ def emit_GEMM_layer(name="gemm", **kwargs): ) layer_str += ( f"static const {dtype} {name}_result[{m}*{n}] = " - + array_to_cstr(torch.sum(result.float(), dim=-1)) + + array_to_cstr(result) + ";\n\n\n" ) else: @@ -190,6 +195,7 @@ def emit_GEMM_layer(name="gemm", **kwargs): def 
emit_batchnorm_layer(name="batchnorm", **kwargs): + ifmap = kwargs["ifmap"] ofmap = kwargs["ofmap"] beta = kwargs["beta"] @@ -236,6 +242,7 @@ def emit_batchnorm_layer(name="batchnorm", **kwargs): def emit_maxpool_layer(name="maxpool", **kwargs): + ifmap = kwargs["ifmap"] ofmap = kwargs["ofmap"] k = kwargs["kernel_size"] @@ -277,6 +284,7 @@ def emit_maxpool_layer(name="maxpool", **kwargs): def emit_fusedconv(name="fusedconv", **kwargs): + ifmap = kwargs["ifmap"] kernel = kwargs["kernel"] bn_k = kwargs["bn_k"] @@ -358,43 +366,48 @@ def rand_data_generator(shape, prec, alt=False): return torch.randn(shape, requires_grad=False, dtype=torch.float32), {} elif prec == 16: if alt: - # Universal Fix: Generate FP32, cast to BF16 return torch.randn(shape, requires_grad=False, dtype=torch.float32).to(torch.bfloat16), {} else: - # Universal Fix: Generate FP32, cast to FP16 return torch.randn(shape, requires_grad=False, dtype=torch.float32, device=device).to(torch.float16), {} elif prec == 8: - sign = torch.randint(0, 2, shape, requires_grad=False, dtype=torch.uint8) - exponent = torch.randint(0, 16, shape, requires_grad=False, dtype=torch.uint8) - mantissa = torch.randint(0, 4, shape, requires_grad=False, dtype=torch.uint8) + sign = torch.randint( + 0, 2, shape, requires_grad=False, dtype=torch.uint8 + ) # -1 or 1 + exponent = torch.randint( + 0, 16, shape, requires_grad=False, dtype=torch.uint8 + ) # < 0b01111 + mantissa = torch.randint( + 0, 4, shape, requires_grad=False, dtype=torch.uint8 + ) # can be arbitrary bits = {"sign": sign, "exponent": exponent, "mantissa": mantissa} + # TODO: not actually correct return ((-1.0) ** sign.double()) * (2.0 ** (exponent.double() - 15.0)) * ( 1.0 + mantissa.double() / (2**2) ), bits def conv2d(ifmap, weights, padding=1, stride=1): - # Universal Fix: Upcast for CPU Math orig_dtype = ifmap.dtype - ifmap, weights = ifmap.float(), weights.float() + if orig_dtype in [torch.float16, torch.bfloat16]: + ifmap, weights = ifmap.float(), 
weights.float() n, ci, ih, iw = ifmap.shape co, _, fh, fw = weights.shape conv2d = nn.Conv2d(ci, co, (fh, fw), padding=((fh - 1) // 2, (fw - 1) // 2)) conv2d.weight = nn.Parameter(weights, requires_grad=False) - # Ensure bias natively generates in float32 - conv2d.bias = nn.Parameter(torch.zeros_like(conv2d.bias, dtype=torch.float32), requires_grad=False) - + conv2d.bias = nn.Parameter( + torch.zeros_like(conv2d.bias, dtype=weights.dtype), requires_grad=False + ) ofmap = conv2d(ifmap) - # Universal Fix: Downcast back return ofmap.to(orig_dtype) def max_pooling(ifmap, kernel): orig_dtype = ifmap.dtype - ifmap = ifmap.float() + if orig_dtype in [torch.float16, torch.bfloat16]: + ifmap = ifmap.float() n, ci, ih, iw = ifmap.shape max_pool = nn.MaxPool2d(kernel_size=kernel) @@ -405,14 +418,16 @@ def max_pooling(ifmap, kernel): def batchnorm(ifmap): orig_dtype = ifmap.dtype - ifmap = ifmap.float() + if orig_dtype in [torch.float16, torch.bfloat16]: + ifmap = ifmap.float() n, ci, ih, iw = ifmap.shape bn = torch.nn.BatchNorm2d(ci) bn.weight.requires_grad = False bn.bias.requires_grad = False - running_mean = torch.randn_like(bn.running_mean, requires_grad=False, dtype=torch.float32) - running_var = torch.rand_like(bn.running_var, requires_grad=False, dtype=torch.float32) + + running_mean = torch.randn(bn.running_mean.shape, requires_grad=False, dtype=torch.float32).to(bn.running_mean.dtype) + running_var = torch.rand(bn.running_var.shape, requires_grad=False, dtype=torch.float32).to(bn.running_var.dtype) gamma = bn.weight / torch.sqrt(running_var + bn.eps) beta = bn.bias - running_mean * bn.weight / torch.sqrt(running_var + bn.eps) @@ -424,12 +439,10 @@ def batchnorm(ifmap): def fused_conv( ifmap, weights, bn_k, bn_l, padding, stride, bn, relu, accumulate, depthwise ): - # Universal Fix: Upcast EVERYTHING before starting the math orig_dtype = ifmap.dtype - ifmap = ifmap.float() - weights = weights.float() - bn_k = bn_k.float() - bn_l = bn_l.float() + is_half = orig_dtype 
in [torch.float16, torch.bfloat16] + if is_half: + ifmap, weights, bn_k, bn_l = ifmap.float(), weights.float(), bn_k.float(), bn_l.float() ih, iw, ci = ifmap.shape if not depthwise: @@ -443,13 +456,14 @@ def fused_conv( iw + padding["padding_x_left"] + padding["padding_x_right"], ci, requires_grad=False, - dtype=ifmap.dtype, # Safely uses float32 + dtype=ifmap.dtype, ) ifmap_padded[ padding["padding_y_top"] : ih + padding["padding_y_top"], padding["padding_x_left"] : iw + padding["padding_x_left"], ] = ifmap + # Don't cover undefined behaviour when there are steps without a complete kernel window if (ifmap_padded.shape[0] - (fh - 1) - 1) % stride["stride_y"] != 0: print("Warning: rounding h output dimension") if (ifmap_padded.shape[1] - (fw - 1) - 1) % stride["stride_x"] != 0: @@ -460,16 +474,16 @@ def fused_conv( (ifmap_padded.shape[1] - (fw - 1) - 1) // stride["stride_x"] + 1, co, ) - if accumulate: - ofmap_before = torch.randn_like(ofmap, requires_grad=False, dtype=torch.float32) + ofmap_before = torch.randn(ofmap.shape, requires_grad=False, dtype=torch.float32).to(ofmap.dtype) else: - ofmap_before = torch.zeros_like(ofmap, requires_grad=False, dtype=torch.float32) + ofmap_before = torch.zeros_like(ofmap, requires_grad=False) if verbose: print(ifmap.shape, ifmap_padded.shape, ofmap.shape) if depthwise: + # depthwise Conv2d for h in range(0, ifmap_padded.shape[0] - (fh - 1), stride["stride_y"]): for w in range(0, ifmap_padded.shape[1] - (fw - 1), stride["stride_x"]): for c in range(co): @@ -480,6 +494,7 @@ def fused_conv( weights[:, :, c].flatten(), ) else: + # Conv2d for h in range(0, ifmap_padded.shape[0] - (fh - 1), stride["stride_y"]): for w in range(0, ifmap_padded.shape[1] - (fw - 1), stride["stride_x"]): for c in range(co): @@ -492,22 +507,34 @@ def fused_conv( ofmap += ofmap_before + # BatchNorm if bn: ofmap = ofmap * bn_k + bn_l + # ReLU if relu: ofmap = torch.nn.functional.relu(ofmap) - # Universal Fix: Downcast back - return ofmap.to(orig_dtype), 
ofmap_before.to(orig_dtype), ifmap_padded.to(orig_dtype) + if is_half: + ofmap = ofmap.to(orig_dtype) + ofmap_before = ofmap_before.to(orig_dtype) + ifmap_padded = ifmap_padded.to(orig_dtype) + + return ofmap, ofmap_before, ifmap_padded def main(): + parser = argparse.ArgumentParser(description="Generate data for kernels") parser.add_argument( - "-c", "--cfg", type=pathlib.Path, required=True, help="Select param config file kernel", + "-c", + "--cfg", + type=pathlib.Path, + required=True, + help="Select param config file kernel", ) parser.add_argument("-v", "--verbose", action="store_true", help="Set verbose") + args = parser.parse_args() global verbose @@ -526,18 +553,31 @@ def main(): dtype = torch.float32 if param["kernel"] == "Conv2d": - # Generate safely in float32, then cast ifmap = torch.randn( - 1, param["channels"]["in"], param["input_dim"]["height"], param["input_dim"]["width"], - requires_grad=False, dtype=torch.float32, + 1, + param["channels"]["in"], + param["input_dim"]["height"], + param["input_dim"]["width"], + requires_grad=False, + dtype=torch.float32, ).to(dtype) weights = torch.randn( - param["channels"]["out"], param["channels"]["in"], param["filter"]["height"], param["filter"]["width"], - requires_grad=False, dtype=torch.float32, + param["channels"]["out"], + param["channels"]["in"], + param["filter"]["height"], + param["filter"]["width"], + requires_grad=False, + dtype=torch.float32, ).to(dtype) - ofmap = conv2d(ifmap, weights, padding=param["filter"]["padding"], stride=param["filter"]["stride"]) + ofmap = conv2d( + ifmap, + weights, + padding=param["filter"]["padding"], + stride=param["filter"]["stride"], + ) + # convert from CHW to HWC format ifmap = ifmap.permute(0, 2, 3, 1) ofmap = ofmap.permute(0, 2, 3, 1) weights = weights.permute(0, 2, 3, 1) @@ -549,8 +589,10 @@ def main(): mat_B, bits_B = rand_data_generator((param["K"], param["N"]), param["prec"]) mat_C, bits_C = rand_data_generator((param["M"], param["N"]), param["prec"]) - # Upcast 
for CPU Math - result = torch.matmul(mat_A.float(), mat_B.float()).to(dtype) + if mat_A.dtype in [torch.float16, torch.bfloat16]: + result = torch.matmul(mat_A.float(), mat_B.float()).to(mat_A.dtype) + else: + result = torch.matmul(mat_A, mat_B) if param["transpose_A"]: mat_A = mat_A.T @@ -574,16 +616,22 @@ def main(): "bits_B": bits_B, "bits_C": bits_C, } + emit_header_file("GEMM", **kwargs) elif param["kernel"] == "BatchNorm": ifmap = torch.randn( - 1, param["channels"]["in"], param["input_dim"]["height"], param["input_dim"]["width"], - requires_grad=False, dtype=torch.float32, + 1, + param["channels"]["in"], + param["input_dim"]["height"], + param["input_dim"]["width"], + requires_grad=False, + dtype=torch.float32, ).to(dtype) ofmap, gamma, beta = batchnorm(ifmap) + # convert from CHW to HWC format ifmap = ifmap.permute(0, 2, 3, 1) ofmap = ofmap.permute(0, 2, 3, 1) @@ -592,12 +640,17 @@ def main(): elif param["kernel"] == "MaxPool": ifmap = torch.randn( - 1, param["channels"]["in"], param["input_dim"]["height"], param["input_dim"]["width"], - requires_grad=False, dtype=torch.float32, + 1, + param["channels"]["in"], + param["input_dim"]["height"], + param["input_dim"]["width"], + requires_grad=False, + dtype=torch.float32, ).to(dtype) ofmap = max_pooling(ifmap, param["kernel_size"]) + # convert from CHW to HWC format ifmap = ifmap.permute(0, 2, 3, 1) ofmap = ofmap.permute(0, 2, 3, 1) @@ -606,28 +659,44 @@ def main(): elif param["kernel"] == "FusedConv": ifmap = torch.randn( - param["dim_in_y"], param["dim_in_x"], param["ch_in"], - requires_grad=False, dtype=torch.float32, + param["dim_in_y"], + param["dim_in_x"], + param["ch_in"], + requires_grad=False, + dtype=torch.float32, ).to(dtype) - if not param["depthwise"]: kernel = torch.randn( - param["ch_out"], param["dim_kernel_y"], param["dim_kernel_x"], param["ch_in"], - requires_grad=False, dtype=torch.float32, + param["ch_out"], + param["dim_kernel_y"], + param["dim_kernel_x"], + param["ch_in"], + 
requires_grad=False, + dtype=torch.float32, ).to(dtype) else: kernel = torch.randn( - param["dim_kernel_y"], param["dim_kernel_x"], param["ch_in"], - requires_grad=False, dtype=torch.float32, + param["dim_kernel_y"], + param["dim_kernel_x"], + param["ch_in"], + requires_grad=False, + dtype=torch.float32, ).to(dtype) bn_k = torch.randn(param["ch_out"], requires_grad=False, dtype=torch.float32).to(dtype) bn_l = torch.randn(param["ch_out"], requires_grad=False, dtype=torch.float32).to(dtype) ofmap, ofmap_before, ifmap_padded = fused_conv( - ifmap, kernel, bn_k, bn_l, param["padding"], param["stride"], - param["flags"]["flag_batch_norm"], param["flags"]["flag_relu"], - not param["flags"]["flag_y_accumulate_start"], param["depthwise"], + ifmap, + kernel, + bn_k, + bn_l, + padding=param["padding"], + stride=param["stride"], + bn=param["flags"]["flag_batch_norm"], + relu=param["flags"]["flag_relu"], + accumulate=not param["flags"]["flag_y_accumulate_start"], + depthwise=param["depthwise"], ) if param["chw_layer"]: @@ -655,5 +724,6 @@ def main(): else: print("No valid kernel selected") + if __name__ == "__main__": main() diff --git a/sw/spatzBenchmarks/widening-bp-fmatmul/script/gen_data.py b/sw/spatzBenchmarks/widening-bp-fmatmul/script/gen_data.py index 0c500558..4da0e20d 100755 --- a/sw/spatzBenchmarks/widening-bp-fmatmul/script/gen_data.py +++ b/sw/spatzBenchmarks/widening-bp-fmatmul/script/gen_data.py @@ -26,6 +26,9 @@ def array_to_cstr(a, fmt=float): if isinstance(a, np.ndarray): a = a.flat if isinstance(a, torch.Tensor): + # Fallback to float32 if numpy struggles with bfloat16, otherwise leave alone + if a.dtype == torch.bfloat16: + a = a.float() a = a.cpu().numpy().flat for el in a: out += "{}, ".format(el) @@ -42,6 +45,7 @@ def array_to_cstr(a, fmt=float): def emit_header_file(layer_type: str, **kwargs): + file_path = pathlib.Path(__file__).parent.parent / "data" file_path.mkdir(parents=True, exist_ok=True) emit_str = ( @@ -95,7 +99,7 @@ def 
emit_conv2d_layer(name="conv2d", **kwargs): layer_str += f'static double {name}_result[{oh}][{ow}][{co}] __attribute__((section(".data")));\n\n' layer_str += ( f"static double {name}_checksum[{oh}][{ow}] = " - + array_to_cstr(torch.sum(ofmap.float(), dim=-1)) # Safe sum for checksum + + array_to_cstr(torch.sum(ofmap.float(), dim=-1)) + ";\n\n\n" ) layer_str += ( @@ -118,6 +122,7 @@ def emit_conv2d_layer(name="conv2d", **kwargs): def emit_linear_layer(input, weights, ofmap): + layer_str = "" return layer_str @@ -166,7 +171,7 @@ def emit_GEMM_layer(name="gemm", **kwargs): ) layer_str += ( f"static const {dtype} {name}_result[{m}*{n}] = " - + array_to_cstr(torch.sum(result.float(), dim=-1)) + + array_to_cstr(result) + ";\n\n\n" ) else: @@ -190,6 +195,7 @@ def emit_GEMM_layer(name="gemm", **kwargs): def emit_batchnorm_layer(name="batchnorm", **kwargs): + ifmap = kwargs["ifmap"] ofmap = kwargs["ofmap"] beta = kwargs["beta"] @@ -236,6 +242,7 @@ def emit_batchnorm_layer(name="batchnorm", **kwargs): def emit_maxpool_layer(name="maxpool", **kwargs): + ifmap = kwargs["ifmap"] ofmap = kwargs["ofmap"] k = kwargs["kernel_size"] @@ -277,6 +284,7 @@ def emit_maxpool_layer(name="maxpool", **kwargs): def emit_fusedconv(name="fusedconv", **kwargs): + ifmap = kwargs["ifmap"] kernel = kwargs["kernel"] bn_k = kwargs["bn_k"] @@ -358,43 +366,48 @@ def rand_data_generator(shape, prec, alt=False): return torch.randn(shape, requires_grad=False, dtype=torch.float32), {} elif prec == 16: if alt: - # Universal Fix: Generate FP32, cast to BF16 return torch.randn(shape, requires_grad=False, dtype=torch.float32).to(torch.bfloat16), {} else: - # Universal Fix: Generate FP32, cast to FP16 return torch.randn(shape, requires_grad=False, dtype=torch.float32, device=device).to(torch.float16), {} elif prec == 8: - sign = torch.randint(0, 2, shape, requires_grad=False, dtype=torch.uint8) - exponent = torch.randint(0, 16, shape, requires_grad=False, dtype=torch.uint8) - mantissa = torch.randint(0, 4, 
shape, requires_grad=False, dtype=torch.uint8) + sign = torch.randint( + 0, 2, shape, requires_grad=False, dtype=torch.uint8 + ) # -1 or 1 + exponent = torch.randint( + 0, 16, shape, requires_grad=False, dtype=torch.uint8 + ) # < 0b01111 + mantissa = torch.randint( + 0, 4, shape, requires_grad=False, dtype=torch.uint8 + ) # can be arbitrary bits = {"sign": sign, "exponent": exponent, "mantissa": mantissa} + # TODO: not actually correct return ((-1.0) ** sign.double()) * (2.0 ** (exponent.double() - 15.0)) * ( 1.0 + mantissa.double() / (2**2) ), bits def conv2d(ifmap, weights, padding=1, stride=1): - # Universal Fix: Upcast for CPU Math orig_dtype = ifmap.dtype - ifmap, weights = ifmap.float(), weights.float() + if orig_dtype in [torch.float16, torch.bfloat16]: + ifmap, weights = ifmap.float(), weights.float() n, ci, ih, iw = ifmap.shape co, _, fh, fw = weights.shape conv2d = nn.Conv2d(ci, co, (fh, fw), padding=((fh - 1) // 2, (fw - 1) // 2)) conv2d.weight = nn.Parameter(weights, requires_grad=False) - # Ensure bias natively generates in float32 - conv2d.bias = nn.Parameter(torch.zeros_like(conv2d.bias, dtype=torch.float32), requires_grad=False) - + conv2d.bias = nn.Parameter( + torch.zeros_like(conv2d.bias, dtype=weights.dtype), requires_grad=False + ) ofmap = conv2d(ifmap) - # Universal Fix: Downcast back return ofmap.to(orig_dtype) def max_pooling(ifmap, kernel): orig_dtype = ifmap.dtype - ifmap = ifmap.float() + if orig_dtype in [torch.float16, torch.bfloat16]: + ifmap = ifmap.float() n, ci, ih, iw = ifmap.shape max_pool = nn.MaxPool2d(kernel_size=kernel) @@ -405,14 +418,16 @@ def max_pooling(ifmap, kernel): def batchnorm(ifmap): orig_dtype = ifmap.dtype - ifmap = ifmap.float() + if orig_dtype in [torch.float16, torch.bfloat16]: + ifmap = ifmap.float() n, ci, ih, iw = ifmap.shape bn = torch.nn.BatchNorm2d(ci) bn.weight.requires_grad = False bn.bias.requires_grad = False - running_mean = torch.randn_like(bn.running_mean, requires_grad=False, dtype=torch.float32) 
- running_var = torch.rand_like(bn.running_var, requires_grad=False, dtype=torch.float32) + + running_mean = torch.randn(bn.running_mean.shape, requires_grad=False, dtype=torch.float32).to(bn.running_mean.dtype) + running_var = torch.rand(bn.running_var.shape, requires_grad=False, dtype=torch.float32).to(bn.running_var.dtype) gamma = bn.weight / torch.sqrt(running_var + bn.eps) beta = bn.bias - running_mean * bn.weight / torch.sqrt(running_var + bn.eps) @@ -424,12 +439,10 @@ def batchnorm(ifmap): def fused_conv( ifmap, weights, bn_k, bn_l, padding, stride, bn, relu, accumulate, depthwise ): - # Universal Fix: Upcast EVERYTHING before starting the math orig_dtype = ifmap.dtype - ifmap = ifmap.float() - weights = weights.float() - bn_k = bn_k.float() - bn_l = bn_l.float() + is_half = orig_dtype in [torch.float16, torch.bfloat16] + if is_half: + ifmap, weights, bn_k, bn_l = ifmap.float(), weights.float(), bn_k.float(), bn_l.float() ih, iw, ci = ifmap.shape if not depthwise: @@ -443,13 +456,14 @@ def fused_conv( iw + padding["padding_x_left"] + padding["padding_x_right"], ci, requires_grad=False, - dtype=ifmap.dtype, # Safely uses float32 + dtype=ifmap.dtype, ) ifmap_padded[ padding["padding_y_top"] : ih + padding["padding_y_top"], padding["padding_x_left"] : iw + padding["padding_x_left"], ] = ifmap + # Don't cover undefined behaviour when there are steps without a complete kernel window if (ifmap_padded.shape[0] - (fh - 1) - 1) % stride["stride_y"] != 0: print("Warning: rounding h output dimension") if (ifmap_padded.shape[1] - (fw - 1) - 1) % stride["stride_x"] != 0: @@ -460,16 +474,16 @@ def fused_conv( (ifmap_padded.shape[1] - (fw - 1) - 1) // stride["stride_x"] + 1, co, ) - if accumulate: - ofmap_before = torch.randn_like(ofmap, requires_grad=False, dtype=torch.float32) + ofmap_before = torch.randn(ofmap.shape, requires_grad=False, dtype=torch.float32).to(ofmap.dtype) else: - ofmap_before = torch.zeros_like(ofmap, requires_grad=False, dtype=torch.float32) + 
ofmap_before = torch.zeros_like(ofmap, requires_grad=False) if verbose: print(ifmap.shape, ifmap_padded.shape, ofmap.shape) if depthwise: + # depthwise Conv2d for h in range(0, ifmap_padded.shape[0] - (fh - 1), stride["stride_y"]): for w in range(0, ifmap_padded.shape[1] - (fw - 1), stride["stride_x"]): for c in range(co): @@ -480,6 +494,7 @@ def fused_conv( weights[:, :, c].flatten(), ) else: + # Conv2d for h in range(0, ifmap_padded.shape[0] - (fh - 1), stride["stride_y"]): for w in range(0, ifmap_padded.shape[1] - (fw - 1), stride["stride_x"]): for c in range(co): @@ -492,22 +507,34 @@ def fused_conv( ofmap += ofmap_before + # BatchNorm if bn: ofmap = ofmap * bn_k + bn_l + # ReLU if relu: ofmap = torch.nn.functional.relu(ofmap) - # Universal Fix: Downcast back - return ofmap.to(orig_dtype), ofmap_before.to(orig_dtype), ifmap_padded.to(orig_dtype) + if is_half: + ofmap = ofmap.to(orig_dtype) + ofmap_before = ofmap_before.to(orig_dtype) + ifmap_padded = ifmap_padded.to(orig_dtype) + + return ofmap, ofmap_before, ifmap_padded def main(): + parser = argparse.ArgumentParser(description="Generate data for kernels") parser.add_argument( - "-c", "--cfg", type=pathlib.Path, required=True, help="Select param config file kernel", + "-c", + "--cfg", + type=pathlib.Path, + required=True, + help="Select param config file kernel", ) parser.add_argument("-v", "--verbose", action="store_true", help="Set verbose") + args = parser.parse_args() global verbose @@ -526,18 +553,31 @@ def main(): dtype = torch.float32 if param["kernel"] == "Conv2d": - # Generate safely in float32, then cast ifmap = torch.randn( - 1, param["channels"]["in"], param["input_dim"]["height"], param["input_dim"]["width"], - requires_grad=False, dtype=torch.float32, + 1, + param["channels"]["in"], + param["input_dim"]["height"], + param["input_dim"]["width"], + requires_grad=False, + dtype=torch.float32, ).to(dtype) weights = torch.randn( - param["channels"]["out"], param["channels"]["in"], 
param["filter"]["height"], param["filter"]["width"], - requires_grad=False, dtype=torch.float32, + param["channels"]["out"], + param["channels"]["in"], + param["filter"]["height"], + param["filter"]["width"], + requires_grad=False, + dtype=torch.float32, ).to(dtype) - ofmap = conv2d(ifmap, weights, padding=param["filter"]["padding"], stride=param["filter"]["stride"]) + ofmap = conv2d( + ifmap, + weights, + padding=param["filter"]["padding"], + stride=param["filter"]["stride"], + ) + # convert from CHW to HWC format ifmap = ifmap.permute(0, 2, 3, 1) ofmap = ofmap.permute(0, 2, 3, 1) weights = weights.permute(0, 2, 3, 1) @@ -549,8 +589,10 @@ def main(): mat_B, bits_B = rand_data_generator((param["K"], param["N"]), param["prec"]) mat_C, bits_C = rand_data_generator((param["M"], param["N"]), param["prec"]) - # Upcast for CPU Math - result = torch.matmul(mat_A.float(), mat_B.float()).to(dtype) + if mat_A.dtype in [torch.float16, torch.bfloat16]: + result = torch.matmul(mat_A.float(), mat_B.float()).to(mat_A.dtype) + else: + result = torch.matmul(mat_A, mat_B) if param["transpose_A"]: mat_A = mat_A.T @@ -574,16 +616,22 @@ def main(): "bits_B": bits_B, "bits_C": bits_C, } + emit_header_file("GEMM", **kwargs) elif param["kernel"] == "BatchNorm": ifmap = torch.randn( - 1, param["channels"]["in"], param["input_dim"]["height"], param["input_dim"]["width"], - requires_grad=False, dtype=torch.float32, + 1, + param["channels"]["in"], + param["input_dim"]["height"], + param["input_dim"]["width"], + requires_grad=False, + dtype=torch.float32, ).to(dtype) ofmap, gamma, beta = batchnorm(ifmap) + # convert from CHW to HWC format ifmap = ifmap.permute(0, 2, 3, 1) ofmap = ofmap.permute(0, 2, 3, 1) @@ -592,12 +640,17 @@ def main(): elif param["kernel"] == "MaxPool": ifmap = torch.randn( - 1, param["channels"]["in"], param["input_dim"]["height"], param["input_dim"]["width"], - requires_grad=False, dtype=torch.float32, + 1, + param["channels"]["in"], + param["input_dim"]["height"], + 
param["input_dim"]["width"], + requires_grad=False, + dtype=torch.float32, ).to(dtype) ofmap = max_pooling(ifmap, param["kernel_size"]) + # convert from CHW to HWC format ifmap = ifmap.permute(0, 2, 3, 1) ofmap = ofmap.permute(0, 2, 3, 1) @@ -606,28 +659,44 @@ def main(): elif param["kernel"] == "FusedConv": ifmap = torch.randn( - param["dim_in_y"], param["dim_in_x"], param["ch_in"], - requires_grad=False, dtype=torch.float32, + param["dim_in_y"], + param["dim_in_x"], + param["ch_in"], + requires_grad=False, + dtype=torch.float32, ).to(dtype) - if not param["depthwise"]: kernel = torch.randn( - param["ch_out"], param["dim_kernel_y"], param["dim_kernel_x"], param["ch_in"], - requires_grad=False, dtype=torch.float32, + param["ch_out"], + param["dim_kernel_y"], + param["dim_kernel_x"], + param["ch_in"], + requires_grad=False, + dtype=torch.float32, ).to(dtype) else: kernel = torch.randn( - param["dim_kernel_y"], param["dim_kernel_x"], param["ch_in"], - requires_grad=False, dtype=torch.float32, + param["dim_kernel_y"], + param["dim_kernel_x"], + param["ch_in"], + requires_grad=False, + dtype=torch.float32, ).to(dtype) bn_k = torch.randn(param["ch_out"], requires_grad=False, dtype=torch.float32).to(dtype) bn_l = torch.randn(param["ch_out"], requires_grad=False, dtype=torch.float32).to(dtype) ofmap, ofmap_before, ifmap_padded = fused_conv( - ifmap, kernel, bn_k, bn_l, param["padding"], param["stride"], - param["flags"]["flag_batch_norm"], param["flags"]["flag_relu"], - not param["flags"]["flag_y_accumulate_start"], param["depthwise"], + ifmap, + kernel, + bn_k, + bn_l, + padding=param["padding"], + stride=param["stride"], + bn=param["flags"]["flag_batch_norm"], + relu=param["flags"]["flag_relu"], + accumulate=not param["flags"]["flag_y_accumulate_start"], + depthwise=param["depthwise"], ) if param["chw_layer"]: @@ -655,5 +724,6 @@ def main(): else: print("No valid kernel selected") + if __name__ == "__main__": main() diff --git 
a/sw/spatzBenchmarks/widening-hp-fmatmul/script/gen_data.py b/sw/spatzBenchmarks/widening-hp-fmatmul/script/gen_data.py index 0c500558..4da0e20d 100755 --- a/sw/spatzBenchmarks/widening-hp-fmatmul/script/gen_data.py +++ b/sw/spatzBenchmarks/widening-hp-fmatmul/script/gen_data.py @@ -26,6 +26,9 @@ def array_to_cstr(a, fmt=float): if isinstance(a, np.ndarray): a = a.flat if isinstance(a, torch.Tensor): + # Fallback to float32 if numpy struggles with bfloat16, otherwise leave alone + if a.dtype == torch.bfloat16: + a = a.float() a = a.cpu().numpy().flat for el in a: out += "{}, ".format(el) @@ -42,6 +45,7 @@ def array_to_cstr(a, fmt=float): def emit_header_file(layer_type: str, **kwargs): + file_path = pathlib.Path(__file__).parent.parent / "data" file_path.mkdir(parents=True, exist_ok=True) emit_str = ( @@ -95,7 +99,7 @@ def emit_conv2d_layer(name="conv2d", **kwargs): layer_str += f'static double {name}_result[{oh}][{ow}][{co}] __attribute__((section(".data")));\n\n' layer_str += ( f"static double {name}_checksum[{oh}][{ow}] = " - + array_to_cstr(torch.sum(ofmap.float(), dim=-1)) # Safe sum for checksum + + array_to_cstr(torch.sum(ofmap.float(), dim=-1)) + ";\n\n\n" ) layer_str += ( @@ -118,6 +122,7 @@ def emit_conv2d_layer(name="conv2d", **kwargs): def emit_linear_layer(input, weights, ofmap): + layer_str = "" return layer_str @@ -166,7 +171,7 @@ def emit_GEMM_layer(name="gemm", **kwargs): ) layer_str += ( f"static const {dtype} {name}_result[{m}*{n}] = " - + array_to_cstr(torch.sum(result.float(), dim=-1)) + + array_to_cstr(result) + ";\n\n\n" ) else: @@ -190,6 +195,7 @@ def emit_GEMM_layer(name="gemm", **kwargs): def emit_batchnorm_layer(name="batchnorm", **kwargs): + ifmap = kwargs["ifmap"] ofmap = kwargs["ofmap"] beta = kwargs["beta"] @@ -236,6 +242,7 @@ def emit_batchnorm_layer(name="batchnorm", **kwargs): def emit_maxpool_layer(name="maxpool", **kwargs): + ifmap = kwargs["ifmap"] ofmap = kwargs["ofmap"] k = kwargs["kernel_size"] @@ -277,6 +284,7 @@ def 
emit_maxpool_layer(name="maxpool", **kwargs): def emit_fusedconv(name="fusedconv", **kwargs): + ifmap = kwargs["ifmap"] kernel = kwargs["kernel"] bn_k = kwargs["bn_k"] @@ -358,43 +366,48 @@ def rand_data_generator(shape, prec, alt=False): return torch.randn(shape, requires_grad=False, dtype=torch.float32), {} elif prec == 16: if alt: - # Universal Fix: Generate FP32, cast to BF16 return torch.randn(shape, requires_grad=False, dtype=torch.float32).to(torch.bfloat16), {} else: - # Universal Fix: Generate FP32, cast to FP16 return torch.randn(shape, requires_grad=False, dtype=torch.float32, device=device).to(torch.float16), {} elif prec == 8: - sign = torch.randint(0, 2, shape, requires_grad=False, dtype=torch.uint8) - exponent = torch.randint(0, 16, shape, requires_grad=False, dtype=torch.uint8) - mantissa = torch.randint(0, 4, shape, requires_grad=False, dtype=torch.uint8) + sign = torch.randint( + 0, 2, shape, requires_grad=False, dtype=torch.uint8 + ) # -1 or 1 + exponent = torch.randint( + 0, 16, shape, requires_grad=False, dtype=torch.uint8 + ) # < 0b01111 + mantissa = torch.randint( + 0, 4, shape, requires_grad=False, dtype=torch.uint8 + ) # can be arbitrary bits = {"sign": sign, "exponent": exponent, "mantissa": mantissa} + # TODO: not actually correct return ((-1.0) ** sign.double()) * (2.0 ** (exponent.double() - 15.0)) * ( 1.0 + mantissa.double() / (2**2) ), bits def conv2d(ifmap, weights, padding=1, stride=1): - # Universal Fix: Upcast for CPU Math orig_dtype = ifmap.dtype - ifmap, weights = ifmap.float(), weights.float() + if orig_dtype in [torch.float16, torch.bfloat16]: + ifmap, weights = ifmap.float(), weights.float() n, ci, ih, iw = ifmap.shape co, _, fh, fw = weights.shape conv2d = nn.Conv2d(ci, co, (fh, fw), padding=((fh - 1) // 2, (fw - 1) // 2)) conv2d.weight = nn.Parameter(weights, requires_grad=False) - # Ensure bias natively generates in float32 - conv2d.bias = nn.Parameter(torch.zeros_like(conv2d.bias, dtype=torch.float32), 
requires_grad=False) - + conv2d.bias = nn.Parameter( + torch.zeros_like(conv2d.bias, dtype=weights.dtype), requires_grad=False + ) ofmap = conv2d(ifmap) - # Universal Fix: Downcast back return ofmap.to(orig_dtype) def max_pooling(ifmap, kernel): orig_dtype = ifmap.dtype - ifmap = ifmap.float() + if orig_dtype in [torch.float16, torch.bfloat16]: + ifmap = ifmap.float() n, ci, ih, iw = ifmap.shape max_pool = nn.MaxPool2d(kernel_size=kernel) @@ -405,14 +418,16 @@ def max_pooling(ifmap, kernel): def batchnorm(ifmap): orig_dtype = ifmap.dtype - ifmap = ifmap.float() + if orig_dtype in [torch.float16, torch.bfloat16]: + ifmap = ifmap.float() n, ci, ih, iw = ifmap.shape bn = torch.nn.BatchNorm2d(ci) bn.weight.requires_grad = False bn.bias.requires_grad = False - running_mean = torch.randn_like(bn.running_mean, requires_grad=False, dtype=torch.float32) - running_var = torch.rand_like(bn.running_var, requires_grad=False, dtype=torch.float32) + + running_mean = torch.randn(bn.running_mean.shape, requires_grad=False, dtype=torch.float32).to(bn.running_mean.dtype) + running_var = torch.rand(bn.running_var.shape, requires_grad=False, dtype=torch.float32).to(bn.running_var.dtype) gamma = bn.weight / torch.sqrt(running_var + bn.eps) beta = bn.bias - running_mean * bn.weight / torch.sqrt(running_var + bn.eps) @@ -424,12 +439,10 @@ def batchnorm(ifmap): def fused_conv( ifmap, weights, bn_k, bn_l, padding, stride, bn, relu, accumulate, depthwise ): - # Universal Fix: Upcast EVERYTHING before starting the math orig_dtype = ifmap.dtype - ifmap = ifmap.float() - weights = weights.float() - bn_k = bn_k.float() - bn_l = bn_l.float() + is_half = orig_dtype in [torch.float16, torch.bfloat16] + if is_half: + ifmap, weights, bn_k, bn_l = ifmap.float(), weights.float(), bn_k.float(), bn_l.float() ih, iw, ci = ifmap.shape if not depthwise: @@ -443,13 +456,14 @@ def fused_conv( iw + padding["padding_x_left"] + padding["padding_x_right"], ci, requires_grad=False, - dtype=ifmap.dtype, # Safely 
uses float32 + dtype=ifmap.dtype, ) ifmap_padded[ padding["padding_y_top"] : ih + padding["padding_y_top"], padding["padding_x_left"] : iw + padding["padding_x_left"], ] = ifmap + # Don't cover undefined behaviour when there are steps without a complete kernel window if (ifmap_padded.shape[0] - (fh - 1) - 1) % stride["stride_y"] != 0: print("Warning: rounding h output dimension") if (ifmap_padded.shape[1] - (fw - 1) - 1) % stride["stride_x"] != 0: @@ -460,16 +474,16 @@ def fused_conv( (ifmap_padded.shape[1] - (fw - 1) - 1) // stride["stride_x"] + 1, co, ) - if accumulate: - ofmap_before = torch.randn_like(ofmap, requires_grad=False, dtype=torch.float32) + ofmap_before = torch.randn(ofmap.shape, requires_grad=False, dtype=torch.float32).to(ofmap.dtype) else: - ofmap_before = torch.zeros_like(ofmap, requires_grad=False, dtype=torch.float32) + ofmap_before = torch.zeros_like(ofmap, requires_grad=False) if verbose: print(ifmap.shape, ifmap_padded.shape, ofmap.shape) if depthwise: + # depthwise Conv2d for h in range(0, ifmap_padded.shape[0] - (fh - 1), stride["stride_y"]): for w in range(0, ifmap_padded.shape[1] - (fw - 1), stride["stride_x"]): for c in range(co): @@ -480,6 +494,7 @@ def fused_conv( weights[:, :, c].flatten(), ) else: + # Conv2d for h in range(0, ifmap_padded.shape[0] - (fh - 1), stride["stride_y"]): for w in range(0, ifmap_padded.shape[1] - (fw - 1), stride["stride_x"]): for c in range(co): @@ -492,22 +507,34 @@ def fused_conv( ofmap += ofmap_before + # BatchNorm if bn: ofmap = ofmap * bn_k + bn_l + # ReLU if relu: ofmap = torch.nn.functional.relu(ofmap) - # Universal Fix: Downcast back - return ofmap.to(orig_dtype), ofmap_before.to(orig_dtype), ifmap_padded.to(orig_dtype) + if is_half: + ofmap = ofmap.to(orig_dtype) + ofmap_before = ofmap_before.to(orig_dtype) + ifmap_padded = ifmap_padded.to(orig_dtype) + + return ofmap, ofmap_before, ifmap_padded def main(): + parser = argparse.ArgumentParser(description="Generate data for kernels") 
parser.add_argument( - "-c", "--cfg", type=pathlib.Path, required=True, help="Select param config file kernel", + "-c", + "--cfg", + type=pathlib.Path, + required=True, + help="Select param config file kernel", ) parser.add_argument("-v", "--verbose", action="store_true", help="Set verbose") + args = parser.parse_args() global verbose @@ -526,18 +553,31 @@ def main(): dtype = torch.float32 if param["kernel"] == "Conv2d": - # Generate safely in float32, then cast ifmap = torch.randn( - 1, param["channels"]["in"], param["input_dim"]["height"], param["input_dim"]["width"], - requires_grad=False, dtype=torch.float32, + 1, + param["channels"]["in"], + param["input_dim"]["height"], + param["input_dim"]["width"], + requires_grad=False, + dtype=torch.float32, ).to(dtype) weights = torch.randn( - param["channels"]["out"], param["channels"]["in"], param["filter"]["height"], param["filter"]["width"], - requires_grad=False, dtype=torch.float32, + param["channels"]["out"], + param["channels"]["in"], + param["filter"]["height"], + param["filter"]["width"], + requires_grad=False, + dtype=torch.float32, ).to(dtype) - ofmap = conv2d(ifmap, weights, padding=param["filter"]["padding"], stride=param["filter"]["stride"]) + ofmap = conv2d( + ifmap, + weights, + padding=param["filter"]["padding"], + stride=param["filter"]["stride"], + ) + # convert from CHW to HWC format ifmap = ifmap.permute(0, 2, 3, 1) ofmap = ofmap.permute(0, 2, 3, 1) weights = weights.permute(0, 2, 3, 1) @@ -549,8 +589,10 @@ def main(): mat_B, bits_B = rand_data_generator((param["K"], param["N"]), param["prec"]) mat_C, bits_C = rand_data_generator((param["M"], param["N"]), param["prec"]) - # Upcast for CPU Math - result = torch.matmul(mat_A.float(), mat_B.float()).to(dtype) + if mat_A.dtype in [torch.float16, torch.bfloat16]: + result = torch.matmul(mat_A.float(), mat_B.float()).to(mat_A.dtype) + else: + result = torch.matmul(mat_A, mat_B) if param["transpose_A"]: mat_A = mat_A.T @@ -574,16 +616,22 @@ def main(): 
"bits_B": bits_B, "bits_C": bits_C, } + emit_header_file("GEMM", **kwargs) elif param["kernel"] == "BatchNorm": ifmap = torch.randn( - 1, param["channels"]["in"], param["input_dim"]["height"], param["input_dim"]["width"], - requires_grad=False, dtype=torch.float32, + 1, + param["channels"]["in"], + param["input_dim"]["height"], + param["input_dim"]["width"], + requires_grad=False, + dtype=torch.float32, ).to(dtype) ofmap, gamma, beta = batchnorm(ifmap) + # convert from CHW to HWC format ifmap = ifmap.permute(0, 2, 3, 1) ofmap = ofmap.permute(0, 2, 3, 1) @@ -592,12 +640,17 @@ def main(): elif param["kernel"] == "MaxPool": ifmap = torch.randn( - 1, param["channels"]["in"], param["input_dim"]["height"], param["input_dim"]["width"], - requires_grad=False, dtype=torch.float32, + 1, + param["channels"]["in"], + param["input_dim"]["height"], + param["input_dim"]["width"], + requires_grad=False, + dtype=torch.float32, ).to(dtype) ofmap = max_pooling(ifmap, param["kernel_size"]) + # convert from CHW to HWC format ifmap = ifmap.permute(0, 2, 3, 1) ofmap = ofmap.permute(0, 2, 3, 1) @@ -606,28 +659,44 @@ def main(): elif param["kernel"] == "FusedConv": ifmap = torch.randn( - param["dim_in_y"], param["dim_in_x"], param["ch_in"], - requires_grad=False, dtype=torch.float32, + param["dim_in_y"], + param["dim_in_x"], + param["ch_in"], + requires_grad=False, + dtype=torch.float32, ).to(dtype) - if not param["depthwise"]: kernel = torch.randn( - param["ch_out"], param["dim_kernel_y"], param["dim_kernel_x"], param["ch_in"], - requires_grad=False, dtype=torch.float32, + param["ch_out"], + param["dim_kernel_y"], + param["dim_kernel_x"], + param["ch_in"], + requires_grad=False, + dtype=torch.float32, ).to(dtype) else: kernel = torch.randn( - param["dim_kernel_y"], param["dim_kernel_x"], param["ch_in"], - requires_grad=False, dtype=torch.float32, + param["dim_kernel_y"], + param["dim_kernel_x"], + param["ch_in"], + requires_grad=False, + dtype=torch.float32, ).to(dtype) bn_k = 
torch.randn(param["ch_out"], requires_grad=False, dtype=torch.float32).to(dtype) bn_l = torch.randn(param["ch_out"], requires_grad=False, dtype=torch.float32).to(dtype) ofmap, ofmap_before, ifmap_padded = fused_conv( - ifmap, kernel, bn_k, bn_l, param["padding"], param["stride"], - param["flags"]["flag_batch_norm"], param["flags"]["flag_relu"], - not param["flags"]["flag_y_accumulate_start"], param["depthwise"], + ifmap, + kernel, + bn_k, + bn_l, + padding=param["padding"], + stride=param["stride"], + bn=param["flags"]["flag_batch_norm"], + relu=param["flags"]["flag_relu"], + accumulate=not param["flags"]["flag_y_accumulate_start"], + depthwise=param["depthwise"], ) if param["chw_layer"]: @@ -655,5 +724,6 @@ def main(): else: print("No valid kernel selected") + if __name__ == "__main__": main() From 28ff7fbd562fe291d7f0a4268ed0106aed05c5aa Mon Sep 17 00:00:00 2001 From: Diyou Shen Date: Fri, 24 Apr 2026 16:45:02 +0200 Subject: [PATCH 06/13] [SW] Fix a trailing whitespace in gen_data. --- sw/spatzBenchmarks/gemv/script/gen_data.py | 1 - 1 file changed, 1 deletion(-) diff --git a/sw/spatzBenchmarks/gemv/script/gen_data.py b/sw/spatzBenchmarks/gemv/script/gen_data.py index f33d5154..532d725b 100644 --- a/sw/spatzBenchmarks/gemv/script/gen_data.py +++ b/sw/spatzBenchmarks/gemv/script/gen_data.py @@ -194,4 +194,3 @@ def main(): if __name__ == "__main__": main() - \ No newline at end of file From 218b5a3e4abe458125fe142525df39a322fa34d6 Mon Sep 17 00:00:00 2001 From: Diyou Shen Date: Fri, 24 Apr 2026 16:46:38 +0200 Subject: [PATCH 07/13] [TB] Change error call to fatal to trigger abnormal exit when kernel fails. Deliberately changed a test to test the failure. 
--- hw/ip/snitch_test/src/tb_bin.sv | 2 +- sw/spatzBenchmarks/dp-fdotp/main.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/hw/ip/snitch_test/src/tb_bin.sv b/hw/ip/snitch_test/src/tb_bin.sv index 469a3ebf..1a947f07 100644 --- a/hw/ip/snitch_test/src/tb_bin.sv +++ b/hw/ip/snitch_test/src/tb_bin.sv @@ -48,7 +48,7 @@ module tb_bin; end while (exit_code == 0); exit_code >>= 1; if (exit_code > 0) begin - $error("[FAILURE] Finished with exit code %2d", exit_code); + $fatal("[FAILURE] Finished with exit code %2d", exit_code); end else begin $info("[SUCCESS] Program finished successfully"); end diff --git a/sw/spatzBenchmarks/dp-fdotp/main.c b/sw/spatzBenchmarks/dp-fdotp/main.c index f4a0524d..0b4ac88b 100644 --- a/sw/spatzBenchmarks/dp-fdotp/main.c +++ b/sw/spatzBenchmarks/dp-fdotp/main.c @@ -32,7 +32,7 @@ static inline int fp_check(const double a, const double b) { const double threshold = 0.00001; // Absolute value - double comp = a - b; + double comp = a - 1; if (comp < 0) comp = -comp; From bc32500aa7f8ff9d3eac34240cc9b5eb3c1a3461 Mon Sep 17 00:00:00 2001 From: Diyou Shen Date: Fri, 24 Apr 2026 17:12:30 +0200 Subject: [PATCH 08/13] [TB] Only mark success when return 0. 
--- hw/ip/snitch_test/src/tb_bin.sv | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/hw/ip/snitch_test/src/tb_bin.sv b/hw/ip/snitch_test/src/tb_bin.sv index 1a947f07..cf81aaf0 100644 --- a/hw/ip/snitch_test/src/tb_bin.sv +++ b/hw/ip/snitch_test/src/tb_bin.sv @@ -46,11 +46,13 @@ module tb_bin; if (exit_code == 0) #200ns; end while (exit_code == 0); + exit_code >>= 1; - if (exit_code > 0) begin - $fatal("[FAILURE] Finished with exit code %2d", exit_code); - end else begin + + if (exit_code == 0) begin $info("[SUCCESS] Program finished successfully"); + end else begin + $fatal("[FAILURE] Finished with exit code %2d", exit_code); end $finish; end From f7e11f2f7765cee3dfc686ab0674f5b5c2b4bfe8 Mon Sep 17 00:00:00 2001 From: Diyou Shen Date: Mon, 27 Apr 2026 09:36:47 +0200 Subject: [PATCH 09/13] [SW] Fix a lint issue in gen_data.py --- sw/spatzBenchmarks/sa-gemv/script/gen_data.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sw/spatzBenchmarks/sa-gemv/script/gen_data.py b/sw/spatzBenchmarks/sa-gemv/script/gen_data.py index ecc91071..4d268d6c 100644 --- a/sw/spatzBenchmarks/sa-gemv/script/gen_data.py +++ b/sw/spatzBenchmarks/sa-gemv/script/gen_data.py @@ -93,7 +93,7 @@ def emit_gemv_layer(name="gemv", **kwargs): + ";\n" ) # Assuming you have variables like M (output size) and tot_nz (number of non-zeros) - layer_str += f'// Auto-generated buffers for Cache Mode\n' + layer_str += '// Auto-generated buffers for Cache Mode\n' layer_str += f'static uint16_t dense_idx_dram[{tot_nz}] __attribute__((section(".data"))) = {{0}};\n' layer_str += f'static {dtype} dense_vec_dram[{tot_nz}] __attribute__((section(".data"))) = {{0.0}};\n' layer_str += f'static {dtype} result_buf_dram[{m}] __attribute__((section(".data"))) = {{0.0}};\n' From 8df585e53351c33863623f761dfcc0ef37e354f9 Mon Sep 17 00:00:00 2001 From: Diyou Shen Date: Mon, 27 Apr 2026 09:37:27 +0200 Subject: [PATCH 10/13] [SW] Change comparison type to float to avoid illegal 
instruction in 32b configuration. --- sw/spatzBenchmarks/sa-gemv/main.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sw/spatzBenchmarks/sa-gemv/main.c b/sw/spatzBenchmarks/sa-gemv/main.c index df2eb8ba..ce877af2 100644 --- a/sw/spatzBenchmarks/sa-gemv/main.c +++ b/sw/spatzBenchmarks/sa-gemv/main.c @@ -53,7 +53,7 @@ static inline int fp_check(const T *a, const T *b) { const T threshold = 0.001; // Absolute value - double comp = (double)*a - (double)*b; + float comp = (float)*a - (float)*b; if (comp < 0) comp = -comp; @@ -199,7 +199,7 @@ int main() { } for (unsigned int j = 0; j < vec_chunk_len; ++j) { - if ((double) vec_ptr[j] != 0.0) { + if ((float) vec_ptr[j] != 0.0) { dense_vec[nz_count] = vec_ptr[j]; dense_idx[nz_count] = i * vec_chunk_len + j; nz_count++; From 7c50b182b0617678cbe05868756758ceab9b4d7f Mon Sep 17 00:00:00 2001 From: Diyou Shen Date: Mon, 27 Apr 2026 10:36:37 +0200 Subject: [PATCH 11/13] [Verilator] Adjust the verilator's DPI-C library to ensure the capture of failed tests. --- hw/ip/snitch_test/src/verilator_lib.cc | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/hw/ip/snitch_test/src/verilator_lib.cc b/hw/ip/snitch_test/src/verilator_lib.cc index b04a44db..583d54c2 100644 --- a/hw/ip/snitch_test/src/verilator_lib.cc +++ b/hw/ip/snitch_test/src/verilator_lib.cc @@ -32,10 +32,10 @@ int Sim::run() { target.init(sim_thread_main, this); int exit_code = htif_t::run(); - if (exit_code > 0) - fprintf(stderr, "[FAILURE] Finished with exit code %2d\n", exit_code); - else + if (exit_code == 0) fprintf(stderr, "[SUCCESS] Program finished successfully\n"); + else + fprintf(stderr, "[FAILURE] Finished with exit code %2d\n", exit_code); return exit_code; } From b95b644e0ce2a45af79cf805422c54269e0f2aa4 Mon Sep 17 00:00:00 2001 From: Diyou Shen Date: Mon, 27 Apr 2026 13:21:06 +0200 Subject: [PATCH 12/13] [SW] Lint fix and restore the deliberate failed test. 
--- hw/ip/snitch_test/src/tb_bin.sv | 2 +- sw/spatzBenchmarks/dp-fdotp/main.c | 2 +- sw/spatzBenchmarks/sa-gemv/kernel/sa-gemv.c | 6 +- sw/spatzBenchmarks/sa-gemv/main.c | 181 ++++++++++---------- 4 files changed, 100 insertions(+), 91 deletions(-) diff --git a/hw/ip/snitch_test/src/tb_bin.sv b/hw/ip/snitch_test/src/tb_bin.sv index cf81aaf0..423f9d9e 100644 --- a/hw/ip/snitch_test/src/tb_bin.sv +++ b/hw/ip/snitch_test/src/tb_bin.sv @@ -52,7 +52,7 @@ module tb_bin; if (exit_code == 0) begin $info("[SUCCESS] Program finished successfully"); end else begin - $fatal("[FAILURE] Finished with exit code %2d", exit_code); + $error("[FAILURE] Finished with exit code %2d", exit_code); end $finish; end diff --git a/sw/spatzBenchmarks/dp-fdotp/main.c b/sw/spatzBenchmarks/dp-fdotp/main.c index 0b4ac88b..f4a0524d 100644 --- a/sw/spatzBenchmarks/dp-fdotp/main.c +++ b/sw/spatzBenchmarks/dp-fdotp/main.c @@ -32,7 +32,7 @@ static inline int fp_check(const double a, const double b) { const double threshold = 0.00001; // Absolute value - double comp = a - 1; + double comp = a - b; if (comp < 0) comp = -comp; diff --git a/sw/spatzBenchmarks/sa-gemv/kernel/sa-gemv.c b/sw/spatzBenchmarks/sa-gemv/kernel/sa-gemv.c index 4e641041..733cd484 100644 --- a/sw/spatzBenchmarks/sa-gemv/kernel/sa-gemv.c +++ b/sw/spatzBenchmarks/sa-gemv/kernel/sa-gemv.c @@ -19,7 +19,7 @@ #include "sa-gemv.h" -void gemv_v64b_m4(double *a, double* b, double* c, int M, int M_core, int N) { +void gemv_v64b_m4(double *a, double *b, double *c, int M, int M_core, int N) { unsigned int vl, avl = M_core; double *a_, *a_start = a; double *c_ = c; @@ -66,7 +66,7 @@ void gemv_v64b_m4(double *a, double* b, double* c, int M, int M_core, int N) { } while (avl > 0); } -void gemv_v32b_m4(float *a, float* b, float* c, int M, int M_core, int N) { +void gemv_v32b_m4(float *a, float *b, float *c, int M, int M_core, int N) { unsigned int vl, avl = M_core; float *a_, *a_start = a; float *c_ = c; @@ -114,7 +114,7 @@ void gemv_v32b_m4(float 
*a, float* b, float* c, int M, int M_core, int N) { } while (avl > 0); } -void gemv_v16b_m4(__fp16 *a, __fp16* b, __fp16* c, int M, int M_core, int N) { +void gemv_v16b_m4(__fp16 *a, __fp16 *b, __fp16 *c, int M, int M_core, int N) { unsigned int vl, avl = M_core; __fp16 *a_, *a_start = a; __fp16 *c_ = c; diff --git a/sw/spatzBenchmarks/sa-gemv/main.c b/sw/spatzBenchmarks/sa-gemv/main.c index ce877af2..2875d52c 100644 --- a/sw/spatzBenchmarks/sa-gemv/main.c +++ b/sw/spatzBenchmarks/sa-gemv/main.c @@ -65,9 +65,9 @@ int main() { const unsigned int cid = snrt_cluster_core_idx(); // Reset timer - unsigned int timer = (unsigned int)-1; + unsigned int timer = (unsigned int)-1; unsigned int timer_best = (unsigned int)-1; - unsigned int timer_nz = (unsigned int)-1; + unsigned int timer_nz = (unsigned int)-1; const unsigned int m_core = gemv_l.M / num_cores; // Size (in KiB) of L1 SPM, used to calculate tiling window const unsigned int spm_size = 128; @@ -80,21 +80,23 @@ int main() { // 2. 2 chunks of matrix + densed vector (output) + densed idx // Sizes of each part we need - const uint32_t mat_size = sizeof(T) * gemv_l.M * tot_nz_dram; - const uint32_t row_size = sizeof(T) * gemv_l.M; - const uint32_t vec_size = sizeof(T) * gemv_l.N; + const uint32_t mat_size = sizeof(T) * gemv_l.M * tot_nz_dram; + const uint32_t row_size = sizeof(T) * gemv_l.M; + const uint32_t vec_size = sizeof(T) * gemv_l.N; const uint32_t dense_vec_size = sizeof(T) * tot_nz_dram; const uint32_t dense_idx_size = sizeof(uint16_t) * tot_nz_dram; - const uint32_t result_size = sizeof(T) * gemv_l.M; + const uint32_t result_size = sizeof(T) * gemv_l.M; // leave 8 KiB for Stack - const uint32_t l1_size = (spm_size - 8) * 1024; - const uint32_t fixed_alloc_size = dense_vec_size + dense_idx_size + result_size; + const uint32_t l1_size = (spm_size - 8) * 1024; + const uint32_t fixed_alloc_size = + dense_vec_size + dense_idx_size + result_size; // --- BOUNDS CHECK 1: Do the fixed arrays fit in L1? 
--- if (fixed_alloc_size >= l1_size) { if (cid == 0) { - PRINTF("FATAL: L1 Memory Overflow! Fixed arrays require %u bytes, but only %u bytes available.\n", + PRINTF("FATAL: L1 Memory Overflow! Fixed arrays require %u bytes, but " + "only %u bytes available.\n", fixed_alloc_size, l1_size); } snrt_cluster_hw_barrier(); @@ -109,7 +111,8 @@ int main() { // --- BOUNDS CHECK 2: Can we double buffer at least 1 row? --- if (num_row_mat < 1) { if (cid == 0) { - PRINTF("FATAL: L1 Memory Overflow! Cannot fit at least 2 rows for double buffering. " + PRINTF("FATAL: L1 Memory Overflow! Cannot fit at least 2 rows for double " + "buffering. " "Chunk space left: %u bytes, Row size: %u bytes.\n", l1_for_chunk, row_size); } @@ -119,7 +122,10 @@ int main() { // Always strictly split the available memory in half for double-buffering const uint32_t vec_chunk_size = l1_for_chunk / 2; - const uint32_t num_vec_chunk = (l1_for_chunk > vec_size) ? 1 : ((vec_size + vec_chunk_size - 1) / vec_chunk_size); + const uint32_t num_vec_chunk = + (l1_for_chunk > vec_size) + ? 
1 + : ((vec_size + vec_chunk_size - 1) / vec_chunk_size); // Recalculate exact chunk size based on whole rows const uint32_t mat_chunk_size = num_row_mat * row_size; @@ -128,14 +134,13 @@ int main() { const uint32_t num_mat_chunk = (tot_nz_dram + num_row_mat - 1) / num_row_mat; // Number of elements in each chunk - const uint32_t vec_chunk_len = vec_chunk_size / sizeof(T); - + const uint32_t vec_chunk_len = vec_chunk_size / sizeof(T); // Memory Allocation if (cid == 0) { - result = (T *)snrt_l1alloc(result_size); + result = (T *)snrt_l1alloc(result_size); dense_vec = (T *)snrt_l1alloc(dense_vec_size); - vec_buf0 = (T *)snrt_l1alloc(l1_for_chunk); + vec_buf0 = (T *)snrt_l1alloc(l1_for_chunk); dense_idx = (uint16_t *)snrt_l1alloc(dense_idx_size); // Offset by half of the size if needed by double buffering @@ -159,47 +164,45 @@ int main() { timer = benchmark_get_cycle(); // Calculate internal pointers - T *vec_ptr = vec_buf0; - T *vec_db_ptr = vec_buf1; + T *vec_ptr = vec_buf0; + T *vec_db_ptr = vec_buf1; // Task 1: Find out the non-zeros if (cid == 0) { - #ifdef DEBUG_NZ +#ifdef DEBUG_NZ PRINTF("NZ-Calc PreLD\n"); - PRINTF("DMA SRC:%p, TGT:%p, SIZE:%u\n", vec_ptr, gemv_vec_dram, vec_chunk_size); - #endif + PRINTF("DMA SRC:%p, TGT:%p, SIZE:%u\n", vec_ptr, gemv_vec_dram, + vec_chunk_size); +#endif snrt_dma_start_1d(vec_ptr, gemv_vec_dram, vec_chunk_size); snrt_dma_wait_all(); } uint32_t nz_count = 0; - if (cid == 0) { for (unsigned int i = 0; i < num_vec_chunk; ++i) { // Step 1.1: preload the next chunk if not the end // Make sure the previous load completes snrt_dma_wait_all(); // Double buffer to search the next non-zero - uint32_t next_bytes = (vec_size - (i + 1) * vec_chunk_size < vec_chunk_size) - ? (vec_size - (i + 1) * vec_chunk_size) - : vec_chunk_size; + uint32_t next_bytes = + (vec_size - (i + 1) * vec_chunk_size < vec_chunk_size) + ? 
(vec_size - (i + 1) * vec_chunk_size) + : vec_chunk_size; if (i < num_vec_chunk - 1) { - #ifdef DEBUG_NZ +#ifdef DEBUG_NZ PRINTF("NZ-Calc DB Iter%u\n", i); PRINTF("DMA SRC:%p, TGT:%p, SIZE:%u\n", - gemv_vec_dram + (i + 1) * vec_chunk_len, - vec_db_ptr, - next_bytes); - #endif - snrt_dma_start_1d(vec_db_ptr, - gemv_vec_dram + (i + 1) * vec_chunk_len, + gemv_vec_dram + (i + 1) * vec_chunk_len, vec_db_ptr, next_bytes); +#endif + snrt_dma_start_1d(vec_db_ptr, gemv_vec_dram + (i + 1) * vec_chunk_len, next_bytes); // Use exact bytes } for (unsigned int j = 0; j < vec_chunk_len; ++j) { - if ((float) vec_ptr[j] != 0.0) { + if ((float)vec_ptr[j] != 0.0) { dense_vec[nz_count] = vec_ptr[j]; dense_idx[nz_count] = i * vec_chunk_len + j; nz_count++; @@ -214,10 +217,10 @@ int main() { if (i % 2 == 0) { // pointer exchange - vec_ptr = vec_buf1; + vec_ptr = vec_buf1; vec_db_ptr = vec_buf0; } else { - vec_ptr = vec_buf0; + vec_ptr = vec_buf0; vec_db_ptr = vec_buf1; } } @@ -225,49 +228,48 @@ int main() { snrt_cluster_hw_barrier(); - #ifdef DEBUG_NZ +#ifdef DEBUG_NZ if (cid == 0) PRINTF("Non-Zero Calc Complete\n"); - #endif +#endif - #ifdef DEBUG_NZ_IDX +#ifdef DEBUG_NZ_IDX if (cid == 0) { for (uint32_t i = 0; i < tot_nz_dram; i++) { PRINTF("IDX[%u]=%u\n", i, dense_idx[i]); } } - #endif +#endif timer_nz = benchmark_get_cycle() - timer_nz; timer = benchmark_get_cycle(); - // Task 2: GEMV calculation // Calculate internal pointers - T *mat_ptr = mat_buf0; - T *mat_db_ptr = mat_buf1; + T *mat_ptr = mat_buf0; + T *mat_db_ptr = mat_buf1; T *result_core = result + m_core * cid; uint16_t *idx_ptr = dense_idx; // Corrected pointer type if (cid == 0) { // Determine how many rows are actually active for this very first chunk - uint32_t active_rows = (tot_nz_dram < num_row_mat) ? tot_nz_dram : num_row_mat; + uint32_t active_rows = + (tot_nz_dram < num_row_mat) ? 
tot_nz_dram : num_row_mat; - #ifdef DEBUG_GEMV_PreLD +#ifdef DEBUG_GEMV_PreLD PRINTF("GEMV PreLD\n"); PRINTF("Active Rows:%u\n", active_rows); - #endif +#endif for (unsigned int i = 0; i < active_rows; i++) { - #ifdef DEBUG - PRINTF("Row:%u, SRC:%p, TGT:%p, SIZE:%u\n", - i, - gemv_mat_dram + (size_t)(*idx_ptr) * gemv_l.M, - mat_ptr + i * gemv_l.M, - row_size); - #endif +#ifdef DEBUG + PRINTF("Row:%u, SRC:%p, TGT:%p, SIZE:%u\n", i, + gemv_mat_dram + (size_t)(*idx_ptr) * gemv_l.M, + mat_ptr + i * gemv_l.M, row_size); +#endif snrt_dma_start_1d(mat_ptr + i * gemv_l.M, // Pack linearly into L1 - gemv_mat_dram + (size_t)(*idx_ptr) * gemv_l.M, // Source from DRAM + gemv_mat_dram + + (size_t)(*idx_ptr) * gemv_l.M, // Source from DRAM row_size); idx_ptr++; } @@ -275,15 +277,15 @@ int main() { snrt_cluster_hw_barrier(); - #ifdef DEBUG_GEMV_PreLD +#ifdef DEBUG_GEMV_PreLD if (cid == 0) PRINTF("GEMV PreLD Complete\n"); - #endif +#endif - #ifdef DEBUG_GEMV_DB +#ifdef DEBUG_GEMV_DB if (cid == 0) PRINTF("Tot Chunks %u\n", num_mat_chunk); - #endif +#endif for (unsigned int chunk_idx = 0; chunk_idx < num_mat_chunk; chunk_idx++) { // Wait for the CURRENT chunk to finish loading @@ -297,27 +299,24 @@ int main() { uint32_t next_active_rows = 0; if (next_chunk_start < tot_nz_dram) { - next_active_rows = (tot_nz_dram - next_chunk_start < num_row_mat) - ? (tot_nz_dram - next_chunk_start) - : num_row_mat; + next_active_rows = (tot_nz_dram - next_chunk_start < num_row_mat) + ? 
(tot_nz_dram - next_chunk_start) + : num_row_mat; } - #ifdef DEBUG_GEMV_DB +#ifdef DEBUG_GEMV_DB if (cid == 0) PRINTF("Chunk%u, DB Rows%u\n", chunk_idx, next_active_rows); - #endif +#endif // Load NEXT chunk in the background if (cid == 0 && next_active_rows > 0) { for (unsigned int i = 0; i < next_active_rows; i++) { - #ifdef DEBUG_GEMV_DB - PRINTF("Ptr:%p, Row:%u, SRC:%p, TGT:%p, SIZE:%u\n", - idx_ptr, - i, - gemv_mat_dram + (size_t)(*idx_ptr) * gemv_l.M, - mat_db_ptr + i * gemv_l.M, - row_size); - #endif +#ifdef DEBUG_GEMV_DB + PRINTF("Ptr:%p, Row:%u, SRC:%p, TGT:%p, SIZE:%u\n", idx_ptr, i, + gemv_mat_dram + (size_t)(*idx_ptr) * gemv_l.M, + mat_db_ptr + i * gemv_l.M, row_size); +#endif snrt_dma_start_1d(mat_db_ptr + i * gemv_l.M, gemv_mat_dram + (size_t)(*idx_ptr) * gemv_l.M, row_size); @@ -326,28 +325,32 @@ int main() { } // Calculate active rows for the CURRENT compute phase - uint32_t curr_active_rows = (tot_nz_dram - chunk_idx * num_row_mat < num_row_mat) - ? (tot_nz_dram - chunk_idx * num_row_mat) - : num_row_mat; + uint32_t curr_active_rows = + (tot_nz_dram - chunk_idx * num_row_mat < num_row_mat) + ? 
(tot_nz_dram - chunk_idx * num_row_mat) + : num_row_mat; // Calculate GEMV on the current chunk T *current_dense_vec = dense_vec + chunk_idx * num_row_mat; - // Offset the matrix pointer by m_core * cid so each core reads its correct rows + // Offset the matrix pointer by m_core * cid so each core reads its correct + // rows T *mat_core_ptr = mat_ptr + m_core * cid; - #if (PREC == 64) - gemv_v64b_m4(mat_core_ptr, current_dense_vec, result_core, gemv_l.M, m_core, curr_active_rows); - #elif (PREC == 32) - gemv_v32b_m4(mat_core_ptr, current_dense_vec, result_core, gemv_l.M, m_core, curr_active_rows); - #else - gemv_v16b_m4(mat_core_ptr, current_dense_vec, result_core, gemv_l.M, m_core, curr_active_rows); - #endif - +#if (PREC == 64) + gemv_v64b_m4(mat_core_ptr, current_dense_vec, result_core, gemv_l.M, m_core, + curr_active_rows); +#elif (PREC == 32) + gemv_v32b_m4(mat_core_ptr, current_dense_vec, result_core, gemv_l.M, m_core, + curr_active_rows); +#else + gemv_v16b_m4(mat_core_ptr, current_dense_vec, result_core, gemv_l.M, m_core, + curr_active_rows); +#endif // Swap pointers for the next iteration - T *temp = mat_ptr; - mat_ptr = mat_db_ptr; + T *temp = mat_ptr; + mat_ptr = mat_db_ptr; mat_db_ptr = temp; } @@ -363,14 +366,19 @@ int main() { // Checking for (unsigned int i = 0; i < gemv_l.M; i++) { if (fp_check(&result[i], &gemv_result[i])) { - PRINTF("Error: ID: %i Result = %f, Golden = %f\n", i, result[i], gemv_result[i]); +#if (PREC == 64) + PRINTF("Error: ID: %i Result = %f, Golden = %f\n", i, result[i], + gemv_result[i]); +#else + PRINTF("Error: ID: %i Result = %x, Golden = %x\n", i, + *(int *)&result[i], *(int *)&gemv_result[i]); +#endif } } } snrt_cluster_hw_barrier(); - // Check and display results // Assume 2 core 4 fpu configuration if (cid == 0) { @@ -380,7 +388,8 @@ int main() { long unsigned int utilization = performance / (2 * num_cores * 4 * 8 / sizeof(T)); - PRINTF("\n----- (%d x %d) x (%d x 1) sa-gemv -----\n", gemv_l.M, gemv_l.N, tot_nz_dram); + 
PRINTF("\n----- (%d x %d) x (%d x 1) sa-gemv -----\n", gemv_l.M, gemv_l.N, + tot_nz_dram); PRINTF("The NZ finding takes %u cycles.\n", timer_nz); PRINTF("The GEMV execution took %u cycles.\n", timer); PRINTF("The performance is %ld OP/1000cycle (%ld%%o utilization).\n", From fb610d7ff05e60da6e7e38541fcbcfc33a66381c Mon Sep 17 00:00:00 2001 From: Diyou Shen Date: Mon, 27 Apr 2026 14:55:41 +0200 Subject: [PATCH 13/13] [SW] Keep lint fixing. --- sw/spatzBenchmarks/sa-gemv/data/layer.h | 6 +++--- sw/spatzBenchmarks/sa-gemv/kernel/sa-gemv.c | 4 ++-- sw/spatzBenchmarks/sa-gemv/kernel/sa-gemv.h | 6 +++--- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/sw/spatzBenchmarks/sa-gemv/data/layer.h b/sw/spatzBenchmarks/sa-gemv/data/layer.h index 0a07ad53..62da8ce6 100644 --- a/sw/spatzBenchmarks/sa-gemv/data/layer.h +++ b/sw/spatzBenchmarks/sa-gemv/data/layer.h @@ -9,8 +9,8 @@ typedef enum { FP64 = 8, FP32 = 4, FP16 = 2, FP8 = 1 } precision_t; typedef struct gemv_layer_struct { - uint32_t M; - uint32_t N; + uint32_t M; + uint32_t N; - precision_t dtype; + precision_t dtype; } gemv_layer; diff --git a/sw/spatzBenchmarks/sa-gemv/kernel/sa-gemv.c b/sw/spatzBenchmarks/sa-gemv/kernel/sa-gemv.c index 733cd484..31973656 100644 --- a/sw/spatzBenchmarks/sa-gemv/kernel/sa-gemv.c +++ b/sw/spatzBenchmarks/sa-gemv/kernel/sa-gemv.c @@ -102,11 +102,11 @@ void gemv_v32b_m4(float *a, float *b, float *c, int M, int M_core, int N) { asm volatile("vle32.v v16, (%0)" ::"r"(c_)); asm volatile("vfadd.vv v4, v4, v16"); - asm volatile("vse32.v v12, (%0)" ::"r"(c_)); // wait, mapping v4 to v12? No, use v4. 
+ asm volatile("vse32.v v12, (%0)" ::"r"(c_)); // Correction: // asm volatile("vse32.v v4, (%0)" ::"r"(c_)); // Let's rewrite this block safely: - asm volatile("vse32.v v4, (%0)" ::"r"(c_)); // Fixed register writeback + asm volatile("vse32.v v4, (%0)" ::"r"(c_)); avl -= vl; c_ += vl; diff --git a/sw/spatzBenchmarks/sa-gemv/kernel/sa-gemv.h b/sw/spatzBenchmarks/sa-gemv/kernel/sa-gemv.h index 1bf377e1..1b172260 100644 --- a/sw/spatzBenchmarks/sa-gemv/kernel/sa-gemv.h +++ b/sw/spatzBenchmarks/sa-gemv/kernel/sa-gemv.h @@ -19,8 +19,8 @@ #ifndef _GEMV_H #define _GEMV_H -void gemv_v64b_m4(double *a, double* b, double* c, int M, int M_core, int N); -void gemv_v32b_m4(float *a, float* b, float* c, int M, int M_core, int N); -void gemv_v16b_m4(__fp16 *a, __fp16* b, __fp16* c, int M, int M_core, int N); +void gemv_v64b_m4(double *a, double *b, double *c, int M, int M_core, int N); +void gemv_v32b_m4(float *a, float *b, float *c, int M, int M_core, int N); +void gemv_v16b_m4(__fp16 *a, __fp16 *b, __fp16 *c, int M, int M_core, int N); #endif