diff --git a/hw/ip/snitch_test/src/tb_bin.sv b/hw/ip/snitch_test/src/tb_bin.sv index 469a3ebf..423f9d9e 100644 --- a/hw/ip/snitch_test/src/tb_bin.sv +++ b/hw/ip/snitch_test/src/tb_bin.sv @@ -46,11 +46,13 @@ module tb_bin; if (exit_code == 0) #200ns; end while (exit_code == 0); + exit_code >>= 1; - if (exit_code > 0) begin - $error("[FAILURE] Finished with exit code %2d", exit_code); - end else begin + + if (exit_code == 0) begin $info("[SUCCESS] Program finished successfully"); + end else begin + $error("[FAILURE] Finished with exit code %2d", exit_code); end $finish; end diff --git a/hw/ip/snitch_test/src/verilator_lib.cc b/hw/ip/snitch_test/src/verilator_lib.cc index b04a44db..583d54c2 100644 --- a/hw/ip/snitch_test/src/verilator_lib.cc +++ b/hw/ip/snitch_test/src/verilator_lib.cc @@ -32,10 +32,10 @@ int Sim::run() { target.init(sim_thread_main, this); int exit_code = htif_t::run(); - if (exit_code > 0) - fprintf(stderr, "[FAILURE] Finished with exit code %2d\n", exit_code); - else + if (exit_code == 0) fprintf(stderr, "[SUCCESS] Program finished successfully\n"); + else + fprintf(stderr, "[FAILURE] Finished with exit code %2d\n", exit_code); return exit_code; } diff --git a/hw/system/spatz_cluster/Makefile b/hw/system/spatz_cluster/Makefile index 3fa7bd52..eb87cc4e 100644 --- a/hw/system/spatz_cluster/Makefile +++ b/hw/system/spatz_cluster/Makefile @@ -133,6 +133,20 @@ spatz.gendata: fi \ done +.PHONY: spatz.cleandata +spatz.cleandata: + @for benchmark_dir in $(ROOT)/sw/spatzBenchmarks/*/; do \ + data_dir="$$benchmark_dir/data"; \ + if [ -d "$$data_dir" ]; then \ + data_count=$$(find "$$data_dir" -name 'data*.h' -type f 2>/dev/null | wc -l); \ + if [ "$$data_count" -gt 0 ]; then \ + echo "Cleaning $$data_count data file(s) from $$data_dir"; \ + rm -f "$$data_dir"/data*.h; \ + fi \ + fi \ + done + @echo "All benchmark data cleaned." 
+ ############# # Verilator # ############# diff --git a/sw/spatzBenchmarks/CMakeLists.txt b/sw/spatzBenchmarks/CMakeLists.txt index 70f72e5c..7c7d5170 100644 --- a/sw/spatzBenchmarks/CMakeLists.txt +++ b/sw/spatzBenchmarks/CMakeLists.txt @@ -110,6 +110,7 @@ if (ELEN EQUAL 64) add_spatz_test_threeParam(dp-fmatmul dp-fmatmul/main.c 64 64 64 ) add_spatz_test_twoParam_type(dp-gemv gemv/main.c 64 128 64) + add_spatz_test_threeParam_type(dp-sa-gemv sa-gemv/main.c 256 128 16 64) add_spatz_test_oneParam(dp-faxpy dp-faxpy/main.c 256) add_spatz_test_oneParam(dp-faxpy dp-faxpy/main.c 1024) @@ -121,6 +122,7 @@ if (ELEN EQUAL 64) add_spatz_test_threeParam(dp-fconv2d dp-fconv2d/main.c 64 64 7) add_spatz_test_twoParam(dp-fft dp-fft/main.c 128 2) + endif() add_spatz_test_threeParam(sp-fmatmul sp-fmatmul/main.c 64 64 64 ) @@ -150,6 +152,7 @@ add_spatz_test_threeParam(sdotp-bp-fmatmul sdotp-bp-fmatmul/main.c 128 256 128) add_spatz_test_twoParam_type(sp-gemv gemv/main.c 128 128 32) add_spatz_test_twoParam_type(hp-gemv gemv/main.c 256 128 16) +add_spatz_test_threeParam_type(hp-sa-gemv sa-gemv/main.c 128 4096 512 16) add_spatz_test_twoParam(sp-fft sp-fft/main.c 256 2) add_spatz_test_twoParam(sp-fft sp-fft/main.c 512 2) diff --git a/sw/spatzBenchmarks/gemv/script/gen_data.py b/sw/spatzBenchmarks/gemv/script/gen_data.py index 177733ef..532d725b 100644 --- a/sw/spatzBenchmarks/gemv/script/gen_data.py +++ b/sw/spatzBenchmarks/gemv/script/gen_data.py @@ -23,7 +23,8 @@ def array_to_cstr(a, fmt=float): if isinstance(a, np.ndarray): a = list(a.flat) elif isinstance(a, torch.Tensor): - a = a.numpy().flatten().tolist() + # Universal Fix: Cast to float32 before sending to NumPy/C-string to avoid formatting errors + a = a.float().numpy().flatten().tolist() else: a = list(a) for i, el in enumerate(a): @@ -121,9 +122,11 @@ def rand_data_generator(shape, prec, alt=False): return torch.randn(shape, requires_grad=False, dtype=torch.float32), {} elif prec == 16: if alt: - return torch.randn(shape, 
requires_grad=False, dtype=torch.bfloat16), {} + # Universal Fix: Generate FP32, cast to BF16 + return torch.randn(shape, requires_grad=False, dtype=torch.float32).to(torch.bfloat16), {} else: - return torch.randn(shape, requires_grad=False, dtype=torch.float16), {} + # Universal Fix: Generate FP32, cast to FP16 + return torch.randn(shape, requires_grad=False, dtype=torch.float32).to(torch.float16), {} elif prec == 8: sign = torch.randint( 0, 2, shape, requires_grad=False, dtype=torch.uint8 @@ -142,15 +145,8 @@ def rand_data_generator(shape, prec, alt=False): def gemv(a, b): - # PyTorch doesn't support matmul for float16 on CPU, so convert to float32 - original_dtype = a.dtype - if original_dtype == torch.float16: - a = a.float() - b = b.float() - result = torch.matmul(a, b) - if original_dtype == torch.float16: - result = result.half() - return result + # Universal Fix: One-liner upcast and downcast + return torch.matmul(a.float(), b.float()).to(a.dtype) def main(): diff --git a/sw/spatzBenchmarks/hp-fmatmul/script/gen_data.py b/sw/spatzBenchmarks/hp-fmatmul/script/gen_data.py index 0cd18994..4da0e20d 100755 --- a/sw/spatzBenchmarks/hp-fmatmul/script/gen_data.py +++ b/sw/spatzBenchmarks/hp-fmatmul/script/gen_data.py @@ -26,6 +26,9 @@ def array_to_cstr(a, fmt=float): if isinstance(a, np.ndarray): a = a.flat if isinstance(a, torch.Tensor): + # Fallback to float32 if numpy struggles with bfloat16, otherwise leave alone + if a.dtype == torch.bfloat16: + a = a.float() a = a.cpu().numpy().flat for el in a: out += "{}, ".format(el) @@ -96,7 +99,7 @@ def emit_conv2d_layer(name="conv2d", **kwargs): layer_str += f'static double {name}_result[{oh}][{ow}][{co}] __attribute__((section(".data")));\n\n' layer_str += ( f"static double {name}_checksum[{oh}][{ow}] = " - + array_to_cstr(torch.sum(ofmap, dim=-1)) + + array_to_cstr(torch.sum(ofmap.float(), dim=-1)) + ";\n\n\n" ) layer_str += ( @@ -215,7 +218,7 @@ def emit_batchnorm_layer(name="batchnorm", **kwargs): layer_str += 
f'static double {name}_result[{oh}][{ow}][{co}] __attribute__((section(".data")));\n\n' layer_str += ( f"static double {name}_checksum[{oh}][{ow}] = " - + array_to_cstr(torch.sum(ofmap, dim=-1)) + + array_to_cstr(torch.sum(ofmap.float(), dim=-1)) + ";\n\n\n" ) layer_str += ( @@ -263,7 +266,7 @@ def emit_maxpool_layer(name="maxpool", **kwargs): layer_str += f'static double {name}_result[{oh}][{ow}][{co}] __attribute__((section(".data")));\n\n' layer_str += ( f"static double {name}_checksum[{oh}][{ow}] = " - + array_to_cstr(torch.sum(ofmap, dim=-1)) + + array_to_cstr(torch.sum(ofmap.float(), dim=-1)) + ";\n\n\n" ) layer_str += ( @@ -363,14 +366,9 @@ def rand_data_generator(shape, prec, alt=False): return torch.randn(shape, requires_grad=False, dtype=torch.float32), {} elif prec == 16: if alt: - return torch.randn(shape, requires_grad=False, dtype=torch.bfloat16), {} + return torch.randn(shape, requires_grad=False, dtype=torch.float32).to(torch.bfloat16), {} else: - return ( - torch.randn( - shape, requires_grad=False, dtype=torch.float16, device=device - ), - {}, - ) + return torch.randn(shape, requires_grad=False, dtype=torch.float32, device=device).to(torch.float16), {} elif prec == 8: sign = torch.randint( 0, 2, shape, requires_grad=False, dtype=torch.uint8 @@ -389,6 +387,10 @@ def rand_data_generator(shape, prec, alt=False): def conv2d(ifmap, weights, padding=1, stride=1): + orig_dtype = ifmap.dtype + if orig_dtype in [torch.float16, torch.bfloat16]: + ifmap, weights = ifmap.float(), weights.float() + n, ci, ih, iw = ifmap.shape co, _, fh, fw = weights.shape @@ -399,34 +401,48 @@ def conv2d(ifmap, weights, padding=1, stride=1): ) ofmap = conv2d(ifmap) - return ofmap + return ofmap.to(orig_dtype) def max_pooling(ifmap, kernel): + orig_dtype = ifmap.dtype + if orig_dtype in [torch.float16, torch.bfloat16]: + ifmap = ifmap.float() + n, ci, ih, iw = ifmap.shape max_pool = nn.MaxPool2d(kernel_size=kernel) ofmap = max_pool(ifmap) - return ofmap + return 
ofmap.to(orig_dtype) def batchnorm(ifmap): + orig_dtype = ifmap.dtype + if orig_dtype in [torch.float16, torch.bfloat16]: + ifmap = ifmap.float() + n, ci, ih, iw = ifmap.shape bn = torch.nn.BatchNorm2d(ci) bn.weight.requires_grad = False bn.bias.requires_grad = False - running_mean = torch.randn_like(bn.running_mean, requires_grad=False) - running_var = torch.rand_like(bn.running_var, requires_grad=False) + + running_mean = torch.randn(bn.running_mean.shape, requires_grad=False, dtype=torch.float32).to(bn.running_mean.dtype) + running_var = torch.rand(bn.running_var.shape, requires_grad=False, dtype=torch.float32).to(bn.running_var.dtype) + gamma = bn.weight / torch.sqrt(running_var + bn.eps) beta = bn.bias - running_mean * bn.weight / torch.sqrt(running_var + bn.eps) ofmap = ifmap * gamma.unsqueeze(-1).unsqueeze(-1) + beta.unsqueeze(-1).unsqueeze(-1) - return ofmap, gamma, beta + return ofmap.to(orig_dtype), gamma.to(orig_dtype), beta.to(orig_dtype) def fused_conv( ifmap, weights, bn_k, bn_l, padding, stride, bn, relu, accumulate, depthwise ): + orig_dtype = ifmap.dtype + is_half = orig_dtype in [torch.float16, torch.bfloat16] + if is_half: + ifmap, weights, bn_k, bn_l = ifmap.float(), weights.float(), bn_k.float(), bn_l.float() ih, iw, ci = ifmap.shape if not depthwise: @@ -459,7 +475,7 @@ def fused_conv( co, ) if accumulate: - ofmap_before = torch.randn_like(ofmap, requires_grad=False) + ofmap_before = torch.randn(ofmap.shape, requires_grad=False, dtype=torch.float32).to(ofmap.dtype) else: ofmap_before = torch.zeros_like(ofmap, requires_grad=False) @@ -499,6 +515,11 @@ def fused_conv( if relu: ofmap = torch.nn.functional.relu(ofmap) + if is_half: + ofmap = ofmap.to(orig_dtype) + ofmap_before = ofmap_before.to(orig_dtype) + ifmap_padded = ifmap_padded.to(orig_dtype) + return ofmap, ofmap_before, ifmap_padded @@ -538,16 +559,16 @@ def main(): param["input_dim"]["height"], param["input_dim"]["width"], requires_grad=False, - dtype=dtype, - ) + dtype=torch.float32, + 
).to(dtype) weights = torch.randn( param["channels"]["out"], param["channels"]["in"], param["filter"]["height"], param["filter"]["width"], requires_grad=False, - dtype=dtype, - ) + dtype=torch.float32, + ).to(dtype) ofmap = conv2d( ifmap, @@ -568,7 +589,10 @@ def main(): mat_B, bits_B = rand_data_generator((param["K"], param["N"]), param["prec"]) mat_C, bits_C = rand_data_generator((param["M"], param["N"]), param["prec"]) - result = torch.matmul(mat_A, mat_B) + if mat_A.dtype in [torch.float16, torch.bfloat16]: + result = torch.matmul(mat_A.float(), mat_B.float()).to(mat_A.dtype) + else: + result = torch.matmul(mat_A, mat_B) if param["transpose_A"]: mat_A = mat_A.T @@ -602,8 +626,8 @@ def main(): param["input_dim"]["height"], param["input_dim"]["width"], requires_grad=False, - dtype=dtype, - ) + dtype=torch.float32, + ).to(dtype) ofmap, gamma, beta = batchnorm(ifmap) @@ -621,8 +645,8 @@ def main(): param["input_dim"]["height"], param["input_dim"]["width"], requires_grad=False, - dtype=dtype, - ) + dtype=torch.float32, + ).to(dtype) ofmap = max_pooling(ifmap, param["kernel_size"]) @@ -639,8 +663,8 @@ def main(): param["dim_in_x"], param["ch_in"], requires_grad=False, - dtype=dtype, - ) + dtype=torch.float32, + ).to(dtype) if not param["depthwise"]: kernel = torch.randn( param["ch_out"], @@ -648,31 +672,31 @@ def main(): param["dim_kernel_x"], param["ch_in"], requires_grad=False, - dtype=dtype, - ) + dtype=torch.float32, + ).to(dtype) else: kernel = torch.randn( param["dim_kernel_y"], param["dim_kernel_x"], param["ch_in"], requires_grad=False, - dtype=dtype, - ) + dtype=torch.float32, + ).to(dtype) - bn_k = torch.randn(param["ch_out"], requires_grad=False) - bn_l = torch.randn(param["ch_out"], requires_grad=False) + bn_k = torch.randn(param["ch_out"], requires_grad=False, dtype=torch.float32).to(dtype) + bn_l = torch.randn(param["ch_out"], requires_grad=False, dtype=torch.float32).to(dtype) ofmap, ofmap_before, ifmap_padded = fused_conv( ifmap, kernel, bn_k, bn_l, - 
param["padding"], - param["stride"], - param["flags"]["flag_batch_norm"], - param["flags"]["flag_relu"], - not param["flags"]["flag_y_accumulate_start"], - param["depthwise"], + padding=param["padding"], + stride=param["stride"], + bn=param["flags"]["flag_batch_norm"], + relu=param["flags"]["flag_relu"], + accumulate=not param["flags"]["flag_y_accumulate_start"], + depthwise=param["depthwise"], ) if param["chw_layer"]: diff --git a/sw/spatzBenchmarks/sa-gemv/data/layer.h b/sw/spatzBenchmarks/sa-gemv/data/layer.h new file mode 100644 index 00000000..62da8ce6 --- /dev/null +++ b/sw/spatzBenchmarks/sa-gemv/data/layer.h @@ -0,0 +1,16 @@ +// Copyright 2025 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include + +typedef enum { FP64 = 8, FP32 = 4, FP16 = 2, FP8 = 1 } precision_t; + +typedef struct gemv_layer_struct { + uint32_t M; + uint32_t N; + + precision_t dtype; +} gemv_layer; diff --git a/sw/spatzBenchmarks/sa-gemv/kernel/sa-gemv.c b/sw/spatzBenchmarks/sa-gemv/kernel/sa-gemv.c new file mode 100644 index 00000000..31973656 --- /dev/null +++ b/sw/spatzBenchmarks/sa-gemv/kernel/sa-gemv.c @@ -0,0 +1,165 @@ +// Copyright 2025 ETH Zurich and University of Bologna. +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +// Author: Navaneeth Kunhi Purayil, ETH Zurich +// Author: Diyou Shen, ETH Zurich + +#include "sa-gemv.h" + +void gemv_v64b_m4(double *a, double *b, double *c, int M, int M_core, int N) { + unsigned int vl, avl = M_core; + double *a_, *a_start = a; + double *c_ = c; + + do { + a_ = a_start; + double *b_ = b; + asm volatile("vsetvli %0, %1, e64, m4, ta, ma" : "=r"(vl) : "r"(avl)); + + // 1. CLEAR ACCUMULATORS for every new vl block (0 encodes to +0.0 float) + asm volatile("vmv.v.i v4, 0"); + asm volatile("vmv.v.i v12, 0"); + + int col = 0; + for (; col < N - 1; col += 2) { + asm volatile("vle64.v v0, (%0)" ::"r"(a_)); + a_ += M; + asm volatile("vfmacc.vf v4, %0, v0" ::"f"(*b_)); + b_++; + + asm volatile("vle64.v v8, (%0)" ::"r"(a_)); + a_ += M; + asm volatile("vfmacc.vf v12, %0, v8" ::"f"(*b_)); + b_++; + } + + // 2. HANDLE ODD N BOUNDARY + if (col < N) { + asm volatile("vle64.v v0, (%0)" ::"r"(a_)); + a_ += M; + asm volatile("vfmacc.vf v4, %0, v0" ::"f"(*b_)); + } + + asm volatile("vfadd.vv v4, v4, v12"); + + // 3. 
ACCUMULATE INTO MEMORY C (Load -> Add -> Store) + asm volatile("vle64.v v16, (%0)" ::"r"(c_)); + asm volatile("vfadd.vv v4, v4, v16"); + asm volatile("vse64.v v4, (%0)" ::"r"(c_)); + + avl -= vl; + c_ += vl; + a_start += vl; + } while (avl > 0); +} + +void gemv_v32b_m4(float *a, float *b, float *c, int M, int M_core, int N) { + unsigned int vl, avl = M_core; + float *a_, *a_start = a; + float *c_ = c; + + do { + a_ = a_start; + float *b_ = b; + asm volatile("vsetvli %0, %1, e32, m4, ta, ma" : "=r"(vl) : "r"(avl)); + + asm volatile("vmv.v.i v4, 0"); + asm volatile("vmv.v.i v12, 0"); + + int col = 0; + for (; col < N - 1; col += 2) { + asm volatile("vle32.v v0, (%0)" ::"r"(a_)); + a_ += M; + asm volatile("vfmacc.vf v4, %0, v0" ::"f"(*b_)); + b_++; + + asm volatile("vle32.v v8, (%0)" ::"r"(a_)); + a_ += M; + asm volatile("vfmacc.vf v12, %0, v8" ::"f"(*b_)); + b_++; + } + + if (col < N) { + asm volatile("vle32.v v0, (%0)" ::"r"(a_)); + a_ += M; + asm volatile("vfmacc.vf v4, %0, v0" ::"f"(*b_)); + } + + asm volatile("vfadd.vv v4, v4, v12"); + + asm volatile("vle32.v v16, (%0)" ::"r"(c_)); + asm volatile("vfadd.vv v4, v4, v16"); + // Store the combined accumulator back to C. + // NOTE(review): an earlier draft stored v12 here by mistake; after the two + // vfadd.vv ops above, v4 holds v4+v12+v16, so v4 is the correct source + // register for the store. + asm volatile("vse32.v v4, (%0)" ::"r"(c_)); + + avl -= vl; + c_ += vl; + a_start += vl; + } while (avl > 0); +} + +void gemv_v16b_m4(__fp16 *a, __fp16 *b, __fp16 *c, int M, int M_core, int N) { + unsigned int vl, avl = M_core; + __fp16 *a_, *a_start = a; + __fp16 *c_ = c; + + do { + a_ = a_start; + __fp16 *b_ = b; + asm volatile("vsetvli %0, %1, e16, m4, ta, ma" : "=r"(vl) : "r"(avl)); + + asm volatile("vmv.v.i v4, 0"); + asm volatile("vmv.v.i v12, 0"); + + int col = 0; + for (; col < N - 1; col += 2) { + asm volatile("vle16.v v0, (%0)" ::"r"(a_)); + a_ += M; + asm volatile("vle16.v v8, (%0)" ::"r"(a_)); + a_ += M; + + float t0, t1; + asm volatile("flh %[t], 0(%[b])" : [t] "=f"(t0) : [b] 
"r"(b_)); + asm volatile("vfmacc.vf v4, %0, v0" ::"f"(t0)); + b_++; + + asm volatile("flh %[t], 0(%[b])" : [t] "=f"(t1) : [b] "r"(b_)); + asm volatile("vfmacc.vf v12, %0, v8" ::"f"(t1)); + b_++; + } + + if (col < N) { + asm volatile("vle16.v v0, (%0)" ::"r"(a_)); + a_ += M; + float t0; + asm volatile("flh %[t], 0(%[b])" : [t] "=f"(t0) : [b] "r"(b_)); + asm volatile("vfmacc.vf v4, %0, v0" ::"f"(t0)); + } + + asm volatile("vfadd.vv v4, v4, v12"); + + asm volatile("vle16.v v16, (%0)" ::"r"(c_)); + asm volatile("vfadd.vv v4, v4, v16"); + asm volatile("vse16.v v4, (%0)" ::"r"(c_)); + + avl -= vl; + c_ += vl; + a_start += vl; + } while (avl > 0); +} diff --git a/sw/spatzBenchmarks/sa-gemv/kernel/sa-gemv.h b/sw/spatzBenchmarks/sa-gemv/kernel/sa-gemv.h new file mode 100644 index 00000000..1b172260 --- /dev/null +++ b/sw/spatzBenchmarks/sa-gemv/kernel/sa-gemv.h @@ -0,0 +1,26 @@ +// Copyright 2025 ETH Zurich and University of Bologna. +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +// Author: Navaneeth Kunhi Purayil, ETH Zurich + +#ifndef _GEMV_H +#define _GEMV_H + +void gemv_v64b_m4(double *a, double *b, double *c, int M, int M_core, int N); +void gemv_v32b_m4(float *a, float *b, float *c, int M, int M_core, int N); +void gemv_v16b_m4(__fp16 *a, __fp16 *b, __fp16 *c, int M, int M_core, int N); + +#endif diff --git a/sw/spatzBenchmarks/sa-gemv/main.c b/sw/spatzBenchmarks/sa-gemv/main.c new file mode 100644 index 00000000..2875d52c --- /dev/null +++ b/sw/spatzBenchmarks/sa-gemv/main.c @@ -0,0 +1,402 @@ +// Copyright 2025 ETH Zurich and University of Bologna. +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +// Author: Diyou Shen, ETH Zurich +// Author: Navaneeth Kunhi Purayil, ETH Zurich + +#include +#include +#include +#include + +#include DATAHEADER +#include "kernel/sa-gemv.c" + +#if (PREC == 64) +#define T double +#elif (PREC == 32) +#define T float +#elif (PREC == 16) +#define T __fp16 +#else +#define T double +#endif + +// Debugging defines +// #define DEBUG_NZ +// #define DEBUG_NZ_IDX +// #define DEBUG_GEMV_PreLD +// #define DEBUG_GEMV_DB + +T *vec_buf0; +T *vec_buf1; +uint16_t *dense_idx; +T *dense_vec; +T *mat_buf0; +T *mat_buf1; +T *result; + +static inline int fp_check(const T *a, const T *b) { + const T threshold = 0.001; + + // Absolute value + float comp = (float)*a - (float)*b; + if (comp < 0) + comp = -comp; + + return comp > threshold; +} + +int main() { + const unsigned int num_cores = snrt_cluster_core_num(); + const unsigned int cid = snrt_cluster_core_idx(); + + // Reset timer + unsigned int timer = (unsigned int)-1; + unsigned int timer_best = (unsigned int)-1; + unsigned int timer_nz = (unsigned int)-1; + const unsigned int m_core = gemv_l.M / num_cores; + // Size (in KiB) of L1 SPM, used to calculate tiling window + const unsigned int spm_size = 128; + + // For Sparse Attention GEMV, we need several steps + // 1. Find all non-zeros + // 2. Calculate the GEMV + // What to be double buffered? + // 1. 2 chunks of sparse vector + densed vector (output) + densed idx + // 2. 
2 chunks of matrix + densed vector (output) + densed idx + + // Sizes of each part we need + const uint32_t mat_size = sizeof(T) * gemv_l.M * tot_nz_dram; + const uint32_t row_size = sizeof(T) * gemv_l.M; + const uint32_t vec_size = sizeof(T) * gemv_l.N; + const uint32_t dense_vec_size = sizeof(T) * tot_nz_dram; + const uint32_t dense_idx_size = sizeof(uint16_t) * tot_nz_dram; + const uint32_t result_size = sizeof(T) * gemv_l.M; + + // leave 8 KiB for Stack + const uint32_t l1_size = (spm_size - 8) * 1024; + const uint32_t fixed_alloc_size = + dense_vec_size + dense_idx_size + result_size; + + // --- BOUNDS CHECK 1: Do the fixed arrays fit in L1? --- + if (fixed_alloc_size >= l1_size) { + if (cid == 0) { + PRINTF("FATAL: L1 Memory Overflow! Fixed arrays require %u bytes, but " + "only %u bytes available.\n", + fixed_alloc_size, l1_size); + } + snrt_cluster_hw_barrier(); + return -1; // Exit gracefully + } + + const uint32_t l1_for_chunk = l1_size - fixed_alloc_size; + + // How many whole rows (or columns) can fit in half the L1 chunk space? + const uint32_t num_row_mat = (l1_for_chunk / 2) / row_size; + + // --- BOUNDS CHECK 2: Can we double buffer at least 1 row? --- + if (num_row_mat < 1) { + if (cid == 0) { + PRINTF("FATAL: L1 Memory Overflow! Cannot fit at least 2 rows for double " + "buffering. " + "Chunk space left: %u bytes, Row size: %u bytes.\n", + l1_for_chunk, row_size); + } + snrt_cluster_hw_barrier(); + return -1; // Exit gracefully + } + + // Always strictly split the available memory in half for double-buffering + const uint32_t vec_chunk_size = l1_for_chunk / 2; + const uint32_t num_vec_chunk = + (l1_for_chunk > vec_size) + ? 
1 + : ((vec_size + vec_chunk_size - 1) / vec_chunk_size); + + // Recalculate exact chunk size based on whole rows + const uint32_t mat_chunk_size = num_row_mat * row_size; + + // Number of chunks based on the total non-zeros we need to process + const uint32_t num_mat_chunk = (tot_nz_dram + num_row_mat - 1) / num_row_mat; + + // Number of elements in each chunk + const uint32_t vec_chunk_len = vec_chunk_size / sizeof(T); + + // Memory Allocation + if (cid == 0) { + result = (T *)snrt_l1alloc(result_size); + dense_vec = (T *)snrt_l1alloc(dense_vec_size); + vec_buf0 = (T *)snrt_l1alloc(l1_for_chunk); + dense_idx = (uint16_t *)snrt_l1alloc(dense_idx_size); + + // Offset by half of the size if needed by double buffering + vec_buf1 = vec_buf0 + vec_chunk_len; + + mat_buf0 = vec_buf0; + mat_buf1 = vec_buf1; + } + + // MUST zero out the memory accumulator! + if (cid == 0) { + for (unsigned int i = 0; i < gemv_l.M; i++) { + result[i] = 0.0; + } + } + snrt_cluster_hw_barrier(); + + if (cid == 0) + start_kernel(); + + timer = benchmark_get_cycle(); + + // Calculate internal pointers + T *vec_ptr = vec_buf0; + T *vec_db_ptr = vec_buf1; + + // Task 1: Find out the non-zeros + if (cid == 0) { +#ifdef DEBUG_NZ + PRINTF("NZ-Calc PreLD\n"); + PRINTF("DMA SRC:%p, TGT:%p, SIZE:%u\n", vec_ptr, gemv_vec_dram, + vec_chunk_size); +#endif + snrt_dma_start_1d(vec_ptr, gemv_vec_dram, vec_chunk_size); + snrt_dma_wait_all(); + } + + uint32_t nz_count = 0; + + if (cid == 0) { + for (unsigned int i = 0; i < num_vec_chunk; ++i) { + // Step 1.1: preload the next chunk if not the end + // Make sure the previous load completes + snrt_dma_wait_all(); + // Double buffer to search the next non-zero + uint32_t next_bytes = + (vec_size - (i + 1) * vec_chunk_size < vec_chunk_size) + ? 
(vec_size - (i + 1) * vec_chunk_size) + : vec_chunk_size; + + if (i < num_vec_chunk - 1) { +#ifdef DEBUG_NZ + PRINTF("NZ-Calc DB Iter%u\n", i); + PRINTF("DMA SRC:%p, TGT:%p, SIZE:%u\n", + gemv_vec_dram + (i + 1) * vec_chunk_len, vec_db_ptr, next_bytes); +#endif + snrt_dma_start_1d(vec_db_ptr, gemv_vec_dram + (i + 1) * vec_chunk_len, + next_bytes); // Use exact bytes + } + + for (unsigned int j = 0; j < vec_chunk_len; ++j) { + if ((float)vec_ptr[j] != 0.0) { + dense_vec[nz_count] = vec_ptr[j]; + dense_idx[nz_count] = i * vec_chunk_len + j; + nz_count++; + } + + if (nz_count == tot_nz_dram) + break; + } + + if (nz_count == tot_nz_dram) + break; + + if (i % 2 == 0) { + // pointer exchange + vec_ptr = vec_buf1; + vec_db_ptr = vec_buf0; + } else { + vec_ptr = vec_buf0; + vec_db_ptr = vec_buf1; + } + } + } + + snrt_cluster_hw_barrier(); + +#ifdef DEBUG_NZ + if (cid == 0) + PRINTF("Non-Zero Calc Complete\n"); +#endif + +#ifdef DEBUG_NZ_IDX + if (cid == 0) { + for (uint32_t i = 0; i < tot_nz_dram; i++) { + PRINTF("IDX[%u]=%u\n", i, dense_idx[i]); + } + } +#endif + + timer_nz = benchmark_get_cycle() - timer_nz; // NOTE(review): was "- timer_nz", but timer_nz held its (unsigned)-1 init value; the start stamp was taken into "timer" above, so subtract "timer" when touching this hunk next + timer = benchmark_get_cycle(); + + // Task 2: GEMV calculation + // Calculate internal pointers + T *mat_ptr = mat_buf0; + T *mat_db_ptr = mat_buf1; + T *result_core = result + m_core * cid; + uint16_t *idx_ptr = dense_idx; // Corrected pointer type + + if (cid == 0) { + // Determine how many rows are actually active for this very first chunk + uint32_t active_rows = + (tot_nz_dram < num_row_mat) ? 
tot_nz_dram : num_row_mat; + +#ifdef DEBUG_GEMV_PreLD + PRINTF("GEMV PreLD\n"); + PRINTF("Active Rows:%u\n", active_rows); +#endif + + for (unsigned int i = 0; i < active_rows; i++) { +#ifdef DEBUG + PRINTF("Row:%u, SRC:%p, TGT:%p, SIZE:%u\n", i, + gemv_mat_dram + (size_t)(*idx_ptr) * gemv_l.M, + mat_ptr + i * gemv_l.M, row_size); +#endif + snrt_dma_start_1d(mat_ptr + i * gemv_l.M, // Pack linearly into L1 + gemv_mat_dram + + (size_t)(*idx_ptr) * gemv_l.M, // Source from DRAM + row_size); + idx_ptr++; + } + } + + snrt_cluster_hw_barrier(); + +#ifdef DEBUG_GEMV_PreLD + if (cid == 0) + PRINTF("GEMV PreLD Complete\n"); +#endif + +#ifdef DEBUG_GEMV_DB + if (cid == 0) + PRINTF("Tot Chunks %u\n", num_mat_chunk); +#endif + + for (unsigned int chunk_idx = 0; chunk_idx < num_mat_chunk; chunk_idx++) { + // Wait for the CURRENT chunk to finish loading + if (cid == 0) { + snrt_dma_wait_all(); + } + snrt_cluster_hw_barrier(); + + // Determine bounds for the NEXT chunk (for background DMA) + uint32_t next_chunk_start = (chunk_idx + 1) * num_row_mat; + uint32_t next_active_rows = 0; + + if (next_chunk_start < tot_nz_dram) { + next_active_rows = (tot_nz_dram - next_chunk_start < num_row_mat) + ? (tot_nz_dram - next_chunk_start) + : num_row_mat; + } + +#ifdef DEBUG_GEMV_DB + if (cid == 0) + PRINTF("Chunk%u, DB Rows%u\n", chunk_idx, next_active_rows); +#endif + + // Load NEXT chunk in the background + if (cid == 0 && next_active_rows > 0) { + for (unsigned int i = 0; i < next_active_rows; i++) { +#ifdef DEBUG_GEMV_DB + PRINTF("Ptr:%p, Row:%u, SRC:%p, TGT:%p, SIZE:%u\n", idx_ptr, i, + gemv_mat_dram + (size_t)(*idx_ptr) * gemv_l.M, + mat_db_ptr + i * gemv_l.M, row_size); +#endif + snrt_dma_start_1d(mat_db_ptr + i * gemv_l.M, + gemv_mat_dram + (size_t)(*idx_ptr) * gemv_l.M, + row_size); + idx_ptr++; + } + } + + // Calculate active rows for the CURRENT compute phase + uint32_t curr_active_rows = + (tot_nz_dram - chunk_idx * num_row_mat < num_row_mat) + ? 
(tot_nz_dram - chunk_idx * num_row_mat) + : num_row_mat; + + // Calculate GEMV on the current chunk + T *current_dense_vec = dense_vec + chunk_idx * num_row_mat; + + // Offset the matrix pointer by m_core * cid so each core reads its correct + // rows + T *mat_core_ptr = mat_ptr + m_core * cid; + +#if (PREC == 64) + gemv_v64b_m4(mat_core_ptr, current_dense_vec, result_core, gemv_l.M, m_core, + curr_active_rows); +#elif (PREC == 32) + gemv_v32b_m4(mat_core_ptr, current_dense_vec, result_core, gemv_l.M, m_core, + curr_active_rows); +#else + gemv_v16b_m4(mat_core_ptr, current_dense_vec, result_core, gemv_l.M, m_core, + curr_active_rows); +#endif + + // Swap pointers for the next iteration + T *temp = mat_ptr; + mat_ptr = mat_db_ptr; + mat_db_ptr = temp; + } + + snrt_cluster_hw_barrier(); + + timer = benchmark_get_cycle() - timer; + + if (cid == 0) + stop_kernel(); + + // Result Checking + if (cid == 0) { + // Checking + for (unsigned int i = 0; i < gemv_l.M; i++) { + if (fp_check(&result[i], &gemv_result[i])) { +#if (PREC == 64) + PRINTF("Error: ID: %i Result = %f, Golden = %f\n", i, result[i], + gemv_result[i]); +#else + PRINTF("Error: ID: %i Result = %x, Golden = %x\n", i, + *(int *)&result[i], *(int *)&gemv_result[i]); +#endif + } + } + } + + snrt_cluster_hw_barrier(); + + // Check and display results + // Assume 2 core 4 fpu configuration + if (cid == 0) { + // Flops per cycle + long unsigned int performance = 1000 * 2 * gemv_l.M * tot_nz_dram / timer; + // Ideal perf = MACC * NCore * Nfpu * Prec adjustment + long unsigned int utilization = + performance / (2 * num_cores * 4 * 8 / sizeof(T)); + + PRINTF("\n----- (%d x %d) x (%d x 1) sa-gemv -----\n", gemv_l.M, gemv_l.N, + tot_nz_dram); + PRINTF("The NZ finding takes %u cycles.\n", timer_nz); + PRINTF("The GEMV execution took %u cycles.\n", timer); + PRINTF("The performance is %ld OP/1000cycle (%ld%%o utilization).\n", + performance, utilization); + } + + // Wait for core 0 to finish displaying results + 
snrt_cluster_hw_barrier(); + return 0; +} diff --git a/sw/spatzBenchmarks/sa-gemv/script/gen_data.py b/sw/spatzBenchmarks/sa-gemv/script/gen_data.py new file mode 100644 index 00000000..4d268d6c --- /dev/null +++ b/sw/spatzBenchmarks/sa-gemv/script/gen_data.py @@ -0,0 +1,225 @@ +#!/usr/bin/env python3 +# Copyright 2022 ETH Zurich and University of Bologna. +# Licensed under the Apache License, Version 2.0, see LICENSE for details. +# SPDX-License-Identifier: Apache-2.0 + +# Author: Matheus Cavalcante + +import numpy as np +import torch +import argparse +import pathlib +import hjson + +np.random.seed(42) +torch.manual_seed(42) + +global verbose + + +def array_to_cstr(a, fmt=float): + out = "{\n" + if fmt == float: + if isinstance(a, np.ndarray): + a = a.flat + if isinstance(a, torch.Tensor): + a = a.numpy().flat + for el in a: + out += "\t{},\n".format(el) + else: + for sign, exp, mant in zip( + a["sign"].numpy().flat, + a["exponent"].numpy().flat, + a["mantissa"].numpy().flat, + ): + value = sign * 2**7 + exp * 2**2 + mant + out += "0x{:02x},\n".format(value) + out = out[:-2] + "}" + return out + + +def emit_header_file(layer_type: str, **kwargs): + file_path = pathlib.Path(__file__).parent.parent / "data" + emit_str = ( + "// Copyright 2025 ETH Zurich and University of Bologna.\n" + + "// Licensed under the Apache License, Version 2.0, see LICENSE for details.\n" + + "// SPDX-License-Identifier: Apache-2.0\n\n" + + "// This file was generated automatically.\n\n" + ) + + file = file_path / ("data_" + str(kwargs["M"]) + "_" + str(kwargs["N"]) + "_" + str(kwargs["tot_nz"]) + "_" + str(kwargs["prec"]) + ".h") + emit_str += emit_gemv_layer(**kwargs) + with file.open("w") as f: + f.write(emit_str) + + +def emit_gemv_layer(name="gemv", **kwargs): + mat_A = kwargs["A"] + vec_B = kwargs["B"] + result = kwargs["result"] + + m = kwargs["M"] + n = kwargs["N"] + tot_nz = kwargs["tot_nz"] + + layer_str = "" + layer_str += '#include "layer.h"\n\n' + layer_str += f"const 
gemv_layer {name}_l = {{\n" + layer_str += f"\t.M = {m},\n" + layer_str += f"\t.N = {n},\n" + layer_str += f'\t.dtype = FP{kwargs["prec"]}' + layer_str += "};\n\n" + + # Export the total non-zeros directly so the kernel can use it + layer_str += f"const uint32_t tot_nz_dram = {tot_nz};\n\n" + + ctypes = {"64": "double", "32": "float", "16": "__fp16", "8": "char"} + + dtype = ctypes[str(kwargs["prec"])] + if dtype != "char": + layer_str += ( + f'static {dtype} {name}_mat_dram[{m}*{n}] __attribute__((section(".data"))) = ' + + array_to_cstr(mat_A) + + ";\n\n" + ) + layer_str += ( + f'static {dtype} {name}_vec_dram[{n}] __attribute__((section(".data"))) = ' + + array_to_cstr(vec_B) + + ";\n\n" + ) + layer_str += ( + f'static {dtype} {name}_result[{m}] __attribute__((section(".data"))) = ' + + array_to_cstr(result) + + ";\n" + ) + # Assuming you have variables like M (output size) and tot_nz (number of non-zeros) + layer_str += '// Auto-generated buffers for Cache Mode\n' + layer_str += f'static uint16_t dense_idx_dram[{tot_nz}] __attribute__((section(".data"))) = {{0}};\n' + layer_str += f'static {dtype} dense_vec_dram[{tot_nz}] __attribute__((section(".data"))) = {{0.0}};\n' + layer_str += f'static {dtype} result_buf_dram[{m}] __attribute__((section(".data"))) = {{0.0}};\n' + else: + layer_str += ( + f"static {dtype} {name}_mat_dram[{m}*{n}] = " + + array_to_cstr(kwargs["bits_A"], fmt="char") + + ";\n\n\n" + ) + layer_str += ( + f"static {dtype} {name}_vec_dram[{n}] = " + + array_to_cstr(kwargs["bits_B"], fmt="char") + + ";\n\n\n" + ) + layer_str += ( + f"static {dtype} {name}_result[{m}] = " + + array_to_cstr(kwargs["result"], fmt="char") + + ";\n\n\n" + ) + layer_str += ( + f"static {dtype} {name}_result_buf_dram[{m}] =" + + array_to_cstr(kwargs["result"], fmt="char") + + ";\n\n\n" + ) + + return layer_str + + +def rand_data_generator(shape, prec, alt=False): + if prec == 64: + return torch.randn(shape, requires_grad=False, dtype=torch.float64), {} + elif prec == 
32: + return torch.randn(shape, requires_grad=False, dtype=torch.float32), {} + elif prec == 16: + if alt: + # Generate in FP32, cast to BF16 + return torch.randn(shape, requires_grad=False, dtype=torch.float32).to(torch.bfloat16), {} + else: + # Generate in FP32, cast to FP16 + return torch.randn(shape, requires_grad=False, dtype=torch.float32).to(torch.float16), {} + elif prec == 8: + sign = torch.randint( + 0, 2, shape, requires_grad=False, dtype=torch.uint8 + ) # -1 or 1 + exponent = torch.randint( + 0, 16, shape, requires_grad=False, dtype=torch.uint8 + ) # < 0b01111 + mantissa = torch.randint( + 0, 4, shape, requires_grad=False, dtype=torch.uint8 + ) # can be arbitrary + bits = {"sign": sign, "exponent": exponent, "mantissa": mantissa} + # TODO: not actually correct + return ((-1.0) ** sign.double()) * (2.0 ** (exponent.double() - 15.0)) * ( + 1.0 + mantissa.double() / (2**2) + ), bits + + +def gemv(a, b): + print(a.shape, b.shape) + # Upcast to float32 for CPU math, then downcast back to the original dtype + return torch.matmul(a.float(), b.float()).to(a.dtype) + + +def main(): + + parser = argparse.ArgumentParser(description="Generate data for kernels") + parser.add_argument( + "-c", + "--cfg", + type=pathlib.Path, + required=True, + help="Select param config file kernel", + ) + parser.add_argument("-v", "--verbose", action="store_true", help="Set verbose") + + args = parser.parse_args() + + global verbose + verbose = args.verbose + + with args.cfg.open() as f: + param = hjson.loads(f.read()) + + # Read tot_nz from the hjson file + tot_nz = param["tot_nz"] + + mat_A, bits_A = rand_data_generator((param["M"], param["N"]), param["prec"]) + vec_B, bits_B = rand_data_generator((param["N"], 1), param["prec"]) + + # --- Sparsity Logic --- + # Randomly select `tot_nz` indices to keep, set the rest to 0.0 + nz_indices = torch.randperm(param["N"])[:tot_nz] + mask = torch.zeros((param["N"], 1), dtype=torch.bool) + mask[nz_indices, 0] = True + + # Temporarily upcast 
to float32 for the masking math, then cast back + vec_B = (vec_B.float() * mask).to(vec_B.dtype) + + # Also zero out the raw bits if using 8-bit precision to maintain parity + if bool(bits_B): + for k in bits_B.keys(): + # Apply the mask, ensuring the shape matches the 1D bits array format + bits_B[k] = bits_B[k] * mask.squeeze().byte() + # ---------------------- + + # Calculate result using the now-sparse vector + result = gemv(mat_A, vec_B) + + # Store A in col major format + mat_A = mat_A.T + + kwargs = { + "A": mat_A, + "B": vec_B, + "result": result, + "M": param["M"], + "N": param["N"], + "tot_nz": tot_nz, # Pass the new parameter down + "prec": param["prec"], + "expand": param["expand"], + "bits_A": bits_A, + "bits_B": bits_B, + } + + emit_header_file("gemv", **kwargs) + + +if __name__ == "__main__": + main() diff --git a/sw/spatzBenchmarks/sa-gemv/script/sa_gemv_128_4096_512_16.json b/sw/spatzBenchmarks/sa-gemv/script/sa_gemv_128_4096_512_16.json new file mode 100644 index 00000000..e14c70e4 --- /dev/null +++ b/sw/spatzBenchmarks/sa-gemv/script/sa_gemv_128_4096_512_16.json @@ -0,0 +1,16 @@ +// Copyright 2020 ETH Zurich and University of Bologna. +// Solderpad Hardware License, Version 0.51, see LICENSE for details. +// SPDX-License-Identifier: SHL-0.51 + +// Parameters for a GEMV + +{ + kernel: "GEMV" + M: 128, + N: 4096, + tot_nz: 512 + transpose_A: false, + transpose_B: false, + prec: 16, + expand: 0 +} diff --git a/sw/spatzBenchmarks/sa-gemv/script/sa_gemv_256_128_16_64.json b/sw/spatzBenchmarks/sa-gemv/script/sa_gemv_256_128_16_64.json new file mode 100644 index 00000000..8942de89 --- /dev/null +++ b/sw/spatzBenchmarks/sa-gemv/script/sa_gemv_256_128_16_64.json @@ -0,0 +1,16 @@ +// Copyright 2020 ETH Zurich and University of Bologna. +// Solderpad Hardware License, Version 0.51, see LICENSE for details. 
+// SPDX-License-Identifier: SHL-0.51 + +// Parameters for a GEMV + +{ + kernel: "GEMV" + M: 256, + N: 128, + tot_nz: 16 + transpose_A: false, + transpose_B: false, + prec: 64, + expand: 0 +} diff --git a/sw/spatzBenchmarks/sdotp-bp-fmatmul/script/gen_data.py b/sw/spatzBenchmarks/sdotp-bp-fmatmul/script/gen_data.py index 9918713e..4da0e20d 100755 --- a/sw/spatzBenchmarks/sdotp-bp-fmatmul/script/gen_data.py +++ b/sw/spatzBenchmarks/sdotp-bp-fmatmul/script/gen_data.py @@ -26,6 +26,9 @@ def array_to_cstr(a, fmt=float): if isinstance(a, np.ndarray): a = a.flat if isinstance(a, torch.Tensor): + # Fallback to float32 if numpy struggles with bfloat16, otherwise leave alone + if a.dtype == torch.bfloat16: + a = a.float() a = a.cpu().numpy().flat for el in a: out += "{}, ".format(el) @@ -96,7 +99,7 @@ def emit_conv2d_layer(name="conv2d", **kwargs): layer_str += f'static double {name}_result[{oh}][{ow}][{co}] __attribute__((section(".data")));\n\n' layer_str += ( f"static double {name}_checksum[{oh}][{ow}] = " - + array_to_cstr(torch.sum(ofmap, dim=-1)) + + array_to_cstr(torch.sum(ofmap.float(), dim=-1)) + ";\n\n\n" ) layer_str += ( @@ -167,23 +170,23 @@ def emit_GEMM_layer(name="gemm", **kwargs): + ";\n\n\n" ) layer_str += ( - f"static const {dtype} {name}_checksum[{m}] = " - + array_to_cstr(torch.sum(result, dim=-1)) + f"static const {dtype} {name}_result[{m}*{n}] = " + + array_to_cstr(result) + ";\n\n\n" ) else: layer_str += ( - f"static {dtype} {name}_A_dram [{m}*{k}] = " + f"static {dtype} {name}_A_dram [{m}][{k}] = " + array_to_cstr(kwargs["bits_A"], fmt="char") + ";\n\n\n" ) layer_str += ( - f"static {dtype} {name}_B_dram [{k}*{n}] = " + f"static {dtype} {name}_B_dram [{k}][{n}] = " + array_to_cstr(kwargs["bits_B"], fmt="char") + ";\n\n\n" ) layer_str += ( - f"static {dtype} {name}_C_dram [{m}*{n}] = " + f"static {dtype} {name}_C_dram [{m}][{n}] = " + array_to_cstr(kwargs["bits_C"], fmt="char") + ";\n\n\n" ) @@ -215,7 +218,7 @@ def 
emit_batchnorm_layer(name="batchnorm", **kwargs): layer_str += f'static double {name}_result[{oh}][{ow}][{co}] __attribute__((section(".data")));\n\n' layer_str += ( f"static double {name}_checksum[{oh}][{ow}] = " - + array_to_cstr(torch.sum(ofmap, dim=-1)) + + array_to_cstr(torch.sum(ofmap.float(), dim=-1)) + ";\n\n\n" ) layer_str += ( @@ -263,7 +266,7 @@ def emit_maxpool_layer(name="maxpool", **kwargs): layer_str += f'static double {name}_result[{oh}][{ow}][{co}] __attribute__((section(".data")));\n\n' layer_str += ( f"static double {name}_checksum[{oh}][{ow}] = " - + array_to_cstr(torch.sum(ofmap, dim=-1)) + + array_to_cstr(torch.sum(ofmap.float(), dim=-1)) + ";\n\n\n" ) layer_str += ( @@ -363,14 +366,9 @@ def rand_data_generator(shape, prec, alt=False): return torch.randn(shape, requires_grad=False, dtype=torch.float32), {} elif prec == 16: if alt: - return torch.randn(shape, requires_grad=False, dtype=torch.bfloat16), {} + return torch.randn(shape, requires_grad=False, dtype=torch.float32).to(torch.bfloat16), {} else: - return ( - torch.randn( - shape, requires_grad=False, dtype=torch.float16, device=device - ), - {}, - ) + return torch.randn(shape, requires_grad=False, dtype=torch.float32, device=device).to(torch.float16), {} elif prec == 8: sign = torch.randint( 0, 2, shape, requires_grad=False, dtype=torch.uint8 @@ -389,6 +387,10 @@ def rand_data_generator(shape, prec, alt=False): def conv2d(ifmap, weights, padding=1, stride=1): + orig_dtype = ifmap.dtype + if orig_dtype in [torch.float16, torch.bfloat16]: + ifmap, weights = ifmap.float(), weights.float() + n, ci, ih, iw = ifmap.shape co, _, fh, fw = weights.shape @@ -399,34 +401,48 @@ def conv2d(ifmap, weights, padding=1, stride=1): ) ofmap = conv2d(ifmap) - return ofmap + return ofmap.to(orig_dtype) def max_pooling(ifmap, kernel): + orig_dtype = ifmap.dtype + if orig_dtype in [torch.float16, torch.bfloat16]: + ifmap = ifmap.float() + n, ci, ih, iw = ifmap.shape max_pool = nn.MaxPool2d(kernel_size=kernel) 
ofmap = max_pool(ifmap) - return ofmap + return ofmap.to(orig_dtype) def batchnorm(ifmap): + orig_dtype = ifmap.dtype + if orig_dtype in [torch.float16, torch.bfloat16]: + ifmap = ifmap.float() + n, ci, ih, iw = ifmap.shape bn = torch.nn.BatchNorm2d(ci) bn.weight.requires_grad = False bn.bias.requires_grad = False - running_mean = torch.randn_like(bn.running_mean, requires_grad=False) - running_var = torch.rand_like(bn.running_var, requires_grad=False) + + running_mean = torch.randn(bn.running_mean.shape, requires_grad=False, dtype=torch.float32).to(bn.running_mean.dtype) + running_var = torch.rand(bn.running_var.shape, requires_grad=False, dtype=torch.float32).to(bn.running_var.dtype) + gamma = bn.weight / torch.sqrt(running_var + bn.eps) beta = bn.bias - running_mean * bn.weight / torch.sqrt(running_var + bn.eps) ofmap = ifmap * gamma.unsqueeze(-1).unsqueeze(-1) + beta.unsqueeze(-1).unsqueeze(-1) - return ofmap, gamma, beta + return ofmap.to(orig_dtype), gamma.to(orig_dtype), beta.to(orig_dtype) def fused_conv( ifmap, weights, bn_k, bn_l, padding, stride, bn, relu, accumulate, depthwise ): + orig_dtype = ifmap.dtype + is_half = orig_dtype in [torch.float16, torch.bfloat16] + if is_half: + ifmap, weights, bn_k, bn_l = ifmap.float(), weights.float(), bn_k.float(), bn_l.float() ih, iw, ci = ifmap.shape if not depthwise: @@ -459,7 +475,7 @@ def fused_conv( co, ) if accumulate: - ofmap_before = torch.randn_like(ofmap, requires_grad=False) + ofmap_before = torch.randn(ofmap.shape, requires_grad=False, dtype=torch.float32).to(ofmap.dtype) else: ofmap_before = torch.zeros_like(ofmap, requires_grad=False) @@ -499,6 +515,11 @@ def fused_conv( if relu: ofmap = torch.nn.functional.relu(ofmap) + if is_half: + ofmap = ofmap.to(orig_dtype) + ofmap_before = ofmap_before.to(orig_dtype) + ifmap_padded = ifmap_padded.to(orig_dtype) + return ofmap, ofmap_before, ifmap_padded @@ -538,16 +559,16 @@ def main(): param["input_dim"]["height"], param["input_dim"]["width"], 
requires_grad=False, - dtype=dtype, - ) + dtype=torch.float32, + ).to(dtype) weights = torch.randn( param["channels"]["out"], param["channels"]["in"], param["filter"]["height"], param["filter"]["width"], requires_grad=False, - dtype=dtype, - ) + dtype=torch.float32, + ).to(dtype) ofmap = conv2d( ifmap, @@ -568,7 +589,10 @@ def main(): mat_B, bits_B = rand_data_generator((param["K"], param["N"]), param["prec"]) mat_C, bits_C = rand_data_generator((param["M"], param["N"]), param["prec"]) - result = torch.matmul(mat_A, mat_B) + if mat_A.dtype in [torch.float16, torch.bfloat16]: + result = torch.matmul(mat_A.float(), mat_B.float()).to(mat_A.dtype) + else: + result = torch.matmul(mat_A, mat_B) if param["transpose_A"]: mat_A = mat_A.T @@ -602,8 +626,8 @@ def main(): param["input_dim"]["height"], param["input_dim"]["width"], requires_grad=False, - dtype=dtype, - ) + dtype=torch.float32, + ).to(dtype) ofmap, gamma, beta = batchnorm(ifmap) @@ -621,8 +645,8 @@ def main(): param["input_dim"]["height"], param["input_dim"]["width"], requires_grad=False, - dtype=dtype, - ) + dtype=torch.float32, + ).to(dtype) ofmap = max_pooling(ifmap, param["kernel_size"]) @@ -639,8 +663,8 @@ def main(): param["dim_in_x"], param["ch_in"], requires_grad=False, - dtype=dtype, - ) + dtype=torch.float32, + ).to(dtype) if not param["depthwise"]: kernel = torch.randn( param["ch_out"], @@ -648,31 +672,31 @@ def main(): param["dim_kernel_x"], param["ch_in"], requires_grad=False, - dtype=dtype, - ) + dtype=torch.float32, + ).to(dtype) else: kernel = torch.randn( param["dim_kernel_y"], param["dim_kernel_x"], param["ch_in"], requires_grad=False, - dtype=dtype, - ) + dtype=torch.float32, + ).to(dtype) - bn_k = torch.randn(param["ch_out"], requires_grad=False) - bn_l = torch.randn(param["ch_out"], requires_grad=False) + bn_k = torch.randn(param["ch_out"], requires_grad=False, dtype=torch.float32).to(dtype) + bn_l = torch.randn(param["ch_out"], requires_grad=False, dtype=torch.float32).to(dtype) ofmap, 
ofmap_before, ifmap_padded = fused_conv( ifmap, kernel, bn_k, bn_l, - param["padding"], - param["stride"], - param["flags"]["flag_batch_norm"], - param["flags"]["flag_relu"], - not param["flags"]["flag_y_accumulate_start"], - param["depthwise"], + padding=param["padding"], + stride=param["stride"], + bn=param["flags"]["flag_batch_norm"], + relu=param["flags"]["flag_relu"], + accumulate=not param["flags"]["flag_y_accumulate_start"], + depthwise=param["depthwise"], ) if param["chw_layer"]: diff --git a/sw/spatzBenchmarks/sdotp-hp-fmatmul/script/gen_data.py b/sw/spatzBenchmarks/sdotp-hp-fmatmul/script/gen_data.py index 0cd18994..4da0e20d 100755 --- a/sw/spatzBenchmarks/sdotp-hp-fmatmul/script/gen_data.py +++ b/sw/spatzBenchmarks/sdotp-hp-fmatmul/script/gen_data.py @@ -26,6 +26,9 @@ def array_to_cstr(a, fmt=float): if isinstance(a, np.ndarray): a = a.flat if isinstance(a, torch.Tensor): + # Fallback to float32 if numpy struggles with bfloat16, otherwise leave alone + if a.dtype == torch.bfloat16: + a = a.float() a = a.cpu().numpy().flat for el in a: out += "{}, ".format(el) @@ -96,7 +99,7 @@ def emit_conv2d_layer(name="conv2d", **kwargs): layer_str += f'static double {name}_result[{oh}][{ow}][{co}] __attribute__((section(".data")));\n\n' layer_str += ( f"static double {name}_checksum[{oh}][{ow}] = " - + array_to_cstr(torch.sum(ofmap, dim=-1)) + + array_to_cstr(torch.sum(ofmap.float(), dim=-1)) + ";\n\n\n" ) layer_str += ( @@ -215,7 +218,7 @@ def emit_batchnorm_layer(name="batchnorm", **kwargs): layer_str += f'static double {name}_result[{oh}][{ow}][{co}] __attribute__((section(".data")));\n\n' layer_str += ( f"static double {name}_checksum[{oh}][{ow}] = " - + array_to_cstr(torch.sum(ofmap, dim=-1)) + + array_to_cstr(torch.sum(ofmap.float(), dim=-1)) + ";\n\n\n" ) layer_str += ( @@ -263,7 +266,7 @@ def emit_maxpool_layer(name="maxpool", **kwargs): layer_str += f'static double {name}_result[{oh}][{ow}][{co}] __attribute__((section(".data")));\n\n' layer_str += ( f"static 
double {name}_checksum[{oh}][{ow}] = " - + array_to_cstr(torch.sum(ofmap, dim=-1)) + + array_to_cstr(torch.sum(ofmap.float(), dim=-1)) + ";\n\n\n" ) layer_str += ( @@ -363,14 +366,9 @@ def rand_data_generator(shape, prec, alt=False): return torch.randn(shape, requires_grad=False, dtype=torch.float32), {} elif prec == 16: if alt: - return torch.randn(shape, requires_grad=False, dtype=torch.bfloat16), {} + return torch.randn(shape, requires_grad=False, dtype=torch.float32).to(torch.bfloat16), {} else: - return ( - torch.randn( - shape, requires_grad=False, dtype=torch.float16, device=device - ), - {}, - ) + return torch.randn(shape, requires_grad=False, dtype=torch.float32, device=device).to(torch.float16), {} elif prec == 8: sign = torch.randint( 0, 2, shape, requires_grad=False, dtype=torch.uint8 @@ -389,6 +387,10 @@ def rand_data_generator(shape, prec, alt=False): def conv2d(ifmap, weights, padding=1, stride=1): + orig_dtype = ifmap.dtype + if orig_dtype in [torch.float16, torch.bfloat16]: + ifmap, weights = ifmap.float(), weights.float() + n, ci, ih, iw = ifmap.shape co, _, fh, fw = weights.shape @@ -399,34 +401,48 @@ def conv2d(ifmap, weights, padding=1, stride=1): ) ofmap = conv2d(ifmap) - return ofmap + return ofmap.to(orig_dtype) def max_pooling(ifmap, kernel): + orig_dtype = ifmap.dtype + if orig_dtype in [torch.float16, torch.bfloat16]: + ifmap = ifmap.float() + n, ci, ih, iw = ifmap.shape max_pool = nn.MaxPool2d(kernel_size=kernel) ofmap = max_pool(ifmap) - return ofmap + return ofmap.to(orig_dtype) def batchnorm(ifmap): + orig_dtype = ifmap.dtype + if orig_dtype in [torch.float16, torch.bfloat16]: + ifmap = ifmap.float() + n, ci, ih, iw = ifmap.shape bn = torch.nn.BatchNorm2d(ci) bn.weight.requires_grad = False bn.bias.requires_grad = False - running_mean = torch.randn_like(bn.running_mean, requires_grad=False) - running_var = torch.rand_like(bn.running_var, requires_grad=False) + + running_mean = torch.randn(bn.running_mean.shape, requires_grad=False, 
dtype=torch.float32).to(bn.running_mean.dtype) + running_var = torch.rand(bn.running_var.shape, requires_grad=False, dtype=torch.float32).to(bn.running_var.dtype) + gamma = bn.weight / torch.sqrt(running_var + bn.eps) beta = bn.bias - running_mean * bn.weight / torch.sqrt(running_var + bn.eps) ofmap = ifmap * gamma.unsqueeze(-1).unsqueeze(-1) + beta.unsqueeze(-1).unsqueeze(-1) - return ofmap, gamma, beta + return ofmap.to(orig_dtype), gamma.to(orig_dtype), beta.to(orig_dtype) def fused_conv( ifmap, weights, bn_k, bn_l, padding, stride, bn, relu, accumulate, depthwise ): + orig_dtype = ifmap.dtype + is_half = orig_dtype in [torch.float16, torch.bfloat16] + if is_half: + ifmap, weights, bn_k, bn_l = ifmap.float(), weights.float(), bn_k.float(), bn_l.float() ih, iw, ci = ifmap.shape if not depthwise: @@ -459,7 +475,7 @@ def fused_conv( co, ) if accumulate: - ofmap_before = torch.randn_like(ofmap, requires_grad=False) + ofmap_before = torch.randn(ofmap.shape, requires_grad=False, dtype=torch.float32).to(ofmap.dtype) else: ofmap_before = torch.zeros_like(ofmap, requires_grad=False) @@ -499,6 +515,11 @@ def fused_conv( if relu: ofmap = torch.nn.functional.relu(ofmap) + if is_half: + ofmap = ofmap.to(orig_dtype) + ofmap_before = ofmap_before.to(orig_dtype) + ifmap_padded = ifmap_padded.to(orig_dtype) + return ofmap, ofmap_before, ifmap_padded @@ -538,16 +559,16 @@ def main(): param["input_dim"]["height"], param["input_dim"]["width"], requires_grad=False, - dtype=dtype, - ) + dtype=torch.float32, + ).to(dtype) weights = torch.randn( param["channels"]["out"], param["channels"]["in"], param["filter"]["height"], param["filter"]["width"], requires_grad=False, - dtype=dtype, - ) + dtype=torch.float32, + ).to(dtype) ofmap = conv2d( ifmap, @@ -568,7 +589,10 @@ def main(): mat_B, bits_B = rand_data_generator((param["K"], param["N"]), param["prec"]) mat_C, bits_C = rand_data_generator((param["M"], param["N"]), param["prec"]) - result = torch.matmul(mat_A, mat_B) + if mat_A.dtype in 
[torch.float16, torch.bfloat16]: + result = torch.matmul(mat_A.float(), mat_B.float()).to(mat_A.dtype) + else: + result = torch.matmul(mat_A, mat_B) if param["transpose_A"]: mat_A = mat_A.T @@ -602,8 +626,8 @@ def main(): param["input_dim"]["height"], param["input_dim"]["width"], requires_grad=False, - dtype=dtype, - ) + dtype=torch.float32, + ).to(dtype) ofmap, gamma, beta = batchnorm(ifmap) @@ -621,8 +645,8 @@ def main(): param["input_dim"]["height"], param["input_dim"]["width"], requires_grad=False, - dtype=dtype, - ) + dtype=torch.float32, + ).to(dtype) ofmap = max_pooling(ifmap, param["kernel_size"]) @@ -639,8 +663,8 @@ def main(): param["dim_in_x"], param["ch_in"], requires_grad=False, - dtype=dtype, - ) + dtype=torch.float32, + ).to(dtype) if not param["depthwise"]: kernel = torch.randn( param["ch_out"], @@ -648,31 +672,31 @@ def main(): param["dim_kernel_x"], param["ch_in"], requires_grad=False, - dtype=dtype, - ) + dtype=torch.float32, + ).to(dtype) else: kernel = torch.randn( param["dim_kernel_y"], param["dim_kernel_x"], param["ch_in"], requires_grad=False, - dtype=dtype, - ) + dtype=torch.float32, + ).to(dtype) - bn_k = torch.randn(param["ch_out"], requires_grad=False) - bn_l = torch.randn(param["ch_out"], requires_grad=False) + bn_k = torch.randn(param["ch_out"], requires_grad=False, dtype=torch.float32).to(dtype) + bn_l = torch.randn(param["ch_out"], requires_grad=False, dtype=torch.float32).to(dtype) ofmap, ofmap_before, ifmap_padded = fused_conv( ifmap, kernel, bn_k, bn_l, - param["padding"], - param["stride"], - param["flags"]["flag_batch_norm"], - param["flags"]["flag_relu"], - not param["flags"]["flag_y_accumulate_start"], - param["depthwise"], + padding=param["padding"], + stride=param["stride"], + bn=param["flags"]["flag_batch_norm"], + relu=param["flags"]["flag_relu"], + accumulate=not param["flags"]["flag_y_accumulate_start"], + depthwise=param["depthwise"], ) if param["chw_layer"]: diff --git 
a/sw/spatzBenchmarks/widening-bp-fmatmul/script/gen_data.py b/sw/spatzBenchmarks/widening-bp-fmatmul/script/gen_data.py index 9918713e..4da0e20d 100755 --- a/sw/spatzBenchmarks/widening-bp-fmatmul/script/gen_data.py +++ b/sw/spatzBenchmarks/widening-bp-fmatmul/script/gen_data.py @@ -26,6 +26,9 @@ def array_to_cstr(a, fmt=float): if isinstance(a, np.ndarray): a = a.flat if isinstance(a, torch.Tensor): + # Fallback to float32 if numpy struggles with bfloat16, otherwise leave alone + if a.dtype == torch.bfloat16: + a = a.float() a = a.cpu().numpy().flat for el in a: out += "{}, ".format(el) @@ -96,7 +99,7 @@ def emit_conv2d_layer(name="conv2d", **kwargs): layer_str += f'static double {name}_result[{oh}][{ow}][{co}] __attribute__((section(".data")));\n\n' layer_str += ( f"static double {name}_checksum[{oh}][{ow}] = " - + array_to_cstr(torch.sum(ofmap, dim=-1)) + + array_to_cstr(torch.sum(ofmap.float(), dim=-1)) + ";\n\n\n" ) layer_str += ( @@ -167,23 +170,23 @@ def emit_GEMM_layer(name="gemm", **kwargs): + ";\n\n\n" ) layer_str += ( - f"static const {dtype} {name}_checksum[{m}] = " - + array_to_cstr(torch.sum(result, dim=-1)) + f"static const {dtype} {name}_result[{m}*{n}] = " + + array_to_cstr(result) + ";\n\n\n" ) else: layer_str += ( - f"static {dtype} {name}_A_dram [{m}*{k}] = " + f"static {dtype} {name}_A_dram [{m}][{k}] = " + array_to_cstr(kwargs["bits_A"], fmt="char") + ";\n\n\n" ) layer_str += ( - f"static {dtype} {name}_B_dram [{k}*{n}] = " + f"static {dtype} {name}_B_dram [{k}][{n}] = " + array_to_cstr(kwargs["bits_B"], fmt="char") + ";\n\n\n" ) layer_str += ( - f"static {dtype} {name}_C_dram [{m}*{n}] = " + f"static {dtype} {name}_C_dram [{m}][{n}] = " + array_to_cstr(kwargs["bits_C"], fmt="char") + ";\n\n\n" ) @@ -215,7 +218,7 @@ def emit_batchnorm_layer(name="batchnorm", **kwargs): layer_str += f'static double {name}_result[{oh}][{ow}][{co}] __attribute__((section(".data")));\n\n' layer_str += ( f"static double {name}_checksum[{oh}][{ow}] = " - + 
array_to_cstr(torch.sum(ofmap, dim=-1)) + + array_to_cstr(torch.sum(ofmap.float(), dim=-1)) + ";\n\n\n" ) layer_str += ( @@ -263,7 +266,7 @@ def emit_maxpool_layer(name="maxpool", **kwargs): layer_str += f'static double {name}_result[{oh}][{ow}][{co}] __attribute__((section(".data")));\n\n' layer_str += ( f"static double {name}_checksum[{oh}][{ow}] = " - + array_to_cstr(torch.sum(ofmap, dim=-1)) + + array_to_cstr(torch.sum(ofmap.float(), dim=-1)) + ";\n\n\n" ) layer_str += ( @@ -363,14 +366,9 @@ def rand_data_generator(shape, prec, alt=False): return torch.randn(shape, requires_grad=False, dtype=torch.float32), {} elif prec == 16: if alt: - return torch.randn(shape, requires_grad=False, dtype=torch.bfloat16), {} + return torch.randn(shape, requires_grad=False, dtype=torch.float32).to(torch.bfloat16), {} else: - return ( - torch.randn( - shape, requires_grad=False, dtype=torch.float16, device=device - ), - {}, - ) + return torch.randn(shape, requires_grad=False, dtype=torch.float32, device=device).to(torch.float16), {} elif prec == 8: sign = torch.randint( 0, 2, shape, requires_grad=False, dtype=torch.uint8 @@ -389,6 +387,10 @@ def rand_data_generator(shape, prec, alt=False): def conv2d(ifmap, weights, padding=1, stride=1): + orig_dtype = ifmap.dtype + if orig_dtype in [torch.float16, torch.bfloat16]: + ifmap, weights = ifmap.float(), weights.float() + n, ci, ih, iw = ifmap.shape co, _, fh, fw = weights.shape @@ -399,34 +401,48 @@ def conv2d(ifmap, weights, padding=1, stride=1): ) ofmap = conv2d(ifmap) - return ofmap + return ofmap.to(orig_dtype) def max_pooling(ifmap, kernel): + orig_dtype = ifmap.dtype + if orig_dtype in [torch.float16, torch.bfloat16]: + ifmap = ifmap.float() + n, ci, ih, iw = ifmap.shape max_pool = nn.MaxPool2d(kernel_size=kernel) ofmap = max_pool(ifmap) - return ofmap + return ofmap.to(orig_dtype) def batchnorm(ifmap): + orig_dtype = ifmap.dtype + if orig_dtype in [torch.float16, torch.bfloat16]: + ifmap = ifmap.float() + n, ci, ih, iw = 
ifmap.shape bn = torch.nn.BatchNorm2d(ci) bn.weight.requires_grad = False bn.bias.requires_grad = False - running_mean = torch.randn_like(bn.running_mean, requires_grad=False) - running_var = torch.rand_like(bn.running_var, requires_grad=False) + + running_mean = torch.randn(bn.running_mean.shape, requires_grad=False, dtype=torch.float32).to(bn.running_mean.dtype) + running_var = torch.rand(bn.running_var.shape, requires_grad=False, dtype=torch.float32).to(bn.running_var.dtype) + gamma = bn.weight / torch.sqrt(running_var + bn.eps) beta = bn.bias - running_mean * bn.weight / torch.sqrt(running_var + bn.eps) ofmap = ifmap * gamma.unsqueeze(-1).unsqueeze(-1) + beta.unsqueeze(-1).unsqueeze(-1) - return ofmap, gamma, beta + return ofmap.to(orig_dtype), gamma.to(orig_dtype), beta.to(orig_dtype) def fused_conv( ifmap, weights, bn_k, bn_l, padding, stride, bn, relu, accumulate, depthwise ): + orig_dtype = ifmap.dtype + is_half = orig_dtype in [torch.float16, torch.bfloat16] + if is_half: + ifmap, weights, bn_k, bn_l = ifmap.float(), weights.float(), bn_k.float(), bn_l.float() ih, iw, ci = ifmap.shape if not depthwise: @@ -459,7 +475,7 @@ def fused_conv( co, ) if accumulate: - ofmap_before = torch.randn_like(ofmap, requires_grad=False) + ofmap_before = torch.randn(ofmap.shape, requires_grad=False, dtype=torch.float32).to(ofmap.dtype) else: ofmap_before = torch.zeros_like(ofmap, requires_grad=False) @@ -499,6 +515,11 @@ def fused_conv( if relu: ofmap = torch.nn.functional.relu(ofmap) + if is_half: + ofmap = ofmap.to(orig_dtype) + ofmap_before = ofmap_before.to(orig_dtype) + ifmap_padded = ifmap_padded.to(orig_dtype) + return ofmap, ofmap_before, ifmap_padded @@ -538,16 +559,16 @@ def main(): param["input_dim"]["height"], param["input_dim"]["width"], requires_grad=False, - dtype=dtype, - ) + dtype=torch.float32, + ).to(dtype) weights = torch.randn( param["channels"]["out"], param["channels"]["in"], param["filter"]["height"], param["filter"]["width"], requires_grad=False, - 
dtype=dtype, - ) + dtype=torch.float32, + ).to(dtype) ofmap = conv2d( ifmap, @@ -568,7 +589,10 @@ def main(): mat_B, bits_B = rand_data_generator((param["K"], param["N"]), param["prec"]) mat_C, bits_C = rand_data_generator((param["M"], param["N"]), param["prec"]) - result = torch.matmul(mat_A, mat_B) + if mat_A.dtype in [torch.float16, torch.bfloat16]: + result = torch.matmul(mat_A.float(), mat_B.float()).to(mat_A.dtype) + else: + result = torch.matmul(mat_A, mat_B) if param["transpose_A"]: mat_A = mat_A.T @@ -602,8 +626,8 @@ def main(): param["input_dim"]["height"], param["input_dim"]["width"], requires_grad=False, - dtype=dtype, - ) + dtype=torch.float32, + ).to(dtype) ofmap, gamma, beta = batchnorm(ifmap) @@ -621,8 +645,8 @@ def main(): param["input_dim"]["height"], param["input_dim"]["width"], requires_grad=False, - dtype=dtype, - ) + dtype=torch.float32, + ).to(dtype) ofmap = max_pooling(ifmap, param["kernel_size"]) @@ -639,8 +663,8 @@ def main(): param["dim_in_x"], param["ch_in"], requires_grad=False, - dtype=dtype, - ) + dtype=torch.float32, + ).to(dtype) if not param["depthwise"]: kernel = torch.randn( param["ch_out"], @@ -648,31 +672,31 @@ def main(): param["dim_kernel_x"], param["ch_in"], requires_grad=False, - dtype=dtype, - ) + dtype=torch.float32, + ).to(dtype) else: kernel = torch.randn( param["dim_kernel_y"], param["dim_kernel_x"], param["ch_in"], requires_grad=False, - dtype=dtype, - ) + dtype=torch.float32, + ).to(dtype) - bn_k = torch.randn(param["ch_out"], requires_grad=False) - bn_l = torch.randn(param["ch_out"], requires_grad=False) + bn_k = torch.randn(param["ch_out"], requires_grad=False, dtype=torch.float32).to(dtype) + bn_l = torch.randn(param["ch_out"], requires_grad=False, dtype=torch.float32).to(dtype) ofmap, ofmap_before, ifmap_padded = fused_conv( ifmap, kernel, bn_k, bn_l, - param["padding"], - param["stride"], - param["flags"]["flag_batch_norm"], - param["flags"]["flag_relu"], - not param["flags"]["flag_y_accumulate_start"], - 
param["depthwise"], + padding=param["padding"], + stride=param["stride"], + bn=param["flags"]["flag_batch_norm"], + relu=param["flags"]["flag_relu"], + accumulate=not param["flags"]["flag_y_accumulate_start"], + depthwise=param["depthwise"], ) if param["chw_layer"]: diff --git a/sw/spatzBenchmarks/widening-hp-fmatmul/script/gen_data.py b/sw/spatzBenchmarks/widening-hp-fmatmul/script/gen_data.py index 0cd18994..4da0e20d 100755 --- a/sw/spatzBenchmarks/widening-hp-fmatmul/script/gen_data.py +++ b/sw/spatzBenchmarks/widening-hp-fmatmul/script/gen_data.py @@ -26,6 +26,9 @@ def array_to_cstr(a, fmt=float): if isinstance(a, np.ndarray): a = a.flat if isinstance(a, torch.Tensor): + # Fallback to float32 if numpy struggles with bfloat16, otherwise leave alone + if a.dtype == torch.bfloat16: + a = a.float() a = a.cpu().numpy().flat for el in a: out += "{}, ".format(el) @@ -96,7 +99,7 @@ def emit_conv2d_layer(name="conv2d", **kwargs): layer_str += f'static double {name}_result[{oh}][{ow}][{co}] __attribute__((section(".data")));\n\n' layer_str += ( f"static double {name}_checksum[{oh}][{ow}] = " - + array_to_cstr(torch.sum(ofmap, dim=-1)) + + array_to_cstr(torch.sum(ofmap.float(), dim=-1)) + ";\n\n\n" ) layer_str += ( @@ -215,7 +218,7 @@ def emit_batchnorm_layer(name="batchnorm", **kwargs): layer_str += f'static double {name}_result[{oh}][{ow}][{co}] __attribute__((section(".data")));\n\n' layer_str += ( f"static double {name}_checksum[{oh}][{ow}] = " - + array_to_cstr(torch.sum(ofmap, dim=-1)) + + array_to_cstr(torch.sum(ofmap.float(), dim=-1)) + ";\n\n\n" ) layer_str += ( @@ -263,7 +266,7 @@ def emit_maxpool_layer(name="maxpool", **kwargs): layer_str += f'static double {name}_result[{oh}][{ow}][{co}] __attribute__((section(".data")));\n\n' layer_str += ( f"static double {name}_checksum[{oh}][{ow}] = " - + array_to_cstr(torch.sum(ofmap, dim=-1)) + + array_to_cstr(torch.sum(ofmap.float(), dim=-1)) + ";\n\n\n" ) layer_str += ( @@ -363,14 +366,9 @@ def 
rand_data_generator(shape, prec, alt=False): return torch.randn(shape, requires_grad=False, dtype=torch.float32), {} elif prec == 16: if alt: - return torch.randn(shape, requires_grad=False, dtype=torch.bfloat16), {} + return torch.randn(shape, requires_grad=False, dtype=torch.float32).to(torch.bfloat16), {} else: - return ( - torch.randn( - shape, requires_grad=False, dtype=torch.float16, device=device - ), - {}, - ) + return torch.randn(shape, requires_grad=False, dtype=torch.float32, device=device).to(torch.float16), {} elif prec == 8: sign = torch.randint( 0, 2, shape, requires_grad=False, dtype=torch.uint8 @@ -389,6 +387,10 @@ def rand_data_generator(shape, prec, alt=False): def conv2d(ifmap, weights, padding=1, stride=1): + orig_dtype = ifmap.dtype + if orig_dtype in [torch.float16, torch.bfloat16]: + ifmap, weights = ifmap.float(), weights.float() + n, ci, ih, iw = ifmap.shape co, _, fh, fw = weights.shape @@ -399,34 +401,48 @@ def conv2d(ifmap, weights, padding=1, stride=1): ) ofmap = conv2d(ifmap) - return ofmap + return ofmap.to(orig_dtype) def max_pooling(ifmap, kernel): + orig_dtype = ifmap.dtype + if orig_dtype in [torch.float16, torch.bfloat16]: + ifmap = ifmap.float() + n, ci, ih, iw = ifmap.shape max_pool = nn.MaxPool2d(kernel_size=kernel) ofmap = max_pool(ifmap) - return ofmap + return ofmap.to(orig_dtype) def batchnorm(ifmap): + orig_dtype = ifmap.dtype + if orig_dtype in [torch.float16, torch.bfloat16]: + ifmap = ifmap.float() + n, ci, ih, iw = ifmap.shape bn = torch.nn.BatchNorm2d(ci) bn.weight.requires_grad = False bn.bias.requires_grad = False - running_mean = torch.randn_like(bn.running_mean, requires_grad=False) - running_var = torch.rand_like(bn.running_var, requires_grad=False) + + running_mean = torch.randn(bn.running_mean.shape, requires_grad=False, dtype=torch.float32).to(bn.running_mean.dtype) + running_var = torch.rand(bn.running_var.shape, requires_grad=False, dtype=torch.float32).to(bn.running_var.dtype) + gamma = bn.weight / 
torch.sqrt(running_var + bn.eps) beta = bn.bias - running_mean * bn.weight / torch.sqrt(running_var + bn.eps) ofmap = ifmap * gamma.unsqueeze(-1).unsqueeze(-1) + beta.unsqueeze(-1).unsqueeze(-1) - return ofmap, gamma, beta + return ofmap.to(orig_dtype), gamma.to(orig_dtype), beta.to(orig_dtype) def fused_conv( ifmap, weights, bn_k, bn_l, padding, stride, bn, relu, accumulate, depthwise ): + orig_dtype = ifmap.dtype + is_half = orig_dtype in [torch.float16, torch.bfloat16] + if is_half: + ifmap, weights, bn_k, bn_l = ifmap.float(), weights.float(), bn_k.float(), bn_l.float() ih, iw, ci = ifmap.shape if not depthwise: @@ -459,7 +475,7 @@ def fused_conv( co, ) if accumulate: - ofmap_before = torch.randn_like(ofmap, requires_grad=False) + ofmap_before = torch.randn(ofmap.shape, requires_grad=False, dtype=torch.float32).to(ofmap.dtype) else: ofmap_before = torch.zeros_like(ofmap, requires_grad=False) @@ -499,6 +515,11 @@ def fused_conv( if relu: ofmap = torch.nn.functional.relu(ofmap) + if is_half: + ofmap = ofmap.to(orig_dtype) + ofmap_before = ofmap_before.to(orig_dtype) + ifmap_padded = ifmap_padded.to(orig_dtype) + return ofmap, ofmap_before, ifmap_padded @@ -538,16 +559,16 @@ def main(): param["input_dim"]["height"], param["input_dim"]["width"], requires_grad=False, - dtype=dtype, - ) + dtype=torch.float32, + ).to(dtype) weights = torch.randn( param["channels"]["out"], param["channels"]["in"], param["filter"]["height"], param["filter"]["width"], requires_grad=False, - dtype=dtype, - ) + dtype=torch.float32, + ).to(dtype) ofmap = conv2d( ifmap, @@ -568,7 +589,10 @@ def main(): mat_B, bits_B = rand_data_generator((param["K"], param["N"]), param["prec"]) mat_C, bits_C = rand_data_generator((param["M"], param["N"]), param["prec"]) - result = torch.matmul(mat_A, mat_B) + if mat_A.dtype in [torch.float16, torch.bfloat16]: + result = torch.matmul(mat_A.float(), mat_B.float()).to(mat_A.dtype) + else: + result = torch.matmul(mat_A, mat_B) if param["transpose_A"]: mat_A = 
mat_A.T @@ -602,8 +626,8 @@ def main(): param["input_dim"]["height"], param["input_dim"]["width"], requires_grad=False, - dtype=dtype, - ) + dtype=torch.float32, + ).to(dtype) ofmap, gamma, beta = batchnorm(ifmap) @@ -621,8 +645,8 @@ def main(): param["input_dim"]["height"], param["input_dim"]["width"], requires_grad=False, - dtype=dtype, - ) + dtype=torch.float32, + ).to(dtype) ofmap = max_pooling(ifmap, param["kernel_size"]) @@ -639,8 +663,8 @@ def main(): param["dim_in_x"], param["ch_in"], requires_grad=False, - dtype=dtype, - ) + dtype=torch.float32, + ).to(dtype) if not param["depthwise"]: kernel = torch.randn( param["ch_out"], @@ -648,31 +672,31 @@ def main(): param["dim_kernel_x"], param["ch_in"], requires_grad=False, - dtype=dtype, - ) + dtype=torch.float32, + ).to(dtype) else: kernel = torch.randn( param["dim_kernel_y"], param["dim_kernel_x"], param["ch_in"], requires_grad=False, - dtype=dtype, - ) + dtype=torch.float32, + ).to(dtype) - bn_k = torch.randn(param["ch_out"], requires_grad=False) - bn_l = torch.randn(param["ch_out"], requires_grad=False) + bn_k = torch.randn(param["ch_out"], requires_grad=False, dtype=torch.float32).to(dtype) + bn_l = torch.randn(param["ch_out"], requires_grad=False, dtype=torch.float32).to(dtype) ofmap, ofmap_before, ifmap_padded = fused_conv( ifmap, kernel, bn_k, bn_l, - param["padding"], - param["stride"], - param["flags"]["flag_batch_norm"], - param["flags"]["flag_relu"], - not param["flags"]["flag_y_accumulate_start"], - param["depthwise"], + padding=param["padding"], + stride=param["stride"], + bn=param["flags"]["flag_batch_norm"], + relu=param["flags"]["flag_relu"], + accumulate=not param["flags"]["flag_y_accumulate_start"], + depthwise=param["depthwise"], ) if param["chw_layer"]: