diff --git a/hw/ip/snitch_test/src/tb_bin.sv b/hw/ip/snitch_test/src/tb_bin.sv index 469a3ebf..423f9d9e 100644 --- a/hw/ip/snitch_test/src/tb_bin.sv +++ b/hw/ip/snitch_test/src/tb_bin.sv @@ -46,11 +46,13 @@ module tb_bin; if (exit_code == 0) #200ns; end while (exit_code == 0); + exit_code >>= 1; - if (exit_code > 0) begin - $error("[FAILURE] Finished with exit code %2d", exit_code); - end else begin + + if (exit_code == 0) begin $info("[SUCCESS] Program finished successfully"); + end else begin + $error("[FAILURE] Finished with exit code %2d", exit_code); end $finish; end diff --git a/hw/ip/snitch_test/src/verilator_lib.cc b/hw/ip/snitch_test/src/verilator_lib.cc index b04a44db..583d54c2 100644 --- a/hw/ip/snitch_test/src/verilator_lib.cc +++ b/hw/ip/snitch_test/src/verilator_lib.cc @@ -32,10 +32,10 @@ int Sim::run() { target.init(sim_thread_main, this); int exit_code = htif_t::run(); - if (exit_code > 0) - fprintf(stderr, "[FAILURE] Finished with exit code %2d\n", exit_code); - else + if (exit_code == 0) fprintf(stderr, "[SUCCESS] Program finished successfully\n"); + else + fprintf(stderr, "[FAILURE] Finished with exit code %2d\n", exit_code); return exit_code; } diff --git a/hw/system/spatz_cluster/Makefile b/hw/system/spatz_cluster/Makefile index 3fa7bd52..eb87cc4e 100644 --- a/hw/system/spatz_cluster/Makefile +++ b/hw/system/spatz_cluster/Makefile @@ -133,6 +133,20 @@ spatz.gendata: fi \ done +.PHONY: spatz.cleandata +spatz.cleandata: + @for benchmark_dir in $(ROOT)/sw/spatzBenchmarks/*/; do \ + data_dir="$$benchmark_dir/data"; \ + if [ -d "$$data_dir" ]; then \ + data_count=$$(find "$$data_dir" -name 'data*.h' -type f 2>/dev/null | wc -l); \ + if [ "$$data_count" -gt 0 ]; then \ + echo "Cleaning $$data_count data file(s) from $$data_dir"; \ + rm -f "$$data_dir"/data*.h; \ + fi \ + fi \ + done + @echo "All benchmark data cleaned." 
+ ############# # Verilator # ############# diff --git a/sw/spatzBenchmarks/CMakeLists.txt b/sw/spatzBenchmarks/CMakeLists.txt index 70f72e5c..7c7d5170 100644 --- a/sw/spatzBenchmarks/CMakeLists.txt +++ b/sw/spatzBenchmarks/CMakeLists.txt @@ -110,6 +110,7 @@ if (ELEN EQUAL 64) add_spatz_test_threeParam(dp-fmatmul dp-fmatmul/main.c 64 64 64 ) add_spatz_test_twoParam_type(dp-gemv gemv/main.c 64 128 64) + add_spatz_test_threeParam_type(dp-sa-gemv sa-gemv/main.c 256 128 16 64) add_spatz_test_oneParam(dp-faxpy dp-faxpy/main.c 256) add_spatz_test_oneParam(dp-faxpy dp-faxpy/main.c 1024) @@ -121,6 +122,7 @@ if (ELEN EQUAL 64) add_spatz_test_threeParam(dp-fconv2d dp-fconv2d/main.c 64 64 7) add_spatz_test_twoParam(dp-fft dp-fft/main.c 128 2) + endif() add_spatz_test_threeParam(sp-fmatmul sp-fmatmul/main.c 64 64 64 ) @@ -150,6 +152,7 @@ add_spatz_test_threeParam(sdotp-bp-fmatmul sdotp-bp-fmatmul/main.c 128 256 128) add_spatz_test_twoParam_type(sp-gemv gemv/main.c 128 128 32) add_spatz_test_twoParam_type(hp-gemv gemv/main.c 256 128 16) +add_spatz_test_threeParam_type(hp-sa-gemv sa-gemv/main.c 128 4096 512 16) add_spatz_test_twoParam(sp-fft sp-fft/main.c 256 2) add_spatz_test_twoParam(sp-fft sp-fft/main.c 512 2) diff --git a/sw/spatzBenchmarks/gemv/script/gen_data.py b/sw/spatzBenchmarks/gemv/script/gen_data.py index 177733ef..532d725b 100644 --- a/sw/spatzBenchmarks/gemv/script/gen_data.py +++ b/sw/spatzBenchmarks/gemv/script/gen_data.py @@ -23,7 +23,8 @@ def array_to_cstr(a, fmt=float): if isinstance(a, np.ndarray): a = list(a.flat) elif isinstance(a, torch.Tensor): - a = a.numpy().flatten().tolist() + # Universal Fix: Cast to float32 before sending to NumPy/C-string to avoid formatting errors + a = a.float().numpy().flatten().tolist() else: a = list(a) for i, el in enumerate(a): @@ -121,9 +122,11 @@ def rand_data_generator(shape, prec, alt=False): return torch.randn(shape, requires_grad=False, dtype=torch.float32), {} elif prec == 16: if alt: - return torch.randn(shape, 
requires_grad=False, dtype=torch.bfloat16), {} + # Universal Fix: Generate FP32, cast to BF16 + return torch.randn(shape, requires_grad=False, dtype=torch.float32).to(torch.bfloat16), {} else: - return torch.randn(shape, requires_grad=False, dtype=torch.float16), {} + # Universal Fix: Generate FP32, cast to FP16 + return torch.randn(shape, requires_grad=False, dtype=torch.float32).to(torch.float16), {} elif prec == 8: sign = torch.randint( 0, 2, shape, requires_grad=False, dtype=torch.uint8 @@ -142,15 +145,8 @@ def rand_data_generator(shape, prec, alt=False): def gemv(a, b): - # PyTorch doesn't support matmul for float16 on CPU, so convert to float32 - original_dtype = a.dtype - if original_dtype == torch.float16: - a = a.float() - b = b.float() - result = torch.matmul(a, b) - if original_dtype == torch.float16: - result = result.half() - return result + # Universal Fix: One-liner upcast and downcast + return torch.matmul(a.float(), b.float()).to(a.dtype) def main(): diff --git a/sw/spatzBenchmarks/hp-fmatmul/script/gen_data.py b/sw/spatzBenchmarks/hp-fmatmul/script/gen_data.py index 0cd18994..4da0e20d 100755 --- a/sw/spatzBenchmarks/hp-fmatmul/script/gen_data.py +++ b/sw/spatzBenchmarks/hp-fmatmul/script/gen_data.py @@ -26,6 +26,9 @@ def array_to_cstr(a, fmt=float): if isinstance(a, np.ndarray): a = a.flat if isinstance(a, torch.Tensor): + # Fallback to float32 if numpy struggles with bfloat16, otherwise leave alone + if a.dtype == torch.bfloat16: + a = a.float() a = a.cpu().numpy().flat for el in a: out += "{}, ".format(el) @@ -96,7 +99,7 @@ def emit_conv2d_layer(name="conv2d", **kwargs): layer_str += f'static double {name}_result[{oh}][{ow}][{co}] __attribute__((section(".data")));\n\n' layer_str += ( f"static double {name}_checksum[{oh}][{ow}] = " - + array_to_cstr(torch.sum(ofmap, dim=-1)) + + array_to_cstr(torch.sum(ofmap.float(), dim=-1)) + ";\n\n\n" ) layer_str += ( @@ -215,7 +218,7 @@ def emit_batchnorm_layer(name="batchnorm", **kwargs): layer_str += 
f'static double {name}_result[{oh}][{ow}][{co}] __attribute__((section(".data")));\n\n' layer_str += ( f"static double {name}_checksum[{oh}][{ow}] = " - + array_to_cstr(torch.sum(ofmap, dim=-1)) + + array_to_cstr(torch.sum(ofmap.float(), dim=-1)) + ";\n\n\n" ) layer_str += ( @@ -263,7 +266,7 @@ def emit_maxpool_layer(name="maxpool", **kwargs): layer_str += f'static double {name}_result[{oh}][{ow}][{co}] __attribute__((section(".data")));\n\n' layer_str += ( f"static double {name}_checksum[{oh}][{ow}] = " - + array_to_cstr(torch.sum(ofmap, dim=-1)) + + array_to_cstr(torch.sum(ofmap.float(), dim=-1)) + ";\n\n\n" ) layer_str += ( @@ -363,14 +366,9 @@ def rand_data_generator(shape, prec, alt=False): return torch.randn(shape, requires_grad=False, dtype=torch.float32), {} elif prec == 16: if alt: - return torch.randn(shape, requires_grad=False, dtype=torch.bfloat16), {} + return torch.randn(shape, requires_grad=False, dtype=torch.float32).to(torch.bfloat16), {} else: - return ( - torch.randn( - shape, requires_grad=False, dtype=torch.float16, device=device - ), - {}, - ) + return torch.randn(shape, requires_grad=False, dtype=torch.float32, device=device).to(torch.float16), {} elif prec == 8: sign = torch.randint( 0, 2, shape, requires_grad=False, dtype=torch.uint8 @@ -389,6 +387,10 @@ def rand_data_generator(shape, prec, alt=False): def conv2d(ifmap, weights, padding=1, stride=1): + orig_dtype = ifmap.dtype + if orig_dtype in [torch.float16, torch.bfloat16]: + ifmap, weights = ifmap.float(), weights.float() + n, ci, ih, iw = ifmap.shape co, _, fh, fw = weights.shape @@ -399,34 +401,48 @@ def conv2d(ifmap, weights, padding=1, stride=1): ) ofmap = conv2d(ifmap) - return ofmap + return ofmap.to(orig_dtype) def max_pooling(ifmap, kernel): + orig_dtype = ifmap.dtype + if orig_dtype in [torch.float16, torch.bfloat16]: + ifmap = ifmap.float() + n, ci, ih, iw = ifmap.shape max_pool = nn.MaxPool2d(kernel_size=kernel) ofmap = max_pool(ifmap) - return ofmap + return 
ofmap.to(orig_dtype) def batchnorm(ifmap): + orig_dtype = ifmap.dtype + if orig_dtype in [torch.float16, torch.bfloat16]: + ifmap = ifmap.float() + n, ci, ih, iw = ifmap.shape bn = torch.nn.BatchNorm2d(ci) bn.weight.requires_grad = False bn.bias.requires_grad = False - running_mean = torch.randn_like(bn.running_mean, requires_grad=False) - running_var = torch.rand_like(bn.running_var, requires_grad=False) + + running_mean = torch.randn(bn.running_mean.shape, requires_grad=False, dtype=torch.float32).to(bn.running_mean.dtype) + running_var = torch.rand(bn.running_var.shape, requires_grad=False, dtype=torch.float32).to(bn.running_var.dtype) + gamma = bn.weight / torch.sqrt(running_var + bn.eps) beta = bn.bias - running_mean * bn.weight / torch.sqrt(running_var + bn.eps) ofmap = ifmap * gamma.unsqueeze(-1).unsqueeze(-1) + beta.unsqueeze(-1).unsqueeze(-1) - return ofmap, gamma, beta + return ofmap.to(orig_dtype), gamma.to(orig_dtype), beta.to(orig_dtype) def fused_conv( ifmap, weights, bn_k, bn_l, padding, stride, bn, relu, accumulate, depthwise ): + orig_dtype = ifmap.dtype + is_half = orig_dtype in [torch.float16, torch.bfloat16] + if is_half: + ifmap, weights, bn_k, bn_l = ifmap.float(), weights.float(), bn_k.float(), bn_l.float() ih, iw, ci = ifmap.shape if not depthwise: @@ -459,7 +475,7 @@ def fused_conv( co, ) if accumulate: - ofmap_before = torch.randn_like(ofmap, requires_grad=False) + ofmap_before = torch.randn(ofmap.shape, requires_grad=False, dtype=torch.float32).to(ofmap.dtype) else: ofmap_before = torch.zeros_like(ofmap, requires_grad=False) @@ -499,6 +515,11 @@ def fused_conv( if relu: ofmap = torch.nn.functional.relu(ofmap) + if is_half: + ofmap = ofmap.to(orig_dtype) + ofmap_before = ofmap_before.to(orig_dtype) + ifmap_padded = ifmap_padded.to(orig_dtype) + return ofmap, ofmap_before, ifmap_padded @@ -538,16 +559,16 @@ def main(): param["input_dim"]["height"], param["input_dim"]["width"], requires_grad=False, - dtype=dtype, - ) + dtype=torch.float32, + 
).to(dtype) weights = torch.randn( param["channels"]["out"], param["channels"]["in"], param["filter"]["height"], param["filter"]["width"], requires_grad=False, - dtype=dtype, - ) + dtype=torch.float32, + ).to(dtype) ofmap = conv2d( ifmap, @@ -568,7 +589,10 @@ def main(): mat_B, bits_B = rand_data_generator((param["K"], param["N"]), param["prec"]) mat_C, bits_C = rand_data_generator((param["M"], param["N"]), param["prec"]) - result = torch.matmul(mat_A, mat_B) + if mat_A.dtype in [torch.float16, torch.bfloat16]: + result = torch.matmul(mat_A.float(), mat_B.float()).to(mat_A.dtype) + else: + result = torch.matmul(mat_A, mat_B) if param["transpose_A"]: mat_A = mat_A.T @@ -602,8 +626,8 @@ def main(): param["input_dim"]["height"], param["input_dim"]["width"], requires_grad=False, - dtype=dtype, - ) + dtype=torch.float32, + ).to(dtype) ofmap, gamma, beta = batchnorm(ifmap) @@ -621,8 +645,8 @@ def main(): param["input_dim"]["height"], param["input_dim"]["width"], requires_grad=False, - dtype=dtype, - ) + dtype=torch.float32, + ).to(dtype) ofmap = max_pooling(ifmap, param["kernel_size"]) @@ -639,8 +663,8 @@ def main(): param["dim_in_x"], param["ch_in"], requires_grad=False, - dtype=dtype, - ) + dtype=torch.float32, + ).to(dtype) if not param["depthwise"]: kernel = torch.randn( param["ch_out"], @@ -648,31 +672,31 @@ def main(): param["dim_kernel_x"], param["ch_in"], requires_grad=False, - dtype=dtype, - ) + dtype=torch.float32, + ).to(dtype) else: kernel = torch.randn( param["dim_kernel_y"], param["dim_kernel_x"], param["ch_in"], requires_grad=False, - dtype=dtype, - ) + dtype=torch.float32, + ).to(dtype) - bn_k = torch.randn(param["ch_out"], requires_grad=False) - bn_l = torch.randn(param["ch_out"], requires_grad=False) + bn_k = torch.randn(param["ch_out"], requires_grad=False, dtype=torch.float32).to(dtype) + bn_l = torch.randn(param["ch_out"], requires_grad=False, dtype=torch.float32).to(dtype) ofmap, ofmap_before, ifmap_padded = fused_conv( ifmap, kernel, bn_k, bn_l, - 
param["padding"], - param["stride"], - param["flags"]["flag_batch_norm"], - param["flags"]["flag_relu"], - not param["flags"]["flag_y_accumulate_start"], - param["depthwise"], + padding=param["padding"], + stride=param["stride"], + bn=param["flags"]["flag_batch_norm"], + relu=param["flags"]["flag_relu"], + accumulate=not param["flags"]["flag_y_accumulate_start"], + depthwise=param["depthwise"], ) if param["chw_layer"]: diff --git a/sw/spatzBenchmarks/sa-gemv/data/layer.h b/sw/spatzBenchmarks/sa-gemv/data/layer.h new file mode 100644 index 00000000..62da8ce6 --- /dev/null +++ b/sw/spatzBenchmarks/sa-gemv/data/layer.h @@ -0,0 +1,16 @@ +// Copyright 2025 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include + +typedef enum { FP64 = 8, FP32 = 4, FP16 = 2, FP8 = 1 } precision_t; + +typedef struct gemv_layer_struct { + uint32_t M; + uint32_t N; + + precision_t dtype; +} gemv_layer; diff --git a/sw/spatzBenchmarks/sa-gemv/kernel/sa-gemv.c b/sw/spatzBenchmarks/sa-gemv/kernel/sa-gemv.c new file mode 100644 index 00000000..31973656 --- /dev/null +++ b/sw/spatzBenchmarks/sa-gemv/kernel/sa-gemv.c @@ -0,0 +1,165 @@ +// Copyright 2025 ETH Zurich and University of Bologna. +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +// Author: Navaneeth Kunhi Purayil, ETH Zurich +// Author: Diyou Shen, ETH Zurich + +#include "sa-gemv.h" + +void gemv_v64b_m4(double *a, double *b, double *c, int M, int M_core, int N) { + unsigned int vl, avl = M_core; + double *a_, *a_start = a; + double *c_ = c; + + do { + a_ = a_start; + double *b_ = b; + asm volatile("vsetvli %0, %1, e64, m4, ta, ma" : "=r"(vl) : "r"(avl)); + + // 1. CLEAR ACCUMULATORS for every new vl block (0 encodes to +0.0 float) + asm volatile("vmv.v.i v4, 0"); + asm volatile("vmv.v.i v12, 0"); + + int col = 0; + for (; col < N - 1; col += 2) { + asm volatile("vle64.v v0, (%0)" ::"r"(a_)); + a_ += M; + asm volatile("vfmacc.vf v4, %0, v0" ::"f"(*b_)); + b_++; + + asm volatile("vle64.v v8, (%0)" ::"r"(a_)); + a_ += M; + asm volatile("vfmacc.vf v12, %0, v8" ::"f"(*b_)); + b_++; + } + + // 2. HANDLE ODD N BOUNDARY + if (col < N) { + asm volatile("vle64.v v0, (%0)" ::"r"(a_)); + a_ += M; + asm volatile("vfmacc.vf v4, %0, v0" ::"f"(*b_)); + } + + asm volatile("vfadd.vv v4, v4, v12"); + + // 3. 
ACCUMULATE INTO MEMORY C (Load -> Add -> Store) + asm volatile("vle64.v v16, (%0)" ::"r"(c_)); + asm volatile("vfadd.vv v4, v4, v16"); + asm volatile("vse64.v v4, (%0)" ::"r"(c_)); + + avl -= vl; + c_ += vl; + a_start += vl; + } while (avl > 0); +} + +void gemv_v32b_m4(float *a, float *b, float *c, int M, int M_core, int N) { + unsigned int vl, avl = M_core; + float *a_, *a_start = a; + float *c_ = c; + + do { + a_ = a_start; + float *b_ = b; + asm volatile("vsetvli %0, %1, e32, m4, ta, ma" : "=r"(vl) : "r"(avl)); + + asm volatile("vmv.v.i v4, 0"); + asm volatile("vmv.v.i v12, 0"); + + int col = 0; + for (; col < N - 1; col += 2) { + asm volatile("vle32.v v0, (%0)" ::"r"(a_)); + a_ += M; + asm volatile("vfmacc.vf v4, %0, v0" ::"f"(*b_)); + b_++; + + asm volatile("vle32.v v8, (%0)" ::"r"(a_)); + a_ += M; + asm volatile("vfmacc.vf v12, %0, v8" ::"f"(*b_)); + b_++; + } + + if (col < N) { + asm volatile("vle32.v v0, (%0)" ::"r"(a_)); + a_ += M; + asm volatile("vfmacc.vf v4, %0, v0" ::"f"(*b_)); + } + + asm volatile("vfadd.vv v4, v4, v12"); + + asm volatile("vle32.v v16, (%0)" ::"r"(c_)); + asm volatile("vfadd.vv v4, v4, v16"); + // Store the combined accumulator back to C. + // NOTE(review): an earlier draft stored v12 here by mistake; after the two + // vfadd.vv ops above, v4 holds v4+v12+v16, so v4 is the correct source + // register for the store. + asm volatile("vse32.v v4, (%0)" ::"r"(c_)); + + avl -= vl; + c_ += vl; + a_start += vl; + } while (avl > 0); +} + +void gemv_v16b_m4(__fp16 *a, __fp16 *b, __fp16 *c, int M, int M_core, int N) { + unsigned int vl, avl = M_core; + __fp16 *a_, *a_start = a; + __fp16 *c_ = c; + + do { + a_ = a_start; + __fp16 *b_ = b; + asm volatile("vsetvli %0, %1, e16, m4, ta, ma" : "=r"(vl) : "r"(avl)); + + asm volatile("vmv.v.i v4, 0"); + asm volatile("vmv.v.i v12, 0"); + + int col = 0; + for (; col < N - 1; col += 2) { + asm volatile("vle16.v v0, (%0)" ::"r"(a_)); + a_ += M; + asm volatile("vle16.v v8, (%0)" ::"r"(a_)); + a_ += M; + + float t0, t1; + asm volatile("flh %[t], 0(%[b])" : [t] "=f"(t0) : [b] 
"r"(b_)); + asm volatile("vfmacc.vf v4, %0, v0" ::"f"(t0)); + b_++; + + asm volatile("flh %[t], 0(%[b])" : [t] "=f"(t1) : [b] "r"(b_)); + asm volatile("vfmacc.vf v12, %0, v8" ::"f"(t1)); + b_++; + } + + if (col < N) { + asm volatile("vle16.v v0, (%0)" ::"r"(a_)); + a_ += M; + float t0; + asm volatile("flh %[t], 0(%[b])" : [t] "=f"(t0) : [b] "r"(b_)); + asm volatile("vfmacc.vf v4, %0, v0" ::"f"(t0)); + } + + asm volatile("vfadd.vv v4, v4, v12"); + + asm volatile("vle16.v v16, (%0)" ::"r"(c_)); + asm volatile("vfadd.vv v4, v4, v16"); + asm volatile("vse16.v v4, (%0)" ::"r"(c_)); + + avl -= vl; + c_ += vl; + a_start += vl; + } while (avl > 0); +} diff --git a/sw/spatzBenchmarks/sa-gemv/kernel/sa-gemv.h b/sw/spatzBenchmarks/sa-gemv/kernel/sa-gemv.h new file mode 100644 index 00000000..1b172260 --- /dev/null +++ b/sw/spatzBenchmarks/sa-gemv/kernel/sa-gemv.h @@ -0,0 +1,26 @@ +// Copyright 2025 ETH Zurich and University of Bologna. +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +// Author: Navaneeth Kunhi Purayil, ETH Zurich + +#ifndef _GEMV_H +#define _GEMV_H + +void gemv_v64b_m4(double *a, double *b, double *c, int M, int M_core, int N); +void gemv_v32b_m4(float *a, float *b, float *c, int M, int M_core, int N); +void gemv_v16b_m4(__fp16 *a, __fp16 *b, __fp16 *c, int M, int M_core, int N); + +#endif diff --git a/sw/spatzBenchmarks/sa-gemv/main.c b/sw/spatzBenchmarks/sa-gemv/main.c new file mode 100644 index 00000000..2875d52c --- /dev/null +++ b/sw/spatzBenchmarks/sa-gemv/main.c @@ -0,0 +1,402 @@ +// Copyright 2025 ETH Zurich and University of Bologna. +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +// Author: Diyou Shen, ETH Zurich +// Author: Navaneeth Kunhi Purayil, ETH Zurich + +#include +#include +#include +#include + +#include DATAHEADER +#include "kernel/sa-gemv.c" + +#if (PREC == 64) +#define T double +#elif (PREC == 32) +#define T float +#elif (PREC == 16) +#define T __fp16 +#else +#define T double +#endif + +// Debugging defines +// #define DEBUG_NZ +// #define DEBUG_NZ_IDX +// #define DEBUG_GEMV_PreLD +// #define DEBUG_GEMV_DB + +T *vec_buf0; +T *vec_buf1; +uint16_t *dense_idx; +T *dense_vec; +T *mat_buf0; +T *mat_buf1; +T *result; + +static inline int fp_check(const T *a, const T *b) { + const T threshold = 0.001; + + // Absolute value + float comp = (float)*a - (float)*b; + if (comp < 0) + comp = -comp; + + return comp > threshold; +} + +int main() { + const unsigned int num_cores = snrt_cluster_core_num(); + const unsigned int cid = snrt_cluster_core_idx(); + + // Reset timer + unsigned int timer = (unsigned int)-1; + unsigned int timer_best = (unsigned int)-1; + unsigned int timer_nz = (unsigned int)-1; + const unsigned int m_core = gemv_l.M / num_cores; + // Size (in KiB) of L1 SPM, used to calculate tiling window + const unsigned int spm_size = 128; + + // For Sparse Attention GEMV, we need several steps + // 1. Find all non-zeros + // 2. Calculate the GEMV + // What to be double buffered? + // 1. 2 chunks of sparse vector + densed vector (output) + densed idx + // 2. 
2 chunks of matrix + densed vector (output) + densed idx + + // Sizes of each part we need + const uint32_t mat_size = sizeof(T) * gemv_l.M * tot_nz_dram; + const uint32_t row_size = sizeof(T) * gemv_l.M; + const uint32_t vec_size = sizeof(T) * gemv_l.N; + const uint32_t dense_vec_size = sizeof(T) * tot_nz_dram; + const uint32_t dense_idx_size = sizeof(uint16_t) * tot_nz_dram; + const uint32_t result_size = sizeof(T) * gemv_l.M; + + // leave 8 KiB for Stack + const uint32_t l1_size = (spm_size - 8) * 1024; + const uint32_t fixed_alloc_size = + dense_vec_size + dense_idx_size + result_size; + + // --- BOUNDS CHECK 1: Do the fixed arrays fit in L1? --- + if (fixed_alloc_size >= l1_size) { + if (cid == 0) { + PRINTF("FATAL: L1 Memory Overflow! Fixed arrays require %u bytes, but " + "only %u bytes available.\n", + fixed_alloc_size, l1_size); + } + snrt_cluster_hw_barrier(); + return -1; // Exit gracefully + } + + const uint32_t l1_for_chunk = l1_size - fixed_alloc_size; + + // How many whole rows (or columns) can fit in half the L1 chunk space? + const uint32_t num_row_mat = (l1_for_chunk / 2) / row_size; + + // --- BOUNDS CHECK 2: Can we double buffer at least 1 row? --- + if (num_row_mat < 1) { + if (cid == 0) { + PRINTF("FATAL: L1 Memory Overflow! Cannot fit at least 2 rows for double " + "buffering. " + "Chunk space left: %u bytes, Row size: %u bytes.\n", + l1_for_chunk, row_size); + } + snrt_cluster_hw_barrier(); + return -1; // Exit gracefully + } + + // Always strictly split the available memory in half for double-buffering + const uint32_t vec_chunk_size = l1_for_chunk / 2; + const uint32_t num_vec_chunk = + (l1_for_chunk > vec_size) + ? 
1 + : ((vec_size + vec_chunk_size - 1) / vec_chunk_size); + + // Recalculate exact chunk size based on whole rows + const uint32_t mat_chunk_size = num_row_mat * row_size; + + // Number of chunks based on the total non-zeros we need to process + const uint32_t num_mat_chunk = (tot_nz_dram + num_row_mat - 1) / num_row_mat; + + // Number of elements in each chunk + const uint32_t vec_chunk_len = vec_chunk_size / sizeof(T); + + // Memory Allocation + if (cid == 0) { + result = (T *)snrt_l1alloc(result_size); + dense_vec = (T *)snrt_l1alloc(dense_vec_size); + vec_buf0 = (T *)snrt_l1alloc(l1_for_chunk); + dense_idx = (uint16_t *)snrt_l1alloc(dense_idx_size); + + // Offset by half of the size if needed by double buffering + vec_buf1 = vec_buf0 + vec_chunk_len; + + mat_buf0 = vec_buf0; + mat_buf1 = vec_buf1; + } + + // MUST zero out the memory accumulator! + if (cid == 0) { + for (unsigned int i = 0; i < gemv_l.M; i++) { + result[i] = 0.0; + } + } + snrt_cluster_hw_barrier(); + + if (cid == 0) + start_kernel(); + + timer = benchmark_get_cycle(); + + // Calculate internal pointers + T *vec_ptr = vec_buf0; + T *vec_db_ptr = vec_buf1; + + // Task 1: Find out the non-zeros + if (cid == 0) { +#ifdef DEBUG_NZ + PRINTF("NZ-Calc PreLD\n"); + PRINTF("DMA SRC:%p, TGT:%p, SIZE:%u\n", vec_ptr, gemv_vec_dram, + vec_chunk_size); +#endif + snrt_dma_start_1d(vec_ptr, gemv_vec_dram, vec_chunk_size); + snrt_dma_wait_all(); + } + + uint32_t nz_count = 0; + + if (cid == 0) { + for (unsigned int i = 0; i < num_vec_chunk; ++i) { + // Step 1.1: preload the next chunk if not the end + // Make sure the previous load completes + snrt_dma_wait_all(); + // Double buffer to search the next non-zero + uint32_t next_bytes = + (vec_size - (i + 1) * vec_chunk_size < vec_chunk_size) + ? 
(vec_size - (i + 1) * vec_chunk_size) + : vec_chunk_size; + + if (i < num_vec_chunk - 1) { +#ifdef DEBUG_NZ + PRINTF("NZ-Calc DB Iter%u\n", i); + PRINTF("DMA SRC:%p, TGT:%p, SIZE:%u\n", + gemv_vec_dram + (i + 1) * vec_chunk_len, vec_db_ptr, next_bytes); +#endif + snrt_dma_start_1d(vec_db_ptr, gemv_vec_dram + (i + 1) * vec_chunk_len, + next_bytes); // Use exact bytes + } + + for (unsigned int j = 0; j < vec_chunk_len; ++j) { + if ((float)vec_ptr[j] != 0.0) { + dense_vec[nz_count] = vec_ptr[j]; + dense_idx[nz_count] = i * vec_chunk_len + j; + nz_count++; + } + + if (nz_count == tot_nz_dram) + break; + } + + if (nz_count == tot_nz_dram) + break; + + if (i % 2 == 0) { + // pointer exchange + vec_ptr = vec_buf1; + vec_db_ptr = vec_buf0; + } else { + vec_ptr = vec_buf0; + vec_db_ptr = vec_buf1; + } + } + } + + snrt_cluster_hw_barrier(); + +#ifdef DEBUG_NZ + if (cid == 0) + PRINTF("Non-Zero Calc Complete\n"); +#endif + +#ifdef DEBUG_NZ_IDX + if (cid == 0) { + for (uint32_t i = 0; i < tot_nz_dram; i++) { + PRINTF("IDX[%u]=%u\n", i, dense_idx[i]); + } + } +#endif + + timer_nz = benchmark_get_cycle() - timer_nz; // NOTE(review): was "- timer_nz", but timer_nz held its (unsigned)-1 init value; the start stamp was taken into "timer" above, so subtract "timer" when touching this hunk next + timer = benchmark_get_cycle(); + + // Task 2: GEMV calculation + // Calculate internal pointers + T *mat_ptr = mat_buf0; + T *mat_db_ptr = mat_buf1; + T *result_core = result + m_core * cid; + uint16_t *idx_ptr = dense_idx; // Corrected pointer type + + if (cid == 0) { + // Determine how many rows are actually active for this very first chunk + uint32_t active_rows = + (tot_nz_dram < num_row_mat) ? 
tot_nz_dram : num_row_mat; + +#ifdef DEBUG_GEMV_PreLD + PRINTF("GEMV PreLD\n"); + PRINTF("Active Rows:%u\n", active_rows); +#endif + + for (unsigned int i = 0; i < active_rows; i++) { +#ifdef DEBUG + PRINTF("Row:%u, SRC:%p, TGT:%p, SIZE:%u\n", i, + gemv_mat_dram + (size_t)(*idx_ptr) * gemv_l.M, + mat_ptr + i * gemv_l.M, row_size); +#endif + snrt_dma_start_1d(mat_ptr + i * gemv_l.M, // Pack linearly into L1 + gemv_mat_dram + + (size_t)(*idx_ptr) * gemv_l.M, // Source from DRAM + row_size); + idx_ptr++; + } + } + + snrt_cluster_hw_barrier(); + +#ifdef DEBUG_GEMV_PreLD + if (cid == 0) + PRINTF("GEMV PreLD Complete\n"); +#endif + +#ifdef DEBUG_GEMV_DB + if (cid == 0) + PRINTF("Tot Chunks %u\n", num_mat_chunk); +#endif + + for (unsigned int chunk_idx = 0; chunk_idx < num_mat_chunk; chunk_idx++) { + // Wait for the CURRENT chunk to finish loading + if (cid == 0) { + snrt_dma_wait_all(); + } + snrt_cluster_hw_barrier(); + + // Determine bounds for the NEXT chunk (for background DMA) + uint32_t next_chunk_start = (chunk_idx + 1) * num_row_mat; + uint32_t next_active_rows = 0; + + if (next_chunk_start < tot_nz_dram) { + next_active_rows = (tot_nz_dram - next_chunk_start < num_row_mat) + ? (tot_nz_dram - next_chunk_start) + : num_row_mat; + } + +#ifdef DEBUG_GEMV_DB + if (cid == 0) + PRINTF("Chunk%u, DB Rows%u\n", chunk_idx, next_active_rows); +#endif + + // Load NEXT chunk in the background + if (cid == 0 && next_active_rows > 0) { + for (unsigned int i = 0; i < next_active_rows; i++) { +#ifdef DEBUG_GEMV_DB + PRINTF("Ptr:%p, Row:%u, SRC:%p, TGT:%p, SIZE:%u\n", idx_ptr, i, + gemv_mat_dram + (size_t)(*idx_ptr) * gemv_l.M, + mat_db_ptr + i * gemv_l.M, row_size); +#endif + snrt_dma_start_1d(mat_db_ptr + i * gemv_l.M, + gemv_mat_dram + (size_t)(*idx_ptr) * gemv_l.M, + row_size); + idx_ptr++; + } + } + + // Calculate active rows for the CURRENT compute phase + uint32_t curr_active_rows = + (tot_nz_dram - chunk_idx * num_row_mat < num_row_mat) + ? 
(tot_nz_dram - chunk_idx * num_row_mat) + : num_row_mat; + + // Calculate GEMV on the current chunk + T *current_dense_vec = dense_vec + chunk_idx * num_row_mat; + + // Offset the matrix pointer by m_core * cid so each core reads its correct + // rows + T *mat_core_ptr = mat_ptr + m_core * cid; + +#if (PREC == 64) + gemv_v64b_m4(mat_core_ptr, current_dense_vec, result_core, gemv_l.M, m_core, + curr_active_rows); +#elif (PREC == 32) + gemv_v32b_m4(mat_core_ptr, current_dense_vec, result_core, gemv_l.M, m_core, + curr_active_rows); +#else + gemv_v16b_m4(mat_core_ptr, current_dense_vec, result_core, gemv_l.M, m_core, + curr_active_rows); +#endif + + // Swap pointers for the next iteration + T *temp = mat_ptr; + mat_ptr = mat_db_ptr; + mat_db_ptr = temp; + } + + snrt_cluster_hw_barrier(); + + timer = benchmark_get_cycle() - timer; + + if (cid == 0) + stop_kernel(); + + // Result Checking + if (cid == 0) { + // Checking + for (unsigned int i = 0; i < gemv_l.M; i++) { + if (fp_check(&result[i], &gemv_result[i])) { +#if (PREC == 64) + PRINTF("Error: ID: %i Result = %f, Golden = %f\n", i, result[i], + gemv_result[i]); +#else + PRINTF("Error: ID: %i Result = %x, Golden = %x\n", i, + *(int *)&result[i], *(int *)&gemv_result[i]); +#endif + } + } + } + + snrt_cluster_hw_barrier(); + + // Check and display results + // Assume 2 core 4 fpu configuration + if (cid == 0) { + // Flops per cycle + long unsigned int performance = 1000 * 2 * gemv_l.M * tot_nz_dram / timer; + // Ideal perf = MACC * NCore * Nfpu * Prec adjustment + long unsigned int utilization = + performance / (2 * num_cores * 4 * 8 / sizeof(T)); + + PRINTF("\n----- (%d x %d) x (%d x 1) sa-gemv -----\n", gemv_l.M, gemv_l.N, + tot_nz_dram); + PRINTF("The NZ finding takes %u cycles.\n", timer_nz); + PRINTF("The GEMV execution took %u cycles.\n", timer); + PRINTF("The performance is %ld OP/1000cycle (%ld%%o utilization).\n", + performance, utilization); + } + + // Wait for core 0 to finish displaying results + 
snrt_cluster_hw_barrier(); + return 0; +} diff --git a/sw/spatzBenchmarks/sa-gemv/script/gen_data.py b/sw/spatzBenchmarks/sa-gemv/script/gen_data.py new file mode 100644 index 00000000..4d268d6c --- /dev/null +++ b/sw/spatzBenchmarks/sa-gemv/script/gen_data.py @@ -0,0 +1,225 @@ +#!/usr/bin/env python3 +# Copyright 2022 ETH Zurich and University of Bologna. +# Licensed under the Apache License, Version 2.0, see LICENSE for details. +# SPDX-License-Identifier: Apache-2.0 + +# Author: Matheus Cavalcante + +import numpy as np +import torch +import argparse +import pathlib +import hjson + +np.random.seed(42) +torch.manual_seed(42) + +global verbose + + +def array_to_cstr(a, fmt=float): + out = "{\n" + if fmt == float: + if isinstance(a, np.ndarray): + a = a.flat + if isinstance(a, torch.Tensor): + a = a.numpy().flat + for el in a: + out += "\t{},\n".format(el) + else: + for sign, exp, mant in zip( + a["sign"].numpy().flat, + a["exponent"].numpy().flat, + a["mantissa"].numpy().flat, + ): + value = sign * 2**7 + exp * 2**2 + mant + out += "0x{:02x},\n".format(value) + out = out[:-2] + "}" + return out + + +def emit_header_file(layer_type: str, **kwargs): + file_path = pathlib.Path(__file__).parent.parent / "data" + emit_str = ( + "// Copyright 2025 ETH Zurich and University of Bologna.\n" + + "// Licensed under the Apache License, Version 2.0, see LICENSE for details.\n" + + "// SPDX-License-Identifier: Apache-2.0\n\n" + + "// This file was generated automatically.\n\n" + ) + + file = file_path / ("data_" + str(kwargs["M"]) + "_" + str(kwargs["N"]) + "_" + str(kwargs["tot_nz"]) + "_" + str(kwargs["prec"]) + ".h") + emit_str += emit_gemv_layer(**kwargs) + with file.open("w") as f: + f.write(emit_str) + + +def emit_gemv_layer(name="gemv", **kwargs): + mat_A = kwargs["A"] + vec_B = kwargs["B"] + result = kwargs["result"] + + m = kwargs["M"] + n = kwargs["N"] + tot_nz = kwargs["tot_nz"] + + layer_str = "" + layer_str += '#include "layer.h"\n\n' + layer_str += f"const 
gemv_layer {name}_l = {{\n" + layer_str += f"\t.M = {m},\n" + layer_str += f"\t.N = {n},\n" + layer_str += f'\t.dtype = FP{kwargs["prec"]}' + layer_str += "};\n\n" + + # Export the total non-zeros directly so the kernel can use it + layer_str += f"const uint32_t tot_nz_dram = {tot_nz};\n\n" + + ctypes = {"64": "double", "32": "float", "16": "__fp16", "8": "char"} + + dtype = ctypes[str(kwargs["prec"])] + if dtype != "char": + layer_str += ( + f'static {dtype} {name}_mat_dram[{m}*{n}] __attribute__((section(".data"))) = ' + + array_to_cstr(mat_A) + + ";\n\n" + ) + layer_str += ( + f'static {dtype} {name}_vec_dram[{n}] __attribute__((section(".data"))) = ' + + array_to_cstr(vec_B) + + ";\n\n" + ) + layer_str += ( + f'static {dtype} {name}_result[{m}] __attribute__((section(".data"))) = ' + + array_to_cstr(result) + + ";\n" + ) + # Assuming you have variables like M (output size) and tot_nz (number of non-zeros) + layer_str += '// Auto-generated buffers for Cache Mode\n' + layer_str += f'static uint16_t dense_idx_dram[{tot_nz}] __attribute__((section(".data"))) = {{0}};\n' + layer_str += f'static {dtype} dense_vec_dram[{tot_nz}] __attribute__((section(".data"))) = {{0.0}};\n' + layer_str += f'static {dtype} result_buf_dram[{m}] __attribute__((section(".data"))) = {{0.0}};\n' + else: + layer_str += ( + f"static {dtype} {name}_mat_dram[{m}*{n}] = " + + array_to_cstr(kwargs["bits_A"], fmt="char") + + ";\n\n\n" + ) + layer_str += ( + f"static {dtype} {name}_vec_dram[{n}] = " + + array_to_cstr(kwargs["bits_B"], fmt="char") + + ";\n\n\n" + ) + layer_str += ( + f"static {dtype} {name}_result[{m}] = " + + array_to_cstr(kwargs["result"], fmt="char") + + ";\n\n\n" + ) + layer_str += ( + f"static {dtype} {name}_result_buf_dram[{m}] =" + + array_to_cstr(kwargs["result"], fmt="char") + + ";\n\n\n" + ) + + return layer_str + + +def rand_data_generator(shape, prec, alt=False): + if prec == 64: + return torch.randn(shape, requires_grad=False, dtype=torch.float64), {} + elif prec == 
32: + return torch.randn(shape, requires_grad=False, dtype=torch.float32), {} + elif prec == 16: + if alt: + # Generate in FP32, cast to BF16 + return torch.randn(shape, requires_grad=False, dtype=torch.float32).to(torch.bfloat16), {} + else: + # Generate in FP32, cast to FP16 + return torch.randn(shape, requires_grad=False, dtype=torch.float32).to(torch.float16), {} + elif prec == 8: + sign = torch.randint( + 0, 2, shape, requires_grad=False, dtype=torch.uint8 + ) # -1 or 1 + exponent = torch.randint( + 0, 16, shape, requires_grad=False, dtype=torch.uint8 + ) # < 0b01111 + mantissa = torch.randint( + 0, 4, shape, requires_grad=False, dtype=torch.uint8 + ) # can be arbitrary + bits = {"sign": sign, "exponent": exponent, "mantissa": mantissa} + # TODO: not actually correct + return ((-1.0) ** sign.double()) * (2.0 ** (exponent.double() - 15.0)) * ( + 1.0 + mantissa.double() / (2**2) + ), bits + + +def gemv(a, b): + print(a.shape, b.shape) + # Upcast to float32 for CPU math, then downcast back to the original dtype + return torch.matmul(a.float(), b.float()).to(a.dtype) + + +def main(): + + parser = argparse.ArgumentParser(description="Generate data for kernels") + parser.add_argument( + "-c", + "--cfg", + type=pathlib.Path, + required=True, + help="Select param config file kernel", + ) + parser.add_argument("-v", "--verbose", action="store_true", help="Set verbose") + + args = parser.parse_args() + + global verbose + verbose = args.verbose + + with args.cfg.open() as f: + param = hjson.loads(f.read()) + + # Read tot_nz from the hjson file + tot_nz = param["tot_nz"] + + mat_A, bits_A = rand_data_generator((param["M"], param["N"]), param["prec"]) + vec_B, bits_B = rand_data_generator((param["N"], 1), param["prec"]) + + # --- Sparsity Logic --- + # Randomly select `tot_nz` indices to keep, set the rest to 0.0 + nz_indices = torch.randperm(param["N"])[:tot_nz] + mask = torch.zeros((param["N"], 1), dtype=torch.bool) + mask[nz_indices, 0] = True + + # Temporarily upcast 
to float32 for the masking math, then cast back + vec_B = (vec_B.float() * mask).to(vec_B.dtype) + + # Also zero out the raw bits if using 8-bit precision to maintain parity + if bool(bits_B): + for k in bits_B.keys(): + # Apply the mask, ensuring the shape matches the 1D bits array format + bits_B[k] = bits_B[k] * mask.squeeze().byte() + # ---------------------- + + # Calculate result using the now-sparse vector + result = gemv(mat_A, vec_B) + + # Store A in col major format + mat_A = mat_A.T + + kwargs = { + "A": mat_A, + "B": vec_B, + "result": result, + "M": param["M"], + "N": param["N"], + "tot_nz": tot_nz, # Pass the new parameter down + "prec": param["prec"], + "expand": param["expand"], + "bits_A": bits_A, + "bits_B": bits_B, + } + + emit_header_file("gemv", **kwargs) + + +if __name__ == "__main__": + main() diff --git a/sw/spatzBenchmarks/sa-gemv/script/sa_gemv_128_4096_512_16.json b/sw/spatzBenchmarks/sa-gemv/script/sa_gemv_128_4096_512_16.json new file mode 100644 index 00000000..e14c70e4 --- /dev/null +++ b/sw/spatzBenchmarks/sa-gemv/script/sa_gemv_128_4096_512_16.json @@ -0,0 +1,16 @@ +// Copyright 2020 ETH Zurich and University of Bologna. +// Solderpad Hardware License, Version 0.51, see LICENSE for details. +// SPDX-License-Identifier: SHL-0.51 + +// Parameters for a GEMV + +{ + kernel: "GEMV" + M: 128, + N: 4096, + tot_nz: 512 + transpose_A: false, + transpose_B: false, + prec: 16, + expand: 0 +} diff --git a/sw/spatzBenchmarks/sa-gemv/script/sa_gemv_256_128_16_64.json b/sw/spatzBenchmarks/sa-gemv/script/sa_gemv_256_128_16_64.json new file mode 100644 index 00000000..8942de89 --- /dev/null +++ b/sw/spatzBenchmarks/sa-gemv/script/sa_gemv_256_128_16_64.json @@ -0,0 +1,16 @@ +// Copyright 2020 ETH Zurich and University of Bologna. +// Solderpad Hardware License, Version 0.51, see LICENSE for details. 
+// SPDX-License-Identifier: SHL-0.51 + +// Parameters for a GEMV + +{ + kernel: "GEMV" + M: 256, + N: 128, + tot_nz: 16 + transpose_A: false, + transpose_B: false, + prec: 64, + expand: 0 +} diff --git a/sw/spatzBenchmarks/sdotp-bp-fmatmul/script/gen_data.py b/sw/spatzBenchmarks/sdotp-bp-fmatmul/script/gen_data.py index 9918713e..4da0e20d 100755 --- a/sw/spatzBenchmarks/sdotp-bp-fmatmul/script/gen_data.py +++ b/sw/spatzBenchmarks/sdotp-bp-fmatmul/script/gen_data.py @@ -26,6 +26,9 @@ def array_to_cstr(a, fmt=float): if isinstance(a, np.ndarray): a = a.flat if isinstance(a, torch.Tensor): + # Fallback to float32 if numpy struggles with bfloat16, otherwise leave alone + if a.dtype == torch.bfloat16: + a = a.float() a = a.cpu().numpy().flat for el in a: out += "{}, ".format(el) @@ -96,7 +99,7 @@ def emit_conv2d_layer(name="conv2d", **kwargs): layer_str += f'static double {name}_result[{oh}][{ow}][{co}] __attribute__((section(".data")));\n\n' layer_str += ( f"static double {name}_checksum[{oh}][{ow}] = " - + array_to_cstr(torch.sum(ofmap, dim=-1)) + + array_to_cstr(torch.sum(ofmap.float(), dim=-1)) + ";\n\n\n" ) layer_str += ( @@ -167,23 +170,23 @@ def emit_GEMM_layer(name="gemm", **kwargs): + ";\n\n\n" ) layer_str += ( - f"static const {dtype} {name}_checksum[{m}] = " - + array_to_cstr(torch.sum(result, dim=-1)) + f"static const {dtype} {name}_result[{m}*{n}] = " + + array_to_cstr(result) + ";\n\n\n" ) else: layer_str += ( - f"static {dtype} {name}_A_dram [{m}*{k}] = " + f"static {dtype} {name}_A_dram [{m}][{k}] = " + array_to_cstr(kwargs["bits_A"], fmt="char") + ";\n\n\n" ) layer_str += ( - f"static {dtype} {name}_B_dram [{k}*{n}] = " + f"static {dtype} {name}_B_dram [{k}][{n}] = " + array_to_cstr(kwargs["bits_B"], fmt="char") + ";\n\n\n" ) layer_str += ( - f"static {dtype} {name}_C_dram [{m}*{n}] = " + f"static {dtype} {name}_C_dram [{m}][{n}] = " + array_to_cstr(kwargs["bits_C"], fmt="char") + ";\n\n\n" ) @@ -215,7 +218,7 @@ def 
emit_batchnorm_layer(name="batchnorm", **kwargs): layer_str += f'static double {name}_result[{oh}][{ow}][{co}] __attribute__((section(".data")));\n\n' layer_str += ( f"static double {name}_checksum[{oh}][{ow}] = " - + array_to_cstr(torch.sum(ofmap, dim=-1)) + + array_to_cstr(torch.sum(ofmap.float(), dim=-1)) + ";\n\n\n" ) layer_str += ( @@ -263,7 +266,7 @@ def emit_maxpool_layer(name="maxpool", **kwargs): layer_str += f'static double {name}_result[{oh}][{ow}][{co}] __attribute__((section(".data")));\n\n' layer_str += ( f"static double {name}_checksum[{oh}][{ow}] = " - + array_to_cstr(torch.sum(ofmap, dim=-1)) + + array_to_cstr(torch.sum(ofmap.float(), dim=-1)) + ";\n\n\n" ) layer_str += ( @@ -363,14 +366,9 @@ def rand_data_generator(shape, prec, alt=False): return torch.randn(shape, requires_grad=False, dtype=torch.float32), {} elif prec == 16: if alt: - return torch.randn(shape, requires_grad=False, dtype=torch.bfloat16), {} + return torch.randn(shape, requires_grad=False, dtype=torch.float32).to(torch.bfloat16), {} else: - return ( - torch.randn( - shape, requires_grad=False, dtype=torch.float16, device=device - ), - {}, - ) + return torch.randn(shape, requires_grad=False, dtype=torch.float32, device=device).to(torch.float16), {} elif prec == 8: sign = torch.randint( 0, 2, shape, requires_grad=False, dtype=torch.uint8 @@ -389,6 +387,10 @@ def rand_data_generator(shape, prec, alt=False): def conv2d(ifmap, weights, padding=1, stride=1): + orig_dtype = ifmap.dtype + if orig_dtype in [torch.float16, torch.bfloat16]: + ifmap, weights = ifmap.float(), weights.float() + n, ci, ih, iw = ifmap.shape co, _, fh, fw = weights.shape @@ -399,34 +401,48 @@ def conv2d(ifmap, weights, padding=1, stride=1): ) ofmap = conv2d(ifmap) - return ofmap + return ofmap.to(orig_dtype) def max_pooling(ifmap, kernel): + orig_dtype = ifmap.dtype + if orig_dtype in [torch.float16, torch.bfloat16]: + ifmap = ifmap.float() + n, ci, ih, iw = ifmap.shape max_pool = nn.MaxPool2d(kernel_size=kernel) 
ofmap = max_pool(ifmap) - return ofmap + return ofmap.to(orig_dtype) def batchnorm(ifmap): + orig_dtype = ifmap.dtype + if orig_dtype in [torch.float16, torch.bfloat16]: + ifmap = ifmap.float() + n, ci, ih, iw = ifmap.shape bn = torch.nn.BatchNorm2d(ci) bn.weight.requires_grad = False bn.bias.requires_grad = False - running_mean = torch.randn_like(bn.running_mean, requires_grad=False) - running_var = torch.rand_like(bn.running_var, requires_grad=False) + + running_mean = torch.randn(bn.running_mean.shape, requires_grad=False, dtype=torch.float32).to(bn.running_mean.dtype) + running_var = torch.rand(bn.running_var.shape, requires_grad=False, dtype=torch.float32).to(bn.running_var.dtype) + gamma = bn.weight / torch.sqrt(running_var + bn.eps) beta = bn.bias - running_mean * bn.weight / torch.sqrt(running_var + bn.eps) ofmap = ifmap * gamma.unsqueeze(-1).unsqueeze(-1) + beta.unsqueeze(-1).unsqueeze(-1) - return ofmap, gamma, beta + return ofmap.to(orig_dtype), gamma.to(orig_dtype), beta.to(orig_dtype) def fused_conv( ifmap, weights, bn_k, bn_l, padding, stride, bn, relu, accumulate, depthwise ): + orig_dtype = ifmap.dtype + is_half = orig_dtype in [torch.float16, torch.bfloat16] + if is_half: + ifmap, weights, bn_k, bn_l = ifmap.float(), weights.float(), bn_k.float(), bn_l.float() ih, iw, ci = ifmap.shape if not depthwise: @@ -459,7 +475,7 @@ def fused_conv( co, ) if accumulate: - ofmap_before = torch.randn_like(ofmap, requires_grad=False) + ofmap_before = torch.randn(ofmap.shape, requires_grad=False, dtype=torch.float32).to(ofmap.dtype) else: ofmap_before = torch.zeros_like(ofmap, requires_grad=False) @@ -499,6 +515,11 @@ def fused_conv( if relu: ofmap = torch.nn.functional.relu(ofmap) + if is_half: + ofmap = ofmap.to(orig_dtype) + ofmap_before = ofmap_before.to(orig_dtype) + ifmap_padded = ifmap_padded.to(orig_dtype) + return ofmap, ofmap_before, ifmap_padded @@ -538,16 +559,16 @@ def main(): param["input_dim"]["height"], param["input_dim"]["width"], 
requires_grad=False, - dtype=dtype, - ) + dtype=torch.float32, + ).to(dtype) weights = torch.randn( param["channels"]["out"], param["channels"]["in"], param["filter"]["height"], param["filter"]["width"], requires_grad=False, - dtype=dtype, - ) + dtype=torch.float32, + ).to(dtype) ofmap = conv2d( ifmap, @@ -568,7 +589,10 @@ def main(): mat_B, bits_B = rand_data_generator((param["K"], param["N"]), param["prec"]) mat_C, bits_C = rand_data_generator((param["M"], param["N"]), param["prec"]) - result = torch.matmul(mat_A, mat_B) + if mat_A.dtype in [torch.float16, torch.bfloat16]: + result = torch.matmul(mat_A.float(), mat_B.float()).to(mat_A.dtype) + else: + result = torch.matmul(mat_A, mat_B) if param["transpose_A"]: mat_A = mat_A.T @@ -602,8 +626,8 @@ def main(): param["input_dim"]["height"], param["input_dim"]["width"], requires_grad=False, - dtype=dtype, - ) + dtype=torch.float32, + ).to(dtype) ofmap, gamma, beta = batchnorm(ifmap) @@ -621,8 +645,8 @@ def main(): param["input_dim"]["height"], param["input_dim"]["width"], requires_grad=False, - dtype=dtype, - ) + dtype=torch.float32, + ).to(dtype) ofmap = max_pooling(ifmap, param["kernel_size"]) @@ -639,8 +663,8 @@ def main(): param["dim_in_x"], param["ch_in"], requires_grad=False, - dtype=dtype, - ) + dtype=torch.float32, + ).to(dtype) if not param["depthwise"]: kernel = torch.randn( param["ch_out"], @@ -648,31 +672,31 @@ def main(): param["dim_kernel_x"], param["ch_in"], requires_grad=False, - dtype=dtype, - ) + dtype=torch.float32, + ).to(dtype) else: kernel = torch.randn( param["dim_kernel_y"], param["dim_kernel_x"], param["ch_in"], requires_grad=False, - dtype=dtype, - ) + dtype=torch.float32, + ).to(dtype) - bn_k = torch.randn(param["ch_out"], requires_grad=False) - bn_l = torch.randn(param["ch_out"], requires_grad=False) + bn_k = torch.randn(param["ch_out"], requires_grad=False, dtype=torch.float32).to(dtype) + bn_l = torch.randn(param["ch_out"], requires_grad=False, dtype=torch.float32).to(dtype) ofmap, 
ofmap_before, ifmap_padded = fused_conv( ifmap, kernel, bn_k, bn_l, - param["padding"], - param["stride"], - param["flags"]["flag_batch_norm"], - param["flags"]["flag_relu"], - not param["flags"]["flag_y_accumulate_start"], - param["depthwise"], + padding=param["padding"], + stride=param["stride"], + bn=param["flags"]["flag_batch_norm"], + relu=param["flags"]["flag_relu"], + accumulate=not param["flags"]["flag_y_accumulate_start"], + depthwise=param["depthwise"], ) if param["chw_layer"]: diff --git a/sw/spatzBenchmarks/sdotp-hp-fmatmul/script/gen_data.py b/sw/spatzBenchmarks/sdotp-hp-fmatmul/script/gen_data.py index 0cd18994..4da0e20d 100755 --- a/sw/spatzBenchmarks/sdotp-hp-fmatmul/script/gen_data.py +++ b/sw/spatzBenchmarks/sdotp-hp-fmatmul/script/gen_data.py @@ -26,6 +26,9 @@ def array_to_cstr(a, fmt=float): if isinstance(a, np.ndarray): a = a.flat if isinstance(a, torch.Tensor): + # Fallback to float32 if numpy struggles with bfloat16, otherwise leave alone + if a.dtype == torch.bfloat16: + a = a.float() a = a.cpu().numpy().flat for el in a: out += "{}, ".format(el) @@ -96,7 +99,7 @@ def emit_conv2d_layer(name="conv2d", **kwargs): layer_str += f'static double {name}_result[{oh}][{ow}][{co}] __attribute__((section(".data")));\n\n' layer_str += ( f"static double {name}_checksum[{oh}][{ow}] = " - + array_to_cstr(torch.sum(ofmap, dim=-1)) + + array_to_cstr(torch.sum(ofmap.float(), dim=-1)) + ";\n\n\n" ) layer_str += ( @@ -215,7 +218,7 @@ def emit_batchnorm_layer(name="batchnorm", **kwargs): layer_str += f'static double {name}_result[{oh}][{ow}][{co}] __attribute__((section(".data")));\n\n' layer_str += ( f"static double {name}_checksum[{oh}][{ow}] = " - + array_to_cstr(torch.sum(ofmap, dim=-1)) + + array_to_cstr(torch.sum(ofmap.float(), dim=-1)) + ";\n\n\n" ) layer_str += ( @@ -263,7 +266,7 @@ def emit_maxpool_layer(name="maxpool", **kwargs): layer_str += f'static double {name}_result[{oh}][{ow}][{co}] __attribute__((section(".data")));\n\n' layer_str += ( f"static 
double {name}_checksum[{oh}][{ow}] = " - + array_to_cstr(torch.sum(ofmap, dim=-1)) + + array_to_cstr(torch.sum(ofmap.float(), dim=-1)) + ";\n\n\n" ) layer_str += ( @@ -363,14 +366,9 @@ def rand_data_generator(shape, prec, alt=False): return torch.randn(shape, requires_grad=False, dtype=torch.float32), {} elif prec == 16: if alt: - return torch.randn(shape, requires_grad=False, dtype=torch.bfloat16), {} + return torch.randn(shape, requires_grad=False, dtype=torch.float32).to(torch.bfloat16), {} else: - return ( - torch.randn( - shape, requires_grad=False, dtype=torch.float16, device=device - ), - {}, - ) + return torch.randn(shape, requires_grad=False, dtype=torch.float32, device=device).to(torch.float16), {} elif prec == 8: sign = torch.randint( 0, 2, shape, requires_grad=False, dtype=torch.uint8 @@ -389,6 +387,10 @@ def rand_data_generator(shape, prec, alt=False): def conv2d(ifmap, weights, padding=1, stride=1): + orig_dtype = ifmap.dtype + if orig_dtype in [torch.float16, torch.bfloat16]: + ifmap, weights = ifmap.float(), weights.float() + n, ci, ih, iw = ifmap.shape co, _, fh, fw = weights.shape @@ -399,34 +401,48 @@ def conv2d(ifmap, weights, padding=1, stride=1): ) ofmap = conv2d(ifmap) - return ofmap + return ofmap.to(orig_dtype) def max_pooling(ifmap, kernel): + orig_dtype = ifmap.dtype + if orig_dtype in [torch.float16, torch.bfloat16]: + ifmap = ifmap.float() + n, ci, ih, iw = ifmap.shape max_pool = nn.MaxPool2d(kernel_size=kernel) ofmap = max_pool(ifmap) - return ofmap + return ofmap.to(orig_dtype) def batchnorm(ifmap): + orig_dtype = ifmap.dtype + if orig_dtype in [torch.float16, torch.bfloat16]: + ifmap = ifmap.float() + n, ci, ih, iw = ifmap.shape bn = torch.nn.BatchNorm2d(ci) bn.weight.requires_grad = False bn.bias.requires_grad = False - running_mean = torch.randn_like(bn.running_mean, requires_grad=False) - running_var = torch.rand_like(bn.running_var, requires_grad=False) + + running_mean = torch.randn(bn.running_mean.shape, requires_grad=False, 
dtype=torch.float32).to(bn.running_mean.dtype) + running_var = torch.rand(bn.running_var.shape, requires_grad=False, dtype=torch.float32).to(bn.running_var.dtype) + gamma = bn.weight / torch.sqrt(running_var + bn.eps) beta = bn.bias - running_mean * bn.weight / torch.sqrt(running_var + bn.eps) ofmap = ifmap * gamma.unsqueeze(-1).unsqueeze(-1) + beta.unsqueeze(-1).unsqueeze(-1) - return ofmap, gamma, beta + return ofmap.to(orig_dtype), gamma.to(orig_dtype), beta.to(orig_dtype) def fused_conv( ifmap, weights, bn_k, bn_l, padding, stride, bn, relu, accumulate, depthwise ): + orig_dtype = ifmap.dtype + is_half = orig_dtype in [torch.float16, torch.bfloat16] + if is_half: + ifmap, weights, bn_k, bn_l = ifmap.float(), weights.float(), bn_k.float(), bn_l.float() ih, iw, ci = ifmap.shape if not depthwise: @@ -459,7 +475,7 @@ def fused_conv( co, ) if accumulate: - ofmap_before = torch.randn_like(ofmap, requires_grad=False) + ofmap_before = torch.randn(ofmap.shape, requires_grad=False, dtype=torch.float32).to(ofmap.dtype) else: ofmap_before = torch.zeros_like(ofmap, requires_grad=False) @@ -499,6 +515,11 @@ def fused_conv( if relu: ofmap = torch.nn.functional.relu(ofmap) + if is_half: + ofmap = ofmap.to(orig_dtype) + ofmap_before = ofmap_before.to(orig_dtype) + ifmap_padded = ifmap_padded.to(orig_dtype) + return ofmap, ofmap_before, ifmap_padded @@ -538,16 +559,16 @@ def main(): param["input_dim"]["height"], param["input_dim"]["width"], requires_grad=False, - dtype=dtype, - ) + dtype=torch.float32, + ).to(dtype) weights = torch.randn( param["channels"]["out"], param["channels"]["in"], param["filter"]["height"], param["filter"]["width"], requires_grad=False, - dtype=dtype, - ) + dtype=torch.float32, + ).to(dtype) ofmap = conv2d( ifmap, @@ -568,7 +589,10 @@ def main(): mat_B, bits_B = rand_data_generator((param["K"], param["N"]), param["prec"]) mat_C, bits_C = rand_data_generator((param["M"], param["N"]), param["prec"]) - result = torch.matmul(mat_A, mat_B) + if mat_A.dtype in 
[torch.float16, torch.bfloat16]: + result = torch.matmul(mat_A.float(), mat_B.float()).to(mat_A.dtype) + else: + result = torch.matmul(mat_A, mat_B) if param["transpose_A"]: mat_A = mat_A.T @@ -602,8 +626,8 @@ def main(): param["input_dim"]["height"], param["input_dim"]["width"], requires_grad=False, - dtype=dtype, - ) + dtype=torch.float32, + ).to(dtype) ofmap, gamma, beta = batchnorm(ifmap) @@ -621,8 +645,8 @@ def main(): param["input_dim"]["height"], param["input_dim"]["width"], requires_grad=False, - dtype=dtype, - ) + dtype=torch.float32, + ).to(dtype) ofmap = max_pooling(ifmap, param["kernel_size"]) @@ -639,8 +663,8 @@ def main(): param["dim_in_x"], param["ch_in"], requires_grad=False, - dtype=dtype, - ) + dtype=torch.float32, + ).to(dtype) if not param["depthwise"]: kernel = torch.randn( param["ch_out"], @@ -648,31 +672,31 @@ def main(): param["dim_kernel_x"], param["ch_in"], requires_grad=False, - dtype=dtype, - ) + dtype=torch.float32, + ).to(dtype) else: kernel = torch.randn( param["dim_kernel_y"], param["dim_kernel_x"], param["ch_in"], requires_grad=False, - dtype=dtype, - ) + dtype=torch.float32, + ).to(dtype) - bn_k = torch.randn(param["ch_out"], requires_grad=False) - bn_l = torch.randn(param["ch_out"], requires_grad=False) + bn_k = torch.randn(param["ch_out"], requires_grad=False, dtype=torch.float32).to(dtype) + bn_l = torch.randn(param["ch_out"], requires_grad=False, dtype=torch.float32).to(dtype) ofmap, ofmap_before, ifmap_padded = fused_conv( ifmap, kernel, bn_k, bn_l, - param["padding"], - param["stride"], - param["flags"]["flag_batch_norm"], - param["flags"]["flag_relu"], - not param["flags"]["flag_y_accumulate_start"], - param["depthwise"], + padding=param["padding"], + stride=param["stride"], + bn=param["flags"]["flag_batch_norm"], + relu=param["flags"]["flag_relu"], + accumulate=not param["flags"]["flag_y_accumulate_start"], + depthwise=param["depthwise"], ) if param["chw_layer"]: diff --git 
a/sw/spatzBenchmarks/widening-bp-fmatmul/script/gen_data.py b/sw/spatzBenchmarks/widening-bp-fmatmul/script/gen_data.py index 9918713e..4da0e20d 100755 --- a/sw/spatzBenchmarks/widening-bp-fmatmul/script/gen_data.py +++ b/sw/spatzBenchmarks/widening-bp-fmatmul/script/gen_data.py @@ -26,6 +26,9 @@ def array_to_cstr(a, fmt=float): if isinstance(a, np.ndarray): a = a.flat if isinstance(a, torch.Tensor): + # Fallback to float32 if numpy struggles with bfloat16, otherwise leave alone + if a.dtype == torch.bfloat16: + a = a.float() a = a.cpu().numpy().flat for el in a: out += "{}, ".format(el) @@ -96,7 +99,7 @@ def emit_conv2d_layer(name="conv2d", **kwargs): layer_str += f'static double {name}_result[{oh}][{ow}][{co}] __attribute__((section(".data")));\n\n' layer_str += ( f"static double {name}_checksum[{oh}][{ow}] = " - + array_to_cstr(torch.sum(ofmap, dim=-1)) + + array_to_cstr(torch.sum(ofmap.float(), dim=-1)) + ";\n\n\n" ) layer_str += ( @@ -167,23 +170,23 @@ def emit_GEMM_layer(name="gemm", **kwargs): + ";\n\n\n" ) layer_str += ( - f"static const {dtype} {name}_checksum[{m}] = " - + array_to_cstr(torch.sum(result, dim=-1)) + f"static const {dtype} {name}_result[{m}*{n}] = " + + array_to_cstr(result) + ";\n\n\n" ) else: layer_str += ( - f"static {dtype} {name}_A_dram [{m}*{k}] = " + f"static {dtype} {name}_A_dram [{m}][{k}] = " + array_to_cstr(kwargs["bits_A"], fmt="char") + ";\n\n\n" ) layer_str += ( - f"static {dtype} {name}_B_dram [{k}*{n}] = " + f"static {dtype} {name}_B_dram [{k}][{n}] = " + array_to_cstr(kwargs["bits_B"], fmt="char") + ";\n\n\n" ) layer_str += ( - f"static {dtype} {name}_C_dram [{m}*{n}] = " + f"static {dtype} {name}_C_dram [{m}][{n}] = " + array_to_cstr(kwargs["bits_C"], fmt="char") + ";\n\n\n" ) @@ -215,7 +218,7 @@ def emit_batchnorm_layer(name="batchnorm", **kwargs): layer_str += f'static double {name}_result[{oh}][{ow}][{co}] __attribute__((section(".data")));\n\n' layer_str += ( f"static double {name}_checksum[{oh}][{ow}] = " - + 
array_to_cstr(torch.sum(ofmap, dim=-1)) + + array_to_cstr(torch.sum(ofmap.float(), dim=-1)) + ";\n\n\n" ) layer_str += ( @@ -263,7 +266,7 @@ def emit_maxpool_layer(name="maxpool", **kwargs): layer_str += f'static double {name}_result[{oh}][{ow}][{co}] __attribute__((section(".data")));\n\n' layer_str += ( f"static double {name}_checksum[{oh}][{ow}] = " - + array_to_cstr(torch.sum(ofmap, dim=-1)) + + array_to_cstr(torch.sum(ofmap.float(), dim=-1)) + ";\n\n\n" ) layer_str += ( @@ -363,14 +366,9 @@ def rand_data_generator(shape, prec, alt=False): return torch.randn(shape, requires_grad=False, dtype=torch.float32), {} elif prec == 16: if alt: - return torch.randn(shape, requires_grad=False, dtype=torch.bfloat16), {} + return torch.randn(shape, requires_grad=False, dtype=torch.float32).to(torch.bfloat16), {} else: - return ( - torch.randn( - shape, requires_grad=False, dtype=torch.float16, device=device - ), - {}, - ) + return torch.randn(shape, requires_grad=False, dtype=torch.float32, device=device).to(torch.float16), {} elif prec == 8: sign = torch.randint( 0, 2, shape, requires_grad=False, dtype=torch.uint8 @@ -389,6 +387,10 @@ def rand_data_generator(shape, prec, alt=False): def conv2d(ifmap, weights, padding=1, stride=1): + orig_dtype = ifmap.dtype + if orig_dtype in [torch.float16, torch.bfloat16]: + ifmap, weights = ifmap.float(), weights.float() + n, ci, ih, iw = ifmap.shape co, _, fh, fw = weights.shape @@ -399,34 +401,48 @@ def conv2d(ifmap, weights, padding=1, stride=1): ) ofmap = conv2d(ifmap) - return ofmap + return ofmap.to(orig_dtype) def max_pooling(ifmap, kernel): + orig_dtype = ifmap.dtype + if orig_dtype in [torch.float16, torch.bfloat16]: + ifmap = ifmap.float() + n, ci, ih, iw = ifmap.shape max_pool = nn.MaxPool2d(kernel_size=kernel) ofmap = max_pool(ifmap) - return ofmap + return ofmap.to(orig_dtype) def batchnorm(ifmap): + orig_dtype = ifmap.dtype + if orig_dtype in [torch.float16, torch.bfloat16]: + ifmap = ifmap.float() + n, ci, ih, iw = 
ifmap.shape bn = torch.nn.BatchNorm2d(ci) bn.weight.requires_grad = False bn.bias.requires_grad = False - running_mean = torch.randn_like(bn.running_mean, requires_grad=False) - running_var = torch.rand_like(bn.running_var, requires_grad=False) + + running_mean = torch.randn(bn.running_mean.shape, requires_grad=False, dtype=torch.float32).to(bn.running_mean.dtype) + running_var = torch.rand(bn.running_var.shape, requires_grad=False, dtype=torch.float32).to(bn.running_var.dtype) + gamma = bn.weight / torch.sqrt(running_var + bn.eps) beta = bn.bias - running_mean * bn.weight / torch.sqrt(running_var + bn.eps) ofmap = ifmap * gamma.unsqueeze(-1).unsqueeze(-1) + beta.unsqueeze(-1).unsqueeze(-1) - return ofmap, gamma, beta + return ofmap.to(orig_dtype), gamma.to(orig_dtype), beta.to(orig_dtype) def fused_conv( ifmap, weights, bn_k, bn_l, padding, stride, bn, relu, accumulate, depthwise ): + orig_dtype = ifmap.dtype + is_half = orig_dtype in [torch.float16, torch.bfloat16] + if is_half: + ifmap, weights, bn_k, bn_l = ifmap.float(), weights.float(), bn_k.float(), bn_l.float() ih, iw, ci = ifmap.shape if not depthwise: @@ -459,7 +475,7 @@ def fused_conv( co, ) if accumulate: - ofmap_before = torch.randn_like(ofmap, requires_grad=False) + ofmap_before = torch.randn(ofmap.shape, requires_grad=False, dtype=torch.float32).to(ofmap.dtype) else: ofmap_before = torch.zeros_like(ofmap, requires_grad=False) @@ -499,6 +515,11 @@ def fused_conv( if relu: ofmap = torch.nn.functional.relu(ofmap) + if is_half: + ofmap = ofmap.to(orig_dtype) + ofmap_before = ofmap_before.to(orig_dtype) + ifmap_padded = ifmap_padded.to(orig_dtype) + return ofmap, ofmap_before, ifmap_padded @@ -538,16 +559,16 @@ def main(): param["input_dim"]["height"], param["input_dim"]["width"], requires_grad=False, - dtype=dtype, - ) + dtype=torch.float32, + ).to(dtype) weights = torch.randn( param["channels"]["out"], param["channels"]["in"], param["filter"]["height"], param["filter"]["width"], requires_grad=False, - 
dtype=dtype, - ) + dtype=torch.float32, + ).to(dtype) ofmap = conv2d( ifmap, @@ -568,7 +589,10 @@ def main(): mat_B, bits_B = rand_data_generator((param["K"], param["N"]), param["prec"]) mat_C, bits_C = rand_data_generator((param["M"], param["N"]), param["prec"]) - result = torch.matmul(mat_A, mat_B) + if mat_A.dtype in [torch.float16, torch.bfloat16]: + result = torch.matmul(mat_A.float(), mat_B.float()).to(mat_A.dtype) + else: + result = torch.matmul(mat_A, mat_B) if param["transpose_A"]: mat_A = mat_A.T @@ -602,8 +626,8 @@ def main(): param["input_dim"]["height"], param["input_dim"]["width"], requires_grad=False, - dtype=dtype, - ) + dtype=torch.float32, + ).to(dtype) ofmap, gamma, beta = batchnorm(ifmap) @@ -621,8 +645,8 @@ def main(): param["input_dim"]["height"], param["input_dim"]["width"], requires_grad=False, - dtype=dtype, - ) + dtype=torch.float32, + ).to(dtype) ofmap = max_pooling(ifmap, param["kernel_size"]) @@ -639,8 +663,8 @@ def main(): param["dim_in_x"], param["ch_in"], requires_grad=False, - dtype=dtype, - ) + dtype=torch.float32, + ).to(dtype) if not param["depthwise"]: kernel = torch.randn( param["ch_out"], @@ -648,31 +672,31 @@ def main(): param["dim_kernel_x"], param["ch_in"], requires_grad=False, - dtype=dtype, - ) + dtype=torch.float32, + ).to(dtype) else: kernel = torch.randn( param["dim_kernel_y"], param["dim_kernel_x"], param["ch_in"], requires_grad=False, - dtype=dtype, - ) + dtype=torch.float32, + ).to(dtype) - bn_k = torch.randn(param["ch_out"], requires_grad=False) - bn_l = torch.randn(param["ch_out"], requires_grad=False) + bn_k = torch.randn(param["ch_out"], requires_grad=False, dtype=torch.float32).to(dtype) + bn_l = torch.randn(param["ch_out"], requires_grad=False, dtype=torch.float32).to(dtype) ofmap, ofmap_before, ifmap_padded = fused_conv( ifmap, kernel, bn_k, bn_l, - param["padding"], - param["stride"], - param["flags"]["flag_batch_norm"], - param["flags"]["flag_relu"], - not param["flags"]["flag_y_accumulate_start"], - 
param["depthwise"], + padding=param["padding"], + stride=param["stride"], + bn=param["flags"]["flag_batch_norm"], + relu=param["flags"]["flag_relu"], + accumulate=not param["flags"]["flag_y_accumulate_start"], + depthwise=param["depthwise"], ) if param["chw_layer"]: diff --git a/sw/spatzBenchmarks/widening-hp-fmatmul/script/gen_data.py b/sw/spatzBenchmarks/widening-hp-fmatmul/script/gen_data.py index 0cd18994..4da0e20d 100755 --- a/sw/spatzBenchmarks/widening-hp-fmatmul/script/gen_data.py +++ b/sw/spatzBenchmarks/widening-hp-fmatmul/script/gen_data.py @@ -26,6 +26,9 @@ def array_to_cstr(a, fmt=float): if isinstance(a, np.ndarray): a = a.flat if isinstance(a, torch.Tensor): + # Fallback to float32 if numpy struggles with bfloat16, otherwise leave alone + if a.dtype == torch.bfloat16: + a = a.float() a = a.cpu().numpy().flat for el in a: out += "{}, ".format(el) @@ -96,7 +99,7 @@ def emit_conv2d_layer(name="conv2d", **kwargs): layer_str += f'static double {name}_result[{oh}][{ow}][{co}] __attribute__((section(".data")));\n\n' layer_str += ( f"static double {name}_checksum[{oh}][{ow}] = " - + array_to_cstr(torch.sum(ofmap, dim=-1)) + + array_to_cstr(torch.sum(ofmap.float(), dim=-1)) + ";\n\n\n" ) layer_str += ( @@ -215,7 +218,7 @@ def emit_batchnorm_layer(name="batchnorm", **kwargs): layer_str += f'static double {name}_result[{oh}][{ow}][{co}] __attribute__((section(".data")));\n\n' layer_str += ( f"static double {name}_checksum[{oh}][{ow}] = " - + array_to_cstr(torch.sum(ofmap, dim=-1)) + + array_to_cstr(torch.sum(ofmap.float(), dim=-1)) + ";\n\n\n" ) layer_str += ( @@ -263,7 +266,7 @@ def emit_maxpool_layer(name="maxpool", **kwargs): layer_str += f'static double {name}_result[{oh}][{ow}][{co}] __attribute__((section(".data")));\n\n' layer_str += ( f"static double {name}_checksum[{oh}][{ow}] = " - + array_to_cstr(torch.sum(ofmap, dim=-1)) + + array_to_cstr(torch.sum(ofmap.float(), dim=-1)) + ";\n\n\n" ) layer_str += ( @@ -363,14 +366,9 @@ def 
rand_data_generator(shape, prec, alt=False): return torch.randn(shape, requires_grad=False, dtype=torch.float32), {} elif prec == 16: if alt: - return torch.randn(shape, requires_grad=False, dtype=torch.bfloat16), {} + return torch.randn(shape, requires_grad=False, dtype=torch.float32).to(torch.bfloat16), {} else: - return ( - torch.randn( - shape, requires_grad=False, dtype=torch.float16, device=device - ), - {}, - ) + return torch.randn(shape, requires_grad=False, dtype=torch.float32, device=device).to(torch.float16), {} elif prec == 8: sign = torch.randint( 0, 2, shape, requires_grad=False, dtype=torch.uint8 @@ -389,6 +387,10 @@ def rand_data_generator(shape, prec, alt=False): def conv2d(ifmap, weights, padding=1, stride=1): + orig_dtype = ifmap.dtype + if orig_dtype in [torch.float16, torch.bfloat16]: + ifmap, weights = ifmap.float(), weights.float() + n, ci, ih, iw = ifmap.shape co, _, fh, fw = weights.shape @@ -399,34 +401,48 @@ def conv2d(ifmap, weights, padding=1, stride=1): ) ofmap = conv2d(ifmap) - return ofmap + return ofmap.to(orig_dtype) def max_pooling(ifmap, kernel): + orig_dtype = ifmap.dtype + if orig_dtype in [torch.float16, torch.bfloat16]: + ifmap = ifmap.float() + n, ci, ih, iw = ifmap.shape max_pool = nn.MaxPool2d(kernel_size=kernel) ofmap = max_pool(ifmap) - return ofmap + return ofmap.to(orig_dtype) def batchnorm(ifmap): + orig_dtype = ifmap.dtype + if orig_dtype in [torch.float16, torch.bfloat16]: + ifmap = ifmap.float() + n, ci, ih, iw = ifmap.shape bn = torch.nn.BatchNorm2d(ci) bn.weight.requires_grad = False bn.bias.requires_grad = False - running_mean = torch.randn_like(bn.running_mean, requires_grad=False) - running_var = torch.rand_like(bn.running_var, requires_grad=False) + + running_mean = torch.randn(bn.running_mean.shape, requires_grad=False, dtype=torch.float32).to(bn.running_mean.dtype) + running_var = torch.rand(bn.running_var.shape, requires_grad=False, dtype=torch.float32).to(bn.running_var.dtype) + gamma = bn.weight / 
torch.sqrt(running_var + bn.eps) beta = bn.bias - running_mean * bn.weight / torch.sqrt(running_var + bn.eps) ofmap = ifmap * gamma.unsqueeze(-1).unsqueeze(-1) + beta.unsqueeze(-1).unsqueeze(-1) - return ofmap, gamma, beta + return ofmap.to(orig_dtype), gamma.to(orig_dtype), beta.to(orig_dtype) def fused_conv( ifmap, weights, bn_k, bn_l, padding, stride, bn, relu, accumulate, depthwise ): + orig_dtype = ifmap.dtype + is_half = orig_dtype in [torch.float16, torch.bfloat16] + if is_half: + ifmap, weights, bn_k, bn_l = ifmap.float(), weights.float(), bn_k.float(), bn_l.float() ih, iw, ci = ifmap.shape if not depthwise: @@ -459,7 +475,7 @@ def fused_conv( co, ) if accumulate: - ofmap_before = torch.randn_like(ofmap, requires_grad=False) + ofmap_before = torch.randn(ofmap.shape, requires_grad=False, dtype=torch.float32).to(ofmap.dtype) else: ofmap_before = torch.zeros_like(ofmap, requires_grad=False) @@ -499,6 +515,11 @@ def fused_conv( if relu: ofmap = torch.nn.functional.relu(ofmap) + if is_half: + ofmap = ofmap.to(orig_dtype) + ofmap_before = ofmap_before.to(orig_dtype) + ifmap_padded = ifmap_padded.to(orig_dtype) + return ofmap, ofmap_before, ifmap_padded @@ -538,16 +559,16 @@ def main(): param["input_dim"]["height"], param["input_dim"]["width"], requires_grad=False, - dtype=dtype, - ) + dtype=torch.float32, + ).to(dtype) weights = torch.randn( param["channels"]["out"], param["channels"]["in"], param["filter"]["height"], param["filter"]["width"], requires_grad=False, - dtype=dtype, - ) + dtype=torch.float32, + ).to(dtype) ofmap = conv2d( ifmap, @@ -568,7 +589,10 @@ def main(): mat_B, bits_B = rand_data_generator((param["K"], param["N"]), param["prec"]) mat_C, bits_C = rand_data_generator((param["M"], param["N"]), param["prec"]) - result = torch.matmul(mat_A, mat_B) + if mat_A.dtype in [torch.float16, torch.bfloat16]: + result = torch.matmul(mat_A.float(), mat_B.float()).to(mat_A.dtype) + else: + result = torch.matmul(mat_A, mat_B) if param["transpose_A"]: mat_A = 
mat_A.T @@ -602,8 +626,8 @@ def main(): param["input_dim"]["height"], param["input_dim"]["width"], requires_grad=False, - dtype=dtype, - ) + dtype=torch.float32, + ).to(dtype) ofmap, gamma, beta = batchnorm(ifmap) @@ -621,8 +645,8 @@ def main(): param["input_dim"]["height"], param["input_dim"]["width"], requires_grad=False, - dtype=dtype, - ) + dtype=torch.float32, + ).to(dtype) ofmap = max_pooling(ifmap, param["kernel_size"]) @@ -639,8 +663,8 @@ def main(): param["dim_in_x"], param["ch_in"], requires_grad=False, - dtype=dtype, - ) + dtype=torch.float32, + ).to(dtype) if not param["depthwise"]: kernel = torch.randn( param["ch_out"], @@ -648,31 +672,31 @@ def main(): param["dim_kernel_x"], param["ch_in"], requires_grad=False, - dtype=dtype, - ) + dtype=torch.float32, + ).to(dtype) else: kernel = torch.randn( param["dim_kernel_y"], param["dim_kernel_x"], param["ch_in"], requires_grad=False, - dtype=dtype, - ) + dtype=torch.float32, + ).to(dtype) - bn_k = torch.randn(param["ch_out"], requires_grad=False) - bn_l = torch.randn(param["ch_out"], requires_grad=False) + bn_k = torch.randn(param["ch_out"], requires_grad=False, dtype=torch.float32).to(dtype) + bn_l = torch.randn(param["ch_out"], requires_grad=False, dtype=torch.float32).to(dtype) ofmap, ofmap_before, ifmap_padded = fused_conv( ifmap, kernel, bn_k, bn_l, - param["padding"], - param["stride"], - param["flags"]["flag_batch_norm"], - param["flags"]["flag_relu"], - not param["flags"]["flag_y_accumulate_start"], - param["depthwise"], + padding=param["padding"], + stride=param["stride"], + bn=param["flags"]["flag_batch_norm"], + relu=param["flags"]["flag_relu"], + accumulate=not param["flags"]["flag_y_accumulate_start"], + depthwise=param["depthwise"], ) if param["chw_layer"]: