From 6a33c48679f38d92659156dff9f2bc35e2aea817 Mon Sep 17 00:00:00 2001 From: Diyou Shen Date: Fri, 24 Apr 2026 10:37:16 +0200 Subject: [PATCH 01/13] [SW] Fix a data generation problem: If only has cpu version of pytorch, the data generation of 16b and 8b will not function. --- hw/system/spatz_cluster/Makefile | 14 ++ sw/spatzBenchmarks/gemv/script/gen_data.py | 21 +- .../hp-fmatmul/script/gen_data.py | 233 +++++++----------- .../sdotp-bp-fmatmul/script/gen_data.py | 204 ++++++--------- .../sdotp-hp-fmatmul/script/gen_data.py | 196 ++++++--------- .../widening-bp-fmatmul/script/gen_data.py | 204 ++++++--------- .../widening-hp-fmatmul/script/gen_data.py | 196 ++++++--------- 7 files changed, 415 insertions(+), 653 deletions(-) diff --git a/hw/system/spatz_cluster/Makefile b/hw/system/spatz_cluster/Makefile index 3fa7bd52..eb87cc4e 100644 --- a/hw/system/spatz_cluster/Makefile +++ b/hw/system/spatz_cluster/Makefile @@ -133,6 +133,20 @@ spatz.gendata: fi \ done +.PHONY: spatz.cleandata +spatz.cleandata: + @for benchmark_dir in $(ROOT)/sw/spatzBenchmarks/*/; do \ + data_dir="$$benchmark_dir/data"; \ + if [ -d "$$data_dir" ]; then \ + data_count=$$(find "$$data_dir" -name 'data*.h' -type f 2>/dev/null | wc -l); \ + if [ "$$data_count" -gt 0 ]; then \ + echo "Cleaning $$data_count data file(s) from $$data_dir"; \ + rm -f "$$data_dir"/data*.h; \ + fi \ + fi \ + done + @echo "All benchmark data cleaned." 
+ ############# # Verilator # ############# diff --git a/sw/spatzBenchmarks/gemv/script/gen_data.py b/sw/spatzBenchmarks/gemv/script/gen_data.py index 177733ef..f33d5154 100644 --- a/sw/spatzBenchmarks/gemv/script/gen_data.py +++ b/sw/spatzBenchmarks/gemv/script/gen_data.py @@ -23,7 +23,8 @@ def array_to_cstr(a, fmt=float): if isinstance(a, np.ndarray): a = list(a.flat) elif isinstance(a, torch.Tensor): - a = a.numpy().flatten().tolist() + # Universal Fix: Cast to float32 before sending to NumPy/C-string to avoid formatting errors + a = a.float().numpy().flatten().tolist() else: a = list(a) for i, el in enumerate(a): @@ -121,9 +122,11 @@ def rand_data_generator(shape, prec, alt=False): return torch.randn(shape, requires_grad=False, dtype=torch.float32), {} elif prec == 16: if alt: - return torch.randn(shape, requires_grad=False, dtype=torch.bfloat16), {} + # Universal Fix: Generate FP32, cast to BF16 + return torch.randn(shape, requires_grad=False, dtype=torch.float32).to(torch.bfloat16), {} else: - return torch.randn(shape, requires_grad=False, dtype=torch.float16), {} + # Universal Fix: Generate FP32, cast to FP16 + return torch.randn(shape, requires_grad=False, dtype=torch.float32).to(torch.float16), {} elif prec == 8: sign = torch.randint( 0, 2, shape, requires_grad=False, dtype=torch.uint8 @@ -142,15 +145,8 @@ def rand_data_generator(shape, prec, alt=False): def gemv(a, b): - # PyTorch doesn't support matmul for float16 on CPU, so convert to float32 - original_dtype = a.dtype - if original_dtype == torch.float16: - a = a.float() - b = b.float() - result = torch.matmul(a, b) - if original_dtype == torch.float16: - result = result.half() - return result + # Universal Fix: One-liner upcast and downcast + return torch.matmul(a.float(), b.float()).to(a.dtype) def main(): @@ -198,3 +194,4 @@ def main(): if __name__ == "__main__": main() + \ No newline at end of file diff --git a/sw/spatzBenchmarks/hp-fmatmul/script/gen_data.py 
b/sw/spatzBenchmarks/hp-fmatmul/script/gen_data.py index 0cd18994..f33c12b1 100755 --- a/sw/spatzBenchmarks/hp-fmatmul/script/gen_data.py +++ b/sw/spatzBenchmarks/hp-fmatmul/script/gen_data.py @@ -42,7 +42,6 @@ def array_to_cstr(a, fmt=float): def emit_header_file(layer_type: str, **kwargs): - file_path = pathlib.Path(__file__).parent.parent / "data" file_path.mkdir(parents=True, exist_ok=True) emit_str = ( @@ -96,7 +95,7 @@ def emit_conv2d_layer(name="conv2d", **kwargs): layer_str += f'static double {name}_result[{oh}][{ow}][{co}] __attribute__((section(".data")));\n\n' layer_str += ( f"static double {name}_checksum[{oh}][{ow}] = " - + array_to_cstr(torch.sum(ofmap, dim=-1)) + + array_to_cstr(torch.sum(ofmap.float(), dim=-1)) # Safe sum for checksum + ";\n\n\n" ) layer_str += ( @@ -119,7 +118,6 @@ def emit_conv2d_layer(name="conv2d", **kwargs): def emit_linear_layer(input, weights, ofmap): - layer_str = "" return layer_str @@ -168,7 +166,7 @@ def emit_GEMM_layer(name="gemm", **kwargs): ) layer_str += ( f"static const {dtype} {name}_result[{m}*{n}] = " - + array_to_cstr(result) + + array_to_cstr(torch.sum(result.float(), dim=-1)) + ";\n\n\n" ) else: @@ -192,7 +190,6 @@ def emit_GEMM_layer(name="gemm", **kwargs): def emit_batchnorm_layer(name="batchnorm", **kwargs): - ifmap = kwargs["ifmap"] ofmap = kwargs["ofmap"] beta = kwargs["beta"] @@ -215,7 +212,7 @@ def emit_batchnorm_layer(name="batchnorm", **kwargs): layer_str += f'static double {name}_result[{oh}][{ow}][{co}] __attribute__((section(".data")));\n\n' layer_str += ( f"static double {name}_checksum[{oh}][{ow}] = " - + array_to_cstr(torch.sum(ofmap, dim=-1)) + + array_to_cstr(torch.sum(ofmap.float(), dim=-1)) + ";\n\n\n" ) layer_str += ( @@ -239,7 +236,6 @@ def emit_batchnorm_layer(name="batchnorm", **kwargs): def emit_maxpool_layer(name="maxpool", **kwargs): - ifmap = kwargs["ifmap"] ofmap = kwargs["ofmap"] k = kwargs["kernel_size"] @@ -263,7 +259,7 @@ def emit_maxpool_layer(name="maxpool", **kwargs): 
layer_str += f'static double {name}_result[{oh}][{ow}][{co}] __attribute__((section(".data")));\n\n' layer_str += ( f"static double {name}_checksum[{oh}][{ow}] = " - + array_to_cstr(torch.sum(ofmap, dim=-1)) + + array_to_cstr(torch.sum(ofmap.float(), dim=-1)) + ";\n\n\n" ) layer_str += ( @@ -281,7 +277,6 @@ def emit_maxpool_layer(name="maxpool", **kwargs): def emit_fusedconv(name="fusedconv", **kwargs): - ifmap = kwargs["ifmap"] kernel = kwargs["kernel"] bn_k = kwargs["bn_k"] @@ -363,70 +358,78 @@ def rand_data_generator(shape, prec, alt=False): return torch.randn(shape, requires_grad=False, dtype=torch.float32), {} elif prec == 16: if alt: - return torch.randn(shape, requires_grad=False, dtype=torch.bfloat16), {} + # Universal Fix: Generate FP32, cast to BF16 + return torch.randn(shape, requires_grad=False, dtype=torch.float32).to(torch.bfloat16), {} else: - return ( - torch.randn( - shape, requires_grad=False, dtype=torch.float16, device=device - ), - {}, - ) + # Universal Fix: Generate FP32, cast to FP16 + return torch.randn(shape, requires_grad=False, dtype=torch.float32, device=device).to(torch.float16), {} elif prec == 8: - sign = torch.randint( - 0, 2, shape, requires_grad=False, dtype=torch.uint8 - ) # -1 or 1 - exponent = torch.randint( - 0, 16, shape, requires_grad=False, dtype=torch.uint8 - ) # < 0b01111 - mantissa = torch.randint( - 0, 4, shape, requires_grad=False, dtype=torch.uint8 - ) # can be arbitrary + sign = torch.randint(0, 2, shape, requires_grad=False, dtype=torch.uint8) + exponent = torch.randint(0, 16, shape, requires_grad=False, dtype=torch.uint8) + mantissa = torch.randint(0, 4, shape, requires_grad=False, dtype=torch.uint8) bits = {"sign": sign, "exponent": exponent, "mantissa": mantissa} - # TODO: not actually correct return ((-1.0) ** sign.double()) * (2.0 ** (exponent.double() - 15.0)) * ( 1.0 + mantissa.double() / (2**2) ), bits def conv2d(ifmap, weights, padding=1, stride=1): + # Universal Fix: Upcast for CPU Math + orig_dtype = 
ifmap.dtype + ifmap, weights = ifmap.float(), weights.float() + n, ci, ih, iw = ifmap.shape co, _, fh, fw = weights.shape conv2d = nn.Conv2d(ci, co, (fh, fw), padding=((fh - 1) // 2, (fw - 1) // 2)) conv2d.weight = nn.Parameter(weights, requires_grad=False) - conv2d.bias = nn.Parameter( - torch.zeros_like(conv2d.bias, dtype=weights.dtype), requires_grad=False - ) + # Ensure bias natively generates in float32 + conv2d.bias = nn.Parameter(torch.zeros_like(conv2d.bias, dtype=torch.float32), requires_grad=False) + ofmap = conv2d(ifmap) - return ofmap + # Universal Fix: Downcast back + return ofmap.to(orig_dtype) def max_pooling(ifmap, kernel): + orig_dtype = ifmap.dtype + ifmap = ifmap.float() + n, ci, ih, iw = ifmap.shape max_pool = nn.MaxPool2d(kernel_size=kernel) ofmap = max_pool(ifmap) - return ofmap + return ofmap.to(orig_dtype) def batchnorm(ifmap): + orig_dtype = ifmap.dtype + ifmap = ifmap.float() + n, ci, ih, iw = ifmap.shape bn = torch.nn.BatchNorm2d(ci) bn.weight.requires_grad = False bn.bias.requires_grad = False - running_mean = torch.randn_like(bn.running_mean, requires_grad=False) - running_var = torch.rand_like(bn.running_var, requires_grad=False) + running_mean = torch.randn_like(bn.running_mean, requires_grad=False, dtype=torch.float32) + running_var = torch.rand_like(bn.running_var, requires_grad=False, dtype=torch.float32) + gamma = bn.weight / torch.sqrt(running_var + bn.eps) beta = bn.bias - running_mean * bn.weight / torch.sqrt(running_var + bn.eps) ofmap = ifmap * gamma.unsqueeze(-1).unsqueeze(-1) + beta.unsqueeze(-1).unsqueeze(-1) - return ofmap, gamma, beta + return ofmap.to(orig_dtype), gamma.to(orig_dtype), beta.to(orig_dtype) def fused_conv( ifmap, weights, bn_k, bn_l, padding, stride, bn, relu, accumulate, depthwise ): + # Universal Fix: Upcast EVERYTHING before starting the math + orig_dtype = ifmap.dtype + ifmap = ifmap.float() + weights = weights.float() + bn_k = bn_k.float() + bn_l = bn_l.float() ih, iw, ci = ifmap.shape if not 
depthwise: @@ -440,14 +443,13 @@ def fused_conv( iw + padding["padding_x_left"] + padding["padding_x_right"], ci, requires_grad=False, - dtype=ifmap.dtype, + dtype=ifmap.dtype, # Safely uses float32 ) ifmap_padded[ padding["padding_y_top"] : ih + padding["padding_y_top"], padding["padding_x_left"] : iw + padding["padding_x_left"], ] = ifmap - # Don't cover undefined behaviour when there are steps without a complete kernel window if (ifmap_padded.shape[0] - (fh - 1) - 1) % stride["stride_y"] != 0: print("Warning: rounding h output dimension") if (ifmap_padded.shape[1] - (fw - 1) - 1) % stride["stride_x"] != 0: @@ -458,16 +460,16 @@ def fused_conv( (ifmap_padded.shape[1] - (fw - 1) - 1) // stride["stride_x"] + 1, co, ) + if accumulate: - ofmap_before = torch.randn_like(ofmap, requires_grad=False) + ofmap_before = torch.randn_like(ofmap, requires_grad=False, dtype=torch.float32) else: - ofmap_before = torch.zeros_like(ofmap, requires_grad=False) + ofmap_before = torch.zeros_like(ofmap, requires_grad=False, dtype=torch.float32) if verbose: print(ifmap.shape, ifmap_padded.shape, ofmap.shape) if depthwise: - # depthwise Conv2d for h in range(0, ifmap_padded.shape[0] - (fh - 1), stride["stride_y"]): for w in range(0, ifmap_padded.shape[1] - (fw - 1), stride["stride_x"]): for c in range(co): @@ -478,7 +480,6 @@ def fused_conv( weights[:, :, c].flatten(), ) else: - # Conv2d for h in range(0, ifmap_padded.shape[0] - (fh - 1), stride["stride_y"]): for w in range(0, ifmap_padded.shape[1] - (fw - 1), stride["stride_x"]): for c in range(co): @@ -491,29 +492,22 @@ def fused_conv( ofmap += ofmap_before - # BatchNorm if bn: ofmap = ofmap * bn_k + bn_l - # ReLU if relu: ofmap = torch.nn.functional.relu(ofmap) - return ofmap, ofmap_before, ifmap_padded + # Universal Fix: Downcast back + return ofmap.to(orig_dtype), ofmap_before.to(orig_dtype), ifmap_padded.to(orig_dtype) def main(): - parser = argparse.ArgumentParser(description="Generate data for kernels") parser.add_argument( - 
"-c", - "--cfg", - type=pathlib.Path, - required=True, - help="Select param config file kernel", + "-c", "--cfg", type=pathlib.Path, required=True, help="Select param config file kernel", ) parser.add_argument("-v", "--verbose", action="store_true", help="Set verbose") - args = parser.parse_args() global verbose @@ -532,31 +526,18 @@ def main(): dtype = torch.float32 if param["kernel"] == "Conv2d": + # Generate safely in float32, then cast ifmap = torch.randn( - 1, - param["channels"]["in"], - param["input_dim"]["height"], - param["input_dim"]["width"], - requires_grad=False, - dtype=dtype, - ) + 1, param["channels"]["in"], param["input_dim"]["height"], param["input_dim"]["width"], + requires_grad=False, dtype=torch.float32, + ).to(dtype) weights = torch.randn( - param["channels"]["out"], - param["channels"]["in"], - param["filter"]["height"], - param["filter"]["width"], - requires_grad=False, - dtype=dtype, - ) + param["channels"]["out"], param["channels"]["in"], param["filter"]["height"], param["filter"]["width"], + requires_grad=False, dtype=torch.float32, + ).to(dtype) - ofmap = conv2d( - ifmap, - weights, - padding=param["filter"]["padding"], - stride=param["filter"]["stride"], - ) + ofmap = conv2d(ifmap, weights, padding=param["filter"]["padding"], stride=param["filter"]["stride"]) - # convert from CHW to HWC format ifmap = ifmap.permute(0, 2, 3, 1) ofmap = ofmap.permute(0, 2, 3, 1) weights = weights.permute(0, 2, 3, 1) @@ -568,7 +549,8 @@ def main(): mat_B, bits_B = rand_data_generator((param["K"], param["N"]), param["prec"]) mat_C, bits_C = rand_data_generator((param["M"], param["N"]), param["prec"]) - result = torch.matmul(mat_A, mat_B) + # Upcast for CPU Math + result = torch.matmul(mat_A.float(), mat_B.float()).to(dtype) if param["transpose_A"]: mat_A = mat_A.T @@ -576,38 +558,22 @@ def main(): mat_B = mat_B.T kwargs = { - "A": mat_A, - "B": mat_B, - "C": mat_C, - "result": result, - "M": param["M"], - "N": param["N"], - "K": param["K"], - "ta": 
param["transpose_A"], - "tb": param["transpose_B"], - "alpha": param["alpha"], - "prec": param["prec"], - "expand": param["expand"], - "bits_A": bits_A, - "bits_B": bits_B, - "bits_C": bits_C, + "A": mat_A, "B": mat_B, "C": mat_C, "result": result, + "M": param["M"], "N": param["N"], "K": param["K"], + "ta": param["transpose_A"], "tb": param["transpose_B"], "alpha": param["alpha"], + "prec": param["prec"], "expand": param["expand"], + "bits_A": bits_A, "bits_B": bits_B, "bits_C": bits_C, } - emit_header_file("GEMM", **kwargs) elif param["kernel"] == "BatchNorm": ifmap = torch.randn( - 1, - param["channels"]["in"], - param["input_dim"]["height"], - param["input_dim"]["width"], - requires_grad=False, - dtype=dtype, - ) + 1, param["channels"]["in"], param["input_dim"]["height"], param["input_dim"]["width"], + requires_grad=False, dtype=torch.float32, + ).to(dtype) ofmap, gamma, beta = batchnorm(ifmap) - # convert from CHW to HWC format ifmap = ifmap.permute(0, 2, 3, 1) ofmap = ofmap.permute(0, 2, 3, 1) @@ -616,17 +582,12 @@ def main(): elif param["kernel"] == "MaxPool": ifmap = torch.randn( - 1, - param["channels"]["in"], - param["input_dim"]["height"], - param["input_dim"]["width"], - requires_grad=False, - dtype=dtype, - ) + 1, param["channels"]["in"], param["input_dim"]["height"], param["input_dim"]["width"], + requires_grad=False, dtype=torch.float32, + ).to(dtype) ofmap = max_pooling(ifmap, param["kernel_size"]) - # convert from CHW to HWC format ifmap = ifmap.permute(0, 2, 3, 1) ofmap = ofmap.permute(0, 2, 3, 1) @@ -635,44 +596,28 @@ def main(): elif param["kernel"] == "FusedConv": ifmap = torch.randn( - param["dim_in_y"], - param["dim_in_x"], - param["ch_in"], - requires_grad=False, - dtype=dtype, - ) + param["dim_in_y"], param["dim_in_x"], param["ch_in"], + requires_grad=False, dtype=torch.float32, + ).to(dtype) + if not param["depthwise"]: kernel = torch.randn( - param["ch_out"], - param["dim_kernel_y"], - param["dim_kernel_x"], - param["ch_in"], - 
requires_grad=False, - dtype=dtype, - ) + param["ch_out"], param["dim_kernel_y"], param["dim_kernel_x"], param["ch_in"], + requires_grad=False, dtype=torch.float32, + ).to(dtype) else: kernel = torch.randn( - param["dim_kernel_y"], - param["dim_kernel_x"], - param["ch_in"], - requires_grad=False, - dtype=dtype, - ) + param["dim_kernel_y"], param["dim_kernel_x"], param["ch_in"], + requires_grad=False, dtype=torch.float32, + ).to(dtype) - bn_k = torch.randn(param["ch_out"], requires_grad=False) - bn_l = torch.randn(param["ch_out"], requires_grad=False) + bn_k = torch.randn(param["ch_out"], requires_grad=False, dtype=torch.float32).to(dtype) + bn_l = torch.randn(param["ch_out"], requires_grad=False, dtype=torch.float32).to(dtype) ofmap, ofmap_before, ifmap_padded = fused_conv( - ifmap, - kernel, - bn_k, - bn_l, - param["padding"], - param["stride"], - param["flags"]["flag_batch_norm"], - param["flags"]["flag_relu"], - not param["flags"]["flag_y_accumulate_start"], - param["depthwise"], + ifmap, kernel, bn_k, bn_l, param["padding"], param["stride"], + param["flags"]["flag_batch_norm"], param["flags"]["flag_relu"], + not param["flags"]["flag_y_accumulate_start"], param["depthwise"], ) if param["chw_layer"]: @@ -681,25 +626,15 @@ def main(): kernel = kernel.permute(0, 3, 1, 2) kwargs = { - "ifmap": ifmap, - "ifmap_padded": ifmap_padded, - "ofmap": ofmap, - "ofmap_before": ofmap_before, - "kernel": kernel, - "bn_k": bn_k, - "bn_l": bn_l, - "padding": param["padding"], - "stride": param["stride"], - "prec": param["prec"], - "flags": param["flags"], - "depthwise": param["depthwise"], - "chw_layer": param["chw_layer"], + "ifmap": ifmap, "ifmap_padded": ifmap_padded, "ofmap": ofmap, + "ofmap_before": ofmap_before, "kernel": kernel, "bn_k": bn_k, "bn_l": bn_l, + "padding": param["padding"], "stride": param["stride"], "prec": param["prec"], + "flags": param["flags"], "depthwise": param["depthwise"], "chw_layer": param["chw_layer"], } emit_header_file("FusedConv", **kwargs) 
else: print("No valid kernel selected") - if __name__ == "__main__": main() diff --git a/sw/spatzBenchmarks/sdotp-bp-fmatmul/script/gen_data.py b/sw/spatzBenchmarks/sdotp-bp-fmatmul/script/gen_data.py index 9918713e..0c500558 100755 --- a/sw/spatzBenchmarks/sdotp-bp-fmatmul/script/gen_data.py +++ b/sw/spatzBenchmarks/sdotp-bp-fmatmul/script/gen_data.py @@ -42,7 +42,6 @@ def array_to_cstr(a, fmt=float): def emit_header_file(layer_type: str, **kwargs): - file_path = pathlib.Path(__file__).parent.parent / "data" file_path.mkdir(parents=True, exist_ok=True) emit_str = ( @@ -96,7 +95,7 @@ def emit_conv2d_layer(name="conv2d", **kwargs): layer_str += f'static double {name}_result[{oh}][{ow}][{co}] __attribute__((section(".data")));\n\n' layer_str += ( f"static double {name}_checksum[{oh}][{ow}] = " - + array_to_cstr(torch.sum(ofmap, dim=-1)) + + array_to_cstr(torch.sum(ofmap.float(), dim=-1)) # Safe sum for checksum + ";\n\n\n" ) layer_str += ( @@ -119,7 +118,6 @@ def emit_conv2d_layer(name="conv2d", **kwargs): def emit_linear_layer(input, weights, ofmap): - layer_str = "" return layer_str @@ -167,23 +165,23 @@ def emit_GEMM_layer(name="gemm", **kwargs): + ";\n\n\n" ) layer_str += ( - f"static const {dtype} {name}_checksum[{m}] = " - + array_to_cstr(torch.sum(result, dim=-1)) + f"static const {dtype} {name}_result[{m}*{n}] = " + + array_to_cstr(torch.sum(result.float(), dim=-1)) + ";\n\n\n" ) else: layer_str += ( - f"static {dtype} {name}_A_dram [{m}*{k}] = " + f"static {dtype} {name}_A_dram [{m}][{k}] = " + array_to_cstr(kwargs["bits_A"], fmt="char") + ";\n\n\n" ) layer_str += ( - f"static {dtype} {name}_B_dram [{k}*{n}] = " + f"static {dtype} {name}_B_dram [{k}][{n}] = " + array_to_cstr(kwargs["bits_B"], fmt="char") + ";\n\n\n" ) layer_str += ( - f"static {dtype} {name}_C_dram [{m}*{n}] = " + f"static {dtype} {name}_C_dram [{m}][{n}] = " + array_to_cstr(kwargs["bits_C"], fmt="char") + ";\n\n\n" ) @@ -192,7 +190,6 @@ def emit_GEMM_layer(name="gemm", **kwargs): def 
emit_batchnorm_layer(name="batchnorm", **kwargs): - ifmap = kwargs["ifmap"] ofmap = kwargs["ofmap"] beta = kwargs["beta"] @@ -215,7 +212,7 @@ def emit_batchnorm_layer(name="batchnorm", **kwargs): layer_str += f'static double {name}_result[{oh}][{ow}][{co}] __attribute__((section(".data")));\n\n' layer_str += ( f"static double {name}_checksum[{oh}][{ow}] = " - + array_to_cstr(torch.sum(ofmap, dim=-1)) + + array_to_cstr(torch.sum(ofmap.float(), dim=-1)) + ";\n\n\n" ) layer_str += ( @@ -239,7 +236,6 @@ def emit_batchnorm_layer(name="batchnorm", **kwargs): def emit_maxpool_layer(name="maxpool", **kwargs): - ifmap = kwargs["ifmap"] ofmap = kwargs["ofmap"] k = kwargs["kernel_size"] @@ -263,7 +259,7 @@ def emit_maxpool_layer(name="maxpool", **kwargs): layer_str += f'static double {name}_result[{oh}][{ow}][{co}] __attribute__((section(".data")));\n\n' layer_str += ( f"static double {name}_checksum[{oh}][{ow}] = " - + array_to_cstr(torch.sum(ofmap, dim=-1)) + + array_to_cstr(torch.sum(ofmap.float(), dim=-1)) + ";\n\n\n" ) layer_str += ( @@ -281,7 +277,6 @@ def emit_maxpool_layer(name="maxpool", **kwargs): def emit_fusedconv(name="fusedconv", **kwargs): - ifmap = kwargs["ifmap"] kernel = kwargs["kernel"] bn_k = kwargs["bn_k"] @@ -363,70 +358,78 @@ def rand_data_generator(shape, prec, alt=False): return torch.randn(shape, requires_grad=False, dtype=torch.float32), {} elif prec == 16: if alt: - return torch.randn(shape, requires_grad=False, dtype=torch.bfloat16), {} + # Universal Fix: Generate FP32, cast to BF16 + return torch.randn(shape, requires_grad=False, dtype=torch.float32).to(torch.bfloat16), {} else: - return ( - torch.randn( - shape, requires_grad=False, dtype=torch.float16, device=device - ), - {}, - ) + # Universal Fix: Generate FP32, cast to FP16 + return torch.randn(shape, requires_grad=False, dtype=torch.float32, device=device).to(torch.float16), {} elif prec == 8: - sign = torch.randint( - 0, 2, shape, requires_grad=False, dtype=torch.uint8 - ) # -1 or 1 - 
exponent = torch.randint( - 0, 16, shape, requires_grad=False, dtype=torch.uint8 - ) # < 0b01111 - mantissa = torch.randint( - 0, 4, shape, requires_grad=False, dtype=torch.uint8 - ) # can be arbitrary + sign = torch.randint(0, 2, shape, requires_grad=False, dtype=torch.uint8) + exponent = torch.randint(0, 16, shape, requires_grad=False, dtype=torch.uint8) + mantissa = torch.randint(0, 4, shape, requires_grad=False, dtype=torch.uint8) bits = {"sign": sign, "exponent": exponent, "mantissa": mantissa} - # TODO: not actually correct return ((-1.0) ** sign.double()) * (2.0 ** (exponent.double() - 15.0)) * ( 1.0 + mantissa.double() / (2**2) ), bits def conv2d(ifmap, weights, padding=1, stride=1): + # Universal Fix: Upcast for CPU Math + orig_dtype = ifmap.dtype + ifmap, weights = ifmap.float(), weights.float() + n, ci, ih, iw = ifmap.shape co, _, fh, fw = weights.shape conv2d = nn.Conv2d(ci, co, (fh, fw), padding=((fh - 1) // 2, (fw - 1) // 2)) conv2d.weight = nn.Parameter(weights, requires_grad=False) - conv2d.bias = nn.Parameter( - torch.zeros_like(conv2d.bias, dtype=weights.dtype), requires_grad=False - ) + # Ensure bias natively generates in float32 + conv2d.bias = nn.Parameter(torch.zeros_like(conv2d.bias, dtype=torch.float32), requires_grad=False) + ofmap = conv2d(ifmap) - return ofmap + # Universal Fix: Downcast back + return ofmap.to(orig_dtype) def max_pooling(ifmap, kernel): + orig_dtype = ifmap.dtype + ifmap = ifmap.float() + n, ci, ih, iw = ifmap.shape max_pool = nn.MaxPool2d(kernel_size=kernel) ofmap = max_pool(ifmap) - return ofmap + return ofmap.to(orig_dtype) def batchnorm(ifmap): + orig_dtype = ifmap.dtype + ifmap = ifmap.float() + n, ci, ih, iw = ifmap.shape bn = torch.nn.BatchNorm2d(ci) bn.weight.requires_grad = False bn.bias.requires_grad = False - running_mean = torch.randn_like(bn.running_mean, requires_grad=False) - running_var = torch.rand_like(bn.running_var, requires_grad=False) + running_mean = torch.randn_like(bn.running_mean, 
requires_grad=False, dtype=torch.float32) + running_var = torch.rand_like(bn.running_var, requires_grad=False, dtype=torch.float32) + gamma = bn.weight / torch.sqrt(running_var + bn.eps) beta = bn.bias - running_mean * bn.weight / torch.sqrt(running_var + bn.eps) ofmap = ifmap * gamma.unsqueeze(-1).unsqueeze(-1) + beta.unsqueeze(-1).unsqueeze(-1) - return ofmap, gamma, beta + return ofmap.to(orig_dtype), gamma.to(orig_dtype), beta.to(orig_dtype) def fused_conv( ifmap, weights, bn_k, bn_l, padding, stride, bn, relu, accumulate, depthwise ): + # Universal Fix: Upcast EVERYTHING before starting the math + orig_dtype = ifmap.dtype + ifmap = ifmap.float() + weights = weights.float() + bn_k = bn_k.float() + bn_l = bn_l.float() ih, iw, ci = ifmap.shape if not depthwise: @@ -440,14 +443,13 @@ def fused_conv( iw + padding["padding_x_left"] + padding["padding_x_right"], ci, requires_grad=False, - dtype=ifmap.dtype, + dtype=ifmap.dtype, # Safely uses float32 ) ifmap_padded[ padding["padding_y_top"] : ih + padding["padding_y_top"], padding["padding_x_left"] : iw + padding["padding_x_left"], ] = ifmap - # Don't cover undefined behaviour when there are steps without a complete kernel window if (ifmap_padded.shape[0] - (fh - 1) - 1) % stride["stride_y"] != 0: print("Warning: rounding h output dimension") if (ifmap_padded.shape[1] - (fw - 1) - 1) % stride["stride_x"] != 0: @@ -458,16 +460,16 @@ def fused_conv( (ifmap_padded.shape[1] - (fw - 1) - 1) // stride["stride_x"] + 1, co, ) + if accumulate: - ofmap_before = torch.randn_like(ofmap, requires_grad=False) + ofmap_before = torch.randn_like(ofmap, requires_grad=False, dtype=torch.float32) else: - ofmap_before = torch.zeros_like(ofmap, requires_grad=False) + ofmap_before = torch.zeros_like(ofmap, requires_grad=False, dtype=torch.float32) if verbose: print(ifmap.shape, ifmap_padded.shape, ofmap.shape) if depthwise: - # depthwise Conv2d for h in range(0, ifmap_padded.shape[0] - (fh - 1), stride["stride_y"]): for w in range(0, 
ifmap_padded.shape[1] - (fw - 1), stride["stride_x"]): for c in range(co): @@ -478,7 +480,6 @@ def fused_conv( weights[:, :, c].flatten(), ) else: - # Conv2d for h in range(0, ifmap_padded.shape[0] - (fh - 1), stride["stride_y"]): for w in range(0, ifmap_padded.shape[1] - (fw - 1), stride["stride_x"]): for c in range(co): @@ -491,29 +492,22 @@ def fused_conv( ofmap += ofmap_before - # BatchNorm if bn: ofmap = ofmap * bn_k + bn_l - # ReLU if relu: ofmap = torch.nn.functional.relu(ofmap) - return ofmap, ofmap_before, ifmap_padded + # Universal Fix: Downcast back + return ofmap.to(orig_dtype), ofmap_before.to(orig_dtype), ifmap_padded.to(orig_dtype) def main(): - parser = argparse.ArgumentParser(description="Generate data for kernels") parser.add_argument( - "-c", - "--cfg", - type=pathlib.Path, - required=True, - help="Select param config file kernel", + "-c", "--cfg", type=pathlib.Path, required=True, help="Select param config file kernel", ) parser.add_argument("-v", "--verbose", action="store_true", help="Set verbose") - args = parser.parse_args() global verbose @@ -532,31 +526,18 @@ def main(): dtype = torch.float32 if param["kernel"] == "Conv2d": + # Generate safely in float32, then cast ifmap = torch.randn( - 1, - param["channels"]["in"], - param["input_dim"]["height"], - param["input_dim"]["width"], - requires_grad=False, - dtype=dtype, - ) + 1, param["channels"]["in"], param["input_dim"]["height"], param["input_dim"]["width"], + requires_grad=False, dtype=torch.float32, + ).to(dtype) weights = torch.randn( - param["channels"]["out"], - param["channels"]["in"], - param["filter"]["height"], - param["filter"]["width"], - requires_grad=False, - dtype=dtype, - ) + param["channels"]["out"], param["channels"]["in"], param["filter"]["height"], param["filter"]["width"], + requires_grad=False, dtype=torch.float32, + ).to(dtype) - ofmap = conv2d( - ifmap, - weights, - padding=param["filter"]["padding"], - stride=param["filter"]["stride"], - ) + ofmap = conv2d(ifmap, 
weights, padding=param["filter"]["padding"], stride=param["filter"]["stride"]) - # convert from CHW to HWC format ifmap = ifmap.permute(0, 2, 3, 1) ofmap = ofmap.permute(0, 2, 3, 1) weights = weights.permute(0, 2, 3, 1) @@ -568,7 +549,8 @@ def main(): mat_B, bits_B = rand_data_generator((param["K"], param["N"]), param["prec"]) mat_C, bits_C = rand_data_generator((param["M"], param["N"]), param["prec"]) - result = torch.matmul(mat_A, mat_B) + # Upcast for CPU Math + result = torch.matmul(mat_A.float(), mat_B.float()).to(dtype) if param["transpose_A"]: mat_A = mat_A.T @@ -592,22 +574,16 @@ def main(): "bits_B": bits_B, "bits_C": bits_C, } - emit_header_file("GEMM", **kwargs) elif param["kernel"] == "BatchNorm": ifmap = torch.randn( - 1, - param["channels"]["in"], - param["input_dim"]["height"], - param["input_dim"]["width"], - requires_grad=False, - dtype=dtype, - ) + 1, param["channels"]["in"], param["input_dim"]["height"], param["input_dim"]["width"], + requires_grad=False, dtype=torch.float32, + ).to(dtype) ofmap, gamma, beta = batchnorm(ifmap) - # convert from CHW to HWC format ifmap = ifmap.permute(0, 2, 3, 1) ofmap = ofmap.permute(0, 2, 3, 1) @@ -616,17 +592,12 @@ def main(): elif param["kernel"] == "MaxPool": ifmap = torch.randn( - 1, - param["channels"]["in"], - param["input_dim"]["height"], - param["input_dim"]["width"], - requires_grad=False, - dtype=dtype, - ) + 1, param["channels"]["in"], param["input_dim"]["height"], param["input_dim"]["width"], + requires_grad=False, dtype=torch.float32, + ).to(dtype) ofmap = max_pooling(ifmap, param["kernel_size"]) - # convert from CHW to HWC format ifmap = ifmap.permute(0, 2, 3, 1) ofmap = ofmap.permute(0, 2, 3, 1) @@ -635,44 +606,28 @@ def main(): elif param["kernel"] == "FusedConv": ifmap = torch.randn( - param["dim_in_y"], - param["dim_in_x"], - param["ch_in"], - requires_grad=False, - dtype=dtype, - ) + param["dim_in_y"], param["dim_in_x"], param["ch_in"], + requires_grad=False, dtype=torch.float32, + ).to(dtype) 
+ if not param["depthwise"]: kernel = torch.randn( - param["ch_out"], - param["dim_kernel_y"], - param["dim_kernel_x"], - param["ch_in"], - requires_grad=False, - dtype=dtype, - ) + param["ch_out"], param["dim_kernel_y"], param["dim_kernel_x"], param["ch_in"], + requires_grad=False, dtype=torch.float32, + ).to(dtype) else: kernel = torch.randn( - param["dim_kernel_y"], - param["dim_kernel_x"], - param["ch_in"], - requires_grad=False, - dtype=dtype, - ) + param["dim_kernel_y"], param["dim_kernel_x"], param["ch_in"], + requires_grad=False, dtype=torch.float32, + ).to(dtype) - bn_k = torch.randn(param["ch_out"], requires_grad=False) - bn_l = torch.randn(param["ch_out"], requires_grad=False) + bn_k = torch.randn(param["ch_out"], requires_grad=False, dtype=torch.float32).to(dtype) + bn_l = torch.randn(param["ch_out"], requires_grad=False, dtype=torch.float32).to(dtype) ofmap, ofmap_before, ifmap_padded = fused_conv( - ifmap, - kernel, - bn_k, - bn_l, - param["padding"], - param["stride"], - param["flags"]["flag_batch_norm"], - param["flags"]["flag_relu"], - not param["flags"]["flag_y_accumulate_start"], - param["depthwise"], + ifmap, kernel, bn_k, bn_l, param["padding"], param["stride"], + param["flags"]["flag_batch_norm"], param["flags"]["flag_relu"], + not param["flags"]["flag_y_accumulate_start"], param["depthwise"], ) if param["chw_layer"]: @@ -700,6 +655,5 @@ def main(): else: print("No valid kernel selected") - if __name__ == "__main__": main() diff --git a/sw/spatzBenchmarks/sdotp-hp-fmatmul/script/gen_data.py b/sw/spatzBenchmarks/sdotp-hp-fmatmul/script/gen_data.py index 0cd18994..0c500558 100755 --- a/sw/spatzBenchmarks/sdotp-hp-fmatmul/script/gen_data.py +++ b/sw/spatzBenchmarks/sdotp-hp-fmatmul/script/gen_data.py @@ -42,7 +42,6 @@ def array_to_cstr(a, fmt=float): def emit_header_file(layer_type: str, **kwargs): - file_path = pathlib.Path(__file__).parent.parent / "data" file_path.mkdir(parents=True, exist_ok=True) emit_str = ( @@ -96,7 +95,7 @@ def 
emit_conv2d_layer(name="conv2d", **kwargs): layer_str += f'static double {name}_result[{oh}][{ow}][{co}] __attribute__((section(".data")));\n\n' layer_str += ( f"static double {name}_checksum[{oh}][{ow}] = " - + array_to_cstr(torch.sum(ofmap, dim=-1)) + + array_to_cstr(torch.sum(ofmap.float(), dim=-1)) # Safe sum for checksum + ";\n\n\n" ) layer_str += ( @@ -119,7 +118,6 @@ def emit_conv2d_layer(name="conv2d", **kwargs): def emit_linear_layer(input, weights, ofmap): - layer_str = "" return layer_str @@ -168,7 +166,7 @@ def emit_GEMM_layer(name="gemm", **kwargs): ) layer_str += ( f"static const {dtype} {name}_result[{m}*{n}] = " - + array_to_cstr(result) + + array_to_cstr(torch.sum(result.float(), dim=-1)) + ";\n\n\n" ) else: @@ -192,7 +190,6 @@ def emit_GEMM_layer(name="gemm", **kwargs): def emit_batchnorm_layer(name="batchnorm", **kwargs): - ifmap = kwargs["ifmap"] ofmap = kwargs["ofmap"] beta = kwargs["beta"] @@ -215,7 +212,7 @@ def emit_batchnorm_layer(name="batchnorm", **kwargs): layer_str += f'static double {name}_result[{oh}][{ow}][{co}] __attribute__((section(".data")));\n\n' layer_str += ( f"static double {name}_checksum[{oh}][{ow}] = " - + array_to_cstr(torch.sum(ofmap, dim=-1)) + + array_to_cstr(torch.sum(ofmap.float(), dim=-1)) + ";\n\n\n" ) layer_str += ( @@ -239,7 +236,6 @@ def emit_batchnorm_layer(name="batchnorm", **kwargs): def emit_maxpool_layer(name="maxpool", **kwargs): - ifmap = kwargs["ifmap"] ofmap = kwargs["ofmap"] k = kwargs["kernel_size"] @@ -263,7 +259,7 @@ def emit_maxpool_layer(name="maxpool", **kwargs): layer_str += f'static double {name}_result[{oh}][{ow}][{co}] __attribute__((section(".data")));\n\n' layer_str += ( f"static double {name}_checksum[{oh}][{ow}] = " - + array_to_cstr(torch.sum(ofmap, dim=-1)) + + array_to_cstr(torch.sum(ofmap.float(), dim=-1)) + ";\n\n\n" ) layer_str += ( @@ -281,7 +277,6 @@ def emit_maxpool_layer(name="maxpool", **kwargs): def emit_fusedconv(name="fusedconv", **kwargs): - ifmap = kwargs["ifmap"] kernel = 
kwargs["kernel"] bn_k = kwargs["bn_k"] @@ -363,70 +358,78 @@ def rand_data_generator(shape, prec, alt=False): return torch.randn(shape, requires_grad=False, dtype=torch.float32), {} elif prec == 16: if alt: - return torch.randn(shape, requires_grad=False, dtype=torch.bfloat16), {} + # Universal Fix: Generate FP32, cast to BF16 + return torch.randn(shape, requires_grad=False, dtype=torch.float32).to(torch.bfloat16), {} else: - return ( - torch.randn( - shape, requires_grad=False, dtype=torch.float16, device=device - ), - {}, - ) + # Universal Fix: Generate FP32, cast to FP16 + return torch.randn(shape, requires_grad=False, dtype=torch.float32, device=device).to(torch.float16), {} elif prec == 8: - sign = torch.randint( - 0, 2, shape, requires_grad=False, dtype=torch.uint8 - ) # -1 or 1 - exponent = torch.randint( - 0, 16, shape, requires_grad=False, dtype=torch.uint8 - ) # < 0b01111 - mantissa = torch.randint( - 0, 4, shape, requires_grad=False, dtype=torch.uint8 - ) # can be arbitrary + sign = torch.randint(0, 2, shape, requires_grad=False, dtype=torch.uint8) + exponent = torch.randint(0, 16, shape, requires_grad=False, dtype=torch.uint8) + mantissa = torch.randint(0, 4, shape, requires_grad=False, dtype=torch.uint8) bits = {"sign": sign, "exponent": exponent, "mantissa": mantissa} - # TODO: not actually correct return ((-1.0) ** sign.double()) * (2.0 ** (exponent.double() - 15.0)) * ( 1.0 + mantissa.double() / (2**2) ), bits def conv2d(ifmap, weights, padding=1, stride=1): + # Universal Fix: Upcast for CPU Math + orig_dtype = ifmap.dtype + ifmap, weights = ifmap.float(), weights.float() + n, ci, ih, iw = ifmap.shape co, _, fh, fw = weights.shape conv2d = nn.Conv2d(ci, co, (fh, fw), padding=((fh - 1) // 2, (fw - 1) // 2)) conv2d.weight = nn.Parameter(weights, requires_grad=False) - conv2d.bias = nn.Parameter( - torch.zeros_like(conv2d.bias, dtype=weights.dtype), requires_grad=False - ) + # Ensure bias natively generates in float32 + conv2d.bias = 
nn.Parameter(torch.zeros_like(conv2d.bias, dtype=torch.float32), requires_grad=False) + ofmap = conv2d(ifmap) - return ofmap + # Universal Fix: Downcast back + return ofmap.to(orig_dtype) def max_pooling(ifmap, kernel): + orig_dtype = ifmap.dtype + ifmap = ifmap.float() + n, ci, ih, iw = ifmap.shape max_pool = nn.MaxPool2d(kernel_size=kernel) ofmap = max_pool(ifmap) - return ofmap + return ofmap.to(orig_dtype) def batchnorm(ifmap): + orig_dtype = ifmap.dtype + ifmap = ifmap.float() + n, ci, ih, iw = ifmap.shape bn = torch.nn.BatchNorm2d(ci) bn.weight.requires_grad = False bn.bias.requires_grad = False - running_mean = torch.randn_like(bn.running_mean, requires_grad=False) - running_var = torch.rand_like(bn.running_var, requires_grad=False) + running_mean = torch.randn_like(bn.running_mean, requires_grad=False, dtype=torch.float32) + running_var = torch.rand_like(bn.running_var, requires_grad=False, dtype=torch.float32) + gamma = bn.weight / torch.sqrt(running_var + bn.eps) beta = bn.bias - running_mean * bn.weight / torch.sqrt(running_var + bn.eps) ofmap = ifmap * gamma.unsqueeze(-1).unsqueeze(-1) + beta.unsqueeze(-1).unsqueeze(-1) - return ofmap, gamma, beta + return ofmap.to(orig_dtype), gamma.to(orig_dtype), beta.to(orig_dtype) def fused_conv( ifmap, weights, bn_k, bn_l, padding, stride, bn, relu, accumulate, depthwise ): + # Universal Fix: Upcast EVERYTHING before starting the math + orig_dtype = ifmap.dtype + ifmap = ifmap.float() + weights = weights.float() + bn_k = bn_k.float() + bn_l = bn_l.float() ih, iw, ci = ifmap.shape if not depthwise: @@ -440,14 +443,13 @@ def fused_conv( iw + padding["padding_x_left"] + padding["padding_x_right"], ci, requires_grad=False, - dtype=ifmap.dtype, + dtype=ifmap.dtype, # Safely uses float32 ) ifmap_padded[ padding["padding_y_top"] : ih + padding["padding_y_top"], padding["padding_x_left"] : iw + padding["padding_x_left"], ] = ifmap - # Don't cover undefined behaviour when there are steps without a complete kernel window if 
(ifmap_padded.shape[0] - (fh - 1) - 1) % stride["stride_y"] != 0: print("Warning: rounding h output dimension") if (ifmap_padded.shape[1] - (fw - 1) - 1) % stride["stride_x"] != 0: @@ -458,16 +460,16 @@ def fused_conv( (ifmap_padded.shape[1] - (fw - 1) - 1) // stride["stride_x"] + 1, co, ) + if accumulate: - ofmap_before = torch.randn_like(ofmap, requires_grad=False) + ofmap_before = torch.randn_like(ofmap, requires_grad=False, dtype=torch.float32) else: - ofmap_before = torch.zeros_like(ofmap, requires_grad=False) + ofmap_before = torch.zeros_like(ofmap, requires_grad=False, dtype=torch.float32) if verbose: print(ifmap.shape, ifmap_padded.shape, ofmap.shape) if depthwise: - # depthwise Conv2d for h in range(0, ifmap_padded.shape[0] - (fh - 1), stride["stride_y"]): for w in range(0, ifmap_padded.shape[1] - (fw - 1), stride["stride_x"]): for c in range(co): @@ -478,7 +480,6 @@ def fused_conv( weights[:, :, c].flatten(), ) else: - # Conv2d for h in range(0, ifmap_padded.shape[0] - (fh - 1), stride["stride_y"]): for w in range(0, ifmap_padded.shape[1] - (fw - 1), stride["stride_x"]): for c in range(co): @@ -491,29 +492,22 @@ def fused_conv( ofmap += ofmap_before - # BatchNorm if bn: ofmap = ofmap * bn_k + bn_l - # ReLU if relu: ofmap = torch.nn.functional.relu(ofmap) - return ofmap, ofmap_before, ifmap_padded + # Universal Fix: Downcast back + return ofmap.to(orig_dtype), ofmap_before.to(orig_dtype), ifmap_padded.to(orig_dtype) def main(): - parser = argparse.ArgumentParser(description="Generate data for kernels") parser.add_argument( - "-c", - "--cfg", - type=pathlib.Path, - required=True, - help="Select param config file kernel", + "-c", "--cfg", type=pathlib.Path, required=True, help="Select param config file kernel", ) parser.add_argument("-v", "--verbose", action="store_true", help="Set verbose") - args = parser.parse_args() global verbose @@ -532,31 +526,18 @@ def main(): dtype = torch.float32 if param["kernel"] == "Conv2d": + # Generate safely in float32, then 
cast ifmap = torch.randn( - 1, - param["channels"]["in"], - param["input_dim"]["height"], - param["input_dim"]["width"], - requires_grad=False, - dtype=dtype, - ) + 1, param["channels"]["in"], param["input_dim"]["height"], param["input_dim"]["width"], + requires_grad=False, dtype=torch.float32, + ).to(dtype) weights = torch.randn( - param["channels"]["out"], - param["channels"]["in"], - param["filter"]["height"], - param["filter"]["width"], - requires_grad=False, - dtype=dtype, - ) + param["channels"]["out"], param["channels"]["in"], param["filter"]["height"], param["filter"]["width"], + requires_grad=False, dtype=torch.float32, + ).to(dtype) - ofmap = conv2d( - ifmap, - weights, - padding=param["filter"]["padding"], - stride=param["filter"]["stride"], - ) + ofmap = conv2d(ifmap, weights, padding=param["filter"]["padding"], stride=param["filter"]["stride"]) - # convert from CHW to HWC format ifmap = ifmap.permute(0, 2, 3, 1) ofmap = ofmap.permute(0, 2, 3, 1) weights = weights.permute(0, 2, 3, 1) @@ -568,7 +549,8 @@ def main(): mat_B, bits_B = rand_data_generator((param["K"], param["N"]), param["prec"]) mat_C, bits_C = rand_data_generator((param["M"], param["N"]), param["prec"]) - result = torch.matmul(mat_A, mat_B) + # Upcast for CPU Math + result = torch.matmul(mat_A.float(), mat_B.float()).to(dtype) if param["transpose_A"]: mat_A = mat_A.T @@ -592,22 +574,16 @@ def main(): "bits_B": bits_B, "bits_C": bits_C, } - emit_header_file("GEMM", **kwargs) elif param["kernel"] == "BatchNorm": ifmap = torch.randn( - 1, - param["channels"]["in"], - param["input_dim"]["height"], - param["input_dim"]["width"], - requires_grad=False, - dtype=dtype, - ) + 1, param["channels"]["in"], param["input_dim"]["height"], param["input_dim"]["width"], + requires_grad=False, dtype=torch.float32, + ).to(dtype) ofmap, gamma, beta = batchnorm(ifmap) - # convert from CHW to HWC format ifmap = ifmap.permute(0, 2, 3, 1) ofmap = ofmap.permute(0, 2, 3, 1) @@ -616,17 +592,12 @@ def main(): elif 
param["kernel"] == "MaxPool": ifmap = torch.randn( - 1, - param["channels"]["in"], - param["input_dim"]["height"], - param["input_dim"]["width"], - requires_grad=False, - dtype=dtype, - ) + 1, param["channels"]["in"], param["input_dim"]["height"], param["input_dim"]["width"], + requires_grad=False, dtype=torch.float32, + ).to(dtype) ofmap = max_pooling(ifmap, param["kernel_size"]) - # convert from CHW to HWC format ifmap = ifmap.permute(0, 2, 3, 1) ofmap = ofmap.permute(0, 2, 3, 1) @@ -635,44 +606,28 @@ def main(): elif param["kernel"] == "FusedConv": ifmap = torch.randn( - param["dim_in_y"], - param["dim_in_x"], - param["ch_in"], - requires_grad=False, - dtype=dtype, - ) + param["dim_in_y"], param["dim_in_x"], param["ch_in"], + requires_grad=False, dtype=torch.float32, + ).to(dtype) + if not param["depthwise"]: kernel = torch.randn( - param["ch_out"], - param["dim_kernel_y"], - param["dim_kernel_x"], - param["ch_in"], - requires_grad=False, - dtype=dtype, - ) + param["ch_out"], param["dim_kernel_y"], param["dim_kernel_x"], param["ch_in"], + requires_grad=False, dtype=torch.float32, + ).to(dtype) else: kernel = torch.randn( - param["dim_kernel_y"], - param["dim_kernel_x"], - param["ch_in"], - requires_grad=False, - dtype=dtype, - ) + param["dim_kernel_y"], param["dim_kernel_x"], param["ch_in"], + requires_grad=False, dtype=torch.float32, + ).to(dtype) - bn_k = torch.randn(param["ch_out"], requires_grad=False) - bn_l = torch.randn(param["ch_out"], requires_grad=False) + bn_k = torch.randn(param["ch_out"], requires_grad=False, dtype=torch.float32).to(dtype) + bn_l = torch.randn(param["ch_out"], requires_grad=False, dtype=torch.float32).to(dtype) ofmap, ofmap_before, ifmap_padded = fused_conv( - ifmap, - kernel, - bn_k, - bn_l, - param["padding"], - param["stride"], - param["flags"]["flag_batch_norm"], - param["flags"]["flag_relu"], - not param["flags"]["flag_y_accumulate_start"], - param["depthwise"], + ifmap, kernel, bn_k, bn_l, param["padding"], param["stride"], + 
param["flags"]["flag_batch_norm"], param["flags"]["flag_relu"], + not param["flags"]["flag_y_accumulate_start"], param["depthwise"], ) if param["chw_layer"]: @@ -700,6 +655,5 @@ def main(): else: print("No valid kernel selected") - if __name__ == "__main__": main() diff --git a/sw/spatzBenchmarks/widening-bp-fmatmul/script/gen_data.py b/sw/spatzBenchmarks/widening-bp-fmatmul/script/gen_data.py index 9918713e..0c500558 100755 --- a/sw/spatzBenchmarks/widening-bp-fmatmul/script/gen_data.py +++ b/sw/spatzBenchmarks/widening-bp-fmatmul/script/gen_data.py @@ -42,7 +42,6 @@ def array_to_cstr(a, fmt=float): def emit_header_file(layer_type: str, **kwargs): - file_path = pathlib.Path(__file__).parent.parent / "data" file_path.mkdir(parents=True, exist_ok=True) emit_str = ( @@ -96,7 +95,7 @@ def emit_conv2d_layer(name="conv2d", **kwargs): layer_str += f'static double {name}_result[{oh}][{ow}][{co}] __attribute__((section(".data")));\n\n' layer_str += ( f"static double {name}_checksum[{oh}][{ow}] = " - + array_to_cstr(torch.sum(ofmap, dim=-1)) + + array_to_cstr(torch.sum(ofmap.float(), dim=-1)) # Safe sum for checksum + ";\n\n\n" ) layer_str += ( @@ -119,7 +118,6 @@ def emit_conv2d_layer(name="conv2d", **kwargs): def emit_linear_layer(input, weights, ofmap): - layer_str = "" return layer_str @@ -167,23 +165,23 @@ def emit_GEMM_layer(name="gemm", **kwargs): + ";\n\n\n" ) layer_str += ( - f"static const {dtype} {name}_checksum[{m}] = " - + array_to_cstr(torch.sum(result, dim=-1)) + f"static const {dtype} {name}_result[{m}*{n}] = " + + array_to_cstr(torch.sum(result.float(), dim=-1)) + ";\n\n\n" ) else: layer_str += ( - f"static {dtype} {name}_A_dram [{m}*{k}] = " + f"static {dtype} {name}_A_dram [{m}][{k}] = " + array_to_cstr(kwargs["bits_A"], fmt="char") + ";\n\n\n" ) layer_str += ( - f"static {dtype} {name}_B_dram [{k}*{n}] = " + f"static {dtype} {name}_B_dram [{k}][{n}] = " + array_to_cstr(kwargs["bits_B"], fmt="char") + ";\n\n\n" ) layer_str += ( - f"static {dtype} 
{name}_C_dram [{m}*{n}] = " + f"static {dtype} {name}_C_dram [{m}][{n}] = " + array_to_cstr(kwargs["bits_C"], fmt="char") + ";\n\n\n" ) @@ -192,7 +190,6 @@ def emit_GEMM_layer(name="gemm", **kwargs): def emit_batchnorm_layer(name="batchnorm", **kwargs): - ifmap = kwargs["ifmap"] ofmap = kwargs["ofmap"] beta = kwargs["beta"] @@ -215,7 +212,7 @@ def emit_batchnorm_layer(name="batchnorm", **kwargs): layer_str += f'static double {name}_result[{oh}][{ow}][{co}] __attribute__((section(".data")));\n\n' layer_str += ( f"static double {name}_checksum[{oh}][{ow}] = " - + array_to_cstr(torch.sum(ofmap, dim=-1)) + + array_to_cstr(torch.sum(ofmap.float(), dim=-1)) + ";\n\n\n" ) layer_str += ( @@ -239,7 +236,6 @@ def emit_batchnorm_layer(name="batchnorm", **kwargs): def emit_maxpool_layer(name="maxpool", **kwargs): - ifmap = kwargs["ifmap"] ofmap = kwargs["ofmap"] k = kwargs["kernel_size"] @@ -263,7 +259,7 @@ def emit_maxpool_layer(name="maxpool", **kwargs): layer_str += f'static double {name}_result[{oh}][{ow}][{co}] __attribute__((section(".data")));\n\n' layer_str += ( f"static double {name}_checksum[{oh}][{ow}] = " - + array_to_cstr(torch.sum(ofmap, dim=-1)) + + array_to_cstr(torch.sum(ofmap.float(), dim=-1)) + ";\n\n\n" ) layer_str += ( @@ -281,7 +277,6 @@ def emit_maxpool_layer(name="maxpool", **kwargs): def emit_fusedconv(name="fusedconv", **kwargs): - ifmap = kwargs["ifmap"] kernel = kwargs["kernel"] bn_k = kwargs["bn_k"] @@ -363,70 +358,78 @@ def rand_data_generator(shape, prec, alt=False): return torch.randn(shape, requires_grad=False, dtype=torch.float32), {} elif prec == 16: if alt: - return torch.randn(shape, requires_grad=False, dtype=torch.bfloat16), {} + # Universal Fix: Generate FP32, cast to BF16 + return torch.randn(shape, requires_grad=False, dtype=torch.float32).to(torch.bfloat16), {} else: - return ( - torch.randn( - shape, requires_grad=False, dtype=torch.float16, device=device - ), - {}, - ) + # Universal Fix: Generate FP32, cast to FP16 + return 
torch.randn(shape, requires_grad=False, dtype=torch.float32, device=device).to(torch.float16), {} elif prec == 8: - sign = torch.randint( - 0, 2, shape, requires_grad=False, dtype=torch.uint8 - ) # -1 or 1 - exponent = torch.randint( - 0, 16, shape, requires_grad=False, dtype=torch.uint8 - ) # < 0b01111 - mantissa = torch.randint( - 0, 4, shape, requires_grad=False, dtype=torch.uint8 - ) # can be arbitrary + sign = torch.randint(0, 2, shape, requires_grad=False, dtype=torch.uint8) + exponent = torch.randint(0, 16, shape, requires_grad=False, dtype=torch.uint8) + mantissa = torch.randint(0, 4, shape, requires_grad=False, dtype=torch.uint8) bits = {"sign": sign, "exponent": exponent, "mantissa": mantissa} - # TODO: not actually correct return ((-1.0) ** sign.double()) * (2.0 ** (exponent.double() - 15.0)) * ( 1.0 + mantissa.double() / (2**2) ), bits def conv2d(ifmap, weights, padding=1, stride=1): + # Universal Fix: Upcast for CPU Math + orig_dtype = ifmap.dtype + ifmap, weights = ifmap.float(), weights.float() + n, ci, ih, iw = ifmap.shape co, _, fh, fw = weights.shape conv2d = nn.Conv2d(ci, co, (fh, fw), padding=((fh - 1) // 2, (fw - 1) // 2)) conv2d.weight = nn.Parameter(weights, requires_grad=False) - conv2d.bias = nn.Parameter( - torch.zeros_like(conv2d.bias, dtype=weights.dtype), requires_grad=False - ) + # Ensure bias natively generates in float32 + conv2d.bias = nn.Parameter(torch.zeros_like(conv2d.bias, dtype=torch.float32), requires_grad=False) + ofmap = conv2d(ifmap) - return ofmap + # Universal Fix: Downcast back + return ofmap.to(orig_dtype) def max_pooling(ifmap, kernel): + orig_dtype = ifmap.dtype + ifmap = ifmap.float() + n, ci, ih, iw = ifmap.shape max_pool = nn.MaxPool2d(kernel_size=kernel) ofmap = max_pool(ifmap) - return ofmap + return ofmap.to(orig_dtype) def batchnorm(ifmap): + orig_dtype = ifmap.dtype + ifmap = ifmap.float() + n, ci, ih, iw = ifmap.shape bn = torch.nn.BatchNorm2d(ci) bn.weight.requires_grad = False bn.bias.requires_grad = False 
- running_mean = torch.randn_like(bn.running_mean, requires_grad=False) - running_var = torch.rand_like(bn.running_var, requires_grad=False) + running_mean = torch.randn_like(bn.running_mean, requires_grad=False, dtype=torch.float32) + running_var = torch.rand_like(bn.running_var, requires_grad=False, dtype=torch.float32) + gamma = bn.weight / torch.sqrt(running_var + bn.eps) beta = bn.bias - running_mean * bn.weight / torch.sqrt(running_var + bn.eps) ofmap = ifmap * gamma.unsqueeze(-1).unsqueeze(-1) + beta.unsqueeze(-1).unsqueeze(-1) - return ofmap, gamma, beta + return ofmap.to(orig_dtype), gamma.to(orig_dtype), beta.to(orig_dtype) def fused_conv( ifmap, weights, bn_k, bn_l, padding, stride, bn, relu, accumulate, depthwise ): + # Universal Fix: Upcast EVERYTHING before starting the math + orig_dtype = ifmap.dtype + ifmap = ifmap.float() + weights = weights.float() + bn_k = bn_k.float() + bn_l = bn_l.float() ih, iw, ci = ifmap.shape if not depthwise: @@ -440,14 +443,13 @@ def fused_conv( iw + padding["padding_x_left"] + padding["padding_x_right"], ci, requires_grad=False, - dtype=ifmap.dtype, + dtype=ifmap.dtype, # Safely uses float32 ) ifmap_padded[ padding["padding_y_top"] : ih + padding["padding_y_top"], padding["padding_x_left"] : iw + padding["padding_x_left"], ] = ifmap - # Don't cover undefined behaviour when there are steps without a complete kernel window if (ifmap_padded.shape[0] - (fh - 1) - 1) % stride["stride_y"] != 0: print("Warning: rounding h output dimension") if (ifmap_padded.shape[1] - (fw - 1) - 1) % stride["stride_x"] != 0: @@ -458,16 +460,16 @@ def fused_conv( (ifmap_padded.shape[1] - (fw - 1) - 1) // stride["stride_x"] + 1, co, ) + if accumulate: - ofmap_before = torch.randn_like(ofmap, requires_grad=False) + ofmap_before = torch.randn_like(ofmap, requires_grad=False, dtype=torch.float32) else: - ofmap_before = torch.zeros_like(ofmap, requires_grad=False) + ofmap_before = torch.zeros_like(ofmap, requires_grad=False, dtype=torch.float32) if 
verbose: print(ifmap.shape, ifmap_padded.shape, ofmap.shape) if depthwise: - # depthwise Conv2d for h in range(0, ifmap_padded.shape[0] - (fh - 1), stride["stride_y"]): for w in range(0, ifmap_padded.shape[1] - (fw - 1), stride["stride_x"]): for c in range(co): @@ -478,7 +480,6 @@ def fused_conv( weights[:, :, c].flatten(), ) else: - # Conv2d for h in range(0, ifmap_padded.shape[0] - (fh - 1), stride["stride_y"]): for w in range(0, ifmap_padded.shape[1] - (fw - 1), stride["stride_x"]): for c in range(co): @@ -491,29 +492,22 @@ def fused_conv( ofmap += ofmap_before - # BatchNorm if bn: ofmap = ofmap * bn_k + bn_l - # ReLU if relu: ofmap = torch.nn.functional.relu(ofmap) - return ofmap, ofmap_before, ifmap_padded + # Universal Fix: Downcast back + return ofmap.to(orig_dtype), ofmap_before.to(orig_dtype), ifmap_padded.to(orig_dtype) def main(): - parser = argparse.ArgumentParser(description="Generate data for kernels") parser.add_argument( - "-c", - "--cfg", - type=pathlib.Path, - required=True, - help="Select param config file kernel", + "-c", "--cfg", type=pathlib.Path, required=True, help="Select param config file kernel", ) parser.add_argument("-v", "--verbose", action="store_true", help="Set verbose") - args = parser.parse_args() global verbose @@ -532,31 +526,18 @@ def main(): dtype = torch.float32 if param["kernel"] == "Conv2d": + # Generate safely in float32, then cast ifmap = torch.randn( - 1, - param["channels"]["in"], - param["input_dim"]["height"], - param["input_dim"]["width"], - requires_grad=False, - dtype=dtype, - ) + 1, param["channels"]["in"], param["input_dim"]["height"], param["input_dim"]["width"], + requires_grad=False, dtype=torch.float32, + ).to(dtype) weights = torch.randn( - param["channels"]["out"], - param["channels"]["in"], - param["filter"]["height"], - param["filter"]["width"], - requires_grad=False, - dtype=dtype, - ) + param["channels"]["out"], param["channels"]["in"], param["filter"]["height"], param["filter"]["width"], + 
requires_grad=False, dtype=torch.float32, + ).to(dtype) - ofmap = conv2d( - ifmap, - weights, - padding=param["filter"]["padding"], - stride=param["filter"]["stride"], - ) + ofmap = conv2d(ifmap, weights, padding=param["filter"]["padding"], stride=param["filter"]["stride"]) - # convert from CHW to HWC format ifmap = ifmap.permute(0, 2, 3, 1) ofmap = ofmap.permute(0, 2, 3, 1) weights = weights.permute(0, 2, 3, 1) @@ -568,7 +549,8 @@ def main(): mat_B, bits_B = rand_data_generator((param["K"], param["N"]), param["prec"]) mat_C, bits_C = rand_data_generator((param["M"], param["N"]), param["prec"]) - result = torch.matmul(mat_A, mat_B) + # Upcast for CPU Math + result = torch.matmul(mat_A.float(), mat_B.float()).to(dtype) if param["transpose_A"]: mat_A = mat_A.T @@ -592,22 +574,16 @@ def main(): "bits_B": bits_B, "bits_C": bits_C, } - emit_header_file("GEMM", **kwargs) elif param["kernel"] == "BatchNorm": ifmap = torch.randn( - 1, - param["channels"]["in"], - param["input_dim"]["height"], - param["input_dim"]["width"], - requires_grad=False, - dtype=dtype, - ) + 1, param["channels"]["in"], param["input_dim"]["height"], param["input_dim"]["width"], + requires_grad=False, dtype=torch.float32, + ).to(dtype) ofmap, gamma, beta = batchnorm(ifmap) - # convert from CHW to HWC format ifmap = ifmap.permute(0, 2, 3, 1) ofmap = ofmap.permute(0, 2, 3, 1) @@ -616,17 +592,12 @@ def main(): elif param["kernel"] == "MaxPool": ifmap = torch.randn( - 1, - param["channels"]["in"], - param["input_dim"]["height"], - param["input_dim"]["width"], - requires_grad=False, - dtype=dtype, - ) + 1, param["channels"]["in"], param["input_dim"]["height"], param["input_dim"]["width"], + requires_grad=False, dtype=torch.float32, + ).to(dtype) ofmap = max_pooling(ifmap, param["kernel_size"]) - # convert from CHW to HWC format ifmap = ifmap.permute(0, 2, 3, 1) ofmap = ofmap.permute(0, 2, 3, 1) @@ -635,44 +606,28 @@ def main(): elif param["kernel"] == "FusedConv": ifmap = torch.randn( - param["dim_in_y"], 
- param["dim_in_x"], - param["ch_in"], - requires_grad=False, - dtype=dtype, - ) + param["dim_in_y"], param["dim_in_x"], param["ch_in"], + requires_grad=False, dtype=torch.float32, + ).to(dtype) + if not param["depthwise"]: kernel = torch.randn( - param["ch_out"], - param["dim_kernel_y"], - param["dim_kernel_x"], - param["ch_in"], - requires_grad=False, - dtype=dtype, - ) + param["ch_out"], param["dim_kernel_y"], param["dim_kernel_x"], param["ch_in"], + requires_grad=False, dtype=torch.float32, + ).to(dtype) else: kernel = torch.randn( - param["dim_kernel_y"], - param["dim_kernel_x"], - param["ch_in"], - requires_grad=False, - dtype=dtype, - ) + param["dim_kernel_y"], param["dim_kernel_x"], param["ch_in"], + requires_grad=False, dtype=torch.float32, + ).to(dtype) - bn_k = torch.randn(param["ch_out"], requires_grad=False) - bn_l = torch.randn(param["ch_out"], requires_grad=False) + bn_k = torch.randn(param["ch_out"], requires_grad=False, dtype=torch.float32).to(dtype) + bn_l = torch.randn(param["ch_out"], requires_grad=False, dtype=torch.float32).to(dtype) ofmap, ofmap_before, ifmap_padded = fused_conv( - ifmap, - kernel, - bn_k, - bn_l, - param["padding"], - param["stride"], - param["flags"]["flag_batch_norm"], - param["flags"]["flag_relu"], - not param["flags"]["flag_y_accumulate_start"], - param["depthwise"], + ifmap, kernel, bn_k, bn_l, param["padding"], param["stride"], + param["flags"]["flag_batch_norm"], param["flags"]["flag_relu"], + not param["flags"]["flag_y_accumulate_start"], param["depthwise"], ) if param["chw_layer"]: @@ -700,6 +655,5 @@ def main(): else: print("No valid kernel selected") - if __name__ == "__main__": main() diff --git a/sw/spatzBenchmarks/widening-hp-fmatmul/script/gen_data.py b/sw/spatzBenchmarks/widening-hp-fmatmul/script/gen_data.py index 0cd18994..0c500558 100755 --- a/sw/spatzBenchmarks/widening-hp-fmatmul/script/gen_data.py +++ b/sw/spatzBenchmarks/widening-hp-fmatmul/script/gen_data.py @@ -42,7 +42,6 @@ def array_to_cstr(a, 
fmt=float): def emit_header_file(layer_type: str, **kwargs): - file_path = pathlib.Path(__file__).parent.parent / "data" file_path.mkdir(parents=True, exist_ok=True) emit_str = ( @@ -96,7 +95,7 @@ def emit_conv2d_layer(name="conv2d", **kwargs): layer_str += f'static double {name}_result[{oh}][{ow}][{co}] __attribute__((section(".data")));\n\n' layer_str += ( f"static double {name}_checksum[{oh}][{ow}] = " - + array_to_cstr(torch.sum(ofmap, dim=-1)) + + array_to_cstr(torch.sum(ofmap.float(), dim=-1)) # Safe sum for checksum + ";\n\n\n" ) layer_str += ( @@ -119,7 +118,6 @@ def emit_conv2d_layer(name="conv2d", **kwargs): def emit_linear_layer(input, weights, ofmap): - layer_str = "" return layer_str @@ -168,7 +166,7 @@ def emit_GEMM_layer(name="gemm", **kwargs): ) layer_str += ( f"static const {dtype} {name}_result[{m}*{n}] = " - + array_to_cstr(result) + + array_to_cstr(torch.sum(result.float(), dim=-1)) + ";\n\n\n" ) else: @@ -192,7 +190,6 @@ def emit_GEMM_layer(name="gemm", **kwargs): def emit_batchnorm_layer(name="batchnorm", **kwargs): - ifmap = kwargs["ifmap"] ofmap = kwargs["ofmap"] beta = kwargs["beta"] @@ -215,7 +212,7 @@ def emit_batchnorm_layer(name="batchnorm", **kwargs): layer_str += f'static double {name}_result[{oh}][{ow}][{co}] __attribute__((section(".data")));\n\n' layer_str += ( f"static double {name}_checksum[{oh}][{ow}] = " - + array_to_cstr(torch.sum(ofmap, dim=-1)) + + array_to_cstr(torch.sum(ofmap.float(), dim=-1)) + ";\n\n\n" ) layer_str += ( @@ -239,7 +236,6 @@ def emit_batchnorm_layer(name="batchnorm", **kwargs): def emit_maxpool_layer(name="maxpool", **kwargs): - ifmap = kwargs["ifmap"] ofmap = kwargs["ofmap"] k = kwargs["kernel_size"] @@ -263,7 +259,7 @@ def emit_maxpool_layer(name="maxpool", **kwargs): layer_str += f'static double {name}_result[{oh}][{ow}][{co}] __attribute__((section(".data")));\n\n' layer_str += ( f"static double {name}_checksum[{oh}][{ow}] = " - + array_to_cstr(torch.sum(ofmap, dim=-1)) + + 
array_to_cstr(torch.sum(ofmap.float(), dim=-1)) + ";\n\n\n" ) layer_str += ( @@ -281,7 +277,6 @@ def emit_maxpool_layer(name="maxpool", **kwargs): def emit_fusedconv(name="fusedconv", **kwargs): - ifmap = kwargs["ifmap"] kernel = kwargs["kernel"] bn_k = kwargs["bn_k"] @@ -363,70 +358,78 @@ def rand_data_generator(shape, prec, alt=False): return torch.randn(shape, requires_grad=False, dtype=torch.float32), {} elif prec == 16: if alt: - return torch.randn(shape, requires_grad=False, dtype=torch.bfloat16), {} + # Universal Fix: Generate FP32, cast to BF16 + return torch.randn(shape, requires_grad=False, dtype=torch.float32).to(torch.bfloat16), {} else: - return ( - torch.randn( - shape, requires_grad=False, dtype=torch.float16, device=device - ), - {}, - ) + # Universal Fix: Generate FP32, cast to FP16 + return torch.randn(shape, requires_grad=False, dtype=torch.float32, device=device).to(torch.float16), {} elif prec == 8: - sign = torch.randint( - 0, 2, shape, requires_grad=False, dtype=torch.uint8 - ) # -1 or 1 - exponent = torch.randint( - 0, 16, shape, requires_grad=False, dtype=torch.uint8 - ) # < 0b01111 - mantissa = torch.randint( - 0, 4, shape, requires_grad=False, dtype=torch.uint8 - ) # can be arbitrary + sign = torch.randint(0, 2, shape, requires_grad=False, dtype=torch.uint8) + exponent = torch.randint(0, 16, shape, requires_grad=False, dtype=torch.uint8) + mantissa = torch.randint(0, 4, shape, requires_grad=False, dtype=torch.uint8) bits = {"sign": sign, "exponent": exponent, "mantissa": mantissa} - # TODO: not actually correct return ((-1.0) ** sign.double()) * (2.0 ** (exponent.double() - 15.0)) * ( 1.0 + mantissa.double() / (2**2) ), bits def conv2d(ifmap, weights, padding=1, stride=1): + # Universal Fix: Upcast for CPU Math + orig_dtype = ifmap.dtype + ifmap, weights = ifmap.float(), weights.float() + n, ci, ih, iw = ifmap.shape co, _, fh, fw = weights.shape conv2d = nn.Conv2d(ci, co, (fh, fw), padding=((fh - 1) // 2, (fw - 1) // 2)) conv2d.weight = 
nn.Parameter(weights, requires_grad=False) - conv2d.bias = nn.Parameter( - torch.zeros_like(conv2d.bias, dtype=weights.dtype), requires_grad=False - ) + # Ensure bias natively generates in float32 + conv2d.bias = nn.Parameter(torch.zeros_like(conv2d.bias, dtype=torch.float32), requires_grad=False) + ofmap = conv2d(ifmap) - return ofmap + # Universal Fix: Downcast back + return ofmap.to(orig_dtype) def max_pooling(ifmap, kernel): + orig_dtype = ifmap.dtype + ifmap = ifmap.float() + n, ci, ih, iw = ifmap.shape max_pool = nn.MaxPool2d(kernel_size=kernel) ofmap = max_pool(ifmap) - return ofmap + return ofmap.to(orig_dtype) def batchnorm(ifmap): + orig_dtype = ifmap.dtype + ifmap = ifmap.float() + n, ci, ih, iw = ifmap.shape bn = torch.nn.BatchNorm2d(ci) bn.weight.requires_grad = False bn.bias.requires_grad = False - running_mean = torch.randn_like(bn.running_mean, requires_grad=False) - running_var = torch.rand_like(bn.running_var, requires_grad=False) + running_mean = torch.randn_like(bn.running_mean, requires_grad=False, dtype=torch.float32) + running_var = torch.rand_like(bn.running_var, requires_grad=False, dtype=torch.float32) + gamma = bn.weight / torch.sqrt(running_var + bn.eps) beta = bn.bias - running_mean * bn.weight / torch.sqrt(running_var + bn.eps) ofmap = ifmap * gamma.unsqueeze(-1).unsqueeze(-1) + beta.unsqueeze(-1).unsqueeze(-1) - return ofmap, gamma, beta + return ofmap.to(orig_dtype), gamma.to(orig_dtype), beta.to(orig_dtype) def fused_conv( ifmap, weights, bn_k, bn_l, padding, stride, bn, relu, accumulate, depthwise ): + # Universal Fix: Upcast EVERYTHING before starting the math + orig_dtype = ifmap.dtype + ifmap = ifmap.float() + weights = weights.float() + bn_k = bn_k.float() + bn_l = bn_l.float() ih, iw, ci = ifmap.shape if not depthwise: @@ -440,14 +443,13 @@ def fused_conv( iw + padding["padding_x_left"] + padding["padding_x_right"], ci, requires_grad=False, - dtype=ifmap.dtype, + dtype=ifmap.dtype, # Safely uses float32 ) ifmap_padded[ 
padding["padding_y_top"] : ih + padding["padding_y_top"], padding["padding_x_left"] : iw + padding["padding_x_left"], ] = ifmap - # Don't cover undefined behaviour when there are steps without a complete kernel window if (ifmap_padded.shape[0] - (fh - 1) - 1) % stride["stride_y"] != 0: print("Warning: rounding h output dimension") if (ifmap_padded.shape[1] - (fw - 1) - 1) % stride["stride_x"] != 0: @@ -458,16 +460,16 @@ def fused_conv( (ifmap_padded.shape[1] - (fw - 1) - 1) // stride["stride_x"] + 1, co, ) + if accumulate: - ofmap_before = torch.randn_like(ofmap, requires_grad=False) + ofmap_before = torch.randn_like(ofmap, requires_grad=False, dtype=torch.float32) else: - ofmap_before = torch.zeros_like(ofmap, requires_grad=False) + ofmap_before = torch.zeros_like(ofmap, requires_grad=False, dtype=torch.float32) if verbose: print(ifmap.shape, ifmap_padded.shape, ofmap.shape) if depthwise: - # depthwise Conv2d for h in range(0, ifmap_padded.shape[0] - (fh - 1), stride["stride_y"]): for w in range(0, ifmap_padded.shape[1] - (fw - 1), stride["stride_x"]): for c in range(co): @@ -478,7 +480,6 @@ def fused_conv( weights[:, :, c].flatten(), ) else: - # Conv2d for h in range(0, ifmap_padded.shape[0] - (fh - 1), stride["stride_y"]): for w in range(0, ifmap_padded.shape[1] - (fw - 1), stride["stride_x"]): for c in range(co): @@ -491,29 +492,22 @@ def fused_conv( ofmap += ofmap_before - # BatchNorm if bn: ofmap = ofmap * bn_k + bn_l - # ReLU if relu: ofmap = torch.nn.functional.relu(ofmap) - return ofmap, ofmap_before, ifmap_padded + # Universal Fix: Downcast back + return ofmap.to(orig_dtype), ofmap_before.to(orig_dtype), ifmap_padded.to(orig_dtype) def main(): - parser = argparse.ArgumentParser(description="Generate data for kernels") parser.add_argument( - "-c", - "--cfg", - type=pathlib.Path, - required=True, - help="Select param config file kernel", + "-c", "--cfg", type=pathlib.Path, required=True, help="Select param config file kernel", ) parser.add_argument("-v", 
"--verbose", action="store_true", help="Set verbose") - args = parser.parse_args() global verbose @@ -532,31 +526,18 @@ def main(): dtype = torch.float32 if param["kernel"] == "Conv2d": + # Generate safely in float32, then cast ifmap = torch.randn( - 1, - param["channels"]["in"], - param["input_dim"]["height"], - param["input_dim"]["width"], - requires_grad=False, - dtype=dtype, - ) + 1, param["channels"]["in"], param["input_dim"]["height"], param["input_dim"]["width"], + requires_grad=False, dtype=torch.float32, + ).to(dtype) weights = torch.randn( - param["channels"]["out"], - param["channels"]["in"], - param["filter"]["height"], - param["filter"]["width"], - requires_grad=False, - dtype=dtype, - ) + param["channels"]["out"], param["channels"]["in"], param["filter"]["height"], param["filter"]["width"], + requires_grad=False, dtype=torch.float32, + ).to(dtype) - ofmap = conv2d( - ifmap, - weights, - padding=param["filter"]["padding"], - stride=param["filter"]["stride"], - ) + ofmap = conv2d(ifmap, weights, padding=param["filter"]["padding"], stride=param["filter"]["stride"]) - # convert from CHW to HWC format ifmap = ifmap.permute(0, 2, 3, 1) ofmap = ofmap.permute(0, 2, 3, 1) weights = weights.permute(0, 2, 3, 1) @@ -568,7 +549,8 @@ def main(): mat_B, bits_B = rand_data_generator((param["K"], param["N"]), param["prec"]) mat_C, bits_C = rand_data_generator((param["M"], param["N"]), param["prec"]) - result = torch.matmul(mat_A, mat_B) + # Upcast for CPU Math + result = torch.matmul(mat_A.float(), mat_B.float()).to(dtype) if param["transpose_A"]: mat_A = mat_A.T @@ -592,22 +574,16 @@ def main(): "bits_B": bits_B, "bits_C": bits_C, } - emit_header_file("GEMM", **kwargs) elif param["kernel"] == "BatchNorm": ifmap = torch.randn( - 1, - param["channels"]["in"], - param["input_dim"]["height"], - param["input_dim"]["width"], - requires_grad=False, - dtype=dtype, - ) + 1, param["channels"]["in"], param["input_dim"]["height"], param["input_dim"]["width"], + 
requires_grad=False, dtype=torch.float32, + ).to(dtype) ofmap, gamma, beta = batchnorm(ifmap) - # convert from CHW to HWC format ifmap = ifmap.permute(0, 2, 3, 1) ofmap = ofmap.permute(0, 2, 3, 1) @@ -616,17 +592,12 @@ def main(): elif param["kernel"] == "MaxPool": ifmap = torch.randn( - 1, - param["channels"]["in"], - param["input_dim"]["height"], - param["input_dim"]["width"], - requires_grad=False, - dtype=dtype, - ) + 1, param["channels"]["in"], param["input_dim"]["height"], param["input_dim"]["width"], + requires_grad=False, dtype=torch.float32, + ).to(dtype) ofmap = max_pooling(ifmap, param["kernel_size"]) - # convert from CHW to HWC format ifmap = ifmap.permute(0, 2, 3, 1) ofmap = ofmap.permute(0, 2, 3, 1) @@ -635,44 +606,28 @@ def main(): elif param["kernel"] == "FusedConv": ifmap = torch.randn( - param["dim_in_y"], - param["dim_in_x"], - param["ch_in"], - requires_grad=False, - dtype=dtype, - ) + param["dim_in_y"], param["dim_in_x"], param["ch_in"], + requires_grad=False, dtype=torch.float32, + ).to(dtype) + if not param["depthwise"]: kernel = torch.randn( - param["ch_out"], - param["dim_kernel_y"], - param["dim_kernel_x"], - param["ch_in"], - requires_grad=False, - dtype=dtype, - ) + param["ch_out"], param["dim_kernel_y"], param["dim_kernel_x"], param["ch_in"], + requires_grad=False, dtype=torch.float32, + ).to(dtype) else: kernel = torch.randn( - param["dim_kernel_y"], - param["dim_kernel_x"], - param["ch_in"], - requires_grad=False, - dtype=dtype, - ) + param["dim_kernel_y"], param["dim_kernel_x"], param["ch_in"], + requires_grad=False, dtype=torch.float32, + ).to(dtype) - bn_k = torch.randn(param["ch_out"], requires_grad=False) - bn_l = torch.randn(param["ch_out"], requires_grad=False) + bn_k = torch.randn(param["ch_out"], requires_grad=False, dtype=torch.float32).to(dtype) + bn_l = torch.randn(param["ch_out"], requires_grad=False, dtype=torch.float32).to(dtype) ofmap, ofmap_before, ifmap_padded = fused_conv( - ifmap, - kernel, - bn_k, - bn_l, - 
param["padding"], - param["stride"], - param["flags"]["flag_batch_norm"], - param["flags"]["flag_relu"], - not param["flags"]["flag_y_accumulate_start"], - param["depthwise"], + ifmap, kernel, bn_k, bn_l, param["padding"], param["stride"], + param["flags"]["flag_batch_norm"], param["flags"]["flag_relu"], + not param["flags"]["flag_y_accumulate_start"], param["depthwise"], ) if param["chw_layer"]: @@ -700,6 +655,5 @@ def main(): else: print("No valid kernel selected") - if __name__ == "__main__": main() From e629968fda2a15f9eaff1316e4f71d221deba672 Mon Sep 17 00:00:00 2001 From: Diyou Shen Date: Fri, 24 Apr 2026 10:38:16 +0200 Subject: [PATCH 02/13] [SW] Add sparse-attention gemv kernel. Kernel includes two parts: 1. non-zero element finding; 2. calculation on non-zeros. --- sw/spatzBenchmarks/CMakeLists.txt | 3 + sw/spatzBenchmarks/sa-gemv/data/layer.h | 16 + sw/spatzBenchmarks/sa-gemv/kernel/sa-gemv.c | 165 ++++++++ sw/spatzBenchmarks/sa-gemv/kernel/sa-gemv.h | 26 ++ sw/spatzBenchmarks/sa-gemv/main.c | 393 ++++++++++++++++++ sw/spatzBenchmarks/sa-gemv/script/gen_data.py | 224 ++++++++++ .../script/sa_gemv_128_4096_512_16.json | 16 + .../sa-gemv/script/sa_gemv_256_128_16_64.json | 16 + 8 files changed, 859 insertions(+) create mode 100644 sw/spatzBenchmarks/sa-gemv/data/layer.h create mode 100644 sw/spatzBenchmarks/sa-gemv/kernel/sa-gemv.c create mode 100644 sw/spatzBenchmarks/sa-gemv/kernel/sa-gemv.h create mode 100644 sw/spatzBenchmarks/sa-gemv/main.c create mode 100644 sw/spatzBenchmarks/sa-gemv/script/gen_data.py create mode 100644 sw/spatzBenchmarks/sa-gemv/script/sa_gemv_128_4096_512_16.json create mode 100644 sw/spatzBenchmarks/sa-gemv/script/sa_gemv_256_128_16_64.json diff --git a/sw/spatzBenchmarks/CMakeLists.txt b/sw/spatzBenchmarks/CMakeLists.txt index 70f72e5c..7c7d5170 100644 --- a/sw/spatzBenchmarks/CMakeLists.txt +++ b/sw/spatzBenchmarks/CMakeLists.txt @@ -110,6 +110,7 @@ if (ELEN EQUAL 64) add_spatz_test_threeParam(dp-fmatmul dp-fmatmul/main.c 64 64 
64 ) add_spatz_test_twoParam_type(dp-gemv gemv/main.c 64 128 64) + add_spatz_test_threeParam_type(dp-sa-gemv sa-gemv/main.c 256 128 16 64) add_spatz_test_oneParam(dp-faxpy dp-faxpy/main.c 256) add_spatz_test_oneParam(dp-faxpy dp-faxpy/main.c 1024) @@ -121,6 +122,7 @@ if (ELEN EQUAL 64) add_spatz_test_threeParam(dp-fconv2d dp-fconv2d/main.c 64 64 7) add_spatz_test_twoParam(dp-fft dp-fft/main.c 128 2) + endif() add_spatz_test_threeParam(sp-fmatmul sp-fmatmul/main.c 64 64 64 ) @@ -150,6 +152,7 @@ add_spatz_test_threeParam(sdotp-bp-fmatmul sdotp-bp-fmatmul/main.c 128 256 128) add_spatz_test_twoParam_type(sp-gemv gemv/main.c 128 128 32) add_spatz_test_twoParam_type(hp-gemv gemv/main.c 256 128 16) +add_spatz_test_threeParam_type(hp-sa-gemv sa-gemv/main.c 128 4096 512 16) add_spatz_test_twoParam(sp-fft sp-fft/main.c 256 2) add_spatz_test_twoParam(sp-fft sp-fft/main.c 512 2) diff --git a/sw/spatzBenchmarks/sa-gemv/data/layer.h b/sw/spatzBenchmarks/sa-gemv/data/layer.h new file mode 100644 index 00000000..0a07ad53 --- /dev/null +++ b/sw/spatzBenchmarks/sa-gemv/data/layer.h @@ -0,0 +1,16 @@ +// Copyright 2025 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include + +typedef enum { FP64 = 8, FP32 = 4, FP16 = 2, FP8 = 1 } precision_t; + +typedef struct gemv_layer_struct { + uint32_t M; + uint32_t N; + + precision_t dtype; +} gemv_layer; diff --git a/sw/spatzBenchmarks/sa-gemv/kernel/sa-gemv.c b/sw/spatzBenchmarks/sa-gemv/kernel/sa-gemv.c new file mode 100644 index 00000000..4e641041 --- /dev/null +++ b/sw/spatzBenchmarks/sa-gemv/kernel/sa-gemv.c @@ -0,0 +1,165 @@ +// Copyright 2025 ETH Zurich and University of Bologna. +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Author: Navaneeth Kunhi Purayil, ETH Zurich +// Author: Diyou Shen, ETH Zurich + +#include "sa-gemv.h" + +void gemv_v64b_m4(double *a, double* b, double* c, int M, int M_core, int N) { + unsigned int vl, avl = M_core; + double *a_, *a_start = a; + double *c_ = c; + + do { + a_ = a_start; + double *b_ = b; + asm volatile("vsetvli %0, %1, e64, m4, ta, ma" : "=r"(vl) : "r"(avl)); + + // 1. CLEAR ACCUMULATORS for every new vl block (0 encodes to +0.0 float) + asm volatile("vmv.v.i v4, 0"); + asm volatile("vmv.v.i v12, 0"); + + int col = 0; + for (; col < N - 1; col += 2) { + asm volatile("vle64.v v0, (%0)" ::"r"(a_)); + a_ += M; + asm volatile("vfmacc.vf v4, %0, v0" ::"f"(*b_)); + b_++; + + asm volatile("vle64.v v8, (%0)" ::"r"(a_)); + a_ += M; + asm volatile("vfmacc.vf v12, %0, v8" ::"f"(*b_)); + b_++; + } + + // 2. HANDLE ODD N BOUNDARY + if (col < N) { + asm volatile("vle64.v v0, (%0)" ::"r"(a_)); + a_ += M; + asm volatile("vfmacc.vf v4, %0, v0" ::"f"(*b_)); + } + + asm volatile("vfadd.vv v4, v4, v12"); + + // 3. 
ACCUMULATE INTO MEMORY C (Load -> Add -> Store) + asm volatile("vle64.v v16, (%0)" ::"r"(c_)); + asm volatile("vfadd.vv v4, v4, v16"); + asm volatile("vse64.v v4, (%0)" ::"r"(c_)); + + avl -= vl; + c_ += vl; + a_start += vl; + } while (avl > 0); +} + +void gemv_v32b_m4(float *a, float* b, float* c, int M, int M_core, int N) { + unsigned int vl, avl = M_core; + float *a_, *a_start = a; + float *c_ = c; + + do { + a_ = a_start; + float *b_ = b; + asm volatile("vsetvli %0, %1, e32, m4, ta, ma" : "=r"(vl) : "r"(avl)); + + asm volatile("vmv.v.i v4, 0"); + asm volatile("vmv.v.i v12, 0"); + + int col = 0; + for (; col < N - 1; col += 2) { + asm volatile("vle32.v v0, (%0)" ::"r"(a_)); + a_ += M; + asm volatile("vfmacc.vf v4, %0, v0" ::"f"(*b_)); + b_++; + + asm volatile("vle32.v v8, (%0)" ::"r"(a_)); + a_ += M; + asm volatile("vfmacc.vf v12, %0, v8" ::"f"(*b_)); + b_++; + } + + if (col < N) { + asm volatile("vle32.v v0, (%0)" ::"r"(a_)); + a_ += M; + asm volatile("vfmacc.vf v4, %0, v0" ::"f"(*b_)); + } + + asm volatile("vfadd.vv v4, v4, v12"); + + asm volatile("vle32.v v16, (%0)" ::"r"(c_)); + asm volatile("vfadd.vv v4, v4, v16"); + // v12 was already folded into v4 by the vfadd above, so v4 holds
+ // the complete partial sums for this block; v12 must not be the
+ // source of the write-back. Store the accumulated result into C
+ // from the correct accumulator register:
+ asm volatile("vse32.v v4, (%0)" ::"r"(c_));
+ + avl -= vl; + c_ += vl; + a_start += vl; + } while (avl > 0); +} + +void gemv_v16b_m4(__fp16 *a, __fp16* b, __fp16* c, int M, int M_core, int N) { + unsigned int vl, avl = M_core; + __fp16 *a_, *a_start = a; + __fp16 *c_ = c; + + do { + a_ = a_start; + __fp16 *b_ = b; + asm volatile("vsetvli %0, %1, e16, m4, ta, ma" : "=r"(vl) : "r"(avl)); + + asm volatile("vmv.v.i v4, 0"); + asm volatile("vmv.v.i v12, 0"); + + int col = 0; + for (; col < N - 1; col += 2) { + asm volatile("vle16.v v0, (%0)" ::"r"(a_)); + a_ += M; + asm volatile("vle16.v v8, (%0)" ::"r"(a_)); + a_ += M; + + float t0, t1; + asm volatile("flh %[t], 0(%[b])" : [t] "=f"(t0) : [b] "r"(b_)); + asm volatile("vfmacc.vf v4, %0, v0" ::"f"(t0)); + b_++; + + asm volatile("flh %[t], 0(%[b])" : [t] "=f"(t1) : [b] "r"(b_)); + asm volatile("vfmacc.vf v12, %0, v8" ::"f"(t1)); + b_++; + } + + if (col < N) { + asm volatile("vle16.v v0, (%0)" ::"r"(a_)); + a_ += M; + float t0; + asm volatile("flh %[t], 0(%[b])" : [t] "=f"(t0) : [b] "r"(b_)); + asm volatile("vfmacc.vf v4, %0, v0" ::"f"(t0)); + } + + asm volatile("vfadd.vv v4, v4, v12"); + + asm volatile("vle16.v v16, (%0)" ::"r"(c_)); + asm volatile("vfadd.vv v4, v4, v16"); + asm volatile("vse16.v v4, (%0)" ::"r"(c_)); + + avl -= vl; + c_ += vl; + a_start += vl; + } while (avl > 0); +} diff --git a/sw/spatzBenchmarks/sa-gemv/kernel/sa-gemv.h b/sw/spatzBenchmarks/sa-gemv/kernel/sa-gemv.h new file mode 100644 index 00000000..1bf377e1 --- /dev/null +++ b/sw/spatzBenchmarks/sa-gemv/kernel/sa-gemv.h @@ -0,0 +1,26 @@ +// Copyright 2025 ETH Zurich and University of Bologna. +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Author: Navaneeth Kunhi Purayil, ETH Zurich + +#ifndef _GEMV_H +#define _GEMV_H + +void gemv_v64b_m4(double *a, double* b, double* c, int M, int M_core, int N); +void gemv_v32b_m4(float *a, float* b, float* c, int M, int M_core, int N); +void gemv_v16b_m4(__fp16 *a, __fp16* b, __fp16* c, int M, int M_core, int N); + +#endif diff --git a/sw/spatzBenchmarks/sa-gemv/main.c b/sw/spatzBenchmarks/sa-gemv/main.c new file mode 100644 index 00000000..e046c7df --- /dev/null +++ b/sw/spatzBenchmarks/sa-gemv/main.c @@ -0,0 +1,393 @@ +// Copyright 2025 ETH Zurich and University of Bologna. +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +// Author: Diyou Shen, ETH Zurich +// Author: Navaneeth Kunhi Purayil, ETH Zurich + +#include +#include +#include +#include + +#include DATAHEADER +#include "kernel/sa-gemv.c" + +#if (PREC == 64) +#define T double +#elif (PREC == 32) +#define T float +#elif (PREC == 16) +#define T __fp16 +#else +#define T double +#endif + +// Debugging defines +// #define DEBUG_NZ +// #define DEBUG_NZ_IDX +// #define DEBUG_GEMV_PreLD +// #define DEBUG_GEMV_DB + +T *vec_buf0; +T *vec_buf1; +uint16_t *dense_idx; +T *dense_vec; +T *mat_buf0; +T *mat_buf1; +T *result; + +static inline int fp_check(const T *a, const T *b) { + const T threshold = 0.001; + + // Absolute value + double comp = (double)*a - (double)*b; + if (comp < 0) + comp = -comp; + + return comp > threshold; +} + +int main() { + const unsigned int num_cores = snrt_cluster_core_num(); + const unsigned int cid = snrt_cluster_core_idx(); + + // Reset timer + unsigned int timer = (unsigned int)-1; + unsigned int timer_best = (unsigned int)-1; + unsigned int timer_nz = (unsigned int)-1; + const unsigned int m_core = gemv_l.M / num_cores; + // Size (in KiB) of L1 SPM, used to calculate tiling window + const unsigned int spm_size = 128; + + // For Sparse Attention GEMV, we need several steps + // 1. Find all non-zeros + // 2. Calculate the GEMV + // What to be double buffered? + // 1. 2 chunks of sparse vector + densed vector (output) + densed idx + // 2. 
2 chunks of matrix + densed vector (output) + densed idx + + // Sizes of each part we need + const uint32_t mat_size = sizeof(T) * gemv_l.M * tot_nz_dram; + const uint32_t row_size = sizeof(T) * gemv_l.M; + const uint32_t vec_size = sizeof(T) * gemv_l.N; + const uint32_t dense_vec_size = sizeof(T) * tot_nz_dram; + const uint32_t dense_idx_size = sizeof(uint16_t) * tot_nz_dram; + const uint32_t result_size = sizeof(T) * gemv_l.M; + + // leave 8 KiB for Stack + const uint32_t l1_size = (spm_size - 8) * 1024; + const uint32_t fixed_alloc_size = dense_vec_size + dense_idx_size + result_size; + + // --- BOUNDS CHECK 1: Do the fixed arrays fit in L1? --- + if (fixed_alloc_size >= l1_size) { + if (cid == 0) { + printf("FATAL: L1 Memory Overflow! Fixed arrays require %u bytes, but only %u bytes available.\n", + fixed_alloc_size, l1_size); + } + snrt_cluster_hw_barrier(); + return -1; // Exit gracefully + } + + const uint32_t l1_for_chunk = l1_size - fixed_alloc_size; + + // How many whole rows (or columns) can fit in half the L1 chunk space? + const uint32_t num_row_mat = (l1_for_chunk / 2) / row_size; + + // --- BOUNDS CHECK 2: Can we double buffer at least 1 row? --- + if (num_row_mat < 1) { + if (cid == 0) { + printf("FATAL: L1 Memory Overflow! Cannot fit at least 2 rows for double buffering. " + "Chunk space left: %u bytes, Row size: %u bytes.\n", + l1_for_chunk, row_size); + } + snrt_cluster_hw_barrier(); + return -1; // Exit gracefully + } + + // Always strictly split the available memory in half for double-buffering + const uint32_t vec_chunk_size = l1_for_chunk / 2; + const uint32_t num_vec_chunk = (l1_for_chunk > vec_size) ? 
1 : ((vec_size + vec_chunk_size - 1) / vec_chunk_size); + + // Recalculate exact chunk size based on whole rows + const uint32_t mat_chunk_size = num_row_mat * row_size; + + // Number of chunks based on the total non-zeros we need to process + const uint32_t num_mat_chunk = (tot_nz_dram + num_row_mat - 1) / num_row_mat; + + // Number of elements in each chunk + const uint32_t vec_chunk_len = vec_chunk_size / sizeof(T); + + + // Memory Allocation + if (cid == 0) { + result = (T *)snrt_l1alloc(result_size); + dense_vec = (T *)snrt_l1alloc(dense_vec_size); + vec_buf0 = (T *)snrt_l1alloc(l1_for_chunk); + dense_idx = (uint16_t *)snrt_l1alloc(dense_idx_size); + + // Offset by half of the size if needed by double buffering + vec_buf1 = vec_buf0 + vec_chunk_len; + + mat_buf0 = vec_buf0; + mat_buf1 = vec_buf1; + } + + // MUST zero out the memory accumulator! + if (cid == 0) { + for (unsigned int i = 0; i < gemv_l.M; i++) { + result[i] = 0.0; + } + } + snrt_cluster_hw_barrier(); + + if (cid == 0) + start_kernel(); + + timer = benchmark_get_cycle(); + + // Calculate internal pointers + T *vec_ptr = vec_buf0; + T *vec_db_ptr = vec_buf1; + + // Task 1: Find out the non-zeros + if (cid == 0) { + #ifdef DEBUG_NZ + printf("NZ-Calc PreLD\n"); + printf("DMA SRC:%p, TGT:%p, SIZE:%u\n", vec_ptr, gemv_vec_dram, vec_chunk_size); + #endif + snrt_dma_start_1d(vec_ptr, gemv_vec_dram, vec_chunk_size); + snrt_dma_wait_all(); + } + + uint32_t nz_count = 0; + + + if (cid == 0) { + for (unsigned int i = 0; i < num_vec_chunk; ++i) { + // Step 1.1: preload the next chunk if not the end + // Make sure the previous load completes + snrt_dma_wait_all(); + // Double buffer to search the next non-zero + uint32_t next_bytes = (vec_size - (i + 1) * vec_chunk_size < vec_chunk_size) + ? 
(vec_size - (i + 1) * vec_chunk_size) + : vec_chunk_size; + + if (i < num_vec_chunk - 1) { + #ifdef DEBUG_NZ + printf("NZ-Calc DB Iter%u\n", i); + printf("DMA SRC:%p, TGT:%p, SIZE:%u\n", + gemv_vec_dram + (i + 1) * vec_chunk_len, + vec_db_ptr, + next_bytes); + #endif + snrt_dma_start_1d(vec_db_ptr, + gemv_vec_dram + (i + 1) * vec_chunk_len, + next_bytes); // Use exact bytes + } + + for (unsigned int j = 0; j < vec_chunk_len; ++j) { + if ((double) vec_ptr[j] != 0.0) { + dense_vec[nz_count] = vec_ptr[j]; + dense_idx[nz_count] = i * vec_chunk_len + j; + nz_count++; + } + + if (nz_count == tot_nz_dram) + break; + } + + if (nz_count == tot_nz_dram) + break; + + if (i % 2 == 0) { + // pointer exchange + vec_ptr = vec_buf1; + vec_db_ptr = vec_buf0; + } else { + vec_ptr = vec_buf0; + vec_db_ptr = vec_buf1; + } + } + } + + snrt_cluster_hw_barrier(); + + #ifdef DEBUG_NZ + if (cid == 0) + printf("Non-Zero Calc Complete\n"); + #endif + + #ifdef DEBUG_NZ_IDX + if (cid == 0) { + for (uint32_t i = 0; i < tot_nz_dram; i++) { + printf("IDX[%u]=%u\n", i, dense_idx[i]); + } + } + #endif + + timer_nz = benchmark_get_cycle() - timer_nz; + timer = benchmark_get_cycle(); + + + // Task 2: GEMV calculation + // Calculate internal pointers + T *mat_ptr = mat_buf0; + T *mat_db_ptr = mat_buf1; + T *result_core = result + m_core * cid; + uint16_t *idx_ptr = dense_idx; // Corrected pointer type + + if (cid == 0) { + // Determine how many rows are actually active for this very first chunk + uint32_t active_rows = (tot_nz_dram < num_row_mat) ? 
tot_nz_dram : num_row_mat; + + #ifdef DEBUG_GEMV_PreLD + printf("GEMV PreLD\n"); + printf("Active Rows:%u\n", active_rows); + #endif + + for (unsigned int i = 0; i < active_rows; i++) { + #ifdef DEBUG + printf("Row:%u, SRC:%p, TGT:%p, SIZE:%u\n", + i, + gemv_mat_dram + (size_t)(*idx_ptr) * gemv_l.M, + mat_ptr + i * gemv_l.M, + row_size); + #endif + snrt_dma_start_1d(mat_ptr + i * gemv_l.M, // Pack linearly into L1 + gemv_mat_dram + (size_t)(*idx_ptr) * gemv_l.M, // Source from DRAM + row_size); + idx_ptr++; + } + } + + snrt_cluster_hw_barrier(); + + #ifdef DEBUG_GEMV_PreLD + if (cid == 0) + printf("GEMV PreLD Complete\n"); + #endif + + #ifdef DEBUG_GEMV_DB + if (cid == 0) + printf("Tot Chunks %u\n", num_mat_chunk); + #endif + + for (unsigned int chunk_idx = 0; chunk_idx < num_mat_chunk; chunk_idx++) { + // Wait for the CURRENT chunk to finish loading + if (cid == 0) { + snrt_dma_wait_all(); + } + snrt_cluster_hw_barrier(); + + // Determine bounds for the NEXT chunk (for background DMA) + uint32_t next_chunk_start = (chunk_idx + 1) * num_row_mat; + uint32_t next_active_rows = 0; + + if (next_chunk_start < tot_nz_dram) { + next_active_rows = (tot_nz_dram - next_chunk_start < num_row_mat) + ? (tot_nz_dram - next_chunk_start) + : num_row_mat; + } + + #ifdef DEBUG_GEMV_DB + if (cid == 0) + printf("Chunk%u, DB Rows%u\n", chunk_idx, next_active_rows); + #endif + + // Load NEXT chunk in the background + if (cid == 0 && next_active_rows > 0) { + for (unsigned int i = 0; i < next_active_rows; i++) { + #ifdef DEBUG_GEMV_DB + printf("Ptr:%p, Row:%u, SRC:%p, TGT:%p, SIZE:%u\n", + idx_ptr, + i, + gemv_mat_dram + (size_t)(*idx_ptr) * gemv_l.M, + mat_db_ptr + i * gemv_l.M, + row_size); + #endif + snrt_dma_start_1d(mat_db_ptr + i * gemv_l.M, + gemv_mat_dram + (size_t)(*idx_ptr) * gemv_l.M, + row_size); + idx_ptr++; + } + } + + // Calculate active rows for the CURRENT compute phase + uint32_t curr_active_rows = (tot_nz_dram - chunk_idx * num_row_mat < num_row_mat) + ? 
(tot_nz_dram - chunk_idx * num_row_mat) + : num_row_mat; + + // Calculate GEMV on the current chunk + T *current_dense_vec = dense_vec + chunk_idx * num_row_mat; + + // Offset the matrix pointer by m_core * cid so each core reads its correct rows + T *mat_core_ptr = mat_ptr + m_core * cid; + + #if (PREC == 64) + gemv_v64b_m4(mat_core_ptr, current_dense_vec, result_core, gemv_l.M, m_core, curr_active_rows); + #elif (PREC == 32) + gemv_v32b_m4(mat_core_ptr, current_dense_vec, result_core, gemv_l.M, m_core, curr_active_rows); + #else + gemv_v16b_m4(mat_core_ptr, current_dense_vec, result_core, gemv_l.M, m_core, curr_active_rows); + #endif + + + // Swap pointers for the next iteration + T *temp = mat_ptr; + mat_ptr = mat_db_ptr; + mat_db_ptr = temp; + } + + snrt_cluster_hw_barrier(); + + timer = benchmark_get_cycle() - timer; + + if (cid == 0) + stop_kernel(); + + // Result Checking + if (cid == 0) { + // Checking + for (unsigned int i = 0; i < gemv_l.M; i++) { + if (fp_check(&result[i], &gemv_result[i])) { + printf("Error: ID: %i Result = %f, Golden = %f\n", i, result[i], gemv_result[i]); + } + } + } + + snrt_cluster_hw_barrier(); + + + // Check and display results + // Assume 2 core 4 fpu configuration + if (cid == 0) { + // Flops per cycle + long unsigned int performance = 1000 * 2 * gemv_l.M * tot_nz_dram / timer; + // Ideal perf = MACC * NCore * Nfpu * Prec adjustment + long unsigned int utilization = + performance / (2 * num_cores * 4 * 8 / sizeof(T)); + + printf("\n----- (%d x %d) x (%d x 1) sa-gemv -----\n", gemv_l.M, gemv_l.N, tot_nz_dram); + printf("The NZ finding takes %u cycles.\n", timer_nz); + printf("The GEMV execution took %u cycles.\n", timer); + printf("The performance is %ld OP/1000cycle (%ld%%o utilization).\n", + performance, utilization); + } + + // Wait for core 0 to finish displaying results + snrt_cluster_hw_barrier(); + return 0; +} diff --git a/sw/spatzBenchmarks/sa-gemv/script/gen_data.py b/sw/spatzBenchmarks/sa-gemv/script/gen_data.py new 
file mode 100644 index 00000000..4daa6689 --- /dev/null +++ b/sw/spatzBenchmarks/sa-gemv/script/gen_data.py @@ -0,0 +1,224 @@ +#!/usr/bin/env python3 +# Copyright 2022 ETH Zurich and University of Bologna. +# Licensed under the Apache License, Version 2.0, see LICENSE for details. +# SPDX-License-Identifier: Apache-2.0 + +# Author: Matheus Cavalcante + +import numpy as np +import torch +import argparse +import pathlib +import hjson + +np.random.seed(42) +torch.manual_seed(42) + +global verbose + + +def array_to_cstr(a, fmt=float): + out = "{\n" + if fmt == float: + if isinstance(a, np.ndarray): + a = a.flat + if isinstance(a, torch.Tensor): + a = a.numpy().flat + for el in a: + out += "\t{},\n".format(el) + else: + for sign, exp, mant in zip( + a["sign"].numpy().flat, + a["exponent"].numpy().flat, + a["mantissa"].numpy().flat, + ): + value = sign * 2**7 + exp * 2**2 + mant + out += "0x{:02x},\n".format(value) + out = out[:-2] + "}" + return out + + +def emit_header_file(layer_type: str, **kwargs): + file_path = pathlib.Path(__file__).parent.parent / "data" + emit_str = ( + "// Copyright 2025 ETH Zurich and University of Bologna.\n" + + "// Licensed under the Apache License, Version 2.0, see LICENSE for details.\n" + + "// SPDX-License-Identifier: Apache-2.0\n\n" + + "// This file was generated automatically.\n\n" + ) + + file = file_path / ("data_" + str(kwargs["M"]) + "_" + str(kwargs["N"]) + "_" + str(kwargs["tot_nz"]) + "_" + str(kwargs["prec"]) + ".h") + emit_str += emit_gemv_layer(**kwargs) + with file.open("w") as f: + f.write(emit_str) + + +def emit_gemv_layer(name="gemv", **kwargs): + mat_A = kwargs["A"] + vec_B = kwargs["B"] + result = kwargs["result"] + + m = kwargs["M"] + n = kwargs["N"] + tot_nz = kwargs["tot_nz"] + + layer_str = "" + layer_str += '#include "layer.h"\n\n' + layer_str += f"const gemv_layer {name}_l = {{\n" + layer_str += f"\t.M = {m},\n" + layer_str += f"\t.N = {n},\n" + layer_str += f'\t.dtype = FP{kwargs["prec"]}' + layer_str += 
"};\n\n" + + # Export the total non-zeros directly so the kernel can use it + layer_str += f"const uint32_t tot_nz_dram = {tot_nz};\n\n" + + ctypes = {"64": "double", "32": "float", "16": "__fp16", "8": "char"} + + dtype = ctypes[str(kwargs["prec"])] + if dtype != "char": + layer_str += ( + f'static {dtype} {name}_mat_dram[{m}*{n}] __attribute__((section(".data"))) = ' + + array_to_cstr(mat_A) + + ";\n\n" + ) + layer_str += ( + f'static {dtype} {name}_vec_dram[{n}] __attribute__((section(".data"))) = ' + + array_to_cstr(vec_B) + + ";\n\n" + ) + layer_str += ( + f'static {dtype} {name}_result[{m}] __attribute__((section(".data"))) = ' + + array_to_cstr(result) + + ";\n" + ) + # Assuming you have variables like M (output size) and tot_nz (number of non-zeros) + layer_str += f'// Auto-generated buffers for Cache Mode\n' + layer_str += f'static uint16_t dense_idx_dram[{tot_nz}] __attribute__((section(".data"))) = {{0}};\n' + layer_str += f'static {dtype} dense_vec_dram[{tot_nz}] __attribute__((section(".data"))) = {{0.0}};\n' + layer_str += f'static {dtype} result_buf_dram[{m}] __attribute__((section(".data"))) = {{0.0}};\n' + else: + layer_str += ( + f"static {dtype} {name}_mat_dram[{m}*{n}] = " + + array_to_cstr(kwargs["bits_A"], fmt="char") + + ";\n\n\n" + ) + layer_str += ( + f"static {dtype} {name}_vec_dram[{n}] = " + + array_to_cstr(kwargs["bits_B"], fmt="char") + + ";\n\n\n" + ) + layer_str += ( + f"static {dtype} {name}_result[{m}] = " + + array_to_cstr(kwargs["result"], fmt="char") + + ";\n\n\n" + ) + layer_str += ( + f"static {dtype} {name}_result_buf_dram[{m}] =" + + array_to_cstr(kwargs["result"], fmt="char") + + ";\n\n\n" + ) + + return layer_str + + +def rand_data_generator(shape, prec, alt=False): + if prec == 64: + return torch.randn(shape, requires_grad=False, dtype=torch.float64), {} + elif prec == 32: + return torch.randn(shape, requires_grad=False, dtype=torch.float32), {} + elif prec == 16: + if alt: + # Generate in FP32, cast to BF16 + return 
torch.randn(shape, requires_grad=False, dtype=torch.float32).to(torch.bfloat16), {} + else: + # Generate in FP32, cast to FP16 + return torch.randn(shape, requires_grad=False, dtype=torch.float32).to(torch.float16), {} + elif prec == 8: + sign = torch.randint( + 0, 2, shape, requires_grad=False, dtype=torch.uint8 + ) # -1 or 1 + exponent = torch.randint( + 0, 16, shape, requires_grad=False, dtype=torch.uint8 + ) # < 0b01111 + mantissa = torch.randint( + 0, 4, shape, requires_grad=False, dtype=torch.uint8 + ) # can be arbitrary + bits = {"sign": sign, "exponent": exponent, "mantissa": mantissa} + # TODO: not actually correct + return ((-1.0) ** sign.double()) * (2.0 ** (exponent.double() - 15.0)) * ( + 1.0 + mantissa.double() / (2**2) + ), bits + + +def gemv(a, b): + print(a.shape, b.shape) + # Upcast to float32 for CPU math, then downcast back to the original dtype + return torch.matmul(a.float(), b.float()).to(a.dtype) + +def main(): + + parser = argparse.ArgumentParser(description="Generate data for kernels") + parser.add_argument( + "-c", + "--cfg", + type=pathlib.Path, + required=True, + help="Select param config file kernel", + ) + parser.add_argument("-v", "--verbose", action="store_true", help="Set verbose") + + args = parser.parse_args() + + global verbose + verbose = args.verbose + + with args.cfg.open() as f: + param = hjson.loads(f.read()) + + # Read tot_nz from the hjson file + tot_nz = param["tot_nz"] + + mat_A, bits_A = rand_data_generator((param["M"], param["N"]), param["prec"]) + vec_B, bits_B = rand_data_generator((param["N"], 1), param["prec"]) + + # --- Sparsity Logic --- + # Randomly select `tot_nz` indices to keep, set the rest to 0.0 + nz_indices = torch.randperm(param["N"])[:tot_nz] + mask = torch.zeros((param["N"], 1), dtype=torch.bool) + mask[nz_indices, 0] = True + + # Temporarily upcast to float32 for the masking math, then cast back + vec_B = (vec_B.float() * mask).to(vec_B.dtype) + + # Also zero out the raw bits if using 8-bit precision 
to maintain parity + if bool(bits_B): + for k in bits_B.keys(): + # Apply the mask, ensuring the shape matches the 1D bits array format + bits_B[k] = bits_B[k] * mask.squeeze().byte() + # ---------------------- + + # Calculate result using the now-sparse vector + result = gemv(mat_A, vec_B) + + # Store A in col major format + mat_A = mat_A.T + + kwargs = { + "A": mat_A, + "B": vec_B, + "result": result, + "M": param["M"], + "N": param["N"], + "tot_nz": tot_nz, # Pass the new parameter down + "prec": param["prec"], + "expand": param["expand"], + "bits_A": bits_A, + "bits_B": bits_B, + } + + emit_header_file("gemv", **kwargs) + + +if __name__ == "__main__": + main() diff --git a/sw/spatzBenchmarks/sa-gemv/script/sa_gemv_128_4096_512_16.json b/sw/spatzBenchmarks/sa-gemv/script/sa_gemv_128_4096_512_16.json new file mode 100644 index 00000000..e14c70e4 --- /dev/null +++ b/sw/spatzBenchmarks/sa-gemv/script/sa_gemv_128_4096_512_16.json @@ -0,0 +1,16 @@ +// Copyright 2020 ETH Zurich and University of Bologna. +// Solderpad Hardware License, Version 0.51, see LICENSE for details. +// SPDX-License-Identifier: SHL-0.51 + +// Parameters for a GEMV + +{ + kernel: "GEMV" + M: 128, + N: 4096, + tot_nz: 512 + transpose_A: false, + transpose_B: false, + prec: 16, + expand: 0 +} diff --git a/sw/spatzBenchmarks/sa-gemv/script/sa_gemv_256_128_16_64.json b/sw/spatzBenchmarks/sa-gemv/script/sa_gemv_256_128_16_64.json new file mode 100644 index 00000000..8942de89 --- /dev/null +++ b/sw/spatzBenchmarks/sa-gemv/script/sa_gemv_256_128_16_64.json @@ -0,0 +1,16 @@ +// Copyright 2020 ETH Zurich and University of Bologna. +// Solderpad Hardware License, Version 0.51, see LICENSE for details. 
+// SPDX-License-Identifier: SHL-0.51 + +// Parameters for a GEMV + +{ + kernel: "GEMV" + M: 256, + N: 128, + tot_nz: 16 + transpose_A: false, + transpose_B: false, + prec: 64, + expand: 0 +} From 69cdd8fd93b672850c9453764279e1e00b4d4cd2 Mon Sep 17 00:00:00 2001 From: Diyou Shen Date: Fri, 24 Apr 2026 10:50:45 +0200 Subject: [PATCH 03/13] [SW] Fix trailing whitespace . --- sw/spatzBenchmarks/sa-gemv/main.c | 26 +++++++++---------- sw/spatzBenchmarks/sa-gemv/script/gen_data.py | 2 +- 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/sw/spatzBenchmarks/sa-gemv/main.c b/sw/spatzBenchmarks/sa-gemv/main.c index e046c7df..d591806e 100644 --- a/sw/spatzBenchmarks/sa-gemv/main.c +++ b/sw/spatzBenchmarks/sa-gemv/main.c @@ -86,7 +86,7 @@ int main() { const uint32_t dense_vec_size = sizeof(T) * tot_nz_dram; const uint32_t dense_idx_size = sizeof(uint16_t) * tot_nz_dram; const uint32_t result_size = sizeof(T) * gemv_l.M; - + // leave 8 KiB for Stack const uint32_t l1_size = (spm_size - 8) * 1024; const uint32_t fixed_alloc_size = dense_vec_size + dense_idx_size + result_size; @@ -94,7 +94,7 @@ int main() { // --- BOUNDS CHECK 1: Do the fixed arrays fit in L1? --- if (fixed_alloc_size >= l1_size) { if (cid == 0) { - printf("FATAL: L1 Memory Overflow! Fixed arrays require %u bytes, but only %u bytes available.\n", + printf("FATAL: L1 Memory Overflow! Fixed arrays require %u bytes, but only %u bytes available.\n", fixed_alloc_size, l1_size); } snrt_cluster_hw_barrier(); @@ -110,7 +110,7 @@ int main() { if (num_row_mat < 1) { if (cid == 0) { printf("FATAL: L1 Memory Overflow! Cannot fit at least 2 rows for double buffering. 
" - "Chunk space left: %u bytes, Row size: %u bytes.\n", + "Chunk space left: %u bytes, Row size: %u bytes.\n", l1_for_chunk, row_size); } snrt_cluster_hw_barrier(); @@ -181,8 +181,8 @@ int main() { // Make sure the previous load completes snrt_dma_wait_all(); // Double buffer to search the next non-zero - uint32_t next_bytes = (vec_size - (i + 1) * vec_chunk_size < vec_chunk_size) - ? (vec_size - (i + 1) * vec_chunk_size) + uint32_t next_bytes = (vec_size - (i + 1) * vec_chunk_size < vec_chunk_size) + ? (vec_size - (i + 1) * vec_chunk_size) : vec_chunk_size; if (i < num_vec_chunk - 1) { @@ -257,7 +257,7 @@ int main() { printf("GEMV PreLD\n"); printf("Active Rows:%u\n", active_rows); #endif - + for (unsigned int i = 0; i < active_rows; i++) { #ifdef DEBUG printf("Row:%u, SRC:%p, TGT:%p, SIZE:%u\n", @@ -295,10 +295,10 @@ int main() { // Determine bounds for the NEXT chunk (for background DMA) uint32_t next_chunk_start = (chunk_idx + 1) * num_row_mat; uint32_t next_active_rows = 0; - + if (next_chunk_start < tot_nz_dram) { - next_active_rows = (tot_nz_dram - next_chunk_start < num_row_mat) - ? (tot_nz_dram - next_chunk_start) + next_active_rows = (tot_nz_dram - next_chunk_start < num_row_mat) + ? (tot_nz_dram - next_chunk_start) : num_row_mat; } @@ -318,16 +318,16 @@ int main() { mat_db_ptr + i * gemv_l.M, row_size); #endif - snrt_dma_start_1d(mat_db_ptr + i * gemv_l.M, - gemv_mat_dram + (size_t)(*idx_ptr) * gemv_l.M, + snrt_dma_start_1d(mat_db_ptr + i * gemv_l.M, + gemv_mat_dram + (size_t)(*idx_ptr) * gemv_l.M, row_size); idx_ptr++; } } // Calculate active rows for the CURRENT compute phase - uint32_t curr_active_rows = (tot_nz_dram - chunk_idx * num_row_mat < num_row_mat) - ? (tot_nz_dram - chunk_idx * num_row_mat) + uint32_t curr_active_rows = (tot_nz_dram - chunk_idx * num_row_mat < num_row_mat) + ? 
(tot_nz_dram - chunk_idx * num_row_mat) : num_row_mat; // Calculate GEMV on the current chunk diff --git a/sw/spatzBenchmarks/sa-gemv/script/gen_data.py b/sw/spatzBenchmarks/sa-gemv/script/gen_data.py index 4daa6689..8a3734e1 100644 --- a/sw/spatzBenchmarks/sa-gemv/script/gen_data.py +++ b/sw/spatzBenchmarks/sa-gemv/script/gen_data.py @@ -187,7 +187,7 @@ def main(): nz_indices = torch.randperm(param["N"])[:tot_nz] mask = torch.zeros((param["N"], 1), dtype=torch.bool) mask[nz_indices, 0] = True - + # Temporarily upcast to float32 for the masking math, then cast back vec_B = (vec_B.float() * mask).to(vec_B.dtype) From b8f60e702845f462b9e2004971a8ccb9bdf9b6b2 Mon Sep 17 00:00:00 2001 From: Diyou Shen Date: Fri, 24 Apr 2026 10:53:35 +0200 Subject: [PATCH 04/13] [SW] Change to use PRINTF for sa-gemv kernel. --- sw/spatzBenchmarks/sa-gemv/main.c | 40 +++++++++++++++---------------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/sw/spatzBenchmarks/sa-gemv/main.c b/sw/spatzBenchmarks/sa-gemv/main.c index d591806e..df2eb8ba 100644 --- a/sw/spatzBenchmarks/sa-gemv/main.c +++ b/sw/spatzBenchmarks/sa-gemv/main.c @@ -94,7 +94,7 @@ int main() { // --- BOUNDS CHECK 1: Do the fixed arrays fit in L1? --- if (fixed_alloc_size >= l1_size) { if (cid == 0) { - printf("FATAL: L1 Memory Overflow! Fixed arrays require %u bytes, but only %u bytes available.\n", + PRINTF("FATAL: L1 Memory Overflow! Fixed arrays require %u bytes, but only %u bytes available.\n", fixed_alloc_size, l1_size); } snrt_cluster_hw_barrier(); @@ -109,7 +109,7 @@ int main() { // --- BOUNDS CHECK 2: Can we double buffer at least 1 row? --- if (num_row_mat < 1) { if (cid == 0) { - printf("FATAL: L1 Memory Overflow! Cannot fit at least 2 rows for double buffering. " + PRINTF("FATAL: L1 Memory Overflow! Cannot fit at least 2 rows for double buffering. 
" "Chunk space left: %u bytes, Row size: %u bytes.\n", l1_for_chunk, row_size); } @@ -165,8 +165,8 @@ int main() { // Task 1: Find out the non-zeros if (cid == 0) { #ifdef DEBUG_NZ - printf("NZ-Calc PreLD\n"); - printf("DMA SRC:%p, TGT:%p, SIZE:%u\n", vec_ptr, gemv_vec_dram, vec_chunk_size); + PRINTF("NZ-Calc PreLD\n"); + PRINTF("DMA SRC:%p, TGT:%p, SIZE:%u\n", vec_ptr, gemv_vec_dram, vec_chunk_size); #endif snrt_dma_start_1d(vec_ptr, gemv_vec_dram, vec_chunk_size); snrt_dma_wait_all(); @@ -187,8 +187,8 @@ int main() { if (i < num_vec_chunk - 1) { #ifdef DEBUG_NZ - printf("NZ-Calc DB Iter%u\n", i); - printf("DMA SRC:%p, TGT:%p, SIZE:%u\n", + PRINTF("NZ-Calc DB Iter%u\n", i); + PRINTF("DMA SRC:%p, TGT:%p, SIZE:%u\n", gemv_vec_dram + (i + 1) * vec_chunk_len, vec_db_ptr, next_bytes); @@ -227,13 +227,13 @@ int main() { #ifdef DEBUG_NZ if (cid == 0) - printf("Non-Zero Calc Complete\n"); + PRINTF("Non-Zero Calc Complete\n"); #endif #ifdef DEBUG_NZ_IDX if (cid == 0) { for (uint32_t i = 0; i < tot_nz_dram; i++) { - printf("IDX[%u]=%u\n", i, dense_idx[i]); + PRINTF("IDX[%u]=%u\n", i, dense_idx[i]); } } #endif @@ -254,13 +254,13 @@ int main() { uint32_t active_rows = (tot_nz_dram < num_row_mat) ? 
tot_nz_dram : num_row_mat; #ifdef DEBUG_GEMV_PreLD - printf("GEMV PreLD\n"); - printf("Active Rows:%u\n", active_rows); + PRINTF("GEMV PreLD\n"); + PRINTF("Active Rows:%u\n", active_rows); #endif for (unsigned int i = 0; i < active_rows; i++) { #ifdef DEBUG - printf("Row:%u, SRC:%p, TGT:%p, SIZE:%u\n", + PRINTF("Row:%u, SRC:%p, TGT:%p, SIZE:%u\n", i, gemv_mat_dram + (size_t)(*idx_ptr) * gemv_l.M, mat_ptr + i * gemv_l.M, @@ -277,12 +277,12 @@ int main() { #ifdef DEBUG_GEMV_PreLD if (cid == 0) - printf("GEMV PreLD Complete\n"); + PRINTF("GEMV PreLD Complete\n"); #endif #ifdef DEBUG_GEMV_DB if (cid == 0) - printf("Tot Chunks %u\n", num_mat_chunk); + PRINTF("Tot Chunks %u\n", num_mat_chunk); #endif for (unsigned int chunk_idx = 0; chunk_idx < num_mat_chunk; chunk_idx++) { @@ -304,14 +304,14 @@ int main() { #ifdef DEBUG_GEMV_DB if (cid == 0) - printf("Chunk%u, DB Rows%u\n", chunk_idx, next_active_rows); + PRINTF("Chunk%u, DB Rows%u\n", chunk_idx, next_active_rows); #endif // Load NEXT chunk in the background if (cid == 0 && next_active_rows > 0) { for (unsigned int i = 0; i < next_active_rows; i++) { #ifdef DEBUG_GEMV_DB - printf("Ptr:%p, Row:%u, SRC:%p, TGT:%p, SIZE:%u\n", + PRINTF("Ptr:%p, Row:%u, SRC:%p, TGT:%p, SIZE:%u\n", idx_ptr, i, gemv_mat_dram + (size_t)(*idx_ptr) * gemv_l.M, @@ -363,7 +363,7 @@ int main() { // Checking for (unsigned int i = 0; i < gemv_l.M; i++) { if (fp_check(&result[i], &gemv_result[i])) { - printf("Error: ID: %i Result = %f, Golden = %f\n", i, result[i], gemv_result[i]); + PRINTF("Error: ID: %i Result = %f, Golden = %f\n", i, result[i], gemv_result[i]); } } } @@ -380,10 +380,10 @@ int main() { long unsigned int utilization = performance / (2 * num_cores * 4 * 8 / sizeof(T)); - printf("\n----- (%d x %d) x (%d x 1) sa-gemv -----\n", gemv_l.M, gemv_l.N, tot_nz_dram); - printf("The NZ finding takes %u cycles.\n", timer_nz); - printf("The GEMV execution took %u cycles.\n", timer); - printf("The performance is %ld OP/1000cycle (%ld%%o 
utilization).\n", + PRINTF("\n----- (%d x %d) x (%d x 1) sa-gemv -----\n", gemv_l.M, gemv_l.N, tot_nz_dram); + PRINTF("The NZ finding takes %u cycles.\n", timer_nz); + PRINTF("The GEMV execution took %u cycles.\n", timer); + PRINTF("The performance is %ld OP/1000cycle (%ld%%o utilization).\n", performance, utilization); } From dec39bb7a38ef9284bdff9f12aef24af03c7561a Mon Sep 17 00:00:00 2001 From: Diyou Shen Date: Fri, 24 Apr 2026 14:20:14 +0200 Subject: [PATCH 05/13] [SW] Fix a problem of gen_data.py. --- .../hp-fmatmul/script/gen_data.py | 209 +++++++++++++----- sw/spatzBenchmarks/sa-gemv/script/gen_data.py | 1 + .../sdotp-bp-fmatmul/script/gen_data.py | 172 +++++++++----- .../sdotp-hp-fmatmul/script/gen_data.py | 172 +++++++++----- .../widening-bp-fmatmul/script/gen_data.py | 172 +++++++++----- .../widening-hp-fmatmul/script/gen_data.py | 172 +++++++++----- 6 files changed, 634 insertions(+), 264 deletions(-) diff --git a/sw/spatzBenchmarks/hp-fmatmul/script/gen_data.py b/sw/spatzBenchmarks/hp-fmatmul/script/gen_data.py index f33c12b1..4da0e20d 100755 --- a/sw/spatzBenchmarks/hp-fmatmul/script/gen_data.py +++ b/sw/spatzBenchmarks/hp-fmatmul/script/gen_data.py @@ -26,6 +26,9 @@ def array_to_cstr(a, fmt=float): if isinstance(a, np.ndarray): a = a.flat if isinstance(a, torch.Tensor): + # Fallback to float32 if numpy struggles with bfloat16, otherwise leave alone + if a.dtype == torch.bfloat16: + a = a.float() a = a.cpu().numpy().flat for el in a: out += "{}, ".format(el) @@ -42,6 +45,7 @@ def array_to_cstr(a, fmt=float): def emit_header_file(layer_type: str, **kwargs): + file_path = pathlib.Path(__file__).parent.parent / "data" file_path.mkdir(parents=True, exist_ok=True) emit_str = ( @@ -95,7 +99,7 @@ def emit_conv2d_layer(name="conv2d", **kwargs): layer_str += f'static double {name}_result[{oh}][{ow}][{co}] __attribute__((section(".data")));\n\n' layer_str += ( f"static double {name}_checksum[{oh}][{ow}] = " - + array_to_cstr(torch.sum(ofmap.float(), dim=-1)) # 
Safe sum for checksum + + array_to_cstr(torch.sum(ofmap.float(), dim=-1)) + ";\n\n\n" ) layer_str += ( @@ -118,6 +122,7 @@ def emit_conv2d_layer(name="conv2d", **kwargs): def emit_linear_layer(input, weights, ofmap): + layer_str = "" return layer_str @@ -166,7 +171,7 @@ def emit_GEMM_layer(name="gemm", **kwargs): ) layer_str += ( f"static const {dtype} {name}_result[{m}*{n}] = " - + array_to_cstr(torch.sum(result.float(), dim=-1)) + + array_to_cstr(result) + ";\n\n\n" ) else: @@ -190,6 +195,7 @@ def emit_GEMM_layer(name="gemm", **kwargs): def emit_batchnorm_layer(name="batchnorm", **kwargs): + ifmap = kwargs["ifmap"] ofmap = kwargs["ofmap"] beta = kwargs["beta"] @@ -236,6 +242,7 @@ def emit_batchnorm_layer(name="batchnorm", **kwargs): def emit_maxpool_layer(name="maxpool", **kwargs): + ifmap = kwargs["ifmap"] ofmap = kwargs["ofmap"] k = kwargs["kernel_size"] @@ -277,6 +284,7 @@ def emit_maxpool_layer(name="maxpool", **kwargs): def emit_fusedconv(name="fusedconv", **kwargs): + ifmap = kwargs["ifmap"] kernel = kwargs["kernel"] bn_k = kwargs["bn_k"] @@ -358,43 +366,48 @@ def rand_data_generator(shape, prec, alt=False): return torch.randn(shape, requires_grad=False, dtype=torch.float32), {} elif prec == 16: if alt: - # Universal Fix: Generate FP32, cast to BF16 return torch.randn(shape, requires_grad=False, dtype=torch.float32).to(torch.bfloat16), {} else: - # Universal Fix: Generate FP32, cast to FP16 return torch.randn(shape, requires_grad=False, dtype=torch.float32, device=device).to(torch.float16), {} elif prec == 8: - sign = torch.randint(0, 2, shape, requires_grad=False, dtype=torch.uint8) - exponent = torch.randint(0, 16, shape, requires_grad=False, dtype=torch.uint8) - mantissa = torch.randint(0, 4, shape, requires_grad=False, dtype=torch.uint8) + sign = torch.randint( + 0, 2, shape, requires_grad=False, dtype=torch.uint8 + ) # -1 or 1 + exponent = torch.randint( + 0, 16, shape, requires_grad=False, dtype=torch.uint8 + ) # < 0b01111 + mantissa = torch.randint( 
+ 0, 4, shape, requires_grad=False, dtype=torch.uint8 + ) # can be arbitrary bits = {"sign": sign, "exponent": exponent, "mantissa": mantissa} + # TODO: not actually correct return ((-1.0) ** sign.double()) * (2.0 ** (exponent.double() - 15.0)) * ( 1.0 + mantissa.double() / (2**2) ), bits def conv2d(ifmap, weights, padding=1, stride=1): - # Universal Fix: Upcast for CPU Math orig_dtype = ifmap.dtype - ifmap, weights = ifmap.float(), weights.float() + if orig_dtype in [torch.float16, torch.bfloat16]: + ifmap, weights = ifmap.float(), weights.float() n, ci, ih, iw = ifmap.shape co, _, fh, fw = weights.shape conv2d = nn.Conv2d(ci, co, (fh, fw), padding=((fh - 1) // 2, (fw - 1) // 2)) conv2d.weight = nn.Parameter(weights, requires_grad=False) - # Ensure bias natively generates in float32 - conv2d.bias = nn.Parameter(torch.zeros_like(conv2d.bias, dtype=torch.float32), requires_grad=False) - + conv2d.bias = nn.Parameter( + torch.zeros_like(conv2d.bias, dtype=weights.dtype), requires_grad=False + ) ofmap = conv2d(ifmap) - # Universal Fix: Downcast back return ofmap.to(orig_dtype) def max_pooling(ifmap, kernel): orig_dtype = ifmap.dtype - ifmap = ifmap.float() + if orig_dtype in [torch.float16, torch.bfloat16]: + ifmap = ifmap.float() n, ci, ih, iw = ifmap.shape max_pool = nn.MaxPool2d(kernel_size=kernel) @@ -405,14 +418,16 @@ def max_pooling(ifmap, kernel): def batchnorm(ifmap): orig_dtype = ifmap.dtype - ifmap = ifmap.float() + if orig_dtype in [torch.float16, torch.bfloat16]: + ifmap = ifmap.float() n, ci, ih, iw = ifmap.shape bn = torch.nn.BatchNorm2d(ci) bn.weight.requires_grad = False bn.bias.requires_grad = False - running_mean = torch.randn_like(bn.running_mean, requires_grad=False, dtype=torch.float32) - running_var = torch.rand_like(bn.running_var, requires_grad=False, dtype=torch.float32) + + running_mean = torch.randn(bn.running_mean.shape, requires_grad=False, dtype=torch.float32).to(bn.running_mean.dtype) + running_var = torch.rand(bn.running_var.shape, 
requires_grad=False, dtype=torch.float32).to(bn.running_var.dtype) gamma = bn.weight / torch.sqrt(running_var + bn.eps) beta = bn.bias - running_mean * bn.weight / torch.sqrt(running_var + bn.eps) @@ -424,12 +439,10 @@ def batchnorm(ifmap): def fused_conv( ifmap, weights, bn_k, bn_l, padding, stride, bn, relu, accumulate, depthwise ): - # Universal Fix: Upcast EVERYTHING before starting the math orig_dtype = ifmap.dtype - ifmap = ifmap.float() - weights = weights.float() - bn_k = bn_k.float() - bn_l = bn_l.float() + is_half = orig_dtype in [torch.float16, torch.bfloat16] + if is_half: + ifmap, weights, bn_k, bn_l = ifmap.float(), weights.float(), bn_k.float(), bn_l.float() ih, iw, ci = ifmap.shape if not depthwise: @@ -443,13 +456,14 @@ def fused_conv( iw + padding["padding_x_left"] + padding["padding_x_right"], ci, requires_grad=False, - dtype=ifmap.dtype, # Safely uses float32 + dtype=ifmap.dtype, ) ifmap_padded[ padding["padding_y_top"] : ih + padding["padding_y_top"], padding["padding_x_left"] : iw + padding["padding_x_left"], ] = ifmap + # Don't cover undefined behaviour when there are steps without a complete kernel window if (ifmap_padded.shape[0] - (fh - 1) - 1) % stride["stride_y"] != 0: print("Warning: rounding h output dimension") if (ifmap_padded.shape[1] - (fw - 1) - 1) % stride["stride_x"] != 0: @@ -460,16 +474,16 @@ def fused_conv( (ifmap_padded.shape[1] - (fw - 1) - 1) // stride["stride_x"] + 1, co, ) - if accumulate: - ofmap_before = torch.randn_like(ofmap, requires_grad=False, dtype=torch.float32) + ofmap_before = torch.randn(ofmap.shape, requires_grad=False, dtype=torch.float32).to(ofmap.dtype) else: - ofmap_before = torch.zeros_like(ofmap, requires_grad=False, dtype=torch.float32) + ofmap_before = torch.zeros_like(ofmap, requires_grad=False) if verbose: print(ifmap.shape, ifmap_padded.shape, ofmap.shape) if depthwise: + # depthwise Conv2d for h in range(0, ifmap_padded.shape[0] - (fh - 1), stride["stride_y"]): for w in range(0, 
ifmap_padded.shape[1] - (fw - 1), stride["stride_x"]): for c in range(co): @@ -480,6 +494,7 @@ def fused_conv( weights[:, :, c].flatten(), ) else: + # Conv2d for h in range(0, ifmap_padded.shape[0] - (fh - 1), stride["stride_y"]): for w in range(0, ifmap_padded.shape[1] - (fw - 1), stride["stride_x"]): for c in range(co): @@ -492,22 +507,34 @@ def fused_conv( ofmap += ofmap_before + # BatchNorm if bn: ofmap = ofmap * bn_k + bn_l + # ReLU if relu: ofmap = torch.nn.functional.relu(ofmap) - # Universal Fix: Downcast back - return ofmap.to(orig_dtype), ofmap_before.to(orig_dtype), ifmap_padded.to(orig_dtype) + if is_half: + ofmap = ofmap.to(orig_dtype) + ofmap_before = ofmap_before.to(orig_dtype) + ifmap_padded = ifmap_padded.to(orig_dtype) + + return ofmap, ofmap_before, ifmap_padded def main(): + parser = argparse.ArgumentParser(description="Generate data for kernels") parser.add_argument( - "-c", "--cfg", type=pathlib.Path, required=True, help="Select param config file kernel", + "-c", + "--cfg", + type=pathlib.Path, + required=True, + help="Select param config file kernel", ) parser.add_argument("-v", "--verbose", action="store_true", help="Set verbose") + args = parser.parse_args() global verbose @@ -526,18 +553,31 @@ def main(): dtype = torch.float32 if param["kernel"] == "Conv2d": - # Generate safely in float32, then cast ifmap = torch.randn( - 1, param["channels"]["in"], param["input_dim"]["height"], param["input_dim"]["width"], - requires_grad=False, dtype=torch.float32, + 1, + param["channels"]["in"], + param["input_dim"]["height"], + param["input_dim"]["width"], + requires_grad=False, + dtype=torch.float32, ).to(dtype) weights = torch.randn( - param["channels"]["out"], param["channels"]["in"], param["filter"]["height"], param["filter"]["width"], - requires_grad=False, dtype=torch.float32, + param["channels"]["out"], + param["channels"]["in"], + param["filter"]["height"], + param["filter"]["width"], + requires_grad=False, + dtype=torch.float32, ).to(dtype) - 
ofmap = conv2d(ifmap, weights, padding=param["filter"]["padding"], stride=param["filter"]["stride"]) + ofmap = conv2d( + ifmap, + weights, + padding=param["filter"]["padding"], + stride=param["filter"]["stride"], + ) + # convert from CHW to HWC format ifmap = ifmap.permute(0, 2, 3, 1) ofmap = ofmap.permute(0, 2, 3, 1) weights = weights.permute(0, 2, 3, 1) @@ -549,8 +589,10 @@ def main(): mat_B, bits_B = rand_data_generator((param["K"], param["N"]), param["prec"]) mat_C, bits_C = rand_data_generator((param["M"], param["N"]), param["prec"]) - # Upcast for CPU Math - result = torch.matmul(mat_A.float(), mat_B.float()).to(dtype) + if mat_A.dtype in [torch.float16, torch.bfloat16]: + result = torch.matmul(mat_A.float(), mat_B.float()).to(mat_A.dtype) + else: + result = torch.matmul(mat_A, mat_B) if param["transpose_A"]: mat_A = mat_A.T @@ -558,22 +600,38 @@ def main(): mat_B = mat_B.T kwargs = { - "A": mat_A, "B": mat_B, "C": mat_C, "result": result, - "M": param["M"], "N": param["N"], "K": param["K"], - "ta": param["transpose_A"], "tb": param["transpose_B"], "alpha": param["alpha"], - "prec": param["prec"], "expand": param["expand"], - "bits_A": bits_A, "bits_B": bits_B, "bits_C": bits_C, + "A": mat_A, + "B": mat_B, + "C": mat_C, + "result": result, + "M": param["M"], + "N": param["N"], + "K": param["K"], + "ta": param["transpose_A"], + "tb": param["transpose_B"], + "alpha": param["alpha"], + "prec": param["prec"], + "expand": param["expand"], + "bits_A": bits_A, + "bits_B": bits_B, + "bits_C": bits_C, } + emit_header_file("GEMM", **kwargs) elif param["kernel"] == "BatchNorm": ifmap = torch.randn( - 1, param["channels"]["in"], param["input_dim"]["height"], param["input_dim"]["width"], - requires_grad=False, dtype=torch.float32, + 1, + param["channels"]["in"], + param["input_dim"]["height"], + param["input_dim"]["width"], + requires_grad=False, + dtype=torch.float32, ).to(dtype) ofmap, gamma, beta = batchnorm(ifmap) + # convert from CHW to HWC format ifmap = 
ifmap.permute(0, 2, 3, 1) ofmap = ofmap.permute(0, 2, 3, 1) @@ -582,12 +640,17 @@ def main(): elif param["kernel"] == "MaxPool": ifmap = torch.randn( - 1, param["channels"]["in"], param["input_dim"]["height"], param["input_dim"]["width"], - requires_grad=False, dtype=torch.float32, + 1, + param["channels"]["in"], + param["input_dim"]["height"], + param["input_dim"]["width"], + requires_grad=False, + dtype=torch.float32, ).to(dtype) ofmap = max_pooling(ifmap, param["kernel_size"]) + # convert from CHW to HWC format ifmap = ifmap.permute(0, 2, 3, 1) ofmap = ofmap.permute(0, 2, 3, 1) @@ -596,28 +659,44 @@ def main(): elif param["kernel"] == "FusedConv": ifmap = torch.randn( - param["dim_in_y"], param["dim_in_x"], param["ch_in"], - requires_grad=False, dtype=torch.float32, + param["dim_in_y"], + param["dim_in_x"], + param["ch_in"], + requires_grad=False, + dtype=torch.float32, ).to(dtype) - if not param["depthwise"]: kernel = torch.randn( - param["ch_out"], param["dim_kernel_y"], param["dim_kernel_x"], param["ch_in"], - requires_grad=False, dtype=torch.float32, + param["ch_out"], + param["dim_kernel_y"], + param["dim_kernel_x"], + param["ch_in"], + requires_grad=False, + dtype=torch.float32, ).to(dtype) else: kernel = torch.randn( - param["dim_kernel_y"], param["dim_kernel_x"], param["ch_in"], - requires_grad=False, dtype=torch.float32, + param["dim_kernel_y"], + param["dim_kernel_x"], + param["ch_in"], + requires_grad=False, + dtype=torch.float32, ).to(dtype) bn_k = torch.randn(param["ch_out"], requires_grad=False, dtype=torch.float32).to(dtype) bn_l = torch.randn(param["ch_out"], requires_grad=False, dtype=torch.float32).to(dtype) ofmap, ofmap_before, ifmap_padded = fused_conv( - ifmap, kernel, bn_k, bn_l, param["padding"], param["stride"], - param["flags"]["flag_batch_norm"], param["flags"]["flag_relu"], - not param["flags"]["flag_y_accumulate_start"], param["depthwise"], + ifmap, + kernel, + bn_k, + bn_l, + padding=param["padding"], + stride=param["stride"], + 
bn=param["flags"]["flag_batch_norm"], + relu=param["flags"]["flag_relu"], + accumulate=not param["flags"]["flag_y_accumulate_start"], + depthwise=param["depthwise"], ) if param["chw_layer"]: @@ -626,15 +705,25 @@ def main(): kernel = kernel.permute(0, 3, 1, 2) kwargs = { - "ifmap": ifmap, "ifmap_padded": ifmap_padded, "ofmap": ofmap, - "ofmap_before": ofmap_before, "kernel": kernel, "bn_k": bn_k, "bn_l": bn_l, - "padding": param["padding"], "stride": param["stride"], "prec": param["prec"], - "flags": param["flags"], "depthwise": param["depthwise"], "chw_layer": param["chw_layer"], + "ifmap": ifmap, + "ifmap_padded": ifmap_padded, + "ofmap": ofmap, + "ofmap_before": ofmap_before, + "kernel": kernel, + "bn_k": bn_k, + "bn_l": bn_l, + "padding": param["padding"], + "stride": param["stride"], + "prec": param["prec"], + "flags": param["flags"], + "depthwise": param["depthwise"], + "chw_layer": param["chw_layer"], } emit_header_file("FusedConv", **kwargs) else: print("No valid kernel selected") + if __name__ == "__main__": main() diff --git a/sw/spatzBenchmarks/sa-gemv/script/gen_data.py b/sw/spatzBenchmarks/sa-gemv/script/gen_data.py index 8a3734e1..ecc91071 100644 --- a/sw/spatzBenchmarks/sa-gemv/script/gen_data.py +++ b/sw/spatzBenchmarks/sa-gemv/script/gen_data.py @@ -156,6 +156,7 @@ def gemv(a, b): # Upcast to float32 for CPU math, then downcast back to the original dtype return torch.matmul(a.float(), b.float()).to(a.dtype) + def main(): parser = argparse.ArgumentParser(description="Generate data for kernels") diff --git a/sw/spatzBenchmarks/sdotp-bp-fmatmul/script/gen_data.py b/sw/spatzBenchmarks/sdotp-bp-fmatmul/script/gen_data.py index 0c500558..4da0e20d 100755 --- a/sw/spatzBenchmarks/sdotp-bp-fmatmul/script/gen_data.py +++ b/sw/spatzBenchmarks/sdotp-bp-fmatmul/script/gen_data.py @@ -26,6 +26,9 @@ def array_to_cstr(a, fmt=float): if isinstance(a, np.ndarray): a = a.flat if isinstance(a, torch.Tensor): + # Fallback to float32 if numpy struggles with bfloat16, 
otherwise leave alone + if a.dtype == torch.bfloat16: + a = a.float() a = a.cpu().numpy().flat for el in a: out += "{}, ".format(el) @@ -42,6 +45,7 @@ def array_to_cstr(a, fmt=float): def emit_header_file(layer_type: str, **kwargs): + file_path = pathlib.Path(__file__).parent.parent / "data" file_path.mkdir(parents=True, exist_ok=True) emit_str = ( @@ -95,7 +99,7 @@ def emit_conv2d_layer(name="conv2d", **kwargs): layer_str += f'static double {name}_result[{oh}][{ow}][{co}] __attribute__((section(".data")));\n\n' layer_str += ( f"static double {name}_checksum[{oh}][{ow}] = " - + array_to_cstr(torch.sum(ofmap.float(), dim=-1)) # Safe sum for checksum + + array_to_cstr(torch.sum(ofmap.float(), dim=-1)) + ";\n\n\n" ) layer_str += ( @@ -118,6 +122,7 @@ def emit_conv2d_layer(name="conv2d", **kwargs): def emit_linear_layer(input, weights, ofmap): + layer_str = "" return layer_str @@ -166,7 +171,7 @@ def emit_GEMM_layer(name="gemm", **kwargs): ) layer_str += ( f"static const {dtype} {name}_result[{m}*{n}] = " - + array_to_cstr(torch.sum(result.float(), dim=-1)) + + array_to_cstr(result) + ";\n\n\n" ) else: @@ -190,6 +195,7 @@ def emit_GEMM_layer(name="gemm", **kwargs): def emit_batchnorm_layer(name="batchnorm", **kwargs): + ifmap = kwargs["ifmap"] ofmap = kwargs["ofmap"] beta = kwargs["beta"] @@ -236,6 +242,7 @@ def emit_batchnorm_layer(name="batchnorm", **kwargs): def emit_maxpool_layer(name="maxpool", **kwargs): + ifmap = kwargs["ifmap"] ofmap = kwargs["ofmap"] k = kwargs["kernel_size"] @@ -277,6 +284,7 @@ def emit_maxpool_layer(name="maxpool", **kwargs): def emit_fusedconv(name="fusedconv", **kwargs): + ifmap = kwargs["ifmap"] kernel = kwargs["kernel"] bn_k = kwargs["bn_k"] @@ -358,43 +366,48 @@ def rand_data_generator(shape, prec, alt=False): return torch.randn(shape, requires_grad=False, dtype=torch.float32), {} elif prec == 16: if alt: - # Universal Fix: Generate FP32, cast to BF16 return torch.randn(shape, requires_grad=False, 
dtype=torch.float32).to(torch.bfloat16), {} else: - # Universal Fix: Generate FP32, cast to FP16 return torch.randn(shape, requires_grad=False, dtype=torch.float32, device=device).to(torch.float16), {} elif prec == 8: - sign = torch.randint(0, 2, shape, requires_grad=False, dtype=torch.uint8) - exponent = torch.randint(0, 16, shape, requires_grad=False, dtype=torch.uint8) - mantissa = torch.randint(0, 4, shape, requires_grad=False, dtype=torch.uint8) + sign = torch.randint( + 0, 2, shape, requires_grad=False, dtype=torch.uint8 + ) # -1 or 1 + exponent = torch.randint( + 0, 16, shape, requires_grad=False, dtype=torch.uint8 + ) # < 0b01111 + mantissa = torch.randint( + 0, 4, shape, requires_grad=False, dtype=torch.uint8 + ) # can be arbitrary bits = {"sign": sign, "exponent": exponent, "mantissa": mantissa} + # TODO: not actually correct return ((-1.0) ** sign.double()) * (2.0 ** (exponent.double() - 15.0)) * ( 1.0 + mantissa.double() / (2**2) ), bits def conv2d(ifmap, weights, padding=1, stride=1): - # Universal Fix: Upcast for CPU Math orig_dtype = ifmap.dtype - ifmap, weights = ifmap.float(), weights.float() + if orig_dtype in [torch.float16, torch.bfloat16]: + ifmap, weights = ifmap.float(), weights.float() n, ci, ih, iw = ifmap.shape co, _, fh, fw = weights.shape conv2d = nn.Conv2d(ci, co, (fh, fw), padding=((fh - 1) // 2, (fw - 1) // 2)) conv2d.weight = nn.Parameter(weights, requires_grad=False) - # Ensure bias natively generates in float32 - conv2d.bias = nn.Parameter(torch.zeros_like(conv2d.bias, dtype=torch.float32), requires_grad=False) - + conv2d.bias = nn.Parameter( + torch.zeros_like(conv2d.bias, dtype=weights.dtype), requires_grad=False + ) ofmap = conv2d(ifmap) - # Universal Fix: Downcast back return ofmap.to(orig_dtype) def max_pooling(ifmap, kernel): orig_dtype = ifmap.dtype - ifmap = ifmap.float() + if orig_dtype in [torch.float16, torch.bfloat16]: + ifmap = ifmap.float() n, ci, ih, iw = ifmap.shape max_pool = nn.MaxPool2d(kernel_size=kernel) @@ 
-405,14 +418,16 @@ def max_pooling(ifmap, kernel): def batchnorm(ifmap): orig_dtype = ifmap.dtype - ifmap = ifmap.float() + if orig_dtype in [torch.float16, torch.bfloat16]: + ifmap = ifmap.float() n, ci, ih, iw = ifmap.shape bn = torch.nn.BatchNorm2d(ci) bn.weight.requires_grad = False bn.bias.requires_grad = False - running_mean = torch.randn_like(bn.running_mean, requires_grad=False, dtype=torch.float32) - running_var = torch.rand_like(bn.running_var, requires_grad=False, dtype=torch.float32) + + running_mean = torch.randn(bn.running_mean.shape, requires_grad=False, dtype=torch.float32).to(bn.running_mean.dtype) + running_var = torch.rand(bn.running_var.shape, requires_grad=False, dtype=torch.float32).to(bn.running_var.dtype) gamma = bn.weight / torch.sqrt(running_var + bn.eps) beta = bn.bias - running_mean * bn.weight / torch.sqrt(running_var + bn.eps) @@ -424,12 +439,10 @@ def batchnorm(ifmap): def fused_conv( ifmap, weights, bn_k, bn_l, padding, stride, bn, relu, accumulate, depthwise ): - # Universal Fix: Upcast EVERYTHING before starting the math orig_dtype = ifmap.dtype - ifmap = ifmap.float() - weights = weights.float() - bn_k = bn_k.float() - bn_l = bn_l.float() + is_half = orig_dtype in [torch.float16, torch.bfloat16] + if is_half: + ifmap, weights, bn_k, bn_l = ifmap.float(), weights.float(), bn_k.float(), bn_l.float() ih, iw, ci = ifmap.shape if not depthwise: @@ -443,13 +456,14 @@ def fused_conv( iw + padding["padding_x_left"] + padding["padding_x_right"], ci, requires_grad=False, - dtype=ifmap.dtype, # Safely uses float32 + dtype=ifmap.dtype, ) ifmap_padded[ padding["padding_y_top"] : ih + padding["padding_y_top"], padding["padding_x_left"] : iw + padding["padding_x_left"], ] = ifmap + # Don't cover undefined behaviour when there are steps without a complete kernel window if (ifmap_padded.shape[0] - (fh - 1) - 1) % stride["stride_y"] != 0: print("Warning: rounding h output dimension") if (ifmap_padded.shape[1] - (fw - 1) - 1) % stride["stride_x"] != 
0: @@ -460,16 +474,16 @@ def fused_conv( (ifmap_padded.shape[1] - (fw - 1) - 1) // stride["stride_x"] + 1, co, ) - if accumulate: - ofmap_before = torch.randn_like(ofmap, requires_grad=False, dtype=torch.float32) + ofmap_before = torch.randn(ofmap.shape, requires_grad=False, dtype=torch.float32).to(ofmap.dtype) else: - ofmap_before = torch.zeros_like(ofmap, requires_grad=False, dtype=torch.float32) + ofmap_before = torch.zeros_like(ofmap, requires_grad=False) if verbose: print(ifmap.shape, ifmap_padded.shape, ofmap.shape) if depthwise: + # depthwise Conv2d for h in range(0, ifmap_padded.shape[0] - (fh - 1), stride["stride_y"]): for w in range(0, ifmap_padded.shape[1] - (fw - 1), stride["stride_x"]): for c in range(co): @@ -480,6 +494,7 @@ def fused_conv( weights[:, :, c].flatten(), ) else: + # Conv2d for h in range(0, ifmap_padded.shape[0] - (fh - 1), stride["stride_y"]): for w in range(0, ifmap_padded.shape[1] - (fw - 1), stride["stride_x"]): for c in range(co): @@ -492,22 +507,34 @@ def fused_conv( ofmap += ofmap_before + # BatchNorm if bn: ofmap = ofmap * bn_k + bn_l + # ReLU if relu: ofmap = torch.nn.functional.relu(ofmap) - # Universal Fix: Downcast back - return ofmap.to(orig_dtype), ofmap_before.to(orig_dtype), ifmap_padded.to(orig_dtype) + if is_half: + ofmap = ofmap.to(orig_dtype) + ofmap_before = ofmap_before.to(orig_dtype) + ifmap_padded = ifmap_padded.to(orig_dtype) + + return ofmap, ofmap_before, ifmap_padded def main(): + parser = argparse.ArgumentParser(description="Generate data for kernels") parser.add_argument( - "-c", "--cfg", type=pathlib.Path, required=True, help="Select param config file kernel", + "-c", + "--cfg", + type=pathlib.Path, + required=True, + help="Select param config file kernel", ) parser.add_argument("-v", "--verbose", action="store_true", help="Set verbose") + args = parser.parse_args() global verbose @@ -526,18 +553,31 @@ def main(): dtype = torch.float32 if param["kernel"] == "Conv2d": - # Generate safely in float32, then 
cast ifmap = torch.randn( - 1, param["channels"]["in"], param["input_dim"]["height"], param["input_dim"]["width"], - requires_grad=False, dtype=torch.float32, + 1, + param["channels"]["in"], + param["input_dim"]["height"], + param["input_dim"]["width"], + requires_grad=False, + dtype=torch.float32, ).to(dtype) weights = torch.randn( - param["channels"]["out"], param["channels"]["in"], param["filter"]["height"], param["filter"]["width"], - requires_grad=False, dtype=torch.float32, + param["channels"]["out"], + param["channels"]["in"], + param["filter"]["height"], + param["filter"]["width"], + requires_grad=False, + dtype=torch.float32, ).to(dtype) - ofmap = conv2d(ifmap, weights, padding=param["filter"]["padding"], stride=param["filter"]["stride"]) + ofmap = conv2d( + ifmap, + weights, + padding=param["filter"]["padding"], + stride=param["filter"]["stride"], + ) + # convert from CHW to HWC format ifmap = ifmap.permute(0, 2, 3, 1) ofmap = ofmap.permute(0, 2, 3, 1) weights = weights.permute(0, 2, 3, 1) @@ -549,8 +589,10 @@ def main(): mat_B, bits_B = rand_data_generator((param["K"], param["N"]), param["prec"]) mat_C, bits_C = rand_data_generator((param["M"], param["N"]), param["prec"]) - # Upcast for CPU Math - result = torch.matmul(mat_A.float(), mat_B.float()).to(dtype) + if mat_A.dtype in [torch.float16, torch.bfloat16]: + result = torch.matmul(mat_A.float(), mat_B.float()).to(mat_A.dtype) + else: + result = torch.matmul(mat_A, mat_B) if param["transpose_A"]: mat_A = mat_A.T @@ -574,16 +616,22 @@ def main(): "bits_B": bits_B, "bits_C": bits_C, } + emit_header_file("GEMM", **kwargs) elif param["kernel"] == "BatchNorm": ifmap = torch.randn( - 1, param["channels"]["in"], param["input_dim"]["height"], param["input_dim"]["width"], - requires_grad=False, dtype=torch.float32, + 1, + param["channels"]["in"], + param["input_dim"]["height"], + param["input_dim"]["width"], + requires_grad=False, + dtype=torch.float32, ).to(dtype) ofmap, gamma, beta = batchnorm(ifmap) + # 
convert from CHW to HWC format ifmap = ifmap.permute(0, 2, 3, 1) ofmap = ofmap.permute(0, 2, 3, 1) @@ -592,12 +640,17 @@ def main(): elif param["kernel"] == "MaxPool": ifmap = torch.randn( - 1, param["channels"]["in"], param["input_dim"]["height"], param["input_dim"]["width"], - requires_grad=False, dtype=torch.float32, + 1, + param["channels"]["in"], + param["input_dim"]["height"], + param["input_dim"]["width"], + requires_grad=False, + dtype=torch.float32, ).to(dtype) ofmap = max_pooling(ifmap, param["kernel_size"]) + # convert from CHW to HWC format ifmap = ifmap.permute(0, 2, 3, 1) ofmap = ofmap.permute(0, 2, 3, 1) @@ -606,28 +659,44 @@ def main(): elif param["kernel"] == "FusedConv": ifmap = torch.randn( - param["dim_in_y"], param["dim_in_x"], param["ch_in"], - requires_grad=False, dtype=torch.float32, + param["dim_in_y"], + param["dim_in_x"], + param["ch_in"], + requires_grad=False, + dtype=torch.float32, ).to(dtype) - if not param["depthwise"]: kernel = torch.randn( - param["ch_out"], param["dim_kernel_y"], param["dim_kernel_x"], param["ch_in"], - requires_grad=False, dtype=torch.float32, + param["ch_out"], + param["dim_kernel_y"], + param["dim_kernel_x"], + param["ch_in"], + requires_grad=False, + dtype=torch.float32, ).to(dtype) else: kernel = torch.randn( - param["dim_kernel_y"], param["dim_kernel_x"], param["ch_in"], - requires_grad=False, dtype=torch.float32, + param["dim_kernel_y"], + param["dim_kernel_x"], + param["ch_in"], + requires_grad=False, + dtype=torch.float32, ).to(dtype) bn_k = torch.randn(param["ch_out"], requires_grad=False, dtype=torch.float32).to(dtype) bn_l = torch.randn(param["ch_out"], requires_grad=False, dtype=torch.float32).to(dtype) ofmap, ofmap_before, ifmap_padded = fused_conv( - ifmap, kernel, bn_k, bn_l, param["padding"], param["stride"], - param["flags"]["flag_batch_norm"], param["flags"]["flag_relu"], - not param["flags"]["flag_y_accumulate_start"], param["depthwise"], + ifmap, + kernel, + bn_k, + bn_l, + 
padding=param["padding"], + stride=param["stride"], + bn=param["flags"]["flag_batch_norm"], + relu=param["flags"]["flag_relu"], + accumulate=not param["flags"]["flag_y_accumulate_start"], + depthwise=param["depthwise"], ) if param["chw_layer"]: @@ -655,5 +724,6 @@ def main(): else: print("No valid kernel selected") + if __name__ == "__main__": main() diff --git a/sw/spatzBenchmarks/sdotp-hp-fmatmul/script/gen_data.py b/sw/spatzBenchmarks/sdotp-hp-fmatmul/script/gen_data.py index 0c500558..4da0e20d 100755 --- a/sw/spatzBenchmarks/sdotp-hp-fmatmul/script/gen_data.py +++ b/sw/spatzBenchmarks/sdotp-hp-fmatmul/script/gen_data.py @@ -26,6 +26,9 @@ def array_to_cstr(a, fmt=float): if isinstance(a, np.ndarray): a = a.flat if isinstance(a, torch.Tensor): + # Fallback to float32 if numpy struggles with bfloat16, otherwise leave alone + if a.dtype == torch.bfloat16: + a = a.float() a = a.cpu().numpy().flat for el in a: out += "{}, ".format(el) @@ -42,6 +45,7 @@ def array_to_cstr(a, fmt=float): def emit_header_file(layer_type: str, **kwargs): + file_path = pathlib.Path(__file__).parent.parent / "data" file_path.mkdir(parents=True, exist_ok=True) emit_str = ( @@ -95,7 +99,7 @@ def emit_conv2d_layer(name="conv2d", **kwargs): layer_str += f'static double {name}_result[{oh}][{ow}][{co}] __attribute__((section(".data")));\n\n' layer_str += ( f"static double {name}_checksum[{oh}][{ow}] = " - + array_to_cstr(torch.sum(ofmap.float(), dim=-1)) # Safe sum for checksum + + array_to_cstr(torch.sum(ofmap.float(), dim=-1)) + ";\n\n\n" ) layer_str += ( @@ -118,6 +122,7 @@ def emit_conv2d_layer(name="conv2d", **kwargs): def emit_linear_layer(input, weights, ofmap): + layer_str = "" return layer_str @@ -166,7 +171,7 @@ def emit_GEMM_layer(name="gemm", **kwargs): ) layer_str += ( f"static const {dtype} {name}_result[{m}*{n}] = " - + array_to_cstr(torch.sum(result.float(), dim=-1)) + + array_to_cstr(result) + ";\n\n\n" ) else: @@ -190,6 +195,7 @@ def emit_GEMM_layer(name="gemm", **kwargs): def 
emit_batchnorm_layer(name="batchnorm", **kwargs): + ifmap = kwargs["ifmap"] ofmap = kwargs["ofmap"] beta = kwargs["beta"] @@ -236,6 +242,7 @@ def emit_batchnorm_layer(name="batchnorm", **kwargs): def emit_maxpool_layer(name="maxpool", **kwargs): + ifmap = kwargs["ifmap"] ofmap = kwargs["ofmap"] k = kwargs["kernel_size"] @@ -277,6 +284,7 @@ def emit_maxpool_layer(name="maxpool", **kwargs): def emit_fusedconv(name="fusedconv", **kwargs): + ifmap = kwargs["ifmap"] kernel = kwargs["kernel"] bn_k = kwargs["bn_k"] @@ -358,43 +366,48 @@ def rand_data_generator(shape, prec, alt=False): return torch.randn(shape, requires_grad=False, dtype=torch.float32), {} elif prec == 16: if alt: - # Universal Fix: Generate FP32, cast to BF16 return torch.randn(shape, requires_grad=False, dtype=torch.float32).to(torch.bfloat16), {} else: - # Universal Fix: Generate FP32, cast to FP16 return torch.randn(shape, requires_grad=False, dtype=torch.float32, device=device).to(torch.float16), {} elif prec == 8: - sign = torch.randint(0, 2, shape, requires_grad=False, dtype=torch.uint8) - exponent = torch.randint(0, 16, shape, requires_grad=False, dtype=torch.uint8) - mantissa = torch.randint(0, 4, shape, requires_grad=False, dtype=torch.uint8) + sign = torch.randint( + 0, 2, shape, requires_grad=False, dtype=torch.uint8 + ) # -1 or 1 + exponent = torch.randint( + 0, 16, shape, requires_grad=False, dtype=torch.uint8 + ) # < 0b01111 + mantissa = torch.randint( + 0, 4, shape, requires_grad=False, dtype=torch.uint8 + ) # can be arbitrary bits = {"sign": sign, "exponent": exponent, "mantissa": mantissa} + # TODO: not actually correct return ((-1.0) ** sign.double()) * (2.0 ** (exponent.double() - 15.0)) * ( 1.0 + mantissa.double() / (2**2) ), bits def conv2d(ifmap, weights, padding=1, stride=1): - # Universal Fix: Upcast for CPU Math orig_dtype = ifmap.dtype - ifmap, weights = ifmap.float(), weights.float() + if orig_dtype in [torch.float16, torch.bfloat16]: + ifmap, weights = ifmap.float(), 
weights.float() n, ci, ih, iw = ifmap.shape co, _, fh, fw = weights.shape conv2d = nn.Conv2d(ci, co, (fh, fw), padding=((fh - 1) // 2, (fw - 1) // 2)) conv2d.weight = nn.Parameter(weights, requires_grad=False) - # Ensure bias natively generates in float32 - conv2d.bias = nn.Parameter(torch.zeros_like(conv2d.bias, dtype=torch.float32), requires_grad=False) - + conv2d.bias = nn.Parameter( + torch.zeros_like(conv2d.bias, dtype=weights.dtype), requires_grad=False + ) ofmap = conv2d(ifmap) - # Universal Fix: Downcast back return ofmap.to(orig_dtype) def max_pooling(ifmap, kernel): orig_dtype = ifmap.dtype - ifmap = ifmap.float() + if orig_dtype in [torch.float16, torch.bfloat16]: + ifmap = ifmap.float() n, ci, ih, iw = ifmap.shape max_pool = nn.MaxPool2d(kernel_size=kernel) @@ -405,14 +418,16 @@ def max_pooling(ifmap, kernel): def batchnorm(ifmap): orig_dtype = ifmap.dtype - ifmap = ifmap.float() + if orig_dtype in [torch.float16, torch.bfloat16]: + ifmap = ifmap.float() n, ci, ih, iw = ifmap.shape bn = torch.nn.BatchNorm2d(ci) bn.weight.requires_grad = False bn.bias.requires_grad = False - running_mean = torch.randn_like(bn.running_mean, requires_grad=False, dtype=torch.float32) - running_var = torch.rand_like(bn.running_var, requires_grad=False, dtype=torch.float32) + + running_mean = torch.randn(bn.running_mean.shape, requires_grad=False, dtype=torch.float32).to(bn.running_mean.dtype) + running_var = torch.rand(bn.running_var.shape, requires_grad=False, dtype=torch.float32).to(bn.running_var.dtype) gamma = bn.weight / torch.sqrt(running_var + bn.eps) beta = bn.bias - running_mean * bn.weight / torch.sqrt(running_var + bn.eps) @@ -424,12 +439,10 @@ def batchnorm(ifmap): def fused_conv( ifmap, weights, bn_k, bn_l, padding, stride, bn, relu, accumulate, depthwise ): - # Universal Fix: Upcast EVERYTHING before starting the math orig_dtype = ifmap.dtype - ifmap = ifmap.float() - weights = weights.float() - bn_k = bn_k.float() - bn_l = bn_l.float() + is_half = orig_dtype 
in [torch.float16, torch.bfloat16] + if is_half: + ifmap, weights, bn_k, bn_l = ifmap.float(), weights.float(), bn_k.float(), bn_l.float() ih, iw, ci = ifmap.shape if not depthwise: @@ -443,13 +456,14 @@ def fused_conv( iw + padding["padding_x_left"] + padding["padding_x_right"], ci, requires_grad=False, - dtype=ifmap.dtype, # Safely uses float32 + dtype=ifmap.dtype, ) ifmap_padded[ padding["padding_y_top"] : ih + padding["padding_y_top"], padding["padding_x_left"] : iw + padding["padding_x_left"], ] = ifmap + # Don't cover undefined behaviour when there are steps without a complete kernel window if (ifmap_padded.shape[0] - (fh - 1) - 1) % stride["stride_y"] != 0: print("Warning: rounding h output dimension") if (ifmap_padded.shape[1] - (fw - 1) - 1) % stride["stride_x"] != 0: @@ -460,16 +474,16 @@ def fused_conv( (ifmap_padded.shape[1] - (fw - 1) - 1) // stride["stride_x"] + 1, co, ) - if accumulate: - ofmap_before = torch.randn_like(ofmap, requires_grad=False, dtype=torch.float32) + ofmap_before = torch.randn(ofmap.shape, requires_grad=False, dtype=torch.float32).to(ofmap.dtype) else: - ofmap_before = torch.zeros_like(ofmap, requires_grad=False, dtype=torch.float32) + ofmap_before = torch.zeros_like(ofmap, requires_grad=False) if verbose: print(ifmap.shape, ifmap_padded.shape, ofmap.shape) if depthwise: + # depthwise Conv2d for h in range(0, ifmap_padded.shape[0] - (fh - 1), stride["stride_y"]): for w in range(0, ifmap_padded.shape[1] - (fw - 1), stride["stride_x"]): for c in range(co): @@ -480,6 +494,7 @@ def fused_conv( weights[:, :, c].flatten(), ) else: + # Conv2d for h in range(0, ifmap_padded.shape[0] - (fh - 1), stride["stride_y"]): for w in range(0, ifmap_padded.shape[1] - (fw - 1), stride["stride_x"]): for c in range(co): @@ -492,22 +507,34 @@ def fused_conv( ofmap += ofmap_before + # BatchNorm if bn: ofmap = ofmap * bn_k + bn_l + # ReLU if relu: ofmap = torch.nn.functional.relu(ofmap) - # Universal Fix: Downcast back - return ofmap.to(orig_dtype), 
ofmap_before.to(orig_dtype), ifmap_padded.to(orig_dtype) + if is_half: + ofmap = ofmap.to(orig_dtype) + ofmap_before = ofmap_before.to(orig_dtype) + ifmap_padded = ifmap_padded.to(orig_dtype) + + return ofmap, ofmap_before, ifmap_padded def main(): + parser = argparse.ArgumentParser(description="Generate data for kernels") parser.add_argument( - "-c", "--cfg", type=pathlib.Path, required=True, help="Select param config file kernel", + "-c", + "--cfg", + type=pathlib.Path, + required=True, + help="Select param config file kernel", ) parser.add_argument("-v", "--verbose", action="store_true", help="Set verbose") + args = parser.parse_args() global verbose @@ -526,18 +553,31 @@ def main(): dtype = torch.float32 if param["kernel"] == "Conv2d": - # Generate safely in float32, then cast ifmap = torch.randn( - 1, param["channels"]["in"], param["input_dim"]["height"], param["input_dim"]["width"], - requires_grad=False, dtype=torch.float32, + 1, + param["channels"]["in"], + param["input_dim"]["height"], + param["input_dim"]["width"], + requires_grad=False, + dtype=torch.float32, ).to(dtype) weights = torch.randn( - param["channels"]["out"], param["channels"]["in"], param["filter"]["height"], param["filter"]["width"], - requires_grad=False, dtype=torch.float32, + param["channels"]["out"], + param["channels"]["in"], + param["filter"]["height"], + param["filter"]["width"], + requires_grad=False, + dtype=torch.float32, ).to(dtype) - ofmap = conv2d(ifmap, weights, padding=param["filter"]["padding"], stride=param["filter"]["stride"]) + ofmap = conv2d( + ifmap, + weights, + padding=param["filter"]["padding"], + stride=param["filter"]["stride"], + ) + # convert from CHW to HWC format ifmap = ifmap.permute(0, 2, 3, 1) ofmap = ofmap.permute(0, 2, 3, 1) weights = weights.permute(0, 2, 3, 1) @@ -549,8 +589,10 @@ def main(): mat_B, bits_B = rand_data_generator((param["K"], param["N"]), param["prec"]) mat_C, bits_C = rand_data_generator((param["M"], param["N"]), param["prec"]) - # Upcast 
for CPU Math - result = torch.matmul(mat_A.float(), mat_B.float()).to(dtype) + if mat_A.dtype in [torch.float16, torch.bfloat16]: + result = torch.matmul(mat_A.float(), mat_B.float()).to(mat_A.dtype) + else: + result = torch.matmul(mat_A, mat_B) if param["transpose_A"]: mat_A = mat_A.T @@ -574,16 +616,22 @@ def main(): "bits_B": bits_B, "bits_C": bits_C, } + emit_header_file("GEMM", **kwargs) elif param["kernel"] == "BatchNorm": ifmap = torch.randn( - 1, param["channels"]["in"], param["input_dim"]["height"], param["input_dim"]["width"], - requires_grad=False, dtype=torch.float32, + 1, + param["channels"]["in"], + param["input_dim"]["height"], + param["input_dim"]["width"], + requires_grad=False, + dtype=torch.float32, ).to(dtype) ofmap, gamma, beta = batchnorm(ifmap) + # convert from CHW to HWC format ifmap = ifmap.permute(0, 2, 3, 1) ofmap = ofmap.permute(0, 2, 3, 1) @@ -592,12 +640,17 @@ def main(): elif param["kernel"] == "MaxPool": ifmap = torch.randn( - 1, param["channels"]["in"], param["input_dim"]["height"], param["input_dim"]["width"], - requires_grad=False, dtype=torch.float32, + 1, + param["channels"]["in"], + param["input_dim"]["height"], + param["input_dim"]["width"], + requires_grad=False, + dtype=torch.float32, ).to(dtype) ofmap = max_pooling(ifmap, param["kernel_size"]) + # convert from CHW to HWC format ifmap = ifmap.permute(0, 2, 3, 1) ofmap = ofmap.permute(0, 2, 3, 1) @@ -606,28 +659,44 @@ def main(): elif param["kernel"] == "FusedConv": ifmap = torch.randn( - param["dim_in_y"], param["dim_in_x"], param["ch_in"], - requires_grad=False, dtype=torch.float32, + param["dim_in_y"], + param["dim_in_x"], + param["ch_in"], + requires_grad=False, + dtype=torch.float32, ).to(dtype) - if not param["depthwise"]: kernel = torch.randn( - param["ch_out"], param["dim_kernel_y"], param["dim_kernel_x"], param["ch_in"], - requires_grad=False, dtype=torch.float32, + param["ch_out"], + param["dim_kernel_y"], + param["dim_kernel_x"], + param["ch_in"], + 
requires_grad=False, + dtype=torch.float32, ).to(dtype) else: kernel = torch.randn( - param["dim_kernel_y"], param["dim_kernel_x"], param["ch_in"], - requires_grad=False, dtype=torch.float32, + param["dim_kernel_y"], + param["dim_kernel_x"], + param["ch_in"], + requires_grad=False, + dtype=torch.float32, ).to(dtype) bn_k = torch.randn(param["ch_out"], requires_grad=False, dtype=torch.float32).to(dtype) bn_l = torch.randn(param["ch_out"], requires_grad=False, dtype=torch.float32).to(dtype) ofmap, ofmap_before, ifmap_padded = fused_conv( - ifmap, kernel, bn_k, bn_l, param["padding"], param["stride"], - param["flags"]["flag_batch_norm"], param["flags"]["flag_relu"], - not param["flags"]["flag_y_accumulate_start"], param["depthwise"], + ifmap, + kernel, + bn_k, + bn_l, + padding=param["padding"], + stride=param["stride"], + bn=param["flags"]["flag_batch_norm"], + relu=param["flags"]["flag_relu"], + accumulate=not param["flags"]["flag_y_accumulate_start"], + depthwise=param["depthwise"], ) if param["chw_layer"]: @@ -655,5 +724,6 @@ def main(): else: print("No valid kernel selected") + if __name__ == "__main__": main() diff --git a/sw/spatzBenchmarks/widening-bp-fmatmul/script/gen_data.py b/sw/spatzBenchmarks/widening-bp-fmatmul/script/gen_data.py index 0c500558..4da0e20d 100755 --- a/sw/spatzBenchmarks/widening-bp-fmatmul/script/gen_data.py +++ b/sw/spatzBenchmarks/widening-bp-fmatmul/script/gen_data.py @@ -26,6 +26,9 @@ def array_to_cstr(a, fmt=float): if isinstance(a, np.ndarray): a = a.flat if isinstance(a, torch.Tensor): + # Fallback to float32 if numpy struggles with bfloat16, otherwise leave alone + if a.dtype == torch.bfloat16: + a = a.float() a = a.cpu().numpy().flat for el in a: out += "{}, ".format(el) @@ -42,6 +45,7 @@ def array_to_cstr(a, fmt=float): def emit_header_file(layer_type: str, **kwargs): + file_path = pathlib.Path(__file__).parent.parent / "data" file_path.mkdir(parents=True, exist_ok=True) emit_str = ( @@ -95,7 +99,7 @@ def 
emit_conv2d_layer(name="conv2d", **kwargs): layer_str += f'static double {name}_result[{oh}][{ow}][{co}] __attribute__((section(".data")));\n\n' layer_str += ( f"static double {name}_checksum[{oh}][{ow}] = " - + array_to_cstr(torch.sum(ofmap.float(), dim=-1)) # Safe sum for checksum + + array_to_cstr(torch.sum(ofmap.float(), dim=-1)) + ";\n\n\n" ) layer_str += ( @@ -118,6 +122,7 @@ def emit_conv2d_layer(name="conv2d", **kwargs): def emit_linear_layer(input, weights, ofmap): + layer_str = "" return layer_str @@ -166,7 +171,7 @@ def emit_GEMM_layer(name="gemm", **kwargs): ) layer_str += ( f"static const {dtype} {name}_result[{m}*{n}] = " - + array_to_cstr(torch.sum(result.float(), dim=-1)) + + array_to_cstr(result) + ";\n\n\n" ) else: @@ -190,6 +195,7 @@ def emit_GEMM_layer(name="gemm", **kwargs): def emit_batchnorm_layer(name="batchnorm", **kwargs): + ifmap = kwargs["ifmap"] ofmap = kwargs["ofmap"] beta = kwargs["beta"] @@ -236,6 +242,7 @@ def emit_batchnorm_layer(name="batchnorm", **kwargs): def emit_maxpool_layer(name="maxpool", **kwargs): + ifmap = kwargs["ifmap"] ofmap = kwargs["ofmap"] k = kwargs["kernel_size"] @@ -277,6 +284,7 @@ def emit_maxpool_layer(name="maxpool", **kwargs): def emit_fusedconv(name="fusedconv", **kwargs): + ifmap = kwargs["ifmap"] kernel = kwargs["kernel"] bn_k = kwargs["bn_k"] @@ -358,43 +366,48 @@ def rand_data_generator(shape, prec, alt=False): return torch.randn(shape, requires_grad=False, dtype=torch.float32), {} elif prec == 16: if alt: - # Universal Fix: Generate FP32, cast to BF16 return torch.randn(shape, requires_grad=False, dtype=torch.float32).to(torch.bfloat16), {} else: - # Universal Fix: Generate FP32, cast to FP16 return torch.randn(shape, requires_grad=False, dtype=torch.float32, device=device).to(torch.float16), {} elif prec == 8: - sign = torch.randint(0, 2, shape, requires_grad=False, dtype=torch.uint8) - exponent = torch.randint(0, 16, shape, requires_grad=False, dtype=torch.uint8) - mantissa = torch.randint(0, 4, 
shape, requires_grad=False, dtype=torch.uint8) + sign = torch.randint( + 0, 2, shape, requires_grad=False, dtype=torch.uint8 + ) # -1 or 1 + exponent = torch.randint( + 0, 16, shape, requires_grad=False, dtype=torch.uint8 + ) # < 0b01111 + mantissa = torch.randint( + 0, 4, shape, requires_grad=False, dtype=torch.uint8 + ) # can be arbitrary bits = {"sign": sign, "exponent": exponent, "mantissa": mantissa} + # TODO: not actually correct return ((-1.0) ** sign.double()) * (2.0 ** (exponent.double() - 15.0)) * ( 1.0 + mantissa.double() / (2**2) ), bits def conv2d(ifmap, weights, padding=1, stride=1): - # Universal Fix: Upcast for CPU Math orig_dtype = ifmap.dtype - ifmap, weights = ifmap.float(), weights.float() + if orig_dtype in [torch.float16, torch.bfloat16]: + ifmap, weights = ifmap.float(), weights.float() n, ci, ih, iw = ifmap.shape co, _, fh, fw = weights.shape conv2d = nn.Conv2d(ci, co, (fh, fw), padding=((fh - 1) // 2, (fw - 1) // 2)) conv2d.weight = nn.Parameter(weights, requires_grad=False) - # Ensure bias natively generates in float32 - conv2d.bias = nn.Parameter(torch.zeros_like(conv2d.bias, dtype=torch.float32), requires_grad=False) - + conv2d.bias = nn.Parameter( + torch.zeros_like(conv2d.bias, dtype=weights.dtype), requires_grad=False + ) ofmap = conv2d(ifmap) - # Universal Fix: Downcast back return ofmap.to(orig_dtype) def max_pooling(ifmap, kernel): orig_dtype = ifmap.dtype - ifmap = ifmap.float() + if orig_dtype in [torch.float16, torch.bfloat16]: + ifmap = ifmap.float() n, ci, ih, iw = ifmap.shape max_pool = nn.MaxPool2d(kernel_size=kernel) @@ -405,14 +418,16 @@ def max_pooling(ifmap, kernel): def batchnorm(ifmap): orig_dtype = ifmap.dtype - ifmap = ifmap.float() + if orig_dtype in [torch.float16, torch.bfloat16]: + ifmap = ifmap.float() n, ci, ih, iw = ifmap.shape bn = torch.nn.BatchNorm2d(ci) bn.weight.requires_grad = False bn.bias.requires_grad = False - running_mean = torch.randn_like(bn.running_mean, requires_grad=False, dtype=torch.float32) 
- running_var = torch.rand_like(bn.running_var, requires_grad=False, dtype=torch.float32) + + running_mean = torch.randn(bn.running_mean.shape, requires_grad=False, dtype=torch.float32).to(bn.running_mean.dtype) + running_var = torch.rand(bn.running_var.shape, requires_grad=False, dtype=torch.float32).to(bn.running_var.dtype) gamma = bn.weight / torch.sqrt(running_var + bn.eps) beta = bn.bias - running_mean * bn.weight / torch.sqrt(running_var + bn.eps) @@ -424,12 +439,10 @@ def batchnorm(ifmap): def fused_conv( ifmap, weights, bn_k, bn_l, padding, stride, bn, relu, accumulate, depthwise ): - # Universal Fix: Upcast EVERYTHING before starting the math orig_dtype = ifmap.dtype - ifmap = ifmap.float() - weights = weights.float() - bn_k = bn_k.float() - bn_l = bn_l.float() + is_half = orig_dtype in [torch.float16, torch.bfloat16] + if is_half: + ifmap, weights, bn_k, bn_l = ifmap.float(), weights.float(), bn_k.float(), bn_l.float() ih, iw, ci = ifmap.shape if not depthwise: @@ -443,13 +456,14 @@ def fused_conv( iw + padding["padding_x_left"] + padding["padding_x_right"], ci, requires_grad=False, - dtype=ifmap.dtype, # Safely uses float32 + dtype=ifmap.dtype, ) ifmap_padded[ padding["padding_y_top"] : ih + padding["padding_y_top"], padding["padding_x_left"] : iw + padding["padding_x_left"], ] = ifmap + # Don't cover undefined behaviour when there are steps without a complete kernel window if (ifmap_padded.shape[0] - (fh - 1) - 1) % stride["stride_y"] != 0: print("Warning: rounding h output dimension") if (ifmap_padded.shape[1] - (fw - 1) - 1) % stride["stride_x"] != 0: @@ -460,16 +474,16 @@ def fused_conv( (ifmap_padded.shape[1] - (fw - 1) - 1) // stride["stride_x"] + 1, co, ) - if accumulate: - ofmap_before = torch.randn_like(ofmap, requires_grad=False, dtype=torch.float32) + ofmap_before = torch.randn(ofmap.shape, requires_grad=False, dtype=torch.float32).to(ofmap.dtype) else: - ofmap_before = torch.zeros_like(ofmap, requires_grad=False, dtype=torch.float32) + 
ofmap_before = torch.zeros_like(ofmap, requires_grad=False) if verbose: print(ifmap.shape, ifmap_padded.shape, ofmap.shape) if depthwise: + # depthwise Conv2d for h in range(0, ifmap_padded.shape[0] - (fh - 1), stride["stride_y"]): for w in range(0, ifmap_padded.shape[1] - (fw - 1), stride["stride_x"]): for c in range(co): @@ -480,6 +494,7 @@ def fused_conv( weights[:, :, c].flatten(), ) else: + # Conv2d for h in range(0, ifmap_padded.shape[0] - (fh - 1), stride["stride_y"]): for w in range(0, ifmap_padded.shape[1] - (fw - 1), stride["stride_x"]): for c in range(co): @@ -492,22 +507,34 @@ def fused_conv( ofmap += ofmap_before + # BatchNorm if bn: ofmap = ofmap * bn_k + bn_l + # ReLU if relu: ofmap = torch.nn.functional.relu(ofmap) - # Universal Fix: Downcast back - return ofmap.to(orig_dtype), ofmap_before.to(orig_dtype), ifmap_padded.to(orig_dtype) + if is_half: + ofmap = ofmap.to(orig_dtype) + ofmap_before = ofmap_before.to(orig_dtype) + ifmap_padded = ifmap_padded.to(orig_dtype) + + return ofmap, ofmap_before, ifmap_padded def main(): + parser = argparse.ArgumentParser(description="Generate data for kernels") parser.add_argument( - "-c", "--cfg", type=pathlib.Path, required=True, help="Select param config file kernel", + "-c", + "--cfg", + type=pathlib.Path, + required=True, + help="Select param config file kernel", ) parser.add_argument("-v", "--verbose", action="store_true", help="Set verbose") + args = parser.parse_args() global verbose @@ -526,18 +553,31 @@ def main(): dtype = torch.float32 if param["kernel"] == "Conv2d": - # Generate safely in float32, then cast ifmap = torch.randn( - 1, param["channels"]["in"], param["input_dim"]["height"], param["input_dim"]["width"], - requires_grad=False, dtype=torch.float32, + 1, + param["channels"]["in"], + param["input_dim"]["height"], + param["input_dim"]["width"], + requires_grad=False, + dtype=torch.float32, ).to(dtype) weights = torch.randn( - param["channels"]["out"], param["channels"]["in"], 
param["filter"]["height"], param["filter"]["width"], - requires_grad=False, dtype=torch.float32, + param["channels"]["out"], + param["channels"]["in"], + param["filter"]["height"], + param["filter"]["width"], + requires_grad=False, + dtype=torch.float32, ).to(dtype) - ofmap = conv2d(ifmap, weights, padding=param["filter"]["padding"], stride=param["filter"]["stride"]) + ofmap = conv2d( + ifmap, + weights, + padding=param["filter"]["padding"], + stride=param["filter"]["stride"], + ) + # convert from CHW to HWC format ifmap = ifmap.permute(0, 2, 3, 1) ofmap = ofmap.permute(0, 2, 3, 1) weights = weights.permute(0, 2, 3, 1) @@ -549,8 +589,10 @@ def main(): mat_B, bits_B = rand_data_generator((param["K"], param["N"]), param["prec"]) mat_C, bits_C = rand_data_generator((param["M"], param["N"]), param["prec"]) - # Upcast for CPU Math - result = torch.matmul(mat_A.float(), mat_B.float()).to(dtype) + if mat_A.dtype in [torch.float16, torch.bfloat16]: + result = torch.matmul(mat_A.float(), mat_B.float()).to(mat_A.dtype) + else: + result = torch.matmul(mat_A, mat_B) if param["transpose_A"]: mat_A = mat_A.T @@ -574,16 +616,22 @@ def main(): "bits_B": bits_B, "bits_C": bits_C, } + emit_header_file("GEMM", **kwargs) elif param["kernel"] == "BatchNorm": ifmap = torch.randn( - 1, param["channels"]["in"], param["input_dim"]["height"], param["input_dim"]["width"], - requires_grad=False, dtype=torch.float32, + 1, + param["channels"]["in"], + param["input_dim"]["height"], + param["input_dim"]["width"], + requires_grad=False, + dtype=torch.float32, ).to(dtype) ofmap, gamma, beta = batchnorm(ifmap) + # convert from CHW to HWC format ifmap = ifmap.permute(0, 2, 3, 1) ofmap = ofmap.permute(0, 2, 3, 1) @@ -592,12 +640,17 @@ def main(): elif param["kernel"] == "MaxPool": ifmap = torch.randn( - 1, param["channels"]["in"], param["input_dim"]["height"], param["input_dim"]["width"], - requires_grad=False, dtype=torch.float32, + 1, + param["channels"]["in"], + param["input_dim"]["height"], + 
param["input_dim"]["width"], + requires_grad=False, + dtype=torch.float32, ).to(dtype) ofmap = max_pooling(ifmap, param["kernel_size"]) + # convert from CHW to HWC format ifmap = ifmap.permute(0, 2, 3, 1) ofmap = ofmap.permute(0, 2, 3, 1) @@ -606,28 +659,44 @@ def main(): elif param["kernel"] == "FusedConv": ifmap = torch.randn( - param["dim_in_y"], param["dim_in_x"], param["ch_in"], - requires_grad=False, dtype=torch.float32, + param["dim_in_y"], + param["dim_in_x"], + param["ch_in"], + requires_grad=False, + dtype=torch.float32, ).to(dtype) - if not param["depthwise"]: kernel = torch.randn( - param["ch_out"], param["dim_kernel_y"], param["dim_kernel_x"], param["ch_in"], - requires_grad=False, dtype=torch.float32, + param["ch_out"], + param["dim_kernel_y"], + param["dim_kernel_x"], + param["ch_in"], + requires_grad=False, + dtype=torch.float32, ).to(dtype) else: kernel = torch.randn( - param["dim_kernel_y"], param["dim_kernel_x"], param["ch_in"], - requires_grad=False, dtype=torch.float32, + param["dim_kernel_y"], + param["dim_kernel_x"], + param["ch_in"], + requires_grad=False, + dtype=torch.float32, ).to(dtype) bn_k = torch.randn(param["ch_out"], requires_grad=False, dtype=torch.float32).to(dtype) bn_l = torch.randn(param["ch_out"], requires_grad=False, dtype=torch.float32).to(dtype) ofmap, ofmap_before, ifmap_padded = fused_conv( - ifmap, kernel, bn_k, bn_l, param["padding"], param["stride"], - param["flags"]["flag_batch_norm"], param["flags"]["flag_relu"], - not param["flags"]["flag_y_accumulate_start"], param["depthwise"], + ifmap, + kernel, + bn_k, + bn_l, + padding=param["padding"], + stride=param["stride"], + bn=param["flags"]["flag_batch_norm"], + relu=param["flags"]["flag_relu"], + accumulate=not param["flags"]["flag_y_accumulate_start"], + depthwise=param["depthwise"], ) if param["chw_layer"]: @@ -655,5 +724,6 @@ def main(): else: print("No valid kernel selected") + if __name__ == "__main__": main() diff --git 
a/sw/spatzBenchmarks/widening-hp-fmatmul/script/gen_data.py b/sw/spatzBenchmarks/widening-hp-fmatmul/script/gen_data.py index 0c500558..4da0e20d 100755 --- a/sw/spatzBenchmarks/widening-hp-fmatmul/script/gen_data.py +++ b/sw/spatzBenchmarks/widening-hp-fmatmul/script/gen_data.py @@ -26,6 +26,9 @@ def array_to_cstr(a, fmt=float): if isinstance(a, np.ndarray): a = a.flat if isinstance(a, torch.Tensor): + # Fallback to float32 if numpy struggles with bfloat16, otherwise leave alone + if a.dtype == torch.bfloat16: + a = a.float() a = a.cpu().numpy().flat for el in a: out += "{}, ".format(el) @@ -42,6 +45,7 @@ def array_to_cstr(a, fmt=float): def emit_header_file(layer_type: str, **kwargs): + file_path = pathlib.Path(__file__).parent.parent / "data" file_path.mkdir(parents=True, exist_ok=True) emit_str = ( @@ -95,7 +99,7 @@ def emit_conv2d_layer(name="conv2d", **kwargs): layer_str += f'static double {name}_result[{oh}][{ow}][{co}] __attribute__((section(".data")));\n\n' layer_str += ( f"static double {name}_checksum[{oh}][{ow}] = " - + array_to_cstr(torch.sum(ofmap.float(), dim=-1)) # Safe sum for checksum + + array_to_cstr(torch.sum(ofmap.float(), dim=-1)) + ";\n\n\n" ) layer_str += ( @@ -118,6 +122,7 @@ def emit_conv2d_layer(name="conv2d", **kwargs): def emit_linear_layer(input, weights, ofmap): + layer_str = "" return layer_str @@ -166,7 +171,7 @@ def emit_GEMM_layer(name="gemm", **kwargs): ) layer_str += ( f"static const {dtype} {name}_result[{m}*{n}] = " - + array_to_cstr(torch.sum(result.float(), dim=-1)) + + array_to_cstr(result) + ";\n\n\n" ) else: @@ -190,6 +195,7 @@ def emit_GEMM_layer(name="gemm", **kwargs): def emit_batchnorm_layer(name="batchnorm", **kwargs): + ifmap = kwargs["ifmap"] ofmap = kwargs["ofmap"] beta = kwargs["beta"] @@ -236,6 +242,7 @@ def emit_batchnorm_layer(name="batchnorm", **kwargs): def emit_maxpool_layer(name="maxpool", **kwargs): + ifmap = kwargs["ifmap"] ofmap = kwargs["ofmap"] k = kwargs["kernel_size"] @@ -277,6 +284,7 @@ def 
emit_maxpool_layer(name="maxpool", **kwargs): def emit_fusedconv(name="fusedconv", **kwargs): + ifmap = kwargs["ifmap"] kernel = kwargs["kernel"] bn_k = kwargs["bn_k"] @@ -358,43 +366,48 @@ def rand_data_generator(shape, prec, alt=False): return torch.randn(shape, requires_grad=False, dtype=torch.float32), {} elif prec == 16: if alt: - # Universal Fix: Generate FP32, cast to BF16 return torch.randn(shape, requires_grad=False, dtype=torch.float32).to(torch.bfloat16), {} else: - # Universal Fix: Generate FP32, cast to FP16 return torch.randn(shape, requires_grad=False, dtype=torch.float32, device=device).to(torch.float16), {} elif prec == 8: - sign = torch.randint(0, 2, shape, requires_grad=False, dtype=torch.uint8) - exponent = torch.randint(0, 16, shape, requires_grad=False, dtype=torch.uint8) - mantissa = torch.randint(0, 4, shape, requires_grad=False, dtype=torch.uint8) + sign = torch.randint( + 0, 2, shape, requires_grad=False, dtype=torch.uint8 + ) # -1 or 1 + exponent = torch.randint( + 0, 16, shape, requires_grad=False, dtype=torch.uint8 + ) # < 0b01111 + mantissa = torch.randint( + 0, 4, shape, requires_grad=False, dtype=torch.uint8 + ) # can be arbitrary bits = {"sign": sign, "exponent": exponent, "mantissa": mantissa} + # TODO: not actually correct return ((-1.0) ** sign.double()) * (2.0 ** (exponent.double() - 15.0)) * ( 1.0 + mantissa.double() / (2**2) ), bits def conv2d(ifmap, weights, padding=1, stride=1): - # Universal Fix: Upcast for CPU Math orig_dtype = ifmap.dtype - ifmap, weights = ifmap.float(), weights.float() + if orig_dtype in [torch.float16, torch.bfloat16]: + ifmap, weights = ifmap.float(), weights.float() n, ci, ih, iw = ifmap.shape co, _, fh, fw = weights.shape conv2d = nn.Conv2d(ci, co, (fh, fw), padding=((fh - 1) // 2, (fw - 1) // 2)) conv2d.weight = nn.Parameter(weights, requires_grad=False) - # Ensure bias natively generates in float32 - conv2d.bias = nn.Parameter(torch.zeros_like(conv2d.bias, dtype=torch.float32), 
requires_grad=False) - + conv2d.bias = nn.Parameter( + torch.zeros_like(conv2d.bias, dtype=weights.dtype), requires_grad=False + ) ofmap = conv2d(ifmap) - # Universal Fix: Downcast back return ofmap.to(orig_dtype) def max_pooling(ifmap, kernel): orig_dtype = ifmap.dtype - ifmap = ifmap.float() + if orig_dtype in [torch.float16, torch.bfloat16]: + ifmap = ifmap.float() n, ci, ih, iw = ifmap.shape max_pool = nn.MaxPool2d(kernel_size=kernel) @@ -405,14 +418,16 @@ def max_pooling(ifmap, kernel): def batchnorm(ifmap): orig_dtype = ifmap.dtype - ifmap = ifmap.float() + if orig_dtype in [torch.float16, torch.bfloat16]: + ifmap = ifmap.float() n, ci, ih, iw = ifmap.shape bn = torch.nn.BatchNorm2d(ci) bn.weight.requires_grad = False bn.bias.requires_grad = False - running_mean = torch.randn_like(bn.running_mean, requires_grad=False, dtype=torch.float32) - running_var = torch.rand_like(bn.running_var, requires_grad=False, dtype=torch.float32) + + running_mean = torch.randn(bn.running_mean.shape, requires_grad=False, dtype=torch.float32).to(bn.running_mean.dtype) + running_var = torch.rand(bn.running_var.shape, requires_grad=False, dtype=torch.float32).to(bn.running_var.dtype) gamma = bn.weight / torch.sqrt(running_var + bn.eps) beta = bn.bias - running_mean * bn.weight / torch.sqrt(running_var + bn.eps) @@ -424,12 +439,10 @@ def batchnorm(ifmap): def fused_conv( ifmap, weights, bn_k, bn_l, padding, stride, bn, relu, accumulate, depthwise ): - # Universal Fix: Upcast EVERYTHING before starting the math orig_dtype = ifmap.dtype - ifmap = ifmap.float() - weights = weights.float() - bn_k = bn_k.float() - bn_l = bn_l.float() + is_half = orig_dtype in [torch.float16, torch.bfloat16] + if is_half: + ifmap, weights, bn_k, bn_l = ifmap.float(), weights.float(), bn_k.float(), bn_l.float() ih, iw, ci = ifmap.shape if not depthwise: @@ -443,13 +456,14 @@ def fused_conv( iw + padding["padding_x_left"] + padding["padding_x_right"], ci, requires_grad=False, - dtype=ifmap.dtype, # Safely 
uses float32 + dtype=ifmap.dtype, ) ifmap_padded[ padding["padding_y_top"] : ih + padding["padding_y_top"], padding["padding_x_left"] : iw + padding["padding_x_left"], ] = ifmap + # Don't cover undefined behaviour when there are steps without a complete kernel window if (ifmap_padded.shape[0] - (fh - 1) - 1) % stride["stride_y"] != 0: print("Warning: rounding h output dimension") if (ifmap_padded.shape[1] - (fw - 1) - 1) % stride["stride_x"] != 0: @@ -460,16 +474,16 @@ def fused_conv( (ifmap_padded.shape[1] - (fw - 1) - 1) // stride["stride_x"] + 1, co, ) - if accumulate: - ofmap_before = torch.randn_like(ofmap, requires_grad=False, dtype=torch.float32) + ofmap_before = torch.randn(ofmap.shape, requires_grad=False, dtype=torch.float32).to(ofmap.dtype) else: - ofmap_before = torch.zeros_like(ofmap, requires_grad=False, dtype=torch.float32) + ofmap_before = torch.zeros_like(ofmap, requires_grad=False) if verbose: print(ifmap.shape, ifmap_padded.shape, ofmap.shape) if depthwise: + # depthwise Conv2d for h in range(0, ifmap_padded.shape[0] - (fh - 1), stride["stride_y"]): for w in range(0, ifmap_padded.shape[1] - (fw - 1), stride["stride_x"]): for c in range(co): @@ -480,6 +494,7 @@ def fused_conv( weights[:, :, c].flatten(), ) else: + # Conv2d for h in range(0, ifmap_padded.shape[0] - (fh - 1), stride["stride_y"]): for w in range(0, ifmap_padded.shape[1] - (fw - 1), stride["stride_x"]): for c in range(co): @@ -492,22 +507,34 @@ def fused_conv( ofmap += ofmap_before + # BatchNorm if bn: ofmap = ofmap * bn_k + bn_l + # ReLU if relu: ofmap = torch.nn.functional.relu(ofmap) - # Universal Fix: Downcast back - return ofmap.to(orig_dtype), ofmap_before.to(orig_dtype), ifmap_padded.to(orig_dtype) + if is_half: + ofmap = ofmap.to(orig_dtype) + ofmap_before = ofmap_before.to(orig_dtype) + ifmap_padded = ifmap_padded.to(orig_dtype) + + return ofmap, ofmap_before, ifmap_padded def main(): + parser = argparse.ArgumentParser(description="Generate data for kernels") 
parser.add_argument( - "-c", "--cfg", type=pathlib.Path, required=True, help="Select param config file kernel", + "-c", + "--cfg", + type=pathlib.Path, + required=True, + help="Select param config file kernel", ) parser.add_argument("-v", "--verbose", action="store_true", help="Set verbose") + args = parser.parse_args() global verbose @@ -526,18 +553,31 @@ def main(): dtype = torch.float32 if param["kernel"] == "Conv2d": - # Generate safely in float32, then cast ifmap = torch.randn( - 1, param["channels"]["in"], param["input_dim"]["height"], param["input_dim"]["width"], - requires_grad=False, dtype=torch.float32, + 1, + param["channels"]["in"], + param["input_dim"]["height"], + param["input_dim"]["width"], + requires_grad=False, + dtype=torch.float32, ).to(dtype) weights = torch.randn( - param["channels"]["out"], param["channels"]["in"], param["filter"]["height"], param["filter"]["width"], - requires_grad=False, dtype=torch.float32, + param["channels"]["out"], + param["channels"]["in"], + param["filter"]["height"], + param["filter"]["width"], + requires_grad=False, + dtype=torch.float32, ).to(dtype) - ofmap = conv2d(ifmap, weights, padding=param["filter"]["padding"], stride=param["filter"]["stride"]) + ofmap = conv2d( + ifmap, + weights, + padding=param["filter"]["padding"], + stride=param["filter"]["stride"], + ) + # convert from CHW to HWC format ifmap = ifmap.permute(0, 2, 3, 1) ofmap = ofmap.permute(0, 2, 3, 1) weights = weights.permute(0, 2, 3, 1) @@ -549,8 +589,10 @@ def main(): mat_B, bits_B = rand_data_generator((param["K"], param["N"]), param["prec"]) mat_C, bits_C = rand_data_generator((param["M"], param["N"]), param["prec"]) - # Upcast for CPU Math - result = torch.matmul(mat_A.float(), mat_B.float()).to(dtype) + if mat_A.dtype in [torch.float16, torch.bfloat16]: + result = torch.matmul(mat_A.float(), mat_B.float()).to(mat_A.dtype) + else: + result = torch.matmul(mat_A, mat_B) if param["transpose_A"]: mat_A = mat_A.T @@ -574,16 +616,22 @@ def main(): 
"bits_B": bits_B, "bits_C": bits_C, } + emit_header_file("GEMM", **kwargs) elif param["kernel"] == "BatchNorm": ifmap = torch.randn( - 1, param["channels"]["in"], param["input_dim"]["height"], param["input_dim"]["width"], - requires_grad=False, dtype=torch.float32, + 1, + param["channels"]["in"], + param["input_dim"]["height"], + param["input_dim"]["width"], + requires_grad=False, + dtype=torch.float32, ).to(dtype) ofmap, gamma, beta = batchnorm(ifmap) + # convert from CHW to HWC format ifmap = ifmap.permute(0, 2, 3, 1) ofmap = ofmap.permute(0, 2, 3, 1) @@ -592,12 +640,17 @@ def main(): elif param["kernel"] == "MaxPool": ifmap = torch.randn( - 1, param["channels"]["in"], param["input_dim"]["height"], param["input_dim"]["width"], - requires_grad=False, dtype=torch.float32, + 1, + param["channels"]["in"], + param["input_dim"]["height"], + param["input_dim"]["width"], + requires_grad=False, + dtype=torch.float32, ).to(dtype) ofmap = max_pooling(ifmap, param["kernel_size"]) + # convert from CHW to HWC format ifmap = ifmap.permute(0, 2, 3, 1) ofmap = ofmap.permute(0, 2, 3, 1) @@ -606,28 +659,44 @@ def main(): elif param["kernel"] == "FusedConv": ifmap = torch.randn( - param["dim_in_y"], param["dim_in_x"], param["ch_in"], - requires_grad=False, dtype=torch.float32, + param["dim_in_y"], + param["dim_in_x"], + param["ch_in"], + requires_grad=False, + dtype=torch.float32, ).to(dtype) - if not param["depthwise"]: kernel = torch.randn( - param["ch_out"], param["dim_kernel_y"], param["dim_kernel_x"], param["ch_in"], - requires_grad=False, dtype=torch.float32, + param["ch_out"], + param["dim_kernel_y"], + param["dim_kernel_x"], + param["ch_in"], + requires_grad=False, + dtype=torch.float32, ).to(dtype) else: kernel = torch.randn( - param["dim_kernel_y"], param["dim_kernel_x"], param["ch_in"], - requires_grad=False, dtype=torch.float32, + param["dim_kernel_y"], + param["dim_kernel_x"], + param["ch_in"], + requires_grad=False, + dtype=torch.float32, ).to(dtype) bn_k = 
torch.randn(param["ch_out"], requires_grad=False, dtype=torch.float32).to(dtype) bn_l = torch.randn(param["ch_out"], requires_grad=False, dtype=torch.float32).to(dtype) ofmap, ofmap_before, ifmap_padded = fused_conv( - ifmap, kernel, bn_k, bn_l, param["padding"], param["stride"], - param["flags"]["flag_batch_norm"], param["flags"]["flag_relu"], - not param["flags"]["flag_y_accumulate_start"], param["depthwise"], + ifmap, + kernel, + bn_k, + bn_l, + padding=param["padding"], + stride=param["stride"], + bn=param["flags"]["flag_batch_norm"], + relu=param["flags"]["flag_relu"], + accumulate=not param["flags"]["flag_y_accumulate_start"], + depthwise=param["depthwise"], ) if param["chw_layer"]: @@ -655,5 +724,6 @@ def main(): else: print("No valid kernel selected") + if __name__ == "__main__": main() From 28ff7fbd562fe291d7f0a4268ed0106aed05c5aa Mon Sep 17 00:00:00 2001 From: Diyou Shen Date: Fri, 24 Apr 2026 16:45:02 +0200 Subject: [PATCH 06/13] [SW] Fix a trailing whitespace in gen_data. --- sw/spatzBenchmarks/gemv/script/gen_data.py | 1 - 1 file changed, 1 deletion(-) diff --git a/sw/spatzBenchmarks/gemv/script/gen_data.py b/sw/spatzBenchmarks/gemv/script/gen_data.py index f33d5154..532d725b 100644 --- a/sw/spatzBenchmarks/gemv/script/gen_data.py +++ b/sw/spatzBenchmarks/gemv/script/gen_data.py @@ -194,4 +194,3 @@ def main(): if __name__ == "__main__": main() - \ No newline at end of file From 218b5a3e4abe458125fe142525df39a322fa34d6 Mon Sep 17 00:00:00 2001 From: Diyou Shen Date: Fri, 24 Apr 2026 16:46:38 +0200 Subject: [PATCH 07/13] [TB] Change error call to fatal to trigger abnormal exit when kernel fails. Deliberately changed a test to test the failure. 
--- hw/ip/snitch_test/src/tb_bin.sv | 2 +- sw/spatzBenchmarks/dp-fdotp/main.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/hw/ip/snitch_test/src/tb_bin.sv b/hw/ip/snitch_test/src/tb_bin.sv index 469a3ebf..1a947f07 100644 --- a/hw/ip/snitch_test/src/tb_bin.sv +++ b/hw/ip/snitch_test/src/tb_bin.sv @@ -48,7 +48,7 @@ module tb_bin; end while (exit_code == 0); exit_code >>= 1; if (exit_code > 0) begin - $error("[FAILURE] Finished with exit code %2d", exit_code); + $fatal("[FAILURE] Finished with exit code %2d", exit_code); end else begin $info("[SUCCESS] Program finished successfully"); end diff --git a/sw/spatzBenchmarks/dp-fdotp/main.c b/sw/spatzBenchmarks/dp-fdotp/main.c index f4a0524d..0b4ac88b 100644 --- a/sw/spatzBenchmarks/dp-fdotp/main.c +++ b/sw/spatzBenchmarks/dp-fdotp/main.c @@ -32,7 +32,7 @@ static inline int fp_check(const double a, const double b) { const double threshold = 0.00001; // Absolute value - double comp = a - b; + double comp = a - 1; if (comp < 0) comp = -comp; From bc32500aa7f8ff9d3eac34240cc9b5eb3c1a3461 Mon Sep 17 00:00:00 2001 From: Diyou Shen Date: Fri, 24 Apr 2026 17:12:30 +0200 Subject: [PATCH 08/13] [TB] Only mark success when return 0. 
--- hw/ip/snitch_test/src/tb_bin.sv | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/hw/ip/snitch_test/src/tb_bin.sv b/hw/ip/snitch_test/src/tb_bin.sv index 1a947f07..cf81aaf0 100644 --- a/hw/ip/snitch_test/src/tb_bin.sv +++ b/hw/ip/snitch_test/src/tb_bin.sv @@ -46,11 +46,13 @@ module tb_bin; if (exit_code == 0) #200ns; end while (exit_code == 0); + exit_code >>= 1; - if (exit_code > 0) begin - $fatal("[FAILURE] Finished with exit code %2d", exit_code); - end else begin + + if (exit_code == 0) begin $info("[SUCCESS] Program finished successfully"); + end else begin + $fatal("[FAILURE] Finished with exit code %2d", exit_code); end $finish; end From f7e11f2f7765cee3dfc686ab0674f5b5c2b4bfe8 Mon Sep 17 00:00:00 2001 From: Diyou Shen Date: Mon, 27 Apr 2026 09:36:47 +0200 Subject: [PATCH 09/13] [SW] Fix a lint issue in gen_data.py --- sw/spatzBenchmarks/sa-gemv/script/gen_data.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sw/spatzBenchmarks/sa-gemv/script/gen_data.py b/sw/spatzBenchmarks/sa-gemv/script/gen_data.py index ecc91071..4d268d6c 100644 --- a/sw/spatzBenchmarks/sa-gemv/script/gen_data.py +++ b/sw/spatzBenchmarks/sa-gemv/script/gen_data.py @@ -93,7 +93,7 @@ def emit_gemv_layer(name="gemv", **kwargs): + ";\n" ) # Assuming you have variables like M (output size) and tot_nz (number of non-zeros) - layer_str += f'// Auto-generated buffers for Cache Mode\n' + layer_str += '// Auto-generated buffers for Cache Mode\n' layer_str += f'static uint16_t dense_idx_dram[{tot_nz}] __attribute__((section(".data"))) = {{0}};\n' layer_str += f'static {dtype} dense_vec_dram[{tot_nz}] __attribute__((section(".data"))) = {{0.0}};\n' layer_str += f'static {dtype} result_buf_dram[{m}] __attribute__((section(".data"))) = {{0.0}};\n' From 8df585e53351c33863623f761dfcc0ef37e354f9 Mon Sep 17 00:00:00 2001 From: Diyou Shen Date: Mon, 27 Apr 2026 09:37:27 +0200 Subject: [PATCH 10/13] [SW] Change comparison type to float to avoid illegal 
instruction in 32b configuration. --- sw/spatzBenchmarks/sa-gemv/main.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sw/spatzBenchmarks/sa-gemv/main.c b/sw/spatzBenchmarks/sa-gemv/main.c index df2eb8ba..ce877af2 100644 --- a/sw/spatzBenchmarks/sa-gemv/main.c +++ b/sw/spatzBenchmarks/sa-gemv/main.c @@ -53,7 +53,7 @@ static inline int fp_check(const T *a, const T *b) { const T threshold = 0.001; // Absolute value - double comp = (double)*a - (double)*b; + float comp = (float)*a - (float)*b; if (comp < 0) comp = -comp; @@ -199,7 +199,7 @@ int main() { } for (unsigned int j = 0; j < vec_chunk_len; ++j) { - if ((double) vec_ptr[j] != 0.0) { + if ((float) vec_ptr[j] != 0.0) { dense_vec[nz_count] = vec_ptr[j]; dense_idx[nz_count] = i * vec_chunk_len + j; nz_count++; From 7c50b182b0617678cbe05868756758ceab9b4d7f Mon Sep 17 00:00:00 2001 From: Diyou Shen Date: Mon, 27 Apr 2026 10:36:37 +0200 Subject: [PATCH 11/13] [Verilator] Adjust the verilator's DPI-C library to ensure the capture of failed tests. --- hw/ip/snitch_test/src/verilator_lib.cc | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/hw/ip/snitch_test/src/verilator_lib.cc b/hw/ip/snitch_test/src/verilator_lib.cc index b04a44db..583d54c2 100644 --- a/hw/ip/snitch_test/src/verilator_lib.cc +++ b/hw/ip/snitch_test/src/verilator_lib.cc @@ -32,10 +32,10 @@ int Sim::run() { target.init(sim_thread_main, this); int exit_code = htif_t::run(); - if (exit_code > 0) - fprintf(stderr, "[FAILURE] Finished with exit code %2d\n", exit_code); - else + if (exit_code == 0) fprintf(stderr, "[SUCCESS] Program finished successfully\n"); + else + fprintf(stderr, "[FAILURE] Finished with exit code %2d\n", exit_code); return exit_code; } From b95b644e0ce2a45af79cf805422c54269e0f2aa4 Mon Sep 17 00:00:00 2001 From: Diyou Shen Date: Mon, 27 Apr 2026 13:21:06 +0200 Subject: [PATCH 12/13] [SW] Lint fix and restore the deliberate failed test. 
--- hw/ip/snitch_test/src/tb_bin.sv | 2 +- sw/spatzBenchmarks/dp-fdotp/main.c | 2 +- sw/spatzBenchmarks/sa-gemv/kernel/sa-gemv.c | 6 +- sw/spatzBenchmarks/sa-gemv/main.c | 181 ++++++++++---------- 4 files changed, 100 insertions(+), 91 deletions(-) diff --git a/hw/ip/snitch_test/src/tb_bin.sv b/hw/ip/snitch_test/src/tb_bin.sv index cf81aaf0..423f9d9e 100644 --- a/hw/ip/snitch_test/src/tb_bin.sv +++ b/hw/ip/snitch_test/src/tb_bin.sv @@ -52,7 +52,7 @@ module tb_bin; if (exit_code == 0) begin $info("[SUCCESS] Program finished successfully"); end else begin - $fatal("[FAILURE] Finished with exit code %2d", exit_code); + $error("[FAILURE] Finished with exit code %2d", exit_code); end $finish; end diff --git a/sw/spatzBenchmarks/dp-fdotp/main.c b/sw/spatzBenchmarks/dp-fdotp/main.c index 0b4ac88b..f4a0524d 100644 --- a/sw/spatzBenchmarks/dp-fdotp/main.c +++ b/sw/spatzBenchmarks/dp-fdotp/main.c @@ -32,7 +32,7 @@ static inline int fp_check(const double a, const double b) { const double threshold = 0.00001; // Absolute value - double comp = a - 1; + double comp = a - b; if (comp < 0) comp = -comp; diff --git a/sw/spatzBenchmarks/sa-gemv/kernel/sa-gemv.c b/sw/spatzBenchmarks/sa-gemv/kernel/sa-gemv.c index 4e641041..733cd484 100644 --- a/sw/spatzBenchmarks/sa-gemv/kernel/sa-gemv.c +++ b/sw/spatzBenchmarks/sa-gemv/kernel/sa-gemv.c @@ -19,7 +19,7 @@ #include "sa-gemv.h" -void gemv_v64b_m4(double *a, double* b, double* c, int M, int M_core, int N) { +void gemv_v64b_m4(double *a, double *b, double *c, int M, int M_core, int N) { unsigned int vl, avl = M_core; double *a_, *a_start = a; double *c_ = c; @@ -66,7 +66,7 @@ void gemv_v64b_m4(double *a, double* b, double* c, int M, int M_core, int N) { } while (avl > 0); } -void gemv_v32b_m4(float *a, float* b, float* c, int M, int M_core, int N) { +void gemv_v32b_m4(float *a, float *b, float *c, int M, int M_core, int N) { unsigned int vl, avl = M_core; float *a_, *a_start = a; float *c_ = c; @@ -114,7 +114,7 @@ void gemv_v32b_m4(float 
*a, float* b, float* c, int M, int M_core, int N) { } while (avl > 0); } -void gemv_v16b_m4(__fp16 *a, __fp16* b, __fp16* c, int M, int M_core, int N) { +void gemv_v16b_m4(__fp16 *a, __fp16 *b, __fp16 *c, int M, int M_core, int N) { unsigned int vl, avl = M_core; __fp16 *a_, *a_start = a; __fp16 *c_ = c; diff --git a/sw/spatzBenchmarks/sa-gemv/main.c b/sw/spatzBenchmarks/sa-gemv/main.c index ce877af2..2875d52c 100644 --- a/sw/spatzBenchmarks/sa-gemv/main.c +++ b/sw/spatzBenchmarks/sa-gemv/main.c @@ -65,9 +65,9 @@ int main() { const unsigned int cid = snrt_cluster_core_idx(); // Reset timer - unsigned int timer = (unsigned int)-1; + unsigned int timer = (unsigned int)-1; unsigned int timer_best = (unsigned int)-1; - unsigned int timer_nz = (unsigned int)-1; + unsigned int timer_nz = (unsigned int)-1; const unsigned int m_core = gemv_l.M / num_cores; // Size (in KiB) of L1 SPM, used to calculate tiling window const unsigned int spm_size = 128; @@ -80,21 +80,23 @@ int main() { // 2. 2 chunks of matrix + densed vector (output) + densed idx // Sizes of each part we need - const uint32_t mat_size = sizeof(T) * gemv_l.M * tot_nz_dram; - const uint32_t row_size = sizeof(T) * gemv_l.M; - const uint32_t vec_size = sizeof(T) * gemv_l.N; + const uint32_t mat_size = sizeof(T) * gemv_l.M * tot_nz_dram; + const uint32_t row_size = sizeof(T) * gemv_l.M; + const uint32_t vec_size = sizeof(T) * gemv_l.N; const uint32_t dense_vec_size = sizeof(T) * tot_nz_dram; const uint32_t dense_idx_size = sizeof(uint16_t) * tot_nz_dram; - const uint32_t result_size = sizeof(T) * gemv_l.M; + const uint32_t result_size = sizeof(T) * gemv_l.M; // leave 8 KiB for Stack - const uint32_t l1_size = (spm_size - 8) * 1024; - const uint32_t fixed_alloc_size = dense_vec_size + dense_idx_size + result_size; + const uint32_t l1_size = (spm_size - 8) * 1024; + const uint32_t fixed_alloc_size = + dense_vec_size + dense_idx_size + result_size; // --- BOUNDS CHECK 1: Do the fixed arrays fit in L1? 
--- if (fixed_alloc_size >= l1_size) { if (cid == 0) { - PRINTF("FATAL: L1 Memory Overflow! Fixed arrays require %u bytes, but only %u bytes available.\n", + PRINTF("FATAL: L1 Memory Overflow! Fixed arrays require %u bytes, but " + "only %u bytes available.\n", fixed_alloc_size, l1_size); } snrt_cluster_hw_barrier(); @@ -109,7 +111,8 @@ int main() { // --- BOUNDS CHECK 2: Can we double buffer at least 1 row? --- if (num_row_mat < 1) { if (cid == 0) { - PRINTF("FATAL: L1 Memory Overflow! Cannot fit at least 2 rows for double buffering. " + PRINTF("FATAL: L1 Memory Overflow! Cannot fit at least 2 rows for double " + "buffering. " "Chunk space left: %u bytes, Row size: %u bytes.\n", l1_for_chunk, row_size); } @@ -119,7 +122,10 @@ int main() { // Always strictly split the available memory in half for double-buffering const uint32_t vec_chunk_size = l1_for_chunk / 2; - const uint32_t num_vec_chunk = (l1_for_chunk > vec_size) ? 1 : ((vec_size + vec_chunk_size - 1) / vec_chunk_size); + const uint32_t num_vec_chunk = + (l1_for_chunk > vec_size) + ? 
1 + : ((vec_size + vec_chunk_size - 1) / vec_chunk_size); // Recalculate exact chunk size based on whole rows const uint32_t mat_chunk_size = num_row_mat * row_size; @@ -128,14 +134,13 @@ int main() { const uint32_t num_mat_chunk = (tot_nz_dram + num_row_mat - 1) / num_row_mat; // Number of elements in each chunk - const uint32_t vec_chunk_len = vec_chunk_size / sizeof(T); - + const uint32_t vec_chunk_len = vec_chunk_size / sizeof(T); // Memory Allocation if (cid == 0) { - result = (T *)snrt_l1alloc(result_size); + result = (T *)snrt_l1alloc(result_size); dense_vec = (T *)snrt_l1alloc(dense_vec_size); - vec_buf0 = (T *)snrt_l1alloc(l1_for_chunk); + vec_buf0 = (T *)snrt_l1alloc(l1_for_chunk); dense_idx = (uint16_t *)snrt_l1alloc(dense_idx_size); // Offset by half of the size if needed by double buffering @@ -159,47 +164,45 @@ int main() { timer = benchmark_get_cycle(); // Calculate internal pointers - T *vec_ptr = vec_buf0; - T *vec_db_ptr = vec_buf1; + T *vec_ptr = vec_buf0; + T *vec_db_ptr = vec_buf1; // Task 1: Find out the non-zeros if (cid == 0) { - #ifdef DEBUG_NZ +#ifdef DEBUG_NZ PRINTF("NZ-Calc PreLD\n"); - PRINTF("DMA SRC:%p, TGT:%p, SIZE:%u\n", vec_ptr, gemv_vec_dram, vec_chunk_size); - #endif + PRINTF("DMA SRC:%p, TGT:%p, SIZE:%u\n", vec_ptr, gemv_vec_dram, + vec_chunk_size); +#endif snrt_dma_start_1d(vec_ptr, gemv_vec_dram, vec_chunk_size); snrt_dma_wait_all(); } uint32_t nz_count = 0; - if (cid == 0) { for (unsigned int i = 0; i < num_vec_chunk; ++i) { // Step 1.1: preload the next chunk if not the end // Make sure the previous load completes snrt_dma_wait_all(); // Double buffer to search the next non-zero - uint32_t next_bytes = (vec_size - (i + 1) * vec_chunk_size < vec_chunk_size) - ? (vec_size - (i + 1) * vec_chunk_size) - : vec_chunk_size; + uint32_t next_bytes = + (vec_size - (i + 1) * vec_chunk_size < vec_chunk_size) + ? 
(vec_size - (i + 1) * vec_chunk_size) + : vec_chunk_size; if (i < num_vec_chunk - 1) { - #ifdef DEBUG_NZ +#ifdef DEBUG_NZ PRINTF("NZ-Calc DB Iter%u\n", i); PRINTF("DMA SRC:%p, TGT:%p, SIZE:%u\n", - gemv_vec_dram + (i + 1) * vec_chunk_len, - vec_db_ptr, - next_bytes); - #endif - snrt_dma_start_1d(vec_db_ptr, - gemv_vec_dram + (i + 1) * vec_chunk_len, + gemv_vec_dram + (i + 1) * vec_chunk_len, vec_db_ptr, next_bytes); +#endif + snrt_dma_start_1d(vec_db_ptr, gemv_vec_dram + (i + 1) * vec_chunk_len, next_bytes); // Use exact bytes } for (unsigned int j = 0; j < vec_chunk_len; ++j) { - if ((float) vec_ptr[j] != 0.0) { + if ((float)vec_ptr[j] != 0.0) { dense_vec[nz_count] = vec_ptr[j]; dense_idx[nz_count] = i * vec_chunk_len + j; nz_count++; @@ -214,10 +217,10 @@ int main() { if (i % 2 == 0) { // pointer exchange - vec_ptr = vec_buf1; + vec_ptr = vec_buf1; vec_db_ptr = vec_buf0; } else { - vec_ptr = vec_buf0; + vec_ptr = vec_buf0; vec_db_ptr = vec_buf1; } } @@ -225,49 +228,48 @@ int main() { snrt_cluster_hw_barrier(); - #ifdef DEBUG_NZ +#ifdef DEBUG_NZ if (cid == 0) PRINTF("Non-Zero Calc Complete\n"); - #endif +#endif - #ifdef DEBUG_NZ_IDX +#ifdef DEBUG_NZ_IDX if (cid == 0) { for (uint32_t i = 0; i < tot_nz_dram; i++) { PRINTF("IDX[%u]=%u\n", i, dense_idx[i]); } } - #endif +#endif timer_nz = benchmark_get_cycle() - timer_nz; timer = benchmark_get_cycle(); - // Task 2: GEMV calculation // Calculate internal pointers - T *mat_ptr = mat_buf0; - T *mat_db_ptr = mat_buf1; + T *mat_ptr = mat_buf0; + T *mat_db_ptr = mat_buf1; T *result_core = result + m_core * cid; uint16_t *idx_ptr = dense_idx; // Corrected pointer type if (cid == 0) { // Determine how many rows are actually active for this very first chunk - uint32_t active_rows = (tot_nz_dram < num_row_mat) ? tot_nz_dram : num_row_mat; + uint32_t active_rows = + (tot_nz_dram < num_row_mat) ? 
tot_nz_dram : num_row_mat; - #ifdef DEBUG_GEMV_PreLD +#ifdef DEBUG_GEMV_PreLD PRINTF("GEMV PreLD\n"); PRINTF("Active Rows:%u\n", active_rows); - #endif +#endif for (unsigned int i = 0; i < active_rows; i++) { - #ifdef DEBUG - PRINTF("Row:%u, SRC:%p, TGT:%p, SIZE:%u\n", - i, - gemv_mat_dram + (size_t)(*idx_ptr) * gemv_l.M, - mat_ptr + i * gemv_l.M, - row_size); - #endif +#ifdef DEBUG + PRINTF("Row:%u, SRC:%p, TGT:%p, SIZE:%u\n", i, + gemv_mat_dram + (size_t)(*idx_ptr) * gemv_l.M, + mat_ptr + i * gemv_l.M, row_size); +#endif snrt_dma_start_1d(mat_ptr + i * gemv_l.M, // Pack linearly into L1 - gemv_mat_dram + (size_t)(*idx_ptr) * gemv_l.M, // Source from DRAM + gemv_mat_dram + + (size_t)(*idx_ptr) * gemv_l.M, // Source from DRAM row_size); idx_ptr++; } @@ -275,15 +277,15 @@ int main() { snrt_cluster_hw_barrier(); - #ifdef DEBUG_GEMV_PreLD +#ifdef DEBUG_GEMV_PreLD if (cid == 0) PRINTF("GEMV PreLD Complete\n"); - #endif +#endif - #ifdef DEBUG_GEMV_DB +#ifdef DEBUG_GEMV_DB if (cid == 0) PRINTF("Tot Chunks %u\n", num_mat_chunk); - #endif +#endif for (unsigned int chunk_idx = 0; chunk_idx < num_mat_chunk; chunk_idx++) { // Wait for the CURRENT chunk to finish loading @@ -297,27 +299,24 @@ int main() { uint32_t next_active_rows = 0; if (next_chunk_start < tot_nz_dram) { - next_active_rows = (tot_nz_dram - next_chunk_start < num_row_mat) - ? (tot_nz_dram - next_chunk_start) - : num_row_mat; + next_active_rows = (tot_nz_dram - next_chunk_start < num_row_mat) + ? 
(tot_nz_dram - next_chunk_start) + : num_row_mat; } - #ifdef DEBUG_GEMV_DB +#ifdef DEBUG_GEMV_DB if (cid == 0) PRINTF("Chunk%u, DB Rows%u\n", chunk_idx, next_active_rows); - #endif +#endif // Load NEXT chunk in the background if (cid == 0 && next_active_rows > 0) { for (unsigned int i = 0; i < next_active_rows; i++) { - #ifdef DEBUG_GEMV_DB - PRINTF("Ptr:%p, Row:%u, SRC:%p, TGT:%p, SIZE:%u\n", - idx_ptr, - i, - gemv_mat_dram + (size_t)(*idx_ptr) * gemv_l.M, - mat_db_ptr + i * gemv_l.M, - row_size); - #endif +#ifdef DEBUG_GEMV_DB + PRINTF("Ptr:%p, Row:%u, SRC:%p, TGT:%p, SIZE:%u\n", idx_ptr, i, + gemv_mat_dram + (size_t)(*idx_ptr) * gemv_l.M, + mat_db_ptr + i * gemv_l.M, row_size); +#endif snrt_dma_start_1d(mat_db_ptr + i * gemv_l.M, gemv_mat_dram + (size_t)(*idx_ptr) * gemv_l.M, row_size); @@ -326,28 +325,32 @@ int main() { } // Calculate active rows for the CURRENT compute phase - uint32_t curr_active_rows = (tot_nz_dram - chunk_idx * num_row_mat < num_row_mat) - ? (tot_nz_dram - chunk_idx * num_row_mat) - : num_row_mat; + uint32_t curr_active_rows = + (tot_nz_dram - chunk_idx * num_row_mat < num_row_mat) + ? 
(tot_nz_dram - chunk_idx * num_row_mat) + : num_row_mat; // Calculate GEMV on the current chunk T *current_dense_vec = dense_vec + chunk_idx * num_row_mat; - // Offset the matrix pointer by m_core * cid so each core reads its correct rows + // Offset the matrix pointer by m_core * cid so each core reads its correct + // rows T *mat_core_ptr = mat_ptr + m_core * cid; - #if (PREC == 64) - gemv_v64b_m4(mat_core_ptr, current_dense_vec, result_core, gemv_l.M, m_core, curr_active_rows); - #elif (PREC == 32) - gemv_v32b_m4(mat_core_ptr, current_dense_vec, result_core, gemv_l.M, m_core, curr_active_rows); - #else - gemv_v16b_m4(mat_core_ptr, current_dense_vec, result_core, gemv_l.M, m_core, curr_active_rows); - #endif - +#if (PREC == 64) + gemv_v64b_m4(mat_core_ptr, current_dense_vec, result_core, gemv_l.M, m_core, + curr_active_rows); +#elif (PREC == 32) + gemv_v32b_m4(mat_core_ptr, current_dense_vec, result_core, gemv_l.M, m_core, + curr_active_rows); +#else + gemv_v16b_m4(mat_core_ptr, current_dense_vec, result_core, gemv_l.M, m_core, + curr_active_rows); +#endif // Swap pointers for the next iteration - T *temp = mat_ptr; - mat_ptr = mat_db_ptr; + T *temp = mat_ptr; + mat_ptr = mat_db_ptr; mat_db_ptr = temp; } @@ -363,14 +366,19 @@ int main() { // Checking for (unsigned int i = 0; i < gemv_l.M; i++) { if (fp_check(&result[i], &gemv_result[i])) { - PRINTF("Error: ID: %i Result = %f, Golden = %f\n", i, result[i], gemv_result[i]); +#if (PREC == 64) + PRINTF("Error: ID: %i Result = %f, Golden = %f\n", i, result[i], + gemv_result[i]); +#else + PRINTF("Error: ID: %i Result = %x, Golden = %x\n", i, + *(int *)&result[i], *(int *)&gemv_result[i]); +#endif } } } snrt_cluster_hw_barrier(); - // Check and display results // Assume 2 core 4 fpu configuration if (cid == 0) { @@ -380,7 +388,8 @@ int main() { long unsigned int utilization = performance / (2 * num_cores * 4 * 8 / sizeof(T)); - PRINTF("\n----- (%d x %d) x (%d x 1) sa-gemv -----\n", gemv_l.M, gemv_l.N, tot_nz_dram); + 
PRINTF("\n----- (%d x %d) x (%d x 1) sa-gemv -----\n", gemv_l.M, gemv_l.N, + tot_nz_dram); PRINTF("The NZ finding takes %u cycles.\n", timer_nz); PRINTF("The GEMV execution took %u cycles.\n", timer); PRINTF("The performance is %ld OP/1000cycle (%ld%%o utilization).\n", From fb610d7ff05e60da6e7e38541fcbcfc33a66381c Mon Sep 17 00:00:00 2001 From: Diyou Shen Date: Mon, 27 Apr 2026 14:55:41 +0200 Subject: [PATCH 13/13] [SW] Keep lint fixing. --- sw/spatzBenchmarks/sa-gemv/data/layer.h | 6 +++--- sw/spatzBenchmarks/sa-gemv/kernel/sa-gemv.c | 4 ++-- sw/spatzBenchmarks/sa-gemv/kernel/sa-gemv.h | 6 +++--- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/sw/spatzBenchmarks/sa-gemv/data/layer.h b/sw/spatzBenchmarks/sa-gemv/data/layer.h index 0a07ad53..62da8ce6 100644 --- a/sw/spatzBenchmarks/sa-gemv/data/layer.h +++ b/sw/spatzBenchmarks/sa-gemv/data/layer.h @@ -9,8 +9,8 @@ typedef enum { FP64 = 8, FP32 = 4, FP16 = 2, FP8 = 1 } precision_t; typedef struct gemv_layer_struct { - uint32_t M; - uint32_t N; + uint32_t M; + uint32_t N; - precision_t dtype; + precision_t dtype; } gemv_layer; diff --git a/sw/spatzBenchmarks/sa-gemv/kernel/sa-gemv.c b/sw/spatzBenchmarks/sa-gemv/kernel/sa-gemv.c index 733cd484..31973656 100644 --- a/sw/spatzBenchmarks/sa-gemv/kernel/sa-gemv.c +++ b/sw/spatzBenchmarks/sa-gemv/kernel/sa-gemv.c @@ -102,11 +102,11 @@ void gemv_v32b_m4(float *a, float *b, float *c, int M, int M_core, int N) { asm volatile("vle32.v v16, (%0)" ::"r"(c_)); asm volatile("vfadd.vv v4, v4, v16"); - asm volatile("vse32.v v12, (%0)" ::"r"(c_)); // wait, mapping v4 to v12? No, use v4. 
+ asm volatile("vse32.v v12, (%0)" ::"r"(c_)); // Correction: // asm volatile("vse32.v v4, (%0)" ::"r"(c_)); // Let's rewrite this block safely: - asm volatile("vse32.v v4, (%0)" ::"r"(c_)); // Fixed register writeback + asm volatile("vse32.v v4, (%0)" ::"r"(c_)); avl -= vl; c_ += vl; diff --git a/sw/spatzBenchmarks/sa-gemv/kernel/sa-gemv.h b/sw/spatzBenchmarks/sa-gemv/kernel/sa-gemv.h index 1bf377e1..1b172260 100644 --- a/sw/spatzBenchmarks/sa-gemv/kernel/sa-gemv.h +++ b/sw/spatzBenchmarks/sa-gemv/kernel/sa-gemv.h @@ -19,8 +19,8 @@ #ifndef _GEMV_H #define _GEMV_H -void gemv_v64b_m4(double *a, double* b, double* c, int M, int M_core, int N); -void gemv_v32b_m4(float *a, float* b, float* c, int M, int M_core, int N); -void gemv_v16b_m4(__fp16 *a, __fp16* b, __fp16* c, int M, int M_core, int N); +void gemv_v64b_m4(double *a, double *b, double *c, int M, int M_core, int N); +void gemv_v32b_m4(float *a, float *b, float *c, int M, int M_core, int N); +void gemv_v16b_m4(__fp16 *a, __fp16 *b, __fp16 *c, int M, int M_core, int N); #endif