From d7b13fec900ad621bee91d4e7b5e1019ace09558 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 19 Mar 2026 08:25:53 +0100 Subject: [PATCH 1/6] Provide a default GEMM_DIVIDE_LIMIT and add it to DYNAMIC_ARCH --- common_param.h | 1 + param.h | 4 ++++ 2 files changed, 5 insertions(+) diff --git a/common_param.h b/common_param.h index 9e5edbb816..108d593f9f 100644 --- a/common_param.h +++ b/common_param.h @@ -47,6 +47,7 @@ typedef struct { int dtb_entries; int switch_ratio; + int divide_limit; int offsetA, offsetB, align; #if BUILD_HFLOAT16 == 1 int shgemm_p, shgemm_q, shgemm_r; diff --git a/param.h b/param.h index 7e4a04501b..ca12cb630f 100644 --- a/param.h +++ b/param.h @@ -4260,6 +4260,10 @@ Until then, just keep it different than DGEMM_DEFAULT_UNROLL_N to keep copy rout #define SWITCH_RATIO 2 #endif +#ifndef GEMM_DIVIDE_LIMIT +#define GEMM_DIVIDE_LIMIT 0 +#endif + #ifndef QGEMM_DEFAULT_UNROLL_M #define QGEMM_DEFAULT_UNROLL_M 2 #endif From 8f5e49556fb9e1a22cf42528260f52e390c470eb Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 19 Mar 2026 08:26:33 +0100 Subject: [PATCH 2/6] Add GEMM_DIVIDE_LIMIT to parameters --- kernel/setparam-ref.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/setparam-ref.c b/kernel/setparam-ref.c index 51981c6253..88e95b830b 100644 --- a/kernel/setparam-ref.c +++ b/kernel/setparam-ref.c @@ -54,6 +54,8 @@ gotoblas_t TABLE_NAME = { SWITCH_RATIO, + GEMM_DIVIDE_LIMIT, + GEMM_DEFAULT_OFFSET_A, GEMM_DEFAULT_OFFSET_B, GEMM_DEFAULT_ALIGN, #ifdef BUILD_HFLOAT16 From b7601ea92f6e950670610da2f6d5b0c0211bec3b Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 19 Mar 2026 08:29:15 +0100 Subject: [PATCH 3/6] Retrieve cpu-specific GEMM_DIVIDE_LIMIT if DYNAMIC_ARCH --- driver/level3/gemm.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/driver/level3/gemm.c b/driver/level3/gemm.c index e37d86c28d..99320bab58 100644 --- a/driver/level3/gemm.c +++ b/driver/level3/gemm.c @@ -63,6 +63,10 @@ #define DIVIDE_RATE GEMM_DIVIDE_RATE #endif +#ifdef DYNAMIC_ARCH +#define GEMM_DIVIDE_LIMIT gotoblas->divide_limit +#endif + #ifdef GEMM_DIVIDE_LIMIT #define DIVIDE_LIMIT GEMM_DIVIDE_LIMIT #endif From 6bf687b2ef75e2662bca0631e00d694f260c1b46 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 20 Mar 2026 15:30:53 +0100 Subject: [PATCH 4/6] Make divide_rate and preferred_size available to DYNAMIC_ARCH too --- common_param.h | 2 ++ param.h | 42 ++++++++++++++++++++++++------------------ 2 files changed, 26 insertions(+), 18 deletions(-) diff --git a/common_param.h b/common_param.h index 108d593f9f..3245bae6ef 100644 --- a/common_param.h +++ b/common_param.h @@ -47,7 +47,9 @@ typedef struct { int dtb_entries; int switch_ratio; + int divide_rate; int divide_limit; + int preferred_size; int offsetA, offsetB, align; #if BUILD_HFLOAT16 == 1 int shgemm_p, shgemm_q, shgemm_r; diff --git a/param.h b/param.h index ca12cb630f..4faaebff7c 100644 --- a/param.h +++ b/param.h @@ -630,10 +630,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(XDOUBLE) || defined(DOUBLE) #define SWITCH_RATIO 4 -#define GEMM_PREFERED_SIZE 4 +#define GEMM_PREFERRED_SIZE 4 #else #define SWITCH_RATIO 8 -#define GEMM_PREFERED_SIZE 8 +#define GEMM_PREFERRED_SIZE 8 #endif #ifdef ARCH_X86 @@ -1539,10 +1539,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(XDOUBLE) || defined(DOUBLE) #define SWITCH_RATIO 4 -#define GEMM_PREFERED_SIZE 4 +#define GEMM_PREFERRED_SIZE 4 #else #define SWITCH_RATIO 8 -#define GEMM_PREFERED_SIZE 8 +#define GEMM_PREFERRED_SIZE 8 #endif #ifdef ARCH_X86 @@ -1665,10 +1665,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(XDOUBLE) || defined(DOUBLE) #define SWITCH_RATIO 8 -#define GEMM_PREFERED_SIZE 8 +#define GEMM_PREFERRED_SIZE 8 #else #define SWITCH_RATIO 16 -#define GEMM_PREFERED_SIZE 16 +#define GEMM_PREFERRED_SIZE 16 #endif #define USE_SGEMM_KERNEL_DIRECT 1 @@ -1786,10 +1786,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(XDOUBLE) || defined(DOUBLE) #define SWITCH_RATIO 8 -#define GEMM_PREFERED_SIZE 8 +#define GEMM_PREFERRED_SIZE 8 #else #define SWITCH_RATIO 16 -#define GEMM_PREFERED_SIZE 16 +#define GEMM_PREFERRED_SIZE 16 #endif #define USE_SGEMM_KERNEL_DIRECT 1 @@ -1919,10 +1919,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(XDOUBLE) || defined(DOUBLE) #define SWITCH_RATIO 8 -#define GEMM_PREFERED_SIZE 8 +#define GEMM_PREFERRED_SIZE 8 #else #define SWITCH_RATIO 16 -#define GEMM_PREFERED_SIZE 16 +#define GEMM_PREFERRED_SIZE 16 #endif #define USE_SGEMM_KERNEL_DIRECT 1 @@ -2577,7 +2577,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GEMM_DEFAULT_ALIGN 0x0ffffUL #define SWITCH_RATIO 16 -#define GEMM_PREFERED_SIZE 16 +#define GEMM_PREFERRED_SIZE 16 #define SGEMM_DEFAULT_UNROLL_M 16 #define SGEMM_DEFAULT_UNROLL_N 8 @@ -2616,7 +2616,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GEMM_DEFAULT_ALIGN 0x0ffffUL #define SWITCH_RATIO 16 -#define GEMM_PREFERED_SIZE 16 +#define GEMM_PREFERRED_SIZE 16 #define SGEMM_DEFAULT_UNROLL_M 16 #define SGEMM_DEFAULT_UNROLL_N 8 @@ -3611,10 +3611,10 @@ is a big desktop or server with abundant cache rather than a phone or embedded d #if defined(XDOUBLE) || defined(DOUBLE) #define SWITCH_RATIO 8 -#define GEMM_PREFERED_SIZE 4 +#define GEMM_PREFERRED_SIZE 4 #else #define SWITCH_RATIO 16 -#define GEMM_PREFERED_SIZE 8 +#define GEMM_PREFERRED_SIZE 8 #endif #undef BGEMM_ALIGN_K @@ -3662,8 +3662,6 @@ is a big desktop or server with abundant cache rather than a phone or embedded d #elif defined(NEOVERSEN2) || defined(NEOVERSEV2) -#define GEMM_DIVIDE_LIMIT 3 - #if defined(XDOUBLE) || defined(DOUBLE) #define SWITCH_RATIO 8 #else @@ -3751,9 +3749,9 @@ is a big desktop or server with abundant cache rather than a phone or embedded d #define GEMM_DIVIDE_RATE 1 #if defined(XDOUBLE) || defined(DOUBLE) -#define GEMM_PREFERED_SIZE 8 +#define GEMM_PREFERRED_SIZE 8 #else -#define GEMM_PREFERED_SIZE 16 +#define GEMM_PREFERRED_SIZE 16 #endif /* When all BLAS3 routines are implemeted with SVE, SGEMM_DEFAULT_UNROLL_M should be "sve_vl". @@ -4260,10 +4258,18 @@ Until then, just keep it different than DGEMM_DEFAULT_UNROLL_N to keep copy rout #define SWITCH_RATIO 2 #endif +#ifndef GEMM_DIVIDE_RATE +#define GEMM_DIVIDE_RATE 2 +#endif + #ifndef GEMM_DIVIDE_LIMIT #define GEMM_DIVIDE_LIMIT 0 #endif +#ifndef GEMM_PREFERRED_SIZE +#define GEMM_PREFERRED_SIZE 1 +#endif + #ifndef QGEMM_DEFAULT_UNROLL_M #define QGEMM_DEFAULT_UNROLL_M 2 #endif From 0dd501d794b71343b65ad94a261e4832f10f1b13 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 20 Mar 2026 15:32:06 +0100 Subject: [PATCH 5/6] Add GEMM_DIVIDE_RATE and GEMM_PREFERRED_SIZE to parameters --- kernel/setparam-ref.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/kernel/setparam-ref.c b/kernel/setparam-ref.c index 88e95b830b..044ececd18 100644 --- a/kernel/setparam-ref.c +++ b/kernel/setparam-ref.c @@ -54,8 +54,12 @@ gotoblas_t TABLE_NAME = { SWITCH_RATIO, + GEMM_DIVIDE_RATE, + GEMM_DIVIDE_LIMIT, + GEMM_PREFERRED_SIZE, + GEMM_DEFAULT_OFFSET_A, GEMM_DEFAULT_OFFSET_B, GEMM_DEFAULT_ALIGN, #ifdef BUILD_HFLOAT16 From c9185e91ade0abc91dc84b7015b0f8f963f54c8e Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 20 Mar 2026 15:34:04 +0100 Subject: [PATCH 6/6] Make GEMM_DIVIDE_RATE and GEMM_PREFERRED_SIZE available in DYNAMIC_ARCH builds --- driver/level3/gemm.c | 18 +++++++++++------- driver/level3/level3_gemm3m_thread.c | 5 +++-- driver/level3/level3_syrk_threaded.c | 6 ++++-- driver/level3/level3_thread.c | 17 +++++++++++------ 4 files changed, 29 insertions(+), 17 deletions(-) diff --git a/driver/level3/gemm.c b/driver/level3/gemm.c index 99320bab58..41e7d43e70 100644 --- a/driver/level3/gemm.c +++ b/driver/level3/gemm.c @@ -59,17 +59,21 @@ #define GEMM_Q 128 #endif -#ifdef GEMM_DIVIDE_RATE +#ifdef DYNAMIC_ARCH +#define DIVIDE_LIMIT gotoblas->divide_limit +#define DIVIDE_RATE gotoblas->divide_rate +#else +#define DIVIDE_LIMIT GEMM_DIVIDE_LIMIT #define DIVIDE_RATE GEMM_DIVIDE_RATE #endif -#ifdef DYNAMIC_ARCH -#define GEMM_DIVIDE_LIMIT gotoblas->divide_limit -#endif +//#ifdef GEMM_DIVIDE_RATE +//#define DIVIDE_RATE GEMM_DIVIDE_RATE +//#endif -#ifdef GEMM_DIVIDE_LIMIT -#define DIVIDE_LIMIT GEMM_DIVIDE_LIMIT -#endif +//#ifdef GEMM_DIVIDE_LIMIT +//#define DIVIDE_LIMIT GEMM_DIVIDE_LIMIT +//#endif #ifdef THREADED_LEVEL3 #include "level3_thread.c" diff --git a/driver/level3/level3_gemm3m_thread.c b/driver/level3/level3_gemm3m_thread.c index 26d07fa944..318d7d553e 100644 --- a/driver/level3/level3_gemm3m_thread.c +++ b/driver/level3/level3_gemm3m_thread.c @@ -41,6 +41,7 @@ #define CACHE_LINE_SIZE 8 #endif +#define DIVIDE_RATE_MAX 2 #ifndef DIVIDE_RATE #define DIVIDE_RATE 2 #endif @@ -93,7 +94,7 @@ typedef struct { #else volatile #endif - BLASLONG working[MAX_CPU_NUMBER][CACHE_LINE_SIZE * DIVIDE_RATE]; + BLASLONG working[MAX_CPU_NUMBER][CACHE_LINE_SIZE * DIVIDE_RATE_MAX]; } job_t; @@ -294,7 +295,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *a, *b, *c; job_t *job = (job_t *)args -> common; BLASLONG xxx, bufferside; - FLOAT *buffer[DIVIDE_RATE]; + FLOAT *buffer[DIVIDE_RATE_MAX]; BLASLONG ls, min_l, jjs, min_jj; BLASLONG is, min_i, div_n; diff --git a/driver/level3/level3_syrk_threaded.c b/driver/level3/level3_syrk_threaded.c index 1b656f902d..47f303c1d2 100644 --- a/driver/level3/level3_syrk_threaded.c +++ b/driver/level3/level3_syrk_threaded.c @@ -41,6 +41,8 @@ #define CACHE_LINE_SIZE 8 #endif +#define DIVIDE_RATE_MAX 2 + #ifndef DIVIDE_RATE #define DIVIDE_RATE 2 #endif @@ -69,7 +71,7 @@ _Atomic #else volatile #endif - BLASLONG working[MAX_CPU_NUMBER][CACHE_LINE_SIZE * DIVIDE_RATE]; + BLASLONG working[MAX_CPU_NUMBER][CACHE_LINE_SIZE * DIVIDE_RATE_MAX]; } job_t; @@ -133,7 +135,7 @@ _Atomic static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG mypos){ - FLOAT *buffer[DIVIDE_RATE]; + FLOAT *buffer[DIVIDE_RATE_MAX]; BLASLONG k, lda, ldc; BLASLONG m_from, m_to, n_from, n_to; diff --git a/driver/level3/level3_thread.c b/driver/level3/level3_thread.c index 327dc2d01d..83403aef70 100644 --- a/driver/level3/level3_thread.c +++ b/driver/level3/level3_thread.c @@ -41,12 +41,17 @@ #define CACHE_LINE_SIZE 8 #endif +#define DIVIDE_RATE_MAX 2 + #ifndef DIVIDE_RATE #define DIVIDE_RATE 2 #endif -#ifndef GEMM_PREFERED_SIZE -#define GEMM_PREFERED_SIZE 1 +#ifdef DYNAMIC_ARCH +#define GEMM_PREFERRED_SIZE gotoblas->preferred_size +#endif +#ifndef GEMM_PREFERRED_SIZE +#define GEMM_PREFERRED_SIZE 1 #endif //The array of job_t may overflow the stack. @@ -93,7 +98,7 @@ typedef struct { volatile - BLASLONG working[MAX_CPU_NUMBER][CACHE_LINE_SIZE * DIVIDE_RATE]; + BLASLONG working[MAX_CPU_NUMBER][CACHE_LINE_SIZE * DIVIDE_RATE_MAX]; } job_t; @@ -234,7 +239,7 @@ typedef struct { static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, IFLOAT *sa, IFLOAT *sb, BLASLONG mypos){ - IFLOAT *buffer[DIVIDE_RATE]; + IFLOAT *buffer[DIVIDE_RATE_MAX]; BLASLONG k, lda, ldb, ldc; BLASLONG m_from, m_to, n_from, n_to; @@ -707,7 +712,7 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG while (m > 0){ width = blas_quickdivide(m + nthreads_m - num_parts - 1, nthreads_m - num_parts); - width = round_up(m, width, GEMM_PREFERED_SIZE); + width = round_up(m, width, GEMM_PREFERRED_SIZE); m -= width; @@ -758,7 +763,7 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG if (width < switch_ratio) { width = switch_ratio; } - width = round_up(width_n, width, GEMM_PREFERED_SIZE); + width = round_up(width_n, width, GEMM_PREFERRED_SIZE); width_n -= width; if (width_n < 0) {