diff --git a/.gitignore b/.gitignore index 475c359..2f9ffe1 100644 --- a/.gitignore +++ b/.gitignore @@ -41,4 +41,7 @@ *.dwo # build files -**/build/ \ No newline at end of file +**/build/ + +# vscode settings +.vscode/ \ No newline at end of file diff --git a/include/.vscode/settings.json b/include/.vscode/settings.json deleted file mode 100644 index 64f45f4..0000000 --- a/include/.vscode/settings.json +++ /dev/null @@ -1,8 +0,0 @@ -{ - "files.associations": { - "array": "cpp", - "string": "cpp", - "string_view": "cpp", - "span": "cpp" - } -} diff --git a/include/sofieBLAS/backends/cpu/sofieBLAS_cpu.hpp b/include/sofieBLAS/backends/cpu/sofieBLAS_cpu.hpp index 263703c..6cfeb1f 100644 --- a/include/sofieBLAS/backends/cpu/sofieBLAS_cpu.hpp +++ b/include/sofieBLAS/backends/cpu/sofieBLAS_cpu.hpp @@ -15,6 +15,8 @@ #endif #include +#include +#include class BlasCpu { public: @@ -36,18 +38,186 @@ class BlasCpu { } } + // C = alpha * op(A) * op(B) + beta * C (no bias, leading dims inferred) + template + inline void matmul(char transa, char transb, unsigned int m, unsigned int n, + unsigned int k, float alpha, + alpaka::BufCpu, TIdx> const &A, + alpaka::BufCpu, TIdx> const &B, + float beta, + alpaka::BufCpu, TIdx> &C) { + int lda = (transa == 'N' || transa == 'n') ? static_cast(m) + : static_cast(k); + int ldb = (transb == 'N' || transb == 'n') ? static_cast(k) + : static_cast(n); + cblas_sgemm(CblasColMajor, charToTranspose(transa), charToTranspose(transb), + static_cast(m), static_cast(n), static_cast(k), + alpha, alpaka::getPtrNative(A), lda, alpaka::getPtrNative(B), + ldb, beta, alpaka::getPtrNative(C), static_cast(m)); + } + + template + inline void matmul(char transa, char transb, unsigned int m, unsigned int n, + unsigned int k, float alpha, + alpaka::ViewPlainPtr, TIdx> const &A, + alpaka::ViewPlainPtr, TIdx> const &B, + float beta, + alpaka::ViewPlainPtr, TIdx> &C) { + int lda = (transa == 'N' || transa == 'n') ? static_cast(m) + : static_cast(k); + int ldb = (transb == 'N' || transb == 'n') ? static_cast(k) + : static_cast(n); + cblas_sgemm(CblasColMajor, charToTranspose(transa), charToTranspose(transb), + static_cast(m), static_cast(n), static_cast(k), + alpha, alpaka::getPtrNative(A), lda, alpaka::getPtrNative(B), + ldb, beta, alpaka::getPtrNative(C), static_cast(m)); + } + + // C = alpha * op(A) * op(B) + beta * bias + bias_vec (bias_vec broadcast per + // row) Matches the cuBLASLt EPILOGUE_BIAS semantics: the bias buffer serves + // as both the beta-scaled accumulator and provides the per-row bias vector + // (first m elements). + template + inline void + gemm(char transa, char transb, unsigned int m, unsigned int n, unsigned int k, + float alpha, alpaka::BufCpu, TIdx> const &A, + alpaka::BufCpu, TIdx> const &B, float beta, + alpaka::BufCpu, TIdx> &bias, + alpaka::BufCpu, TIdx> &C) { + int lda = (transa == 'N' || transa == 'n') ? static_cast(m) + : static_cast(k); + int ldb = (transb == 'N' || transb == 'n') ? static_cast(k) + : static_cast(n); + // Step 1: C = alpha * op(A) * op(B) (beta=0 so C is fully overwritten) + cblas_sgemm(CblasColMajor, charToTranspose(transa), charToTranspose(transb), + static_cast(m), static_cast(n), static_cast(k), + alpha, alpaka::getPtrNative(A), lda, alpaka::getPtrNative(B), + ldb, 0.0f, alpaka::getPtrNative(C), static_cast(m)); + // Step 2: C += beta * bias_matrix + bias_vec (per-row broadcast) + float *c = alpaka::getPtrNative(C); + const float *b = alpaka::getPtrNative(bias); + for (unsigned int j = 0; j < n; ++j) + for (unsigned int i = 0; i < m; ++i) + c[j * m + i] += beta * b[j * m + i] + b[i]; + } + template inline void - gemm(char transa, char transb, const unsigned int m, const unsigned int n, - const unsigned int k, const float alpha, - alpaka::BufCpu, TIdx> const &A, const int lda, - alpaka::BufCpu, TIdx> const &B, const int ldb, - const float beta, alpaka::BufCpu, TIdx> &C, - const int ldc) { - CBLAS_TRANSPOSE TransA = charToTranspose(transa); - CBLAS_TRANSPOSE TransB = charToTranspose(transb); - cblas_sgemm(CblasColMajor, TransA, TransB, m, n, k, alpha, A.data(), lda, - B.data(), ldb, beta, C.data(), ldc); + gemm(char transa, char transb, unsigned int m, unsigned int n, unsigned int k, + float alpha, + alpaka::ViewPlainPtr, TIdx> const &A, + alpaka::ViewPlainPtr, TIdx> const &B, + float beta, + alpaka::ViewPlainPtr, TIdx> &bias, + alpaka::ViewPlainPtr, TIdx> &C) { + int lda = (transa == 'N' || transa == 'n') ? static_cast(m) + : static_cast(k); + int ldb = (transb == 'N' || transb == 'n') ? static_cast(k) + : static_cast(n); + cblas_sgemm(CblasColMajor, charToTranspose(transa), charToTranspose(transb), + static_cast(m), static_cast(n), static_cast(k), + alpha, alpaka::getPtrNative(A), lda, alpaka::getPtrNative(B), + ldb, 0.0f, alpaka::getPtrNative(C), static_cast(m)); + float *c = alpaka::getPtrNative(C); + const float *b = alpaka::getPtrNative(bias); + for (unsigned int j = 0; j < n; ++j) + for (unsigned int i = 0; i < m; ++i) + c[j * m + i] += beta * b[j * m + i] + b[i]; + } + + // C = relu(alpha * op(A) * op(B) + beta * bias + bias_vec) + template + inline void gemmrelu(char transa, char transb, unsigned int m, unsigned int n, + unsigned int k, float alpha, + alpaka::BufCpu, TIdx> const &A, + alpaka::BufCpu, TIdx> const &B, + float beta, + alpaka::BufCpu, TIdx> &bias, + alpaka::BufCpu, TIdx> &C) { + gemm(transa, transb, m, n, k, alpha, A, B, beta, bias, C); + float *c = alpaka::getPtrNative(C); + for (unsigned int i = 0; i < m * n; ++i) + c[i] = c[i] > 0.0f ? c[i] : 0.0f; + } + + template + inline void gemmrelu(char transa, char transb, unsigned int m, unsigned int n, + unsigned int k, float alpha, + alpaka::ViewPlainPtr, TIdx> const &A, + alpaka::ViewPlainPtr, TIdx> const &B, + float beta, + alpaka::ViewPlainPtr, TIdx> &bias, + alpaka::ViewPlainPtr, TIdx> &C) { + gemm(transa, transb, m, n, k, alpha, A, B, beta, bias, C); + float *c = alpaka::getPtrNative(C); + for (unsigned int i = 0; i < m * n; ++i) + c[i] = c[i] > 0.0f ? c[i] : 0.0f; + } + + // C = gelu(alpha * op(A) * op(B) + beta * bias + bias_vec) + // Uses the standard GELU: x * 0.5 * (1 + erf(x / sqrt(2))) + template + inline void gemmgelu(char transa, char transb, unsigned int m, unsigned int n, + unsigned int k, float alpha, + alpaka::BufCpu, TIdx> const &A, + alpaka::BufCpu, TIdx> const &B, + float beta, + alpaka::BufCpu, TIdx> &bias, + alpaka::BufCpu, TIdx> &C) { + gemm(transa, transb, m, n, k, alpha, A, B, beta, bias, C); + float *c = alpaka::getPtrNative(C); + constexpr float kInvSqrt2 = 0.7071067811865476f; + for (unsigned int i = 0; i < m * n; ++i) + c[i] *= 0.5f * (1.0f + std::erff(c[i] * kInvSqrt2)); + } + + template + inline void gemmgelu(char transa, char transb, unsigned int m, unsigned int n, + unsigned int k, float alpha, + alpaka::ViewPlainPtr, TIdx> const &A, + alpaka::ViewPlainPtr, TIdx> const &B, + float beta, + alpaka::ViewPlainPtr, TIdx> &bias, + alpaka::ViewPlainPtr, TIdx> &C) { + gemm(transa, transb, m, n, k, alpha, A, B, beta, bias, C); + float *c = alpaka::getPtrNative(C); + constexpr float kInvSqrt2 = 0.7071067811865476f; + for (unsigned int i = 0; i < m * n; ++i) + c[i] *= 0.5f * (1.0f + std::erff(c[i] * kInvSqrt2)); + } + + // Raw-pointer overloads: accept T const*/T* from any BufXxx or ViewPlainPtr via getPtrNative() + template + inline void gemm(char transa, char transb, unsigned int m, unsigned int n, + unsigned int k, float alpha, T const *A, T const *B, + float beta, T *bias, T *C) { + int lda = (transa == 'N' || transa == 'n') ? static_cast(m) : static_cast(k); + int ldb = (transb == 'N' || transb == 'n') ? static_cast(k) : static_cast(n); + cblas_sgemm(CblasColMajor, charToTranspose(transa), charToTranspose(transb), + static_cast(m), static_cast(n), static_cast(k), + alpha, A, lda, B, ldb, 0.0f, C, static_cast(m)); + for (unsigned int j = 0; j < n; ++j) + for (unsigned int i = 0; i < m; ++i) + C[j * m + i] += beta * bias[j * m + i] + bias[i]; + } + + template + inline void gemmrelu(char transa, char transb, unsigned int m, unsigned int n, + unsigned int k, float alpha, T const *A, T const *B, + float beta, T *bias, T *C) { + gemm(transa, transb, m, n, k, alpha, A, B, beta, bias, C); + for (unsigned int i = 0; i < m * n; ++i) + C[i] = C[i] > 0.0f ? C[i] : 0.0f; + } + + template + inline void gemmgelu(char transa, char transb, unsigned int m, unsigned int n, + unsigned int k, float alpha, T const *A, T const *B, + float beta, T *bias, T *C) { + gemm(transa, transb, m, n, k, alpha, A, B, beta, bias, C); + constexpr float kInvSqrt2 = 0.7071067811865476f; + for (unsigned int i = 0; i < m * n; ++i) + C[i] *= 0.5f * (1.0f + std::erff(C[i] * kInvSqrt2)); } }; diff --git a/include/sofieBLAS/backends/cuda/sofieBLAS_cublas.hpp b/include/sofieBLAS/backends/cuda/sofieBLAS_cublas.hpp index cbf18e0..33baadd 100644 --- a/include/sofieBLAS/backends/cuda/sofieBLAS_cublas.hpp +++ b/include/sofieBLAS/backends/cuda/sofieBLAS_cublas.hpp @@ -15,7 +15,7 @@ #include #define CHECK_CUDA(err) \ - if (err != cudaSuccess) { \ + if ((err) != cudaSuccess) { \ std::cerr << "CUDA error: " << cudaGetErrorString(err) << " at line " \ << __LINE__ << "\n"; \ exit(EXIT_FAILURE); \ @@ -23,10 +23,9 @@ #define CHECK_CUBLAS(status) \ do { \ - cublasStatus_t s = (status); \ - if (s != CUBLAS_STATUS_SUCCESS) { \ - std::cerr << "cuBLAS error " << s << " at line " << __LINE__ \ - << std::endl; \ + cublasStatus_t _s = (status); \ + if (_s != CUBLAS_STATUS_SUCCESS) { \ + std::cerr << "cuBLAS error " << _s << " at line " << __LINE__ << "\n"; \ exit(EXIT_FAILURE); \ } \ } while (0) @@ -49,33 +48,23 @@ struct PairEq { class BlasCuda { cublasLtHandle_t ltHandle = nullptr; - cublasHandle_t handle = nullptr; - cublasLtMatmulDesc_t operationDesc = nullptr; cublasLtMatmulPreference_t preference = nullptr; void *d_workspace = nullptr; - size_t workspaceSize = 1 << 22; // 4MB + size_t workspaceSize = 1 << 22; // 4 MB cudaStream_t stream = nullptr; - cublasLtMatmulHeuristicResult_t heuristic; - cublasLtEpilogue_t epilogue = CUBLASLT_EPILOGUE_DEFAULT; - int error_flag = 0; - std::unordered_map, cublasLtMatrixLayout_t, PairHash, PairEq> - LayoutStore; + layoutStore; public: - BlasCuda(const BlasCuda&) = delete; - BlasCuda& operator=(const BlasCuda&) = delete; - BlasCuda(BlasCuda&&) = delete; - BlasCuda& operator=(BlasCuda&&) = delete; + BlasCuda(const BlasCuda &) = delete; + BlasCuda &operator=(const BlasCuda &) = delete; + BlasCuda(BlasCuda &&) = delete; + BlasCuda &operator=(BlasCuda &&) = delete; BlasCuda(alpaka::QueueCudaRtNonBlocking &queue) : m_queue{queue} { stream = static_cast(m_queue.getNativeHandle()); CHECK_CUBLAS(cublasLtCreate(<Handle)); - CHECK_CUBLAS(cublasCreate(&handle)); - heuristic = {}; - CHECK_CUBLAS(cublasLtMatmulDescCreate(&operationDesc, CUBLAS_COMPUTE_32F, - CUDA_R_32F)); CHECK_CUBLAS(cublasLtMatmulPreferenceCreate(&preference)); CHECK_CUDA(cudaMalloc(&d_workspace, workspaceSize)); CHECK_CUBLAS(cublasLtMatmulPreferenceSetAttribute( @@ -84,22 +73,15 @@ class BlasCuda { } ~BlasCuda() { - for (auto& [key, layout] : LayoutStore) { - if (layout) { + for (auto &[key, layout] : layoutStore) + if (layout) cublasLtMatrixLayoutDestroy(layout); - } - } - LayoutStore.clear(); - if (preference) cublasLtMatmulPreferenceDestroy(preference); - if (operationDesc) - cublasLtMatmulDescDestroy(operationDesc); if (ltHandle) cublasLtDestroy(ltHandle); if (d_workspace) cudaFree(d_workspace); - } inline cublasOperation_t charToCuBlasTranspose(char trans) { @@ -118,231 +100,290 @@ class BlasCuda { } } - void AddLayoutConfig(std::size_t m, std::size_t n, std::size_t k) { - CheckAndAddLayout(k, m); - CheckAndAddLayout(k, n); - CheckAndAddLayout(m, n); + // Register matrix layouts for a given (m, n, k, lda, ldb, ldc, transa, + // transb). Must be called before gemm/gemmrelu/gemmgelu/matmul for each + // unique combination. + void addLayoutConfig(std::size_t m, std::size_t n, std::size_t k, + std::size_t lda, std::size_t ldb, std::size_t ldc, + char transa, char transb) { + // Physical A: (m×k) if NoTrans, (k×m) if Trans + if (transa == 'N' || transa == 'n') + checkAndAddLayout(m, k, lda); + else + checkAndAddLayout(k, m, lda); + // Physical B: (k×n) if NoTrans, (n×k) if Trans + if (transb == 'N' || transb == 'n') + checkAndAddLayout(k, n, ldb); + else + checkAndAddLayout(n, k, ldb); + // C is always (m×n) + checkAndAddLayout(m, n, ldc); } -template -inline void -gemm(char transa, char transb, const unsigned int m, - const unsigned int n, const unsigned int k, - const float alpha, - alpaka::BufCudaRt, TIdx> const &A, - alpaka::BufCudaRt, TIdx> const &B, - const float beta, - alpaka::BufCudaRt, TIdx> &bias, - alpaka::BufCudaRt, TIdx> &C) -{ - cublasLtMatmulDesc_t localDesc = nullptr; - CHECK_CUBLAS(cublasLtMatmulDescCreate(&localDesc, CUBLAS_COMPUTE_32F, CUDA_R_32F)); - - cublasOperation_t transB_op = charToCuBlasTranspose(transb); - CHECK_CUBLAS(cublasLtMatmulDescSetAttribute( - localDesc, CUBLASLT_MATMUL_DESC_TRANSB, &transB_op, sizeof(transB_op))); - - cublasOperation_t transA_op = charToCuBlasTranspose(transa); - CHECK_CUBLAS(cublasLtMatmulDescSetAttribute( - localDesc, CUBLASLT_MATMUL_DESC_TRANSA, &transA_op, sizeof(transA_op))); - - void *bias_ptr = reinterpret_cast(alpaka::getPtrNative(bias)); - CHECK_CUBLAS(cublasLtMatmulDescSetAttribute( - localDesc, CUBLASLT_MATMUL_DESC_BIAS_POINTER, &bias_ptr, sizeof(bias_ptr))); - - cublasLtEpilogue_t ep = CUBLASLT_EPILOGUE_BIAS; - CHECK_CUBLAS(cublasLtMatmulDescSetAttribute( - localDesc, - CUBLASLT_MATMUL_DESC_EPILOGUE, - &ep, - sizeof(ep))); - - - cublasLtMatmulHeuristicResult_t localHeuristic{}; - int returnedResults = 0; - CHECK_CUBLAS(cublasLtMatmulAlgoGetHeuristic( - ltHandle, - localDesc, - LayoutStore.at({k, m}), - LayoutStore.at({k, n}), - LayoutStore.at({m, n}), - LayoutStore.at({m, n}), - preference, - 1, - &localHeuristic, - &returnedResults)); - - if (returnedResults == 0) { - cublasLtMatmulDescDestroy(localDesc); - std::cerr << "No suitable cuBLASLt algorithm found!\n"; - exit(EXIT_FAILURE); - } - - CHECK_CUBLAS(cublasLtMatmul( - ltHandle, - localDesc, - &alpha, - alpaka::getPtrNative(A), LayoutStore.at({k, m}), - alpaka::getPtrNative(B), LayoutStore.at({k, n}), - &beta, - alpaka::getPtrNative(bias), LayoutStore.at({m, n}), - alpaka::getPtrNative(C), LayoutStore.at({m, n}), - &(localHeuristic.algo), - d_workspace, - workspaceSize, - stream)); - - cudaDeviceSynchronize(); - CHECK_CUBLAS(cublasLtMatmulDescDestroy(localDesc)); -} - -template -inline void -gemmrelu(char transa, char transb, const unsigned int m, - const unsigned int n, const unsigned int k, - const float alpha, - alpaka::BufCudaRt, TIdx> const &A, - alpaka::BufCudaRt, TIdx> const &B, - const float beta, - alpaka::BufCudaRt, TIdx> &bias, - alpaka::BufCudaRt, TIdx> &C) -{ - cublasLtMatmulDesc_t localDesc = nullptr; - CHECK_CUBLAS(cublasLtMatmulDescCreate(&localDesc, CUBLAS_COMPUTE_32F, CUDA_R_32F)); - - cublasOperation_t transB_op = charToCuBlasTranspose(transb); - CHECK_CUBLAS(cublasLtMatmulDescSetAttribute( - localDesc, CUBLASLT_MATMUL_DESC_TRANSB, &transB_op, sizeof(transB_op))); - - cublasOperation_t transA_op = charToCuBlasTranspose(transa); - CHECK_CUBLAS(cublasLtMatmulDescSetAttribute( - localDesc, CUBLASLT_MATMUL_DESC_TRANSA, &transA_op, sizeof(transA_op))); - - void *bias_ptr = reinterpret_cast(alpaka::getPtrNative(bias)); - CHECK_CUBLAS(cublasLtMatmulDescSetAttribute( - localDesc, CUBLASLT_MATMUL_DESC_BIAS_POINTER, &bias_ptr, sizeof(bias_ptr))); + // C = alpha * op(A) * op(B) + beta * bias + bias_vec (bias_vec broadcast per + // row) + template + inline void + gemm(char transa, char transb, unsigned int m, unsigned int n, unsigned int k, + float alpha, alpaka::BufCudaRt, TIdx> const &A, + alpaka::BufCudaRt, TIdx> const &B, float beta, + alpaka::BufCudaRt, TIdx> &bias, + alpaka::BufCudaRt, TIdx> &C) { + const void *bptr = alpaka::getPtrNative(bias); + auto desc = + makeDesc(charToCuBlasTranspose(transa), charToCuBlasTranspose(transb), + CUBLASLT_EPILOGUE_BIAS, bptr); + executeMatmul(desc, alpha, alpaka::getPtrNative(A), alpaka::getPtrNative(B), + beta, alpaka::getPtrNative(bias), alpaka::getPtrNative(C), + layoutKeyA(transa, m, k), layoutKeyB(transb, k, n), {m, n}); + } - cublasLtEpilogue_t ep = CUBLASLT_EPILOGUE_RELU_BIAS; - CHECK_CUBLAS(cublasLtMatmulDescSetAttribute( - localDesc, CUBLASLT_MATMUL_DESC_EPILOGUE, &ep, sizeof(ep))); + template + inline void + gemm(char transa, char transb, unsigned int m, unsigned int n, unsigned int k, + float alpha, + alpaka::ViewPlainPtr, TIdx> const &A, + alpaka::ViewPlainPtr, TIdx> const &B, + float beta, + alpaka::ViewPlainPtr, TIdx> &bias, + alpaka::ViewPlainPtr, TIdx> &C) { + const void *bptr = alpaka::getPtrNative(bias); + auto desc = + makeDesc(charToCuBlasTranspose(transa), charToCuBlasTranspose(transb), + CUBLASLT_EPILOGUE_BIAS, bptr); + executeMatmul(desc, alpha, alpaka::getPtrNative(A), alpaka::getPtrNative(B), + beta, alpaka::getPtrNative(bias), alpaka::getPtrNative(C), + layoutKeyA(transa, m, k), layoutKeyB(transb, k, n), {m, n}); + } - cublasLtMatmulHeuristicResult_t localHeuristic{}; - CHECK_CUBLAS(cublasLtMatmulAlgoGetHeuristic( - ltHandle, - localDesc, - LayoutStore.at({k, m}), - LayoutStore.at({k, n}), - LayoutStore.at({m, n}), - LayoutStore.at({m, n}), - preference, - 1, - &localHeuristic, - &error_flag)); - - if (error_flag == 0) { - cublasLtMatmulDescDestroy(localDesc); - std::cerr << "No suitable cuBLASLt algorithm found!\n"; - exit(EXIT_FAILURE); - } + // C = relu(alpha * op(A) * op(B) + beta * bias + bias_vec) + template + inline void gemmrelu(char transa, char transb, unsigned int m, unsigned int n, + unsigned int k, float alpha, + alpaka::BufCudaRt, TIdx> const &A, + alpaka::BufCudaRt, TIdx> const &B, + float beta, + alpaka::BufCudaRt, TIdx> &bias, + alpaka::BufCudaRt, TIdx> &C) { + const void *bptr = alpaka::getPtrNative(bias); + auto desc = + makeDesc(charToCuBlasTranspose(transa), charToCuBlasTranspose(transb), + CUBLASLT_EPILOGUE_RELU_BIAS, bptr); + executeMatmul(desc, alpha, alpaka::getPtrNative(A), alpaka::getPtrNative(B), + beta, alpaka::getPtrNative(bias), alpaka::getPtrNative(C), + layoutKeyA(transa, m, k), layoutKeyB(transb, k, n), {m, n}); + } - CHECK_CUBLAS(cublasLtMatmul( - ltHandle, - localDesc, - &alpha, - alpaka::getPtrNative(A), LayoutStore.at({k, m}), - alpaka::getPtrNative(B), LayoutStore.at({k, n}), - &beta, - alpaka::getPtrNative(bias), LayoutStore.at({m, n}), - alpaka::getPtrNative(C), LayoutStore.at({m, n}), - &(localHeuristic.algo), - d_workspace, - workspaceSize, - stream)); - - cudaDeviceSynchronize(); - CHECK_CUBLAS(cublasLtMatmulDescDestroy(localDesc)); -} + template + inline void gemmrelu(char transa, char transb, unsigned int m, unsigned int n, + unsigned int k, float alpha, + alpaka::ViewPlainPtr, TIdx> const &A, + alpaka::ViewPlainPtr, TIdx> const &B, + float beta, + alpaka::ViewPlainPtr, TIdx> &bias, + alpaka::ViewPlainPtr, TIdx> &C) { + const void *bptr = alpaka::getPtrNative(bias); + auto desc = + makeDesc(charToCuBlasTranspose(transa), charToCuBlasTranspose(transb), + CUBLASLT_EPILOGUE_RELU_BIAS, bptr); + executeMatmul(desc, alpha, alpaka::getPtrNative(A), alpaka::getPtrNative(B), + beta, alpaka::getPtrNative(bias), alpaka::getPtrNative(C), + layoutKeyA(transa, m, k), layoutKeyB(transb, k, n), {m, n}); + } + // C = gelu(alpha * op(A) * op(B) + beta * bias + bias_vec) template - inline void gemmgelu(char transa, char transb, const unsigned int m, - const unsigned int n, const unsigned int k, - const float alpha, + inline void gemmgelu(char transa, char transb, unsigned int m, unsigned int n, + unsigned int k, float alpha, alpaka::BufCudaRt, TIdx> const &A, alpaka::BufCudaRt, TIdx> const &B, - const float beta, + float beta, alpaka::BufCudaRt, TIdx> &bias, alpaka::BufCudaRt, TIdx> &C) { + const void *bptr = alpaka::getPtrNative(bias); + auto desc = + makeDesc(charToCuBlasTranspose(transa), charToCuBlasTranspose(transb), + CUBLASLT_EPILOGUE_GELU_BIAS, bptr); + executeMatmul(desc, alpha, alpaka::getPtrNative(A), alpaka::getPtrNative(B), + beta, alpaka::getPtrNative(bias), alpaka::getPtrNative(C), + layoutKeyA(transa, m, k), layoutKeyB(transb, k, n), {m, n}); + } - cublasLtMatmulDesc_t localDesc = nullptr; - CHECK_CUBLAS(cublasLtMatmulDescCreate(&localDesc, CUBLAS_COMPUTE_32F, CUDA_R_32F)); - - void *bias_ptr = reinterpret_cast(alpaka::getPtrNative(bias)); - CHECK_CUBLAS(cublasLtMatmulDescSetAttribute( - localDesc, CUBLASLT_MATMUL_DESC_BIAS_POINTER, &bias_ptr, - sizeof(bias_ptr))); + template + inline void gemmgelu(char transa, char transb, unsigned int m, unsigned int n, + unsigned int k, float alpha, + alpaka::ViewPlainPtr, TIdx> const &A, + alpaka::ViewPlainPtr, TIdx> const &B, + float beta, + alpaka::ViewPlainPtr, TIdx> &bias, + alpaka::ViewPlainPtr, TIdx> &C) { + const void *bptr = alpaka::getPtrNative(bias); + auto desc = + makeDesc(charToCuBlasTranspose(transa), charToCuBlasTranspose(transb), + CUBLASLT_EPILOGUE_GELU_BIAS, bptr); + executeMatmul(desc, alpha, alpaka::getPtrNative(A), alpaka::getPtrNative(B), + beta, alpaka::getPtrNative(bias), alpaka::getPtrNative(C), + layoutKeyA(transa, m, k), layoutKeyB(transb, k, n), {m, n}); + } - cublasOperation_t transB = charToCuBlasTranspose(transb); - CHECK_CUBLAS(cublasLtMatmulDescSetAttribute( - localDesc, CUBLASLT_MATMUL_DESC_TRANSB, &transB, sizeof(transB))); + // Raw-pointer overloads: accept T const*/T* from any BufXxx or ViewPlainPtr via getPtrNative() + template + inline void gemm(char transa, char transb, unsigned int m, unsigned int n, + unsigned int k, float alpha, T const *A, T const *B, + float beta, T *bias, T *C) { + const void *bptr = bias; + auto desc = makeDesc(charToCuBlasTranspose(transa), charToCuBlasTranspose(transb), + CUBLASLT_EPILOGUE_BIAS, bptr); + executeMatmul(desc, alpha, A, B, beta, bias, C, + layoutKeyA(transa, m, k), layoutKeyB(transb, k, n), {m, n}); + } - cublasOperation_t transA = charToCuBlasTranspose(transa); - CHECK_CUBLAS(cublasLtMatmulDescSetAttribute( - localDesc, CUBLASLT_MATMUL_DESC_TRANSA, &transA, sizeof(transA))); - SetGeluActivation(); + template + inline void gemmrelu(char transa, char transb, unsigned int m, unsigned int n, + unsigned int k, float alpha, T const *A, T const *B, + float beta, T *bias, T *C) { + const void *bptr = bias; + auto desc = makeDesc(charToCuBlasTranspose(transa), charToCuBlasTranspose(transb), + CUBLASLT_EPILOGUE_RELU_BIAS, bptr); + executeMatmul(desc, alpha, A, B, beta, bias, C, + layoutKeyA(transa, m, k), layoutKeyB(transb, k, n), {m, n}); + } - cublasLtMatmulHeuristicResult_t localHeuristic{}; - CHECK_CUBLAS(cublasLtMatmulAlgoGetHeuristic( - ltHandle, localDesc, - LayoutStore.at({k, m}), - LayoutStore.at({k, n}), - LayoutStore.at({m, n}), - LayoutStore.at({m, n}), - preference, 1, &localHeuristic, &error_flag)); - if (error_flag == 0) { - std::cerr << "No suitable cuBLASLt algorithm found!\n"; - exit(EXIT_FAILURE); - } + template + inline void gemmgelu(char transa, char transb, unsigned int m, unsigned int n, + unsigned int k, float alpha, T const *A, T const *B, + float beta, T *bias, T *C) { + const void *bptr = bias; + auto desc = makeDesc(charToCuBlasTranspose(transa), charToCuBlasTranspose(transb), + CUBLASLT_EPILOGUE_GELU_BIAS, bptr); + executeMatmul(desc, alpha, A, B, beta, bias, C, + layoutKeyA(transa, m, k), layoutKeyB(transb, k, n), {m, n}); + } - CHECK_CUBLAS(cublasLtMatmul( - ltHandle, localDesc, &alpha, alpaka::getPtrNative(A), LayoutStore.at({k, m}), - alpaka::getPtrNative(B), LayoutStore.at({k, n}), &beta, alpaka::getPtrNative(bias), LayoutStore.at({m, n}), - alpaka::getPtrNative(C), LayoutStore.at({m, n}), &(localHeuristic.algo), d_workspace, - workspaceSize, stream)); + // C = alpha * op(A) * op(B) + beta * C (no bias) + template + inline void matmul(char transa, char transb, unsigned int m, unsigned int n, + unsigned int k, float alpha, + alpaka::BufCudaRt, TIdx> const &A, + alpaka::BufCudaRt, TIdx> const &B, + float beta, + alpaka::BufCudaRt, TIdx> &C) { + auto desc = + makeDesc(charToCuBlasTranspose(transa), charToCuBlasTranspose(transb), + CUBLASLT_EPILOGUE_DEFAULT); + float *c = alpaka::getPtrNative(C); + executeMatmul(desc, alpha, alpaka::getPtrNative(A), alpaka::getPtrNative(B), + beta, c, c, layoutKeyA(transa, m, k), + layoutKeyB(transb, k, n), {m, n}); + } + + template + inline void matmul(char transa, char transb, unsigned int m, unsigned int n, + unsigned int k, float alpha, + alpaka::ViewPlainPtr, TIdx> const &A, + alpaka::ViewPlainPtr, TIdx> const &B, + float beta, + alpaka::ViewPlainPtr, TIdx> &C) { + auto desc = + makeDesc(charToCuBlasTranspose(transa), charToCuBlasTranspose(transb), + CUBLASLT_EPILOGUE_DEFAULT); + T *c = alpaka::getPtrNative(C); + executeMatmul(desc, alpha, alpaka::getPtrNative(A), alpaka::getPtrNative(B), + beta, c, c, layoutKeyA(transa, m, k), + layoutKeyB(transb, k, n), {m, n}); + } + + // matmul on raw pointers directly (no layout caching, caller must ensure + // correct leading dims and layout) + template + inline void matmul(char transa, char transb, unsigned int m, unsigned int n, + unsigned int k, float alpha, T const * A, T const * B, + float beta, T * C) { + auto desc = + makeDesc(charToCuBlasTranspose(transa), charToCuBlasTranspose(transb), + CUBLASLT_EPILOGUE_DEFAULT); + executeMatmul(desc, alpha, A, B, beta, C, C, layoutKeyA(transa, m, k), + layoutKeyB(transb, k, n), {m, n}); } private: alpaka::QueueCudaRtNonBlocking m_queue; - void CheckAndAddLayout(size_t rows, size_t cols) { + // Returns the layout map key for matrix A based on its transpose flag. + // Physical dimensions: NoTrans → (m×k), Trans → (k×m). + static std::pair + layoutKeyA(char trans, std::size_t m, std::size_t k) { + return (trans == 'N' || trans == 'n') ? std::make_pair(m, k) + : std::make_pair(k, m); + } + + // Returns the layout map key for matrix B based on its transpose flag. + // Physical dimensions: NoTrans → (k×n), Trans → (n×k). + static std::pair + layoutKeyB(char trans, std::size_t k, std::size_t n) { + return (trans == 'N' || trans == 'n') ? std::make_pair(k, n) + : std::make_pair(n, k); + } + + void checkAndAddLayout(std::size_t rows, std::size_t cols, std::size_t ld) { auto key = std::make_pair(rows, cols); - if (LayoutStore.find(key) == LayoutStore.end()) { - cublasLtMatrixLayout_t temp = nullptr; - size_t ld = rows; + if (layoutStore.find(key) == layoutStore.end()) { + cublasLtMatrixLayout_t layout = nullptr; CHECK_CUBLAS( - cublasLtMatrixLayoutCreate(&temp, CUDA_R_32F, rows, cols, ld)); - LayoutStore.emplace(key, temp); + cublasLtMatrixLayoutCreate(&layout, CUDA_R_32F, rows, cols, ld)); + layoutStore.emplace(key, layout); } } - void ResetActivation() { - epilogue = CUBLASLT_EPILOGUE_BIAS; - CHECK_CUBLAS(cublasLtMatmulDescSetAttribute(operationDesc, - CUBLASLT_MATMUL_DESC_EPILOGUE, - &epilogue, sizeof(epilogue))); - } - - void SetReluActivation() { - epilogue = CUBLASLT_EPILOGUE_RELU_BIAS; - CHECK_CUBLAS(cublasLtMatmulDescSetAttribute(operationDesc, - CUBLASLT_MATMUL_DESC_EPILOGUE, - &epilogue, sizeof(epilogue))); + // Creates a per-call matmul descriptor with transpose ops, epilogue, and + // optional bias pointer. Caller owns the returned descriptor. + cublasLtMatmulDesc_t makeDesc(cublasOperation_t transA, + cublasOperation_t transB, + cublasLtEpilogue_t epilogue, + const void *bias_ptr = nullptr) { + cublasLtMatmulDesc_t desc = nullptr; + CHECK_CUBLAS( + cublasLtMatmulDescCreate(&desc, CUBLAS_COMPUTE_32F, CUDA_R_32F)); + CHECK_CUBLAS(cublasLtMatmulDescSetAttribute( + desc, CUBLASLT_MATMUL_DESC_TRANSA, &transA, sizeof(transA))); + CHECK_CUBLAS(cublasLtMatmulDescSetAttribute( + desc, CUBLASLT_MATMUL_DESC_TRANSB, &transB, sizeof(transB))); + CHECK_CUBLAS(cublasLtMatmulDescSetAttribute( + desc, CUBLASLT_MATMUL_DESC_EPILOGUE, &epilogue, sizeof(epilogue))); + if (bias_ptr) { + CHECK_CUBLAS(cublasLtMatmulDescSetAttribute( + desc, CUBLASLT_MATMUL_DESC_BIAS_POINTER, &bias_ptr, + sizeof(bias_ptr))); + } + return desc; } - void SetGeluActivation() { - epilogue = CUBLASLT_EPILOGUE_GELU; - CHECK_CUBLAS(cublasLtMatmulDescSetAttribute(operationDesc, - CUBLASLT_MATMUL_DESC_EPILOGUE, - &epilogue, sizeof(epilogue))); + // Runs heuristic selection, executes cublasLtMatmul, syncs stream, and + // destroys desc. D_in is the matrix scaled by beta (may equal C_out for + // in-place accumulation). + void executeMatmul(cublasLtMatmulDesc_t desc, float alpha, const float *A, + const float *B, float beta, const float *D_in, + float *C_out, + const std::pair &kA, + const std::pair &kB, + const std::pair &kC) { + cublasLtMatmulHeuristicResult_t h{}; + int returnedResults = 0; + CHECK_CUBLAS(cublasLtMatmulAlgoGetHeuristic( + ltHandle, desc, layoutStore.at(kA), layoutStore.at(kB), + layoutStore.at(kC), layoutStore.at(kC), preference, 1, &h, + &returnedResults)); + if (returnedResults == 0) { + cublasLtMatmulDescDestroy(desc); + std::cerr << "No suitable cuBLASLt algorithm found!\n"; + exit(EXIT_FAILURE); + } + CHECK_CUBLAS(cublasLtMatmul(ltHandle, desc, &alpha, A, layoutStore.at(kA), + B, layoutStore.at(kB), &beta, D_in, + layoutStore.at(kC), C_out, layoutStore.at(kC), + &h.algo, d_workspace, workspaceSize, stream)); + CHECK_CUDA(cudaStreamSynchronize(stream)); + CHECK_CUBLAS(cublasLtMatmulDescDestroy(desc)); } }; diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 2ebded2..0a4aa01 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -19,10 +19,15 @@ endif() # --- Compiler flags --- set(CXXFLAGS -O2 -g -DALPAKA_HAS_STD_ATOMIC_REF) set(CXX_HOST_FLAGS -fPIC -pthread) -set(CUDA_ARCH "sm_86") -set(CXX_CUDA_FLAGS -arch=${CUDA_ARCH} -Wno-deprecated-gpu-targets --extended-lambda --expt-relaxed-constexpr) +set(CXX_CUDA_FLAGS -Wno-deprecated-gpu-targets --extended-lambda --expt-relaxed-constexpr) set(XCOMPILER_FLAGS -Xcompiler=-fPIC,-pthread) +# --- CUDA architecture: must be set before enable_language(CUDA) so all +# targets inherit a valid default (CMake 3.18+ requires CUDA_ARCHITECTURES +# to be non-empty on every target once CUDA is enabled globally). +set(CMAKE_CUDA_ARCHITECTURES 86) +enable_language(CUDA) + # --- Include directories --- include_directories(${ALPAKA_BASE}/include "../include") @@ -86,12 +91,9 @@ target_include_directories(test_cpu PRIVATE "../sofieBLAS/include" ${ALPAKA_BASE target_link_libraries(test_cpu PRIVATE ${BLAS_LIBS}) -set(TEST_SRC test.cc) - add_executable(test_cuda) target_sources(test_cuda PRIVATE test.cc) set_source_files_properties(test.cc PROPERTIES LANGUAGE CUDA) -enable_language(CUDA) set_target_properties(test_cuda PROPERTIES CUDA_SEPARABLE_COMPILATION ON) target_compile_features(test_cuda PUBLIC cxx_std_20) @@ -104,7 +106,7 @@ target_compile_options(test_cuda PRIVATE target_compile_definitions(test_cuda PRIVATE ALPAKA_ACC_GPU_CUDA_ENABLED) target_include_directories(test_cuda PRIVATE ${ALPAKA_BASE}/include "../sofieBLAS/include" ${CUDA_BASE}/include) target_link_directories(test_cuda PRIVATE ${CUDA_BASE}/lib64) -target_link_libraries(test_cuda PRIVATE cublas cudart) +target_link_libraries(test_cuda PRIVATE cublasLt cublas cudart) # --- clean target equivalent --- add_custom_target(clean-all diff --git a/tests/test.cc b/tests/test.cc index 5812bc9..e106275 100644 --- a/tests/test.cc +++ b/tests/test.cc @@ -1,121 +1,591 @@ #include "sofieBLAS/sofieBLAS.hpp" #include +#include +#include #include #include -#include +#include +#include -// index and size type using Idx = uint32_t; - -// dimensions -using Dim0D = alpaka::DimInt<0u>; using Dim1D = alpaka::DimInt<1u>; -using Dim2D = alpaka::DimInt<2u>; -using Dim3D = alpaka::DimInt<3u>; - -// Print a column-major matrix -template -void print(alpaka::BufCpu const &M, TIdx size) { - assert(alpaka::getExtentProduct(M) == size * size); - - for (TIdx row = 0; row < size; ++row) { - for (TIdx col = 0; col < size; ++col) { - std::cout << std::fixed << std::setprecision(2) << std::setw(7) - << M[col * size + row] << " "; + +// --------------------------------------------------------------------------- +// Reference implementations (column-major, float) +// --------------------------------------------------------------------------- + +// Access element (row, col) of a column-major matrix with leading dim `ld`. +static inline float cm(const float *M, int row, int col, int ld) { + return M[col * ld + row]; +} + +// C = alpha * op(A) * op(B) + beta * C (in-place, column-major) +static void refMatmul(float *C, const float *A, const float *B, int m, int n, + int k, float alpha, float beta, bool transA, + bool transB) { + int lda = transA ? k : m; + int ldb = transB ? n : k; + for (int j = 0; j < n; ++j) { + for (int i = 0; i < m; ++i) { + float sum = 0.f; + for (int p = 0; p < k; ++p) { + float a = transA ? cm(A, p, i, lda) : cm(A, i, p, lda); + float b = transB ? cm(B, j, p, ldb) : cm(B, p, j, ldb); + sum += a * b; + } + C[j * m + i] = alpha * sum + beta * C[j * m + i]; } - std::cout << "\n"; } } -int main() { - constexpr Idx size = 4; +// C = alpha * op(A) * op(B) + beta * bias_matrix + bias_vec (per-row broadcast) +static void refGemm(float *C, const float *A, const float *B, const float *bias, + int m, int n, int k, float alpha, float beta, bool transA, + bool transB) { + int lda = transA ? k : m; + int ldb = transB ? n : k; + for (int j = 0; j < n; ++j) { + for (int i = 0; i < m; ++i) { + float sum = 0.f; + for (int p = 0; p < k; ++p) { + float a = transA ? cm(A, p, i, lda) : cm(A, i, p, lda); + float b = transB ? cm(B, j, p, ldb) : cm(B, p, j, ldb); + sum += a * b; + } + C[j * m + i] = alpha * sum + beta * bias[j * m + i] + bias[i]; + } + } +} - // Host platform and device - alpaka::PlatformCpu host_platform{}; - auto host = alpaka::getDevByIdx(host_platform, 0u); +static void refGemmRelu(float *C, const float *A, const float *B, + const float *bias, int m, int n, int k, float alpha, + float beta, bool transA, bool transB) { + refGemm(C, A, B, bias, m, n, k, alpha, beta, transA, transB); + for (int i = 0; i < m * n; ++i) + C[i] = C[i] > 0.f ? C[i] : 0.f; +} - // Allocate matrices (column-major) - auto A = alpaka::allocBuf(host, size * size); - auto B = alpaka::allocBuf(host, size * size); - auto C = alpaka::allocBuf(host, size * size); +static void refGemmGelu(float *C, const float *A, const float *B, + const float *bias, int m, int n, int k, float alpha, + float beta, bool transA, bool transB) { + refGemm(C, A, B, bias, m, n, k, alpha, beta, transA, transB); + constexpr float kInvSqrt2 = 0.7071067811865476f; + for (int i = 0; i < m * n; ++i) + C[i] *= 0.5f * (1.f + std::erff(C[i] * kInvSqrt2)); +} - // Fill A and B with random floats centered around 0 - std::random_device rd; - std::mt19937 gen(rd()); - std::normal_distribution dist(0.0f, 10.0f); +static int gFailures = 0; - for (int i = 0; i < size * size; ++i) { - A[i] = dist(gen); - B[i] = dist(gen); +static void checkClose(const float *got, const float *expected, int n, + const std::string &name, float rtol = 1e-4f, + float atol = 1e-4f) { + bool pass = true; + for (int i = 0; i < n; ++i) { + float diff = std::abs(got[i] - expected[i]); + float thr = atol + rtol * std::abs(expected[i]); + if (diff > thr) { + std::cerr << " FAIL [" << name << "] idx=" << i << " got=" << got[i] + << " expected=" << expected[i] << " diff=" << diff << "\n"; + pass = false; + } } - std::cout << "Matrix A:\n"; - print(A, size); - std::cout << '\n'; - std::cout << "Matrix B:\n"; - print(B, size); - std::cout << '\n'; + if (pass) + std::cout << " PASS " << name << "\n"; + else + ++gFailures; +} -#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED +// --------------------------------------------------------------------------- +// Helpers to fill test matrices +// --------------------------------------------------------------------------- + +static void fillSeq(float *M, int n, float start = 1.f, float step = 1.f) { + for (int i = 0; i < n; ++i) + M[i] = start + static_cast(i) * step; +} + +static void fillVal(float *M, int n, float v) { + for (int i = 0; i < n; ++i) + M[i] = v; +} + +// --------------------------------------------------------------------------- +// CPU tests +// --------------------------------------------------------------------------- + +#ifdef ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED + +static void runCpuTests() { + std::cout << "\n=== CPU Tests ===\n"; + + alpaka::PlatformCpu platform{}; + auto dev = alpaka::getDevByIdx(platform, 0u); + alpaka::Queue queue{dev}; + sofieBLAS blas(queue); + + constexpr int M = 4, N = 3, K = 5; + + // Allocate host buffers + auto hA = alpaka::allocBuf(dev, static_cast(M * K)); + auto hB = alpaka::allocBuf(dev, static_cast(K * N)); + auto hC = alpaka::allocBuf(dev, static_cast(M * N)); + auto hBias = alpaka::allocBuf(dev, static_cast(M * N)); + + float *A = alpaka::getPtrNative(hA); + float *B = alpaka::getPtrNative(hB); + float *C = alpaka::getPtrNative(hC); + float *bias = alpaka::getPtrNative(hBias); + + fillSeq(A, M * K); + fillSeq(B, K * N, 1.f, 0.5f); + fillSeq(bias, M * N, 0.1f, 0.1f); + + std::vector ref(M * N); + + // --- matmul NN --- + fillVal(C, M * N, 0.f); + blas.matmul('N', 'N', M, N, K, 1.f, hA, hB, 0.f, hC); + std::copy(C, C + M * N, ref.data()); + std::fill(ref.begin(), ref.end(), 0.f); + refMatmul(ref.data(), A, B, M, N, K, 1.f, 0.f, false, false); + checkClose(C, ref.data(), M * N, "cpu::matmul NN"); + + // --- matmul TN (A^T: K×M physical → M×K logical) --- + // For TN we need A to be K×M so we create a separate buffer + { + auto hAt = alpaka::allocBuf(dev, static_cast(K * M)); + float *At = alpaka::getPtrNative(hAt); + fillSeq(At, K * M); + fillVal(C, M * N, 0.f); + blas.matmul('T', 'N', M, N, K, 1.f, hAt, hB, 0.f, hC); + std::fill(ref.begin(), ref.end(), 0.f); + refMatmul(ref.data(), At, B, M, N, K, 1.f, 0.f, true, false); + checkClose(C, ref.data(), M * N, "cpu::matmul TN"); + } + + // --- matmul NT --- + { + auto hBt = alpaka::allocBuf(dev, static_cast(N * K)); + float *Bt = alpaka::getPtrNative(hBt); + fillSeq(Bt, N * K, 1.f, 0.5f); + fillVal(C, M * N, 0.f); + blas.matmul('N', 'T', M, N, K, 1.f, hA, hBt, 0.f, hC); + std::fill(ref.begin(), ref.end(), 0.f); + refMatmul(ref.data(), A, Bt, M, N, K, 1.f, 0.f, false, true); + checkClose(C, ref.data(), M * N, "cpu::matmul NT"); + } + + // --- matmul TT --- { - alpaka::PlatformCudaRt platform; - alpaka::DevCudaRt device = alpaka::getDevByIdx(platform, 0u); - alpaka::Queue queue{device}; + auto hAt = alpaka::allocBuf(dev, static_cast(K * M)); + auto hBt = alpaka::allocBuf(dev, static_cast(N * K)); + float *At = alpaka::getPtrNative(hAt); + float *Bt = alpaka::getPtrNative(hBt); + fillSeq(At, K * M); + fillSeq(Bt, N * K, 1.f, 0.5f); + fillVal(C, M * N, 0.f); + blas.matmul('T', 'T', M, N, K, 1.f, hAt, hBt, 0.f, hC); + std::fill(ref.begin(), ref.end(), 0.f); + refMatmul(ref.data(), At, Bt, M, N, K, 1.f, 0.f, true, true); + checkClose(C, ref.data(), M * N, "cpu::matmul TT"); + } - const Idx m = size; // rows of A and C - const Idx n = size; // columns of B and C - const Idx k = size; // columns of A and rows of B + // --- matmul: alpha scaling --- + fillVal(C, M * N, 0.f); + blas.matmul('N', 'N', M, N, K, 2.5f, hA, hB, 0.f, hC); + std::fill(ref.begin(), ref.end(), 0.f); + refMatmul(ref.data(), A, B, M, N, K, 2.5f, 0.f, false, false); + checkClose(C, ref.data(), M * N, "cpu::matmul alpha=2.5"); - const float alpha = 1.0f; - const float beta = 0.0f; + // --- matmul: beta accumulation --- + fillSeq(C, M * N, 10.f); // pre-fill C + blas.matmul('N', 'N', M, N, K, 1.f, hA, hB, 0.5f, hC); + { + std::vector C0(M * N); + fillSeq(C0.data(), M * N, 10.f); + refMatmul(ref.data(), A, B, M, N, K, 1.f, 0.5f, false, false); + // ref already has the pre-filled values baked in via refMatmul beta path + // but refMatmul reads C in-place, so we need to re-run with correct init + std::copy(C0.begin(), C0.end(), ref.data()); + refMatmul(ref.data(), A, B, M, N, K, 1.f, 0.5f, false, false); + } + checkClose(C, ref.data(), M * N, "cpu::matmul beta=0.5"); - const Idx lda = size; // leading dimension of A - const Idx ldb = size; // leading dimension of B - const Idx ldc = size; + // --- gemm NN (beta=0, no prior accumulation) --- + fillVal(C, M * N, 0.f); + fillSeq(bias, M * N, 0.1f, 0.1f); + blas.gemm('N', 'N', M, N, K, 1.f, hA, hB, 0.f, hBias, hC); + std::fill(ref.begin(), ref.end(), 0.f); + refGemm(ref.data(), A, B, bias, M, N, K, 1.f, 0.f, false, false); + checkClose(C, ref.data(), M * N, "cpu::gemm NN beta=0"); - auto A_d = alpaka::allocAsyncBuf(queue, size * size); - auto B_d = alpaka::allocAsyncBuf(queue, size * size); - auto C_d = alpaka::allocAsyncBuf(queue, size * size); - alpaka::memcpy(queue, A_d, A); - alpaka::memcpy(queue, B_d, B); + // --- gemm NN (beta=1 accumulation) --- + fillVal(C, M * N, 0.f); + blas.gemm('N', 'N', M, N, K, 1.f, hA, hB, 1.f, hBias, hC); + std::fill(ref.begin(), ref.end(), 0.f); + refGemm(ref.data(), A, B, bias, M, N, K, 1.f, 1.f, false, false); + checkClose(C, ref.data(), M * N, "cpu::gemm NN beta=1"); - sofieBLAS blas(queue); - blas.gemm('n', 'n', m, n, k, alpha, A_d, lda, B_d, ldb, beta, C_d, ldc); - alpaka::memcpy(queue, C, C_d); + // --- gemm TN --- + { + auto hAt = alpaka::allocBuf(dev, static_cast(K * M)); + float *At = alpaka::getPtrNative(hAt); + fillSeq(At, K * M); + fillVal(C, M * N, 0.f); + blas.gemm('T', 'N', M, N, K, 1.f, hAt, hB, 0.f, hBias, hC); + std::fill(ref.begin(), ref.end(), 0.f); + refGemm(ref.data(), At, B, bias, M, N, K, 1.f, 0.f, true, false); + checkClose(C, ref.data(), M * N, "cpu::gemm TN"); + } + // --- gemmrelu: all-positive matmul result stays unchanged --- + { + // A and B with positive values ensure result is positive before bias + auto hAp = alpaka::allocBuf(dev, static_cast(M * K)); + auto hBp = alpaka::allocBuf(dev, static_cast(K * N)); + auto hBiasp = alpaka::allocBuf(dev, static_cast(M * N)); + float *Ap = alpaka::getPtrNative(hAp); + float *Bp = alpaka::getPtrNative(hBp); + float *biasp = alpaka::getPtrNative(hBiasp); + fillSeq(Ap, M * K, 0.1f, 0.1f); + fillSeq(Bp, K * N, 0.1f, 0.1f); + fillVal(biasp, M * N, 0.f); + fillVal(C, M * N, 0.f); + blas.gemmrelu('N', 'N', M, N, K, 1.f, hAp, hBp, 0.f, hBiasp, hC); + std::fill(ref.begin(), ref.end(), 0.f); + refGemmRelu(ref.data(), Ap, Bp, biasp, M, N, K, 1.f, 0.f, false, false); + checkClose(C, ref.data(), M * N, "cpu::gemmrelu all-positive"); + } + + // --- gemmrelu: negative values clamped to zero --- + { + // Use alpha=-1 to force negative results + auto hBiasz = alpaka::allocBuf(dev, static_cast(M * N)); + fillVal(alpaka::getPtrNative(hBiasz), M * N, 0.f); + fillVal(C, M * N, 0.f); + blas.gemmrelu('N', 'N', M, N, K, -1.f, hA, hB, 0.f, hBiasz, hC); + std::fill(ref.begin(), ref.end(), 0.f); + refGemmRelu(ref.data(), A, B, alpaka::getPtrNative(hBiasz), M, N, K, -1.f, + 0.f, false, false); + checkClose(C, ref.data(), M * N, "cpu::gemmrelu alpha=-1 (clamped)"); + } + + // --- gemmrelu with bias --- + fillVal(C, M * N, 0.f); + fillSeq(bias, M * N, -5.f, 2.f); // mixed sign bias + blas.gemmrelu('N', 'N', M, N, K, 1.f, hA, hB, 0.f, hBias, hC); + std::fill(ref.begin(), ref.end(), 0.f); + refGemmRelu(ref.data(), A, B, bias, M, N, K, 1.f, 0.f, false, false); + checkClose(C, ref.data(), M * N, "cpu::gemmrelu with mixed bias"); + + // --- gemmgelu NN --- + fillVal(C, M * N, 0.f); + fillVal(bias, M * N, 0.f); + blas.gemmgelu('N', 'N', M, N, K, 1.f, hA, hB, 0.f, hBias, hC); + std::fill(ref.begin(), ref.end(), 0.f); + refGemmGelu(ref.data(), A, B, bias, M, N, K, 1.f, 0.f, false, false); + checkClose(C, ref.data(), M * N, "cpu::gemmgelu NN"); + + // --- gemmgelu with bias --- + fillVal(C, M * N, 0.f); + fillSeq(bias, M * N, -2.f, 0.5f); + blas.gemmgelu('N', 'N', M, N, K, 1.f, hA, hB, 0.f, hBias, hC); + std::fill(ref.begin(), ref.end(), 0.f); + refGemmGelu(ref.data(), A, B, bias, M, N, K, 1.f, 0.f, false, false); + checkClose(C, ref.data(), M * N, "cpu::gemmgelu with bias"); + + // --- gemmgelu TN --- + { + auto hAt = alpaka::allocBuf(dev, static_cast(K * M)); + float *At = alpaka::getPtrNative(hAt); + fillSeq(At, K * M); + fillVal(C, M * N, 0.f); + fillVal(bias, M * N, 0.f); + blas.gemmgelu('T', 'N', M, N, K, 1.f, hAt, hB, 0.f, hBias, hC); + std::fill(ref.begin(), ref.end(), 0.f); + refGemmGelu(ref.data(), At, B, bias, M, N, K, 1.f, 0.f, true, false); + checkClose(C, ref.data(), M * N, "cpu::gemmgelu TN"); + } + + // --- edge: zero matrix --- + { + auto hZ = alpaka::allocBuf(dev, static_cast(M * K)); + fillVal(alpaka::getPtrNative(hZ), M * K, 0.f); + fillVal(C, M * N, 99.f); + fillVal(bias, M * N, 0.f); + blas.matmul('N', 'N', M, N, K, 1.f, hZ, hB, 0.f, hC); + std::fill(ref.begin(), ref.end(), 0.f); + checkClose(C, ref.data(), M * N, "cpu::matmul zero-A"); + } + + // --- edge: identity-like (square, known result) --- + { + constexpr int S = 3; + auto hI = alpaka::allocBuf(dev, static_cast(S * S)); + auto hX = alpaka::allocBuf(dev, static_cast(S * S)); + auto hY = alpaka::allocBuf(dev, static_cast(S * S)); + float *I = alpaka::getPtrNative(hI); + float *X = alpaka::getPtrNative(hX); + float *Y = alpaka::getPtrNative(hY); + fillVal(I, S * S, 0.f); + for (int i = 0; i < S; ++i) + I[i * S + i] = 1.f; + fillSeq(X, S * S); + fillVal(Y, S * S, 0.f); + blas.matmul('N', 'N', S, S, S, 1.f, hI, hX, 0.f, hY); + checkClose(Y, X, S * S, "cpu::matmul identity×X=X"); + } +} + +#endif // ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED + +// --------------------------------------------------------------------------- +// CUDA tests +// --------------------------------------------------------------------------- + +#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED + +// Helper: infer packed column-major leading dimensions +static int ldaFor(char trans, int m, int k) { + return (trans == 'N' || trans == 'n') ? m : k; +} +static int ldbFor(char trans, int k, int n) { + return (trans == 'N' || trans == 'n') ? k : n; +} + +static void runCudaTests() { + std::cout << "\n=== CUDA Tests ===\n"; + + alpaka::PlatformCudaRt platform{}; + auto dev = alpaka::getDevByIdx(platform, 0u); + alpaka::Queue queue{dev}; + sofieBLAS blas(queue); + + alpaka::PlatformCpu hostPlatform{}; + auto hostDev = alpaka::getDevByIdx(hostPlatform, 0u); + + constexpr int M = 4, N = 3, K = 5; + + auto hA = alpaka::allocBuf(hostDev, static_cast(M * K)); + auto hB = alpaka::allocBuf(hostDev, static_cast(K * N)); + auto hC = alpaka::allocBuf(hostDev, static_cast(M * N)); + auto hBias = alpaka::allocBuf(hostDev, static_cast(M * N)); + + float *A = alpaka::getPtrNative(hA); + float *B = alpaka::getPtrNative(hB); + float *bias = alpaka::getPtrNative(hBias); + + fillSeq(A, M * K); + fillSeq(B, K * N, 1.f, 0.5f); + fillVal(bias, M * N, 0.f); + + auto dA = alpaka::allocAsyncBuf(queue, static_cast(M * K)); + auto dB = alpaka::allocAsyncBuf(queue, static_cast(K * N)); + auto dC = alpaka::allocAsyncBuf(queue, static_cast(M * N)); + auto dBias = + alpaka::allocAsyncBuf(queue, static_cast(M * N)); + + alpaka::memcpy(queue, dA, hA); + alpaka::memcpy(queue, dB, hB); + alpaka::memcpy(queue, dBias, hBias); + alpaka::wait(queue); + + std::vector ref(M * N); + float *C = alpaka::getPtrNative(hC); + + auto verify = [&](const std::string &name) { + alpaka::memcpy(queue, hC, dC); + alpaka::wait(queue); + checkClose(C, ref.data(), M * N, name); + }; + + // ---- matmul NN ---- + blas.addLayoutConfig(M, N, K, ldaFor('N', M, K), ldbFor('N', K, N), M, 'N', + 'N'); + std::fill(ref.begin(), ref.end(), 0.f); + refMatmul(ref.data(), A, B, M, N, K, 1.f, 0.f, false, false); + blas.matmul('N', 'N', M, N, K, 1.f, dA, dB, 0.f, dC); + verify("cuda::matmul NN"); + + // ---- matmul TN ---- + { + auto hAt = alpaka::allocBuf(hostDev, static_cast(K * M)); + float *At = alpaka::getPtrNative(hAt); + fillSeq(At, K * M); + auto dAt = + alpaka::allocAsyncBuf(queue, static_cast(K * M)); + alpaka::memcpy(queue, dAt, hAt); alpaka::wait(queue); - std::cout << "CUDA Matrix C = A × B:\n"; - print(C, size); - std::cout << '\n'; + blas.addLayoutConfig(M, N, K, ldaFor('T', M, K), ldbFor('N', K, N), M, 'T', + 'N'); + std::fill(ref.begin(), ref.end(), 0.f); + refMatmul(ref.data(), At, B, M, N, K, 1.f, 0.f, true, false); + blas.matmul('T', 'N', M, N, K, 1.f, dAt, dB, 0.f, dC); + verify("cuda::matmul TN"); } -#endif -#ifdef ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED + // ---- matmul NT ---- { - alpaka::PlatformCpu platform; - alpaka::DevCpu device = alpaka::getDevByIdx(platform, 0u); - alpaka::Queue queue{device}; + auto hBt = alpaka::allocBuf(hostDev, static_cast(N * K)); + float *Bt = alpaka::getPtrNative(hBt); + fillSeq(Bt, N * K, 1.f, 0.5f); + auto dBt = + alpaka::allocAsyncBuf(queue, static_cast(N * K)); + alpaka::memcpy(queue, dBt, hBt); + alpaka::wait(queue); + blas.addLayoutConfig(M, N, K, ldaFor('N', M, K), ldbFor('T', K, N), M, 'N', + 'T'); + std::fill(ref.begin(), ref.end(), 0.f); + refMatmul(ref.data(), A, Bt, M, N, K, 1.f, 0.f, false, true); + blas.matmul('N', 'T', M, N, K, 1.f, dA, dBt, 0.f, dC); + verify("cuda::matmul NT"); + } + + // ---- matmul alpha=2.5 ---- + std::fill(ref.begin(), ref.end(), 0.f); + refMatmul(ref.data(), A, B, M, N, K, 2.5f, 0.f, false, false); + blas.matmul('N', 'N', M, N, K, 2.5f, dA, dB, 0.f, dC); + verify("cuda::matmul alpha=2.5"); - const Idx m = size; // rows of A and C - const Idx n = size; // columns of B and C - const Idx k = size; // columns of A and rows of B + // ---- gemm NN beta=0 ---- + fillSeq(bias, M * N, 0.1f, 0.1f); + alpaka::memcpy(queue, dBias, hBias); + alpaka::wait(queue); + std::fill(ref.begin(), ref.end(), 0.f); + refGemm(ref.data(), A, B, bias, M, N, K, 1.f, 0.f, false, false); + blas.gemm('N', 'N', M, N, K, 1.f, dA, dB, 0.f, dBias, dC); + verify("cuda::gemm NN beta=0"); - const float alpha = 1.0f; - const float beta = 0.0f; + // ---- gemm NN beta=1 ---- + // D_in = bias, so result = A*B + 1*bias_matrix + bias_vec + std::fill(ref.begin(), ref.end(), 0.f); + refGemm(ref.data(), A, B, bias, M, N, K, 1.f, 1.f, false, false); + blas.gemm('N', 'N', M, N, K, 1.f, dA, dB, 1.f, dBias, dC); + verify("cuda::gemm NN beta=1"); - const Idx lda = size; // leading dimension of A - const Idx ldb = size; // leading dimension of B - const Idx ldc = size; // leading dimension of C + // ---- gemm TN ---- + { + auto hAt = alpaka::allocBuf(hostDev, static_cast(K * M)); + float *At = alpaka::getPtrNative(hAt); + fillSeq(At, K * M); + auto dAt = + alpaka::allocAsyncBuf(queue, static_cast(K * M)); + alpaka::memcpy(queue, dAt, hAt); + alpaka::wait(queue); + blas.addLayoutConfig(M, N, K, ldaFor('T', M, K), ldbFor('N', K, N), M, 'T', + 'N'); + std::fill(ref.begin(), ref.end(), 0.f); + refGemm(ref.data(), At, B, bias, M, N, K, 1.f, 0.f, true, false); + blas.gemm('T', 'N', M, N, K, 1.f, dAt, dB, 0.f, dBias, dC); + verify("cuda::gemm TN"); + } - sofieBLAS blas(queue); - blas.gemm('n', 'n', m, n, k, alpha, A, lda, B, ldb, beta, C, ldc); + // ---- gemmrelu: all-positive (relu is identity) ---- + { + auto hAp = alpaka::allocBuf(hostDev, static_cast(M * K)); + auto hBp = alpaka::allocBuf(hostDev, static_cast(K * N)); + auto hBiasz = + alpaka::allocBuf(hostDev, static_cast(M * N)); + float *Ap = alpaka::getPtrNative(hAp); + float *Bp = alpaka::getPtrNative(hBp); + fillSeq(Ap, M * K, 0.1f, 0.1f); + fillSeq(Bp, K * N, 0.1f, 0.1f); + fillVal(alpaka::getPtrNative(hBiasz), M * N, 0.f); + auto dAp = + alpaka::allocAsyncBuf(queue, static_cast(M * K)); + auto dBp = + alpaka::allocAsyncBuf(queue, static_cast(K * N)); + auto dBiasz = + alpaka::allocAsyncBuf(queue, static_cast(M * N)); + alpaka::memcpy(queue, dAp, hAp); + alpaka::memcpy(queue, dBp, hBp); + alpaka::memcpy(queue, dBiasz, hBiasz); + alpaka::wait(queue); + blas.addLayoutConfig(M, N, K, M, K, M, 'N', 'N'); + std::fill(ref.begin(), ref.end(), 0.f); + refGemmRelu(ref.data(), Ap, Bp, alpaka::getPtrNative(hBiasz), M, N, K, 1.f, + 0.f, false, false); + blas.gemmrelu('N', 'N', M, N, K, 1.f, dAp, dBp, 0.f, dBiasz, dC); + verify("cuda::gemmrelu all-positive"); + } + // ---- gemmrelu: alpha=-1 forces negatives -> clamped to zero ---- + { + auto hBiasz = + alpaka::allocBuf(hostDev, static_cast(M * N)); + fillVal(alpaka::getPtrNative(hBiasz), M * N, 0.f); + auto dBiasz = + alpaka::allocAsyncBuf(queue, static_cast(M * N)); + alpaka::memcpy(queue, dBiasz, hBiasz); alpaka::wait(queue); - std::cout << "CPU Matrix C = A × B:\n"; - print(C, size); - std::cout << '\n'; + std::fill(ref.begin(), ref.end(), 0.f); + refGemmRelu(ref.data(), A, B, alpaka::getPtrNative(hBiasz), M, N, K, -1.f, + 0.f, false, false); + blas.gemmrelu('N', 'N', M, N, K, -1.f, dA, dB, 0.f, dBiasz, dC); + verify("cuda::gemmrelu alpha=-1 (clamped)"); } + + // ---- gemmrelu with mixed bias ---- + fillSeq(bias, M * N, -5.f, 2.f); + alpaka::memcpy(queue, dBias, hBias); + alpaka::wait(queue); + std::fill(ref.begin(), ref.end(), 0.f); + refGemmRelu(ref.data(), A, B, bias, M, N, K, 1.f, 0.f, false, false); + blas.gemmrelu('N', 'N', M, N, K, 1.f, dA, dB, 0.f, dBias, dC); + verify("cuda::gemmrelu with mixed bias"); + + // ---- gemmgelu NN ---- + fillVal(bias, M * N, 0.f); + alpaka::memcpy(queue, dBias, hBias); + alpaka::wait(queue); + std::fill(ref.begin(), ref.end(), 0.f); + refGemmGelu(ref.data(), A, B, bias, M, N, K, 1.f, 0.f, false, false); + blas.gemmgelu('N', 'N', M, N, K, 1.f, dA, dB, 0.f, dBias, dC); + verify("cuda::gemmgelu NN"); + + // ---- gemmgelu with bias ---- + fillSeq(bias, M * N, -2.f, 0.5f); + alpaka::memcpy(queue, dBias, hBias); + alpaka::wait(queue); + std::fill(ref.begin(), ref.end(), 0.f); + refGemmGelu(ref.data(), A, B, bias, M, N, K, 1.f, 0.f, false, false); + blas.gemmgelu('N', 'N', M, N, K, 1.f, dA, dB, 0.f, dBias, dC); + verify("cuda::gemmgelu with bias"); + + // ---- edge: zero A ---- + { + auto hZero = alpaka::allocBuf(hostDev, static_cast(M * K)); + fillVal(alpaka::getPtrNative(hZero), M * K, 0.f); + auto dZero = + alpaka::allocAsyncBuf(queue, static_cast(M * K)); + alpaka::memcpy(queue, dZero, hZero); + alpaka::wait(queue); + std::fill(ref.begin(), ref.end(), 0.f); + blas.matmul('N', 'N', M, N, K, 1.f, dZero, dB, 0.f, dC); + verify("cuda::matmul zero-A"); + } +} + +#endif // ALPAKA_ACC_GPU_CUDA_ENABLED + +// --------------------------------------------------------------------------- +// main +// --------------------------------------------------------------------------- + +int main() { +#ifdef ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED + runCpuTests(); #endif +#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED + runCudaTests(); +#endif + + std::cout << "\n"; + if (gFailures == 0) + std::cout << "All tests passed.\n"; + else + std::cout << gFailures << " test(s) FAILED.\n"; - return 0; + return gFailures > 0 ? EXIT_FAILURE : EXIT_SUCCESS; }