From c873e4644c95f77572a3a14fa2663173ddcbb1b6 Mon Sep 17 00:00:00 2001
From: Matthias Diener <matthias.diener@gmail.com>
Date: Tue, 19 May 2026 22:25:50 +0000
Subject: [PATCH 01/13] add production GEMM tests

---
 tests/cpp/operator/CMakeLists.txt        |   1 +
 tests/cpp/operator/test_gemm_prodgemm.cu | 396 +++++++++++++++++++++++
 2 files changed, 397 insertions(+)
 create mode 100644 tests/cpp/operator/test_gemm_prodgemm.cu
diff --git a/tests/cpp/operator/CMakeLists.txt b/tests/cpp/operator/CMakeLists.txt
index 0ebd7fdfe..0eded7219 100644
--- a/tests/cpp/operator/CMakeLists.txt
+++ b/tests/cpp/operator/CMakeLists.txt
@@ -39,6 +39,7 @@ if(USE_CUDA)
 else()
   list(APPEND test_cuda_sources
               test_cublaslt_gemm.cu
+              test_gemm_prodgemm.cu
               test_cast_mxfp4_transpose.cu)
 endif()
 
diff --git a/tests/cpp/operator/test_gemm_prodgemm.cu b/tests/cpp/operator/test_gemm_prodgemm.cu
new file mode 100644
index 000000000..2a086ddea
--- /dev/null
+++ b/tests/cpp/operator/test_gemm_prodgemm.cu
@@ -0,0 +1,396 @@
+/*************************************************************************
+ * Copyright (c) 2026, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * License for AMD contributions = MIT. See LICENSE for more information
+ ************************************************************************/
+
+/*
+ * MXFP8 GEMM correctness tests for production LLM shapes.
+ *
+ * Tests forward, dgrad, and wgrad passes with appropriate FP8 type combos:
+ *   Forward:  E4M3 x E4M3 -> BF16
+ *   Dgrad:    E5M2 x E4M3 -> BF16
+ *   Wgrad:    E4M3 x E5M2 -> BF16
+ *
+ * Each shape is tested with 3 transpose configs (TN, NN, NT) and
+ * 3 micro-batch sizes (MBS = 1, 2, 4 -> tokens = 4096, 8192, 16384).
+ */
+
+#ifdef __HIP_PLATFORM_AMD__
+
+#include <cmath>
+#include <cstdlib>
+#include <iostream>
+#include <string>
+#include <vector>
+#include <cuda_bf16.h>
+#include <cuda_runtime.h>
+#include <gtest/gtest.h>
+#include <transformer_engine/cast.h>
+#include <transformer_engine/gemm.h>
+#include <transformer_engine/swizzle.h>
+#include <transformer_engine/transformer_engine.h>
+#include "../test_common.h"
+
+using namespace transformer_engine;
+using namespace test;
+
+namespace {
+
+using fp32 = float;
+using fp8  = fp8e4m3;
+using bf8  = fp8e5m2;
+
+using TShape = std::vector<size_t>;
+using Layout = std::pair<bool, bool>;  // {transa, transb}
+
+static const Layout kTN{true, false};
+static const Layout kNN{false, false};
+static const Layout kNT{false, true};
+static const std::vector<Layout> kLayouts = {kTN, kNN, kNT};
+
+// ============================================================================
+// GemmPass: determines A/B FP8 type combination
+//   FWD:   fp8  x fp8  (E4M3 x E4M3)
+//   DGRAD: bf8  x fp8  (E5M2 x E4M3)
+//   WGRAD: fp8  x bf8  (E4M3 x E5M2)
+// ============================================================================
+
+enum class GemmPass { FWD, DGRAD, WGRAD };
+
+// ============================================================================
+// Shape definition: describes a GEMM from the model architecture.
+//
+//   Forward / Dgrad: M = tokens, dim1 = N, dim2 = K
+//   Wgrad:           K = tokens, dim1 = M, dim2 = N
+// ============================================================================
+
+struct ShapeDef {
+    const char* label;
+    size_t dim1;
+    size_t dim2;
+    GemmPass pass;
+};
+
+// LLM1 (hidden=7168, MLA, seq=4096)
+
+static const ShapeDef llm1_shapes[] = {
+    // Forward (M=tokens, N, K)
+    {"LLM1_Linear0_fwd",           1536,  7168, GemmPass::FWD},
+    {"LLM1_Linear1_fwd",            576,  7168, GemmPass::FWD},
+    {"LLM1_LNLinear0_fwd",        24576,  1536, GemmPass::FWD},
+    {"LLM1_LNLinear1_fwd",        32768,   512, GemmPass::FWD},
+    {"LLM1_Linear_attn_fwd",       7168, 16384, GemmPass::FWD},
+    {"LLM1_LNMLP_gateup_fwd",     36864,  7168, GemmPass::FWD},
+    {"LLM1_LNMLP_down_fwd",        7168, 18432, GemmPass::FWD},
+    {"LLM1_SharedExp_gu_fwd",      4096,  7168, GemmPass::FWD},
+    {"LLM1_SharedExp_dn_fwd",      7168,  2048, GemmPass::FWD},
+    {"LLM1_TopKRouter_fwd",         256,  7168, GemmPass::FWD},
+    // Dgrad (M=tokens, N, K)
+    {"LLM1_attn_dgrad",           16384,  7168, GemmPass::DGRAD},
+    {"LLM1_LNLinear1_dgrad",        512, 32768, GemmPass::DGRAD},
+    {"LLM1_LNLinear0_dgrad",       1536, 24576, GemmPass::DGRAD},
+    {"LLM1_SharedExp_dn_dgrad",    2048,  7168, GemmPass::DGRAD},
+    {"LLM1_SharedExp_gu_dgrad",    7168,  4096, GemmPass::DGRAD},
+    {"LLM1_TopKRouter_dgrad",      7168,   256, GemmPass::DGRAD},
+    {"LLM1_MLP_post_dgrad",        7168, 14336, GemmPass::DGRAD},
+    // Wgrad (M, N, K=tokens)
+    {"LLM1_attn_wgrad",           16384,  7168, GemmPass::WGRAD},
+    {"LLM1_LNLinear1_wgrad",        512, 32768, GemmPass::WGRAD},
+    {"LLM1_LNLinear0_wgrad",       1536, 24576, GemmPass::WGRAD},
+    {"LLM1_SharedExp_dn_wgrad",    2048,  7168, GemmPass::WGRAD},
+    {"LLM1_SharedExp_gu_wgrad",    7168,  4096, GemmPass::WGRAD},
+    {"LLM1_TopKRouter_wgrad",      7168,   256, GemmPass::WGRAD},
+};
+
+// LLM1 LM Head (large N, memory-intensive)
+static const ShapeDef llm1_lm_head_shapes[] = {
+    {"LLM1_LMHead_fwd",          129280,  7168, GemmPass::FWD},
+    {"LLM1_LMHead_dgrad",          7168,129280, GemmPass::DGRAD},
+    {"LLM1_LMHead_wgrad",          7168,129280, GemmPass::WGRAD},
+};
+
+// LLM2 (hidden=4096, GQA, seq=4096)
+
+static const ShapeDef llm2_shapes[] = {
+    // Forward (M=tokens, N, K)
+    {"LLM2_LNLinear_QKV_fwd",   9216,  4096, GemmPass::FWD},
+    {"LLM2_Linear_attn_fwd",    4096,  8192, GemmPass::FWD},
+    {"LLM2_Router_fwd",          128,  4096, GemmPass::FWD},
+    // Dgrad (M=tokens, N, K)
+    {"LLM2_Router_dgrad",       4096,   128, GemmPass::DGRAD},
+    {"LLM2_Linear_attn_dgrad",  8192,  4096, GemmPass::DGRAD},
+    {"LLM2_LNLinear_dgrad",     4096,  9216, GemmPass::DGRAD},
+    // Wgrad (M, N, K=tokens)
+    {"LLM2_Router_wgrad",       4096,   128, GemmPass::WGRAD},
+    {"LLM2_Linear_attn_wgrad",  8192,  4096, GemmPass::WGRAD},
+    {"LLM2_LNLinear_wgrad",     4096,  9216, GemmPass::WGRAD},
+};
+
+// LLM2 LM Head (large N, memory-intensive)
+static const ShapeDef llm2_lm_head_shapes[] = {
+    {"LLM2_LMHead_fwd",        151936,  4096, GemmPass::FWD},
+    {"LLM2_LMHead_dgrad",        4096,151936, GemmPass::DGRAD},
+    {"LLM2_LMHead_wgrad",        4096,151936, GemmPass::WGRAD},
+};
+
+// ============================================================================
+// Test case: a concrete (M, K, N) shape with pass info, ready for execution
+// ============================================================================
+
+struct ProdGemmTestCase {
+    std::string label;
+    size_t m, k, n;
+    GemmPass pass;
+};
+
+std::ostream& operator<<(std::ostream& os, const ProdGemmTestCase& tc) {
+    return os << tc.label;
+}
+
+static std::vector<ProdGemmTestCase> expand_shapes(const ShapeDef* defs, size_t count) {
+    std::vector<ProdGemmTestCase> cases;
+    for (size_t i = 0; i < count; ++i) {
+        const auto& s = defs[i];
+        for (size_t mbs : {1, 2, 4}) {
+            size_t tokens = mbs * 4096;
+            ProdGemmTestCase tc;
+            tc.label = std::string(s.label) + "_mbs" + std::to_string(mbs);
+            tc.pass = s.pass;
+            switch (s.pass) {
+                case GemmPass::FWD:
+                case GemmPass::DGRAD:
+                    tc.m = tokens;
+                    tc.n = s.dim1;
+                    tc.k = s.dim2;
+                    break;
+                case GemmPass::WGRAD:
+                    tc.m = s.dim1;
+                    tc.n = s.dim2;
+                    tc.k = tokens;
+                    break;
+            }
+            cases.push_back(std::move(tc));
+        }
+    }
+    return cases;
+}
+
+static std::vector<ProdGemmTestCase> generate_model_test_cases() {
+    auto v1   = expand_shapes(llm1_shapes,   std::size(llm1_shapes));
+    auto v2   = expand_shapes(llm2_shapes,   std::size(llm2_shapes));
+    v1.insert(v1.end(), std::make_move_iterator(v2.begin()),
+                        std::make_move_iterator(v2.end()));
+    return v1;
+}
+
+static std::vector<ProdGemmTestCase> generate_lm_head_test_cases() {
+    auto v1   = expand_shapes(llm1_lm_head_shapes,   std::size(llm1_lm_head_shapes));
+    auto v2   = expand_shapes(llm2_lm_head_shapes,   std::size(llm2_lm_head_shapes));
+    v1.insert(v1.end(), std::make_move_iterator(v2.begin()),
+                        std::make_move_iterator(v2.end()));
+    return v1;
+}
+
+// ============================================================================
+// Swizzle helper for gfx1250 MXFP8 scales (same as test_cublaslt_gemm.cu)
+// ============================================================================
+
+static void swizzle_mxfp8_scales(test::Tensor& t, bool rowwise) {
+    void* scale_ptr = rowwise ? t.rowwise_scale_inv_dptr()
+                              : t.columnwise_scale_inv_dptr();
+    if (!scale_ptr) return;
+
+    const NVTEShape scale_shape = rowwise ? t.rowwise_scale_inv_shape()
+                                          : t.columnwise_scale_inv_shape();
+    const NVTEShape data_shape  = rowwise ? t.rowwise_shape()
+                                          : t.columnwise_shape();
+
+    size_t num_scales = 1;
+    for (size_t d = 0; d < scale_shape.ndim; d++) num_scales *= scale_shape.data[d];
+
+    uint8_t* d_tmp = nullptr;
+    NVTE_CHECK_CUDA(cudaMalloc(&d_tmp, num_scales));
+
+    TensorWrapper input_tw(NVTE_MXFP8_1D_SCALING);
+    TensorWrapper output_tw(NVTE_MXFP8_1D_SCALING);
+    output_tw.set_with_gemm_swizzled_scales(true);
+
+    if (rowwise) {
+        input_tw.set_rowwise_data(nullptr, t.dtype(), data_shape);
+        input_tw.set_rowwise_scale_inv(scale_ptr, DType::kFloat8E8M0, scale_shape);
+        output_tw.set_rowwise_data(nullptr, t.dtype(), data_shape);
+        output_tw.set_rowwise_scale_inv(d_tmp, DType::kFloat8E8M0, scale_shape);
+    } else {
+        input_tw.set_columnwise_data(nullptr, t.dtype(), data_shape);
+        input_tw.set_columnwise_scale_inv(scale_ptr, DType::kFloat8E8M0, scale_shape);
+        output_tw.set_columnwise_data(nullptr, t.dtype(), data_shape);
+        output_tw.set_columnwise_scale_inv(d_tmp, DType::kFloat8E8M0, scale_shape);
+    }
+
+    nvte_swizzle_scaling_factors(input_tw.data(), output_tw.data(), 0);
+    NVTE_CHECK_CUDA(cudaDeviceSynchronize());
+    NVTE_CHECK_CUDA(cudaMemcpy(scale_ptr, d_tmp, num_scales, cudaMemcpyDeviceToDevice));
+    NVTE_CHECK_CUDA(cudaFree(d_tmp));
+}
+
+// ============================================================================
+// MXFP8 dequantize-based GEMM correctness test
+//
+// 1. Create random source matrices A_src, B_src in D_Type (bf16)
+// 2. Quantize:   A_src -> A_fp8, B_src -> B_fp8  (MXFP8 block scaling)
+// 3. Dequantize: A_fp8 -> A_ref, B_fp8 -> B_ref  (back to D_Type)
+// 4. Swizzle scales for gfx1250 (if needed)
+// 5. MXFP8 GEMM:     D     = A_fp8 * B_fp8
+// 6. Non-FP8 GEMM:   D_ref = A_ref * B_ref
+// 7. Compare D vs D_ref
+// ============================================================================
+
+template <typename A_Type, typename B_Type, typename D_Type>
+void performMxfp8DqTest(size_t m, size_t k, size_t n, bool transa, bool transb) {
+    DType atype = TypeInfo<A_Type>::dtype;
+    DType btype = TypeInfo<B_Type>::dtype;
+    DType dtype = TypeInfo<D_Type>::dtype;
+
+    ASSERT_TRUE(isFp8Type(atype) && isFp8Type(btype)) << "FP8/BF8 input types expected";
+    ASSERT_FALSE(isFp8Type(dtype)) << "Non-FP8 output type expected";
+
+    if (m % 16 || n % 16) {
+        GTEST_SKIP() << "MXFP8 requires M & N to be multiples of 16";
+    }
+    if (k % 128) {
+        GTEST_SKIP() << "MXFP8 requires K to be a multiple of 128";
+    }
+
+    cudaDeviceProp prop;
+    (void)cudaGetDeviceProperties(&prop, 0);
+
+    bool mxfp8_supported = (prop.major == 9 && prop.minor >= 5) || prop.major >= 12;
+    if (!mxfp8_supported) {
+        GTEST_SKIP() << "MXFP8 is not supported on this GPU";
+    }
+
+    TShape a_shape = transa ? TShape{m, k} : TShape{k, m};
+    TShape b_shape = transb ? TShape{k, n} : TShape{n, k};
+
+    // 1. Create random source matrices
+    Tensor A_src("A_src", a_shape, dtype);
+    Tensor B_src("B_src", b_shape, dtype);
+    fillUniform(&A_src);
+    fillUniform(&B_src);
+
+    // 2. Quantize to FP8 with MXFP8 scaling
+    Tensor A_fp8("A_fp8", a_shape, atype, transa, !transa,
+                 NVTEScalingMode::NVTE_MXFP8_1D_SCALING);
+    Tensor B_fp8("B_fp8", b_shape, btype, !transb, transb,
+                 NVTEScalingMode::NVTE_MXFP8_1D_SCALING);
+    nvte_quantize(A_src.data(), A_fp8.data(), 0);
+    nvte_quantize(B_src.data(), B_fp8.data(), 0);
+
+    // 3. Dequantize back to reference type
+    Tensor A_ref("A_ref", a_shape, dtype);
+    Tensor B_ref("B_ref", b_shape, dtype);
+    nvte_dequantize(A_fp8.data(), A_ref.data(), 0);
+    nvte_dequantize(B_fp8.data(), B_ref.data(), 0);
+
+    // 4. Swizzle scales for gfx1250
+    if (prop.major == 12) {
+        const bool a_colwise = !transa;
+        const bool b_colwise = transb;
+        if (!a_colwise) swizzle_mxfp8_scales(A_fp8, true);
+        if (a_colwise)  swizzle_mxfp8_scales(A_fp8, false);
+        if (!b_colwise) swizzle_mxfp8_scales(B_fp8, true);
+        if (b_colwise)  swizzle_mxfp8_scales(B_fp8, false);
+    }
+
+    Tensor bias;
+    Tensor pre_gelu_out;
+
+    size_t workspace_size = 67108864;  // 64 MB
+    Tensor Workspace("Workspace", TShape{workspace_size}, DType::kByte);
+
+    // 5. MXFP8 GEMM
+    Tensor D("D", TShape{n, m}, dtype);
+    nvte_cublas_gemm(A_fp8.data(), B_fp8.data(), D.data(),
+                     bias.data(), pre_gelu_out.data(),
+                     transa, transb, false,
+                     Workspace.data(), false, false,
+                     prop.multiProcessorCount, 0);
+    D.to_cpu();
+
+    // 6. Non-FP8 reference GEMM
+    Tensor D_ref("D_ref", TShape{n, m}, dtype);
+    nvte_cublas_gemm(A_ref.data(), B_ref.data(), D_ref.data(),
+                     bias.data(), pre_gelu_out.data(),
+                     transa, transb, false,
+                     Workspace.data(), false, false,
+                     prop.multiProcessorCount, 0);
+    D_ref.to_cpu();
+
+    // Check for CUDA errors
+    (void)cudaDeviceSynchronize();
+    auto err = cudaGetLastError();
+    ASSERT_EQ(err, cudaSuccess) << cudaGetErrorString(err);
+
+    // 7. Compare results
+    auto [atol, rtol] = getTolerances(dtype);
+    atol = std::max(atol, 5e-4);
+    rtol = std::max(rtol, 1e-3);
+    compareResults("D", D, D_ref.rowwise_cpu_dptr<D_Type>(), true, atol, rtol);
+}
+
+// ============================================================================
+// Test suite
+// ============================================================================
+
+using ProdGemmParam = std::tuple<ProdGemmTestCase, Layout>;
+
+class ProdGemmTestSuite : public ::testing::TestWithParam<ProdGemmParam> {};
+
+TEST_P(ProdGemmTestSuite, TestMxfp8Dq) {
+    const auto& tc = std::get<0>(GetParam());
+    const auto& layout = std::get<1>(GetParam());
+    bool transa = layout.first;
+    bool transb = layout.second;
+
+    switch (tc.pass) {
+        case GemmPass::FWD:
+            performMxfp8DqTest<fp8, fp8, bf16>(tc.m, tc.k, tc.n, transa, transb);
+            break;
+        case GemmPass::DGRAD:
+            performMxfp8DqTest<bf8, fp8, bf16>(tc.m, tc.k, tc.n, transa, transb);
+            break;
+        case GemmPass::WGRAD:
+            performMxfp8DqTest<fp8, bf8, bf16>(tc.m, tc.k, tc.n, transa, transb);
+            break;
+    }
+}
+
+static inline std::string TN(const Layout& layout) {
+    static const char* map[2][2] = {{"NN", "NT"}, {"TN", "TT"}};
+    return map[layout.first][layout.second];
+}
+
+// Regular model shapes (excluding LM Head)
+INSTANTIATE_TEST_SUITE_P(
+    ProdGemmModel, ProdGemmTestSuite,
+    ::testing::Combine(
+        ::testing::ValuesIn(generate_model_test_cases()),
+        ::testing::ValuesIn(kLayouts)),
+    [](const testing::TestParamInfo<ProdGemmParam>& info) {
+        return std::get<0>(info.param).label + "_" + TN(std::get<1>(info.param));
+    });
+
+// LM Head shapes (very large N, memory-intensive)
+INSTANTIATE_TEST_SUITE_P(
+    ProdGemmLMHead, ProdGemmTestSuite,
+    ::testing::Combine(
+        ::testing::ValuesIn(generate_lm_head_test_cases()),
+        ::testing::ValuesIn(kLayouts)),
+    [](const testing::TestParamInfo<ProdGemmParam>& info) {
+        return std::get<0>(info.param).label + "_" + TN(std::get<1>(info.param));
+    });
+
+}  // namespace
+
+#endif  // __HIP_PLATFORM_AMD__

From c4c2ea53fb19e030111e165ac60ca44fc0394f15 Mon Sep 17 00:00:00 2001
From: Matthias Diener <matthias.diener@gmail.com>
Date: Tue, 26 May 2026 10:55:28 -0500
Subject: [PATCH 02/13] rename

---
 tests/cpp/operator/test_gemm_prodgemm.cu | 101 +++++++++++------------
 1 file changed, 50 insertions(+), 51 deletions(-)

diff --git a/tests/cpp/operator/test_gemm_prodgemm.cu b/tests/cpp/operator/test_gemm_prodgemm.cu
index 2a086ddea..863ec8bb1 100644
--- a/tests/cpp/operator/test_gemm_prodgemm.cu
+++ b/tests/cpp/operator/test_gemm_prodgemm.cu
@@ -37,7 +37,6 @@ using namespace test;
 
 namespace {
 
-using fp32 = float;
 using fp8  = fp8e4m3;
 using bf8  = fp8e5m2;
 
@@ -72,66 +71,66 @@ struct ShapeDef {
     GemmPass pass;
 };
 
-// LLM1 (hidden=7168, MLA, seq=4096)
+// DeepSeek3 (hidden=7168, MLA, seq=4096)
 
-static const ShapeDef llm1_shapes[] = {
+static const ShapeDef deepseek3_shapes[] = {
     // Forward (M=tokens, N, K)
-    {"LLM1_Linear0_fwd",           1536,  7168, GemmPass::FWD},
-    {"LLM1_Linear1_fwd",            576,  7168, GemmPass::FWD},
-    {"LLM1_LNLinear0_fwd",        24576,  1536, GemmPass::FWD},
-    {"LLM1_LNLinear1_fwd",        32768,   512, GemmPass::FWD},
-    {"LLM1_Linear_attn_fwd",       7168, 16384, GemmPass::FWD},
-    {"LLM1_LNMLP_gateup_fwd",     36864,  7168, GemmPass::FWD},
-    {"LLM1_LNMLP_down_fwd",        7168, 18432, GemmPass::FWD},
-    {"LLM1_SharedExp_gu_fwd",      4096,  7168, GemmPass::FWD},
-    {"LLM1_SharedExp_dn_fwd",      7168,  2048, GemmPass::FWD},
-    {"LLM1_TopKRouter_fwd",         256,  7168, GemmPass::FWD},
+    {"DeepSeek3_Linear0_fwd",        1536,  7168, GemmPass::FWD},
+    {"DeepSeek3_Linear1_fwd",         576,  7168, GemmPass::FWD},
+    {"DeepSeek3_LNLinear0_fwd",     24576,  1536, GemmPass::FWD},
+    {"DeepSeek3_LNLinear1_fwd",     32768,   512, GemmPass::FWD},
+    {"DeepSeek3_Linear_attn_fwd",    7168, 16384, GemmPass::FWD},
+    {"DeepSeek3_LNMLP_gateup_fwd",  36864,  7168, GemmPass::FWD},
+    {"DeepSeek3_LNMLP_down_fwd",     7168, 18432, GemmPass::FWD},
+    {"DeepSeek3_SharedExp_gu_fwd",   4096,  7168, GemmPass::FWD},
+    {"DeepSeek3_SharedExp_dn_fwd",   7168,  2048, GemmPass::FWD},
+    {"DeepSeek3_TopKRouter_fwd",      256,  7168, GemmPass::FWD},
     // Dgrad (M=tokens, N, K)
-    {"LLM1_attn_dgrad",           16384,  7168, GemmPass::DGRAD},
-    {"LLM1_LNLinear1_dgrad",        512, 32768, GemmPass::DGRAD},
-    {"LLM1_LNLinear0_dgrad",       1536, 24576, GemmPass::DGRAD},
-    {"LLM1_SharedExp_dn_dgrad",    2048,  7168, GemmPass::DGRAD},
-    {"LLM1_SharedExp_gu_dgrad",    7168,  4096, GemmPass::DGRAD},
-    {"LLM1_TopKRouter_dgrad",      7168,   256, GemmPass::DGRAD},
-    {"LLM1_MLP_post_dgrad",        7168, 14336, GemmPass::DGRAD},
+    {"DeepSeek3_attn_dgrad",        16384,  7168, GemmPass::DGRAD},
+    {"DeepSeek3_LNLinear1_dgrad",     512, 32768, GemmPass::DGRAD},
+    {"DeepSeek3_LNLinear0_dgrad",    1536, 24576, GemmPass::DGRAD},
+    {"DeepSeek3_SharedExp_dn_dgrad", 2048,  7168, GemmPass::DGRAD},
+    {"DeepSeek3_SharedExp_gu_dgrad", 7168,  4096, GemmPass::DGRAD},
+    {"DeepSeek3_TopKRouter_dgrad",   7168,   256, GemmPass::DGRAD},
+    {"DeepSeek3_MLP_post_dgrad",     7168, 14336, GemmPass::DGRAD},
     // Wgrad (M, N, K=tokens)
-    {"LLM1_attn_wgrad",           16384,  7168, GemmPass::WGRAD},
-    {"LLM1_LNLinear1_wgrad",        512, 32768, GemmPass::WGRAD},
-    {"LLM1_LNLinear0_wgrad",       1536, 24576, GemmPass::WGRAD},
-    {"LLM1_SharedExp_dn_wgrad",    2048,  7168, GemmPass::WGRAD},
-    {"LLM1_SharedExp_gu_wgrad",    7168,  4096, GemmPass::WGRAD},
-    {"LLM1_TopKRouter_wgrad",      7168,   256, GemmPass::WGRAD},
+    {"DeepSeek3_attn_wgrad",        16384,  7168, GemmPass::WGRAD},
+    {"DeepSeek3_LNLinear1_wgrad",     512, 32768, GemmPass::WGRAD},
+    {"DeepSeek3_LNLinear0_wgrad",    1536, 24576, GemmPass::WGRAD},
+    {"DeepSeek3_SharedExp_dn_wgrad", 2048,  7168, GemmPass::WGRAD},
+    {"DeepSeek3_SharedExp_gu_wgrad", 7168,  4096, GemmPass::WGRAD},
+    {"DeepSeek3_TopKRouter_wgrad",   7168,   256, GemmPass::WGRAD},
 };
 
-// LLM1 LM Head (large N, memory-intensive)
-static const ShapeDef llm1_lm_head_shapes[] = {
-    {"LLM1_LMHead_fwd",          129280,  7168, GemmPass::FWD},
-    {"LLM1_LMHead_dgrad",          7168,129280, GemmPass::DGRAD},
-    {"LLM1_LMHead_wgrad",          7168,129280, GemmPass::WGRAD},
+// DeepSeek3 LM Head (large N, memory-intensive)
+static const ShapeDef deepseek3_lm_head_shapes[] = {
+    {"DeepSeek3_LMHead_fwd",     129280,   7168, GemmPass::FWD},
+    {"DeepSeek3_LMHead_dgrad",     7168, 129280, GemmPass::DGRAD},
+    {"DeepSeek3_LMHead_wgrad",     7168, 129280, GemmPass::WGRAD},
 };
 
-// LLM2 (hidden=4096, GQA, seq=4096)
+// Qwen3 (hidden=4096, GQA, seq=4096)
 
-static const ShapeDef llm2_shapes[] = {
+static const ShapeDef qwen3_shapes[] = {
     // Forward (M=tokens, N, K)
-    {"LLM2_LNLinear_QKV_fwd",   9216,  4096, GemmPass::FWD},
-    {"LLM2_Linear_attn_fwd",    4096,  8192, GemmPass::FWD},
-    {"LLM2_Router_fwd",          128,  4096, GemmPass::FWD},
+    {"Qwen3_LNLinear_QKV_fwd",  9216,  4096, GemmPass::FWD},
+    {"Qwen3_Linear_attn_fwd",   4096,  8192, GemmPass::FWD},
+    {"Qwen3_Router_fwd",         128,  4096, GemmPass::FWD},
     // Dgrad (M=tokens, N, K)
-    {"LLM2_Router_dgrad",       4096,   128, GemmPass::DGRAD},
-    {"LLM2_Linear_attn_dgrad",  8192,  4096, GemmPass::DGRAD},
-    {"LLM2_LNLinear_dgrad",     4096,  9216, GemmPass::DGRAD},
+    {"Qwen3_Router_dgrad",      4096,   128, GemmPass::DGRAD},
+    {"Qwen3_Linear_attn_dgrad", 8192,  4096, GemmPass::DGRAD},
+    {"Qwen3_LNLinear_dgrad",    4096,  9216, GemmPass::DGRAD},
     // Wgrad (M, N, K=tokens)
-    {"LLM2_Router_wgrad",       4096,   128, GemmPass::WGRAD},
-    {"LLM2_Linear_attn_wgrad",  8192,  4096, GemmPass::WGRAD},
-    {"LLM2_LNLinear_wgrad",     4096,  9216, GemmPass::WGRAD},
+    {"Qwen3_Router_wgrad",      4096,   128, GemmPass::WGRAD},
+    {"Qwen3_Linear_attn_wgrad", 8192,  4096, GemmPass::WGRAD},
+    {"Qwen3_LNLinear_wgrad",    4096,  9216, GemmPass::WGRAD},
 };
 
-// LLM2 LM Head (large N, memory-intensive)
-static const ShapeDef llm2_lm_head_shapes[] = {
-    {"LLM2_LMHead_fwd",        151936,  4096, GemmPass::FWD},
-    {"LLM2_LMHead_dgrad",        4096,151936, GemmPass::DGRAD},
-    {"LLM2_LMHead_wgrad",        4096,151936, GemmPass::WGRAD},
+// Qwen3 LM Head (large N, memory-intensive)
+static const ShapeDef qwen3_lm_head_shapes[] = {
+    {"Qwen3_LMHead_fwd",       151936,   4096, GemmPass::FWD},
+    {"Qwen3_LMHead_dgrad",       4096, 151936, GemmPass::DGRAD},
+    {"Qwen3_LMHead_wgrad",       4096, 151936, GemmPass::WGRAD},
 };
 
 // ============================================================================
@@ -177,16 +176,16 @@ static std::vector<ProdGemmTestCase> expand_shapes(const ShapeDef* defs, size_t
 }
 
 static std::vector<ProdGemmTestCase> generate_model_test_cases() {
-    auto v1   = expand_shapes(llm1_shapes,   std::size(llm1_shapes));
-    auto v2   = expand_shapes(llm2_shapes,   std::size(llm2_shapes));
+    auto v1   = expand_shapes(deepseek3_shapes,   std::size(deepseek3_shapes));
+    auto v2   = expand_shapes(qwen3_shapes,       std::size(qwen3_shapes));
     v1.insert(v1.end(), std::make_move_iterator(v2.begin()),
                         std::make_move_iterator(v2.end()));
     return v1;
 }
 
 static std::vector<ProdGemmTestCase> generate_lm_head_test_cases() {
-    auto v1   = expand_shapes(llm1_lm_head_shapes,   std::size(llm1_lm_head_shapes));
-    auto v2   = expand_shapes(llm2_lm_head_shapes,   std::size(llm2_lm_head_shapes));
+    auto v1   = expand_shapes(deepseek3_lm_head_shapes, std::size(deepseek3_lm_head_shapes));
+    auto v2   = expand_shapes(qwen3_lm_head_shapes,     std::size(qwen3_lm_head_shapes));
     v1.insert(v1.end(), std::make_move_iterator(v2.begin()),
                         std::make_move_iterator(v2.end()));
     return v1;

From 8eaf06d4516a553b2c1006758b956dc7ab0b88f7 Mon Sep 17 00:00:00 2001
From: Matthias Diener <matthias.diener@gmail.com>
Date: Thu, 28 May 2026 14:03:52 -0500
Subject: [PATCH 03/13] restructure based on review comments

---
 tests/cpp/operator/test_gemm_prodgemm.cu | 174 ++++++++---------------
 tests/cpp/test_common.cu                 |  43 ++++++
 tests/cpp/test_common.h                  |   4 +
 3 files changed, 107 insertions(+), 114 deletions(-)

diff --git a/tests/cpp/operator/test_gemm_prodgemm.cu b/tests/cpp/operator/test_gemm_prodgemm.cu
index 863ec8bb1..db29b4030 100644
--- a/tests/cpp/operator/test_gemm_prodgemm.cu
+++ b/tests/cpp/operator/test_gemm_prodgemm.cu
@@ -28,7 +28,6 @@
 #include <gtest/gtest.h>
 #include <transformer_engine/cast.h>
 #include <transformer_engine/gemm.h>
-#include <transformer_engine/swizzle.h>
 #include <transformer_engine/transformer_engine.h>
 #include "../test_common.h"
 
@@ -133,104 +132,24 @@ static const ShapeDef qwen3_lm_head_shapes[] = {
     {"Qwen3_LMHead_wgrad",       4096, 151936, GemmPass::WGRAD},
 };
 
-// ============================================================================
-// Test case: a concrete (M, K, N) shape with pass info, ready for execution
-// ============================================================================
-
-struct ProdGemmTestCase {
-    std::string label;
-    size_t m, k, n;
-    GemmPass pass;
-};
-
-std::ostream& operator<<(std::ostream& os, const ProdGemmTestCase& tc) {
-    return os << tc.label;
-}
+// ====================================================
+// Test case: a concrete (M, K, N) shape with pass info
+// ====================================================
 
-static std::vector<ProdGemmTestCase> expand_shapes(const ShapeDef* defs, size_t count) {
-    std::vector<ProdGemmTestCase> cases;
-    for (size_t i = 0; i < count; ++i) {
-        const auto& s = defs[i];
-        for (size_t mbs : {1, 2, 4}) {
-            size_t tokens = mbs * 4096;
-            ProdGemmTestCase tc;
-            tc.label = std::string(s.label) + "_mbs" + std::to_string(mbs);
-            tc.pass = s.pass;
-            switch (s.pass) {
-                case GemmPass::FWD:
-                case GemmPass::DGRAD:
-                    tc.m = tokens;
-                    tc.n = s.dim1;
-                    tc.k = s.dim2;
-                    break;
-                case GemmPass::WGRAD:
-                    tc.m = s.dim1;
-                    tc.n = s.dim2;
-                    tc.k = tokens;
-                    break;
-            }
-            cases.push_back(std::move(tc));
-        }
-    }
-    return cases;
-}
-
-static std::vector<ProdGemmTestCase> generate_model_test_cases() {
-    auto v1   = expand_shapes(deepseek3_shapes,   std::size(deepseek3_shapes));
-    auto v2   = expand_shapes(qwen3_shapes,       std::size(qwen3_shapes));
-    v1.insert(v1.end(), std::make_move_iterator(v2.begin()),
-                        std::make_move_iterator(v2.end()));
-    return v1;
-}
-
-static std::vector<ProdGemmTestCase> generate_lm_head_test_cases() {
-    auto v1   = expand_shapes(deepseek3_lm_head_shapes, std::size(deepseek3_lm_head_shapes));
-    auto v2   = expand_shapes(qwen3_lm_head_shapes,     std::size(qwen3_lm_head_shapes));
-    v1.insert(v1.end(), std::make_move_iterator(v2.begin()),
-                        std::make_move_iterator(v2.end()));
-    return v1;
+std::ostream& operator<<(std::ostream& os, const ShapeDef& s) {
+    return os << s.label;
 }
 
-// ============================================================================
-// Swizzle helper for gfx1250 MXFP8 scales (same as test_cublaslt_gemm.cu)
-// ============================================================================
-
-static void swizzle_mxfp8_scales(test::Tensor& t, bool rowwise) {
-    void* scale_ptr = rowwise ? t.rowwise_scale_inv_dptr()
-                              : t.columnwise_scale_inv_dptr();
-    if (!scale_ptr) return;
-
-    const NVTEShape scale_shape = rowwise ? t.rowwise_scale_inv_shape()
-                                          : t.columnwise_scale_inv_shape();
-    const NVTEShape data_shape  = rowwise ? t.rowwise_shape()
-                                          : t.columnwise_shape();
-
-    size_t num_scales = 1;
-    for (size_t d = 0; d < scale_shape.ndim; d++) num_scales *= scale_shape.data[d];
-
-    uint8_t* d_tmp = nullptr;
-    NVTE_CHECK_CUDA(cudaMalloc(&d_tmp, num_scales));
-
-    TensorWrapper input_tw(NVTE_MXFP8_1D_SCALING);
-    TensorWrapper output_tw(NVTE_MXFP8_1D_SCALING);
-    output_tw.set_with_gemm_swizzled_scales(true);
-
-    if (rowwise) {
-        input_tw.set_rowwise_data(nullptr, t.dtype(), data_shape);
-        input_tw.set_rowwise_scale_inv(scale_ptr, DType::kFloat8E8M0, scale_shape);
-        output_tw.set_rowwise_data(nullptr, t.dtype(), data_shape);
-        output_tw.set_rowwise_scale_inv(d_tmp, DType::kFloat8E8M0, scale_shape);
-    } else {
-        input_tw.set_columnwise_data(nullptr, t.dtype(), data_shape);
-        input_tw.set_columnwise_scale_inv(scale_ptr, DType::kFloat8E8M0, scale_shape);
-        output_tw.set_columnwise_data(nullptr, t.dtype(), data_shape);
-        output_tw.set_columnwise_scale_inv(d_tmp, DType::kFloat8E8M0, scale_shape);
+static void resolve_mkn(const ShapeDef& s, size_t mbs,
+                         size_t& m, size_t& k, size_t& n) {
+    size_t tokens = mbs * 4096;
+    switch (s.pass) {
+        case GemmPass::FWD:
+        case GemmPass::DGRAD:
+            m = tokens; n = s.dim1; k = s.dim2; break;
+        case GemmPass::WGRAD:
+            m = s.dim1; n = s.dim2; k = tokens; break;
     }
-
-    nvte_swizzle_scaling_factors(input_tw.data(), output_tw.data(), 0);
-    NVTE_CHECK_CUDA(cudaDeviceSynchronize());
-    NVTE_CHECK_CUDA(cudaMemcpy(scale_ptr, d_tmp, num_scales, cudaMemcpyDeviceToDevice));
-    NVTE_CHECK_CUDA(cudaFree(d_tmp));
 }
 
 // ============================================================================
@@ -342,25 +261,29 @@ void performMxfp8DqTest(size_t m, size_t k, size_t n, bool transa, bool transb)
 // Test suite
 // ============================================================================
 
-using ProdGemmParam = std::tuple<ProdGemmTestCase, Layout>;
+using ProdGemmParam = std::tuple<ShapeDef, size_t, Layout>;
 
 class ProdGemmTestSuite : public ::testing::TestWithParam<ProdGemmParam> {};
 
 TEST_P(ProdGemmTestSuite, TestMxfp8Dq) {
-    const auto& tc = std::get<0>(GetParam());
-    const auto& layout = std::get<1>(GetParam());
+    const auto& shape = std::get<0>(GetParam());
+    size_t mbs = std::get<1>(GetParam());
+    const auto& layout = std::get<2>(GetParam());
     bool transa = layout.first;
     bool transb = layout.second;
 
-    switch (tc.pass) {
+    size_t m, k, n;
+    resolve_mkn(shape, mbs, m, k, n);
+
+    switch (shape.pass) {
         case GemmPass::FWD:
-            performMxfp8DqTest<fp8, fp8, bf16>(tc.m, tc.k, tc.n, transa, transb);
+            performMxfp8DqTest<fp8, fp8, bf16>(m, k, n, transa, transb);
             break;
         case GemmPass::DGRAD:
-            performMxfp8DqTest<bf8, fp8, bf16>(tc.m, tc.k, tc.n, transa, transb);
+            performMxfp8DqTest<bf8, fp8, bf16>(m, k, n, transa, transb);
             break;
         case GemmPass::WGRAD:
-            performMxfp8DqTest<fp8, bf8, bf16>(tc.m, tc.k, tc.n, transa, transb);
+            performMxfp8DqTest<fp8, bf8, bf16>(m, k, n, transa, transb);
             break;
     }
 }
@@ -370,25 +293,48 @@ static inline std::string TN(const Layout& layout) {
     return map[layout.first][layout.second];
 }
 
-// Regular model shapes (excluding LM Head)
+static inline auto testName(const testing::TestParamInfo<ProdGemmParam>& info) {
+    const auto& shape = std::get<0>(info.param);
+    size_t mbs = std::get<1>(info.param);
+    const auto& layout = std::get<2>(info.param);
+    return std::string(shape.label) + "_mbs" + std::to_string(mbs) + "_" + TN(layout);
+}
+
+// DeepSeek3 model shapes
+INSTANTIATE_TEST_SUITE_P(
+    ProdGemmDeepSeek3, ProdGemmTestSuite,
+    ::testing::Combine(
+        ::testing::ValuesIn(deepseek3_shapes),
+        ::testing::Values(size_t{1}, size_t{2}, size_t{4}),
+        ::testing::ValuesIn(kLayouts)),
+    testName);
+
+// Qwen3 model shapes
+INSTANTIATE_TEST_SUITE_P(
+    ProdGemmQwen3, ProdGemmTestSuite,
+    ::testing::Combine(
+        ::testing::ValuesIn(qwen3_shapes),
+        ::testing::Values(size_t{1}, size_t{2}, size_t{4}),
+        ::testing::ValuesIn(kLayouts)),
+    testName);
+
+// DeepSeek3 LM Head shapes (very large N, memory-intensive)
 INSTANTIATE_TEST_SUITE_P(
-    ProdGemmModel, ProdGemmTestSuite,
+    ProdGemmDeepSeek3LMHead, ProdGemmTestSuite,
     ::testing::Combine(
-        ::testing::ValuesIn(generate_model_test_cases()),
+        ::testing::ValuesIn(deepseek3_lm_head_shapes),
+        ::testing::Values(size_t{1}, size_t{2}, size_t{4}),
         ::testing::ValuesIn(kLayouts)),
-    [](const testing::TestParamInfo<ProdGemmParam>& info) {
-        return std::get<0>(info.param).label + "_" + TN(std::get<1>(info.param));
-    });
+    testName);
 
-// LM Head shapes (very large N, memory-intensive)
+// Qwen3 LM Head shapes (very large N, memory-intensive)
 INSTANTIATE_TEST_SUITE_P(
-    ProdGemmLMHead, ProdGemmTestSuite,
+    ProdGemmQwen3LMHead, ProdGemmTestSuite,
     ::testing::Combine(
-        ::testing::ValuesIn(generate_lm_head_test_cases()),
+        ::testing::ValuesIn(qwen3_lm_head_shapes),
+        ::testing::Values(size_t{1}, size_t{2}, size_t{4}),
         ::testing::ValuesIn(kLayouts)),
-    [](const testing::TestParamInfo<ProdGemmParam>& info) {
-        return std::get<0>(info.param).label + "_" + TN(std::get<1>(info.param));
-    });
+    testName);
 
 }  // namespace
 
diff --git a/tests/cpp/test_common.cu b/tests/cpp/test_common.cu
index fbcfdf89d..392e641d5 100644
--- a/tests/cpp/test_common.cu
+++ b/tests/cpp/test_common.cu
@@ -22,6 +22,9 @@
 #endif
 #include <omp.h>
 
+#ifdef __HIP_PLATFORM_AMD__
+#include <transformer_engine/swizzle.h>
+#endif
 #include <transformer_engine/transformer_engine.h>
 #include "util/logging.h"
 
@@ -1314,4 +1317,44 @@ std::array<size_t, 4> get_scale_tensor_dims(const size_t rows,
     return {unpadded_blocks_Y, unpadded_blocks_X, blocks_Y, blocks_X};
 }
 
+#ifdef __HIP_PLATFORM_AMD__
+void swizzle_mxfp8_scales(Tensor& t, bool rowwise) {
+    void* scale_ptr = rowwise ? t.rowwise_scale_inv_dptr()
+                              : t.columnwise_scale_inv_dptr();
+    if (!scale_ptr) return;
+
+    const NVTEShape scale_shape = rowwise ? t.rowwise_scale_inv_shape()
+                                          : t.columnwise_scale_inv_shape();
+    const NVTEShape data_shape  = rowwise ? t.rowwise_shape()
+                                          : t.columnwise_shape();
+
+    size_t num_scales = 1;
+    for (size_t d = 0; d < scale_shape.ndim; d++) num_scales *= scale_shape.data[d];
+
+    uint8_t* d_tmp = nullptr;
+    NVTE_CHECK_CUDA(cudaMalloc(&d_tmp, num_scales));
+
+    TensorWrapper input_tw(NVTE_MXFP8_1D_SCALING);
+    TensorWrapper output_tw(NVTE_MXFP8_1D_SCALING);
+    output_tw.set_with_gemm_swizzled_scales(true);
+
+    if (rowwise) {
+        input_tw.set_rowwise_data(nullptr, t.dtype(), data_shape);
+        input_tw.set_rowwise_scale_inv(scale_ptr, DType::kFloat8E8M0, scale_shape);
+        output_tw.set_rowwise_data(nullptr, t.dtype(), data_shape);
+        output_tw.set_rowwise_scale_inv(d_tmp, DType::kFloat8E8M0, scale_shape);
+    } else {
+        input_tw.set_columnwise_data(nullptr, t.dtype(), data_shape);
+        input_tw.set_columnwise_scale_inv(scale_ptr, DType::kFloat8E8M0, scale_shape);
+        output_tw.set_columnwise_data(nullptr, t.dtype(), data_shape);
+        output_tw.set_columnwise_scale_inv(d_tmp, DType::kFloat8E8M0, scale_shape);
+    }
+
+    nvte_swizzle_scaling_factors(input_tw.data(), output_tw.data(), 0);
+    NVTE_CHECK_CUDA(cudaDeviceSynchronize());
+    NVTE_CHECK_CUDA(cudaMemcpy(scale_ptr, d_tmp, num_scales, cudaMemcpyDeviceToDevice));
+    NVTE_CHECK_CUDA(cudaFree(d_tmp));
+}
+#endif // #ifdef __HIP_PLATFORM_AMD__
+
 }  // namespace test
diff --git a/tests/cpp/test_common.h b/tests/cpp/test_common.h
index a25b7b61e..6c37ccc57 100644
--- a/tests/cpp/test_common.h
+++ b/tests/cpp/test_common.h
@@ -581,6 +581,10 @@ int32_t getDeviceComputeCapability();
 constexpr int32_t hopperComputeCapability = 90;
 constexpr int32_t blackwellComputeCapability = 100;
 
+#ifdef USE_ROCM
+void swizzle_mxfp8_scales(Tensor& t, bool rowwise);
+#endif
+
 }  // namespace test
 
 #if FP4_TYPE_SUPPORTED

From 77f1c4535f341a9410e313a1829b242457e71409 Mon Sep 17 00:00:00 2001
From: Matthias Diener <matthias.diener@gmail.com>
Date: Thu, 28 May 2026 14:15:43 -0500
Subject: [PATCH 04/13] clarify switch

---
 tests/cpp/operator/test_gemm_prodgemm.cu | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/tests/cpp/operator/test_gemm_prodgemm.cu b/tests/cpp/operator/test_gemm_prodgemm.cu
index db29b4030..97410d182 100644
--- a/tests/cpp/operator/test_gemm_prodgemm.cu
+++ b/tests/cpp/operator/test_gemm_prodgemm.cu
@@ -144,11 +144,17 @@ static void resolve_mkn(const ShapeDef& s, size_t mbs,
                          size_t& m, size_t& k, size_t& n) {
     size_t tokens = mbs * 4096;
     switch (s.pass) {
-        case GemmPass::FWD:
+        case GemmPass::FWD: // Fallthrough, same as DGRAD
         case GemmPass::DGRAD:
-            m = tokens; n = s.dim1; k = s.dim2; break;
+            m = tokens;
+            n = s.dim1;
+            k = s.dim2;
+            break;
         case GemmPass::WGRAD:
-            m = s.dim1; n = s.dim2; k = tokens; break;
+            m = s.dim1;
+            n = s.dim2;
+            k = tokens;
+            break;
     }
 }
 

From 76c8d9894d1039428e2dfa95a96ccfbcc124beb5 Mon Sep 17 00:00:00 2001
From: Matthias Diener <matthias.diener@gmail.com>
Date: Thu, 28 May 2026 17:14:36 -0500
Subject: [PATCH 05/13] skip known-bad tests

---
 tests/cpp/operator/test_gemm_prodgemm.cu | 49 ++++++++++++++++++++++++
 1 file changed, 49 insertions(+)

diff --git a/tests/cpp/operator/test_gemm_prodgemm.cu b/tests/cpp/operator/test_gemm_prodgemm.cu
index 97410d182..02695c387 100644
--- a/tests/cpp/operator/test_gemm_prodgemm.cu
+++ b/tests/cpp/operator/test_gemm_prodgemm.cu
@@ -21,6 +21,7 @@
 #include <cmath>
 #include <cstdlib>
 #include <iostream>
+#include <set>
 #include <string>
 #include <vector>
 #include <cuda_bf16.h>
@@ -271,6 +272,47 @@ using ProdGemmParam = std::tuple<ShapeDef, size_t, Layout>;
 
 class ProdGemmTestSuite : public ::testing::TestWithParam<ProdGemmParam> {};
 
+// Known-failing GEMM shapes on gfx950
+static const std::set<std::string> kMI355XSkips = {
+    // N=576 + NT: rocroller LDS stride mismatch
+    "DeepSeek3_Linear1_fwd_mbs1_NT",
+    "DeepSeek3_Linear1_fwd_mbs2_NT",
+    "DeepSeek3_Linear1_fwd_mbs4_NT",
+    // Sporadic kernel failures
+    "DeepSeek3_LNLinear0_fwd_mbs2_NT",
+    "DeepSeek3_Linear_attn_fwd_mbs4_NT",
+    "DeepSeek3_LNMLP_gateup_fwd_mbs4_NN",
+    "DeepSeek3_LNMLP_down_fwd_mbs2_NN",
+    "DeepSeek3_attn_wgrad_mbs2_NN",
+    "DeepSeek3_LNLinear0_dgrad_mbs2_NN",
+    "DeepSeek3_LNLinear0_wgrad_mbs4_NN",
+    "DeepSeek3_SharedExp_dn_wgrad_mbs4_NT",
+    // K=128 (minimum for MXFP8): unreliable across layouts
+    "Qwen3_Router_fwd_mbs1_NN",
+    "Qwen3_Router_dgrad_mbs1_NN",
+    "Qwen3_Router_dgrad_mbs1_NT",
+    "Qwen3_Router_dgrad_mbs2_NT",
+    "Qwen3_Router_dgrad_mbs4_TN",
+    "Qwen3_Router_dgrad_mbs4_NT",
+    // Other failures
+    "Qwen3_Linear_attn_wgrad_mbs2_NT",
+    "DeepSeek3_LMHead_fwd_mbs1_NT",
+    "DeepSeek3_LMHead_fwd_mbs4_NN",
+    // Qwen3 LM Head dgrad (N=151936): nearly all combos fail
+    "Qwen3_LMHead_dgrad_mbs1_NN",
+    "Qwen3_LMHead_dgrad_mbs1_NT",
+    "Qwen3_LMHead_dgrad_mbs2_TN",
+    "Qwen3_LMHead_dgrad_mbs2_NN",
+    "Qwen3_LMHead_dgrad_mbs2_NT",
+    "Qwen3_LMHead_dgrad_mbs4_TN",
+    "Qwen3_LMHead_dgrad_mbs4_NN",
+    "Qwen3_LMHead_dgrad_mbs4_NT",
+    // Crash (likely OOM / kernel fault)
+    "Qwen3_LMHead_fwd_mbs4_TN",
+    "Qwen3_LMHead_fwd_mbs4_NN",
+    "Qwen3_LMHead_fwd_mbs4_NT",
+};
+
 TEST_P(ProdGemmTestSuite, TestMxfp8Dq) {
     const auto& shape = std::get<0>(GetParam());
     size_t mbs = std::get<1>(GetParam());
@@ -278,6 +320,13 @@ TEST_P(ProdGemmTestSuite, TestMxfp8Dq) {
     bool transa = layout.first;
     bool transb = layout.second;
 
+    static const char* tn_map[2][2] = {{"NN", "NT"}, {"TN", "TT"}};
+    std::string name = std::string(shape.label) + "_mbs" + std::to_string(mbs)
+                       + "_" + tn_map[transa][transb];
+    if (kMI355XSkips.count(name)) {
+        GTEST_SKIP() << "Known MI355X hipBLASLt failure: " << name;
+    }
+
     size_t m, k, n;
     resolve_mkn(shape, mbs, m, k, n);
 

From db3123fb2a111fea7f09a7eba8e299d403ad4a9a Mon Sep 17 00:00:00 2001
From: Matthias Diener <matthias.diener@gmail.com>
Date: Thu, 28 May 2026 17:47:35 -0500
Subject: [PATCH 06/13] loosen tolerances a bit

---
 tests/cpp/operator/test_gemm_prodgemm.cu | 37 ++++--------------------
 1 file changed, 5 insertions(+), 32 deletions(-)

diff --git a/tests/cpp/operator/test_gemm_prodgemm.cu b/tests/cpp/operator/test_gemm_prodgemm.cu
index 02695c387..b25d67f35 100644
--- a/tests/cpp/operator/test_gemm_prodgemm.cu
+++ b/tests/cpp/operator/test_gemm_prodgemm.cu
@@ -258,9 +258,11 @@ void performMxfp8DqTest(size_t m, size_t k, size_t n, bool transa, bool transb)
     ASSERT_EQ(err, cudaSuccess) << cudaGetErrorString(err);
 
     // 7. Compare results
+    // The MXFP8 GEMM and bf16 reference GEMM use different internal accumulation
+    // paths, so results can differ by up to 1 ULP in bf16 (~1.5-2% relative).
     auto [atol, rtol] = getTolerances(dtype);
-    atol = std::max(atol, 5e-4);
-    rtol = std::max(rtol, 1e-3);
+    atol = std::max(atol, 1e-3);
+    rtol = std::max(rtol, 2e-2);
     compareResults("D", D, D_ref.rowwise_cpu_dptr<D_Type>(), true, atol, rtol);
 }
 
@@ -274,39 +276,10 @@ class ProdGemmTestSuite : public ::testing::TestWithParam<ProdGemmParam> {};
 
 // Known-failing GEMM shapes on gfx950
 static const std::set<std::string> kMI355XSkips = {
-    // N=576 + NT: rocroller LDS stride mismatch
+    // N=576 + NT: rocroller LDS stride mismatch (all elements wrong, ~100x off)
     "DeepSeek3_Linear1_fwd_mbs1_NT",
     "DeepSeek3_Linear1_fwd_mbs2_NT",
     "DeepSeek3_Linear1_fwd_mbs4_NT",
-    // Sporadic kernel failures
-    "DeepSeek3_LNLinear0_fwd_mbs2_NT",
-    "DeepSeek3_Linear_attn_fwd_mbs4_NT",
-    "DeepSeek3_LNMLP_gateup_fwd_mbs4_NN",
-    "DeepSeek3_LNMLP_down_fwd_mbs2_NN",
-    "DeepSeek3_attn_wgrad_mbs2_NN",
-    "DeepSeek3_LNLinear0_dgrad_mbs2_NN",
-    "DeepSeek3_LNLinear0_wgrad_mbs4_NN",
-    "DeepSeek3_SharedExp_dn_wgrad_mbs4_NT",
-    // K=128 (minimum for MXFP8): unreliable across layouts
-    "Qwen3_Router_fwd_mbs1_NN",
-    "Qwen3_Router_dgrad_mbs1_NN",
-    "Qwen3_Router_dgrad_mbs1_NT",
-    "Qwen3_Router_dgrad_mbs2_NT",
-    "Qwen3_Router_dgrad_mbs4_TN",
-    "Qwen3_Router_dgrad_mbs4_NT",
-    // Other failures
-    "Qwen3_Linear_attn_wgrad_mbs2_NT",
-    "DeepSeek3_LMHead_fwd_mbs1_NT",
-    "DeepSeek3_LMHead_fwd_mbs4_NN",
-    // Qwen3 LM Head dgrad (N=151936): nearly all combos fail
-    "Qwen3_LMHead_dgrad_mbs1_NN",
-    "Qwen3_LMHead_dgrad_mbs1_NT",
-    "Qwen3_LMHead_dgrad_mbs2_TN",
-    "Qwen3_LMHead_dgrad_mbs2_NN",
-    "Qwen3_LMHead_dgrad_mbs2_NT",
-    "Qwen3_LMHead_dgrad_mbs4_TN",
-    "Qwen3_LMHead_dgrad_mbs4_NN",
-    "Qwen3_LMHead_dgrad_mbs4_NT",
     // Crash (likely OOM / kernel fault)
     "Qwen3_LMHead_fwd_mbs4_TN",
     "Qwen3_LMHead_fwd_mbs4_NN",

From c6cc59f810dfca622de2dd5bd1425b11969be4d2 Mon Sep 17 00:00:00 2001
From: Matthias Diener <matthias.diener@gmail.com>
Date: Mon, 1 Jun 2026 10:56:20 -0500
Subject: [PATCH 07/13] restructure tests into test_cublaslt_gemm

---
 tests/cpp/operator/CMakeLists.txt        |   1 -
 tests/cpp/operator/test_cublaslt_gemm.cu | 185 +++++++++++-
 tests/cpp/operator/test_gemm_prodgemm.cu | 369 -----------------------
 tests/cpp/test_common.cu                 |  43 ---
 tests/cpp/test_common.h                  |   4 -
 5 files changed, 184 insertions(+), 418 deletions(-)
 delete mode 100644 tests/cpp/operator/test_gemm_prodgemm.cu

diff --git a/tests/cpp/operator/CMakeLists.txt b/tests/cpp/operator/CMakeLists.txt
index 0eded7219..0ebd7fdfe 100644
--- a/tests/cpp/operator/CMakeLists.txt
+++ b/tests/cpp/operator/CMakeLists.txt
@@ -39,7 +39,6 @@ if(USE_CUDA)
 else()
   list(APPEND test_cuda_sources
               test_cublaslt_gemm.cu
-              test_gemm_prodgemm.cu
               test_cast_mxfp4_transpose.cu)
 endif()
 
diff --git a/tests/cpp/operator/test_cublaslt_gemm.cu b/tests/cpp/operator/test_cublaslt_gemm.cu
index 85f183bf7..669238baf 100644
--- a/tests/cpp/operator/test_cublaslt_gemm.cu
+++ b/tests/cpp/operator/test_cublaslt_gemm.cu
@@ -5,6 +5,8 @@
  ************************************************************************/
 #include <cmath>
 #include <iostream>
+#include <optional>
+#include <set>
 #include <string>
 #include <cuda_bf16.h>
 #include <cuda_runtime.h>
@@ -33,6 +35,98 @@ std::vector<std::tuple<size_t, size_t, size_t>> test_case_sizes_mxfp8 = {
   {768, 3072, 4096},
 };
 
+// ============================================================================
+// Production LLM shapes for MXFP8 GEMM testing.
+//
+// Each shape is tested with 3 micro-batch sizes (MBS = 1, 2, 4)
+// yielding tokens = 4096, 8192, 16384, and 3 layouts (TN, NN, NT)
+// via ::testing::Combine.
+//
+// GemmPass selects the FP8 type combination:
+//   FWD:   E4M3 x E4M3 -> BF16
+//   DGRAD: E5M2 x E4M3 -> BF16
+//   WGRAD: E4M3 x E5M2 -> BF16
+// ============================================================================
+
+enum class GemmPass { FWD, DGRAD, WGRAD };
+
+struct ShapeDef {
+  const char* label;
+  size_t dim1;   // FWD/DGRAD: N, WGRAD: M
+  size_t dim2;   // FWD/DGRAD: K, WGRAD: N
+  GemmPass pass;
+};
+
+std::ostream& operator<<(std::ostream& os, const ShapeDef& s) {
+  return os << s.label;
+}
+
+static void resolve_mkn(const ShapeDef& s, size_t mbs,
+                         size_t& m, size_t& k, size_t& n) {
+  size_t tokens = mbs * 4096;
+  switch (s.pass) {
+    case GemmPass::FWD:
+    case GemmPass::DGRAD:
+      m = tokens; n = s.dim1; k = s.dim2;
+      break;
+    case GemmPass::WGRAD:
+      m = s.dim1; n = s.dim2; k = tokens;
+      break;
+  }
+}
+
+// DeepSeek3 (hidden=7168, MLA, seq=4096, incl. LM Head)
+static const ShapeDef deepseek3_shapes[] = {
+    // Forward (M=tokens, N, K)
+    {"DeepSeek3_Linear0_fwd",        1536,  7168, GemmPass::FWD},
+    {"DeepSeek3_Linear1_fwd",         576,  7168, GemmPass::FWD},
+    {"DeepSeek3_LNLinear0_fwd",     24576,  1536, GemmPass::FWD},
+    {"DeepSeek3_LNLinear1_fwd",     32768,   512, GemmPass::FWD},
+    {"DeepSeek3_Linear_attn_fwd",    7168, 16384, GemmPass::FWD},
+    {"DeepSeek3_LNMLP_gateup_fwd",  36864,  7168, GemmPass::FWD},
+    {"DeepSeek3_LNMLP_down_fwd",     7168, 18432, GemmPass::FWD},
+    {"DeepSeek3_SharedExp_gu_fwd",   4096,  7168, GemmPass::FWD},
+    {"DeepSeek3_SharedExp_dn_fwd",   7168,  2048, GemmPass::FWD},
+    {"DeepSeek3_TopKRouter_fwd",      256,  7168, GemmPass::FWD},
+    {"DeepSeek3_LMHead_fwd",       129280,  7168, GemmPass::FWD},
+    // Dgrad (M=tokens, N, K)
+    {"DeepSeek3_attn_dgrad",        16384,   7168, GemmPass::DGRAD},
+    {"DeepSeek3_LNLinear1_dgrad",     512,  32768, GemmPass::DGRAD},
+    {"DeepSeek3_LNLinear0_dgrad",    1536,  24576, GemmPass::DGRAD},
+    {"DeepSeek3_SharedExp_dn_dgrad", 2048,   7168, GemmPass::DGRAD},
+    {"DeepSeek3_SharedExp_gu_dgrad", 7168,   4096, GemmPass::DGRAD},
+    {"DeepSeek3_TopKRouter_dgrad",   7168,    256, GemmPass::DGRAD},
+    {"DeepSeek3_MLP_post_dgrad",     7168,  14336, GemmPass::DGRAD},
+    {"DeepSeek3_LMHead_dgrad",       7168, 129280, GemmPass::DGRAD},
+    // Wgrad (M, N, K=tokens)
+    {"DeepSeek3_attn_wgrad",        16384,   7168, GemmPass::WGRAD},
+    {"DeepSeek3_LNLinear1_wgrad",     512,  32768, GemmPass::WGRAD},
+    {"DeepSeek3_LNLinear0_wgrad",    1536,  24576, GemmPass::WGRAD},
+    {"DeepSeek3_SharedExp_dn_wgrad", 2048,   7168, GemmPass::WGRAD},
+    {"DeepSeek3_SharedExp_gu_wgrad", 7168,   4096, GemmPass::WGRAD},
+    {"DeepSeek3_TopKRouter_wgrad",   7168,    256, GemmPass::WGRAD},
+    {"DeepSeek3_LMHead_wgrad",       7168, 129280, GemmPass::WGRAD},
+};
+
+// Qwen3 (hidden=4096, GQA, seq=4096, incl. LM Head)
+static const ShapeDef qwen3_shapes[] = {
+    // Forward (M=tokens, N, K)
+    {"Qwen3_LNLinear_QKV_fwd",  9216,   4096, GemmPass::FWD},
+    {"Qwen3_Linear_attn_fwd",   4096,   8192, GemmPass::FWD},
+    {"Qwen3_Router_fwd",         128,   4096, GemmPass::FWD},
+    {"Qwen3_LMHead_fwd",      151936,   4096, GemmPass::FWD},
+    // Dgrad (M=tokens, N, K)
+    {"Qwen3_Router_dgrad",      4096,    128, GemmPass::DGRAD},
+    {"Qwen3_Linear_attn_dgrad", 8192,   4096, GemmPass::DGRAD},
+    {"Qwen3_LNLinear_dgrad",    4096,   9216, GemmPass::DGRAD},
+    {"Qwen3_LMHead_dgrad",      4096, 151936, GemmPass::DGRAD},
+    // Wgrad (M, N, K=tokens)
+    {"Qwen3_Router_wgrad",      4096,    128, GemmPass::WGRAD},
+    {"Qwen3_Linear_attn_wgrad", 8192,   4096, GemmPass::WGRAD},
+    {"Qwen3_LNLinear_wgrad",    4096,   9216, GemmPass::WGRAD},
+    {"Qwen3_LMHead_wgrad",      4096, 151936, GemmPass::WGRAD},
+};
+
 //  A, B, Bias, Gelu, D
 //  Bias type choose as bf16 in use_fp8, D_type otherwise
 //  Gelu type the same as Bias_Type
@@ -559,7 +653,9 @@ void performTest(const TestParams& params) {
 
 #ifdef __HIP_PLATFORM_AMD__
 template <typename A_Type, typename B_Type, typename D_Type>
-void performDqTest(const TestParams &params) {
+void performDqTest(const TestParams &params,
+                   std::optional<double> atol_override = std::nullopt,
+                   std::optional<double> rtol_override = std::nullopt) {
   DType atype = TypeInfo<A_Type>::dtype;
   DType btype = TypeInfo<B_Type>::dtype;
   DType dtype = TypeInfo<D_Type>::dtype;
@@ -633,6 +729,10 @@ void performDqTest(const TestParams &params) {
 
   //compare results
   auto [atol, rtol] = getTestTolerances(dtype, true, true);
+  if (atol_override)
+    atol = *atol_override;
+  if (rtol_override)
+    rtol = *rtol_override;
   compareResults("D", D, D_ref.rowwise_cpu_dptr<D_Type>(), true, atol, rtol);
 }
 #endif // __HIP_PLATFORM_AMD__
@@ -751,6 +851,89 @@ INSTANTIATE_TEST_SUITE_P(OperatorTest, DqGEMMTestSuite,
                            return MKN(std::get<0>(info.param)) + "x" + TN(std::get<3>(info.param));
                          });
 
+// ============================================================================
+// Production GEMM shape instantiations (run with --gtest_filter='ProdGemm*')
+// ============================================================================
+
+// Known-failing GEMM shapes on gfx950
+static const std::set<std::string> kGfx950Skips = {
+    "DeepSeek3_Linear1_fwd_mbs1_NT",
+    "DeepSeek3_Linear1_fwd_mbs2_NT",
+    "DeepSeek3_Linear1_fwd_mbs4_NT",
+    "DeepSeek3_LNLinear0_fwd_mbs4_NN",
+    "DeepSeek3_LNLinear0_fwd_mbs4_NT",
+    "DeepSeek3_attn_wgrad_mbs1_NN",
+    "Qwen3_LMHead_fwd_mbs2_NN",
+    "Qwen3_Router_fwd_mbs2_NT",
+    "Qwen3_LMHead_fwd_mbs4_TN",
+    "Qwen3_LMHead_fwd_mbs4_NN",
+    "Qwen3_LMHead_fwd_mbs4_NT",
+};
+
+// Production GEMM test suite using ShapeDef x MBS x Layout via testing::Combine.
+using ProdGemmParam = std::tuple<ShapeDef, size_t, Layout>;
+
+class ProdDqGEMMTestSuite : public ::testing::TestWithParam<ProdGemmParam> {};
+
+TEST_P(ProdDqGEMMTestSuite, TestMxfp8Dq) {
+  const auto& shape = std::get<0>(GetParam());
+  size_t mbs = std::get<1>(GetParam());
+  const auto& layout = std::get<2>(GetParam());
+
+  std::string name = std::string(shape.label) + "_mbs" + std::to_string(mbs)
+                     + "_" + TN(layout);
+  if (kGfx950Skips.count(name)) {
+    GTEST_SKIP() << "Known gfx950 hipBLASLt failure: " << name;
+  }
+
+  size_t m, k, n;
+  resolve_mkn(shape, mbs, m, k, n);
+
+  TestParams params = {.m = m, .k = k, .n = n,
+                       .use_bias = false, .use_gelu = false,
+                       .transa = layout.first, .transb = layout.second,
+                       .scaling_mode = NVTEScalingMode::NVTE_MXFP8_1D_SCALING};
+
+  // Production shapes use looser tolerances: the MXFP8 and bf16 reference
+  // GEMM use different internal accumulation paths, so results can differ
+  // by up to 1 ULP in bf16 (~1.5-2% relative).
+  const double prod_atol = 1e-3;
+  const double prod_rtol = 2e-2;
+
+  switch (shape.pass) {
+    case GemmPass::FWD:
+      performDqTest<fp8, fp8, bf16>(params, prod_atol, prod_rtol);
+      break;
+    case GemmPass::DGRAD:
+      performDqTest<bf8, fp8, bf16>(params, prod_atol, prod_rtol);
+      break;
+    case GemmPass::WGRAD:
+      performDqTest<fp8, bf8, bf16>(params, prod_atol, prod_rtol);
+      break;
+  }
+}
+
+static auto prodTestName = [](const testing::TestParamInfo<ProdGemmParam>& info) {
+  const auto& shape = std::get<0>(info.param);
+  size_t mbs = std::get<1>(info.param);
+  const auto& layout = std::get<2>(info.param);
+  return std::string(shape.label) + "_mbs" + std::to_string(mbs) + "_" + TN(layout);
+};
+
+INSTANTIATE_TEST_SUITE_P(ProdGemmDeepSeek3, ProdDqGEMMTestSuite,
+    ::testing::Combine(
+        ::testing::ValuesIn(deepseek3_shapes),
+        ::testing::Values(size_t{1}, size_t{2}, size_t{4}),
+        ::testing::ValuesIn(kLayouts)),
+    prodTestName);
+
+INSTANTIATE_TEST_SUITE_P(ProdGemmQwen3, ProdDqGEMMTestSuite,
+    ::testing::Combine(
+        ::testing::ValuesIn(qwen3_shapes),
+        ::testing::Values(size_t{1}, size_t{2}, size_t{4}),
+        ::testing::ValuesIn(kLayouts)),
+    prodTestName);
+
 TEST(InputGenTest, FillUniform_DoesNotGetOverwrittenByFromCpu) {
   const size_t rows = 128;
   const size_t cols = 256;
diff --git a/tests/cpp/operator/test_gemm_prodgemm.cu b/tests/cpp/operator/test_gemm_prodgemm.cu
deleted file mode 100644
index b25d67f35..000000000
--- a/tests/cpp/operator/test_gemm_prodgemm.cu
+++ /dev/null
@@ -1,369 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2026, Advanced Micro Devices, Inc. All rights reserved.
- *
- * License for AMD contributions = MIT. See LICENSE for more information
- ************************************************************************/
-
-/*
- * MXFP8 GEMM correctness tests for production LLM shapes.
- *
- * Tests forward, dgrad, and wgrad passes with appropriate FP8 type combos:
- *   Forward:  E4M3 x E4M3 -> BF16
- *   Dgrad:    E5M2 x E4M3 -> BF16
- *   Wgrad:    E4M3 x E5M2 -> BF16
- *
- * Each shape is tested with 3 transpose configs (TN, NN, NT) and
- * 3 micro-batch sizes (MBS = 1, 2, 4 -> tokens = 4096, 8192, 16384).
- */
-
-#ifdef __HIP_PLATFORM_AMD__
-
-#include <cmath>
-#include <cstdlib>
-#include <iostream>
-#include <set>
-#include <string>
-#include <vector>
-#include <cuda_bf16.h>
-#include <cuda_runtime.h>
-#include <gtest/gtest.h>
-#include <transformer_engine/cast.h>
-#include <transformer_engine/gemm.h>
-#include <transformer_engine/transformer_engine.h>
-#include "../test_common.h"
-
-using namespace transformer_engine;
-using namespace test;
-
-namespace {
-
-using fp8  = fp8e4m3;
-using bf8  = fp8e5m2;
-
-using TShape = std::vector<size_t>;
-using Layout = std::pair<bool, bool>;  // {transa, transb}
-
-static const Layout kTN{true, false};
-static const Layout kNN{false, false};
-static const Layout kNT{false, true};
-static const std::vector<Layout> kLayouts = {kTN, kNN, kNT};
-
-// ============================================================================
-// GemmPass: determines A/B FP8 type combination
-//   FWD:   fp8  x fp8  (E4M3 x E4M3)
-//   DGRAD: bf8  x fp8  (E5M2 x E4M3)
-//   WGRAD: fp8  x bf8  (E4M3 x E5M2)
-// ============================================================================
-
-enum class GemmPass { FWD, DGRAD, WGRAD };
-
-// ============================================================================
-// Shape definition: describes a GEMM from the model architecture.
-//
-//   Forward / Dgrad: M = tokens, dim1 = N, dim2 = K
-//   Wgrad:           K = tokens, dim1 = M, dim2 = N
-// ============================================================================
-
-struct ShapeDef {
-    const char* label;
-    size_t dim1;
-    size_t dim2;
-    GemmPass pass;
-};
-
-// DeepSeek3 (hidden=7168, MLA, seq=4096)
-
-static const ShapeDef deepseek3_shapes[] = {
-    // Forward (M=tokens, N, K)
-    {"DeepSeek3_Linear0_fwd",        1536,  7168, GemmPass::FWD},
-    {"DeepSeek3_Linear1_fwd",         576,  7168, GemmPass::FWD},
-    {"DeepSeek3_LNLinear0_fwd",     24576,  1536, GemmPass::FWD},
-    {"DeepSeek3_LNLinear1_fwd",     32768,   512, GemmPass::FWD},
-    {"DeepSeek3_Linear_attn_fwd",    7168, 16384, GemmPass::FWD},
-    {"DeepSeek3_LNMLP_gateup_fwd",  36864,  7168, GemmPass::FWD},
-    {"DeepSeek3_LNMLP_down_fwd",     7168, 18432, GemmPass::FWD},
-    {"DeepSeek3_SharedExp_gu_fwd",   4096,  7168, GemmPass::FWD},
-    {"DeepSeek3_SharedExp_dn_fwd",   7168,  2048, GemmPass::FWD},
-    {"DeepSeek3_TopKRouter_fwd",      256,  7168, GemmPass::FWD},
-    // Dgrad (M=tokens, N, K)
-    {"DeepSeek3_attn_dgrad",        16384,  7168, GemmPass::DGRAD},
-    {"DeepSeek3_LNLinear1_dgrad",     512, 32768, GemmPass::DGRAD},
-    {"DeepSeek3_LNLinear0_dgrad",    1536, 24576, GemmPass::DGRAD},
-    {"DeepSeek3_SharedExp_dn_dgrad", 2048,  7168, GemmPass::DGRAD},
-    {"DeepSeek3_SharedExp_gu_dgrad", 7168,  4096, GemmPass::DGRAD},
-    {"DeepSeek3_TopKRouter_dgrad",   7168,   256, GemmPass::DGRAD},
-    {"DeepSeek3_MLP_post_dgrad",     7168, 14336, GemmPass::DGRAD},
-    // Wgrad (M, N, K=tokens)
-    {"DeepSeek3_attn_wgrad",        16384,  7168, GemmPass::WGRAD},
-    {"DeepSeek3_LNLinear1_wgrad",     512, 32768, GemmPass::WGRAD},
-    {"DeepSeek3_LNLinear0_wgrad",    1536, 24576, GemmPass::WGRAD},
-    {"DeepSeek3_SharedExp_dn_wgrad", 2048,  7168, GemmPass::WGRAD},
-    {"DeepSeek3_SharedExp_gu_wgrad", 7168,  4096, GemmPass::WGRAD},
-    {"DeepSeek3_TopKRouter_wgrad",   7168,   256, GemmPass::WGRAD},
-};
-
-// DeepSeek3 LM Head (large N, memory-intensive)
-static const ShapeDef deepseek3_lm_head_shapes[] = {
-    {"DeepSeek3_LMHead_fwd",     129280,   7168, GemmPass::FWD},
-    {"DeepSeek3_LMHead_dgrad",     7168, 129280, GemmPass::DGRAD},
-    {"DeepSeek3_LMHead_wgrad",     7168, 129280, GemmPass::WGRAD},
-};
-
-// Qwen3 (hidden=4096, GQA, seq=4096)
-
-static const ShapeDef qwen3_shapes[] = {
-    // Forward (M=tokens, N, K)
-    {"Qwen3_LNLinear_QKV_fwd",  9216,  4096, GemmPass::FWD},
-    {"Qwen3_Linear_attn_fwd",   4096,  8192, GemmPass::FWD},
-    {"Qwen3_Router_fwd",         128,  4096, GemmPass::FWD},
-    // Dgrad (M=tokens, N, K)
-    {"Qwen3_Router_dgrad",      4096,   128, GemmPass::DGRAD},
-    {"Qwen3_Linear_attn_dgrad", 8192,  4096, GemmPass::DGRAD},
-    {"Qwen3_LNLinear_dgrad",    4096,  9216, GemmPass::DGRAD},
-    // Wgrad (M, N, K=tokens)
-    {"Qwen3_Router_wgrad",      4096,   128, GemmPass::WGRAD},
-    {"Qwen3_Linear_attn_wgrad", 8192,  4096, GemmPass::WGRAD},
-    {"Qwen3_LNLinear_wgrad",    4096,  9216, GemmPass::WGRAD},
-};
-
-// Qwen3 LM Head (large N, memory-intensive)
-static const ShapeDef qwen3_lm_head_shapes[] = {
-    {"Qwen3_LMHead_fwd",       151936,   4096, GemmPass::FWD},
-    {"Qwen3_LMHead_dgrad",       4096, 151936, GemmPass::DGRAD},
-    {"Qwen3_LMHead_wgrad",       4096, 151936, GemmPass::WGRAD},
-};
-
-// ====================================================
-// Test case: a concrete (M, K, N) shape with pass info
-// ====================================================
-
-std::ostream& operator<<(std::ostream& os, const ShapeDef& s) {
-    return os << s.label;
-}
-
-static void resolve_mkn(const ShapeDef& s, size_t mbs,
-                         size_t& m, size_t& k, size_t& n) {
-    size_t tokens = mbs * 4096;
-    switch (s.pass) {
-        case GemmPass::FWD: // Fallthrough, same as DGRAD
-        case GemmPass::DGRAD:
-            m = tokens;
-            n = s.dim1;
-            k = s.dim2;
-            break;
-        case GemmPass::WGRAD:
-            m = s.dim1;
-            n = s.dim2;
-            k = tokens;
-            break;
-    }
-}
-
-// ============================================================================
-// MXFP8 dequantize-based GEMM correctness test
-//
-// 1. Create random source matrices A_src, B_src in D_Type (bf16)
-// 2. Quantize:   A_src -> A_fp8, B_src -> B_fp8  (MXFP8 block scaling)
-// 3. Dequantize: A_fp8 -> A_ref, B_fp8 -> B_ref  (back to D_Type)
-// 4. Swizzle scales for gfx1250 (if needed)
-// 5. MXFP8 GEMM:     D     = A_fp8 * B_fp8
-// 6. Non-FP8 GEMM:   D_ref = A_ref * B_ref
-// 7. Compare D vs D_ref
-// ============================================================================
-
-template <typename A_Type, typename B_Type, typename D_Type>
-void performMxfp8DqTest(size_t m, size_t k, size_t n, bool transa, bool transb) {
-    DType atype = TypeInfo<A_Type>::dtype;
-    DType btype = TypeInfo<B_Type>::dtype;
-    DType dtype = TypeInfo<D_Type>::dtype;
-
-    ASSERT_TRUE(isFp8Type(atype) && isFp8Type(btype)) << "FP8/BF8 input types expected";
-    ASSERT_FALSE(isFp8Type(dtype)) << "Non-FP8 output type expected";
-
-    if (m % 16 || n % 16) {
-        GTEST_SKIP() << "MXFP8 requires M & N to be multiples of 16";
-    }
-    if (k % 128) {
-        GTEST_SKIP() << "MXFP8 requires K to be a multiple of 128";
-    }
-
-    cudaDeviceProp prop;
-    (void)cudaGetDeviceProperties(&prop, 0);
-
-    bool mxfp8_supported = (prop.major == 9 && prop.minor >= 5) || prop.major >= 12;
-    if (!mxfp8_supported) {
-        GTEST_SKIP() << "MXFP8 is not supported on this GPU";
-    }
-
-    TShape a_shape = transa ? TShape{m, k} : TShape{k, m};
-    TShape b_shape = transb ? TShape{k, n} : TShape{n, k};
-
-    // 1. Create random source matrices
-    Tensor A_src("A_src", a_shape, dtype);
-    Tensor B_src("B_src", b_shape, dtype);
-    fillUniform(&A_src);
-    fillUniform(&B_src);
-
-    // 2. Quantize to FP8 with MXFP8 scaling
-    Tensor A_fp8("A_fp8", a_shape, atype, transa, !transa,
-                 NVTEScalingMode::NVTE_MXFP8_1D_SCALING);
-    Tensor B_fp8("B_fp8", b_shape, btype, !transb, transb,
-                 NVTEScalingMode::NVTE_MXFP8_1D_SCALING);
-    nvte_quantize(A_src.data(), A_fp8.data(), 0);
-    nvte_quantize(B_src.data(), B_fp8.data(), 0);
-
-    // 3. Dequantize back to reference type
-    Tensor A_ref("A_ref", a_shape, dtype);
-    Tensor B_ref("B_ref", b_shape, dtype);
-    nvte_dequantize(A_fp8.data(), A_ref.data(), 0);
-    nvte_dequantize(B_fp8.data(), B_ref.data(), 0);
-
-    // 4. Swizzle scales for gfx1250
-    if (prop.major == 12) {
-        const bool a_colwise = !transa;
-        const bool b_colwise = transb;
-        if (!a_colwise) swizzle_mxfp8_scales(A_fp8, true);
-        if (a_colwise)  swizzle_mxfp8_scales(A_fp8, false);
-        if (!b_colwise) swizzle_mxfp8_scales(B_fp8, true);
-        if (b_colwise)  swizzle_mxfp8_scales(B_fp8, false);
-    }
-
-    Tensor bias;
-    Tensor pre_gelu_out;
-
-    size_t workspace_size = 67108864;  // 64 MB
-    Tensor Workspace("Workspace", TShape{workspace_size}, DType::kByte);
-
-    // 5. MXFP8 GEMM
-    Tensor D("D", TShape{n, m}, dtype);
-    nvte_cublas_gemm(A_fp8.data(), B_fp8.data(), D.data(),
-                     bias.data(), pre_gelu_out.data(),
-                     transa, transb, false,
-                     Workspace.data(), false, false,
-                     prop.multiProcessorCount, 0);
-    D.to_cpu();
-
-    // 6. Non-FP8 reference GEMM
-    Tensor D_ref("D_ref", TShape{n, m}, dtype);
-    nvte_cublas_gemm(A_ref.data(), B_ref.data(), D_ref.data(),
-                     bias.data(), pre_gelu_out.data(),
-                     transa, transb, false,
-                     Workspace.data(), false, false,
-                     prop.multiProcessorCount, 0);
-    D_ref.to_cpu();
-
-    // Check for CUDA errors
-    (void)cudaDeviceSynchronize();
-    auto err = cudaGetLastError();
-    ASSERT_EQ(err, cudaSuccess) << cudaGetErrorString(err);
-
-    // 7. Compare results
-    // The MXFP8 GEMM and bf16 reference GEMM use different internal accumulation
-    // paths, so results can differ by up to 1 ULP in bf16 (~1.5-2% relative).
-    auto [atol, rtol] = getTolerances(dtype);
-    atol = std::max(atol, 1e-3);
-    rtol = std::max(rtol, 2e-2);
-    compareResults("D", D, D_ref.rowwise_cpu_dptr<D_Type>(), true, atol, rtol);
-}
-
-// ============================================================================
-// Test suite
-// ============================================================================
-
-using ProdGemmParam = std::tuple<ShapeDef, size_t, Layout>;
-
-class ProdGemmTestSuite : public ::testing::TestWithParam<ProdGemmParam> {};
-
-// Known-failing GEMM shapes on gfx950
-static const std::set<std::string> kMI355XSkips = {
-    // N=576 + NT: rocroller LDS stride mismatch (all elements wrong, ~100x off)
-    "DeepSeek3_Linear1_fwd_mbs1_NT",
-    "DeepSeek3_Linear1_fwd_mbs2_NT",
-    "DeepSeek3_Linear1_fwd_mbs4_NT",
-    // Crash (likely OOM / kernel fault)
-    "Qwen3_LMHead_fwd_mbs4_TN",
-    "Qwen3_LMHead_fwd_mbs4_NN",
-    "Qwen3_LMHead_fwd_mbs4_NT",
-};
-
-TEST_P(ProdGemmTestSuite, TestMxfp8Dq) {
-    const auto& shape = std::get<0>(GetParam());
-    size_t mbs = std::get<1>(GetParam());
-    const auto& layout = std::get<2>(GetParam());
-    bool transa = layout.first;
-    bool transb = layout.second;
-
-    static const char* tn_map[2][2] = {{"NN", "NT"}, {"TN", "TT"}};
-    std::string name = std::string(shape.label) + "_mbs" + std::to_string(mbs)
-                       + "_" + tn_map[transa][transb];
-    if (kMI355XSkips.count(name)) {
-        GTEST_SKIP() << "Known MI355X hipBLASLt failure: " << name;
-    }
-
-    size_t m, k, n;
-    resolve_mkn(shape, mbs, m, k, n);
-
-    switch (shape.pass) {
-        case GemmPass::FWD:
-            performMxfp8DqTest<fp8, fp8, bf16>(m, k, n, transa, transb);
-            break;
-        case GemmPass::DGRAD:
-            performMxfp8DqTest<bf8, fp8, bf16>(m, k, n, transa, transb);
-            break;
-        case GemmPass::WGRAD:
-            performMxfp8DqTest<fp8, bf8, bf16>(m, k, n, transa, transb);
-            break;
-    }
-}
-
-static inline std::string TN(const Layout& layout) {
-    static const char* map[2][2] = {{"NN", "NT"}, {"TN", "TT"}};
-    return map[layout.first][layout.second];
-}
-
-static inline auto testName(const testing::TestParamInfo<ProdGemmParam>& info) {
-    const auto& shape = std::get<0>(info.param);
-    size_t mbs = std::get<1>(info.param);
-    const auto& layout = std::get<2>(info.param);
-    return std::string(shape.label) + "_mbs" + std::to_string(mbs) + "_" + TN(layout);
-}
-
-// DeepSeek3 model shapes
-INSTANTIATE_TEST_SUITE_P(
-    ProdGemmDeepSeek3, ProdGemmTestSuite,
-    ::testing::Combine(
-        ::testing::ValuesIn(deepseek3_shapes),
-        ::testing::Values(size_t{1}, size_t{2}, size_t{4}),
-        ::testing::ValuesIn(kLayouts)),
-    testName);
-
-// Qwen3 model shapes
-INSTANTIATE_TEST_SUITE_P(
-    ProdGemmQwen3, ProdGemmTestSuite,
-    ::testing::Combine(
-        ::testing::ValuesIn(qwen3_shapes),
-        ::testing::Values(size_t{1}, size_t{2}, size_t{4}),
-        ::testing::ValuesIn(kLayouts)),
-    testName);
-
-// DeepSeek3 LM Head shapes (very large N, memory-intensive)
-INSTANTIATE_TEST_SUITE_P(
-    ProdGemmDeepSeek3LMHead, ProdGemmTestSuite,
-    ::testing::Combine(
-        ::testing::ValuesIn(deepseek3_lm_head_shapes),
-        ::testing::Values(size_t{1}, size_t{2}, size_t{4}),
-        ::testing::ValuesIn(kLayouts)),
-    testName);
-
-// Qwen3 LM Head shapes (very large N, memory-intensive)
-INSTANTIATE_TEST_SUITE_P(
-    ProdGemmQwen3LMHead, ProdGemmTestSuite,
-    ::testing::Combine(
-        ::testing::ValuesIn(qwen3_lm_head_shapes),
-        ::testing::Values(size_t{1}, size_t{2}, size_t{4}),
-        ::testing::ValuesIn(kLayouts)),
-    testName);
-
-}  // namespace
-
-#endif  // __HIP_PLATFORM_AMD__
diff --git a/tests/cpp/test_common.cu b/tests/cpp/test_common.cu
index 392e641d5..fbcfdf89d 100644
--- a/tests/cpp/test_common.cu
+++ b/tests/cpp/test_common.cu
@@ -22,9 +22,6 @@
 #endif
 #include <omp.h>
 
-#ifdef __HIP_PLATFORM_AMD__
-#include <transformer_engine/swizzle.h>
-#endif
 #include <transformer_engine/transformer_engine.h>
 #include "util/logging.h"
 
@@ -1317,44 +1314,4 @@ std::array<size_t, 4> get_scale_tensor_dims(const size_t rows,
     return {unpadded_blocks_Y, unpadded_blocks_X, blocks_Y, blocks_X};
 }
 
-#ifdef __HIP_PLATFORM_AMD__
-void swizzle_mxfp8_scales(Tensor& t, bool rowwise) {
-    void* scale_ptr = rowwise ? t.rowwise_scale_inv_dptr()
-                              : t.columnwise_scale_inv_dptr();
-    if (!scale_ptr) return;
-
-    const NVTEShape scale_shape = rowwise ? t.rowwise_scale_inv_shape()
-                                          : t.columnwise_scale_inv_shape();
-    const NVTEShape data_shape  = rowwise ? t.rowwise_shape()
-                                          : t.columnwise_shape();
-
-    size_t num_scales = 1;
-    for (size_t d = 0; d < scale_shape.ndim; d++) num_scales *= scale_shape.data[d];
-
-    uint8_t* d_tmp = nullptr;
-    NVTE_CHECK_CUDA(cudaMalloc(&d_tmp, num_scales));
-
-    TensorWrapper input_tw(NVTE_MXFP8_1D_SCALING);
-    TensorWrapper output_tw(NVTE_MXFP8_1D_SCALING);
-    output_tw.set_with_gemm_swizzled_scales(true);
-
-    if (rowwise) {
-        input_tw.set_rowwise_data(nullptr, t.dtype(), data_shape);
-        input_tw.set_rowwise_scale_inv(scale_ptr, DType::kFloat8E8M0, scale_shape);
-        output_tw.set_rowwise_data(nullptr, t.dtype(), data_shape);
-        output_tw.set_rowwise_scale_inv(d_tmp, DType::kFloat8E8M0, scale_shape);
-    } else {
-        input_tw.set_columnwise_data(nullptr, t.dtype(), data_shape);
-        input_tw.set_columnwise_scale_inv(scale_ptr, DType::kFloat8E8M0, scale_shape);
-        output_tw.set_columnwise_data(nullptr, t.dtype(), data_shape);
-        output_tw.set_columnwise_scale_inv(d_tmp, DType::kFloat8E8M0, scale_shape);
-    }
-
-    nvte_swizzle_scaling_factors(input_tw.data(), output_tw.data(), 0);
-    NVTE_CHECK_CUDA(cudaDeviceSynchronize());
-    NVTE_CHECK_CUDA(cudaMemcpy(scale_ptr, d_tmp, num_scales, cudaMemcpyDeviceToDevice));
-    NVTE_CHECK_CUDA(cudaFree(d_tmp));
-}
-#endif // #ifdef __HIP_PLATFORM_AMD__
-
 }  // namespace test
diff --git a/tests/cpp/test_common.h b/tests/cpp/test_common.h
index 6c37ccc57..a25b7b61e 100644
--- a/tests/cpp/test_common.h
+++ b/tests/cpp/test_common.h
@@ -581,10 +581,6 @@ int32_t getDeviceComputeCapability();
 constexpr int32_t hopperComputeCapability = 90;
 constexpr int32_t blackwellComputeCapability = 100;
 
-#ifdef USE_ROCM
-void swizzle_mxfp8_scales(Tensor& t, bool rowwise);
-#endif
-
 }  // namespace test
 
 #if FP4_TYPE_SUPPORTED

From e46d6da16d18db6c473964b87c895847e3b6b96d Mon Sep 17 00:00:00 2001
From: Matthias Diener <matthias.diener@amd.com>
Date: Thu, 21 May 2026 13:10:22 -0500
Subject: [PATCH 08/13] add MXFP8 pre-swizzling for gfx1250 GEMM (#568)

---
 tests/cpp/operator/CMakeLists.txt             |   4 +-
 tests/cpp/operator/test_cublaslt_gemm.cu      | 101 ++++++++--
 tests/cpp/operator/test_swizzle.cu            | 180 ++++++++++++++++++
 transformer_engine/common/swizzle/swizzle.cu  | 178 +++++++++++++++++
 .../jax/csrc/extensions/gemm.cpp              |  28 ++-
 .../pytorch/csrc/extensions/gemm.cpp          |   6 +-
 .../pytorch/csrc/extensions/swizzle.cpp       |  15 ++
 transformer_engine/pytorch/csrc/quantizer.cpp |  14 ++
 transformer_engine/pytorch/csrc/util.h        |   6 +-
 .../pytorch/tensor/mxfp8_tensor.py            |  28 ++-
 10 files changed, 524 insertions(+), 36 deletions(-)

diff --git a/tests/cpp/operator/CMakeLists.txt b/tests/cpp/operator/CMakeLists.txt
index 2cf44c063..ae2afd486 100644
--- a/tests/cpp/operator/CMakeLists.txt
+++ b/tests/cpp/operator/CMakeLists.txt
@@ -31,11 +31,11 @@ list(APPEND test_cuda_sources
             test_multi_unpadding.cu
             test_causal_softmax.cu
             test_swap_first_dims.cu
+            test_swizzle.cu
 	          ../test_common.cu)
 if(USE_CUDA)
   list(APPEND test_cuda_sources
-              test_cast_float8blockwise.cu
-              test_swizzle.cu)
+              test_cast_float8blockwise.cu)
 else()
   list(APPEND test_cuda_sources
               test_cublaslt_gemm.cu
diff --git a/tests/cpp/operator/test_cublaslt_gemm.cu b/tests/cpp/operator/test_cublaslt_gemm.cu
index 85f183bf7..b8312de00 100644
--- a/tests/cpp/operator/test_cublaslt_gemm.cu
+++ b/tests/cpp/operator/test_cublaslt_gemm.cu
@@ -11,6 +11,7 @@
 #include <gtest/gtest.h>
 #include <transformer_engine/cast.h>
 #include <transformer_engine/gemm.h>
+#include <transformer_engine/swizzle.h>
 #include <transformer_engine/transformer_engine.h>
 #include "../test_common.h"
 
@@ -30,7 +31,15 @@ std::vector<std::tuple<size_t, size_t, size_t>> test_case_sizes = {
 
 std::vector<std::tuple<size_t, size_t, size_t>> test_case_sizes_mxfp8 = {
   {32, 128, 16},
+  {64, 128, 32},
+  {128, 128, 64},
+  {64, 256, 32},
+  {128, 384, 64},
+  {256, 512, 128},
+  {512, 1024, 256},
   {768, 3072, 4096},
+  {1024, 2048, 128},
+  {4096, 8192, 64},
 };
 
 //  A, B, Bias, Gelu, D
@@ -303,6 +312,40 @@ void cpu_rowwise_to_columnwise(
   }
 }
 
+// Swizzle MXFP8 scale_inv of a test::Tensor in-place for gfx1250.
+static void swizzle_mxfp8_scales(test::Tensor &t, bool rowwise) {
+  using namespace transformer_engine;
+  void *scale_ptr = rowwise ? t.rowwise_scale_inv_dptr()
+                            : t.columnwise_scale_inv_dptr();
+  if (!scale_ptr) return;
+  const NVTEShape scale_shape = rowwise ? t.rowwise_scale_inv_shape()
+                                        : t.columnwise_scale_inv_shape();
+  const NVTEShape data_shape = rowwise ? t.rowwise_shape()
+                                       : t.columnwise_shape();
+  size_t num_scales = 1;
+  for (size_t d = 0; d < scale_shape.ndim; d++) num_scales *= scale_shape.data[d];
+  uint8_t *d_tmp = nullptr;
+  NVTE_CHECK_CUDA(cudaMalloc(&d_tmp, num_scales));
+  TensorWrapper input_tw(NVTE_MXFP8_1D_SCALING);
+  TensorWrapper output_tw(NVTE_MXFP8_1D_SCALING);
+  output_tw.set_with_gemm_swizzled_scales(true);
+  if (rowwise) {
+    input_tw.set_rowwise_data(nullptr, t.dtype(), data_shape);
+    input_tw.set_rowwise_scale_inv(scale_ptr, DType::kFloat8E8M0, scale_shape);
+    output_tw.set_rowwise_data(nullptr, t.dtype(), data_shape);
+    output_tw.set_rowwise_scale_inv(d_tmp, DType::kFloat8E8M0, scale_shape);
+  } else {
+    input_tw.set_columnwise_data(nullptr, t.dtype(), data_shape);
+    input_tw.set_columnwise_scale_inv(scale_ptr, DType::kFloat8E8M0, scale_shape);
+    output_tw.set_columnwise_data(nullptr, t.dtype(), data_shape);
+    output_tw.set_columnwise_scale_inv(d_tmp, DType::kFloat8E8M0, scale_shape);
+  }
+  nvte_swizzle_scaling_factors(input_tw.data(), output_tw.data(), 0);
+  NVTE_CHECK_CUDA(cudaDeviceSynchronize());
+  NVTE_CHECK_CUDA(cudaMemcpy(scale_ptr, d_tmp, num_scales, cudaMemcpyDeviceToDevice));
+  NVTE_CHECK_CUDA(cudaFree(d_tmp));
+}
+
 std::pair<double, double> getTestTolerances(const DType type, bool use_fp8, bool use_mxfp8) {
   auto [atol, rtol] = getTolerances(type);
 
@@ -318,6 +361,12 @@ std::pair<double, double> getTestTolerances(const DType type, bool use_fp8, bool
   else if (use_fp8) {
     atol = 1e-3;
     rtol = std::max(rtol, 1e-2);
+    // Relax for gfx1250
+    cudaDeviceProp prop;
+    (void)cudaGetDeviceProperties(&prop, 0);
+    if (prop.major == 12 && type == DType::kBFloat16) {
+      rtol = std::max(rtol, 5e-2);
+    }
   }
   else if (type == DType::kBFloat16) {
     //relax for certain prime number TN gemm
@@ -496,6 +545,31 @@ void performTest(const TestParams& params) {
 #endif
   Tensor Workspace("Workspace", TShape{ workspace_size }, DType::kByte);
 
+  //perform the reference gemm on GPU (before swizzle, which modifies scales in-place)
+  Tensor RefD("RefD", TShape{ params.n, params.m }, dtype);
+  Tensor RefPreGeluOut;
+
+  if (params.use_gelu) {
+    RefPreGeluOut = Tensor("RefPreGeluOut", TShape{ params.n, params.m }, gelu_type);
+  }
+
+  run_reference<A_Type, B_Type, Bias_Type, Gelu_Type, D_Type>(
+    params,
+    A,
+    B,
+    params.use_bias ? &bias : nullptr,
+    D,
+    RefD,
+    params.use_gelu ? &RefPreGeluOut : nullptr);
+
+  // On gfx1250, hipBLASLt MXFP8 kernels expect pre-swizzled scales.
+  if (use_mxfp8 && prop.major == 12) {
+    if (!a_colwise) swizzle_mxfp8_scales(A, true);
+    if (a_colwise)  swizzle_mxfp8_scales(A, false);
+    if (!b_colwise) swizzle_mxfp8_scales(B, true);
+    if (b_colwise)  swizzle_mxfp8_scales(B, false);
+  }
+
   //perform the gemm in GPU
   nvte_cublas_gemm(A.data(),
                    B.data(),
@@ -517,23 +591,6 @@ void performTest(const TestParams& params) {
     pre_gelu_out.to_cpu();
   }
 
-  //perform the reference gemm on GPU
-  Tensor RefD("RefD", TShape{ params.n, params.m }, dtype);
-  Tensor RefPreGeluOut;
-
-  if (params.use_gelu) {
-    RefPreGeluOut = Tensor("RefPreGeluOut", TShape{ params.n, params.m }, gelu_type);
-  }
-
-  run_reference<A_Type, B_Type, Bias_Type, Gelu_Type, D_Type>(
-    params,
-    A,
-    B,
-    params.use_bias ? &bias : nullptr,
-    D,
-    RefD,
-    params.use_gelu ? &RefPreGeluOut : nullptr);
-
   // check if error message happens in running                             
   (void)cudaDeviceSynchronize();
   auto err = cudaGetLastError();
@@ -605,6 +662,16 @@ void performDqTest(const TestParams &params) {
   nvte_dequantize(A_fp8.data(), A_ref.data(), 0);
   nvte_dequantize(B_fp8.data(), B_ref.data(), 0);
 
+  // On gfx1250, hipBLASLt MXFP8 kernels expect pre-swizzled scales.
+  if (prop.major == 12) {
+    const bool a_colwise = !params.transa;
+    const bool b_colwise = params.transb;
+    if (!a_colwise) swizzle_mxfp8_scales(A_fp8, true);
+    if (a_colwise)  swizzle_mxfp8_scales(A_fp8, false);
+    if (!b_colwise) swizzle_mxfp8_scales(B_fp8, true);
+    if (b_colwise)  swizzle_mxfp8_scales(B_fp8, false);
+  }
+
   Tensor bias;
   Tensor pre_gelu_out;
 
diff --git a/tests/cpp/operator/test_swizzle.cu b/tests/cpp/operator/test_swizzle.cu
index 3209d2335..0092a0c62 100644
--- a/tests/cpp/operator/test_swizzle.cu
+++ b/tests/cpp/operator/test_swizzle.cu
@@ -166,3 +166,183 @@ INSTANTIATE_TEST_SUITE_P(
       std::to_string(std::get<2>(info.param));
     return name;
     });
+
+#ifdef __HIP_PLATFORM_AMD__
+
+// MX pre-swizzle test (gfx1250 Tensile 3D layout)
+//
+// Tensile 3D: {K_scale, M}.reshape({K_scale, padM/4, 4}).permute({1, 0, 2})
+// For source (m, k): dst = (m/4) * (K*4) + k*4 + (m%4)
+
+// CPU reference for Tensile 3D MX scale pre-swizzle.
+// Row-major input [M, K], output is a flat permuted array.
+void compute_ref_mx_swizzle_row(const uint8_t *h_input, uint8_t *h_output,
+                                   const int M, const int K,
+                                   const int orig_M, const int orig_K) {
+  constexpr int GROUP = 4;
+  for (int m = 0; m < M; m++) {
+    for (int k = 0; k < K; k++) {
+      uint8_t val = 127;  // E8M0 identity: 2^0 = 1.0
+      if (m < orig_M && k < orig_K) {
+        val = h_input[m * orig_K + k];
+      }
+      int group = k / GROUP;
+      int within = k % GROUP;
+      int dst = group * (M * GROUP) + m * GROUP + within;
+      h_output[dst] = val;
+    }
+  }
+}
+
+void compute_ref_mx_swizzle_col(const uint8_t *h_input, uint8_t *h_output,
+                                   const int M, const int K,
+                                   const int orig_M, const int orig_K) {
+  constexpr int GROUP = 4;
+  for (int m = 0; m < M; m++) {
+    for (int k = 0; k < K; k++) {
+      uint8_t val = 127;
+      if (m < orig_M && k < orig_K) {
+        val = h_input[k * orig_M + m];
+      }
+      int group = k / GROUP;
+      int within = k % GROUP;
+      int dst = group * (M * GROUP) + m * GROUP + within;
+      h_output[dst] = val;
+    }
+  }
+}
+
+static size_t roundup_sz(size_t val, size_t mult) {
+  return ((val + mult - 1) / mult) * mult;
+}
+
+class MxSwizzleTestSuite
+    : public ::testing::TestWithParam<
+          std::tuple<std::pair<int, int>, bool>> {};
+
+TEST_P(MxSwizzleTestSuite, TestMxSwizzle) {
+  using namespace transformer_engine;
+  using namespace test;
+
+  cudaDeviceProp prop;
+  cudaGetDeviceProperties(&prop, 0);
+  if (prop.major < 12) {
+    GTEST_SKIP() << "MXFP8 pre-swizzle is only supported on gfx1250";
+  }
+
+  const auto dims = std::get<0>(GetParam());
+  const bool rowwise = std::get<1>(GetParam());
+
+  // Original (unpadded) scale dimensions
+  const size_t orig_M = dims.first;
+  const size_t orig_K = dims.second;
+
+  // Padded dimensions: K-tiled layout requires K_scale padded to multiple of 4
+  const size_t M = orig_M;
+  const size_t K = roundup_sz(orig_K, 4);
+
+  // Allocate host input (unpadded) and fill with random data
+  const size_t input_size = orig_M * orig_K;
+  std::unique_ptr<uint8_t[]> h_input(new uint8_t[input_size]);
+  std::mt19937 rng(42);
+  for (size_t i = 0; i < input_size; i++) {
+    h_input[i] = static_cast<uint8_t>(rng() % 256);
+  }
+
+  // Allocate device input
+  uint8_t *d_input = nullptr;
+  NVTE_CHECK_CUDA(cudaMalloc(&d_input, input_size));
+  NVTE_CHECK_CUDA(cudaMemcpy(d_input, h_input.get(), input_size, cudaMemcpyHostToDevice));
+
+  // Allocate device output (padded size)
+  const size_t output_size = M * K;
+  uint8_t *d_output = nullptr;
+  NVTE_CHECK_CUDA(cudaMalloc(&d_output, output_size));
+  NVTE_CHECK_CUDA(cudaMemset(d_output, 0, output_size));
+
+  // Build TensorWrapper for input and output
+  TensorWrapper input_tw(NVTE_MXFP8_1D_SCALING);
+  TensorWrapper output_tw(NVTE_MXFP8_1D_SCALING);
+  output_tw.set_with_gemm_swizzled_scales(true);
+
+  // Data shape must be consistent with scale shape for validation.
+  // Scale shapes use padded K; data shapes use unpadded dims
+  // (kernel derives original_M/K from them).
+  if (rowwise) {
+    std::vector<size_t> data_shape_in = {orig_M, orig_K * 32};
+    std::vector<size_t> data_shape_out = {M, K * 32};
+    std::vector<size_t> scale_shape_in = {M, K};
+    std::vector<size_t> scale_shape_out = {M, K};
+    input_tw.set_rowwise_data(nullptr, DType::kFloat8E4M3, data_shape_in);
+    input_tw.set_rowwise_scale_inv(d_input, DType::kFloat8E8M0, scale_shape_in);
+    output_tw.set_rowwise_data(nullptr, DType::kFloat8E4M3, data_shape_out);
+    output_tw.set_rowwise_scale_inv(d_output, DType::kFloat8E8M0, scale_shape_out);
+  } else {
+    std::vector<size_t> data_shape_in = {orig_K * 32, orig_M};
+    std::vector<size_t> data_shape_out = {K * 32, M};
+    std::vector<size_t> scale_shape_in = {K, M};
+    std::vector<size_t> scale_shape_out = {K, M};
+    input_tw.set_columnwise_data(nullptr, DType::kFloat8E4M3, data_shape_in);
+    input_tw.set_columnwise_scale_inv(d_input, DType::kFloat8E8M0, scale_shape_in);
+    output_tw.set_columnwise_data(nullptr, DType::kFloat8E4M3, data_shape_out);
+    output_tw.set_columnwise_scale_inv(d_output, DType::kFloat8E8M0, scale_shape_out);
+  }
+
+  nvte_swizzle_scaling_factors(input_tw.data(), output_tw.data(), 0);
+
+  NVTE_CHECK_CUDA(cudaDeviceSynchronize());
+
+  // Copy output back to host
+  std::unique_ptr<uint8_t[]> h_output(new uint8_t[output_size]);
+  NVTE_CHECK_CUDA(cudaMemcpy(h_output.get(), d_output, output_size, cudaMemcpyDeviceToHost));
+
+  // Compute reference
+  std::unique_ptr<uint8_t[]> h_ref(new uint8_t[output_size]);
+  memset(h_ref.get(), 0, output_size);
+  if (rowwise) {
+    compute_ref_mx_swizzle_row(h_input.get(), h_ref.get(), M, K, orig_M, orig_K);
+  } else {
+    compute_ref_mx_swizzle_col(h_input.get(), h_ref.get(), M, K, orig_M, orig_K);
+  }
+
+  // Compare
+  compareResults("mx_swizzle", h_output.get(), h_ref.get(), output_size);
+
+  cudaFree(d_input);
+  cudaFree(d_output);
+}
+
+namespace {
+
+// Scale dimensions (M_scale, K_scale).
+// K_scale will be padded to multiple of 4 by the test.
+std::vector<std::pair<int, int>> mx_scale_dims = {
+  {4, 4},        // minimal
+  {8, 4},        // small
+  {32, 8},       // medium
+  {64, 16},      // larger
+  {96, 8},       // non-power-of-2 M
+  {128, 32},     // big
+  {256, 64},     // bigger
+  {512, 128},    // stress inter-tile
+  {1024, 256},   // large
+  {4096, 256},   // max stress
+};
+
+}  // namespace
+
+INSTANTIATE_TEST_SUITE_P(
+  OperatorTest,
+  MxSwizzleTestSuite,
+  ::testing::Combine(
+    ::testing::ValuesIn(mx_scale_dims),
+    ::testing::Values(true, false)
+  ),
+  [](const testing::TestParamInfo<MxSwizzleTestSuite::ParamType>& info) {
+    std::string name = "M" + std::to_string(std::get<0>(info.param).first) +
+      "_K" + std::to_string(std::get<0>(info.param).second) +
+      (std::get<1>(info.param) ? "_row" : "_col");
+    return name;
+  });
+
+#endif  // __HIP_PLATFORM_AMD__
diff --git a/transformer_engine/common/swizzle/swizzle.cu b/transformer_engine/common/swizzle/swizzle.cu
index c634c73fb..1324debfb 100644
--- a/transformer_engine/common/swizzle/swizzle.cu
+++ b/transformer_engine/common/swizzle/swizzle.cu
@@ -14,6 +14,7 @@
 #include <type_traits>
 
 #include "../common.h"
+#include "../util/cuda_runtime.h"
 #include "../util/logging.h"
 #include "transformer_engine/transformer_engine.h"
 
@@ -347,9 +348,168 @@ __global__ void multi_tensor_swizzle_col_scaling_kernel(MultiSwizzleArgs kernel_
       input, output, M, K, original_M, original_K, bid_x, bid_y, grid_dim_x, grid_dim_y);
 }
 
+#ifdef __HIP_PLATFORM_AMD__
+// ============================================================================
+// MX scale pre-swizzle kernel for gfx1250 — K-tiled 3D layout
+//
+// hipBLASlt Tensile kernels expect scales in a permuted 3D layout that
+// groups K_scale into tiles of 4 (= 128 / MXBlock32):
+//   Tensor({M, K_scale}).pad(K_scale to mult of 4).reshape({M, K_scale/4, 4}).permute({1, 0, 2})
+//
+// For source position (m, k) in the [M, K_scale] scale matrix:
+//   group  = k / 4
+//   within = k % 4
+//   dst    = group * (M * 4) + m * 4 + within
+//
+// Padding: K_scale to multiple of 4.  No M padding required.
+// Identity padding value: E8M0 127 = 2^0 = 1.0
+//
+// Reference: swizzle_mx_scale() in hipblaslt/clients/common/include/testing_matmul.hpp
+// ============================================================================
+
+constexpr int MX_PRESWIZZLE_GROUP_SIZE = 4;
+
+// Unified MX scale pre-swizzle kernel for both row-wise and column-wise.
+// Iterates only over valid (non-padded) elements; the caller must pre-fill
+// the output buffer with identity (127) to handle padding.
+//
+// kRowwise=true:  input is [orig_M, orig_K] row-major
+// kRowwise=false: input is [orig_K, orig_M] row-major (column-wise scales)
+template <bool kRowwise>
+__global__ void __launch_bounds__(256)
+    swizzle_scaling_mx_kernel(const uint8_t* __restrict__ input,
+                              uint8_t* __restrict__ output,
+                              const int padded_M,
+                              const int orig_M, const int orig_K) {
+  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
+  const int total = orig_M * orig_K;
+  if (idx >= total) return;
+
+  const int m = idx / orig_K;
+  const int k = idx % orig_K;
+
+  uint8_t val;
+  if constexpr (kRowwise) {
+    val = input[idx];  // == input[m * orig_K + k]
+  } else {
+    val = input[k * orig_M + m];
+  }
+
+  const int group = k / MX_PRESWIZZLE_GROUP_SIZE;
+  const int within = k % MX_PRESWIZZLE_GROUP_SIZE;
+  const int dst = group * (padded_M * MX_PRESWIZZLE_GROUP_SIZE)
+                + m * MX_PRESWIZZLE_GROUP_SIZE + within;
+
+  output[dst] = val;
+}
+
 }  // namespace
 
+void swizzle_scaling_factors_mx(const Tensor* input, Tensor* output, cudaStream_t stream) {
+  // Check scaling mode
+  const auto& scaling_mode = input->scaling_mode;
+  NVTE_CHECK(scaling_mode == NVTE_MXFP8_1D_SCALING,
+             "MX pre-swizzle only supports MXFP8 scaling mode (got ",
+             to_string(input->scaling_mode), ").");
+
+  // Check tensors
+  CheckInputTensor(*input, "scaling_factor_input");
+  CheckInputTensor(*output, "scaling_factor_output");
+  NVTE_CHECK(!input->with_gemm_swizzled_scales,
+             "Expected input tensor with scales in compact format.");
+  NVTE_CHECK(output->with_gemm_swizzled_scales,
+             "Expected output tensor with scales in GEMM swizzled format.");
+  NVTE_CHECK(is_fp8_dtype(input->dtype()), "Input tensor has invalid dtype (expected FP8, got ",
+             to_string(input->dtype()), ").");
+
+  // Check if scaling factors are non-trivial
+  const bool has_rowwise_scale_inv = input->scale_inv.has_data();
+  const bool has_columnwise_scale_inv = input->columnwise_scale_inv.has_data();
+  NVTE_CHECK(!has_rowwise_scale_inv || !has_columnwise_scale_inv,
+             "Input tensor has both row-wise and column-wise scaling factors");
+  if (!has_rowwise_scale_inv && !has_columnwise_scale_inv) {
+    return;
+  }
+
+  // Deduce tensor dims
+  int m{0}, k{0};
+  if (has_rowwise_scale_inv) {
+    NVTE_CHECK(input->scale_inv.shape.size() == 2,
+               "Expected 2D scaling factors, got shape=", input->scale_inv.shape, ".");
+    m = input->scale_inv.shape[0];
+    k = input->scale_inv.shape[1];
+  } else if (has_columnwise_scale_inv) {
+    NVTE_CHECK(input->columnwise_scale_inv.shape.size() == 2,
+               "Expected 2D scaling factors, got shape=", input->columnwise_scale_inv.shape, ".");
+    m = input->columnwise_scale_inv.shape[1];
+    k = input->columnwise_scale_inv.shape[0];
+  }
+
+  // Check dims -- K-tiled layout requires K_scale padded to multiple of 4
+  NVTE_CHECK(k % MX_PRESWIZZLE_GROUP_SIZE == 0,
+             "Scale K dimension must be padded to multiple of ", MX_PRESWIZZLE_GROUP_SIZE,
+             ", got ", k, ".");
+
+  // Validate output dimensions match
+  if (has_rowwise_scale_inv) {
+    NVTE_CHECK(output->scale_inv.has_data(),
+               "Output tensor does not have row-wise scaling factors.");
+    NVTE_CHECK(m * k == output->scale_inv.numel(), "Expected output tensor to have ", m * k,
+               " row-wise scaling factors, but got shape=", output->scale_inv.shape, ".");
+  }
+  if (has_columnwise_scale_inv) {
+    NVTE_CHECK(output->columnwise_scale_inv.has_data(),
+               "Output tensor does not have column-wise scaling factors.");
+    NVTE_CHECK(m * k == output->columnwise_scale_inv.numel(),
+               "Expected output tensor to have ", m * k,
+               " column-wise scaling factors, but got shape=",
+               output->columnwise_scale_inv.shape, ".");
+  }
+
+  const int total = m * k;
+  constexpr int block = 256;
+
+  // Row-wise swizzle
+  if (has_rowwise_scale_inv) {
+    const int original_M = input->flat_first_dim();
+    const int original_K = input->flat_last_dim() / MXFP8_BLOCK_SIZE;
+    // Pre-fill output with E8M0 identity (127 = 2^0) to handle padding
+    NVTE_CHECK_CUDA(cudaMemsetAsync(output->scale_inv.dptr, 127, total, stream));
+    const int orig_total = original_M * original_K;
+    const int grid = (orig_total + block - 1) / block;
+    swizzle_scaling_mx_kernel<true><<<grid, block, 0, stream>>>(
+        reinterpret_cast<const uint8_t*>(input->scale_inv.dptr),
+        reinterpret_cast<uint8_t*>(output->scale_inv.dptr),
+        m, original_M, original_K);
+    NVTE_CHECK_CUDA(cudaGetLastError());
+  }
+
+  // Column-wise swizzle
+  if (has_columnwise_scale_inv) {
+    const int original_M = input->flat_last_dim();
+    const int original_K = input->flat_first_dim() / MXFP8_BLOCK_SIZE;
+    // Pre-fill output with E8M0 identity (127 = 2^0) to handle padding
+    NVTE_CHECK_CUDA(cudaMemsetAsync(output->columnwise_scale_inv.dptr, 127, total, stream));
+    const int orig_total = original_M * original_K;
+    const int grid = (orig_total + block - 1) / block;
+    swizzle_scaling_mx_kernel<false><<<grid, block, 0, stream>>>(
+        reinterpret_cast<const uint8_t*>(input->columnwise_scale_inv.dptr),
+        reinterpret_cast<uint8_t*>(output->columnwise_scale_inv.dptr),
+        m, original_M, original_K);
+    NVTE_CHECK_CUDA(cudaGetLastError());
+  }
+}
+#endif  // __HIP_PLATFORM_AMD__
+
 void swizzle_scaling_factors(const Tensor* input, Tensor* output, cudaStream_t stream) {
+#ifdef __HIP_PLATFORM_AMD__
+  // On gfx1250, MXFP8 uses the MX pre-swizzle layout (K-tiled, grouped by 4).
+  if (input->scaling_mode == NVTE_MXFP8_1D_SCALING && cuda::sm_arch() == 125) {
+    swizzle_scaling_factors_mx(input, output, stream);
+    return;
+  }
+#endif  // __HIP_PLATFORM_AMD__
+
   // Check scaling mode
   const auto& scaling_mode = input->scaling_mode;
   NVTE_CHECK(scaling_mode == NVTE_MXFP8_1D_SCALING || scaling_mode == NVTE_NVFP4_1D_SCALING,
@@ -667,6 +827,24 @@ void launch_multi_tensor_swizzle_scaling_factors(MultiSwizzleArgs& kernel_args,
 
 void multi_tensor_swizzle_scaling_factors(const std::vector<Tensor*>& input,
                                           std::vector<Tensor*>& output, cudaStream_t stream) {
+#ifdef __HIP_PLATFORM_AMD__
+  // On gfx1250, MXFP8 uses the MX pre-swizzle layout.
+  if (cuda::sm_arch() == 125) {
+    bool any_mxfp8 = false;
+    for (size_t i = 0; i < input.size(); i++) {
+      if (is_mxfp8_scaling(input[i]->scaling_mode)) {
+        any_mxfp8 = true;
+      }
+    }
+    if (any_mxfp8) {
+      for (size_t i = 0; i < input.size(); i++) {
+        swizzle_scaling_factors_mx(input[i], output[i], stream);
+      }
+      return;
+    }
+  }
+#endif  // __HIP_PLATFORM_AMD__
+
   auto num_tensors = input.size();
   bool all_has_data = true;
   bool all_has_columnwise_data = true;
diff --git a/transformer_engine/jax/csrc/extensions/gemm.cpp b/transformer_engine/jax/csrc/extensions/gemm.cpp
index 40121049a..e32a42b1d 100644
--- a/transformer_engine/jax/csrc/extensions/gemm.cpp
+++ b/transformer_engine/jax/csrc/extensions/gemm.cpp
@@ -80,12 +80,31 @@ std::tuple<TensorWrapper, std::vector<size_t>> xla_buffer_to_nvte_gemm_operand(
                  "Inverse scale factors need to have an 8-bit data type.");
     }
     if (scaling_mode == JAXX_Scaling_Mode::MXFP8_1D_SCALING) {
-      // Assume MXFP8 scales are already swizzled
       if (rowwise) {
         input.set_rowwise_scale_inv(scale_inv.untyped_data(), scale_dtype, scale_shape);
       } else {
         input.set_columnwise_scale_inv(scale_inv.untyped_data(), scale_dtype, scale_shape);
       }
+#ifdef USE_ROCM
+      // On gfx1250, pre-swizzle MXFP8 scales for hipBLASLt
+      if (transformer_engine::cuda::sm_arch() == 125 && swizzle_scale_ptr) {
+        TensorWrapper output(get_nvte_scaling_mode(scaling_mode));
+        if (rowwise) {
+          output.set_rowwise_data(buffer.untyped_data(), input_dtype, input_shape);
+          output.set_rowwise_scale_inv(swizzle_scale_ptr, scale_dtype, scale_shape);
+        } else {
+          output.set_columnwise_data(buffer.untyped_data(), input_dtype, input_shape);
+          output.set_columnwise_scale_inv(swizzle_scale_ptr, scale_dtype, scale_shape);
+        }
+        output.set_with_gemm_swizzled_scales(true);
+        nvte_swizzle_scaling_factors(input.data(), output.data(), stream);
+        if (rowwise) {
+          input.set_rowwise_scale_inv(swizzle_scale_ptr, scale_dtype, scale_shape);
+        } else {
+          input.set_columnwise_scale_inv(swizzle_scale_ptr, scale_dtype, scale_shape);
+        }
+      }
+#endif
       input.set_with_gemm_swizzled_scales(true);
     } else if (is_nvfp4) {  // Swizzle for NVFP4
 #ifdef USE_ROCM
@@ -195,7 +214,12 @@ Error_Type GemmFFI(cudaStream_t stream, Buffer_Type lhs, Buffer_Type lhs_scale_i
   auto workspace_ptr = reinterpret_cast<uint8_t *>(workspace->untyped_data());
   workspace_ptr = move_ptr_to_next_256B_aligned(workspace_ptr);
   size_t workspace_size = static_cast<size_t>(workspace->element_count()) - 256;
-  if (is_nvfp4_scaling(scaling_mode)) {
+  if (is_nvfp4_scaling(scaling_mode)
+#ifdef USE_ROCM
+      || (scaling_mode == JAXX_Scaling_Mode::MXFP8_1D_SCALING
+          && transformer_engine::cuda::sm_arch() == 125)
+#endif
+  ) {
     auto lhs_scale_size = product(lhs_scale_inv.dimensions());
     auto rhs_scale_size = product(rhs_scale_inv.dimensions());
     workspace_size = workspace_size - lhs_scale_size - rhs_scale_size;
diff --git a/transformer_engine/pytorch/csrc/extensions/gemm.cpp b/transformer_engine/pytorch/csrc/extensions/gemm.cpp
index 6898ce387..7a54728c2 100644
--- a/transformer_engine/pytorch/csrc/extensions/gemm.cpp
+++ b/transformer_engine/pytorch/csrc/extensions/gemm.cpp
@@ -244,13 +244,10 @@ std::vector<py::object> gemm(py::handle A, bool transa, py::handle B, bool trans
   config.set_use_split_accumulator(use_split_accumulator);
   config.set_sm_count(num_math_sms);
 
-#ifndef USE_ROCM
   // Keep the swizzled scaling factor tensors alive during the GEMM.
   std::vector<std::optional<at::Tensor>> swizzled_scale_inverses_list;
-#endif
   auto main_stream = at::cuda::getCurrentCUDAStream();
   if (A_tensor.numel() != 0 && B_tensor.numel() != 0) {
-#ifndef USE_ROCM
     // Optionally swizzle the scaling factors
     auto [A_row_scales, A_col_scales] = swizzle_scales_for_gemm(A_tensor, transa, !transa);
     auto [B_row_scales, B_col_scales] = swizzle_scales_for_gemm(B_tensor, !transb, transb);
@@ -259,6 +256,7 @@ std::vector<py::object> gemm(py::handle A, bool transa, py::handle B, bool trans
     swizzled_scale_inverses_list.emplace_back(std::move(B_row_scales));
     swizzled_scale_inverses_list.emplace_back(std::move(B_col_scales));
 
+#ifndef USE_ROCM
     // Emulate the FP8 block scaling recipe with MXFP8 on Blackwell and newer
     // as it is not natively supported by cublasLt
     if (fp8_block_scaling && transformer_engine::cuda::sm_arch() >= 100) {
@@ -532,7 +530,6 @@ std::optional<std::vector<at::Tensor>> te_general_grouped_gemm(
     te_pre_gelu_out_wrappers.emplace_back(std::move(te_pre_gelu_out));
   }
 
-#ifndef USE_ROCM
   // Keep the swizzled scaling factor tensors alive during the GEMM.
   std::vector<std::optional<at::Tensor>> swizzled_scale_inverses_list;
 
@@ -542,6 +539,7 @@ std::optional<std::vector<at::Tensor>> te_general_grouped_gemm(
   swizzled_scale_inverses_list.emplace_back(
       multi_tensor_swizzle_scales_for_gemm(te_B_wrappers, !transb, transb));
 
+#ifndef USE_ROCM
   // Emulate the FP8 block scaling recipe with MXFP8 on Blackwell and newer
   // as it is not natively supported by cublasLt
   if (transformer_engine::cuda::sm_arch() >= 100) {
diff --git a/transformer_engine/pytorch/csrc/extensions/swizzle.cpp b/transformer_engine/pytorch/csrc/extensions/swizzle.cpp
index 4ad57bbf1..d9929c93e 100644
--- a/transformer_engine/pytorch/csrc/extensions/swizzle.cpp
+++ b/transformer_engine/pytorch/csrc/extensions/swizzle.cpp
@@ -13,6 +13,7 @@
 
 #include "common.h"
 #include "common/common.h"
+#include "common/util/cuda_runtime.h"
 #include "extensions.h"
 #include "pybind.h"
 #include "util.h"
@@ -55,6 +56,13 @@ std::tuple<std::optional<at::Tensor>, std::optional<at::Tensor>> swizzle_scales_
       return {std::nullopt, std::nullopt};
   }
 
+#ifdef USE_ROCM
+  // On ROCm, only MXFP8 on gfx1250 needs scale pre-swizzling
+  if (scaling_mode != NVTE_MXFP8_1D_SCALING || transformer_engine::cuda::sm_arch() != 125) {
+    return {std::nullopt, std::nullopt};
+  }
+#endif
+
   // Return early if scales are already swizzled
   if (tensor.get_with_gemm_swizzled_scales()) {
     return {std::nullopt, std::nullopt};
@@ -164,6 +172,13 @@ std::optional<at::Tensor> multi_tensor_swizzle_scales_for_gemm(
       return std::nullopt;
   }
 
+#ifdef USE_ROCM
+  // On ROCm, only MXFP8 on gfx1250 needs scale pre-swizzling
+  if (scaling_mode != NVTE_MXFP8_1D_SCALING || transformer_engine::cuda::sm_arch() != 125) {
+    return std::nullopt;
+  }
+#endif
+
   // Filter out tensors that already have swizzled scales
   std::vector<TensorWrapper *> tensors_needing_swizzle;
   for (auto &tensor : tensors) {
diff --git a/transformer_engine/pytorch/csrc/quantizer.cpp b/transformer_engine/pytorch/csrc/quantizer.cpp
index bb960406d..f1f6d690a 100644
--- a/transformer_engine/pytorch/csrc/quantizer.cpp
+++ b/transformer_engine/pytorch/csrc/quantizer.cpp
@@ -9,6 +9,9 @@
 #include <pybind.h>
 
 #include "common.h"
+#ifdef USE_ROCM
+#include "common/util/cuda_runtime.h"
+#endif
 #include "pybind.h"
 #include "torch/torch.h"
 
@@ -1104,6 +1107,17 @@ std::vector<size_t> MXFP8Quantizer::get_scale_shape(const std::vector<size_t>& s
              "MXFP8 requires tensor dims that are divisible by ", MXFP8_BLOCK_SIZE,
              " (got shape=", shape, ")");
 #ifdef USE_ROCM
+  // gfx1250 MX pre-swizzle (Tensile 3D) layout requires M padded to multiple of 4.
+  if (transformer_engine::cuda::sm_arch() == 125) {
+    size_t m_dim = numel / last_dim;
+    size_t k_scale = last_dim / MXFP8_BLOCK_SIZE;
+    if (!columnwise) {
+      return {roundup(m_dim, 4), k_scale};
+    } else {
+      return {k_scale, roundup(m_dim, 4)};
+    }
+  }
+
   return !columnwise 
          ? std::vector<size_t>{numel / last_dim, last_dim / MXFP8_BLOCK_SIZE} 
          : std::vector<size_t>{numel / last_dim / MXFP8_BLOCK_SIZE, last_dim};
diff --git a/transformer_engine/pytorch/csrc/util.h b/transformer_engine/pytorch/csrc/util.h
index 6588aa6c5..f2310b61f 100644
--- a/transformer_engine/pytorch/csrc/util.h
+++ b/transformer_engine/pytorch/csrc/util.h
@@ -9,8 +9,6 @@
 #ifndef TRANSFORMER_ENGINE_PYTORCH_CSRC_UTIL_H_
 #define TRANSFORMER_ENGINE_PYTORCH_CSRC_UTIL_H_
 
-#ifndef USE_ROCM
-
 #include <torch/extension.h>
 
 #include <optional>
@@ -37,6 +35,7 @@ std::optional<at::Tensor> multi_tensor_swizzle_scales_for_gemm(std::vector<Tenso
                                                                bool rowwise_usage,
                                                                bool columnwise_usage);
 
+#ifndef USE_ROCM
 /*! \brief Convert a block scaling tensor to an mxfp8 tensor in-place.
  *
  *  If rowwise==false, the columnwise data will be reinterpreted as
@@ -50,10 +49,9 @@ std::optional<at::Tensor> multi_tensor_swizzle_scales_for_gemm(std::vector<Tenso
  *  during the GEMM.
  */
 at::Tensor convert_block_scaling_to_mxfp8_tensor(TensorWrapper& input, bool rowwise);
+#endif  //!USE_ROCM
 
 }  // namespace pytorch
 }  // namespace transformer_engine
 
-#endif  //!USE_ROCM
-
 #endif  // TRANSFORMER_ENGINE_PYTORCH_CSRC_UTIL_H_
diff --git a/transformer_engine/pytorch/tensor/mxfp8_tensor.py b/transformer_engine/pytorch/tensor/mxfp8_tensor.py
index bd3d93e9f..faa1ea5a4 100644
--- a/transformer_engine/pytorch/tensor/mxfp8_tensor.py
+++ b/transformer_engine/pytorch/tensor/mxfp8_tensor.py
@@ -19,7 +19,7 @@
 
 from transformer_engine.common.recipe import MXFP8BlockScaling, Recipe
 from ..constants import MXFP8_BLOCK_SCALING_SIZE
-from ..utils import devices_match, round_up_to_nearest_multiple
+from ..utils import devices_match, get_device_compute_capability, round_up_to_nearest_multiple
 from .storage.mxfp8_tensor_storage import MXFP8TensorStorage, _FromMXFP8Func
 from ..quantized_tensor import QuantizedTensor, Quantizer
 from ._quantization_helpers import _IdentityFunc
@@ -144,9 +144,14 @@ def make_empty(
             data = torch.empty(shape, dtype=torch.uint8, device=device)
             # ROCm TE does not implement fuse padding zeros so use zero tensor here
             if IS_HIP_EXTENSION:
+                m_dim = math.prod(shape[:-1])
+                k_scale = math.ceil(shape[-1] / MXFP8_BLOCK_SCALING_SIZE)
+                # gfx1250 MX pre-swizzle layout requires M padded to multiple of 4
+                if get_device_compute_capability() == (12, 5):
+                    m_dim = round_up_to_nearest_multiple(m_dim, 4)
                 scale_inv = torch.zeros(
-                    math.prod(shape[:-1]), 
-                    math.ceil(shape[-1] / MXFP8_BLOCK_SCALING_SIZE),
+                    m_dim,
+                    k_scale,
                     dtype=torch.uint8,
                     device=device,
                     pin_memory=pin_memory,
@@ -169,9 +174,14 @@ def make_empty(
             )
             # ROCm TE does not implement fuse padding zeros so use zero tensor here
             if IS_HIP_EXTENSION:
+                k_scale = math.ceil(math.prod(shape[:-1]) / MXFP8_BLOCK_SCALING_SIZE)
+                m_dim = shape[-1]
+                # gfx1250 MX pre-swizzle layout requires M padded to multiple of 4
+                if get_device_compute_capability() == (12, 5):
+                    m_dim = round_up_to_nearest_multiple(m_dim, 4)
                 columnwise_scale_inv = torch.zeros(
-                    math.ceil(math.prod(shape[:-1]) / MXFP8_BLOCK_SCALING_SIZE),
-                    shape[-1],
+                    k_scale,
+                    m_dim,
                     dtype=torch.uint8,
                     device=device,
                     pin_memory=pin_memory,
@@ -470,8 +480,12 @@ def __torch_dispatch__(cls, func, types, args, kwargs=None):
 
             scale_invs = [tensor._rowwise_scale_inv, tensor._columnwise_scale_inv]
             split_sizes_for_scale = [split_size, split_size // MXFP8_BLOCK_SCALING_SIZE]
-            # Padding requirements: rowwise dim0 should be divisble by 128, columnwise dim0 should be divisble by 4
-            padding_multiples = [128, 4]
+            if IS_HIP_EXTENSION and get_device_compute_capability() == (12, 5):
+                # gfx1250 MX pre-swizzle layout requires M padded to multiple of 4
+                padding_multiples = [4, 1]
+            else:
+                # Padding requirements: rowwise dim0 should be divisble by 128, columnwise dim0 should be divisble by 4
+                padding_multiples = [128, 4]
             for scale_inv, scale_split_size, pad_multiple in zip(
                 scale_invs, split_sizes_for_scale, padding_multiples
             ):

From d5a16c9792cd3155e4333be8373135206bc5c00f Mon Sep 17 00:00:00 2001
From: Matthias Diener <matthias.diener@gmail.com>
Date: Tue, 2 Jun 2026 10:29:27 -0500
Subject: [PATCH 09/13] skip known-failing mxfp8 tests due to hipblaslt
 limitation

---
 tests/cpp/operator/test_cublaslt_gemm.cu | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/tests/cpp/operator/test_cublaslt_gemm.cu b/tests/cpp/operator/test_cublaslt_gemm.cu
index b8312de00..66a0039a5 100644
--- a/tests/cpp/operator/test_cublaslt_gemm.cu
+++ b/tests/cpp/operator/test_cublaslt_gemm.cu
@@ -639,6 +639,17 @@ void performDqTest(const TestParams &params) {
     GTEST_SKIP() << "MXFP8 is not supported in current config";
   }
 
+  // hipBLASLt on gfx950 produces incorrect results for certain small MXFP8
+  // GEMMs with non-TN layouts.
+  if (prop.major == 9 && prop.minor == 5) {
+    const bool is_NN = !params.transa && !params.transb;
+    const bool is_NT = !params.transa && params.transb;
+    if ((is_NN && params.m == 64) ||
+        (is_NT && params.m > 32 && params.m <= 128 && params.n <= 64)) {
+      GTEST_SKIP() << "hipBLASLt MXFP8 non-TN GEMM with small M/N is not supported on gfx950";
+    }
+  }
+
   DType ref_type = dtype;
   TShape a_shape = params.transa ? TShape{params.m, params.k} : TShape{params.k, params.m};
   TShape b_shape = params.transb ? TShape{params.k, params.n} : TShape{params.n, params.k};

From 5f510c17273575f9e0b14d26d63b0bc3aa859de4 Mon Sep 17 00:00:00 2001
From: Matthias Diener <matthias.diener@amd.com>
Date: Wed, 3 Jun 2026 10:21:37 -0500
Subject: [PATCH 10/13] Fix gemm.cpp

---
 transformer_engine/jax/csrc/extensions/gemm.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/transformer_engine/jax/csrc/extensions/gemm.cpp b/transformer_engine/jax/csrc/extensions/gemm.cpp
index 603df8814..d0cac768a 100644
--- a/transformer_engine/jax/csrc/extensions/gemm.cpp
+++ b/transformer_engine/jax/csrc/extensions/gemm.cpp
@@ -242,9 +242,9 @@ Error_Type GemmV2FFI(cudaStream_t stream, Buffer_Type lhs, Buffer_Type lhs_scale
   auto workspace_ptr = reinterpret_cast<uint8_t *>(workspace->untyped_data());
   workspace_ptr = move_ptr_to_next_256B_aligned(workspace_ptr);
   size_t workspace_size = static_cast<size_t>(workspace->element_count()) - 256;
-  if (is_nvfp4_scaling(scaling_mode)
+  if (is_nvfp4_scaling(config.scaling_mode)
 #ifdef USE_ROCM
-      || (scaling_mode == JAXX_Scaling_Mode::MXFP8_1D_SCALING
+      || (config.scaling_mode == JAXX_Scaling_Mode::MXFP8_1D_SCALING
           && transformer_engine::cuda::sm_arch() == 125)
 #endif
   ) {

From bfedb4ac69f87d7758aacd699c1ba489f6d1bd66 Mon Sep 17 00:00:00 2001
From: Matthias Diener <matthias.diener@gmail.com>
Date: Fri, 5 Jun 2026 03:18:35 +0000
Subject: [PATCH 11/13] more padding fixes

---
 transformer_engine/common/transformer_engine.cpp  |  6 ++++--
 transformer_engine/pytorch/csrc/quantizer.cpp     |  7 ++++---
 transformer_engine/pytorch/tensor/mxfp8_tensor.py | 10 ++++++----
 3 files changed, 14 insertions(+), 9 deletions(-)

diff --git a/transformer_engine/common/transformer_engine.cpp b/transformer_engine/common/transformer_engine.cpp
index 60f6174dc..65259d1c5 100644
--- a/transformer_engine/common/transformer_engine.cpp
+++ b/transformer_engine/common/transformer_engine.cpp
@@ -129,8 +129,10 @@ void CheckScaleTensorShape(const Tensor &t, const std::string &name) {
       // Need (4, 128) alignment even for e8 scaling factor
       auto block_alignment = std::vector<size_t>{128ul, 4ul};
 #else
-      // HIP does not use scale padding
-      auto block_alignment = std::vector<size_t>{1ul, 1ul};
+      // HIP does not use scale padding (except gfx1250 which pads both dims to mult of 4)
+      auto block_alignment = (cuda::sm_arch() == 125)
+          ? std::vector<size_t>{4ul, 4ul}
+          : std::vector<size_t>{1ul, 1ul};
 #endif
       size_t expected_x, expected_y, alignment;
       const size_t block_size_rowwise = 32;
diff --git a/transformer_engine/pytorch/csrc/quantizer.cpp b/transformer_engine/pytorch/csrc/quantizer.cpp
index 0ec72028e..f941dcc3c 100644
--- a/transformer_engine/pytorch/csrc/quantizer.cpp
+++ b/transformer_engine/pytorch/csrc/quantizer.cpp
@@ -1698,14 +1698,15 @@ std::vector<size_t> MXFP8Quantizer::get_scale_shape(const std::vector<size_t>& s
              "MXFP8 requires tensor dims that are divisible by ", MXFP8_BLOCK_SIZE,
              " (got shape=", shape, ")");
 #ifdef USE_ROCM
-  // gfx1250 MX pre-swizzle (Tensile 3D) layout requires M padded to multiple of 4.
+  // gfx1250 MX pre-swizzle (Tensile 3D) layout requires both M and K_scale
+  // padded to multiples of 4.
   if (transformer_engine::cuda::sm_arch() == 125) {
     size_t m_dim = numel / last_dim;
     size_t k_scale = last_dim / MXFP8_BLOCK_SIZE;
     if (!columnwise) {
-      return {roundup(m_dim, 4), k_scale};
+      return {roundup(m_dim, 4), roundup(k_scale, 4)};
     } else {
-      return {k_scale, roundup(m_dim, 4)};
+      return {roundup(m_dim / MXFP8_BLOCK_SIZE, 4), roundup(last_dim, 4)};
     }
   }
 
diff --git a/transformer_engine/pytorch/tensor/mxfp8_tensor.py b/transformer_engine/pytorch/tensor/mxfp8_tensor.py
index 442a381bc..851e59a75 100644
--- a/transformer_engine/pytorch/tensor/mxfp8_tensor.py
+++ b/transformer_engine/pytorch/tensor/mxfp8_tensor.py
@@ -146,9 +146,10 @@ def make_empty(
             if IS_HIP_EXTENSION:
                 m_dim = math.prod(shape[:-1])
                 k_scale = math.ceil(shape[-1] / MXFP8_BLOCK_SCALING_SIZE)
-                # gfx1250 MX pre-swizzle layout requires M padded to multiple of 4
+                # gfx1250 MX pre-swizzle layout requires both dims padded to multiple of 4
                 if get_device_compute_capability() == (12, 5):
                     m_dim = round_up_to_nearest_multiple(m_dim, 4)
+                    k_scale = round_up_to_nearest_multiple(k_scale, 4)
                 scale_inv = torch.zeros(
                     m_dim,
                     k_scale,
@@ -176,8 +177,9 @@ def make_empty(
             if IS_HIP_EXTENSION:
                 k_scale = math.ceil(math.prod(shape[:-1]) / MXFP8_BLOCK_SCALING_SIZE)
                 m_dim = shape[-1]
-                # gfx1250 MX pre-swizzle layout requires M padded to multiple of 4
+                # gfx1250 MX pre-swizzle layout requires both dims padded to multiple of 4
                 if get_device_compute_capability() == (12, 5):
+                    k_scale = round_up_to_nearest_multiple(k_scale, 4)
                     m_dim = round_up_to_nearest_multiple(m_dim, 4)
                 columnwise_scale_inv = torch.zeros(
                     k_scale,
@@ -524,8 +526,8 @@ def __torch_dispatch__(cls, func, types, args, kwargs=None):
             scale_invs = [tensor._rowwise_scale_inv, tensor._columnwise_scale_inv]
             split_sizes_for_scale = [split_size, split_size // MXFP8_BLOCK_SCALING_SIZE]
             if IS_HIP_EXTENSION and get_device_compute_capability() == (12, 5):
-                # gfx1250 MX pre-swizzle layout requires M padded to multiple of 4
-                padding_multiples = [4, 1]
+                # gfx1250 MX pre-swizzle layout requires both dims padded to multiple of 4
+                padding_multiples = [4, 4]
             else:
                 # Padding requirements: rowwise dim0 should be divisble by 128, columnwise dim0 should be divisble by 4
                 padding_multiples = [128, 4]

From ce60ce0e626626ff5f1004fdee435f5e25bf570b Mon Sep 17 00:00:00 2001
From: Matthias Diener <matthias.diener@gmail.com>
Date: Wed, 10 Jun 2026 19:55:23 +0000
Subject: [PATCH 12/13] increase jax WS size

---
 transformer_engine/jax/cpp_extensions/gemm.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/transformer_engine/jax/cpp_extensions/gemm.py b/transformer_engine/jax/cpp_extensions/gemm.py
index c387eb51c..3919086cb 100644
--- a/transformer_engine/jax/cpp_extensions/gemm.py
+++ b/transformer_engine/jax/cpp_extensions/gemm.py
@@ -621,7 +621,12 @@ def _dims_are_consecutive(dims):
         # Declare cuBLAS workspace
         workspace_size = get_cublas_workspace_size_bytes()
         # NVFP4 swizzling happen in via nvte kernel instead of JAX transposes
-        if scaling_mode.is_nvfp4_scaling:
+        # On gfx1250, MXFP8 scale pre-swizzling also needs workspace space
+        if scaling_mode.is_nvfp4_scaling or (
+            scaling_mode.is_mxfp8_scaling
+            and is_hip_extension()
+            and get_device_compute_capability(0) == 125
+        ):
             workspace_size += lhs_scale_inv.size + rhs_scale_inv.size
         if not collective_op.is_none:
             workspace_size *= get_cgemm_num_max_streams()

From b668a2c48ba0d080409ee36162abcc29da75bf07 Mon Sep 17 00:00:00 2001
From: Matthias Diener <matthias.diener@gmail.com>
Date: Wed, 10 Jun 2026 22:38:45 +0000
Subject: [PATCH 13/13] more support for non-TN

---
 transformer_engine/common/gemm/rocm_gemm.cu | 45 ++++++++++++++++-----
 1 file changed, 35 insertions(+), 10 deletions(-)

diff --git a/transformer_engine/common/gemm/rocm_gemm.cu b/transformer_engine/common/gemm/rocm_gemm.cu
index 51d90e591..e2809d9c6 100644
--- a/transformer_engine/common/gemm/rocm_gemm.cu
+++ b/transformer_engine/common/gemm/rocm_gemm.cu
@@ -29,6 +29,7 @@
 #include "../common.h"
 #include "../util/vectorized_pointwise.h"
 #include "../util/logging.h"
+#include "../util/cuda_runtime.h"
 
 namespace transformer_engine {
 
@@ -310,14 +311,26 @@ GemmParam CanonicalizeGemmInput(const transformer_engine::Tensor &A, const cubla
     // dimensions (with matrix interpreted in row-major order).
     if (is_A_transposed) {
       NVTE_CHECK(A.has_data(), "Input A is missing row-wise usage");
+      ret.A = A.data.dptr;
+      ret.transA = CUBLAS_OP_T;
+      ret.Atype = A.data.dtype;
+      ret.A_scale_inv = A.scale_inv.dptr;
+      ret.lda = k;
     } else {
       NVTE_CHECK(A.has_columnwise_data(), "Input A is missing column-wise usage");
+      ret.A = A.columnwise_data.dptr;
+      ret.Atype = A.columnwise_data.dtype;
+      ret.A_scale_inv = A.columnwise_scale_inv.dptr;
+      // On gfx1250, hipBLASLt only supports TN layout for MXFP8.
+      // Convert A from N to T using columnwise data (same as tensor scaling).
+      if (cuda::sm_arch() == 125) {
+        ret.transA = CUBLAS_OP_T;
+        ret.lda = k;
+      } else {
+        ret.transA = transA;
+        ret.lda = m;
+      }
     }
-    ret.A = is_A_transposed ? A.data.dptr : A.columnwise_data.dptr;
-    ret.transA = transA;
-    ret.Atype = is_A_transposed ? A.data.dtype : A.columnwise_data.dtype;
-    ret.A_scale_inv = is_A_transposed ? A.scale_inv.dptr : A.columnwise_scale_inv.dptr;
-    ret.lda = is_A_transposed ? k : m;
   } else if (is_nvfp_scaling(A.scaling_mode)) {
     // NVFP4: dequant path always produces TN layout for the BF16 GEMM,
     // but the source data may come from either rowwise or columnwise buffers.
@@ -355,14 +368,26 @@ GemmParam CanonicalizeGemmInput(const transformer_engine::Tensor &A, const cubla
     // dimensions (with matrix interpreted in row-major order).
     if (is_B_transposed) {
       NVTE_CHECK(B.has_columnwise_data(), "Input B is missing column-wise usage");
+      ret.B = B.columnwise_data.dptr;
+      ret.Btype = B.columnwise_data.dtype;
+      ret.B_scale_inv = B.columnwise_scale_inv.dptr;
+      // On gfx1250, hipBLASLt only supports TN layout for MXFP8.
+      // Convert B from T to N using columnwise data (same as tensor scaling).
+      if (cuda::sm_arch() == 125) {
+        ret.transB = CUBLAS_OP_N;
+        ret.ldb = k;
+      } else {
+        ret.transB = transB;
+        ret.ldb = n;
+      }
     } else {
       NVTE_CHECK(B.has_data(), "Input B is missing row-wise usage");
+      ret.B = B.data.dptr;
+      ret.transB = CUBLAS_OP_N;
+      ret.Btype = B.data.dtype;
+      ret.B_scale_inv = B.scale_inv.dptr;
+      ret.ldb = k;
     }
-    ret.B = is_B_transposed ? B.columnwise_data.dptr : B.data.dptr;
-    ret.transB = transB;
-    ret.Btype = is_B_transposed ? B.columnwise_data.dtype : B.data.dtype;
-    ret.B_scale_inv = is_B_transposed ? B.columnwise_scale_inv.dptr : B.scale_inv.dptr;
-    ret.ldb = is_B_transposed ? n : k;
   } else if (is_nvfp_scaling(B.scaling_mode)) {
     // NVFP4: dequant path always produces TN layout for the BF16 GEMM,
     // but the source data may come from either rowwise or columnwise buffers.