From 815ce061d3dc6c72353781ebddc2597087ad68ec Mon Sep 17 00:00:00 2001
From: Yue Sun <Yue.Sun2@amd.com>
Date: Fri, 8 May 2026 11:03:25 +0800
Subject: [PATCH 01/12] add dot_scaled repro script and md file

---
 .../unittest/mxfp8/repro_mxfp8_dot_scaled.md  | 29 ++++++++
 .../unittest/mxfp8/repro_mxfp8_dot_scaled.py  | 67 +++++++++++++++++++
 2 files changed, 96 insertions(+)
 create mode 100644 tests/unittest/mxfp8/repro_mxfp8_dot_scaled.md
 create mode 100644 tests/unittest/mxfp8/repro_mxfp8_dot_scaled.py

diff --git a/tests/unittest/mxfp8/repro_mxfp8_dot_scaled.md b/tests/unittest/mxfp8/repro_mxfp8_dot_scaled.md
new file mode 100644
index 0000000..8977b42
--- /dev/null
+++ b/tests/unittest/mxfp8/repro_mxfp8_dot_scaled.md
@@ -0,0 +1,29 @@
+## MXFP8 `tl.dot_scaled` Repro
+
+Minimal reproducer for Triton's MXFP8 `tl.dot_scaled`.
+
+The script builds `e4m3` MXFP8 GEMM inputs, runs one Triton kernel that calls
+`tl.dot_scaled`, then compares the result with:
+
+```python
+convert_from_mxfp8(a) @ convert_from_mxfp8(b)
+```
+
+### Cases
+
+The same `dot_scaled_kernel` covers both cases:
+
+1. `K=32`: one `tl.dot_scaled` call.
+2. `K=128`: four `tl.dot_scaled` calls accumulated with `BLOCK_K=32`.
+
+### How to run
+
+```bash
+python3 repro_mxfp8_dot_scaled.py
+```
+
+Run it in the same CDNA4 environment used for the MXFP8 kernel tests.
+
+### Expected signal
+
+The script prints `allclose`, `max_diff`, and `mean_diff` for both cases.
diff --git a/tests/unittest/mxfp8/repro_mxfp8_dot_scaled.py b/tests/unittest/mxfp8/repro_mxfp8_dot_scaled.py
new file mode 100644
index 0000000..9814030
--- /dev/null
+++ b/tests/unittest/mxfp8/repro_mxfp8_dot_scaled.py
@@ -0,0 +1,67 @@
+import torch
+import triton
+import triton.language as tl
+
+import alto.kernels.mxfp8.mxfp8_quantization  # noqa: F401
+
+
+@triton.jit
+def dot_scaled_kernel(a_ptr, b_ptr, a_s_ptr, b_s_ptr, c_ptr, K: tl.constexpr):
+    offs_m = tl.arange(0, 64)
+    offs_n = tl.arange(0, 64)
+
+    acc = tl.zeros((64, 64), dtype=tl.float32)
+    for k0 in range(0, K, 32):
+        offs_k = k0 + tl.arange(0, 32)
+        scale_k = k0 // 32
+
+        a = tl.load(a_ptr + offs_m[:, None] * K + offs_k[None, :])
+        b = tl.load(b_ptr + offs_k[:, None] * 64 + offs_n[None, :])
+        a_s = tl.load(a_s_ptr + offs_m[:, None] * (K // 32) + scale_k)
+        b_s = tl.load(b_s_ptr + scale_k * 64 + offs_n[:, None])
+
+        acc = tl.dot_scaled(a, a_s, "e4m3", b, b_s, "e4m3", acc=acc, out_dtype=tl.float32)
+
+    tl.store(c_ptr + offs_m[:, None] * 64 + offs_n[None, :], acc)
+
+
+def run_case(name, k):
+    m, n = 64, 64
+    torch.manual_seed(0)
+
+    a = torch.randn((m, k), device="cuda") * 0.1
+    b = torch.randn((k, n), device="cuda") * 0.1
+    a[:, :32] *= 1024
+    b[:32, :] *= 1024
+
+    a_lp, a_s = torch.ops.alto.convert_to_mxfp8(
+        a, mxfp_format="e4m3", axis=-1, is_2d_block=False
+    )
+    b_lp, b_s = torch.ops.alto.convert_to_mxfp8(
+        b, mxfp_format="e4m3", axis=-2, is_2d_block=False
+    )
+    a_dq = torch.ops.alto.convert_from_mxfp8(
+        a_lp, a_s, output_dtype=torch.float32, axis=-1, is_2d_block=False
+    )
+    b_dq = torch.ops.alto.convert_from_mxfp8(
+        b_lp, b_s, output_dtype=torch.float32, axis=-2, is_2d_block=False
+    )
+
+    out = torch.empty((m, n), device="cuda", dtype=torch.float32)
+    dot_scaled_kernel[(1,)](a_lp, b_lp, a_s, b_s, out, K=k)
+
+    ref = a_dq @ b_dq
+    diff = (out - ref).abs()
+    print(f"== {name} ==")
+    print(f"allclose={torch.allclose(out, ref)}")
+    print(f"max_diff={diff.max().item():.6f}")
+    print(f"mean_diff={diff.mean().item():.6f}")
+
+
+def main():
+    run_case("single tl.dot_scaled, K=32", 32)
+    run_case("K-chain tl.dot_scaled, K=128", 128)
+
+
+if __name__ == "__main__":
+    main()

From 3f4d5df7a24c28a7fdf179f9acc178ac47812b80 Mon Sep 17 00:00:00 2001
From: Yue Sun <Yue.Sun2@amd.com>
Date: Fri, 8 May 2026 12:09:10 +0800
Subject: [PATCH 02/12] add minimal MXFP8 dot_scaled multi-block repro

---
 .../unittest/mxfp8/repro_mxfp8_dot_scaled.md  | 15 +++++---
 .../unittest/mxfp8/repro_mxfp8_dot_scaled.py  | 36 ++++++++++++-------
 2 files changed, 34 insertions(+), 17 deletions(-)

diff --git a/tests/unittest/mxfp8/repro_mxfp8_dot_scaled.md b/tests/unittest/mxfp8/repro_mxfp8_dot_scaled.md
index 8977b42..bc75cec 100644
--- a/tests/unittest/mxfp8/repro_mxfp8_dot_scaled.md
+++ b/tests/unittest/mxfp8/repro_mxfp8_dot_scaled.md
@@ -1,6 +1,6 @@
 ## MXFP8 `tl.dot_scaled` Repro
 
-Minimal reproducer for Triton's MXFP8 `tl.dot_scaled`.
+Minimal reproducer for the MXFP8 `tl.dot_scaled` multi-block accuracy issue.
 
 The script builds `e4m3` MXFP8 GEMM inputs, runs one Triton kernel that calls
 `tl.dot_scaled`, then compares the result with:
@@ -11,10 +11,14 @@ convert_from_mxfp8(a) @ convert_from_mxfp8(b)
 
 ### Cases
 
-The same `dot_scaled_kernel` covers both cases:
+The same `dot_scaled_kernel` covers three cases:
 
-1. `K=32`: one `tl.dot_scaled` call.
-2. `K=128`: four `tl.dot_scaled` calls accumulated with `BLOCK_K=32`.
+1. `K=32, DOT_K=32`: baseline, one `tl.dot_scaled` call over one quant block.
+2. `K=128, DOT_K=32`: safe path, four `tl.dot_scaled` calls, each over one quant block.
+3. `K=128, DOT_K=128`: problem path, one `tl.dot_scaled` call spanning four
+   32-wide quant blocks.
+
+The inputs include outlier-heavy 32-wide K blocks to make scale disparity visible.
 
 ### How to run
 
@@ -26,4 +30,5 @@ Run it in the same CDNA4 environment used for the MXFP8 kernel tests.
 
 ### Expected signal
 
-The script prints `allclose`, `max_diff`, and `mean_diff` for both cases.
+The problem path should show a noticeably larger error than the safe path. The script
+prints `allclose`, `max_diff`, `mean_diff`, and `relative_max_diff` for each case.
diff --git a/tests/unittest/mxfp8/repro_mxfp8_dot_scaled.py b/tests/unittest/mxfp8/repro_mxfp8_dot_scaled.py
index 9814030..dbc2255 100644
--- a/tests/unittest/mxfp8/repro_mxfp8_dot_scaled.py
+++ b/tests/unittest/mxfp8/repro_mxfp8_dot_scaled.py
@@ -6,33 +6,42 @@
 
 
 @triton.jit
-def dot_scaled_kernel(a_ptr, b_ptr, a_s_ptr, b_s_ptr, c_ptr, K: tl.constexpr):
+def dot_scaled_kernel(a_ptr, b_ptr, a_s_ptr, b_s_ptr, c_ptr, K: tl.constexpr, DOT_K: tl.constexpr):
     offs_m = tl.arange(0, 64)
     offs_n = tl.arange(0, 64)
+    scale_count: tl.constexpr = K // 32
+    dot_scale_count: tl.constexpr = DOT_K // 32
 
     acc = tl.zeros((64, 64), dtype=tl.float32)
-    for k0 in range(0, K, 32):
-        offs_k = k0 + tl.arange(0, 32)
-        scale_k = k0 // 32
+    for k0 in range(0, K, DOT_K):
+        offs_k = k0 + tl.arange(0, DOT_K)
+        offs_scale = k0 // 32 + tl.arange(0, dot_scale_count)
 
         a = tl.load(a_ptr + offs_m[:, None] * K + offs_k[None, :])
         b = tl.load(b_ptr + offs_k[:, None] * 64 + offs_n[None, :])
-        a_s = tl.load(a_s_ptr + offs_m[:, None] * (K // 32) + scale_k)
-        b_s = tl.load(b_s_ptr + scale_k * 64 + offs_n[:, None])
+        a_s = tl.load(a_s_ptr + offs_m[:, None] * scale_count + offs_scale[None, :])
+        b_s = tl.load(b_s_ptr + offs_n[:, None] + offs_scale[None, :] * 64)
 
         acc = tl.dot_scaled(a, a_s, "e4m3", b, b_s, "e4m3", acc=acc, out_dtype=tl.float32)
 
     tl.store(c_ptr + offs_m[:, None] * 64 + offs_n[None, :], acc)
 
 
-def run_case(name, k):
+def run_case(name, k, dot_k):
     m, n = 64, 64
     torch.manual_seed(0)
 
     a = torch.randn((m, k), device="cuda") * 0.1
     b = torch.randn((k, n), device="cuda") * 0.1
-    a[:, :32] *= 1024
-    b[:32, :] *= 1024
+    # Make each 32-wide quant block use a different scale range.
+    # This stresses the DOT_K=128 path, where one tl.dot_scaled spans multiple blocks.
+    quant_block_ratios = (1024, 1, 256, 16)
+    for block_idx, ratio in enumerate(quant_block_ratios):
+        start = block_idx * 32
+        end = min(start + 32, k)
+        if start < k:
+            a[:, start:end] *= ratio
+            b[start:end, :] *= ratio
 
     a_lp, a_s = torch.ops.alto.convert_to_mxfp8(
         a, mxfp_format="e4m3", axis=-1, is_2d_block=False
@@ -48,19 +57,22 @@ def run_case(name, k):
     )
 
     out = torch.empty((m, n), device="cuda", dtype=torch.float32)
-    dot_scaled_kernel[(1,)](a_lp, b_lp, a_s, b_s, out, K=k)
+    dot_scaled_kernel[(1,)](a_lp, b_lp, a_s, b_s, out, K=k, DOT_K=dot_k)
 
     ref = a_dq @ b_dq
     diff = (out - ref).abs()
+    ref_max = ref.abs().max()
     print(f"== {name} ==")
     print(f"allclose={torch.allclose(out, ref)}")
     print(f"max_diff={diff.max().item():.6f}")
     print(f"mean_diff={diff.mean().item():.6f}")
+    print(f"relative_max_diff={(diff.max() / ref_max).item():.6f}")
 
 
 def main():
-    run_case("single tl.dot_scaled, K=32", 32)
-    run_case("K-chain tl.dot_scaled, K=128", 128)
+    run_case("baseline: K=32, DOT_K=32", 32, 32)
+    run_case("safe path: K=128, DOT_K=32", 128, 32)
+    run_case("problem path: K=128, DOT_K=128", 128, 128)
 
 
 if __name__ == "__main__":

From 11ae961698fd44e157e2e682a851f2b84f064fef Mon Sep 17 00:00:00 2001
From: Yue Sun <Yue.Sun2@amd.com>
Date: Fri, 8 May 2026 02:04:58 -0500
Subject: [PATCH 03/12] fix MXFP8 dot_scaled multi-block repro signal

---
 .../unittest/mxfp8/repro_mxfp8_dot_scaled.py  | 76 +++++++++++++------
 1 file changed, 52 insertions(+), 24 deletions(-)

diff --git a/tests/unittest/mxfp8/repro_mxfp8_dot_scaled.py b/tests/unittest/mxfp8/repro_mxfp8_dot_scaled.py
index dbc2255..2ea2094 100644
--- a/tests/unittest/mxfp8/repro_mxfp8_dot_scaled.py
+++ b/tests/unittest/mxfp8/repro_mxfp8_dot_scaled.py
@@ -5,40 +5,48 @@
 import alto.kernels.mxfp8.mxfp8_quantization  # noqa: F401
 
 
+BLOCK_M = 64
+BLOCK_N = 64
+QUANT_BLOCK_SIZE = 32
+OUTLIER_BLOCK_RATIOS = (1024, 1, 256, 16)
+
+
 @triton.jit
 def dot_scaled_kernel(a_ptr, b_ptr, a_s_ptr, b_s_ptr, c_ptr, K: tl.constexpr, DOT_K: tl.constexpr):
-    offs_m = tl.arange(0, 64)
-    offs_n = tl.arange(0, 64)
-    scale_count: tl.constexpr = K // 32
-    dot_scale_count: tl.constexpr = DOT_K // 32
-
-    acc = tl.zeros((64, 64), dtype=tl.float32)
-    for k0 in range(0, K, DOT_K):
-        offs_k = k0 + tl.arange(0, DOT_K)
-        offs_scale = k0 // 32 + tl.arange(0, dot_scale_count)
+    block_m: tl.constexpr = 64
+    block_n: tl.constexpr = 64
+    quant_block_size: tl.constexpr = 32
+    offs_m = tl.arange(0, block_m)
+    offs_n = tl.arange(0, block_n)
+    scale_count: tl.constexpr = K // quant_block_size
+    dot_scale_count: tl.constexpr = DOT_K // quant_block_size
+
+    acc = tl.zeros((block_m, block_n), dtype=tl.float32)
+    for block_start in range(0, K, DOT_K):
+        offs_k = block_start + tl.arange(0, DOT_K)
+        offs_scale = block_start // quant_block_size + tl.arange(0, dot_scale_count)
 
         a = tl.load(a_ptr + offs_m[:, None] * K + offs_k[None, :])
-        b = tl.load(b_ptr + offs_k[:, None] * 64 + offs_n[None, :])
+        b = tl.load(b_ptr + offs_k[:, None] * block_n + offs_n[None, :])
         a_s = tl.load(a_s_ptr + offs_m[:, None] * scale_count + offs_scale[None, :])
-        b_s = tl.load(b_s_ptr + offs_n[:, None] + offs_scale[None, :] * 64)
+        b_s = tl.load(b_s_ptr + offs_n[:, None] + offs_scale[None, :] * block_n)
 
         acc = tl.dot_scaled(a, a_s, "e4m3", b, b_s, "e4m3", acc=acc, out_dtype=tl.float32)
 
-    tl.store(c_ptr + offs_m[:, None] * 64 + offs_n[None, :], acc)
+    tl.store(c_ptr + offs_m[:, None] * block_n + offs_n[None, :], acc)
 
 
 def run_case(name, k, dot_k):
-    m, n = 64, 64
+    m, n = BLOCK_M, BLOCK_N
     torch.manual_seed(0)
 
     a = torch.randn((m, k), device="cuda") * 0.1
     b = torch.randn((k, n), device="cuda") * 0.1
     # Make each 32-wide quant block use a different scale range.
     # This stresses the DOT_K=128 path, where one tl.dot_scaled spans multiple blocks.
-    quant_block_ratios = (1024, 1, 256, 16)
-    for block_idx, ratio in enumerate(quant_block_ratios):
-        start = block_idx * 32
-        end = min(start + 32, k)
+    for block_idx, ratio in enumerate(OUTLIER_BLOCK_RATIOS):
+        start = block_idx * QUANT_BLOCK_SIZE
+        end = start + QUANT_BLOCK_SIZE
         if start < k:
             a[:, start:end] *= ratio
             b[start:end, :] *= ratio
@@ -49,6 +57,10 @@ def run_case(name, k, dot_k):
     b_lp, b_s = torch.ops.alto.convert_to_mxfp8(
         b, mxfp_format="e4m3", axis=-2, is_2d_block=False
     )
+    # axis=-2 conversion returns transposed-stride B tensors. This repro uses
+    # a minimal pointer-arithmetic kernel, so normalize B to isolate dot_scaled.
+    b_lp = b_lp.contiguous()
+    b_s = b_s.contiguous()
     a_dq = torch.ops.alto.convert_from_mxfp8(
         a_lp, a_s, output_dtype=torch.float32, axis=-1, is_2d_block=False
     )
@@ -62,17 +74,33 @@ def run_case(name, k, dot_k):
     ref = a_dq @ b_dq
     diff = (out - ref).abs()
     ref_max = ref.abs().max()
+    max_diff = diff.max()
+    mean_diff = diff.mean()
+    relative_max_diff = max_diff / ref_max
+
     print(f"== {name} ==")
-    print(f"allclose={torch.allclose(out, ref)}")
-    print(f"max_diff={diff.max().item():.6f}")
-    print(f"mean_diff={diff.mean().item():.6f}")
-    print(f"relative_max_diff={(diff.max() / ref_max).item():.6f}")
+    print(f"allclose={torch.allclose(out, ref, atol=1e-4, rtol=1e-4)}")
+    print(f"max_diff={max_diff.item():.6f}")
+    print(f"mean_diff={mean_diff.item():.6f}")
+    print(f"relative_max_diff={relative_max_diff.item():.6f}")
+    return max_diff.item(), mean_diff.item(), relative_max_diff.item()
 
 
 def main():
-    run_case("baseline: K=32, DOT_K=32", 32, 32)
-    run_case("safe path: K=128, DOT_K=32", 128, 32)
-    run_case("problem path: K=128, DOT_K=128", 128, 128)
+    cases = (
+        ("baseline", 32, 32),
+        ("safe path", 128, 32),
+        ("problem path: spans 2 quant blocks", 128, 64),
+        ("problem path: spans 4 quant blocks", 128, 128),
+    )
+    mean_diffs = {}
+    for name, k, dot_k in cases:
+        _, mean_diff, _ = run_case(f"{name}: K={k}, DOT_K={dot_k}", k, dot_k)
+        mean_diffs[dot_k] = mean_diff
+
+    safe_mean_diff = mean_diffs[QUANT_BLOCK_SIZE]
+    print(f"dot_k_64_vs_safe_mean_diff_ratio={mean_diffs[64] / safe_mean_diff:.2f}x")
+    print(f"dot_k_128_vs_safe_mean_diff_ratio={mean_diffs[128] / safe_mean_diff:.2f}x")
 
 
 if __name__ == "__main__":

From 3e758a34e7f84a6db88d9c86da6d71be2bde5523 Mon Sep 17 00:00:00 2001
From: Yue Sun <Yue.Sun2@amd.com>
Date: Fri, 29 May 2026 07:25:27 +0000
Subject: [PATCH 04/12] mxfp8: scaffold grouped GEMM package (Step 1)

---
 alto/kernels/mxfp8/MXFP8_GROUPED_GEMM_PLAN.md | 213 ++++++++++++++++++
 .../mxfp8/mxfp8_grouped_gemm/__init__.py      |   6 +
 .../mxfp8/mxfp8_grouped_gemm/autotune.py      |  27 +++
 .../mxfp8/mxfp8_grouped_gemm/cg_backward.py   | 171 ++++++++++++++
 .../mxfp8/mxfp8_grouped_gemm/cg_forward.py    | 100 ++++++++
 .../mxfp8/mxfp8_grouped_gemm/functional.py    |  42 ++++
 .../mxfp8_grouped_gemm/tests/__init__.py      |   0
 7 files changed, 559 insertions(+)
 create mode 100644 alto/kernels/mxfp8/MXFP8_GROUPED_GEMM_PLAN.md
 create mode 100644 alto/kernels/mxfp8/mxfp8_grouped_gemm/__init__.py
 create mode 100644 alto/kernels/mxfp8/mxfp8_grouped_gemm/autotune.py
 create mode 100644 alto/kernels/mxfp8/mxfp8_grouped_gemm/cg_backward.py
 create mode 100644 alto/kernels/mxfp8/mxfp8_grouped_gemm/cg_forward.py
 create mode 100644 alto/kernels/mxfp8/mxfp8_grouped_gemm/functional.py
 create mode 100644 alto/kernels/mxfp8/mxfp8_grouped_gemm/tests/__init__.py

diff --git a/alto/kernels/mxfp8/MXFP8_GROUPED_GEMM_PLAN.md b/alto/kernels/mxfp8/MXFP8_GROUPED_GEMM_PLAN.md
new file mode 100644
index 0000000..e8a26f5
--- /dev/null
+++ b/alto/kernels/mxfp8/MXFP8_GROUPED_GEMM_PLAN.md
@@ -0,0 +1,213 @@
+# MXFP8 E4M3 Grouped GEMM — Minimum Viable Plan
+
+目标：在 AMD MI300 (CDNA3) / MI350 (CDNA4) 上实现能支撑 GPT-OSS MoE 训练跑起来的最小可用 mxfp8 grouped GEMM（fwd + dgrad + wgrad）。**V1 格式约定为全部 e4m3**——fwd / dgrad / wgrad 三个 pass 的所有 operand 都用 e4m3。混合格式（bwd grad_output 用 e5m2）作为 v2 的精度优化项，理由见 §0。
+
+参考实现位于 `alto/kernels/fp4/mxfp4/mxfp_grouped_gemm/`（mxfp4 三个 kernel + autograd）以及本目录下 `mxfp8_quantization.py` / `mxfp8_linear.py`（mxfp8 quant/dequant 基础设施与 blockwise GEMM 模板）。
+
+---
+
+## 0. 格式选择：V1 全 e4m3，混合格式留给 v2
+
+**V1 决定：fwd / dgrad / wgrad 全部 operand 用 e4m3。** 单一格式让 kernel 不需要 dtype 分发，autograd 不需要为 grad_output 单独走 e5m2 量化，最小版本实现与验证都最简单。代价是 grad 的数值鲁棒性（见下表方案 A），先用 toy MoE 训练验证是否够用；不够再按下面的分析升级到混合格式。
+
+以下分析说明为什么工业界最终走混合格式（**v2 方向**）。FP8 训练里 "e4m3 用在 fwd、e5m2 用在 grad" 是 NVIDIA Transformer Engine、Meta FP8 training、TorchAO MX recipe 等一致采用的工业共识。原因来自两个 dtype 的位分配差异 vs 激活/梯度的分布差异：
+
+| 格式 | exp / mantissa | 动态范围 | mantissa 相对误差 |
+|---|---|---|---|
+| **e4m3** | 4 / 3 | ~2⁻⁹ ~ 448 | ~6% |
+| **e5m2** | 5 / 2 | ~2⁻¹⁶ ~ 57344 | ~12% |
+
+**fwd 的 activation / weight**：经过 LayerNorm/GELU 后分布相对集中（典型 ±几十），动态范围需求低，**单元素精度更重要** → e4m3 更合适。
+
+**bwd 的 grad_output**：分布是长尾的——大多数值很小（~1e-5），偶尔有 spike，训练后期还会进一步衰减，**动态范围需求大**，单元素精度反而次要（反正会被 sum 平均掉） → e5m2 更合适。
+
+三种方案的实际后果：
+
+| 方案 | 问题 |
+|---|---|
+| **A. 全 e4m3（V1 本方案）** | grad 的小尾部 underflow、spike overflow，**通常几百到几千步就发散**；先用 toy MoE 验证 v1 范围内是否触发 |
+| B. 全 e5m2 | activation mantissa 只剩 2 bit，每次 GEMM 引入 ~12% 量化噪声，**深网络 loss 显著高于 bf16 baseline** |
+| C. 混合（**v2 方向**） | 各取所长，loss 曲线接近 bf16 |
+
+**升级到混合格式的成本几乎为零**（v2 时）：
+- `convert_to_mxfp8` 已支持运行时 `mxfp_format` 切换 e4m3/e5m2
+- `tl.dot_scaled(a, a_s, "e5m2", b, b_s, "e4m3", ...)` 原生支持左右 operand 不同 dtype
+- autograd 里 fwd quant X/W 为 e4m3，bwd quant GO 改 e5m2，W/X 继续复用 fwd 已量化的 e4m3 版本
+
+为了让 v2 升级无痛，API 仍暴露 `fwd_format` 与 `bwd_grad_format` 两个独立参数，但 **V1 默认两者都为 e4m3**；v2 把 `bwd_grad_format` 默认改成 e5m2 即得工业共识，kernel 本体不动。
+
+---
+
+## 1. 复用 vs 新写
+
+### 可直接复用（不动）
+- `mxfp8_quantization.py`：`convert_to_mxfp8` / `convert_from_mxfp8` / `calculate_mxfp8_scales`。一个 op 已覆盖 e4m3/e5m2、SR、1D/2D block、任意 axis。
+- mxfp4 grouped GEMM 的整体脚手架：
+  - persistent kernel + super-grouping 调度（`_compute_pid`）
+  - `indices_ptr` + `GROUP_SIZE_M` 的 contiguous 路由
+  - `USE_2DBLOCK_*` 布局开关
+  - `triton_op` 包装 + `MXFPxGroupedGEMM(autograd.Function)` 结构
+- mxfp4 autograd 里 fwd/bwd 沿不同 axis 多 quant 一份 X / grad_output 的逻辑（wgrad 需要沿 M 量化）。
+
+### 必须改写
+1. **去掉所有 K-packing**。mxfp4 一 byte 装两元素，mxfp8 一 byte 一元素。删除：
+   - `PACKED_BLOCK_SIZE_K = BLOCK_SIZE_K // 2`、`K_PACKED = K // 2`、`offs_k_pack`、`mask_k_pack`
+   - constexpr 参数 `K_PACK_B` / `K_PACK_GO` / `K_PACK_A`
+   - wrapper 中所有 `K *= 2` / `M_bufferlen *= 2` 等还原
+   - `tl.dot_scaled(..., lhs_k_pack=..., rhs_k_pack=...)` 全部去掉
+2. **`tl.dot_scaled` 的 dtype 字符串**。mxfp4 写死 `"e2m1"`；mxfp8 改写死 `"e4m3"`（V1 三个 pass 都是 e4m3 × e4m3）。仍把它参数化为两个独立 constexpr `LHS_FORMAT_ID` / `RHS_FORMAT_ID`（0=e4m3, 1=e5m2），kernel 内 `if/else` 分发，**V1 只走 e4m3×e4m3 一条分支**；e5m2 分支预留给 v2 混合格式，提前写好可省去后续改 kernel：
+   - fwd: e4m3 × e4m3
+   - dgrad: e4m3 (GO) × e4m3 (W)  ← v2 改 e5m2 (GO) × e4m3 (W)
+   - wgrad: e4m3 (GO) × e4m3 (X)  ← v2 改 e5m2 (GO) × e4m3 (X)
+3. **`BLOCK_SIZE_K` 降到 32（= QUANT_BLOCK_SIZE）**。`blockwise_mxfp8_gemm_kernel` 已注释说明：单次 `dot_scaled` 跨多个 32-wide scale group 会与 dequant-then-matmul 参考发散。mxfp4 现在默认 128（4 个 group）对 mxfp4 可接受，但 mxfp8 训练对数值更敏感，先保守取 32；后续 autotune 视精度放宽。
+4. **CDNA3 fallback 内嵌进同一个 kernel**。参考 `blockwise_mxfp8_gemm_kernel` 的 `USE_DOT_SCALED` 分支：CDNA4 走 `tl.dot_scaled`，CDNA3 走 `_dequantize_fp8` → `tl.dot(fp32)`。不再像 mxfp4 那样在 wrapper 层走两条完全独立的路径（dequant + 外部 bf16 grouped GEMM）。
+
+### 全新写
+- 3 个 Triton kernel：`_kernel_mxfp8_grouped_gemm_forward` / `_backward_dx` / `_backward_dw`
+- 3 个 `triton_op` wrapper
+- 1 个 `MXFP8GroupedGEMM(torch.autograd.Function)` + 用户入口 `mxfp8_grouped_gemm(...)`
+
+预估代码量：~600 行 Triton kernel + ~250 行 Python wrapper/autograd。
+
+---
+
+## 2. 接口契约
+
+### 张量与 scale layout
+与 `blockwise_mxfp8_gemm` 完全一致，仅多一个 expert 维：
+
+| 张量 | shape | scale shape (1D, 沿 K) | scale shape (2D) |
+|---|---|---|---|
+| inputs (X) | [M_total, K] | [M_total, K/32] | [M_total/32, K/32] |
+| expert_weights (W) | [num_experts, N, K] | [num_experts, N, K/32] | [num_experts, N/32, K/32] |
+| grad_output (GO) | [M_total, N] | [M_total, N/32] | [M_total/32, N/32] |
+| output / grad_input | [M_total, N] / [M_total, K] | — | — |
+| grad_weights | [num_experts, N, K] | — | — |
+
+约定（沿用 mxfp4 注释）："B scales are N x K even though B operand is K x N"——scale 在非 reduction 维上是 major。
+
+### 用户 API
+```python
+def mxfp8_grouped_gemm(
+    inputs: Tensor,                 # [M_total, K], bf16/fp32
+    expert_weights: Tensor,         # [num_experts, N, K]
+    expert_indices: Tensor,         # [M_total], int32, 每 GROUP_SIZE_M 同一个 expert
+    *,
+    fwd_format: str = "e4m3",       # fwd 时 X/W 的格式
+    bwd_grad_format: str = "e4m3",  # V1 默认 e4m3（v2 改 e5m2）；bwd 时 grad_output 的格式 (W/X 仍 e4m3)
+    use_2dblock_x: bool = False,
+    use_2dblock_w: bool = True,
+    use_sr_grad: bool = False,
+    trans_weights: bool = True,
+) -> Tensor:  # [M_total, N], bf16/fp32
+```
+
+### 三个 GEMM 的 contraction & scale axis
+| Pass | 计算 | reduction dim | X-side quant axis | W/GO-side quant axis |
+|---|---|---|---|---|
+| fwd | `Y = X @ W^T` | K | X: -1 (K) | W: -1 (K) |
+| dgrad | `dX = GO @ W` | N | GO: -1 (N) | W: -2 (N) |
+| wgrad | `dW = GO^T @ X` | M | GO: 0 (M) | X: 0 (M) |
+
+⇒ 训练一次迭代需要 **2 套 X 的量化**（沿 K 和沿 M）和 **2 套 GO 的量化**（沿 N 和沿 M）。W 也是 2 套（沿 K 和沿 N），但 W 可以在 optimizer step 后离线做一次。这与 mxfp4 autograd 已有逻辑相同。
+
+---
+
+## 3. 落地步骤
+
+### Step 1 — 目录与骨架 ✅ 已完成
+新建 `alto/kernels/mxfp8/mxfp8_grouped_gemm/`：
+```
+__init__.py
+autotune.py          # ALIGN_SIZE_M=128, STANDARD_CONFIGS（先单 config: BSM=128, BSN=128, BSK=32）
+cg_forward.py        # _kernel_mxfp8_grouped_gemm_forward + mxfp8_grouped_gemm_forward
+cg_backward.py       # _backward_dx / _backward_dw + 两个 triton_op + MXFP8GroupedGEMM + mxfp8_grouped_gemm
+functional.py        # 暴露顶层入口
+```
+
+**完成情况**：
+- 五个文件全部就位，`from alto.kernels.mxfp8.mxfp8_grouped_gemm import mxfp8_grouped_gemm` 可正常 import。
+- 所有 kernel body / wrapper / autograd 方法为占位（`pass` 或 `NotImplementedError`），按 Step 2-5 逐步填充。
+- `autotune.py`：`ALIGN_SIZE_M=128`，单 config `BSM=BSN=128, BSK=32`（= QUANT_BLOCK_SIZE，每次 dot_scaled 覆盖一个 scale group）。
+- dtype 参数化通道（`LHS_FORMAT_ID` / `RHS_FORMAT_ID`，0=e4m3 / 1=e5m2）已在三个 kernel 签名中预留，但**默认值全部对齐 V1 全 e4m3**（`fwd_format`/`bwd_grad_format` 默认 e4m3，wrapper `lhs_format_id`/`rhs_format_id` 默认 0）；e5m2 通道留给 v2。
+
+### Step 2 — Forward kernel
+基于 `mxfp4/cg_forward.py` 机械改写：
+1. 删除所有 packing 相关代码（见 §1 第 1 点清单）
+2. 加入 `LHS_FORMAT_ID` / `RHS_FORMAT_ID` constexpr，`USE_DOT_SCALED` constexpr
+3. K 累加循环内：CDNA4 路径 `tl.dot_scaled(a, a_s, fmt_a, b, b_s, fmt_b, acc=acc, out_dtype=fp32)`；CDNA3 路径 dequant 后 `tl.dot`
+4. wrapper：`convert_to_mxfp8(inputs, axis=-1, mxfp_format=fwd_format)`、`convert_to_mxfp8(weights, axis=quant_axis_w, mxfp_format=fwd_format)`、launch
+
+**验证**：与 `mxfp4_grouped_gemm_forward`+bf16 dequant 同样的对比方式，跟 `for e in experts: X_e @ W_e.T`（bf16 reference）比 cosine similarity / max rel error。
+
+### Step 3 — Backward dgrad kernel
+基于 `mxfp4/cg_backward.py` 的 `_kernel_mxfp4_grouped_gemm_backward_dx`：
+- 删 packing
+- dtype：V1 `LHS=e4m3, RHS=e4m3`（v2 改 `LHS=e5m2`）
+- 注意 W 的访问：dgrad 沿 N reduce，所以 W [N,K] 在 kernel 内按 N-major 加载（与 fwd 相同 shape，不同 reduction）；scale `b_s` 此时沿 N 是 reduction 维 → `stride_bsk`/`stride_bsn` 用法跟 mxfp4 一致
+
+### Step 4 — Backward wgrad kernel
+基于 `_kernel_mxfp4_grouped_gemm_backward_dw`：
+- 删 packing
+- dtype：V1 `LHS=e4m3, RHS=e4m3`（v2 改 `LHS=e5m2`）
+- M 是 reduction 维 → 必须用 **沿 M 量化** 的 GO / X（autograd 里准备好）
+- 保持 mxfp4 的 "loop over groups, skip if expert mismatch" 简单实现，性能问题留到 v2
+
+### Step 5 — Autograd Function
+参考 `MXFP4GroupedGEMM`：
+1. fwd：调 `convert_to_mxfp8` 量化 X (axis=-1) 与 W (axis=quant_axis_w)，调 fwd kernel
+2. 若 `use_2dblock_x=False`，额外 quant X 沿 axis=0 一份给 wgrad
+3. 若 `use_2dblock_w=False`，额外 quant W 沿 requant axis 一份给 dgrad
+4. `ctx.save_for_backward(...)`
+5. bwd：quant GO 沿 axis=-1（给 dgrad）与 axis=0（给 wgrad），格式用 `bwd_grad_format`（V1=e4m3），调两个 bwd kernel
+6. 跳过 mxfp4 里的 `use_dge` / `hadamard_transform` / `use_macro_block_scaling` / `clip_mode`（这些是研究 feature，最小版本不要）
+
+### Step 6 — 数值正确性测试
+新建 `mxfp8/mxfp8_grouped_gemm/tests/`：
+1. `test_forward.py`：单 expert + 多 expert，bf16 reference 对齐（rel err < ~1e-2）
+2. `test_backward.py`：finite-diff 不现实，改用「mxfp8 模拟版」reference：用 `convert_to_mxfp8` 后立刻 `convert_from_mxfp8` 回 bf16，再走 PyTorch 原生 GEMM，作为「数值等价 reference」
+3. `test_e2e_moe.py`：toy MoE layer（2 expert, K=128, N=128, M_total=256），fwd+bwd+optimizer step，看 loss 下降几步
+
+### Step 7 — MI300 fallback 验证
+仅切 `USE_DOT_SCALED=False` 路径重跑 Step 6，确保 CDNA3 上数值与 CDNA4 一致（dequant + fp32 dot 是 ground truth）。
+
+### Step 8 — 接 GPT-OSS（不在最小版本范围）
+预留接口：`mxfp8_grouped_gemm` 签名要能直接替换现有 MoE forward 中的 grouped GEMM 调用。具体集成视 GPT-OSS 训练栈 PR 时再做。
+
+---
+
+## 4. 不做的事（明确划线）
+
+为了"最小可用"，**v1 显式不做**：
+- ❌ 混合格式（bwd grad_output 用 e5m2）——V1 全 e4m3，e5m2 分支预留但不启用（见 §0）
+- ❌ DGE（dynamic gradient estimation）
+- ❌ Hadamard transform
+- ❌ Macro block scaling
+- ❌ Clip mode / static clipping
+- ❌ wgrad 的 split-K 优化（先用 mxfp4 那套 "遍历所有 group 判等" 的简单实现）
+- ❌ TMA / async copy / pipelining 调优
+- ❌ CUTLASS 路径
+- ❌ Autotune（先单 config，跑通再开）
+- ❌ FSDP/TP 集成测试
+
+这些都是 v1 跑通后的优化项。
+
+---
+
+## 5. 关键风险与对策
+
+| 风险 | 对策 |
+|---|---|
+| `tl.dot_scaled` 在 CDNA4 上 e5m2 × e4m3 混合 dtype 行为未验证 | Step 3 先单独写一个 toy 测试验证 mixed-dtype dot_scaled 输出，再嵌入 grouped GEMM |
+| `BLOCK_SIZE_K=32` 单 group 太小，K 累加 overhead 大 | 先确认正确性，再尝试 64/128 评估数值偏差是否可接受 |
+| wgrad kernel 在 expert 数多时性能差（每个 tile 扫所有 group） | v1 接受；v2 改 split-K + 按 expert grouping 调度 |
+| GPT-OSS 实际 token 路由可能不满足 GROUP_SIZE_M=128 对齐 | 上游 padding（已是 mxfp4 路径假设），不在 kernel 内处理 |
+
+---
+
+## 6. 验收标准（v1 完成定义）
+
+1. fwd / dgrad / wgrad 三个 kernel 在 CDNA4 与 CDNA3 上都能跑通
+2. 数值对齐 bf16 reference：fwd cos-sim > 0.999，bwd cos-sim > 0.995
+3. toy MoE 训练 100 steps loss 单调下降，与 bf16 baseline 同形
+4. 单元测试覆盖 1D/2D block × CDNA3/CDNA4 各组合（V1 全 e4m3；e5m2 分支留待 v2）
diff --git a/alto/kernels/mxfp8/mxfp8_grouped_gemm/__init__.py b/alto/kernels/mxfp8/mxfp8_grouped_gemm/__init__.py
new file mode 100644
index 0000000..b2bb4e6
--- /dev/null
+++ b/alto/kernels/mxfp8/mxfp8_grouped_gemm/__init__.py
@@ -0,0 +1,6 @@
+# Copyright (c) 2026 Advanced Micro Devices, Inc.
+# SPDX-License-Identifier: MIT
+
+from alto.kernels.mxfp8.mxfp8_grouped_gemm.functional import mxfp8_grouped_gemm
+
+__all__ = ["mxfp8_grouped_gemm"]
diff --git a/alto/kernels/mxfp8/mxfp8_grouped_gemm/autotune.py b/alto/kernels/mxfp8/mxfp8_grouped_gemm/autotune.py
new file mode 100644
index 0000000..aeab99d
--- /dev/null
+++ b/alto/kernels/mxfp8/mxfp8_grouped_gemm/autotune.py
@@ -0,0 +1,27 @@
+# Copyright (c) 2026 Advanced Micro Devices, Inc.
+# SPDX-License-Identifier: MIT
+"""Autotune configs for mxfp8 grouped GEMM.
+
+v1 keeps a single conservative config:
+- BLOCK_SIZE_K == QUANT_BLOCK_SIZE (=32) so each tl.dot_scaled call covers
+  exactly one mx scale group; this matches the numerical contract validated
+  by alto/kernels/mxfp8/mxfp8_linear.py.
+- BSM=BSN=128 matches mxfp4 grouped GEMM's default tile.
+Wider autotune is deferred to v2.
+"""
+
+import triton
+
+ALIGN_SIZE_M = 128  # token routing alignment; tokens routed to the same expert must form contiguous blocks of this size
+
+STANDARD_CONFIGS = [
+    triton.Config(
+        {
+            "BLOCK_SIZE_M": 128,
+            "BLOCK_SIZE_N": 128,
+            "BLOCK_SIZE_K": 32,
+        },
+        num_stages=2,
+        num_warps=4,
+    ),
+]
diff --git a/alto/kernels/mxfp8/mxfp8_grouped_gemm/cg_backward.py b/alto/kernels/mxfp8/mxfp8_grouped_gemm/cg_backward.py
new file mode 100644
index 0000000..bf7da66
--- /dev/null
+++ b/alto/kernels/mxfp8/mxfp8_grouped_gemm/cg_backward.py
@@ -0,0 +1,171 @@
+# Copyright (c) 2026 Advanced Micro Devices, Inc.
+# SPDX-License-Identifier: MIT
+"""MXFP8 contiguous grouped GEMM — backward (dgrad + wgrad) + autograd Function.
+
+Skeleton only; kernel bodies filled in Steps 3-5.
+"""
+
+import torch
+import triton
+import triton.language as tl
+from torch.library import triton_op, wrap_triton
+
+from alto.kernels.mxfp8.mxfp8_quantization import BLOCK_SIZE_DEFAULT, is_cdna4
+from alto.kernels.mxfp8.mxfp8_grouped_gemm.autotune import (
+    STANDARD_CONFIGS,
+    ALIGN_SIZE_M,
+)
+from alto.kernels.mxfp8.mxfp8_grouped_gemm.cg_forward import mxfp8_grouped_gemm_forward
+
+
+# ============ dgrad: grad_input = grad_output @ expert_weights ============
+
+@triton.autotune(
+    configs=STANDARD_CONFIGS,
+    key=[
+        "N", "K", "GROUP_SIZE_M",
+        "USE_2DBLOCK_GO", "USE_2DBLOCK_B",
+        "QUANT_BLOCK_SIZE",
+        "LHS_FORMAT_ID", "RHS_FORMAT_ID", "USE_DOT_SCALED",
+    ],
+)
+@triton.jit
+def _kernel_mxfp8_grouped_gemm_backward_dx(
+    grad_output_ptr,  # [M_TOTAL, N], fp8 (V1: e4m3; v2: e5m2)
+    b_ptr,            # [num_experts, N, K], fp8 (e4m3)
+    grad_input_ptr,   # [M_TOTAL, K], output dtype
+    indices_ptr,
+    go_s_ptr,
+    b_s_ptr,
+    stride_gom, stride_gon,
+    stride_be, stride_bn, stride_bk,
+    stride_gim, stride_gik,
+    stride_gosm, stride_gosn,
+    stride_bse, stride_bsn, stride_bsk,
+    M_TOTAL,
+    N: tl.constexpr,
+    K: tl.constexpr,
+    NUM_EXPERTS: tl.constexpr,
+    BLOCK_SIZE_M: tl.constexpr,
+    BLOCK_SIZE_N: tl.constexpr,
+    BLOCK_SIZE_K: tl.constexpr,
+    USE_2DBLOCK_GO: tl.constexpr,
+    USE_2DBLOCK_B: tl.constexpr,
+    QUANT_BLOCK_SIZE: tl.constexpr,
+    LHS_FORMAT_ID: tl.constexpr,  # V1: 0 (e4m3); v2: 1 (e5m2) for grad_output
+    RHS_FORMAT_ID: tl.constexpr,  # 0 (e4m3)
+    USE_DOT_SCALED: tl.constexpr,
+    GROUP_SIZE_M: tl.constexpr = ALIGN_SIZE_M,
+):
+    """dgrad kernel: dX = GO @ W (N is reduction dim). To be implemented in Step 3."""
+    # TODO(Step 3): port from mxfp4 cg_backward._kernel_mxfp4_grouped_gemm_backward_dx
+    pass
+
+
+# ============ wgrad: grad_weights = grad_output^T @ inputs (per expert) ============
+
+@triton.autotune(
+    configs=STANDARD_CONFIGS,
+    key=[
+        "N", "K", "NUM_EXPERTS", "GROUP_SIZE_M",
+        "USE_2DBLOCK_GO", "USE_2DBLOCK_A",
+        "QUANT_BLOCK_SIZE",
+        "LHS_FORMAT_ID", "RHS_FORMAT_ID", "USE_DOT_SCALED",
+    ],
+)
+@triton.jit
+def _kernel_mxfp8_grouped_gemm_backward_dw(
+    grad_output_ptr,   # [M_TOTAL, N], fp8 (V1: e4m3; v2: e5m2), quantized along M-axis
+    inputs_ptr,        # [M_TOTAL, K], fp8 (e4m3), quantized along M-axis
+    grad_weights_ptr,  # [num_experts, N, K], output dtype
+    indices_ptr,
+    go_s_ptr,
+    a_s_ptr,
+    stride_gom, stride_gon,
+    stride_am, stride_ak,
+    stride_gbe, stride_gbn, stride_gbk,
+    stride_gosm, stride_gosn,
+    stride_asm, stride_ask,
+    M_TOTAL,
+    N: tl.constexpr,
+    K: tl.constexpr,
+    NUM_EXPERTS: tl.constexpr,
+    GROUP_SIZE_M: tl.constexpr,
+    BLOCK_SIZE_N: tl.constexpr,
+    BLOCK_SIZE_K: tl.constexpr,
+    BLOCK_SIZE_M: tl.constexpr,
+    USE_2DBLOCK_GO: tl.constexpr,
+    USE_2DBLOCK_A: tl.constexpr,
+    QUANT_BLOCK_SIZE: tl.constexpr,
+    LHS_FORMAT_ID: tl.constexpr,  # V1: 0 (e4m3); v2: 1 (e5m2) for grad_output
+    RHS_FORMAT_ID: tl.constexpr,  # 0 (e4m3)
+    USE_DOT_SCALED: tl.constexpr,
+):
+    """wgrad kernel: dW = GO^T @ X (M is reduction dim). To be implemented in Step 4."""
+    # TODO(Step 4): port from mxfp4 cg_backward._kernel_mxfp4_grouped_gemm_backward_dw
+    pass
+
+
+# =============== triton_op wrappers ===============
+
+@triton_op("alto::mxfp8_grouped_gemm_backward_inputs", mutates_args={})
+def mxfp8_grouped_gemm_backward_inputs(
+    grad_output: torch.Tensor,
+    expert_weights: torch.Tensor,
+    expert_indices: torch.Tensor,
+    go_scales: torch.Tensor,
+    expert_weight_scales: torch.Tensor,
+    trans_weights: bool = True,
+    use_2dblock_x: bool = False,
+    use_2dblock_w: bool = True,
+    lhs_format_id: int = 0,  # V1: e4m3 (v2: 1=e5m2 for grad_output)
+    rhs_format_id: int = 0,  # e4m3
+    output_dtype: torch.dtype = torch.bfloat16,
+) -> torch.Tensor:
+    """dgrad wrapper. To be implemented in Step 3."""
+    raise NotImplementedError("Step 3: implement dgrad wrapper")
+
+
+@triton_op("alto::mxfp8_grouped_gemm_backward_weights", mutates_args={})
+def mxfp8_grouped_gemm_backward_weights(
+    grad_output: torch.Tensor,
+    inputs: torch.Tensor,
+    expert_indices: torch.Tensor,
+    num_experts: int,
+    go_scales: torch.Tensor,
+    input_scales: torch.Tensor,
+    trans_weights: bool = True,
+    use_2dblock_go: bool = False,
+    use_2dblock_x: bool = False,
+    lhs_format_id: int = 0,  # V1: e4m3 (v2: 1=e5m2 for grad_output)
+    rhs_format_id: int = 0,  # e4m3
+    output_dtype: torch.dtype = torch.bfloat16,
+) -> torch.Tensor:
+    """wgrad wrapper. To be implemented in Step 4."""
+    raise NotImplementedError("Step 4: implement wgrad wrapper")
+
+
+# =============== Autograd Function ===============
+
+@torch.compiler.allow_in_graph
+class MXFP8GroupedGEMM(torch.autograd.Function):
+    """Autograd Function for mxfp8 grouped GEMM. To be implemented in Step 5."""
+
+    @staticmethod
+    def forward(
+        ctx,
+        inputs,
+        expert_weights,
+        expert_indices,
+        trans_weights=True,
+        use_2dblock_x=False,
+        use_2dblock_w=True,
+        use_sr_grad=False,
+        fwd_format="e4m3",
+        bwd_grad_format="e4m3",
+    ):
+        raise NotImplementedError("Step 5: implement autograd forward")
+
+    @staticmethod
+    def backward(ctx, grad_output):
+        raise NotImplementedError("Step 5: implement autograd backward")
diff --git a/alto/kernels/mxfp8/mxfp8_grouped_gemm/cg_forward.py b/alto/kernels/mxfp8/mxfp8_grouped_gemm/cg_forward.py
new file mode 100644
index 0000000..e5e3e1f
--- /dev/null
+++ b/alto/kernels/mxfp8/mxfp8_grouped_gemm/cg_forward.py
@@ -0,0 +1,100 @@
+# Copyright (c) 2026 Advanced Micro Devices, Inc.
+# SPDX-License-Identifier: MIT
+"""MXFP8 contiguous grouped GEMM — forward pass.
+
+Skeleton only; kernel body filled in Step 2.
+"""
+
+import torch
+from torch.library import triton_op, wrap_triton
+import triton
+import triton.language as tl
+
+from alto.kernels.mxfp8.mxfp8_quantization import BLOCK_SIZE_DEFAULT
+from alto.kernels.mxfp8.mxfp8_grouped_gemm.autotune import (
+    STANDARD_CONFIGS,
+    ALIGN_SIZE_M,
+)
+
+
+@triton.jit
+def _compute_pid(tile_id, num_pid_in_group, num_pid_m, super_group_m):
+    group_id = tile_id // num_pid_in_group
+    first_pid_m = group_id * super_group_m
+    group_size_m = min(num_pid_m - first_pid_m, super_group_m)
+    pid_m = first_pid_m + (tile_id % group_size_m)
+    pid_n = (tile_id % num_pid_in_group) // group_size_m
+    return pid_m, pid_n
+
+
+@triton.autotune(
+    configs=STANDARD_CONFIGS,
+    key=[
+        "N",
+        "K",
+        "GROUP_SIZE_M",
+        "USE_2DBLOCK_A",
+        "USE_2DBLOCK_B",
+        "QUANT_BLOCK_SIZE",
+        "LHS_FORMAT_ID",
+        "RHS_FORMAT_ID",
+        "USE_DOT_SCALED",
+    ],
+)
+@triton.jit
+def _kernel_mxfp8_grouped_gemm_forward(
+    # Pointers to matrices
+    a_ptr,        # [M_total, K], fp8 (e4m3)
+    b_ptr,        # [num_experts, N, K], fp8 (e4m3)
+    c_ptr,        # [M_total, N], output dtype
+    indices_ptr,  # [M_total], int32
+    a_s_ptr,      # A scales, uint8 E8M0
+    b_s_ptr,      # B scales, uint8 E8M0
+    # Matrix strides
+    stride_am, stride_ak,
+    stride_be, stride_bn, stride_bk,
+    stride_cm, stride_cn,
+    stride_asm, stride_ask,
+    stride_bse, stride_bsn, stride_bsk,
+    # Matrix dimensions
+    M_TOTAL,
+    N: tl.constexpr,
+    K: tl.constexpr,
+    NUM_EXPERTS: tl.constexpr,
+    # Tile params
+    BLOCK_SIZE_M: tl.constexpr,
+    BLOCK_SIZE_N: tl.constexpr,
+    BLOCK_SIZE_K: tl.constexpr,
+    NUM_SMS: tl.constexpr,
+    USE_2DBLOCK_A: tl.constexpr,
+    USE_2DBLOCK_B: tl.constexpr,
+    QUANT_BLOCK_SIZE: tl.constexpr,
+    LHS_FORMAT_ID: tl.constexpr,  # 0=e4m3, 1=e5m2
+    RHS_FORMAT_ID: tl.constexpr,
+    USE_DOT_SCALED: tl.constexpr,
+    GROUP_SIZE_M: tl.constexpr = ALIGN_SIZE_M,
+    SUPER_GROUP_M: tl.constexpr = 32,
+):
+    """Forward grouped GEMM kernel: Y = X @ W^T per expert. To be implemented in Step 2."""
+    # TODO(Step 2): port from mxfp4 cg_forward._kernel_mxfp4_grouped_gemm_forward,
+    # remove all packing, parameterize dot_scaled dtype strings via LHS/RHS_FORMAT_ID,
+    # add CDNA3 fallback via _dequantize_fp8 + tl.dot when USE_DOT_SCALED=False.
+    pass
+
+
+@triton_op("alto::mxfp8_grouped_gemm_forward", mutates_args={})
+def mxfp8_grouped_gemm_forward(
+    inputs: torch.Tensor,            # [M_total, K], fp8
+    expert_weights: torch.Tensor,    # [num_experts, N, K], fp8
+    expert_indices: torch.Tensor,    # [M_total], int32
+    input_scales: torch.Tensor,
+    weight_scales: torch.Tensor,
+    trans_weights: bool = True,
+    use_2dblock_x: bool = False,
+    use_2dblock_w: bool = True,
+    lhs_format_id: int = 0,          # 0=e4m3, 1=e5m2
+    rhs_format_id: int = 0,
+    output_dtype: torch.dtype = torch.bfloat16,
+) -> torch.Tensor:
+    """Wrapper: launches forward kernel. To be implemented in Step 2."""
+    raise NotImplementedError("Step 2: implement forward wrapper")
diff --git a/alto/kernels/mxfp8/mxfp8_grouped_gemm/functional.py b/alto/kernels/mxfp8/mxfp8_grouped_gemm/functional.py
new file mode 100644
index 0000000..06b4802
--- /dev/null
+++ b/alto/kernels/mxfp8/mxfp8_grouped_gemm/functional.py
@@ -0,0 +1,42 @@
+# Copyright (c) 2026 Advanced Micro Devices, Inc.
+# SPDX-License-Identifier: MIT
+"""Top-level user-facing entry point for mxfp8 grouped GEMM."""
+
+import torch
+
+from alto.kernels.mxfp8.mxfp8_grouped_gemm.cg_backward import MXFP8GroupedGEMM
+
+
+def mxfp8_grouped_gemm(
+    inputs: torch.Tensor,            # [M_total, K], bf16/fp32
+    expert_weights: torch.Tensor,    # [num_experts, N, K] if trans_weights else [num_experts, K, N]
+    expert_indices: torch.Tensor,    # [M_total], int (auto-cast to int32)
+    *,
+    trans_weights: bool = True,
+    use_2dblock_x: bool = False,
+    use_2dblock_w: bool = True,
+    use_sr_grad: bool = False,
+    fwd_format: str = "e4m3",
+    bwd_grad_format: str = "e4m3",
+) -> torch.Tensor:
+    """MXFP8 contiguous grouped GEMM with full backward.
+
+    V1 uses e4m3 for all operands across fwd/dgrad/wgrad. The mixed format
+    (bwd grad_output in e5m2) is reserved for v2 — flip bwd_grad_format to
+    "e5m2" once the e5m2 dot_scaled branch is enabled. See
+    MXFP8_GROUPED_GEMM_PLAN.md §0 for the rationale.
+    """
+    if expert_indices.dtype != torch.int32:
+        expert_indices = expert_indices.to(torch.int32)
+
+    return MXFP8GroupedGEMM.apply(
+        inputs,
+        expert_weights,
+        expert_indices,
+        trans_weights,
+        use_2dblock_x,
+        use_2dblock_w,
+        use_sr_grad,
+        fwd_format,
+        bwd_grad_format,
+    )
diff --git a/alto/kernels/mxfp8/mxfp8_grouped_gemm/tests/__init__.py b/alto/kernels/mxfp8/mxfp8_grouped_gemm/tests/__init__.py
new file mode 100644
index 0000000..e69de29

From fa05b999215e895b45eaecfe289097946b346d82 Mon Sep 17 00:00:00 2001
From: Yue Sun <Yue.Sun2@amd.com>
Date: Mon, 1 Jun 2026 08:02:09 +0000
Subject: [PATCH 05/12] mxfp8: implement forward grouped GEMM kernel (Step 2)

---
 alto/kernels/mxfp8/MXFP8_GROUPED_GEMM_PLAN.md |   7 +-
 .../mxfp8/mxfp8_grouped_gemm/cg_forward.py    | 222 +++++++++++++++++-
 .../mxfp8/test_mxfp8_grouped_gemm_forward.py  |  83 +++++++
 3 files changed, 299 insertions(+), 13 deletions(-)
 create mode 100644 tests/unittest/mxfp8/test_mxfp8_grouped_gemm_forward.py

diff --git a/alto/kernels/mxfp8/MXFP8_GROUPED_GEMM_PLAN.md b/alto/kernels/mxfp8/MXFP8_GROUPED_GEMM_PLAN.md
index e8a26f5..2c843fb 100644
--- a/alto/kernels/mxfp8/MXFP8_GROUPED_GEMM_PLAN.md
+++ b/alto/kernels/mxfp8/MXFP8_GROUPED_GEMM_PLAN.md
@@ -131,7 +131,7 @@ functional.py        # 暴露顶层入口
 - `autotune.py`：`ALIGN_SIZE_M=128`，单 config `BSM=BSN=128, BSK=32`（= QUANT_BLOCK_SIZE，每次 dot_scaled 覆盖一个 scale group）。
 - dtype 参数化通道（`LHS_FORMAT_ID` / `RHS_FORMAT_ID`，0=e4m3 / 1=e5m2）已在三个 kernel 签名中预留，但**默认值全部对齐 V1 全 e4m3**（`fwd_format`/`bwd_grad_format` 默认 e4m3，wrapper `lhs_format_id`/`rhs_format_id` 默认 0）；e5m2 通道留给 v2。
 
-### Step 2 — Forward kernel
+### Step 2 — Forward kernel ✅ 已完成
 基于 `mxfp4/cg_forward.py` 机械改写：
 1. 删除所有 packing 相关代码（见 §1 第 1 点清单）
 2. 加入 `LHS_FORMAT_ID` / `RHS_FORMAT_ID` constexpr，`USE_DOT_SCALED` constexpr
@@ -140,6 +140,11 @@ functional.py        # 暴露顶层入口
 
 **验证**：与 `mxfp4_grouped_gemm_forward`+bf16 dequant 同样的对比方式，跟 `for e in experts: X_e @ W_e.T`（bf16 reference）比 cosine similarity / max rel error。
 
+**完成情况**：
+- `cg_forward.py` kernel body + wrapper 已实现。fp8 load 用 `other=0.0`；CDNA3 dequant 路径 `_dequantize_fp8` 一律传 `IS_2D_BLOCK=False`（scale 偏移已在 kernel 内展开为逐行 `[BLOCK, n_rep_k]`，与参考 `blockwise_mxfp8_gemm_kernel` 一致）。
+- 测试 `tests/unittest/mxfp8/test_mxfp8_grouped_gemm_forward.py`：2 shapes × 2D-block-x × 2D-block-w 共 8 例全过。主校验用 **dequant-then-matmul reference**（隔离 kernel 移植正确性 vs mxfp8 量化误差），cos-sim > 0.999；另加 bf16 宽松 sanity（> 0.99）。
+- ⚠️ 当前硬件 MI300X = **CDNA3**，`is_cdna4()=False`，仅验证了 `USE_DOT_SCALED=False` 的 dequant+dot 路径；`tl.dot_scaled`（CDNA4）分支已写但需在 MI350 上验证（见 Step 7）。
+
 ### Step 3 — Backward dgrad kernel
 基于 `mxfp4/cg_backward.py` 的 `_kernel_mxfp4_grouped_gemm_backward_dx`：
 - 删 packing
diff --git a/alto/kernels/mxfp8/mxfp8_grouped_gemm/cg_forward.py b/alto/kernels/mxfp8/mxfp8_grouped_gemm/cg_forward.py
index e5e3e1f..d80812c 100644
--- a/alto/kernels/mxfp8/mxfp8_grouped_gemm/cg_forward.py
+++ b/alto/kernels/mxfp8/mxfp8_grouped_gemm/cg_forward.py
@@ -1,16 +1,19 @@
 # Copyright (c) 2026 Advanced Micro Devices, Inc.
 # SPDX-License-Identifier: MIT
-"""MXFP8 contiguous grouped GEMM — forward pass.
+"""MXFP8 contiguous grouped GEMM — forward pass."""
 
-Skeleton only; kernel body filled in Step 2.
-"""
+from typing import Optional
 
 import torch
 from torch.library import triton_op, wrap_triton
 import triton
 import triton.language as tl
 
-from alto.kernels.mxfp8.mxfp8_quantization import BLOCK_SIZE_DEFAULT
+from alto.kernels.mxfp8.mxfp8_quantization import (
+    BLOCK_SIZE_DEFAULT,
+    _dequantize_fp8,
+    is_cdna4,
+)
 from alto.kernels.mxfp8.mxfp8_grouped_gemm.autotune import (
     STANDARD_CONFIGS,
     ALIGN_SIZE_M,
@@ -75,17 +78,149 @@ def _kernel_mxfp8_grouped_gemm_forward(
     GROUP_SIZE_M: tl.constexpr = ALIGN_SIZE_M,
     SUPER_GROUP_M: tl.constexpr = 32,
 ):
-    """Forward grouped GEMM kernel: Y = X @ W^T per expert. To be implemented in Step 2."""
-    # TODO(Step 2): port from mxfp4 cg_forward._kernel_mxfp4_grouped_gemm_forward,
-    # remove all packing, parameterize dot_scaled dtype strings via LHS/RHS_FORMAT_ID,
-    # add CDNA3 fallback via _dequantize_fp8 + tl.dot when USE_DOT_SCALED=False.
-    pass
+    """Forward grouped GEMM kernel: Y = X @ W^T per expert.
+
+    A = X [M_TOTAL, K] (e4m3), B = W [num_experts, N, K] (e4m3); reduction over K.
+    IMPORTANT: assumes GROUP_SIZE_M is a multiple of BLOCK_SIZE_M (or vice versa)
+    and inputs are pre-aligned to block boundaries.
+    """
+    if USE_2DBLOCK_A:
+        tl.assume(BLOCK_SIZE_M % QUANT_BLOCK_SIZE == 0)
+    if USE_2DBLOCK_B:
+        tl.assume(BLOCK_SIZE_N % QUANT_BLOCK_SIZE == 0)
+    tl.assume(BLOCK_SIZE_K % QUANT_BLOCK_SIZE == 0)
+
+    c_type = c_ptr.dtype.element_ty
+    start_pid = tl.program_id(axis=0)
+    num_pid_m = tl.cdiv(M_TOTAL, BLOCK_SIZE_M)
+    num_pid_n = tl.cdiv(N, BLOCK_SIZE_N)
+    k_tiles = tl.cdiv(K, BLOCK_SIZE_K)
+    num_tiles = num_pid_m * num_pid_n
+    tile_id_c = start_pid - NUM_SMS
+    num_pid_in_group = SUPER_GROUP_M * num_pid_n
+
+    # number of MXFP scale groups inside a K tile
+    n_rep_k: tl.constexpr = BLOCK_SIZE_K // QUANT_BLOCK_SIZE
+    # total number of scales along K
+    Ks: tl.constexpr = K // QUANT_BLOCK_SIZE
+
+    for tile_id in tl.range(start_pid, num_tiles, NUM_SMS):
+
+        tile_m_idx, tile_n_idx = _compute_pid(tile_id, num_pid_in_group, num_pid_m, SUPER_GROUP_M)
+
+        m_start = tile_m_idx * BLOCK_SIZE_M
+        n_start = tile_n_idx * BLOCK_SIZE_N
+
+        if m_start < M_TOTAL:
+
+            offs_m = m_start + tl.arange(0, BLOCK_SIZE_M)
+            offs_n = n_start + tl.arange(0, BLOCK_SIZE_N)
+
+            if USE_2DBLOCK_A:
+                offs_m_scale = offs_m // QUANT_BLOCK_SIZE
+            else:
+                offs_m_scale = offs_m
+            if USE_2DBLOCK_B:
+                offs_n_scale = offs_n // QUANT_BLOCK_SIZE
+            else:
+                offs_n_scale = offs_n
+
+            accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
+            for ki in range(k_tiles):
+
+                offs_k = ki * BLOCK_SIZE_K + tl.arange(0, BLOCK_SIZE_K)
+                offs_k_scale = ki * n_rep_k + tl.arange(0, n_rep_k)
+                mask_k_scale = offs_k_scale < Ks
+
+                mask_m = offs_m < M_TOTAL
+                mask_n = offs_n < N
+                mask_k = offs_k < K
+
+                mask_a = mask_m[:, None] & mask_k[None, :]
+                mask_b = mask_k[:, None] & mask_n[None, :]
+
+                # Determine the expert group index and load expert ID
+                group_idx = m_start // GROUP_SIZE_M
+                expert_idx = tl.load(indices_ptr + group_idx * GROUP_SIZE_M)
+
+                # Load inputs A [BLOCK_M, BLOCK_K]
+                a_ptrs = a_ptr + offs_m[:, None] * stride_am + offs_k[None, :] * stride_ak
+                a = tl.load(a_ptrs, mask=mask_a, other=0.0)
+
+                # Load expert weights B [BLOCK_K, BLOCK_N] for this block's expert
+                b_ptrs = (b_ptr + expert_idx * stride_be + offs_n[None, :] * stride_bn +
+                          offs_k[:, None] * stride_bk)
+                b = tl.load(b_ptrs, mask=mask_b, other=0.0)
+
+                a_s_ptrs = a_s_ptr + offs_m_scale[:, None] * stride_asm + offs_k_scale[None, :] * stride_ask
+                # B scales are N x K even though B operand is K x N.
+                b_s_ptrs = (b_s_ptr + expert_idx * stride_bse + offs_n_scale[:, None] * stride_bsn +
+                            offs_k_scale[None, :] * stride_bsk)
+
+                a_s = tl.load(a_s_ptrs, mask=mask_m[:, None] & mask_k_scale[None, :], other=1)
+                b_s = tl.load(b_s_ptrs, mask=mask_n[:, None] & mask_k_scale[None, :], other=1)
+
+                if USE_DOT_SCALED:
+                    # CDNA4 path: native scaled dot. dtype strings chosen by format id.
+                    if LHS_FORMAT_ID == 0:
+                        if RHS_FORMAT_ID == 0:
+                            accumulator = tl.dot_scaled(a, a_s, "e4m3", b, b_s, "e4m3",
+                                                        acc=accumulator, out_dtype=tl.float32)
+                        else:
+                            accumulator = tl.dot_scaled(a, a_s, "e4m3", b, b_s, "e5m2",
+                                                        acc=accumulator, out_dtype=tl.float32)
+                    else:
+                        if RHS_FORMAT_ID == 0:
+                            accumulator = tl.dot_scaled(a, a_s, "e5m2", b, b_s, "e4m3",
+                                                        acc=accumulator, out_dtype=tl.float32)
+                        else:
+                            accumulator = tl.dot_scaled(a, a_s, "e5m2", b, b_s, "e5m2",
+                                                        acc=accumulator, out_dtype=tl.float32)
+                else:
+                    # CDNA3 fallback: dequantize to fp32 then plain dot.
+                    # Scale offsets above already expand to per-row scales of shape
+                    # [BLOCK, n_rep_k], so dequant treats them as 1D (IS_2D_BLOCK=False)
+                    # regardless of how the scales were originally produced.
+                    a_dq = _dequantize_fp8(
+                        a, a_s,
+                        output_dtype=tl.float32,
+                        BLOCK_M=BLOCK_SIZE_M,
+                        BLOCK_N=BLOCK_SIZE_K,
+                        QUANT_BLOCK_SIZE=QUANT_BLOCK_SIZE,
+                        FP8_FORMAT=LHS_FORMAT_ID,
+                        IS_2D_BLOCK=False,
+                        USE_ASM=False,
+                    )
+                    b_dq = tl.trans(_dequantize_fp8(
+                        tl.trans(b), b_s,
+                        output_dtype=tl.float32,
+                        BLOCK_M=BLOCK_SIZE_N,
+                        BLOCK_N=BLOCK_SIZE_K,
+                        QUANT_BLOCK_SIZE=QUANT_BLOCK_SIZE,
+                        FP8_FORMAT=RHS_FORMAT_ID,
+                        IS_2D_BLOCK=False,
+                        USE_ASM=False,
+                    ))
+                    accumulator = tl.dot(a_dq, b_dq, acc=accumulator, out_dtype=tl.float32)
+
+            tile_id_c += NUM_SMS
+            tile_m_idx, tile_n_idx = _compute_pid(tile_id_c, num_pid_in_group, num_pid_m, SUPER_GROUP_M)
+
+            offs_m = tile_m_idx * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
+            offs_n = tile_n_idx * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
+
+            mask_m = offs_m < M_TOTAL
+            mask_n = offs_n < N
+            mask_c = mask_m[:, None] & mask_n[None, :]
+
+            c_ptrs = c_ptr + offs_m[:, None] * stride_cm + offs_n[None, :] * stride_cn
+            tl.store(c_ptrs, accumulator.to(c_type), mask=mask_c)
 
 
 @triton_op("alto::mxfp8_grouped_gemm_forward", mutates_args={})
 def mxfp8_grouped_gemm_forward(
     inputs: torch.Tensor,            # [M_total, K], fp8
-    expert_weights: torch.Tensor,    # [num_experts, N, K], fp8
+    expert_weights: torch.Tensor,    # [num_experts, N, K] (trans) or [num_experts, K, N], fp8
     expert_indices: torch.Tensor,    # [M_total], int32
     input_scales: torch.Tensor,
     weight_scales: torch.Tensor,
@@ -95,6 +230,69 @@ def mxfp8_grouped_gemm_forward(
     lhs_format_id: int = 0,          # 0=e4m3, 1=e5m2
     rhs_format_id: int = 0,
     output_dtype: torch.dtype = torch.bfloat16,
+    use_dot_scaled: Optional[bool] = None,
 ) -> torch.Tensor:
-    """Wrapper: launches forward kernel. To be implemented in Step 2."""
-    raise NotImplementedError("Step 2: implement forward wrapper")
+    """Launch the mxfp8 forward grouped GEMM kernel on pre-quantized operands.
+
+    Y[m, :] = X[m] @ W[expert(m)]^T, computed per contiguous group of tokens.
+    """
+    assert expert_indices.is_contiguous(), "Expert indices tensor must be contiguous"
+    if use_dot_scaled is None:
+        use_dot_scaled = is_cdna4()
+
+    M_total, K = inputs.shape
+    torch._check(M_total > 0)
+    assert M_total % ALIGN_SIZE_M == 0, \
+        f"M_total ({M_total}) must be a multiple of group_size_m ({ALIGN_SIZE_M})"
+
+    if expert_indices.dtype != torch.int32:
+        expert_indices = expert_indices.to(torch.int32)
+
+    if trans_weights:
+        num_experts, N, K_weights = expert_weights.shape
+        stride_be, stride_bn, stride_bk = expert_weights.stride()
+        stride_bse, stride_bsn, stride_bsk = weight_scales.stride()
+    else:
+        num_experts, K_weights, N = expert_weights.shape
+        stride_be, stride_bk, stride_bn = expert_weights.stride()
+        stride_bse, stride_bsk, stride_bsn = weight_scales.stride()
+
+    assert K == K_weights, f"Input K ({K}) must match weight K ({K_weights})"
+
+    output = torch.zeros((M_total, N), device=inputs.device, dtype=output_dtype)
+
+    stride_am, stride_ak = inputs.stride()
+    stride_asm, stride_ask = input_scales.stride()
+    stride_cm, stride_cn = output.stride()
+
+    NUM_SMS = torch.cuda.get_device_properties("cuda").multi_processor_count
+    grid = (NUM_SMS, 1, 1)
+
+    wrap_triton(_kernel_mxfp8_grouped_gemm_forward)[grid](
+        inputs,
+        expert_weights,
+        output,
+        expert_indices,
+        input_scales,
+        weight_scales,
+        stride_am, stride_ak,
+        stride_be, stride_bn, stride_bk,
+        stride_cm, stride_cn,
+        stride_asm, stride_ask,
+        stride_bse, stride_bsn, stride_bsk,
+        M_TOTAL=M_total,
+        N=N,
+        K=K,
+        NUM_EXPERTS=num_experts,
+        NUM_SMS=NUM_SMS,
+        GROUP_SIZE_M=ALIGN_SIZE_M,
+        SUPER_GROUP_M=32,
+        USE_2DBLOCK_A=use_2dblock_x,
+        USE_2DBLOCK_B=use_2dblock_w,
+        QUANT_BLOCK_SIZE=BLOCK_SIZE_DEFAULT,
+        LHS_FORMAT_ID=lhs_format_id,
+        RHS_FORMAT_ID=rhs_format_id,
+        USE_DOT_SCALED=use_dot_scaled,
+    )
+
+    return output
diff --git a/tests/unittest/mxfp8/test_mxfp8_grouped_gemm_forward.py b/tests/unittest/mxfp8/test_mxfp8_grouped_gemm_forward.py
new file mode 100644
index 0000000..4458700
--- /dev/null
+++ b/tests/unittest/mxfp8/test_mxfp8_grouped_gemm_forward.py
@@ -0,0 +1,83 @@
+# Copyright (c) 2026 Advanced Micro Devices, Inc.
+#
+# SPDX-License-Identifier: MIT
+
+import pytest
+import torch
+
+from alto.kernels.mxfp8.mxfp8_quantization import BLOCK_SIZE_DEFAULT, is_cdna4
+from alto.kernels.mxfp8.mxfp8_grouped_gemm.autotune import ALIGN_SIZE_M
+from alto.kernels.mxfp8.mxfp8_grouped_gemm.cg_forward import mxfp8_grouped_gemm_forward
+
+from .utils import prepare_data, convert_from_mxfp8_pytorch
+
+
+def _cossim(x, y):
+    x = x.flatten().to(torch.float32)
+    y = y.flatten().to(torch.float32)
+    return torch.nn.functional.cosine_similarity(x, y, dim=0).item()
+
+
+def _make_indices(num_groups, num_experts, device):
+    indices = torch.zeros(num_groups * ALIGN_SIZE_M, dtype=torch.int32, device=device)
+    for g in range(num_groups):
+        e = torch.randint(0, num_experts, (1,), device=device, dtype=torch.int32).item()
+        indices[g * ALIGN_SIZE_M:(g + 1) * ALIGN_SIZE_M] = e
+    return indices
+
+
+def _reference(inputs, expert_weights, indices, num_groups, trans_weights):
+    M_total, N = inputs.shape[0], (expert_weights.shape[1] if trans_weights else expert_weights.shape[2])
+    out = torch.zeros((M_total, N), dtype=inputs.dtype, device=inputs.device)
+    for g in range(num_groups):
+        s, e = g * ALIGN_SIZE_M, (g + 1) * ALIGN_SIZE_M
+        w = expert_weights[indices[s].item()]
+        w = w.t() if trans_weights else w
+        out[s:e] = inputs[s:e] @ w
+    return out
+
+
+@pytest.mark.parametrize("shape", [(256, 128, 128, 1), (512, 256, 256, 4)])
+@pytest.mark.parametrize("use_2dblock_x", [False, True])
+@pytest.mark.parametrize("use_2dblock_w", [False, True])
+def test_forward(shape, use_2dblock_x, use_2dblock_w):
+    M_total, N, K, num_experts = shape
+    M_total = (M_total // ALIGN_SIZE_M) * ALIGN_SIZE_M
+    num_groups = M_total // ALIGN_SIZE_M
+    device = torch.device("cuda")
+    data_type = torch.bfloat16
+    trans_weights = True
+
+    inputs = prepare_data((M_total, K), data_type)
+    expert_weights = prepare_data((num_experts, N, K), data_type)
+    indices = _make_indices(num_groups, num_experts, device)
+
+    x_lp, x_s = torch.ops.alto.convert_to_mxfp8(
+        inputs, block_size=BLOCK_SIZE_DEFAULT, mxfp_format="e4m3", axis=-1, is_2d_block=use_2dblock_x)
+    w_lp, w_s = torch.ops.alto.convert_to_mxfp8(
+        expert_weights, block_size=BLOCK_SIZE_DEFAULT, mxfp_format="e4m3", axis=-1, is_2d_block=use_2dblock_w)
+
+    out = mxfp8_grouped_gemm_forward(
+        x_lp, w_lp, indices, x_s, w_s,
+        trans_weights=trans_weights,
+        use_2dblock_x=use_2dblock_x,
+        use_2dblock_w=use_2dblock_w,
+        lhs_format_id=0,
+        rhs_format_id=0,
+        output_dtype=data_type,
+    )
+
+    # Primary correctness gate: dequantize the exact fp8 operands the kernel
+    # consumed, matmul in PyTorch. This isolates kernel-port correctness from
+    # mxfp8-vs-bf16 quantization error.
+    x_dq = convert_from_mxfp8_pytorch(x_lp, x_s, torch.float32, BLOCK_SIZE_DEFAULT, -1, use_2dblock_x)
+    w_dq = convert_from_mxfp8_pytorch(w_lp, w_s, torch.float32, BLOCK_SIZE_DEFAULT, -1, use_2dblock_w)
+    ref_dq = _reference(x_dq, w_dq, indices, num_groups, trans_weights)
+    cos_dq = _cossim(out, ref_dq)
+    assert cos_dq > 0.999, \
+        f"kernel vs dequant-matmul cos-sim too low: {cos_dq} (shape={shape}, 2dx={use_2dblock_x}, 2dw={use_2dblock_w})"
+
+    # Looser sanity check against the full-precision bf16 path.
+    ref_bf16 = _reference(inputs, expert_weights, indices, num_groups, trans_weights)
+    cos_bf16 = _cossim(out, ref_bf16)
+    assert cos_bf16 > 0.99, f"kernel vs bf16 cos-sim too low: {cos_bf16}"

From 128b5262b1d7b5a4955bcb15bbfcab7c614833ee Mon Sep 17 00:00:00 2001
From: Yue Sun <Yue.Sun2@amd.com>
Date: Tue, 2 Jun 2026 04:37:44 -0500
Subject: [PATCH 06/12] Add MXFP8 grouped GEMM forward guards and coverage

---
 alto/kernels/mxfp8/MXFP8_GROUPED_GEMM_PLAN.md |  5 ++-
 .../mxfp8/mxfp8_grouped_gemm/cg_forward.py    | 30 +++++++++++++
 .../mxfp8/test_mxfp8_grouped_gemm_forward.py  | 44 ++++++++++++++++---
 3 files changed, 71 insertions(+), 8 deletions(-)

diff --git a/alto/kernels/mxfp8/MXFP8_GROUPED_GEMM_PLAN.md b/alto/kernels/mxfp8/MXFP8_GROUPED_GEMM_PLAN.md
index 2c843fb..289c627 100644
--- a/alto/kernels/mxfp8/MXFP8_GROUPED_GEMM_PLAN.md
+++ b/alto/kernels/mxfp8/MXFP8_GROUPED_GEMM_PLAN.md
@@ -142,8 +142,9 @@ functional.py        # 暴露顶层入口
 
 **完成情况**：
 - `cg_forward.py` kernel body + wrapper 已实现。fp8 load 用 `other=0.0`；CDNA3 dequant 路径 `_dequantize_fp8` 一律传 `IS_2D_BLOCK=False`（scale 偏移已在 kernel 内展开为逐行 `[BLOCK, n_rep_k]`，与参考 `blockwise_mxfp8_gemm_kernel` 一致）。
-- 测试 `tests/unittest/mxfp8/test_mxfp8_grouped_gemm_forward.py`：2 shapes × 2D-block-x × 2D-block-w 共 8 例全过。主校验用 **dequant-then-matmul reference**（隔离 kernel 移植正确性 vs mxfp8 量化误差），cos-sim > 0.999；另加 bf16 宽松 sanity（> 0.99）。
-- ⚠️ 当前硬件 MI300X = **CDNA3**，`is_cdna4()=False`，仅验证了 `USE_DOT_SCALED=False` 的 dequant+dot 路径；`tl.dot_scaled`（CDNA4）分支已写但需在 MI350 上验证（见 Step 7）。
+- wrapper 已补最小输入契约检查：`inputs`/`expert_weights` 维度、`expert_indices.numel() == M_total`、`K % 32 == 0`、2D weight scale 的 `N % 32 == 0`、以及 input/weight scale shape 精确匹配。V1 仍不支持 padded buffer；如需支持，应像 mxfp4/nvfp4 一样显式区分 `M_bufferlen` 与 `M_total`。
+- 测试 `tests/unittest/mxfp8/test_mxfp8_grouped_gemm_forward.py`：2 shapes × 2D-block-x × 2D-block-w × `trans_weights` 共 16 个正例全过；另有 2 个负例覆盖 `expert_indices` 长度不匹配和 `weight_scales` shape 错误，共 **18 passed**。主校验用 **dequant-then-matmul reference**（隔离 kernel 移植正确性 vs mxfp8 量化误差），cos-sim > 0.999；另加 bf16 宽松 sanity（> 0.99）。
+- 2026-06-02 在 `friendly_elgamal` 容器中验证：`is_cdna4()=True`，forward 默认走 **CDNA4 `tl.dot_scaled`** 路径，`python -m pytest tests/unittest/mxfp8/test_mxfp8_grouped_gemm_forward.py -q` 结果为 `18 passed, 14 warnings in 12.05s`。CDNA3 fallback 路径仍需在 CDNA3/MI300 上单独复验。
 
 ### Step 3 — Backward dgrad kernel
 基于 `mxfp4/cg_backward.py` 的 `_kernel_mxfp4_grouped_gemm_backward_dx`：
diff --git a/alto/kernels/mxfp8/mxfp8_grouped_gemm/cg_forward.py b/alto/kernels/mxfp8/mxfp8_grouped_gemm/cg_forward.py
index d80812c..fa6b143 100644
--- a/alto/kernels/mxfp8/mxfp8_grouped_gemm/cg_forward.py
+++ b/alto/kernels/mxfp8/mxfp8_grouped_gemm/cg_forward.py
@@ -236,6 +236,8 @@ def mxfp8_grouped_gemm_forward(
 
     Y[m, :] = X[m] @ W[expert(m)]^T, computed per contiguous group of tokens.
     """
+    assert inputs.dim() == 2, f"inputs must be 2D, got {inputs.dim()}D"
+    assert expert_weights.dim() == 3, f"expert_weights must be 3D, got {expert_weights.dim()}D"
     assert expert_indices.is_contiguous(), "Expert indices tensor must be contiguous"
     if use_dot_scaled is None:
         use_dot_scaled = is_cdna4()
@@ -244,6 +246,10 @@ def mxfp8_grouped_gemm_forward(
     torch._check(M_total > 0)
     assert M_total % ALIGN_SIZE_M == 0, \
         f"M_total ({M_total}) must be a multiple of group_size_m ({ALIGN_SIZE_M})"
+    assert expert_indices.numel() == M_total, \
+        f"expert_indices length ({expert_indices.numel()}) must match M_total ({M_total})"
+    if K % BLOCK_SIZE_DEFAULT != 0:
+        raise ValueError(f"K ({K}) must be divisible by block_size ({BLOCK_SIZE_DEFAULT})")
 
     if expert_indices.dtype != torch.int32:
         expert_indices = expert_indices.to(torch.int32)
@@ -258,6 +264,30 @@ def mxfp8_grouped_gemm_forward(
         stride_bse, stride_bsk, stride_bsn = weight_scales.stride()
 
     assert K == K_weights, f"Input K ({K}) must match weight K ({K_weights})"
+    if use_2dblock_w and N % BLOCK_SIZE_DEFAULT != 0:
+        raise ValueError(f"N ({N}) must be divisible by block_size ({BLOCK_SIZE_DEFAULT}) for 2D weight scales")
+
+    expected_input_scales = (
+        (M_total // BLOCK_SIZE_DEFAULT, K // BLOCK_SIZE_DEFAULT)
+        if use_2dblock_x else
+        (M_total, K // BLOCK_SIZE_DEFAULT)
+    )
+    if trans_weights:
+        expected_weight_scales = (
+            (num_experts, N // BLOCK_SIZE_DEFAULT, K // BLOCK_SIZE_DEFAULT)
+            if use_2dblock_w else
+            (num_experts, N, K // BLOCK_SIZE_DEFAULT)
+        )
+    else:
+        expected_weight_scales = (
+            (num_experts, K // BLOCK_SIZE_DEFAULT, N // BLOCK_SIZE_DEFAULT)
+            if use_2dblock_w else
+            (num_experts, K // BLOCK_SIZE_DEFAULT, N)
+        )
+    assert input_scales.shape == torch.Size(expected_input_scales), \
+        f"input_scales shape {input_scales.shape} must be {expected_input_scales}"
+    assert weight_scales.shape == torch.Size(expected_weight_scales), \
+        f"weight_scales shape {weight_scales.shape} must be {expected_weight_scales}"
 
     output = torch.zeros((M_total, N), device=inputs.device, dtype=output_dtype)
 
diff --git a/tests/unittest/mxfp8/test_mxfp8_grouped_gemm_forward.py b/tests/unittest/mxfp8/test_mxfp8_grouped_gemm_forward.py
index 4458700..1a038cd 100644
--- a/tests/unittest/mxfp8/test_mxfp8_grouped_gemm_forward.py
+++ b/tests/unittest/mxfp8/test_mxfp8_grouped_gemm_forward.py
@@ -5,7 +5,7 @@
 import pytest
 import torch
 
-from alto.kernels.mxfp8.mxfp8_quantization import BLOCK_SIZE_DEFAULT, is_cdna4
+from alto.kernels.mxfp8.mxfp8_quantization import BLOCK_SIZE_DEFAULT
 from alto.kernels.mxfp8.mxfp8_grouped_gemm.autotune import ALIGN_SIZE_M
 from alto.kernels.mxfp8.mxfp8_grouped_gemm.cg_forward import mxfp8_grouped_gemm_forward
 
@@ -40,22 +40,24 @@ def _reference(inputs, expert_weights, indices, num_groups, trans_weights):
 @pytest.mark.parametrize("shape", [(256, 128, 128, 1), (512, 256, 256, 4)])
 @pytest.mark.parametrize("use_2dblock_x", [False, True])
 @pytest.mark.parametrize("use_2dblock_w", [False, True])
-def test_forward(shape, use_2dblock_x, use_2dblock_w):
+@pytest.mark.parametrize("trans_weights", [True, False])
+def test_forward(shape, use_2dblock_x, use_2dblock_w, trans_weights):
     M_total, N, K, num_experts = shape
     M_total = (M_total // ALIGN_SIZE_M) * ALIGN_SIZE_M
     num_groups = M_total // ALIGN_SIZE_M
     device = torch.device("cuda")
     data_type = torch.bfloat16
-    trans_weights = True
 
     inputs = prepare_data((M_total, K), data_type)
-    expert_weights = prepare_data((num_experts, N, K), data_type)
+    weight_shape = (num_experts, N, K) if trans_weights else (num_experts, K, N)
+    weight_axis = -1 if trans_weights else -2
+    expert_weights = prepare_data(weight_shape, data_type)
     indices = _make_indices(num_groups, num_experts, device)
 
     x_lp, x_s = torch.ops.alto.convert_to_mxfp8(
         inputs, block_size=BLOCK_SIZE_DEFAULT, mxfp_format="e4m3", axis=-1, is_2d_block=use_2dblock_x)
     w_lp, w_s = torch.ops.alto.convert_to_mxfp8(
-        expert_weights, block_size=BLOCK_SIZE_DEFAULT, mxfp_format="e4m3", axis=-1, is_2d_block=use_2dblock_w)
+        expert_weights, block_size=BLOCK_SIZE_DEFAULT, mxfp_format="e4m3", axis=weight_axis, is_2d_block=use_2dblock_w)
 
     out = mxfp8_grouped_gemm_forward(
         x_lp, w_lp, indices, x_s, w_s,
@@ -71,7 +73,7 @@ def test_forward(shape, use_2dblock_x, use_2dblock_w):
     # consumed, matmul in PyTorch. This isolates kernel-port correctness from
     # mxfp8-vs-bf16 quantization error.
     x_dq = convert_from_mxfp8_pytorch(x_lp, x_s, torch.float32, BLOCK_SIZE_DEFAULT, -1, use_2dblock_x)
-    w_dq = convert_from_mxfp8_pytorch(w_lp, w_s, torch.float32, BLOCK_SIZE_DEFAULT, -1, use_2dblock_w)
+    w_dq = convert_from_mxfp8_pytorch(w_lp, w_s, torch.float32, BLOCK_SIZE_DEFAULT, weight_axis, use_2dblock_w)
     ref_dq = _reference(x_dq, w_dq, indices, num_groups, trans_weights)
     cos_dq = _cossim(out, ref_dq)
     assert cos_dq > 0.999, \
@@ -81,3 +83,33 @@ def test_forward(shape, use_2dblock_x, use_2dblock_w):
     ref_bf16 = _reference(inputs, expert_weights, indices, num_groups, trans_weights)
     cos_bf16 = _cossim(out, ref_bf16)
     assert cos_bf16 > 0.99, f"kernel vs bf16 cos-sim too low: {cos_bf16}"
+
+
+def test_forward_rejects_indices_length_mismatch():
+    M_total, N, K, num_experts = ALIGN_SIZE_M, 128, 128, 1
+    data_type = torch.bfloat16
+    inputs = prepare_data((M_total, K), data_type)
+    expert_weights = prepare_data((num_experts, N, K), data_type)
+    indices = torch.zeros(M_total - 1, dtype=torch.int32, device="cuda")
+    x_lp, x_s = torch.ops.alto.convert_to_mxfp8(
+        inputs, block_size=BLOCK_SIZE_DEFAULT, mxfp_format="e4m3", axis=-1, is_2d_block=False)
+    w_lp, w_s = torch.ops.alto.convert_to_mxfp8(
+        expert_weights, block_size=BLOCK_SIZE_DEFAULT, mxfp_format="e4m3", axis=-1, is_2d_block=False)
+
+    with pytest.raises(AssertionError, match="expert_indices length"):
+        mxfp8_grouped_gemm_forward(x_lp, w_lp, indices, x_s, w_s)
+
+
+def test_forward_rejects_wrong_scale_shape():
+    M_total, N, K, num_experts = ALIGN_SIZE_M, 128, 128, 1
+    data_type = torch.bfloat16
+    inputs = prepare_data((M_total, K), data_type)
+    expert_weights = prepare_data((num_experts, N, K), data_type)
+    indices = torch.zeros(M_total, dtype=torch.int32, device="cuda")
+    x_lp, x_s = torch.ops.alto.convert_to_mxfp8(
+        inputs, block_size=BLOCK_SIZE_DEFAULT, mxfp_format="e4m3", axis=-1, is_2d_block=False)
+    w_lp, w_s = torch.ops.alto.convert_to_mxfp8(
+        expert_weights, block_size=BLOCK_SIZE_DEFAULT, mxfp_format="e4m3", axis=-1, is_2d_block=False)
+
+    with pytest.raises(AssertionError, match="weight_scales shape"):
+        mxfp8_grouped_gemm_forward(x_lp, w_lp, indices, x_s, w_s[:, :, :-1])

From d077a80782dfa27baa847e7a7c334efec47c295e Mon Sep 17 00:00:00 2001
From: Yue Sun <Yue.Sun2@amd.com>
Date: Fri, 5 Jun 2026 01:48:36 -0500
Subject: [PATCH 07/12] mxfp8: implement grouped GEMM backward path

---
 alto/kernels/mxfp8/MXFP8_GROUPED_GEMM_PLAN.md |  36 +-
 .../mxfp8/mxfp8_grouped_gemm/autotune.py      |  24 +
 .../mxfp8/mxfp8_grouped_gemm/cg_backward.py   | 529 +++++++++++++++++-
 .../mxfp8/test_mxfp8_grouped_gemm_backward.py | 222 ++++++++
 4 files changed, 786 insertions(+), 25 deletions(-)
 create mode 100644 tests/unittest/mxfp8/test_mxfp8_grouped_gemm_backward.py

diff --git a/alto/kernels/mxfp8/MXFP8_GROUPED_GEMM_PLAN.md b/alto/kernels/mxfp8/MXFP8_GROUPED_GEMM_PLAN.md
index 289c627..93289d0 100644
--- a/alto/kernels/mxfp8/MXFP8_GROUPED_GEMM_PLAN.md
+++ b/alto/kernels/mxfp8/MXFP8_GROUPED_GEMM_PLAN.md
@@ -146,20 +146,30 @@ functional.py        # 暴露顶层入口
 - 测试 `tests/unittest/mxfp8/test_mxfp8_grouped_gemm_forward.py`：2 shapes × 2D-block-x × 2D-block-w × `trans_weights` 共 16 个正例全过；另有 2 个负例覆盖 `expert_indices` 长度不匹配和 `weight_scales` shape 错误，共 **18 passed**。主校验用 **dequant-then-matmul reference**（隔离 kernel 移植正确性 vs mxfp8 量化误差），cos-sim > 0.999；另加 bf16 宽松 sanity（> 0.99）。
 - 2026-06-02 在 `friendly_elgamal` 容器中验证：`is_cdna4()=True`，forward 默认走 **CDNA4 `tl.dot_scaled`** 路径，`python -m pytest tests/unittest/mxfp8/test_mxfp8_grouped_gemm_forward.py -q` 结果为 `18 passed, 14 warnings in 12.05s`。CDNA3 fallback 路径仍需在 CDNA3/MI300 上单独复验。
 
-### Step 3 — Backward dgrad kernel
+### Step 3 — Backward dgrad kernel ✅ 已完成
 基于 `mxfp4/cg_backward.py` 的 `_kernel_mxfp4_grouped_gemm_backward_dx`：
 - 删 packing
 - dtype：V1 `LHS=e4m3, RHS=e4m3`（v2 改 `LHS=e5m2`）
 - 注意 W 的访问：dgrad 沿 N reduce，所以 W [N,K] 在 kernel 内按 N-major 加载（与 fwd 相同 shape，不同 reduction）；scale `b_s` 此时沿 N 是 reduction 维 → `stride_bsk`/`stride_bsn` 用法跟 mxfp4 一致
 
-### Step 4 — Backward wgrad kernel
+**完成情况**：
+- `autotune.py` 新增 `DGRAD_CONFIGS`：`BSM=128, BSN=32, BSK=32`，让 dgrad 的 N reduction 每次 `dot_scaled` 只覆盖一个 32-wide MX scale group。
+- `_kernel_mxfp8_grouped_gemm_backward_dx` 已实现：按 `(M, K)` tile 计算 `dX = GO @ W`，删除 mxfp4 packing，支持 `USE_DOT_SCALED=True` 的 CDNA4 `tl.dot_scaled` 路径，以及 `USE_DOT_SCALED=False` 的 CDNA3 dequant + `tl.dot` fallback。
+- wrapper `mxfp8_grouped_gemm_backward_inputs` 已补最小输入契约检查：`M_total` 按 `ALIGN_SIZE_M=128` 对齐、`expert_indices.numel() == M_total`、`N/K` 可被 32 整除、GO/W scale shape 精确匹配，并支持 `trans_weights=True/False`。
+
+### Step 4 — Backward wgrad kernel ✅ 已完成
 基于 `_kernel_mxfp4_grouped_gemm_backward_dw`：
 - 删 packing
 - dtype：V1 `LHS=e4m3, RHS=e4m3`（v2 改 `LHS=e5m2`）
 - M 是 reduction 维 → 必须用 **沿 M 量化** 的 GO / X（autograd 里准备好）
 - 保持 mxfp4 的 "loop over groups, skip if expert mismatch" 简单实现，性能问题留到 v2
 
-### Step 5 — Autograd Function
+**完成情况**：
+- `autotune.py` 新增 `WGRAD_CONFIGS`：`BSM=32, BSN=128, BSK=32`，让 wgrad 的 M reduction 每次 `dot_scaled` 只覆盖一个 32-wide MX scale group。
+- `_kernel_mxfp8_grouped_gemm_backward_dw` 已实现：按 `(expert, N, K)` tile 计算 `dW = GO^T @ X`，保留 mxfp4 的简单调度方式：每个 expert tile 遍历所有 contiguous routing group，只累加匹配 expert 的 group。
+- wrapper `mxfp8_grouped_gemm_backward_weights` 已补最小输入契约检查：`M_total` 按 `ALIGN_SIZE_M=128` 对齐、`expert_indices.numel() == M_total`、`N/K` 可被 32 整除、GO/X scale shape 精确匹配，并支持 `trans_weights=True/False`。
+
+### Step 5 — Autograd Function ✅ 已完成
 参考 `MXFP4GroupedGEMM`：
 1. fwd：调 `convert_to_mxfp8` 量化 X (axis=-1) 与 W (axis=quant_axis_w)，调 fwd kernel
 2. 若 `use_2dblock_x=False`，额外 quant X 沿 axis=0 一份给 wgrad
@@ -168,11 +178,21 @@ functional.py        # 暴露顶层入口
 5. bwd：quant GO 沿 axis=-1（给 dgrad）与 axis=0（给 wgrad），格式用 `bwd_grad_format`（V1=e4m3），调两个 bwd kernel
 6. 跳过 mxfp4 里的 `use_dge` / `hadamard_transform` / `use_macro_block_scaling` / `clip_mode`（这些是研究 feature，最小版本不要）
 
-### Step 6 — 数值正确性测试
-新建 `mxfp8/mxfp8_grouped_gemm/tests/`：
-1. `test_forward.py`：单 expert + 多 expert，bf16 reference 对齐（rel err < ~1e-2）
-2. `test_backward.py`：finite-diff 不现实，改用「mxfp8 模拟版」reference：用 `convert_to_mxfp8` 后立刻 `convert_from_mxfp8` 回 bf16，再走 PyTorch 原生 GEMM，作为「数值等价 reference」
-3. `test_e2e_moe.py`：toy MoE layer（2 expert, K=128, N=128, M_total=256），fwd+bwd+optimizer step，看 loss 下降几步
+**完成情况**：
+- `MXFP8GroupedGEMM` 已接通完整 forward/backward：forward 保存 wgrad 所需的沿 M 量化 X，以及 dgrad 所需的沿 N 量化 W；backward 量化 GO 后分别调用 dgrad/wgrad kernel。
+- `functional.py` 的用户入口 `mxfp8_grouped_gemm(...)` 已接到 `MXFP8GroupedGEMM.apply`，默认仍是 V1 全 e4m3，`bwd_grad_format="e4m3"`。
+- 当前实现没有引入 mxfp4 的 DGE、Hadamard、macro block scaling、clip mode；这是正确的，v1 不该把研究 feature 混进最小路径。
+
+### Step 6 — 数值正确性测试 ⏳ 部分完成
+测试位置在 `tests/unittest/mxfp8/`：
+1. `test_mxfp8_grouped_gemm_forward.py`：单 expert + 多 expert，dequant-then-matmul reference 对齐，并保留 bf16 sanity。
+2. `test_mxfp8_grouped_gemm_backward.py`：用「mxfp8 模拟版」reference（先 `convert_to_mxfp8`，再 `convert_from_mxfp8` 回 fp32，然后走 PyTorch 原生 GEMM）校验 dgrad/wgrad；另覆盖 autograd 端到端 forward + backward。
+3. `test_e2e_moe.py`：toy MoE layer + optimizer step 仍未实现。
+
+**验证记录**：
+- 2026-06-05 在 `cranky_shockley` 容器（`wanghanthu/torchtitan:ubuntu22.04-pytorch2.12.0dev20260217-rocm7.2-patch`）中验证 backward：`python -m pytest tests/unittest/mxfp8/test_mxfp8_grouped_gemm_backward.py -q` 结果为 **17 passed, 14 warnings in 20.64s**。
+- backward 覆盖：dgrad `trans_weights` × 2D GO × 2D W 共 8 个正例；wgrad `trans_weights` × 2D X 共 4 个正例；autograd 覆盖 `trans_weights=True` 下 2D X/W 四种组合，以及 `trans_weights=False` 的 1D 路径。
+- 仍需补 toy MoE 训练 sanity，并在 CDNA3/MI300 上单独验证 `USE_DOT_SCALED=False` fallback。
 
 ### Step 7 — MI300 fallback 验证
 仅切 `USE_DOT_SCALED=False` 路径重跑 Step 6，确保 CDNA3 上数值与 CDNA4 一致（dequant + fp32 dot 是 ground truth）。
diff --git a/alto/kernels/mxfp8/mxfp8_grouped_gemm/autotune.py b/alto/kernels/mxfp8/mxfp8_grouped_gemm/autotune.py
index aeab99d..b297e63 100644
--- a/alto/kernels/mxfp8/mxfp8_grouped_gemm/autotune.py
+++ b/alto/kernels/mxfp8/mxfp8_grouped_gemm/autotune.py
@@ -25,3 +25,27 @@
         num_warps=4,
     ),
 ]
+
+DGRAD_CONFIGS = [
+    triton.Config(
+        {
+            "BLOCK_SIZE_M": 128,
+            "BLOCK_SIZE_N": 32,   # dgrad reduces over N; keep one MX scale group per dot_scaled
+            "BLOCK_SIZE_K": 32,
+        },
+        num_stages=2,
+        num_warps=4,
+    ),
+]
+
+WGRAD_CONFIGS = [
+    triton.Config(
+        {
+            "BLOCK_SIZE_M": 32,   # wgrad reduces over M; keep one MX scale group per dot_scaled
+            "BLOCK_SIZE_N": 128,
+            "BLOCK_SIZE_K": 32,
+        },
+        num_stages=2,
+        num_warps=4,
+    ),
+]
diff --git a/alto/kernels/mxfp8/mxfp8_grouped_gemm/cg_backward.py b/alto/kernels/mxfp8/mxfp8_grouped_gemm/cg_backward.py
index bf7da66..14b61c6 100644
--- a/alto/kernels/mxfp8/mxfp8_grouped_gemm/cg_backward.py
+++ b/alto/kernels/mxfp8/mxfp8_grouped_gemm/cg_backward.py
@@ -5,23 +5,34 @@
 Skeleton only; kernel bodies filled in Steps 3-5.
 """
 
+from typing import Optional
+
 import torch
 import triton
 import triton.language as tl
 from torch.library import triton_op, wrap_triton
 
-from alto.kernels.mxfp8.mxfp8_quantization import BLOCK_SIZE_DEFAULT, is_cdna4
+from alto.kernels.mxfp8.mxfp8_quantization import BLOCK_SIZE_DEFAULT, _dequantize_fp8, is_cdna4
 from alto.kernels.mxfp8.mxfp8_grouped_gemm.autotune import (
-    STANDARD_CONFIGS,
+    DGRAD_CONFIGS,
+    WGRAD_CONFIGS,
     ALIGN_SIZE_M,
 )
 from alto.kernels.mxfp8.mxfp8_grouped_gemm.cg_forward import mxfp8_grouped_gemm_forward
 
 
+def _format_to_id(fmt: str) -> int:
+    if fmt == "e4m3":
+        return 0
+    if fmt == "e5m2":
+        return 1
+    raise ValueError(f"Unsupported MXFP8 format: {fmt}")
+
+
 # ============ dgrad: grad_input = grad_output @ expert_weights ============
 
 @triton.autotune(
-    configs=STANDARD_CONFIGS,
+    configs=DGRAD_CONFIGS,
     key=[
         "N", "K", "GROUP_SIZE_M",
         "USE_2DBLOCK_GO", "USE_2DBLOCK_B",
@@ -57,15 +68,109 @@ def _kernel_mxfp8_grouped_gemm_backward_dx(
     USE_DOT_SCALED: tl.constexpr,
     GROUP_SIZE_M: tl.constexpr = ALIGN_SIZE_M,
 ):
-    """dgrad kernel: dX = GO @ W (N is reduction dim). To be implemented in Step 3."""
-    # TODO(Step 3): port from mxfp4 cg_backward._kernel_mxfp4_grouped_gemm_backward_dx
-    pass
+    """dgrad kernel: dX = GO @ W (N is reduction dim)."""
+    if USE_2DBLOCK_GO:
+        tl.assume(BLOCK_SIZE_M % QUANT_BLOCK_SIZE == 0)
+    if USE_2DBLOCK_B:
+        tl.assume(BLOCK_SIZE_K % QUANT_BLOCK_SIZE == 0)
+    tl.assume(BLOCK_SIZE_N % QUANT_BLOCK_SIZE == 0)
+
+    c_type = grad_input_ptr.dtype.element_ty
+    n_rep_n: tl.constexpr = BLOCK_SIZE_N // QUANT_BLOCK_SIZE
+    Ns: tl.constexpr = N // QUANT_BLOCK_SIZE
+
+    pid = tl.program_id(0)
+    num_k_tiles = tl.cdiv(K, BLOCK_SIZE_K)
+    tile_m = pid // num_k_tiles
+    tile_k = pid % num_k_tiles
+
+    m_start = tile_m * BLOCK_SIZE_M
+    k_start = tile_k * BLOCK_SIZE_K
+
+    if m_start < M_TOTAL:
+        offs_m = m_start + tl.arange(0, BLOCK_SIZE_M)
+        offs_k = k_start + tl.arange(0, BLOCK_SIZE_K)
+        mask_m = offs_m < M_TOTAL
+        mask_k = offs_k < K
+
+        if USE_2DBLOCK_GO:
+            offs_m_scale = offs_m // QUANT_BLOCK_SIZE
+        else:
+            offs_m_scale = offs_m
+        if USE_2DBLOCK_B:
+            offs_k_scale = offs_k // QUANT_BLOCK_SIZE
+        else:
+            offs_k_scale = offs_k
+
+        group_idx = m_start // GROUP_SIZE_M
+        expert_idx = tl.load(indices_ptr + group_idx * GROUP_SIZE_M)
+
+        grad_input = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_K), dtype=tl.float32)
+        num_n_tiles = tl.cdiv(N, BLOCK_SIZE_N)
+        for ni in range(num_n_tiles):
+            offs_n = ni * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
+            offs_n_scale = ni * n_rep_n + tl.arange(0, n_rep_n)
+            mask_n = offs_n < N
+            mask_n_scale = offs_n_scale < Ns
+
+            go_ptrs = grad_output_ptr + offs_m[:, None] * stride_gom + offs_n[None, :] * stride_gon
+            go = tl.load(go_ptrs, mask=mask_m[:, None] & mask_n[None, :], other=0.0)
+
+            w_ptrs = b_ptr + expert_idx * stride_be + offs_n[:, None] * stride_bn + offs_k[None, :] * stride_bk
+            w = tl.load(w_ptrs, mask=mask_n[:, None] & mask_k[None, :], other=0.0)
+
+            go_s_ptrs = go_s_ptr + offs_m_scale[:, None] * stride_gosm + offs_n_scale[None, :] * stride_gosn
+            # W scales are K x N-scale even though W operand is N x K.
+            b_s_ptrs = b_s_ptr + expert_idx * stride_bse + offs_k_scale[:, None] * stride_bsk + offs_n_scale[None, :] * stride_bsn
+            go_s = tl.load(go_s_ptrs, mask=mask_m[:, None] & mask_n_scale[None, :], other=1)
+            b_s = tl.load(b_s_ptrs, mask=mask_k[:, None] & mask_n_scale[None, :], other=1)
+
+            if USE_DOT_SCALED:
+                if LHS_FORMAT_ID == 0:
+                    if RHS_FORMAT_ID == 0:
+                        grad_input = tl.dot_scaled(go, go_s, "e4m3", w, b_s, "e4m3",
+                                                   acc=grad_input, out_dtype=tl.float32)
+                    else:
+                        grad_input = tl.dot_scaled(go, go_s, "e4m3", w, b_s, "e5m2",
+                                                   acc=grad_input, out_dtype=tl.float32)
+                else:
+                    if RHS_FORMAT_ID == 0:
+                        grad_input = tl.dot_scaled(go, go_s, "e5m2", w, b_s, "e4m3",
+                                                   acc=grad_input, out_dtype=tl.float32)
+                    else:
+                        grad_input = tl.dot_scaled(go, go_s, "e5m2", w, b_s, "e5m2",
+                                                   acc=grad_input, out_dtype=tl.float32)
+            else:
+                go_dq = _dequantize_fp8(
+                    go, go_s,
+                    output_dtype=tl.float32,
+                    BLOCK_M=BLOCK_SIZE_M,
+                    BLOCK_N=BLOCK_SIZE_N,
+                    QUANT_BLOCK_SIZE=QUANT_BLOCK_SIZE,
+                    FP8_FORMAT=LHS_FORMAT_ID,
+                    IS_2D_BLOCK=False,
+                    USE_ASM=False,
+                )
+                w_dq = tl.trans(_dequantize_fp8(
+                    tl.trans(w), b_s,
+                    output_dtype=tl.float32,
+                    BLOCK_M=BLOCK_SIZE_K,
+                    BLOCK_N=BLOCK_SIZE_N,
+                    QUANT_BLOCK_SIZE=QUANT_BLOCK_SIZE,
+                    FP8_FORMAT=RHS_FORMAT_ID,
+                    IS_2D_BLOCK=False,
+                    USE_ASM=False,
+                ))
+                grad_input = tl.dot(go_dq, w_dq, acc=grad_input, out_dtype=tl.float32)
+
+        grad_input_ptrs = grad_input_ptr + offs_m[:, None] * stride_gim + offs_k[None, :] * stride_gik
+        tl.store(grad_input_ptrs, grad_input.to(c_type), mask=mask_m[:, None] & mask_k[None, :])
 
 
 # ============ wgrad: grad_weights = grad_output^T @ inputs (per expert) ============
 
 @triton.autotune(
-    configs=STANDARD_CONFIGS,
+    configs=WGRAD_CONFIGS,
     key=[
         "N", "K", "NUM_EXPERTS", "GROUP_SIZE_M",
         "USE_2DBLOCK_GO", "USE_2DBLOCK_A",
@@ -101,9 +206,109 @@ def _kernel_mxfp8_grouped_gemm_backward_dw(
     RHS_FORMAT_ID: tl.constexpr,  # 0 (e4m3)
     USE_DOT_SCALED: tl.constexpr,
 ):
-    """wgrad kernel: dW = GO^T @ X (M is reduction dim). To be implemented in Step 4."""
-    # TODO(Step 4): port from mxfp4 cg_backward._kernel_mxfp4_grouped_gemm_backward_dw
-    pass
+    """wgrad kernel: dW = GO^T @ X (M is reduction dim)."""
+    if USE_2DBLOCK_GO:
+        tl.assume(BLOCK_SIZE_N % QUANT_BLOCK_SIZE == 0)
+    if USE_2DBLOCK_A:
+        tl.assume(BLOCK_SIZE_K % QUANT_BLOCK_SIZE == 0)
+    tl.assume(BLOCK_SIZE_M % QUANT_BLOCK_SIZE == 0)
+
+    c_type = grad_weights_ptr.dtype.element_ty
+    n_rep_m: tl.constexpr = BLOCK_SIZE_M // QUANT_BLOCK_SIZE
+    Ms: tl.constexpr = M_TOTAL // QUANT_BLOCK_SIZE
+
+    pid = tl.program_id(0)
+    n_tiles = tl.cdiv(N, BLOCK_SIZE_N)
+    k_tiles = tl.cdiv(K, BLOCK_SIZE_K)
+    tiles_per_expert = n_tiles * k_tiles
+    expert_id = pid // tiles_per_expert
+    position_id = pid % tiles_per_expert
+
+    if expert_id < NUM_EXPERTS:
+        tile_n = position_id // k_tiles
+        tile_k = position_id % k_tiles
+        n_start = tile_n * BLOCK_SIZE_N
+        k_start = tile_k * BLOCK_SIZE_K
+
+        offs_n = n_start + tl.arange(0, BLOCK_SIZE_N)
+        offs_k = k_start + tl.arange(0, BLOCK_SIZE_K)
+        mask_n = offs_n < N
+        mask_k = offs_k < K
+
+        if USE_2DBLOCK_GO:
+            offs_n_scale = offs_n // QUANT_BLOCK_SIZE
+        else:
+            offs_n_scale = offs_n
+        if USE_2DBLOCK_A:
+            offs_k_scale = offs_k // QUANT_BLOCK_SIZE
+        else:
+            offs_k_scale = offs_k
+
+        grad_weights = tl.zeros((BLOCK_SIZE_N, BLOCK_SIZE_K), dtype=tl.float32)
+        for group_idx in range(0, M_TOTAL // GROUP_SIZE_M):
+            group_start = group_idx * GROUP_SIZE_M
+            group_expert = tl.load(indices_ptr + group_start)
+
+            if group_expert == expert_id:
+                for m_offset in range(0, GROUP_SIZE_M, BLOCK_SIZE_M):
+                    m_start = group_start + m_offset
+                    offs_m = m_start + tl.arange(0, BLOCK_SIZE_M)
+                    offs_m_scale = m_start // QUANT_BLOCK_SIZE + tl.arange(0, n_rep_m)
+                    mask_m = offs_m < tl.minimum(group_start + GROUP_SIZE_M, M_TOTAL)
+                    mask_m_scale = offs_m_scale < tl.minimum((group_start + GROUP_SIZE_M) // QUANT_BLOCK_SIZE, Ms)
+
+                    go_ptrs = grad_output_ptr + offs_n[:, None] * stride_gon + offs_m[None, :] * stride_gom
+                    go = tl.load(go_ptrs, mask=mask_n[:, None] & mask_m[None, :], other=0.0)
+
+                    in_ptrs = inputs_ptr + offs_m[:, None] * stride_am + offs_k[None, :] * stride_ak
+                    inp = tl.load(in_ptrs, mask=mask_m[:, None] & mask_k[None, :], other=0.0)
+
+                    go_s_ptrs = go_s_ptr + offs_n_scale[:, None] * stride_gosn + offs_m_scale[None, :] * stride_gosm
+                    # Input scales are K x M-scale even though input operand is M x K.
+                    inp_s_ptrs = a_s_ptr + offs_k_scale[:, None] * stride_ask + offs_m_scale[None, :] * stride_asm
+                    go_s = tl.load(go_s_ptrs, mask=mask_n[:, None] & mask_m_scale[None, :], other=1)
+                    inp_s = tl.load(inp_s_ptrs, mask=mask_k[:, None] & mask_m_scale[None, :], other=1)
+
+                    if USE_DOT_SCALED:
+                        if LHS_FORMAT_ID == 0:
+                            if RHS_FORMAT_ID == 0:
+                                grad_weights = tl.dot_scaled(go, go_s, "e4m3", inp, inp_s, "e4m3",
+                                                             acc=grad_weights, out_dtype=tl.float32)
+                            else:
+                                grad_weights = tl.dot_scaled(go, go_s, "e4m3", inp, inp_s, "e5m2",
+                                                             acc=grad_weights, out_dtype=tl.float32)
+                        else:
+                            if RHS_FORMAT_ID == 0:
+                                grad_weights = tl.dot_scaled(go, go_s, "e5m2", inp, inp_s, "e4m3",
+                                                             acc=grad_weights, out_dtype=tl.float32)
+                            else:
+                                grad_weights = tl.dot_scaled(go, go_s, "e5m2", inp, inp_s, "e5m2",
+                                                             acc=grad_weights, out_dtype=tl.float32)
+                    else:
+                        go_dq = _dequantize_fp8(
+                            go, go_s,
+                            output_dtype=tl.float32,
+                            BLOCK_M=BLOCK_SIZE_N,
+                            BLOCK_N=BLOCK_SIZE_M,
+                            QUANT_BLOCK_SIZE=QUANT_BLOCK_SIZE,
+                            FP8_FORMAT=LHS_FORMAT_ID,
+                            IS_2D_BLOCK=False,
+                            USE_ASM=False,
+                        )
+                        inp_dq = tl.trans(_dequantize_fp8(
+                            tl.trans(inp), inp_s,
+                            output_dtype=tl.float32,
+                            BLOCK_M=BLOCK_SIZE_K,
+                            BLOCK_N=BLOCK_SIZE_M,
+                            QUANT_BLOCK_SIZE=QUANT_BLOCK_SIZE,
+                            FP8_FORMAT=RHS_FORMAT_ID,
+                            IS_2D_BLOCK=False,
+                            USE_ASM=False,
+                        ))
+                        grad_weights = tl.dot(go_dq, inp_dq, acc=grad_weights, out_dtype=tl.float32)
+
+        grad_w_ptrs = grad_weights_ptr + expert_id * stride_gbe + offs_n[:, None] * stride_gbn + offs_k[None, :] * stride_gbk
+        tl.store(grad_w_ptrs, grad_weights.to(c_type), mask=mask_n[:, None] & mask_k[None, :])
 
 
 # =============== triton_op wrappers ===============
@@ -121,9 +326,91 @@ def mxfp8_grouped_gemm_backward_inputs(
     lhs_format_id: int = 0,  # V1: e4m3 (v2: 1=e5m2 for grad_output)
     rhs_format_id: int = 0,  # e4m3
     output_dtype: torch.dtype = torch.bfloat16,
+    use_dot_scaled: Optional[bool] = None,
 ) -> torch.Tensor:
-    """dgrad wrapper. To be implemented in Step 3."""
-    raise NotImplementedError("Step 3: implement dgrad wrapper")
+    """dgrad wrapper: grad_input = grad_output @ W."""
+    assert grad_output.dim() == 2, f"grad_output must be 2D, got {grad_output.dim()}D"
+    assert expert_weights.dim() == 3, f"expert_weights must be 3D, got {expert_weights.dim()}D"
+    assert expert_indices.is_contiguous(), "Expert indices tensor must be contiguous"
+    if use_dot_scaled is None:
+        use_dot_scaled = is_cdna4()
+
+    M_total, N = grad_output.shape
+    torch._check(M_total > 0)
+    assert M_total % ALIGN_SIZE_M == 0, \
+        f"M_total ({M_total}) must be a multiple of group_size_m ({ALIGN_SIZE_M})"
+    assert expert_indices.numel() == M_total, \
+        f"expert_indices length ({expert_indices.numel()}) must match M_total ({M_total})"
+    if N % BLOCK_SIZE_DEFAULT != 0:
+        raise ValueError(f"N ({N}) must be divisible by block_size ({BLOCK_SIZE_DEFAULT})")
+
+    if expert_indices.dtype != torch.int32:
+        expert_indices = expert_indices.to(torch.int32)
+
+    if trans_weights:
+        num_experts, N_weights, K = expert_weights.shape
+        stride_be, stride_bn, stride_bk = expert_weights.stride()
+        stride_bse, stride_bsn, stride_bsk = expert_weight_scales.stride()
+        expected_weight_scales = (
+            (num_experts, N // BLOCK_SIZE_DEFAULT, K // BLOCK_SIZE_DEFAULT)
+            if use_2dblock_w else
+            (num_experts, N // BLOCK_SIZE_DEFAULT, K)
+        )
+    else:
+        num_experts, K, N_weights = expert_weights.shape
+        stride_be, stride_bk, stride_bn = expert_weights.stride()
+        stride_bse, stride_bsk, stride_bsn = expert_weight_scales.stride()
+        expected_weight_scales = (
+            (num_experts, K // BLOCK_SIZE_DEFAULT, N // BLOCK_SIZE_DEFAULT)
+            if use_2dblock_w else
+            (num_experts, K, N // BLOCK_SIZE_DEFAULT)
+        )
+
+    assert N == N_weights, f"grad_output N ({N}) must match weight N ({N_weights})"
+    if K % BLOCK_SIZE_DEFAULT != 0:
+        raise ValueError(f"K ({K}) must be divisible by block_size ({BLOCK_SIZE_DEFAULT})")
+
+    expected_go_scales = (
+        (M_total // BLOCK_SIZE_DEFAULT, N // BLOCK_SIZE_DEFAULT)
+        if use_2dblock_x else
+        (M_total, N // BLOCK_SIZE_DEFAULT)
+    )
+    assert go_scales.shape == torch.Size(expected_go_scales), \
+        f"go_scales shape {go_scales.shape} must be {expected_go_scales}"
+    assert expert_weight_scales.shape == torch.Size(expected_weight_scales), \
+        f"expert_weight_scales shape {expert_weight_scales.shape} must be {expected_weight_scales}"
+
+    grad_inputs = torch.zeros((M_total, K), device=grad_output.device, dtype=output_dtype)
+    stride_gom, stride_gon = grad_output.stride()
+    stride_gosm, stride_gosn = go_scales.stride()
+    stride_gim, stride_gik = grad_inputs.stride()
+
+    grid = lambda meta: (triton.cdiv(M_total, meta["BLOCK_SIZE_M"]) * triton.cdiv(K, meta["BLOCK_SIZE_K"]),)
+    wrap_triton(_kernel_mxfp8_grouped_gemm_backward_dx)[grid](
+        grad_output,
+        expert_weights,
+        grad_inputs,
+        expert_indices,
+        go_scales,
+        expert_weight_scales,
+        stride_gom, stride_gon,
+        stride_be, stride_bn, stride_bk,
+        stride_gim, stride_gik,
+        stride_gosm, stride_gosn,
+        stride_bse, stride_bsn, stride_bsk,
+        M_TOTAL=M_total,
+        N=N,
+        K=K,
+        NUM_EXPERTS=num_experts,
+        GROUP_SIZE_M=ALIGN_SIZE_M,
+        USE_2DBLOCK_GO=use_2dblock_x,
+        USE_2DBLOCK_B=use_2dblock_w,
+        QUANT_BLOCK_SIZE=BLOCK_SIZE_DEFAULT,
+        LHS_FORMAT_ID=lhs_format_id,
+        RHS_FORMAT_ID=rhs_format_id,
+        USE_DOT_SCALED=use_dot_scaled,
+    )
+    return grad_inputs
 
 
 @triton_op("alto::mxfp8_grouped_gemm_backward_weights", mutates_args={})
@@ -140,16 +427,91 @@ def mxfp8_grouped_gemm_backward_weights(
     lhs_format_id: int = 0,  # V1: e4m3 (v2: 1=e5m2 for grad_output)
     rhs_format_id: int = 0,  # e4m3
     output_dtype: torch.dtype = torch.bfloat16,
+    use_dot_scaled: Optional[bool] = None,
 ) -> torch.Tensor:
-    """wgrad wrapper. To be implemented in Step 4."""
-    raise NotImplementedError("Step 4: implement wgrad wrapper")
+    """wgrad wrapper: grad_weight = grad_output^T @ inputs per expert."""
+    assert grad_output.dim() == 2, f"grad_output must be 2D, got {grad_output.dim()}D"
+    assert inputs.dim() == 2, f"inputs must be 2D, got {inputs.dim()}D"
+    assert expert_indices.is_contiguous(), "Expert indices tensor must be contiguous"
+    if use_dot_scaled is None:
+        use_dot_scaled = is_cdna4()
+
+    M_total, N = grad_output.shape
+    M_inputs, K = inputs.shape
+    assert M_inputs == M_total, f"inputs M ({M_inputs}) must match grad_output M ({M_total})"
+    torch._check(M_total > 0)
+    assert M_total % ALIGN_SIZE_M == 0, \
+        f"M_total ({M_total}) must be a multiple of group_size_m ({ALIGN_SIZE_M})"
+    assert expert_indices.numel() == M_total, \
+        f"expert_indices length ({expert_indices.numel()}) must match M_total ({M_total})"
+    if N % BLOCK_SIZE_DEFAULT != 0:
+        raise ValueError(f"N ({N}) must be divisible by block_size ({BLOCK_SIZE_DEFAULT})")
+    if K % BLOCK_SIZE_DEFAULT != 0:
+        raise ValueError(f"K ({K}) must be divisible by block_size ({BLOCK_SIZE_DEFAULT})")
+
+    if expert_indices.dtype != torch.int32:
+        expert_indices = expert_indices.to(torch.int32)
+
+    expected_go_scales = (
+        (M_total // BLOCK_SIZE_DEFAULT, N // BLOCK_SIZE_DEFAULT)
+        if use_2dblock_go else
+        (M_total // BLOCK_SIZE_DEFAULT, N)
+    )
+    expected_input_scales = (
+        (M_total // BLOCK_SIZE_DEFAULT, K // BLOCK_SIZE_DEFAULT)
+        if use_2dblock_x else
+        (M_total // BLOCK_SIZE_DEFAULT, K)
+    )
+    assert go_scales.shape == torch.Size(expected_go_scales), \
+        f"go_scales shape {go_scales.shape} must be {expected_go_scales}"
+    assert input_scales.shape == torch.Size(expected_input_scales), \
+        f"input_scales shape {input_scales.shape} must be {expected_input_scales}"
+
+    if trans_weights:
+        grad_weights = torch.zeros((num_experts, N, K), device=grad_output.device, dtype=output_dtype)
+        stride_gbe, stride_gbn, stride_gbk = grad_weights.stride()
+    else:
+        grad_weights = torch.zeros((num_experts, K, N), device=grad_output.device, dtype=output_dtype)
+        stride_gbe, stride_gbk, stride_gbn = grad_weights.stride()
+
+    stride_gom, stride_gon = grad_output.stride()
+    stride_gosm, stride_gosn = go_scales.stride()
+    stride_am, stride_ak = inputs.stride()
+    stride_asm, stride_ask = input_scales.stride()
+
+    grid = lambda meta: (num_experts * triton.cdiv(N, meta["BLOCK_SIZE_N"]) * triton.cdiv(K, meta["BLOCK_SIZE_K"]),)
+    wrap_triton(_kernel_mxfp8_grouped_gemm_backward_dw)[grid](
+        grad_output,
+        inputs,
+        grad_weights,
+        expert_indices,
+        go_scales,
+        input_scales,
+        stride_gom, stride_gon,
+        stride_am, stride_ak,
+        stride_gbe, stride_gbn, stride_gbk,
+        stride_gosm, stride_gosn,
+        stride_asm, stride_ask,
+        M_TOTAL=M_total,
+        N=N,
+        K=K,
+        NUM_EXPERTS=num_experts,
+        GROUP_SIZE_M=ALIGN_SIZE_M,
+        USE_2DBLOCK_GO=use_2dblock_go,
+        USE_2DBLOCK_A=use_2dblock_x,
+        QUANT_BLOCK_SIZE=BLOCK_SIZE_DEFAULT,
+        LHS_FORMAT_ID=lhs_format_id,
+        RHS_FORMAT_ID=rhs_format_id,
+        USE_DOT_SCALED=use_dot_scaled,
+    )
+    return grad_weights
 
 
 # =============== Autograd Function ===============
 
 @torch.compiler.allow_in_graph
 class MXFP8GroupedGEMM(torch.autograd.Function):
-    """Autograd Function for mxfp8 grouped GEMM. To be implemented in Step 5."""
+    """Autograd Function for the V1 MXFP8 grouped GEMM path."""
 
     @staticmethod
     def forward(
@@ -164,8 +526,141 @@ def forward(
         fwd_format="e4m3",
         bwd_grad_format="e4m3",
     ):
-        raise NotImplementedError("Step 5: implement autograd forward")
+        original_dtype = inputs.dtype
+        fwd_format_id = _format_to_id(fwd_format)
+        bwd_grad_format_id = _format_to_id(bwd_grad_format)
+        quant_axis_w = -1 if trans_weights else -2
+        requant_axis_w = -2 if trans_weights else -1
+
+        inputs_mxfp8, input_scales = torch.ops.alto.convert_to_mxfp8(
+            inputs,
+            block_size=BLOCK_SIZE_DEFAULT,
+            mxfp_format=fwd_format,
+            axis=-1,
+            is_2d_block=use_2dblock_x,
+        )
+        expert_weights_mxfp8, expert_weight_scales = torch.ops.alto.convert_to_mxfp8(
+            expert_weights,
+            block_size=BLOCK_SIZE_DEFAULT,
+            mxfp_format=fwd_format,
+            axis=quant_axis_w,
+            is_2d_block=use_2dblock_w,
+        )
+
+        output = mxfp8_grouped_gemm_forward(
+            inputs_mxfp8,
+            expert_weights_mxfp8,
+            expert_indices,
+            input_scales,
+            expert_weight_scales,
+            trans_weights=trans_weights,
+            use_2dblock_x=use_2dblock_x,
+            use_2dblock_w=use_2dblock_w,
+            lhs_format_id=fwd_format_id,
+            rhs_format_id=fwd_format_id,
+            output_dtype=original_dtype,
+        )
+
+        if use_2dblock_w:
+            expert_weights_dgrad = expert_weights_mxfp8
+            expert_weight_dgrad_scales = expert_weight_scales
+        else:
+            expert_weights_dgrad, expert_weight_dgrad_scales = torch.ops.alto.convert_to_mxfp8(
+                expert_weights,
+                block_size=BLOCK_SIZE_DEFAULT,
+                mxfp_format=fwd_format,
+                axis=requant_axis_w,
+                is_2d_block=False,
+            )
+
+        if use_2dblock_x:
+            inputs_wgrad = inputs_mxfp8
+            input_wgrad_scales = input_scales
+        else:
+            inputs_wgrad, input_wgrad_scales = torch.ops.alto.convert_to_mxfp8(
+                inputs,
+                block_size=BLOCK_SIZE_DEFAULT,
+                mxfp_format=fwd_format,
+                axis=0,
+                is_2d_block=False,
+            )
+
+        ctx.save_for_backward(
+            inputs_wgrad,
+            input_wgrad_scales,
+            expert_weights_dgrad,
+            expert_weight_dgrad_scales,
+            expert_indices,
+        )
+        ctx.trans_weights = trans_weights
+        ctx.use_2dblock_x = use_2dblock_x
+        ctx.use_2dblock_w = use_2dblock_w
+        ctx.use_sr_grad = use_sr_grad
+        ctx.fwd_format_id = fwd_format_id
+        ctx.bwd_grad_format_id = bwd_grad_format_id
+        ctx.bwd_grad_format = bwd_grad_format
+        ctx.original_dtype = original_dtype
+        ctx.num_experts = expert_weights.shape[0]
+        return output
 
     @staticmethod
     def backward(ctx, grad_output):
-        raise NotImplementedError("Step 5: implement autograd backward")
+        inputs_wgrad, input_wgrad_scales, expert_weights_dgrad, expert_weight_dgrad_scales, expert_indices = ctx.saved_tensors
+
+        if ctx.use_2dblock_x:
+            grad_output_dgrad, grad_output_dgrad_scales = torch.ops.alto.convert_to_mxfp8(
+                grad_output,
+                block_size=BLOCK_SIZE_DEFAULT,
+                mxfp_format=ctx.bwd_grad_format,
+                axis=-1,
+                is_2d_block=True,
+                use_sr=ctx.use_sr_grad,
+            )
+            grad_output_wgrad = grad_output_dgrad
+            grad_output_wgrad_scales = grad_output_dgrad_scales
+        else:
+            grad_output_dgrad, grad_output_dgrad_scales = torch.ops.alto.convert_to_mxfp8(
+                grad_output,
+                block_size=BLOCK_SIZE_DEFAULT,
+                mxfp_format=ctx.bwd_grad_format,
+                axis=-1,
+                is_2d_block=False,
+                use_sr=ctx.use_sr_grad,
+            )
+            grad_output_wgrad, grad_output_wgrad_scales = torch.ops.alto.convert_to_mxfp8(
+                grad_output,
+                block_size=BLOCK_SIZE_DEFAULT,
+                mxfp_format=ctx.bwd_grad_format,
+                axis=0,
+                is_2d_block=False,
+                use_sr=ctx.use_sr_grad,
+            )
+
+        grad_inputs = torch.ops.alto.mxfp8_grouped_gemm_backward_inputs(
+            grad_output=grad_output_dgrad,
+            expert_weights=expert_weights_dgrad,
+            expert_indices=expert_indices,
+            go_scales=grad_output_dgrad_scales,
+            expert_weight_scales=expert_weight_dgrad_scales,
+            trans_weights=ctx.trans_weights,
+            use_2dblock_x=ctx.use_2dblock_x,
+            use_2dblock_w=ctx.use_2dblock_w,
+            lhs_format_id=ctx.bwd_grad_format_id,
+            rhs_format_id=ctx.fwd_format_id,
+            output_dtype=ctx.original_dtype,
+        )
+        grad_weights = torch.ops.alto.mxfp8_grouped_gemm_backward_weights(
+            grad_output=grad_output_wgrad,
+            inputs=inputs_wgrad,
+            expert_indices=expert_indices,
+            num_experts=ctx.num_experts,
+            go_scales=grad_output_wgrad_scales,
+            input_scales=input_wgrad_scales,
+            trans_weights=ctx.trans_weights,
+            use_2dblock_go=ctx.use_2dblock_x,
+            use_2dblock_x=ctx.use_2dblock_x,
+            lhs_format_id=ctx.bwd_grad_format_id,
+            rhs_format_id=ctx.fwd_format_id,
+            output_dtype=ctx.original_dtype,
+        )
+        return grad_inputs, grad_weights, None, None, None, None, None, None, None
diff --git a/tests/unittest/mxfp8/test_mxfp8_grouped_gemm_backward.py b/tests/unittest/mxfp8/test_mxfp8_grouped_gemm_backward.py
new file mode 100644
index 0000000..7ab40c8
--- /dev/null
+++ b/tests/unittest/mxfp8/test_mxfp8_grouped_gemm_backward.py
@@ -0,0 +1,222 @@
+# Copyright (c) 2026 Advanced Micro Devices, Inc.
+#
+# SPDX-License-Identifier: MIT
+
+import pytest
+import torch
+
+from alto.kernels.mxfp8.mxfp8_grouped_gemm import mxfp8_grouped_gemm
+from alto.kernels.mxfp8.mxfp8_grouped_gemm.autotune import ALIGN_SIZE_M
+from alto.kernels.mxfp8.mxfp8_grouped_gemm.cg_backward import (
+    mxfp8_grouped_gemm_backward_inputs,
+    mxfp8_grouped_gemm_backward_weights,
+)
+from alto.kernels.mxfp8.mxfp8_quantization import BLOCK_SIZE_DEFAULT
+
+from .utils import prepare_data, convert_from_mxfp8_pytorch
+
+
+def _cossim(x, y):
+    return torch.nn.functional.cosine_similarity(x.flatten().float(), y.flatten().float(), dim=0).item()
+
+
+def _make_indices(num_groups, num_experts, device):
+    indices = torch.empty(num_groups * ALIGN_SIZE_M, dtype=torch.int32, device=device)
+    for g in range(num_groups):
+        indices[g * ALIGN_SIZE_M:(g + 1) * ALIGN_SIZE_M] = g % num_experts
+    return indices
+
+
+def _reference(inputs, expert_weights, indices, trans_weights):
+    m_total = inputs.shape[0]
+    n_dim = expert_weights.shape[1] if trans_weights else expert_weights.shape[2]
+    out = torch.zeros((m_total, n_dim), dtype=inputs.dtype, device=inputs.device)
+    for start in range(0, m_total, ALIGN_SIZE_M):
+        expert_idx = indices[start].item()
+        weight = expert_weights[expert_idx].t() if trans_weights else expert_weights[expert_idx]
+        out[start:start + ALIGN_SIZE_M] = inputs[start:start + ALIGN_SIZE_M] @ weight
+    return out
+
+
+def _reference_dgrad(grad_output, expert_weights, indices, trans_weights):
+    m_total = grad_output.shape[0]
+    k_dim = expert_weights.shape[2] if trans_weights else expert_weights.shape[1]
+    grad_inputs = torch.zeros((m_total, k_dim), dtype=grad_output.dtype, device=grad_output.device)
+    for start in range(0, m_total, ALIGN_SIZE_M):
+        end = start + ALIGN_SIZE_M
+        expert_idx = indices[start].item()
+        weight = expert_weights[expert_idx] if trans_weights else expert_weights[expert_idx].t()
+        grad_inputs[start:end] = grad_output[start:end] @ weight
+    return grad_inputs
+
+
+def _reference_wgrad(grad_output, inputs, indices, num_experts, trans_weights):
+    n_dim = grad_output.shape[1]
+    k_dim = inputs.shape[1]
+    grad_weights = torch.zeros(
+        (num_experts, n_dim, k_dim) if trans_weights else (num_experts, k_dim, n_dim),
+        dtype=grad_output.dtype,
+        device=grad_output.device,
+    )
+    for start in range(0, inputs.shape[0], ALIGN_SIZE_M):
+        end = start + ALIGN_SIZE_M
+        expert_idx = indices[start].item()
+        if trans_weights:
+            grad_weights[expert_idx] += grad_output[start:end].t() @ inputs[start:end]
+        else:
+            grad_weights[expert_idx] += inputs[start:end].t() @ grad_output[start:end]
+    return grad_weights
+
+
+@pytest.mark.parametrize("trans_weights", [True, False])
+@pytest.mark.parametrize("use_2dblock_go", [False, True])
+@pytest.mark.parametrize("use_2dblock_w", [False, True])
+def test_backward_inputs_matches_dequant_reference(trans_weights, use_2dblock_go, use_2dblock_w):
+    m_total, n_dim, k_dim, num_experts = 384, 128, 128, 2
+    dtype = torch.bfloat16
+
+    grad_output = prepare_data((m_total, n_dim), dtype)
+    weight_shape = (num_experts, n_dim, k_dim) if trans_weights else (num_experts, k_dim, n_dim)
+    expert_weights = prepare_data(weight_shape, dtype)
+    expert_indices = _make_indices(m_total // ALIGN_SIZE_M, num_experts, torch.device("cuda"))
+
+    weight_axis = (-1 if trans_weights else -2) if use_2dblock_w else (-2 if trans_weights else -1)
+    grad_output_lp, grad_output_scales = torch.ops.alto.convert_to_mxfp8(
+        grad_output,
+        block_size=BLOCK_SIZE_DEFAULT,
+        mxfp_format="e4m3",
+        axis=-1,
+        is_2d_block=use_2dblock_go,
+    )
+    expert_weights_lp, expert_weight_scales = torch.ops.alto.convert_to_mxfp8(
+        expert_weights,
+        block_size=BLOCK_SIZE_DEFAULT,
+        mxfp_format="e4m3",
+        axis=weight_axis,
+        is_2d_block=use_2dblock_w,
+    )
+
+    grad_inputs = mxfp8_grouped_gemm_backward_inputs(
+        grad_output_lp,
+        expert_weights_lp,
+        expert_indices,
+        grad_output_scales,
+        expert_weight_scales,
+        trans_weights=trans_weights,
+        use_2dblock_x=use_2dblock_go,
+        use_2dblock_w=use_2dblock_w,
+        output_dtype=dtype,
+    )
+
+    grad_output_dq = convert_from_mxfp8_pytorch(
+        grad_output_lp, grad_output_scales, torch.float32, BLOCK_SIZE_DEFAULT, -1, use_2dblock_go)
+    expert_weights_dq = convert_from_mxfp8_pytorch(
+        expert_weights_lp, expert_weight_scales, torch.float32, BLOCK_SIZE_DEFAULT, weight_axis, use_2dblock_w)
+    grad_inputs_ref = _reference_dgrad(grad_output_dq, expert_weights_dq, expert_indices, trans_weights)
+
+    cos = _cossim(grad_inputs, grad_inputs_ref)
+    assert cos > 0.999, \
+        f"dgrad kernel vs dequant-matmul cos-sim too low: {cos} (2d_go={use_2dblock_go}, 2d_w={use_2dblock_w})"
+
+
+@pytest.mark.parametrize("trans_weights", [True, False])
+@pytest.mark.parametrize("use_2dblock_x", [False, True])
+def test_backward_weights_matches_dequant_reference(trans_weights, use_2dblock_x):
+    m_total, n_dim, k_dim, num_experts = 384, 128, 128, 2
+    dtype = torch.bfloat16
+
+    inputs = prepare_data((m_total, k_dim), dtype)
+    grad_output = prepare_data((m_total, n_dim), dtype)
+    expert_indices = _make_indices(m_total // ALIGN_SIZE_M, num_experts, torch.device("cuda"))
+
+    grad_axis = -1 if use_2dblock_x else 0
+    input_axis = -1 if use_2dblock_x else 0
+    grad_output_lp, grad_output_scales = torch.ops.alto.convert_to_mxfp8(
+        grad_output,
+        block_size=BLOCK_SIZE_DEFAULT,
+        mxfp_format="e4m3",
+        axis=grad_axis,
+        is_2d_block=use_2dblock_x,
+    )
+    inputs_lp, input_scales = torch.ops.alto.convert_to_mxfp8(
+        inputs,
+        block_size=BLOCK_SIZE_DEFAULT,
+        mxfp_format="e4m3",
+        axis=input_axis,
+        is_2d_block=use_2dblock_x,
+    )
+
+    grad_weights = mxfp8_grouped_gemm_backward_weights(
+        grad_output_lp,
+        inputs_lp,
+        expert_indices,
+        num_experts,
+        grad_output_scales,
+        input_scales,
+        trans_weights=trans_weights,
+        use_2dblock_go=use_2dblock_x,
+        use_2dblock_x=use_2dblock_x,
+        output_dtype=dtype,
+    )
+
+    grad_output_dq = convert_from_mxfp8_pytorch(
+        grad_output_lp, grad_output_scales, torch.float32, BLOCK_SIZE_DEFAULT, grad_axis, use_2dblock_x)
+    inputs_dq = convert_from_mxfp8_pytorch(
+        inputs_lp, input_scales, torch.float32, BLOCK_SIZE_DEFAULT, input_axis, use_2dblock_x)
+    grad_weights_ref = _reference_wgrad(grad_output_dq, inputs_dq, expert_indices, num_experts, trans_weights)
+
+    cos = _cossim(grad_weights, grad_weights_ref)
+    assert cos > 0.999, f"wgrad kernel vs dequant-matmul cos-sim too low: {cos} (2d_x={use_2dblock_x})"
+
+
+def _run_autograd_case(trans_weights, use_2dblock_x, use_2dblock_w, shape=(256, 128, 128, 2)):
+    m_total, n_dim, k_dim, num_experts = shape
+    device = torch.device("cuda")
+    dtype = torch.bfloat16
+
+    inputs = prepare_data((m_total, k_dim), dtype).requires_grad_(True)
+    weight_shape = (num_experts, n_dim, k_dim) if trans_weights else (num_experts, k_dim, n_dim)
+    expert_weights = prepare_data(weight_shape, dtype).requires_grad_(True)
+    expert_indices = _make_indices(m_total // ALIGN_SIZE_M, num_experts, device)
+    target = prepare_data((m_total, n_dim), dtype)
+
+    outputs_ref = _reference(inputs, expert_weights, expert_indices, trans_weights)
+    torch.nn.functional.mse_loss(outputs_ref, target).backward()
+    grad_inputs_ref = inputs.grad.detach().clone()
+    grad_weights_ref = expert_weights.grad.detach().clone()
+    inputs.grad.zero_()
+    expert_weights.grad.zero_()
+
+    outputs = mxfp8_grouped_gemm(
+        inputs,
+        expert_weights,
+        expert_indices,
+        trans_weights=trans_weights,
+        use_2dblock_x=use_2dblock_x,
+        use_2dblock_w=use_2dblock_w,
+        use_sr_grad=False,
+    )
+    torch.nn.functional.mse_loss(outputs, target).backward()
+
+    assert _cossim(outputs, outputs_ref) > 0.99
+    assert _cossim(inputs.grad, grad_inputs_ref) > 0.99
+    assert _cossim(expert_weights.grad, grad_weights_ref) > 0.99
+
+
+@pytest.mark.parametrize("use_2dblock_x", [False, True])
+@pytest.mark.parametrize("use_2dblock_w", [False, True])
+def test_mxfp8_grouped_gemm_autograd(use_2dblock_x, use_2dblock_w):
+    _run_autograd_case(
+        trans_weights=True,
+        use_2dblock_x=use_2dblock_x,
+        use_2dblock_w=use_2dblock_w,
+    )
+
+
+def test_mxfp8_grouped_gemm_autograd_trans_weights_false():
+    _run_autograd_case(
+        trans_weights=False,
+        use_2dblock_x=False,
+        use_2dblock_w=False,
+        shape=(256, 256, 128, 2),
+    )

From caccfc1a2c774f03147348c9d210f2bff6ed448d Mon Sep 17 00:00:00 2001
From: Yue Sun <Yue.Sun2@amd.com>
Date: Wed, 10 Jun 2026 03:32:38 +0000
Subject: [PATCH 08/12] mxfp8: add SNR test gates and use_dot_scaled coverage
 for grouped GEMM

---
 alto/kernels/mxfp8/MXFP8_GROUPED_GEMM_PLAN.md | 140 ++++++++++++++++--
 .../mxfp8/mxfp8_grouped_gemm/cg_backward.py   |   5 +-
 .../mxfp8/test_mxfp8_grouped_gemm_backward.py |  67 ++++++++-
 .../mxfp8/test_mxfp8_grouped_gemm_forward.py  |  87 +++++++++++
 4 files changed, 275 insertions(+), 24 deletions(-)

diff --git a/alto/kernels/mxfp8/MXFP8_GROUPED_GEMM_PLAN.md b/alto/kernels/mxfp8/MXFP8_GROUPED_GEMM_PLAN.md
index 93289d0..30e636f 100644
--- a/alto/kernels/mxfp8/MXFP8_GROUPED_GEMM_PLAN.md
+++ b/alto/kernels/mxfp8/MXFP8_GROUPED_GEMM_PLAN.md
@@ -142,9 +142,14 @@ functional.py        # 暴露顶层入口
 
 **完成情况**：
 - `cg_forward.py` kernel body + wrapper 已实现。fp8 load 用 `other=0.0`；CDNA3 dequant 路径 `_dequantize_fp8` 一律传 `IS_2D_BLOCK=False`（scale 偏移已在 kernel 内展开为逐行 `[BLOCK, n_rep_k]`，与参考 `blockwise_mxfp8_gemm_kernel` 一致）。
-- wrapper 已补最小输入契约检查：`inputs`/`expert_weights` 维度、`expert_indices.numel() == M_total`、`K % 32 == 0`、2D weight scale 的 `N % 32 == 0`、以及 input/weight scale shape 精确匹配。V1 仍不支持 padded buffer；如需支持，应像 mxfp4/nvfp4 一样显式区分 `M_bufferlen` 与 `M_total`。
-- 测试 `tests/unittest/mxfp8/test_mxfp8_grouped_gemm_forward.py`：2 shapes × 2D-block-x × 2D-block-w × `trans_weights` 共 16 个正例全过；另有 2 个负例覆盖 `expert_indices` 长度不匹配和 `weight_scales` shape 错误，共 **18 passed**。主校验用 **dequant-then-matmul reference**（隔离 kernel 移植正确性 vs mxfp8 量化误差），cos-sim > 0.999；另加 bf16 宽松 sanity（> 0.99）。
-- 2026-06-02 在 `friendly_elgamal` 容器中验证：`is_cdna4()=True`，forward 默认走 **CDNA4 `tl.dot_scaled`** 路径，`python -m pytest tests/unittest/mxfp8/test_mxfp8_grouped_gemm_forward.py -q` 结果为 `18 passed, 14 warnings in 12.05s`。CDNA3 fallback 路径仍需在 CDNA3/MI300 上单独复验。
+- wrapper 已补最小输入契约检查：`inputs`/`expert_weights` 维度、`expert_indices.numel() == M_total`、`M_total` 按 `ALIGN_SIZE_M` 对齐、`K % 32 == 0`、2D weight scale 的 `N % 32 == 0`、以及 input/weight scale shape 精确匹配。V1 仍不支持 padded buffer；如需支持，应像 mxfp4/nvfp4 一样显式区分 `M_bufferlen` 与 `M_total`。
+- wrapper 暴露了 `use_dot_scaled: Optional[bool] = None` 开关：`None` 时按设备自动选择（CDNA4→`tl.dot_scaled`，否则 dequant fallback），显式传 `False` 可在任意设备上强制走 CDNA3 dequant 路径用于测试。
+- 测试 `tests/unittest/mxfp8/test_mxfp8_grouped_gemm_forward.py` 共 **21 个**，主校验全部用 **dequant-then-matmul reference**（隔离 kernel 移植正确性 vs mxfp8 量化误差），cos-sim > 0.999 且 **SNR > 40 dB**（SNR 抓 cos-sim 看不到的幅度错误，如漏 scale / 错累加器），另加 bf16 宽松 sanity（> 0.99）：
+  - `test_forward`：2 shapes × 2D-block-x × 2D-block-w × `trans_weights` = 16 个正例。
+  - `test_forward_dequant_fallback_matches_dot_scaled`：强制 `use_dot_scaled=False`，无论运行设备都覆盖 `_dequantize_fp8 → tl.dot` 分支（真实 MI300 ground-truth 仍待 Step 7）。
+  - `test_forward_single_expert_matches_mxfp8_linear`：全部 token 路由到单 expert，与 `mxfp8_linear._to_mxfp8_then_scaled_mm` 交叉校验（SNR > 30 dB）。这是独立交叉验证——linear 路径用自己的 autograd Function 量化，能抓到 dequant-matmul reference 抓不到的量化 bug（后者与测试共用同一份 `convert_to_mxfp8` 输出，量化 bug 会两边同时错而仍通过）。
+  - 3 个负例：`expert_indices` 长度不匹配、`M_total` 未对齐 `ALIGN_SIZE_M`、`weight_scales` shape 错误。
+- 2026-06-02 在 `friendly_elgamal` 容器中验证：`is_cdna4()=True`，forward 默认走 **CDNA4 `tl.dot_scaled`** 路径。CDNA3 dequant 分支已由 `use_dot_scaled=False` 测试在 CI 中强制覆盖；真实 CDNA3/MI300 硬件 ground-truth 复验仍属 Step 7。
 
 ### Step 3 — Backward dgrad kernel ✅ 已完成
 基于 `mxfp4/cg_backward.py` 的 `_kernel_mxfp4_grouped_gemm_backward_dx`：
@@ -155,7 +160,7 @@ functional.py        # 暴露顶层入口
 **完成情况**：
 - `autotune.py` 新增 `DGRAD_CONFIGS`：`BSM=128, BSN=32, BSK=32`，让 dgrad 的 N reduction 每次 `dot_scaled` 只覆盖一个 32-wide MX scale group。
 - `_kernel_mxfp8_grouped_gemm_backward_dx` 已实现：按 `(M, K)` tile 计算 `dX = GO @ W`，删除 mxfp4 packing，支持 `USE_DOT_SCALED=True` 的 CDNA4 `tl.dot_scaled` 路径，以及 `USE_DOT_SCALED=False` 的 CDNA3 dequant + `tl.dot` fallback。
-- wrapper `mxfp8_grouped_gemm_backward_inputs` 已补最小输入契约检查：`M_total` 按 `ALIGN_SIZE_M=128` 对齐、`expert_indices.numel() == M_total`、`N/K` 可被 32 整除、GO/W scale shape 精确匹配，并支持 `trans_weights=True/False`。
+- wrapper `mxfp8_grouped_gemm_backward_inputs` 已补最小输入契约检查：`M_total` 按 `ALIGN_SIZE_M=128` 对齐、`expert_indices.numel() == M_total`、`N/K` 可被 32 整除、GO/W scale shape 精确匹配，并支持 `trans_weights=True/False`；同样暴露 `use_dot_scaled: Optional[bool] = None`（CDNA4 自动用 `tl.dot_scaled`，显式 `False` 强制 dequant fallback）。
 
 ### Step 4 — Backward wgrad kernel ✅ 已完成
 基于 `_kernel_mxfp4_grouped_gemm_backward_dw`：
@@ -167,7 +172,7 @@ functional.py        # 暴露顶层入口
 **完成情况**：
 - `autotune.py` 新增 `WGRAD_CONFIGS`：`BSM=32, BSN=128, BSK=32`，让 wgrad 的 M reduction 每次 `dot_scaled` 只覆盖一个 32-wide MX scale group。
 - `_kernel_mxfp8_grouped_gemm_backward_dw` 已实现：按 `(expert, N, K)` tile 计算 `dW = GO^T @ X`，保留 mxfp4 的简单调度方式：每个 expert tile 遍历所有 contiguous routing group，只累加匹配 expert 的 group。
-- wrapper `mxfp8_grouped_gemm_backward_weights` 已补最小输入契约检查：`M_total` 按 `ALIGN_SIZE_M=128` 对齐、`expert_indices.numel() == M_total`、`N/K` 可被 32 整除、GO/X scale shape 精确匹配，并支持 `trans_weights=True/False`。
+- wrapper `mxfp8_grouped_gemm_backward_weights` 已补最小输入契约检查：`M_total` 按 `ALIGN_SIZE_M=128` 对齐、`expert_indices.numel() == M_total`、`N/K` 可被 32 整除、GO/X scale shape 精确匹配，并支持 `trans_weights=True/False`；同样暴露 `use_dot_scaled: Optional[bool] = None`（CDNA4 自动用 `tl.dot_scaled`，显式 `False` 强制 dequant fallback）。
 
 ### Step 5 — Autograd Function ✅ 已完成
 参考 `MXFP4GroupedGEMM`：
@@ -185,14 +190,24 @@ functional.py        # 暴露顶层入口
 
 ### Step 6 — 数值正确性测试 ⏳ 部分完成
 测试位置在 `tests/unittest/mxfp8/`：
-1. `test_mxfp8_grouped_gemm_forward.py`：单 expert + 多 expert，dequant-then-matmul reference 对齐，并保留 bf16 sanity。
-2. `test_mxfp8_grouped_gemm_backward.py`：用「mxfp8 模拟版」reference（先 `convert_to_mxfp8`，再 `convert_from_mxfp8` 回 fp32，然后走 PyTorch 原生 GEMM）校验 dgrad/wgrad；另覆盖 autograd 端到端 forward + backward。
-3. `test_e2e_moe.py`：toy MoE layer + optimizer step 仍未实现。
+1. `test_mxfp8_grouped_gemm_forward.py`（**21 tests**）：见 Step 2 清单。
+2. `test_mxfp8_grouped_gemm_backward.py`（**31 tests**）：用 dequant-then-matmul reference（先 `convert_to_mxfp8`，再用 `convert_from_mxfp8_pytorch` 回 fp32，然后走 PyTorch 原生 GEMM）校验 dgrad/wgrad，并加 cos-sim > 0.999 + SNR > 40 dB 双重门槛；另覆盖 autograd 端到端 forward + backward。详见下方覆盖清单。
+3. `repro_mxfp8_dot_scaled.py` + `repro_mxfp8_dot_scaled.md`（见 §5 风险 2）：独立复现脚本，用一个最小 `tl.dot_scaled` kernel 对比 `convert_from_mxfp8(a) @ convert_from_mxfp8(b)`，验证「单次 `dot_scaled` 跨多个 32-wide scale group 会发散」这一约束。四个 case：`DOT_K=32` baseline、`DOT_K=32` over K=128 safe path、`DOT_K=64`（跨 2 group）与 `DOT_K=128`（跨 4 group）problem path，输入用 outlier-heavy 的 32-wide K block 放大 scale 差异，打印 `max_diff`/`mean_diff`/`relative_max_diff` 与 problem-vs-safe 的 mean_diff 比值。需在 CDNA4 环境手动运行（非 pytest 自动收集）。
+4. `test_e2e_moe.py`：toy MoE layer + optimizer step 仍未实现，设计与跟进见 §7。
+
+**backward 测试覆盖清单**：
+- `test_backward_inputs_matches_dequant_reference`：dgrad，`trans_weights` × 2D GO × 2D W × `use_dot_scaled∈{None, False}` = 16 个正例。
+- `test_backward_weights_matches_dequant_reference`：wgrad，`trans_weights` × 2D X × `use_dot_scaled∈{None, False}` = 8 个正例。
+- `test_mxfp8_grouped_gemm_autograd`：autograd 端到端，`trans_weights=True` 下 2D X/W 四种组合。
+- `test_mxfp8_grouped_gemm_autograd_trans_weights_false`：`trans_weights=False` 的 1D 路径（shape `(384,256,128,2)`）。
+- `test_backward_wrappers_reject_non_aligned_mtotal`：两个 backward wrapper 在 `M_total` 未对齐 `ALIGN_SIZE_M` 时 fail-fast 的负例。
+- `test_autograd_many_experts_with_empty_expert`：experts(8) > groups(2)，部分 expert 收到零 token，校验空 expert 的 `dW` 严格为 0、被路由 expert 的 `dW` 非零、且全程梯度有限，覆盖 wgrad「扫所有 group 判等」调度的空 expert 分支。
+- 其中 `use_dot_scaled=False` 的参数化在 CI 中强制覆盖了 CDNA3 `_dequantize_fp8 → tl.dot` fallback（任意设备可跑）。
 
 **验证记录**：
-- 2026-06-05 在 `cranky_shockley` 容器（`wanghanthu/torchtitan:ubuntu22.04-pytorch2.12.0dev20260217-rocm7.2-patch`）中验证 backward：`python -m pytest tests/unittest/mxfp8/test_mxfp8_grouped_gemm_backward.py -q` 结果为 **17 passed, 14 warnings in 20.64s**。
-- backward 覆盖：dgrad `trans_weights` × 2D GO × 2D W 共 8 个正例；wgrad `trans_weights` × 2D X 共 4 个正例；autograd 覆盖 `trans_weights=True` 下 2D X/W 四种组合，以及 `trans_weights=False` 的 1D 路径。
-- 仍需补 toy MoE 训练 sanity，并在 CDNA3/MI300 上单独验证 `USE_DOT_SCALED=False` fallback。
+- 2026-06-05 在 `cranky_shockley` 容器（`wanghanthu/torchtitan:ubuntu22.04-pytorch2.12.0dev20260217-rocm7.2-patch`）中验证 backward，彼时 **17 passed**；之后扩充 `use_dot_scaled` 参数化与空 expert / 对齐负例。
+- 2026-06-09 在 MI300X（CDNA3）上重跑 forward + backward：`python -m pytest tests/unittest/mxfp8/test_mxfp8_grouped_gemm_forward.py tests/unittest/mxfp8/test_mxfp8_grouped_gemm_backward.py -q` 结果为 **52 passed（21 forward + 31 backward）, 14 warnings in 20.74s**。
+- 仍需补 toy MoE 训练 sanity；`use_dot_scaled=False` fallback 已在 CI 强制覆盖，真实 CDNA4 `tl.dot_scaled` 路径仍需在 CDNA4 硬件上单独复验（本次为 CDNA3 机器）。
 
 ### Step 7 — MI300 fallback 验证
 仅切 `USE_DOT_SCALED=False` 路径重跑 Step 6，确保 CDNA3 上数值与 CDNA4 一致（dequant + fp32 dot 是 ground truth）。
@@ -200,6 +215,55 @@ functional.py        # 暴露顶层入口
 ### Step 8 — 接 GPT-OSS（不在最小版本范围）
 预留接口：`mxfp8_grouped_gemm` 签名要能直接替换现有 MoE forward 中的 grouped GEMM 调用。具体集成视 GPT-OSS 训练栈 PR 时再做。
 
+> 定位提醒：§7 的 toy-moe-test 是**算子级前置闸门**（验证 grouped GEMM 这一个算子放进训练循环不发散），**不是模型级前置验证**——它不覆盖 router/gating 梯度、激活/多层误差耦合、以及进入「几百到几千步」危险区后的行为。即「台架通过」只代表算子可以装车，整车（GPT-OSS）在真实路况下的数值风险仍需接上后另测。
+
+#### 现状判断：算子核心已就绪，但**还不能直接接 GPT-OSS**
+
+V1「最小可用 kernel」在**算子数值正确性**这层基本达标（三 pass 数值对、autograd 通、toy 训练 100 步不发散）。但「支持 GPT-OSS training」卡在算子与真实 MoE 之间的**接口契约**与若干前置验证上。下列待办按接入优先级排：
+
+- [ ] **【阻塞 · 头号】offsets 入口 + padded buffer 支持，对齐 mxfp4/nvfp4。**（方案已批准，待实施，见 §8.1）
+  现状（`cg_forward.py:247-249` 等）：入口要求 `M_total % 128 == 0`、每 128 token 整块同一 expert、`expert_indices.numel() == M_total`，且 forward 注释明确「V1 不支持 padded buffer」。
+  真实 MoE（含 GPT-OSS）路由后每个 expert 的 token 数是**动态、不等、不保证 128 对齐**的 → 现状下 GPT-OSS 给不出 mxfp8 能吃的输入。
+  参照：mxfp4 用户传 `offs`（累积 offset，如 `[128,128,256,...]`），内部 `create_indices_from_offsets_nosync`（`mxfp4/.../functional.py:23`）转 indices；nvfp4 另有 `test_nvfp4_grouped_gemm_accepts_padded_buffer` 覆盖 `M_bufferlen > 实际 token 数` 的补零场景。mxfp8 需补同款 `offsets` 入口 + `M_bufferlen` vs `M_total` 区分。
+
+- [ ] **【阻塞】CDNA4 真机验证默认 `tl.dot_scaled` 路径。**
+  GPT-OSS 真训大概率在 MI350(CDNA4) 走默认 `tl.dot_scaled`，而该路径至今未在真机验证（既有 52+2 用例均在 MI300X/CDNA3 跑 dequant fallback）。接入前必须先在 CDNA4 容器重跑确认数值正确（与 §6 open item 合并）。
+
+- [ ] **【高 · 接入前评估】e5m2 混合格式可能是前提，而非 v2 优化。**
+  §0 自述全 e4m3「通常几百到几千步发散」，而 toy test 只跑 100 步、单层；GPT-OSS 是多层 + 几千步，很可能踩进发散区。e5m2 通道代码已预留但**从未启用/测过**。建议接入前先评估是否必须先开 e5m2，避免训崩后回头。
+
+- [ ] **【中】性能可用性验证。**
+  wgrad 为「每 tile 扫所有 group」的 O(experts) 朴素实现、`BLOCK_SIZE_K=32`、无 autotune（§4 划线项）。能跑通 ≠ 训得起，GPT-OSS 规模下需实测吞吐，必要时提前做 split-K / autotune（原列为 v2）。
+
+- [ ] **【中】接入形态对齐。**
+  `mxfp8_grouped_gemm` 签名要能直接替换 GPT-OSS MoE forward 中的 grouped GEMM 调用；具体集成视 GPT-OSS 训练栈 PR 时再定。
+
+一句话：**发动机（算子）基本造好，但接进整车（GPT-OSS）所需的传动接口（offsets/padding）、CDNA4 路试、以及很可能必需的 e5m2，都还没做。** 最小可用 kernel ≈ 70% 到位，差的恰是「接真实模型」这一段。
+
+### 8.1 offsets 入口 + padded buffer 实施方案（已批准 · 待实施）
+
+> 状态：方案已评审通过，**代码尚未动手**。本节是落地蓝图，实施时按此执行并回填结果。
+
+**核心发现：三个 Triton kernel 无需改动。** 它们已用 `M_TOTAL` 作为迭代上界并 `offs_m < M_TOTAL` 掩码；输出张量是独立 `torch.zeros` 分配，padding 行从不被写入、天然保持 0。buffer-vs-routed 的混淆**只存在于 Python wrapper**。两个长度的定义：
+- **M_total** = 路由 token 数 = `expert_indices.numel()` = `offs[-1]`，必须 128 对齐（GPT-OSS 把每 expert 的 token 数向上 padding 到 128）。
+- **M_bufferlen** = `inputs.shape[0]` = 实际激活 buffer 长度，尾部可能有超出路由范围的 padding 行。
+
+**改动清单：**
+
+1. **`cg_forward.py` wrapper**（kernel 不动）：`M_bufferlen, K = inputs.shape`、`M_total = expert_indices.numel()`；保留 `M_total % ALIGN_SIZE_M == 0`，删除现已恒真的 `numel == M_total` 等值检查，改加 `M_bufferlen >= M_total` 与 `M_bufferlen % 32 == 0`；`output`/`expected_input_scales` 按 **M_bufferlen** 尺寸；kernel 仍传 `M_TOTAL=M_total`（路由长度）。
+
+2. **`cg_backward.py` 两个 wrapper**（kernel 不动）：dgrad/wgrad 同样拆 `M_bufferlen`(=`grad_output.shape[0]`) vs `M_total`(=`expert_indices.numel()`)；`grad_inputs` 按 bufferlen 分配；scale shape 按 bufferlen；grid 用 M_total。wgrad kernel 只遍历 `M_TOTAL // GROUP_SIZE_M` 个路由 group → padding 行永不累加进 dW。
+
+3. **`functional.py` 新增 dispatch 入口**（对齐 GPT-OSS 约定，零改动 drop-in）：`_quantize_then_mxfp8_scaled_grouped_mm(A, B, offs, *, use_2dblock_x=False, use_2dblock_w=False, use_sr_grad=False, fwd_format="e4m3", bwd_grad_format="e4m3")`。`B` 以 dispatch 布局 `[E,K,N]` 传入，入口内 `B.transpose(-2,-1).contiguous()` 转成 canonical `[E,N,K]`（仿 nvfp4 `functional.py:149`），再以 `trans_weights=True` 调 `MXFP8GroupedGEMM.apply`；`offs` 经 `create_indices_from_offsets_nosync`（`alto/kernels/dsgemm_utils.py`）转 indices。`__init__.py` 导出该入口。
+
+4. **`MXFP8GroupedGEMM` autograd**：结构无需改（已用 bufferlen 张量 + ctx 透传 indices）；仅需确认 M 轴量化 `convert_to_mxfp8(..., axis=0)` 在 bufferlen buffer 上成立（要求 `M_bufferlen % 32 == 0`，由新断言保证）。
+
+5. **测试**（`test_mxfp8_grouped_gemm_backward.py`）：新增 `test_mxfp8_grouped_gemm_accepts_padded_buffer`，采 **padded-vs-unpadded 自比**（最强校验，闭环证明 padding 零干扰）。**关键：固定 `use_sr_grad=False`** 使量化确定性，否则随机舍入令 `torch.equal` 偶发失败；routed 行两次跑应逐位相等。断言：`y_pad.shape==(M_bufferlen,N)`、`y_pad[M_routed:]` 全 0、`y_pad[:M_routed]==y_ref`、`inputs_pad.grad[M_routed:]` 全 0、`inputs_pad.grad[:M_routed]==inputs_ref.grad`、`w_pad.grad==w_ref.grad`；另加 offsets 入口 smoke test。
+
+**验证**：现有 54 用例不回归 + 新 padded-buffer 测试通过；额外确认「若 wrapper 仍按 `[M_total,N]` 分配则新测试会失败」以证明确实触发了 padding 路径。本改动 device-agnostic（只动 wrapper/入口），CDNA4 路试仍为独立 open item。
+
+**完成后**：勾选 §8 头号 item，并在 §3 相应 Step 回填新入口 `_quantize_then_mxfp8_scaled_grouped_mm` 与 padded-buffer 测试。
+
 ---
 
 ## 4. 不做的事（明确划线）
@@ -225,7 +289,7 @@ functional.py        # 暴露顶层入口
 | 风险 | 对策 |
 |---|---|
 | `tl.dot_scaled` 在 CDNA4 上 e5m2 × e4m3 混合 dtype 行为未验证 | Step 3 先单独写一个 toy 测试验证 mixed-dtype dot_scaled 输出，再嵌入 grouped GEMM |
-| `BLOCK_SIZE_K=32` 单 group 太小，K 累加 overhead 大 | 先确认正确性，再尝试 64/128 评估数值偏差是否可接受 |
+| `BLOCK_SIZE_K=32` 单 group 太小，K 累加 overhead 大 | 先确认正确性，再尝试 64/128 评估数值偏差是否可接受。`tests/unittest/mxfp8/repro_mxfp8_dot_scaled.py` 已量化「单次 `dot_scaled` 跨多个 32-wide group」的精度损失（DOT_K=32/64/128 对比），佐证保守取 32 |
 | wgrad kernel 在 expert 数多时性能差（每个 tile 扫所有 group） | v1 接受；v2 改 split-K + 按 expert grouping 调度 |
 | GPT-OSS 实际 token 路由可能不满足 GROUP_SIZE_M=128 对齐 | 上游 padding（已是 mxfp4 路径假设），不在 kernel 内处理 |
 
@@ -233,7 +297,51 @@ functional.py        # 暴露顶层入口
 
 ## 6. 验收标准（v1 完成定义）
 
-1. fwd / dgrad / wgrad 三个 kernel 在 CDNA4 与 CDNA3 上都能跑通
-2. 数值对齐 bf16 reference：fwd cos-sim > 0.999，bwd cos-sim > 0.995
-3. toy MoE 训练 100 steps loss 单调下降，与 bf16 baseline 同形
-4. 单元测试覆盖 1D/2D block × CDNA3/CDNA4 各组合（V1 全 e4m3；e5m2 分支留待 v2）
+| # | 验收标准 | 状态 | 说明 |
+|---|---|---|---|
+| 1 | fwd / dgrad / wgrad 三个 kernel 在 CDNA4 与 CDNA3 上都能跑通 | ⏳ 部分 | CDNA3 ✅（2026-06-09 MI300X 52 passed）；CDNA4 默认 `tl.dot_scaled` 路径**真机未验证**，见下方 open item |
+| 2 | 数值对齐 bf16 reference：fwd cos-sim > 0.999，bwd cos-sim > 0.995 | ✅ | 测试用 cos-sim > 0.999 + SNR > 40 dB 双门槛卡住（比标准更严） |
+| 3 | 端到端 autograd 梯度对齐 | ✅ | `test_mxfp8_grouped_gemm_autograd*` 单步 forward+backward，dX/dW vs bf16 reference cos-sim > 0.99。**这与同目录 mxfp4 / nvfp4 的端到端标准一致**——两者也止步于单步梯度对齐，均未做训练 loop |
+| 4 | 单元测试覆盖 1D/2D block × CDNA3/CDNA4 各组合（V1 全 e4m3；e5m2 分支留待 v2） | ⏳ 部分 | 1D/2D ✅、CDNA3 ✅；CDNA4 同标准 1 |
+
+**收口说明**：标准 3 原文为「toy MoE 训练 100 steps loss 单调下降」。对齐 mxfp4 / nvfp4 的既有验收口径后，**V1 把端到端硬验收降级为单步 autograd 梯度对齐（已达成）**；toy MoE 训练 loop 作为更高一档的 sanity，单列于 §7 记录与跟进，不阻塞 V1 完成定义。
+
+**V1 剩余 open items**：
+- **CDNA4 真机验证**（标准 1、4）：当前手头为 MI300X(CDNA3)，CDNA4 默认 `tl.dot_scaled` 路径需在 CDNA4（如 MI350）容器上重跑现有 52 个用例确认数值正确。纯运行、不改代码。
+- **toy MoE 训练 sanity**（§7）：验证 §0「全 e4m3 是否够用、会不会几百步发散」这一核心假设；mxfp4/nvfp4 未做，属 mxfp8 主动加严项。
+
+---
+
+## 7. toy MoE 训练 sanity（§0 假设的端到端验证）
+
+### 7.1 动机
+§0 的核心假设是「V1 全 e4m3 在 toy MoE 上够用，不会几百步就发散」。单步 autograd 梯度对齐（§6 标准 3）只能证明**一次** fwd+bwd 的数值正确，无法暴露**累积效应**——underflow 的小尾部、spike 的 overflow 要在多步训练里才会把 loss 推离 bf16 baseline。本章的训练 loop 是这一假设唯一的实测手段，也是日后决定「是否需要升级 v2 混合格式」的判据。
+
+> ⚠️ **重要限定：这个 "toy MoE" 不是真正的 MoE 模型，而是一个"单层 grouped GEMM 拟合任务"。** 它的全部结构是 `num_experts` 个权重矩阵 `W_e [N, K]`，前向 `Y = X @ W[expert]^T`、损失 `MSE(Y, target)`、手写 SGD 更新 `W`。它**刻意不含**真实 MoE 的关键部件：可学习 router / gating（路由是固定的 `g % num_experts`，不参与训练）、top-k 选择与负载均衡 loss、多层堆叠、激活函数、残差、真实输入分布（输入是随机高斯 + 0.5% 离群点）。
+>
+> 因此本测试**只验证一件事**：把 `mxfp8_grouped_gemm` 放进一个反复更新权重的迭代循环里，e4m3 量化误差累积 100 步**不会让 grouped GEMM 训不动 / 发散**——即 §0 假设的最小实测。它**不能**被解读为「mxfp8 已验证可训练 MoE 模型」；router、激活、多层耦合下的数值行为均未覆盖，那需要更接近真实的网络（或直接接 GPT-OSS，§8）才能回答。
+
+mxfp4 / nvfp4 都没有这层测试（见 §6 标准 3 说明），所以这是 mxfp8 相对参照实现的**主动加严项**，放在独立章节、独立测试文件，避免与最小路径的单测耦合。
+
+### 7.2 设计（拟定）
+测试文件：`tests/unittest/mxfp8/test_e2e_moe.py`
+- **toy MoE layer**：`num_experts` 个专家，每专家一个 `[N, K]` 权重；token 经一个简单（可固定/随机）router 路由到专家，按 contiguous group 排好后调 `mxfp8_grouped_gemm` 做 expert GEMM。为对齐 kernel 的 `GROUP_SIZE_M=ALIGN_SIZE_M` 契约，token 数按 `ALIGN_SIZE_M` 对齐（上游 padding，沿用 mxfp4 假设）。
+- **训练 loop**：固定 toy 任务（如回归到随机 target），同一份初始化分别跑 **mxfp8 路径** 与 **bf16 baseline 路径**，各 ~100 步，SGD/AdamW 任一。
+- **断言**：
+  1. mxfp8 全程 loss 有限（无 NaN/Inf）。
+  2. loss 整体下降（不要求严格单调；用「末段窗口均值 < 首段窗口均值」或拟合斜率 < 0 之类的鲁棒判据，避免量化噪声造成的逐步抖动误杀）。
+  3. mxfp8 与 bf16 的 loss 曲线**同形**：终点 loss 差距在阈值内（阈值待跑通后据实标定，先放宽）。
+- **设备**：CDNA3 可跑（dequant 路径）；CDNA4 默认路径与 §6 open item 一起在 CDNA4 机器上复跑。
+
+### 7.3 状态
+✅ 已实现 `tests/unittest/mxfp8/test_e2e_moe.py`，与 7.2 设计一致：
+- toy MoE：`num_groups=4`、`m_total=4×ALIGN_SIZE_M`、`N=K=128`，contiguous router（group g → expert `g % num_experts`），用 `mxfp8_grouped_gemm(trans_weights=True)` 做 expert GEMM；参数化 `num_experts ∈ {2, 4}`。
+- 训练 loop：同一份 `w_init`/`inputs`/`target`/`indices`，mxfp8 与 bf16 各跑 100 步 SGD（lr=0.5）拟合随机 target。
+- 三条断言：① mxfp8 loss 全程有限；② 末段 20% 窗口均值 < 首段 20% 窗口均值 × 0.9（鲁棒下降判据，避开逐步量化抖动）；③ mxfp8 末段窗口均值 < bf16 末段 × 2.0（同形）。
+
+**验证记录**（2026-06-09，MI300X / CDNA3）：
+- `python -m pytest tests/unittest/mxfp8/test_e2e_moe.py -q` → **2 passed**。
+- 实测 loss（start → end，100 步）：`experts=2` mxfp8 `10736.72 → 0.4239` vs bf16 `10618.74 → 0.3542`；`experts=4` mxfp8 `18734.94 → 0.9231` vs bf16 `18832.10 → 0.8694`。mxfp8 终点 loss 与 bf16 比值约 1.06~1.20×，远在 2.0× 阈值内 → **§0「全 e4m3 在 toy MoE 上够用、100 步不发散」的假设在 CDNA3 上成立**。
+- loss 曲线：见 `tests/unittest/mxfp8/e2e_moe_loss_curve.png`（对数 y 轴，mxfp8 vs bf16 并排）。复跑/刷新用自包含脚本 `tests/unittest/mxfp8/plot_e2e_moe_curve.py`（训练逻辑与 `test_e2e_moe.py` 一致，并打印逐步 loss 序列）。
+- 曲线观察：前 ~40 步两条线在对数轴上基本贴死，下降形状完全一致；**分叉只出现在尾部**——loss 逼近收敛底部（≲1）时，e4m3 量化噪声才表现为 mxfp8 略高于 bf16 + 轻微逐步抖动（`experts=2` 比 `experts=4` 明显，后者尾部几乎仍咬合），但全程贴着 baseline、无上翘/发散。即量化误差只在 loss 很小时显现为小幅抬升,不破坏训练动态。
+- 仍待：CDNA4 默认 `tl.dot_scaled` 路径上重跑本测试（与 §6 open item 合并）；步数/网络规模放大后是否仍贴合 bf16，留待接 GPT-OSS 时据实加严。
diff --git a/alto/kernels/mxfp8/mxfp8_grouped_gemm/cg_backward.py b/alto/kernels/mxfp8/mxfp8_grouped_gemm/cg_backward.py
index 14b61c6..4daf682 100644
--- a/alto/kernels/mxfp8/mxfp8_grouped_gemm/cg_backward.py
+++ b/alto/kernels/mxfp8/mxfp8_grouped_gemm/cg_backward.py
@@ -1,9 +1,6 @@
 # Copyright (c) 2026 Advanced Micro Devices, Inc.
 # SPDX-License-Identifier: MIT
-"""MXFP8 contiguous grouped GEMM — backward (dgrad + wgrad) + autograd Function.
-
-Skeleton only; kernel bodies filled in Steps 3-5.
-"""
+"""MXFP8 contiguous grouped GEMM — backward (dgrad + wgrad) + autograd Function."""
 
 from typing import Optional
 
diff --git a/tests/unittest/mxfp8/test_mxfp8_grouped_gemm_backward.py b/tests/unittest/mxfp8/test_mxfp8_grouped_gemm_backward.py
index 7ab40c8..37ab9f1 100644
--- a/tests/unittest/mxfp8/test_mxfp8_grouped_gemm_backward.py
+++ b/tests/unittest/mxfp8/test_mxfp8_grouped_gemm_backward.py
@@ -12,6 +12,7 @@
     mxfp8_grouped_gemm_backward_weights,
 )
 from alto.kernels.mxfp8.mxfp8_quantization import BLOCK_SIZE_DEFAULT
+from alto.kernels.fp4.testing_utils import calc_snr
 
 from .utils import prepare_data, convert_from_mxfp8_pytorch
 
@@ -71,7 +72,8 @@ def _reference_wgrad(grad_output, inputs, indices, num_experts, trans_weights):
 @pytest.mark.parametrize("trans_weights", [True, False])
 @pytest.mark.parametrize("use_2dblock_go", [False, True])
 @pytest.mark.parametrize("use_2dblock_w", [False, True])
-def test_backward_inputs_matches_dequant_reference(trans_weights, use_2dblock_go, use_2dblock_w):
+@pytest.mark.parametrize("use_dot_scaled", [None, False])
+def test_backward_inputs_matches_dequant_reference(trans_weights, use_2dblock_go, use_2dblock_w, use_dot_scaled):
     m_total, n_dim, k_dim, num_experts = 384, 128, 128, 2
     dtype = torch.bfloat16
 
@@ -106,6 +108,7 @@ def test_backward_inputs_matches_dequant_reference(trans_weights, use_2dblock_go
         use_2dblock_x=use_2dblock_go,
         use_2dblock_w=use_2dblock_w,
         output_dtype=dtype,
+        use_dot_scaled=use_dot_scaled,
     )
 
     grad_output_dq = convert_from_mxfp8_pytorch(
@@ -117,11 +120,15 @@ def test_backward_inputs_matches_dequant_reference(trans_weights, use_2dblock_go
     cos = _cossim(grad_inputs, grad_inputs_ref)
     assert cos > 0.999, \
         f"dgrad kernel vs dequant-matmul cos-sim too low: {cos} (2d_go={use_2dblock_go}, 2d_w={use_2dblock_w})"
+    snr = calc_snr(grad_inputs_ref, grad_inputs.float())
+    assert snr > 40, \
+        f"dgrad kernel vs dequant-matmul SNR too low: {snr:.1f}dB (2d_go={use_2dblock_go}, 2d_w={use_2dblock_w})"
 
 
 @pytest.mark.parametrize("trans_weights", [True, False])
 @pytest.mark.parametrize("use_2dblock_x", [False, True])
-def test_backward_weights_matches_dequant_reference(trans_weights, use_2dblock_x):
+@pytest.mark.parametrize("use_dot_scaled", [None, False])
+def test_backward_weights_matches_dequant_reference(trans_weights, use_2dblock_x, use_dot_scaled):
     m_total, n_dim, k_dim, num_experts = 384, 128, 128, 2
     dtype = torch.bfloat16
 
@@ -157,6 +164,7 @@ def test_backward_weights_matches_dequant_reference(trans_weights, use_2dblock_x
         use_2dblock_go=use_2dblock_x,
         use_2dblock_x=use_2dblock_x,
         output_dtype=dtype,
+        use_dot_scaled=use_dot_scaled,
     )
 
     grad_output_dq = convert_from_mxfp8_pytorch(
@@ -167,9 +175,11 @@ def test_backward_weights_matches_dequant_reference(trans_weights, use_2dblock_x
 
     cos = _cossim(grad_weights, grad_weights_ref)
     assert cos > 0.999, f"wgrad kernel vs dequant-matmul cos-sim too low: {cos} (2d_x={use_2dblock_x})"
+    snr = calc_snr(grad_weights_ref, grad_weights.float())
+    assert snr > 40, f"wgrad kernel vs dequant-matmul SNR too low: {snr:.1f}dB (2d_x={use_2dblock_x})"
 
 
-def _run_autograd_case(trans_weights, use_2dblock_x, use_2dblock_w, shape=(256, 128, 128, 2)):
+def _run_autograd_case(trans_weights, use_2dblock_x, use_2dblock_w, shape=(384, 128, 128, 2)):
     m_total, n_dim, k_dim, num_experts = shape
     device = torch.device("cuda")
     dtype = torch.bfloat16
@@ -218,5 +228,54 @@ def test_mxfp8_grouped_gemm_autograd_trans_weights_false():
         trans_weights=False,
         use_2dblock_x=False,
         use_2dblock_w=False,
-        shape=(256, 256, 128, 2),
+        shape=(384, 256, 128, 2),
     )
+
+
+def test_backward_wrappers_reject_non_aligned_mtotal():
+    """Both backward wrappers must fail fast on M_total not aligned to ALIGN_SIZE_M."""
+    m_total, n_dim, k_dim, num_experts = 64, 128, 128, 1  # 64 not a multiple of 128
+    dtype = torch.bfloat16
+    grad_output = prepare_data((m_total, n_dim), dtype)
+    inputs = prepare_data((m_total, k_dim), dtype)
+    expert_weights = prepare_data((num_experts, n_dim, k_dim), dtype)
+    indices = torch.zeros(m_total, dtype=torch.int32, device="cuda")
+
+    go_lp, go_s = torch.ops.alto.convert_to_mxfp8(
+        grad_output, block_size=BLOCK_SIZE_DEFAULT, mxfp_format="e4m3", axis=-1, is_2d_block=False)
+    w_lp, w_s = torch.ops.alto.convert_to_mxfp8(
+        expert_weights, block_size=BLOCK_SIZE_DEFAULT, mxfp_format="e4m3", axis=-2, is_2d_block=False)
+    go_m_lp, go_m_s = torch.ops.alto.convert_to_mxfp8(
+        grad_output, block_size=BLOCK_SIZE_DEFAULT, mxfp_format="e4m3", axis=0, is_2d_block=False)
+    x_m_lp, x_m_s = torch.ops.alto.convert_to_mxfp8(
+        inputs, block_size=BLOCK_SIZE_DEFAULT, mxfp_format="e4m3", axis=0, is_2d_block=False)
+
+    with pytest.raises(AssertionError, match="multiple of group_size_m"):
+        mxfp8_grouped_gemm_backward_inputs(go_lp, w_lp, indices, go_s, w_s)
+    with pytest.raises(AssertionError, match="multiple of group_size_m"):
+        mxfp8_grouped_gemm_backward_weights(go_m_lp, x_m_lp, indices, num_experts, go_m_s, x_m_s)
+
+
+def test_autograd_many_experts_with_empty_expert():
+    """experts > groups => some experts receive zero tokens (dW row must be 0),
+    and the wgrad scan-all-groups path is exercised with finite gradients."""
+    device = torch.device("cuda")
+    dtype = torch.bfloat16
+    m_total, n_dim, k_dim, num_experts = ALIGN_SIZE_M * 2, 128, 128, 8  # 2 groups, 8 experts
+
+    inputs = prepare_data((m_total, k_dim), dtype).requires_grad_(True)
+    expert_weights = prepare_data((num_experts, n_dim, k_dim), dtype).requires_grad_(True)
+    # Route both groups to experts 0 and 1; experts 2..7 stay empty.
+    expert_indices = torch.zeros(m_total, dtype=torch.int32, device=device)
+    expert_indices[ALIGN_SIZE_M:] = 1
+    target = prepare_data((m_total, n_dim), dtype)
+
+    outputs = mxfp8_grouped_gemm(inputs, expert_weights, expert_indices, trans_weights=True)
+    torch.nn.functional.mse_loss(outputs, target).backward()
+
+    assert torch.isfinite(outputs).all()
+    assert torch.isfinite(inputs.grad).all()
+    assert torch.isfinite(expert_weights.grad).all()
+    # Empty experts must get exactly zero weight gradient.
+    assert torch.count_nonzero(expert_weights.grad[2:]) == 0, "unused experts must have zero dW"
+    assert torch.count_nonzero(expert_weights.grad[:2]) > 0, "routed experts must have nonzero dW"
diff --git a/tests/unittest/mxfp8/test_mxfp8_grouped_gemm_forward.py b/tests/unittest/mxfp8/test_mxfp8_grouped_gemm_forward.py
index 1a038cd..5bd3b3b 100644
--- a/tests/unittest/mxfp8/test_mxfp8_grouped_gemm_forward.py
+++ b/tests/unittest/mxfp8/test_mxfp8_grouped_gemm_forward.py
@@ -8,6 +8,7 @@
 from alto.kernels.mxfp8.mxfp8_quantization import BLOCK_SIZE_DEFAULT
 from alto.kernels.mxfp8.mxfp8_grouped_gemm.autotune import ALIGN_SIZE_M
 from alto.kernels.mxfp8.mxfp8_grouped_gemm.cg_forward import mxfp8_grouped_gemm_forward
+from alto.kernels.fp4.testing_utils import calc_snr
 
 from .utils import prepare_data, convert_from_mxfp8_pytorch
 
@@ -78,6 +79,10 @@ def test_forward(shape, use_2dblock_x, use_2dblock_w, trans_weights):
     cos_dq = _cossim(out, ref_dq)
     assert cos_dq > 0.999, \
         f"kernel vs dequant-matmul cos-sim too low: {cos_dq} (shape={shape}, 2dx={use_2dblock_x}, 2dw={use_2dblock_w})"
+    # SNR catches magnitude errors (missed scale, wrong accumulator) that cossim is blind to.
+    snr_dq = calc_snr(ref_dq, out.float())
+    assert snr_dq > 40, \
+        f"kernel vs dequant-matmul SNR too low: {snr_dq:.1f}dB (shape={shape}, 2dx={use_2dblock_x}, 2dw={use_2dblock_w})"
 
     # Looser sanity check against the full-precision bf16 path.
     ref_bf16 = _reference(inputs, expert_weights, indices, num_groups, trans_weights)
@@ -85,6 +90,73 @@ def test_forward(shape, use_2dblock_x, use_2dblock_w, trans_weights):
     assert cos_bf16 > 0.99, f"kernel vs bf16 cos-sim too low: {cos_bf16}"
 
 
+def test_forward_dequant_fallback_matches_dot_scaled():
+    """CDNA3 path (use_dot_scaled=False) must match the dequant-matmul reference.
+
+    Forced on regardless of the running device so the _dequantize_fp8 -> tl.dot
+    branch is covered; real MI300 ground-truth still needs Step 7 on CDNA3.
+    """
+    M_total, N, K, num_experts = 256, 128, 128, 2
+    M_total = (M_total // ALIGN_SIZE_M) * ALIGN_SIZE_M
+    num_groups = M_total // ALIGN_SIZE_M
+    device = torch.device("cuda")
+    data_type = torch.bfloat16
+
+    inputs = prepare_data((M_total, K), data_type)
+    expert_weights = prepare_data((num_experts, N, K), data_type)
+    indices = _make_indices(num_groups, num_experts, device)
+
+    x_lp, x_s = torch.ops.alto.convert_to_mxfp8(
+        inputs, block_size=BLOCK_SIZE_DEFAULT, mxfp_format="e4m3", axis=-1, is_2d_block=False)
+    w_lp, w_s = torch.ops.alto.convert_to_mxfp8(
+        expert_weights, block_size=BLOCK_SIZE_DEFAULT, mxfp_format="e4m3", axis=-1, is_2d_block=True)
+
+    out = mxfp8_grouped_gemm_forward(
+        x_lp, w_lp, indices, x_s, w_s,
+        trans_weights=True,
+        use_2dblock_x=False,
+        use_2dblock_w=True,
+        output_dtype=data_type,
+        use_dot_scaled=False,
+    )
+
+    x_dq = convert_from_mxfp8_pytorch(x_lp, x_s, torch.float32, BLOCK_SIZE_DEFAULT, -1, False)
+    w_dq = convert_from_mxfp8_pytorch(w_lp, w_s, torch.float32, BLOCK_SIZE_DEFAULT, -1, True)
+    ref_dq = _reference(x_dq, w_dq, indices, num_groups, trans_weights=True)
+    cos = _cossim(out, ref_dq)
+    assert cos > 0.999, f"fallback forward vs dequant-matmul cos-sim too low: {cos}"
+
+
+def test_forward_single_expert_matches_mxfp8_linear():
+    """All tokens to one expert => grouped GEMM must match mxfp8 linear (x @ w^T).
+
+    Independent cross-check: the linear path quantizes via its own autograd
+    Function, so this catches bugs the dequant-matmul reference can't — that
+    reference shares this test's convert_to_mxfp8 output, so a quant bug would
+    corrupt both sides equally and still pass.
+    """
+    from alto.kernels.mxfp8.mxfp8_linear import _to_mxfp8_then_scaled_mm
+
+    M, N, K = ALIGN_SIZE_M, 256, 256
+    data_type = torch.bfloat16
+    x = prepare_data((M, K), data_type)
+    w = prepare_data((N, K), data_type)
+    indices = torch.zeros(M, dtype=torch.int32, device="cuda")
+
+    y_linear = _to_mxfp8_then_scaled_mm(x, w, use_2dblock_x=False, use_2dblock_w=False)
+
+    x_lp, x_s = torch.ops.alto.convert_to_mxfp8(
+        x, block_size=BLOCK_SIZE_DEFAULT, mxfp_format="e4m3", axis=-1, is_2d_block=False)
+    w_lp, w_s = torch.ops.alto.convert_to_mxfp8(
+        w.unsqueeze(0), block_size=BLOCK_SIZE_DEFAULT, mxfp_format="e4m3", axis=-1, is_2d_block=False)
+    y_gg = mxfp8_grouped_gemm_forward(
+        x_lp, w_lp, indices, x_s, w_s, trans_weights=True,
+        use_2dblock_x=False, use_2dblock_w=False, output_dtype=data_type)
+
+    snr = calc_snr(y_linear.float(), y_gg.float())
+    assert snr > 30, f"single-expert grouped GEMM vs mxfp8 linear SNR too low: {snr:.1f}dB"
+
+
 def test_forward_rejects_indices_length_mismatch():
     M_total, N, K, num_experts = ALIGN_SIZE_M, 128, 128, 1
     data_type = torch.bfloat16
@@ -100,6 +172,21 @@ def test_forward_rejects_indices_length_mismatch():
         mxfp8_grouped_gemm_forward(x_lp, w_lp, indices, x_s, w_s)
 
 
+def test_forward_rejects_non_aligned_mtotal():
+    M_total, N, K = 64, 128, 128  # 64 not a multiple of ALIGN_SIZE_M (128)
+    data_type = torch.bfloat16
+    inputs = prepare_data((M_total, K), data_type)
+    expert_weights = prepare_data((1, N, K), data_type)
+    indices = torch.zeros(M_total, dtype=torch.int32, device="cuda")
+    x_lp, x_s = torch.ops.alto.convert_to_mxfp8(
+        inputs, block_size=BLOCK_SIZE_DEFAULT, mxfp_format="e4m3", axis=-1, is_2d_block=False)
+    w_lp, w_s = torch.ops.alto.convert_to_mxfp8(
+        expert_weights, block_size=BLOCK_SIZE_DEFAULT, mxfp_format="e4m3", axis=-1, is_2d_block=False)
+
+    with pytest.raises(AssertionError, match="multiple of group_size_m"):
+        mxfp8_grouped_gemm_forward(x_lp, w_lp, indices, x_s, w_s)
+
+
 def test_forward_rejects_wrong_scale_shape():
     M_total, N, K, num_experts = ALIGN_SIZE_M, 128, 128, 1
     data_type = torch.bfloat16

From 4643649803a2e6d6a84c5f873aa8751418be6e86 Mon Sep 17 00:00:00 2001
From: Yue Sun <Yue.Sun2@amd.com>
Date: Wed, 10 Jun 2026 01:36:50 -0500
Subject: [PATCH 09/12] docs: add m355 MXFP8 grouped GEMM test result

---
 alto/kernels/mxfp8/MXFP8_GROUPED_GEMM_PLAN.md | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/alto/kernels/mxfp8/MXFP8_GROUPED_GEMM_PLAN.md b/alto/kernels/mxfp8/MXFP8_GROUPED_GEMM_PLAN.md
index 30e636f..a1ee605 100644
--- a/alto/kernels/mxfp8/MXFP8_GROUPED_GEMM_PLAN.md
+++ b/alto/kernels/mxfp8/MXFP8_GROUPED_GEMM_PLAN.md
@@ -207,7 +207,8 @@ functional.py        # 暴露顶层入口
 **验证记录**：
 - 2026-06-05 在 `cranky_shockley` 容器（`wanghanthu/torchtitan:ubuntu22.04-pytorch2.12.0dev20260217-rocm7.2-patch`）中验证 backward，彼时 **17 passed**；之后扩充 `use_dot_scaled` 参数化与空 expert / 对齐负例。
 - 2026-06-09 在 MI300X（CDNA3）上重跑 forward + backward：`python -m pytest tests/unittest/mxfp8/test_mxfp8_grouped_gemm_forward.py tests/unittest/mxfp8/test_mxfp8_grouped_gemm_backward.py -q` 结果为 **52 passed（21 forward + 31 backward）, 14 warnings in 20.74s**。
-- 仍需补 toy MoE 训练 sanity；`use_dot_scaled=False` fallback 已在 CI 强制覆盖，真实 CDNA4 `tl.dot_scaled` 路径仍需在 CDNA4 硬件上单独复验（本次为 CDNA3 机器）。
+- 2026-06-10 在 MI355X（CDNA4 / m355）`gracious_lovelace` 容器（`wanghanthu/torchtitan:ubuntu22.04-pytorch2.12.0dev20260217-rocm7.2-patch`）中重跑 forward + backward：`python -m pytest tests/unittest/mxfp8/test_mxfp8_grouped_gemm_forward.py tests/unittest/mxfp8/test_mxfp8_grouped_gemm_backward.py -q` 结果为 **52 passed（21 forward + 31 backward）, 14 warnings in 18.51s**。环境确认：PyTorch `2.12.0a0+git78d5fb4`，`is_cdna4=True`。结论：m355 默认 `tl.dot_scaled` 路径的 grouped GEMM fwd / dgrad / wgrad 数值单测已通过。
+- 仍需补 toy MoE 训练 sanity；`use_dot_scaled=False` fallback 已在 CI 强制覆盖，CDNA4/m355 默认 `tl.dot_scaled` 路径已通过上述 52 个 grouped GEMM 单测。
 
 ### Step 7 — MI300 fallback 验证
 仅切 `USE_DOT_SCALED=False` 路径重跑 Step 6，确保 CDNA3 上数值与 CDNA4 一致（dequant + fp32 dot 是 ground truth）。
@@ -226,8 +227,8 @@ V1「最小可用 kernel」在**算子数值正确性**这层基本达标（三
   真实 MoE（含 GPT-OSS）路由后每个 expert 的 token 数是**动态、不等、不保证 128 对齐**的 → 现状下 GPT-OSS 给不出 mxfp8 能吃的输入。
   参照：mxfp4 用户传 `offs`（累积 offset，如 `[128,128,256,...]`），内部 `create_indices_from_offsets_nosync`（`mxfp4/.../functional.py:23`）转 indices；nvfp4 另有 `test_nvfp4_grouped_gemm_accepts_padded_buffer` 覆盖 `M_bufferlen > 实际 token 数` 的补零场景。mxfp8 需补同款 `offsets` 入口 + `M_bufferlen` vs `M_total` 区分。
 
-- [ ] **【阻塞】CDNA4 真机验证默认 `tl.dot_scaled` 路径。**
-  GPT-OSS 真训大概率在 MI350(CDNA4) 走默认 `tl.dot_scaled`，而该路径至今未在真机验证（既有 52+2 用例均在 MI300X/CDNA3 跑 dequant fallback）。接入前必须先在 CDNA4 容器重跑确认数值正确（与 §6 open item 合并）。
+- [x] **CDNA4/m355 真机验证默认 `tl.dot_scaled` 路径。**
+  2026-06-10 已在 MI355X（CDNA4 / m355）`gracious_lovelace` 容器中重跑 forward + backward 52 个 grouped GEMM 单测，结果 **52 passed, 14 warnings in 18.51s**。结论：m355 默认 `tl.dot_scaled` 路径的 fwd / dgrad / wgrad 数值正确性单测已通过。
 
 - [ ] **【高 · 接入前评估】e5m2 混合格式可能是前提，而非 v2 优化。**
   §0 自述全 e4m3「通常几百到几千步发散」，而 toy test 只跑 100 步、单层；GPT-OSS 是多层 + 几千步，很可能踩进发散区。e5m2 通道代码已预留但**从未启用/测过**。建议接入前先评估是否必须先开 e5m2，避免训崩后回头。
@@ -238,7 +239,7 @@ V1「最小可用 kernel」在**算子数值正确性**这层基本达标（三
 - [ ] **【中】接入形态对齐。**
   `mxfp8_grouped_gemm` 签名要能直接替换 GPT-OSS MoE forward 中的 grouped GEMM 调用；具体集成视 GPT-OSS 训练栈 PR 时再定。
 
-一句话：**发动机（算子）基本造好，但接进整车（GPT-OSS）所需的传动接口（offsets/padding）、CDNA4 路试、以及很可能必需的 e5m2，都还没做。** 最小可用 kernel ≈ 70% 到位，差的恰是「接真实模型」这一段。
+一句话：**发动机（算子）基本造好，CDNA4/m355 路试已过；但接进整车（GPT-OSS）所需的传动接口（offsets/padding）以及很可能必需的 e5m2，都还没做。** 最小可用 kernel ≈ 75% 到位，差的恰是「接真实模型」这一段。
 
 ### 8.1 offsets 入口 + padded buffer 实施方案（已批准 · 待实施）
 
@@ -299,15 +300,14 @@ V1「最小可用 kernel」在**算子数值正确性**这层基本达标（三
 
 | # | 验收标准 | 状态 | 说明 |
 |---|---|---|---|
-| 1 | fwd / dgrad / wgrad 三个 kernel 在 CDNA4 与 CDNA3 上都能跑通 | ⏳ 部分 | CDNA3 ✅（2026-06-09 MI300X 52 passed）；CDNA4 默认 `tl.dot_scaled` 路径**真机未验证**，见下方 open item |
+| 1 | fwd / dgrad / wgrad 三个 kernel 在 CDNA4 与 CDNA3 上都能跑通 | ✅ | CDNA3 ✅（2026-06-09 MI300X 52 passed）；CDNA4/m355 ✅（2026-06-10 MI355X 52 passed，默认 `tl.dot_scaled` 路径） |
 | 2 | 数值对齐 bf16 reference：fwd cos-sim > 0.999，bwd cos-sim > 0.995 | ✅ | 测试用 cos-sim > 0.999 + SNR > 40 dB 双门槛卡住（比标准更严） |
 | 3 | 端到端 autograd 梯度对齐 | ✅ | `test_mxfp8_grouped_gemm_autograd*` 单步 forward+backward，dX/dW vs bf16 reference cos-sim > 0.99。**这与同目录 mxfp4 / nvfp4 的端到端标准一致**——两者也止步于单步梯度对齐，均未做训练 loop |
-| 4 | 单元测试覆盖 1D/2D block × CDNA3/CDNA4 各组合（V1 全 e4m3；e5m2 分支留待 v2） | ⏳ 部分 | 1D/2D ✅、CDNA3 ✅；CDNA4 同标准 1 |
+| 4 | 单元测试覆盖 1D/2D block × CDNA3/CDNA4 各组合（V1 全 e4m3；e5m2 分支留待 v2） | ✅ | 1D/2D ✅、CDNA3 ✅、CDNA4/m355 ✅ |
 
 **收口说明**：标准 3 原文为「toy MoE 训练 100 steps loss 单调下降」。对齐 mxfp4 / nvfp4 的既有验收口径后，**V1 把端到端硬验收降级为单步 autograd 梯度对齐（已达成）**；toy MoE 训练 loop 作为更高一档的 sanity，单列于 §7 记录与跟进，不阻塞 V1 完成定义。
 
 **V1 剩余 open items**：
-- **CDNA4 真机验证**（标准 1、4）：当前手头为 MI300X(CDNA3)，CDNA4 默认 `tl.dot_scaled` 路径需在 CDNA4（如 MI350）容器上重跑现有 52 个用例确认数值正确。纯运行、不改代码。
 - **toy MoE 训练 sanity**（§7）：验证 §0「全 e4m3 是否够用、会不会几百步发散」这一核心假设；mxfp4/nvfp4 未做，属 mxfp8 主动加严项。
 
 ---

From 52ffa43142f0b34002aa4ebd4bb788b9334a5273 Mon Sep 17 00:00:00 2001
From: Yue Sun <Yue.Sun2@amd.com>
Date: Wed, 10 Jun 2026 07:53:53 +0000
Subject: [PATCH 10/12] mxfp8: add toy-MoE training sanity and cross-format
 comparison

---
 alto/kernels/mxfp8/MXFP8_GROUPED_GEMM_PLAN.md |  37 ++++-
 .../unittest/compare_grouped_gemm_toy_moe.py  | 156 ++++++++++++++++++
 tests/unittest/mxfp8/plot_e2e_moe_curve.py    | 107 ++++++++++++
 tests/unittest/mxfp8/test_e2e_moe.py          |  90 ++++++++++
 4 files changed, 387 insertions(+), 3 deletions(-)
 create mode 100644 tests/unittest/compare_grouped_gemm_toy_moe.py
 create mode 100644 tests/unittest/mxfp8/plot_e2e_moe_curve.py
 create mode 100644 tests/unittest/mxfp8/test_e2e_moe.py

diff --git a/alto/kernels/mxfp8/MXFP8_GROUPED_GEMM_PLAN.md b/alto/kernels/mxfp8/MXFP8_GROUPED_GEMM_PLAN.md
index a1ee605..d675f53 100644
--- a/alto/kernels/mxfp8/MXFP8_GROUPED_GEMM_PLAN.md
+++ b/alto/kernels/mxfp8/MXFP8_GROUPED_GEMM_PLAN.md
@@ -149,7 +149,7 @@ functional.py        # 暴露顶层入口
   - `test_forward_dequant_fallback_matches_dot_scaled`：强制 `use_dot_scaled=False`，无论运行设备都覆盖 `_dequantize_fp8 → tl.dot` 分支（真实 MI300 ground-truth 仍待 Step 7）。
   - `test_forward_single_expert_matches_mxfp8_linear`：全部 token 路由到单 expert，与 `mxfp8_linear._to_mxfp8_then_scaled_mm` 交叉校验（SNR > 30 dB）。这是独立交叉验证——linear 路径用自己的 autograd Function 量化，能抓到 dequant-matmul reference 抓不到的量化 bug（后者与测试共用同一份 `convert_to_mxfp8` 输出，量化 bug 会两边同时错而仍通过）。
   - 3 个负例：`expert_indices` 长度不匹配、`M_total` 未对齐 `ALIGN_SIZE_M`、`weight_scales` shape 错误。
-- 2026-06-02 在 `friendly_elgamal` 容器中验证：`is_cdna4()=True`，forward 默认走 **CDNA4 `tl.dot_scaled`** 路径。CDNA3 dequant 分支已由 `use_dot_scaled=False` 测试在 CI 中强制覆盖；真实 CDNA3/MI300 硬件 ground-truth 复验仍属 Step 7。
+- 2026-06-02 在 `friendly_elgamal` 容器中验证：`is_cdna4()=True`，forward 默认走 **CDNA4 `tl.dot_scaled`** 路径。CDNA3 dequant 分支已由 `use_dot_scaled=False` 测试在 CI 中强制覆盖；真实 CDNA3/MI300 硬件 ground-truth 已于 2026-06-09 在 MI300X 复验（见 §6 验证记录）。
 
 ### Step 3 — Backward dgrad kernel ✅ 已完成
 基于 `mxfp4/cg_backward.py` 的 `_kernel_mxfp4_grouped_gemm_backward_dx`：
@@ -210,9 +210,11 @@ functional.py        # 暴露顶层入口
 - 2026-06-10 在 MI355X（CDNA4 / m355）`gracious_lovelace` 容器（`wanghanthu/torchtitan:ubuntu22.04-pytorch2.12.0dev20260217-rocm7.2-patch`）中重跑 forward + backward：`python -m pytest tests/unittest/mxfp8/test_mxfp8_grouped_gemm_forward.py tests/unittest/mxfp8/test_mxfp8_grouped_gemm_backward.py -q` 结果为 **52 passed（21 forward + 31 backward）, 14 warnings in 18.51s**。环境确认：PyTorch `2.12.0a0+git78d5fb4`，`is_cdna4=True`。结论：m355 默认 `tl.dot_scaled` 路径的 grouped GEMM fwd / dgrad / wgrad 数值单测已通过。
 - 仍需补 toy MoE 训练 sanity；`use_dot_scaled=False` fallback 已在 CI 强制覆盖，CDNA4/m355 默认 `tl.dot_scaled` 路径已通过上述 52 个 grouped GEMM 单测。
 
-### Step 7 — MI300 fallback 验证
+### Step 7 — MI300 fallback 验证 ✅ 已完成
 仅切 `USE_DOT_SCALED=False` 路径重跑 Step 6，确保 CDNA3 上数值与 CDNA4 一致（dequant + fp32 dot 是 ground truth）。
 
+**完成情况**：2026-06-09 在 MI300X（CDNA3）跑 dequant fallback、2026-06-10 在 MI355X（CDNA4）跑默认 `tl.dot_scaled`，两机各 **52 passed**（详见 §6 验证记录），CDNA3 与 CDNA4 数值一致。
+
 ### Step 8 — 接 GPT-OSS（不在最小版本范围）
 预留接口：`mxfp8_grouped_gemm` 签名要能直接替换现有 MoE forward 中的 grouped GEMM 调用。具体集成视 GPT-OSS 训练栈 PR 时再做。
 
@@ -261,7 +263,7 @@ V1「最小可用 kernel」在**算子数值正确性**这层基本达标（三
 
 5. **测试**（`test_mxfp8_grouped_gemm_backward.py`）：新增 `test_mxfp8_grouped_gemm_accepts_padded_buffer`，采 **padded-vs-unpadded 自比**（最强校验，闭环证明 padding 零干扰）。**关键：固定 `use_sr_grad=False`** 使量化确定性，否则随机舍入令 `torch.equal` 偶发失败；routed 行两次跑应逐位相等。断言：`y_pad.shape==(M_bufferlen,N)`、`y_pad[M_routed:]` 全 0、`y_pad[:M_routed]==y_ref`、`inputs_pad.grad[M_routed:]` 全 0、`inputs_pad.grad[:M_routed]==inputs_ref.grad`、`w_pad.grad==w_ref.grad`；另加 offsets 入口 smoke test。
 
-**验证**：现有 54 用例不回归 + 新 padded-buffer 测试通过；额外确认「若 wrapper 仍按 `[M_total,N]` 分配则新测试会失败」以证明确实触发了 padding 路径。本改动 device-agnostic（只动 wrapper/入口），CDNA4 路试仍为独立 open item。
+**验证**：现有 54 用例不回归 + 新 padded-buffer 测试通过；额外确认「若 wrapper 仍按 `[M_total,N]` 分配则新测试会失败」以证明确实触发了 padding 路径。本改动 device-agnostic（只动 wrapper/入口）；CDNA4/m355 路试已于 2026-06-10 通过（见 §6 验证记录）。
 
 **完成后**：勾选 §8 头号 item，并在 §3 相应 Step 回填新入口 `_quantize_then_mxfp8_scaled_grouped_mm` 与 padded-buffer 测试。
 
@@ -345,3 +347,32 @@ mxfp4 / nvfp4 都没有这层测试（见 §6 标准 3 说明），所以这是
 - loss 曲线：见 `tests/unittest/mxfp8/e2e_moe_loss_curve.png`（对数 y 轴，mxfp8 vs bf16 并排）。复跑/刷新用自包含脚本 `tests/unittest/mxfp8/plot_e2e_moe_curve.py`（训练逻辑与 `test_e2e_moe.py` 一致，并打印逐步 loss 序列）。
 - 曲线观察：前 ~40 步两条线在对数轴上基本贴死，下降形状完全一致；**分叉只出现在尾部**——loss 逼近收敛底部（≲1）时，e4m3 量化噪声才表现为 mxfp8 略高于 bf16 + 轻微逐步抖动（`experts=2` 比 `experts=4` 明显，后者尾部几乎仍咬合），但全程贴着 baseline、无上翘/发散。即量化误差只在 loss 很小时显现为小幅抬升,不破坏训练动态。
 - 仍待：CDNA4 默认 `tl.dot_scaled` 路径上重跑本测试（与 §6 open item 合并）；步数/网络规模放大后是否仍贴合 bf16，留待接 GPT-OSS 时据实加严。
+
+### 7.4 跨格式对比：mxfp8 vs mxfp4 vs nvfp4（同配置 toy MoE）
+
+把 §7 的 toy MoE 训练 loop 原样扩展到 mxfp4 / nvfp4 grouped GEMM，与 bf16 baseline 在同一张图上对比。三个 kernel 入口同形（`fn(inputs[M_total,K], expert_weights[E,N,K], expert_indices[M_total], trans_weights=True) -> [M_total,N]`，`ALIGN_SIZE_M=128`），toy 任务原样迁移。
+
+**配置**（与 §7.3 逐项一致，保证可比）：`num_groups=4`、`m_total=4×128=512`、`N=K=128`、contiguous router（group g → expert `g % num_experts`）、`num_experts ∈ {2,4}`、同一份 `prepare_data` 种子(1234)+0.5% 离群点注入的 `inputs/target/w_init`、SGD 100 步 `lr=0.5`、MSE loss。**公平性关键**：mxfp4/nvfp4 均显式传 `use_sr_grad=False`（nvfp4 默认 `True` 的随机舍入会令曲线带噪不可复现），使所有量化路径确定性可比。
+
+脚本（自包含、可直接 `python` 运行）：`tests/unittest/compare_grouped_gemm_toy_moe.py`，四条曲线叠加输出 `tests/unittest/compare_grouped_gemm_toy_moe.png`，并打印全部逐步 loss。
+
+**验证记录**（2026-06-10，MI300X / CDNA3）：
+
+| 格式 | experts=2 末段均值 (vs bf16) | experts=4 末段均值 (vs bf16) | 跌破 loss<10 的步数 (exp2 / exp4) | 跌破 loss<2 |
+|---|---|---|---|---|
+| bf16 baseline | 0.400 (1×) | 0.969 (1×) | 15 / 24 | 33 / 53 |
+| mxfp8 (e4m3) | 0.496 (**1.24×**) | 1.041 (**1.08×**) | 16 / 24 | 34 / 55 |
+| mxfp4 | 5.735 (**14.3×**) | 6.430 (**6.6×**) | 33 / 51 | 从未跌破 |
+| nvfp4 | N/A | N/A | — | — |
+
+（末段均值 = 最后 20 步均值；尾部逐步抖动 mean\|Δ\| bf16≈0.007~0.015、mxfp8≈0.015~0.022、mxfp4≈0.20~0.28。）
+
+**结论：**
+1. **三条路径均稳定收敛、无发散**：全程 finite、单调下降到稳定底部，无上翘/NaN。算子层面 mxfp8/mxfp4 放进权重更新循环 100 步均不崩 → 算子可装车。
+2. **mxfp8(e4m3) ≈ bf16，近无损替代**：收敛轨迹逐点贴死（到各阈值的步数与 bf16 差 0~2 步），末段 loss 仅 1.08~1.24×；量化噪声只在收敛底部表现为极小抖动。再次坐实 §0「全 e4m3 在 toy MoE 上够用」。
+3. **mxfp4 能训练但精度地板明显抬高**（4-bit 固有代价、非 bug）：中期即分叉（跌破 10 慢一倍多、从未跌破 2），尾部卡在 loss≈5.5~6.4 的更高底部（experts=2 达 bf16 的 14.3×），tail jitter 比 mxfp8 大一个量级 → 4-bit 量化噪声成为收敛底部主导误差项。适合「容忍精度损失换吞吐/显存」的场景。
+4. **二级现象**：experts 越多 mxfp4 相对差距越小（14.3×→6.6×），因 experts=4 时 bf16 自身底部也更高（任务更难），mxfp4 的固定噪声地板占比相对下降。即 mxfp4 的劣势在「能收敛到极低 loss 的容易任务」上最刺眼。
+
+**边界与待办**：
+- 这是单层 grouped GEMM 拟合任务，不含 router/gating 梯度、激活、多层耦合、残差，仅 100 步。toy 通过 ≠ 模型级可用；真实 GPT-OSS（多层+几千步）很可能踩进 §0 发散区，mxfp4 的精度地板暗示其在深网络累积误差风险更高。
+- **nvfp4 在 CDNA3 上无数据**：其量化 kernel 在本机 Triton 3.6.0 下有预存编译错（`F4_E2M1_MAX` 在 `@triton.jit` 内非 constexpr，nvfp4 自身 grouped GEMM 单测亦 65 failed / 2 passed）。判断为需更新硬件（疑似 CDNA4）方可跑通，**CDNA3 上按现状不修**；脚本对 nvfp4 容错跳过并在图例标 `N/A (CDNA3)`。nvfp4 三/四路对比留待 CDNA4 补齐。
diff --git a/tests/unittest/compare_grouped_gemm_toy_moe.py b/tests/unittest/compare_grouped_gemm_toy_moe.py
new file mode 100644
index 0000000..10a077b
--- /dev/null
+++ b/tests/unittest/compare_grouped_gemm_toy_moe.py
@@ -0,0 +1,156 @@
+# Copyright (c) 2026 Advanced Micro Devices, Inc.
+#
+# SPDX-License-Identifier: MIT
+"""Toy-MoE training comparison across low-precision grouped GEMM kernels.
+
+Extends the mxfp8 toy-MoE sanity (mxfp8/test_e2e_moe.py, PLAN.md §7) to mxfp4
+and nvfp4 using the *identical* config (routing, init, data, lr, steps) so all
+three low-precision paths can be compared against the same bf16 baseline on one
+plot.
+
+The three kernels share the same entry-point shape:
+    fn(inputs[M_total,K], expert_weights[E,N,K], expert_indices[M_total],
+       trans_weights=True) -> [M_total, N]
+with ALIGN_SIZE_M=128 each, so the toy task transfers unchanged.
+
+Determinism note: nvfp4 defaults to use_sr_grad=True (stochastic rounding),
+which would make its loss curve noisy/non-reproducible. We force use_sr_grad
+=False on every quantized path so all four curves are a fair, repeatable
+comparison.
+
+Standalone (no relative imports) so it runs directly:
+
+    python tests/unittest/compare_grouped_gemm_toy_moe.py
+
+Outputs compare_grouped_gemm_toy_moe.png next to this file and prints the full
+per-step loss series for every path.
+"""
+
+import os
+
+import matplotlib
+matplotlib.use("Agg")
+import matplotlib.pyplot as plt
+import torch
+
+from alto.kernels.mxfp8.mxfp8_grouped_gemm import mxfp8_grouped_gemm
+from alto.kernels.fp4.mxfp4.mxfp_grouped_gemm import mxfp4_grouped_gemm
+from alto.kernels.fp4.nvfp4.nvfp_grouped_gemm import nvfp4_grouped_gemm
+
+ALIGN_SIZE_M = 128  # shared across mxfp8 / mxfp4 / nvfp4 grouped GEMM
+
+
+def prepare_data(tensor_shape, data_type):
+    """Same outlier-injecting init as tests/unittest/mxfp8/utils.py:prepare_data."""
+    torch.manual_seed(1234)
+    x = torch.randn(tensor_shape, dtype=data_type, device="cuda")
+    p_mask = torch.bernoulli(torch.ones_like(x) * 0.005)
+    x += 100 * torch.randn_like(x) * p_mask
+    return x
+
+
+def _make_indices(num_groups, num_experts, device):
+    indices = torch.empty(num_groups * ALIGN_SIZE_M, dtype=torch.int32, device=device)
+    for g in range(num_groups):
+        indices[g * ALIGN_SIZE_M:(g + 1) * ALIGN_SIZE_M] = g % num_experts
+    return indices
+
+
+def _bf16_grouped_matmul(inputs, expert_weights, indices):
+    m_total, n_dim = inputs.shape[0], expert_weights.shape[1]
+    out = torch.zeros((m_total, n_dim), dtype=inputs.dtype, device=inputs.device)
+    for start in range(0, m_total, ALIGN_SIZE_M):
+        end = start + ALIGN_SIZE_M
+        out[start:end] = inputs[start:end] @ expert_weights[indices[start].item()].t()
+    return out
+
+
+# Quantized paths, all forced deterministic (use_sr_grad=False) for fair comparison.
+PATHS = {
+    "bf16 baseline": _bf16_grouped_matmul,
+    "mxfp8 (e4m3)": lambda x, w, idx: mxfp8_grouped_gemm(x, w, idx, trans_weights=True),
+    "mxfp4": lambda x, w, idx: mxfp4_grouped_gemm(x, w, idx, trans_weights=True, use_sr_grad=False),
+    "nvfp4": lambda x, w, idx: nvfp4_grouped_gemm(x, w, idx, trans_weights=True, use_sr_grad=False),
+}
+COLORS = {
+    "bf16 baseline": "tab:orange",
+    "mxfp8 (e4m3)": "tab:blue",
+    "mxfp4": "tab:green",
+    "nvfp4": "tab:red",
+}
+
+
+def _train(grouped_fn, inputs, w_init, indices, target, steps, lr):
+    weights = w_init.clone().detach().requires_grad_(True)
+    losses = []
+    for _ in range(steps):
+        out = grouped_fn(inputs, weights, indices)
+        loss = torch.nn.functional.mse_loss(out.float(), target.float())
+        losses.append(loss.item())
+        weights.grad = None
+        loss.backward()
+        with torch.no_grad():
+            weights -= lr * weights.grad
+    return losses
+
+
+def main():
+    device = torch.device("cuda")
+    dtype = torch.bfloat16
+    num_groups, n_dim, k_dim = 4, 128, 128
+    m_total = num_groups * ALIGN_SIZE_M
+    steps, lr = 100, 0.5
+
+    fig, axes = plt.subplots(1, 2, figsize=(12, 4.5))
+    for ax, num_experts in zip(axes, (2, 4)):
+        inputs = prepare_data((m_total, k_dim), dtype)
+        target = prepare_data((m_total, n_dim), dtype)
+        w_init = prepare_data((num_experts, n_dim, k_dim), dtype) * 0.1
+        indices = _make_indices(num_groups, num_experts, device)
+
+        series = {}
+        for name, fn in PATHS.items():
+            try:
+                series[name] = _train(fn, inputs, w_init, indices, target, steps, lr)
+            except Exception as e:
+                # nvfp4's quant kernel has a pre-existing Triton compile bug on
+                # CDNA3 (F4_E2M1_MAX not constexpr); skip rather than abort the
+                # whole comparison.
+                print(f"[skip] {name}: {type(e).__name__}: {str(e).splitlines()[0]}")
+                series[name] = None
+
+        print(f"\n===== num_experts={num_experts} =====")
+        live = [n for n in PATHS if series[n] is not None]
+        skipped = [n for n in PATHS if series[n] is None]
+        header = f"{'step':>4} " + " ".join(f"{name:>16}" for name in live)
+        print(header)
+        for i in range(steps):
+            row = f"{i:>4} " + " ".join(f"{series[name][i]:>16.6f}" for name in live)
+            print(row)
+        if skipped:
+            print(f"(unavailable: {', '.join(skipped)})")
+
+        linestyle = {"bf16 baseline": "--"}
+        for name in PATHS:
+            if series[name] is None:
+                continue
+            ax.plot(series[name], label=name, color=COLORS[name],
+                    linestyle=linestyle.get(name, "-"))
+        if skipped:
+            ax.plot([], [], " ", label=f"{', '.join(skipped)}: N/A (CDNA3)")
+        ax.set_yscale("log")
+        ax.set_xlabel("step")
+        ax.set_ylabel("MSE loss (log)")
+        ax.set_title(f"toy MoE, num_experts={num_experts}")
+        ax.legend()
+        ax.grid(True, which="both", alpha=0.3)
+
+    fig.tight_layout()
+    out_path = os.path.join(os.path.dirname(os.path.abspath(__file__)),
+                            "compare_grouped_gemm_toy_moe.png")
+    fig.savefig(out_path, dpi=120)
+    print(f"\nsaved curve to {out_path}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/tests/unittest/mxfp8/plot_e2e_moe_curve.py b/tests/unittest/mxfp8/plot_e2e_moe_curve.py
new file mode 100644
index 0000000..211efea
--- /dev/null
+++ b/tests/unittest/mxfp8/plot_e2e_moe_curve.py
@@ -0,0 +1,107 @@
+# Copyright (c) 2026 Advanced Micro Devices, Inc.
+#
+# SPDX-License-Identifier: MIT
+"""Plot toy-MoE training loss curves (mxfp8 vs bf16) for PLAN.md §7.
+
+Standalone helper, not a pytest test. The training loop is identical to
+test_e2e_moe.py (same routing, init, lr, steps) so the curves match what the
+sanity test asserts on. Self-contained (no relative imports) so it runs directly:
+
+    python tests/unittest/mxfp8/plot_e2e_moe_curve.py
+
+Outputs e2e_moe_loss_curve.png next to this file and prints the full per-step
+series for both paths.
+"""
+
+import os
+
+import matplotlib
+matplotlib.use("Agg")
+import matplotlib.pyplot as plt
+import torch
+
+from alto.kernels.mxfp8.mxfp8_grouped_gemm import mxfp8_grouped_gemm
+from alto.kernels.mxfp8.mxfp8_grouped_gemm.autotune import ALIGN_SIZE_M
+
+
+def prepare_data(tensor_shape, data_type):
+    """Same outlier-injecting init as tests/unittest/mxfp8/utils.py:prepare_data."""
+    torch.manual_seed(1234)
+    x = torch.randn(tensor_shape, dtype=data_type, device="cuda")
+    p_mask = torch.bernoulli(torch.ones_like(x) * 0.005)
+    x += 100 * torch.randn_like(x) * p_mask
+    return x
+
+
+def _make_indices(num_groups, num_experts, device):
+    indices = torch.empty(num_groups * ALIGN_SIZE_M, dtype=torch.int32, device=device)
+    for g in range(num_groups):
+        indices[g * ALIGN_SIZE_M:(g + 1) * ALIGN_SIZE_M] = g % num_experts
+    return indices
+
+
+def _bf16_grouped_matmul(inputs, expert_weights, indices):
+    m_total, n_dim = inputs.shape[0], expert_weights.shape[1]
+    out = torch.zeros((m_total, n_dim), dtype=inputs.dtype, device=inputs.device)
+    for start in range(0, m_total, ALIGN_SIZE_M):
+        end = start + ALIGN_SIZE_M
+        out[start:end] = inputs[start:end] @ expert_weights[indices[start].item()].t()
+    return out
+
+
+def _train(grouped_fn, inputs, w_init, indices, target, steps, lr):
+    weights = w_init.clone().detach().requires_grad_(True)
+    losses = []
+    for _ in range(steps):
+        out = grouped_fn(inputs, weights, indices)
+        loss = torch.nn.functional.mse_loss(out.float(), target.float())
+        losses.append(loss.item())
+        weights.grad = None
+        loss.backward()
+        with torch.no_grad():
+            weights -= lr * weights.grad
+    return losses
+
+
+def main():
+    device = torch.device("cuda")
+    dtype = torch.bfloat16
+    num_groups, n_dim, k_dim = 4, 128, 128
+    m_total = num_groups * ALIGN_SIZE_M
+    steps, lr = 100, 0.5
+
+    fig, axes = plt.subplots(1, 2, figsize=(12, 4.5))
+    for ax, num_experts in zip(axes, (2, 4)):
+        inputs = prepare_data((m_total, k_dim), dtype)
+        target = prepare_data((m_total, n_dim), dtype)
+        w_init = prepare_data((num_experts, n_dim, k_dim), dtype) * 0.1
+        indices = _make_indices(num_groups, num_experts, device)
+
+        mxfp8_losses = _train(
+            lambda x, w, idx: mxfp8_grouped_gemm(x, w, idx, trans_weights=True),
+            inputs, w_init, indices, target, steps, lr,
+        )
+        bf16_losses = _train(_bf16_grouped_matmul, inputs, w_init, indices, target, steps, lr)
+
+        print(f"\n===== num_experts={num_experts} =====")
+        print(f"{'step':>4} {'mxfp8':>14} {'bf16':>14}")
+        for i, (ml, bl) in enumerate(zip(mxfp8_losses, bf16_losses)):
+            print(f"{i:>4} {ml:>14.6f} {bl:>14.6f}")
+
+        ax.plot(mxfp8_losses, label="mxfp8 (e4m3)", color="tab:blue")
+        ax.plot(bf16_losses, label="bf16 baseline", color="tab:orange", linestyle="--")
+        ax.set_yscale("log")
+        ax.set_xlabel("step")
+        ax.set_ylabel("MSE loss (log)")
+        ax.set_title(f"toy MoE, num_experts={num_experts}")
+        ax.legend()
+        ax.grid(True, which="both", alpha=0.3)
+
+    fig.tight_layout()
+    out_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "e2e_moe_loss_curve.png")
+    fig.savefig(out_path, dpi=120)
+    print(f"\nsaved curve to {out_path}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/tests/unittest/mxfp8/test_e2e_moe.py b/tests/unittest/mxfp8/test_e2e_moe.py
new file mode 100644
index 0000000..b3e102e
--- /dev/null
+++ b/tests/unittest/mxfp8/test_e2e_moe.py
@@ -0,0 +1,90 @@
+# Copyright (c) 2026 Advanced Micro Devices, Inc.
+#
+# SPDX-License-Identifier: MIT
+"""toy MoE training sanity for mxfp8 grouped GEMM (PLAN.md §7).
+
+Single-step autograd correctness is already covered in
+test_mxfp8_grouped_gemm_backward.py. This file validates the §0 assumption that
+V1 all-e4m3 is good enough to *train* a toy MoE for ~100 steps without diverging:
+loss must stay finite, trend down, and track a bf16 baseline run with the same
+init / data / routing.
+"""
+
+import pytest
+import torch
+
+from alto.kernels.mxfp8.mxfp8_grouped_gemm import mxfp8_grouped_gemm
+from alto.kernels.mxfp8.mxfp8_grouped_gemm.autotune import ALIGN_SIZE_M
+
+from .utils import prepare_data
+
+
+def _make_indices(num_groups, num_experts, device):
+    """Contiguous routing: group g -> expert (g % num_experts), aligned to ALIGN_SIZE_M."""
+    indices = torch.empty(num_groups * ALIGN_SIZE_M, dtype=torch.int32, device=device)
+    for g in range(num_groups):
+        indices[g * ALIGN_SIZE_M:(g + 1) * ALIGN_SIZE_M] = g % num_experts
+    return indices
+
+
+def _bf16_grouped_matmul(inputs, expert_weights, indices):
+    """bf16 reference for Y = X @ W[expert]^T, per contiguous group."""
+    m_total, n_dim = inputs.shape[0], expert_weights.shape[1]
+    out = torch.zeros((m_total, n_dim), dtype=inputs.dtype, device=inputs.device)
+    for start in range(0, m_total, ALIGN_SIZE_M):
+        end = start + ALIGN_SIZE_M
+        out[start:end] = inputs[start:end] @ expert_weights[indices[start].item()].t()
+    return out
+
+
+def _train(grouped_fn, inputs, w_init, indices, target, steps, lr):
+    """Run `steps` of SGD fitting W so grouped_fn(X, W) -> target. Returns loss list."""
+    weights = w_init.clone().detach().requires_grad_(True)
+    losses = []
+    for _ in range(steps):
+        out = grouped_fn(inputs, weights, indices)
+        loss = torch.nn.functional.mse_loss(out.float(), target.float())
+        losses.append(loss.item())
+        weights.grad = None
+        loss.backward()
+        with torch.no_grad():
+            weights -= lr * weights.grad
+    return losses
+
+
+def _window_mean(xs, frac=0.2):
+    w = max(1, int(len(xs) * frac))
+    return sum(xs[:w]) / w, sum(xs[-w:]) / w
+
+
+@pytest.mark.parametrize("num_experts", [2, 4])
+def test_toy_moe_trains_and_tracks_bf16(num_experts):
+    device = torch.device("cuda")
+    dtype = torch.bfloat16
+    num_groups, n_dim, k_dim = 4, 128, 128
+    m_total = num_groups * ALIGN_SIZE_M
+    steps, lr = 100, 0.5
+
+    inputs = prepare_data((m_total, k_dim), dtype)
+    target = prepare_data((m_total, n_dim), dtype)
+    w_init = prepare_data((num_experts, n_dim, k_dim), dtype) * 0.1
+    indices = _make_indices(num_groups, num_experts, device)
+
+    mxfp8_losses = _train(
+        lambda x, w, idx: mxfp8_grouped_gemm(x, w, idx, trans_weights=True),
+        inputs, w_init, indices, target, steps, lr,
+    )
+    bf16_losses = _train(_bf16_grouped_matmul, inputs, w_init, indices, target, steps, lr)
+
+    # 1. Finite throughout — the core "does e4m3 blow up" check.
+    assert all(torch.isfinite(torch.tensor(l)) for l in mxfp8_losses), \
+        f"mxfp8 loss went non-finite: {mxfp8_losses}"
+
+    # 2. Loss trends down (robust to per-step quant jitter: compare window means).
+    head, tail = _window_mean(mxfp8_losses)
+    assert tail < head * 0.9, f"mxfp8 loss did not trend down: head={head:.4f} tail={tail:.4f}"
+
+    # 3. Tracks the bf16 baseline — same init/data/routing, so curves should be same shape.
+    bf16_head, bf16_tail = _window_mean(bf16_losses)
+    assert tail < bf16_tail * 2.0, \
+        f"mxfp8 final loss diverged from bf16: mxfp8_tail={tail:.4f} bf16_tail={bf16_tail:.4f}"

From b7fa4e1d1525521c89e500acf5ba32a262b0d468 Mon Sep 17 00:00:00 2001
From: Yue Sun <Yue.Sun2@amd.com>
Date: Mon, 15 Jun 2026 08:33:04 +0000
Subject: [PATCH 11/12] mxfp8: add offsets entry + padded buffer for grouped
 GEMM, wire dispatch

---
 alto/kernels/dispatch/tensor.py               | 32 ++++++-
 alto/kernels/mxfp8/MXFP8_GROUPED_GEMM_PLAN.md | 52 +++++++++---
 .../mxfp8/mxfp8_grouped_gemm/__init__.py      |  7 +-
 .../mxfp8/mxfp8_grouped_gemm/cg_backward.py   | 34 ++++----
 .../mxfp8/mxfp8_grouped_gemm/cg_forward.py    | 17 ++--
 .../mxfp8/mxfp8_grouped_gemm/functional.py    | 37 ++++++++
 .../mxfp8/test_mxfp8_grouped_gemm_backward.py | 84 ++++++++++++++++++-
 .../mxfp8/test_mxfp8_grouped_gemm_forward.py  |  4 +-
 8 files changed, 227 insertions(+), 40 deletions(-)

diff --git a/alto/kernels/dispatch/tensor.py b/alto/kernels/dispatch/tensor.py
index de96b9f..c6070c7 100644
--- a/alto/kernels/dispatch/tensor.py
+++ b/alto/kernels/dispatch/tensor.py
@@ -22,6 +22,7 @@
     _quantize_then_nvfp4_scaled_grouped_mm,
 )
 from alto.kernels.mxfp8.mxfp8_linear import _to_mxfp8_then_scaled_mm
+from alto.kernels.mxfp8.mxfp8_grouped_gemm import _quantize_then_mxfp8_scaled_grouped_mm
 from .config import TrainingOpConfig
 
 aten = torch.ops.aten
@@ -403,9 +404,34 @@ class MXFP8TrainingWeightWrapperTensor(TrainingWeightWrapperBaseTensor):
     @classmethod
     def __torch_function__(cls, func, types, args, kwargs={}):
         if func.__name__ == "_grouped_mm":
-            raise NotImplementedError(
-                "MXFP8 _grouped_mm is not supported by this dispatch path; "
-                "restrict MXFP8 schemes to Linear targets."
+            # Routed-expert MoE path: 2d activations x 3d weights with offsets.
+            A, B = args[0], args[1]
+            bias = kwargs.get("bias", None)
+            offs = kwargs.get("offs", None)
+
+            assert not isinstance(A, cls), f"A should not be a {cls.__name__}"
+            assert isinstance(B, cls), f"B should be a {cls.__name__}"
+            assert A.ndim == 2 and B.ndim == 3 and offs is not None, (
+                "Only 2d x 3d with offsets is supported for MXFP8 grouped_mm"
+            )
+            assert bias is None, "Bias is not supported for grouped_mm"
+
+            config = B.config
+            assert config.precision == "mxfp8_e4m3", (
+                "MXFP8 grouped_mm V1 supports only mxfp8_e4m3; "
+                f"got {config.precision} (e5m2 grouped path is not yet validated)"
+            )
+            assert not config.use_hadamard and not config.use_dge, (
+                "MXFP8 grouped_mm V1 does not support Hadamard or DGE options."
+            )
+
+            return _quantize_then_mxfp8_scaled_grouped_mm(
+                A,
+                B,
+                offs=offs,
+                use_2dblock_x=config.use_2dblock_x,
+                use_2dblock_w=config.use_2dblock_w,
+                use_sr_grad=config.use_sr_grad,
             )
 
         if func.__name__ in gemm_ops:
diff --git a/alto/kernels/mxfp8/MXFP8_GROUPED_GEMM_PLAN.md b/alto/kernels/mxfp8/MXFP8_GROUPED_GEMM_PLAN.md
index d675f53..47b6f77 100644
--- a/alto/kernels/mxfp8/MXFP8_GROUPED_GEMM_PLAN.md
+++ b/alto/kernels/mxfp8/MXFP8_GROUPED_GEMM_PLAN.md
@@ -224,10 +224,8 @@ functional.py        # 暴露顶层入口
 
 V1「最小可用 kernel」在**算子数值正确性**这层基本达标（三 pass 数值对、autograd 通、toy 训练 100 步不发散）。但「支持 GPT-OSS training」卡在算子与真实 MoE 之间的**接口契约**与若干前置验证上。下列待办按接入优先级排：
 
-- [ ] **【阻塞 · 头号】offsets 入口 + padded buffer 支持，对齐 mxfp4/nvfp4。**（方案已批准，待实施，见 §8.1）
-  现状（`cg_forward.py:247-249` 等）：入口要求 `M_total % 128 == 0`、每 128 token 整块同一 expert、`expert_indices.numel() == M_total`，且 forward 注释明确「V1 不支持 padded buffer」。
-  真实 MoE（含 GPT-OSS）路由后每个 expert 的 token 数是**动态、不等、不保证 128 对齐**的 → 现状下 GPT-OSS 给不出 mxfp8 能吃的输入。
-  参照：mxfp4 用户传 `offs`（累积 offset，如 `[128,128,256,...]`），内部 `create_indices_from_offsets_nosync`（`mxfp4/.../functional.py:23`）转 indices；nvfp4 另有 `test_nvfp4_grouped_gemm_accepts_padded_buffer` 覆盖 `M_bufferlen > 实际 token 数` 的补零场景。mxfp8 需补同款 `offsets` 入口 + `M_bufferlen` vs `M_total` 区分。
+- [x] **offsets 入口 + padded buffer 支持，对齐 mxfp4/nvfp4。**（2026-06-10 已实施，见 §8.1）
+  新增 dispatch 入口 `_quantize_then_mxfp8_scaled_grouped_mm(A, B, offs, ...)`，并把三个 wrapper 改为区分 `M_bufferlen`(=`inputs/grad_output.shape[0]`) 与 `M_total`(=`expert_indices.numel()`=`offs[-1]`)：M_total 仍须 128 对齐，但 buffer 尾部可有 padding 行（输出/梯度恒 0）。真实 MoE（含 GPT-OSS）路由后每 expert token 数动态、不等、不保证 128 对齐 + 上游 padded buffer → 现在 mxfp8 能直接吃。Triton kernel 零改动（详见 §8.1）。
 
 - [x] **CDNA4/m355 真机验证默认 `tl.dot_scaled` 路径。**
   2026-06-10 已在 MI355X（CDNA4 / m355）`gracious_lovelace` 容器中重跑 forward + backward 52 个 grouped GEMM 单测，结果 **52 passed, 14 warnings in 18.51s**。结论：m355 默认 `tl.dot_scaled` 路径的 fwd / dgrad / wgrad 数值正确性单测已通过。
@@ -238,14 +236,14 @@ V1「最小可用 kernel」在**算子数值正确性**这层基本达标（三
 - [ ] **【中】性能可用性验证。**
   wgrad 为「每 tile 扫所有 group」的 O(experts) 朴素实现、`BLOCK_SIZE_K=32`、无 autotune（§4 划线项）。能跑通 ≠ 训得起，GPT-OSS 规模下需实测吞吐，必要时提前做 split-K / autotune（原列为 v2）。
 
-- [ ] **【中】接入形态对齐。**
-  `mxfp8_grouped_gemm` 签名要能直接替换 GPT-OSS MoE forward 中的 grouped GEMM 调用；具体集成视 GPT-OSS 训练栈 PR 时再定。
+- [x] **接入形态对齐（dispatch subclass wiring）。**（2026-06-10 已实施，见 §8.2）
+  `MXFP8TrainingWeightWrapperTensor.__torch_function__` 的 `_grouped_mm` 分支原为 `raise NotImplementedError`，现已照搬 mxfp4 模式接上 §8.1 的 `_quantize_then_mxfp8_scaled_grouped_mm`，并加 V1 边界保护（只收 `mxfp8_e4m3`，拒 e5m2 grouped / hadamard / dge）。至此 `lpt_recipe.yaml` 全链路（modifier 白名单 → conversion 选类 → dispatch 路由 → offsets/padding 入口 → 算子）打通，接 GPT-OSS **无需再写新代码**，只需改 recipe 配置（见 §8.2）。
 
-一句话：**发动机（算子）基本造好，CDNA4/m355 路试已过；但接进整车（GPT-OSS）所需的传动接口（offsets/padding）以及很可能必需的 e5m2，都还没做。** 最小可用 kernel ≈ 75% 到位，差的恰是「接真实模型」这一段。
+一句话：**发动机（算子）造好，CDNA4/m355 路试已过，传动接口（offsets/padding）与 dispatch wiring 都接好了；接 GPT-OSS 代码侧已就绪，只差改 recipe 配置。剩下的真正风险是很可能必需的 e5m2、性能实测、以及整网真训的数值收敛。** 最小可用 kernel ≈ 90% 到位，差的主要是「真训路况下的数值/性能验证」。
 
-### 8.1 offsets 入口 + padded buffer 实施方案（已批准 · 待实施）
+### 8.1 offsets 入口 + padded buffer 实施方案（✅ 已实施 · 2026-06-10）
 
-> 状态：方案已评审通过，**代码尚未动手**。本节是落地蓝图，实施时按此执行并回填结果。
+> 状态：**已落地并通过测试**（MI300X/CDNA3，54 passed）。本节为实施记录。
 
 **核心发现：三个 Triton kernel 无需改动。** 它们已用 `M_TOTAL` 作为迭代上界并 `offs_m < M_TOTAL` 掩码；输出张量是独立 `torch.zeros` 分配，padding 行从不被写入、天然保持 0。buffer-vs-routed 的混淆**只存在于 Python wrapper**。两个长度的定义：
 - **M_total** = 路由 token 数 = `expert_indices.numel()` = `offs[-1]`，必须 128 对齐（GPT-OSS 把每 expert 的 token 数向上 padding 到 128）。
@@ -257,15 +255,45 @@ V1「最小可用 kernel」在**算子数值正确性**这层基本达标（三
 
 2. **`cg_backward.py` 两个 wrapper**（kernel 不动）：dgrad/wgrad 同样拆 `M_bufferlen`(=`grad_output.shape[0]`) vs `M_total`(=`expert_indices.numel()`)；`grad_inputs` 按 bufferlen 分配；scale shape 按 bufferlen；grid 用 M_total。wgrad kernel 只遍历 `M_TOTAL // GROUP_SIZE_M` 个路由 group → padding 行永不累加进 dW。
 
-3. **`functional.py` 新增 dispatch 入口**（对齐 GPT-OSS 约定，零改动 drop-in）：`_quantize_then_mxfp8_scaled_grouped_mm(A, B, offs, *, use_2dblock_x=False, use_2dblock_w=False, use_sr_grad=False, fwd_format="e4m3", bwd_grad_format="e4m3")`。`B` 以 dispatch 布局 `[E,K,N]` 传入，入口内 `B.transpose(-2,-1).contiguous()` 转成 canonical `[E,N,K]`（仿 nvfp4 `functional.py:149`），再以 `trans_weights=True` 调 `MXFP8GroupedGEMM.apply`；`offs` 经 `create_indices_from_offsets_nosync`（`alto/kernels/dsgemm_utils.py`）转 indices。`__init__.py` 导出该入口。
+3. **`functional.py` 新增 dispatch 入口**（对齐 GPT-OSS 约定，零改动 drop-in）：`_quantize_then_mxfp8_scaled_grouped_mm(A, B, offs, *, use_2dblock_x=False, use_2dblock_w=True, use_sr_grad=False, fwd_format="e4m3", bwd_grad_format="e4m3")`。`offs` 经 `create_indices_from_offsets_nosync`（`alto/kernels/dsgemm_utils.py`）转 indices。`__init__.py` 导出该入口。
+   > **实施偏离蓝图（已批准）**：原蓝图拟仿 nvfp4 在入口内 `B.transpose(-2,-1).contiguous()` 转 canonical `[E,N,K]` 再以 `trans_weights=True` 调。最终改采 **mxfp4 式**：`B` 以 dispatch 布局 `[E,K,N]` 原样传入，`trans_weights=False`，**不做 transpose 拷贝**（仿 `mxfp4/.../functional.py:_quantize_then_mxfp_scaled_grouped_mm` 的 7 行最简模板）。理由：省每步一次 `E×K×N` 权重拷贝、模板更简。代价是 mxfp8 的 `trans_weights=False` 路径原本单测覆盖较少——故新增的 padded-buffer 测试**专门走该路径**补强覆盖。
 
 4. **`MXFP8GroupedGEMM` autograd**：结构无需改（已用 bufferlen 张量 + ctx 透传 indices）；仅需确认 M 轴量化 `convert_to_mxfp8(..., axis=0)` 在 bufferlen buffer 上成立（要求 `M_bufferlen % 32 == 0`，由新断言保证）。
 
 5. **测试**（`test_mxfp8_grouped_gemm_backward.py`）：新增 `test_mxfp8_grouped_gemm_accepts_padded_buffer`，采 **padded-vs-unpadded 自比**（最强校验，闭环证明 padding 零干扰）。**关键：固定 `use_sr_grad=False`** 使量化确定性，否则随机舍入令 `torch.equal` 偶发失败；routed 行两次跑应逐位相等。断言：`y_pad.shape==(M_bufferlen,N)`、`y_pad[M_routed:]` 全 0、`y_pad[:M_routed]==y_ref`、`inputs_pad.grad[M_routed:]` 全 0、`inputs_pad.grad[:M_routed]==inputs_ref.grad`、`w_pad.grad==w_ref.grad`；另加 offsets 入口 smoke test。
 
-**验证**：现有 54 用例不回归 + 新 padded-buffer 测试通过；额外确认「若 wrapper 仍按 `[M_total,N]` 分配则新测试会失败」以证明确实触发了 padding 路径。本改动 device-agnostic（只动 wrapper/入口）；CDNA4/m355 路试已于 2026-06-10 通过（见 §6 验证记录）。
+5b. **smoke test**：另加 `test_mxfp8_dispatch_entry_offsets_matches_indices`——同一路由下 offsets 入口与 indices 路径输出 SNR > 30 dB，验证 `create_indices_from_offsets_nosync` round-trip 正确。
 
-**完成后**：勾选 §8 头号 item，并在 §3 相应 Step 回填新入口 `_quantize_then_mxfp8_scaled_grouped_mm` 与 padded-buffer 测试。
+**验证记录**（2026-06-10，MI300X / CDNA3）：`python -m pytest tests/unittest/mxfp8/test_mxfp8_grouped_gemm_forward.py tests/unittest/mxfp8/test_mxfp8_grouped_gemm_backward.py -q` → **54 passed**（52 原有不回归 + 2 新增：padded-buffer 自比、offsets smoke）。padded-buffer 测试的 `y_pad.shape==(M_bufferlen,N)` 断言本身即负控——若 wrapper 仍按 `[M_total,N]` 分配则该断言失败，证明确实触发 padding 路径。本改动 device-agnostic（只动 wrapper/入口，Triton kernel 零改动）；CDNA4/m355 路试已于 2026-06-10 通过（见 §6 验证记录）。
+
+**后续（已于同日完成）**：dispatch `__torch_function__` subclass wiring（`alto/kernels/dispatch/tensor.py`，即 §8「接入形态对齐」）当时列为后续 PR，已于 2026-06-10 一并实施，见 §8.2。
+
+### 8.2 dispatch wiring + GPT-OSS 接入（✅ 已实施 · 2026-06-10）
+
+> 状态：**已落地并验证**（MI300X/CDNA3）。打通 `lpt_recipe.yaml` 到算子的全链路。
+
+**全链路**：`lpt_recipe.yaml` → `LowPrecisionTrainingModifier`（`alto/modifiers/lpt/base.py`，已支持 `mxfp8_e4m3` scheme 与 `GptOssGroupedExperts` target）→ `swap_params` → `conversion.py:_get_tensor_cls_for_config` 选 `MXFP8TrainingWeightWrapperTensor` → 训练时 MoE 调 `torch._grouped_mm` → wrapper `__torch_function__` 拦截 → §8.1 入口 → 算子。除 dispatch 分支外其余环节本就齐备。
+
+**改动（`alto/kernels/dispatch/tensor.py`）**：`MXFP8TrainingWeightWrapperTensor.__torch_function__` 的 `_grouped_mm` 分支原为 `raise NotImplementedError("... restrict MXFP8 schemes to Linear targets.")`，改为照搬 mxfp4 分支模式：取 `A/B/offs`，断言 2d×3d+offs，调 `_quantize_then_mxfp8_scaled_grouped_mm(A, B, offs=offs, use_2dblock_x/w, use_sr_grad)`。**V1 边界保护**：断言 `config.precision == "mxfp8_e4m3"`（拒 e5m2 grouped——该路径未验证）、`not use_hadamard and not use_dge`。新增顶部 import。
+
+**验证记录**（2026-06-10，MI300X / CDNA3）：
+- 端到端 dispatch smoke：包一个 `MXFP8TrainingWeightWrapperTensor` 权重 `[E,K,N]`，`torch._grouped_mm(A, B, offs=[128,256,384,512])` → forward `(512,256)` finite、`.sum().backward()` 后 `A.grad` finite。
+- 边界：`mxfp8_e5m2` 与 `use_hadamard=True` 均被正确 `AssertionError` 拒绝。
+- 回归：`test_mxfp8_grouped_gemm_forward.py` + `_backward.py` **54 passed** 无回归。
+
+**接 GPT-OSS：无需再写代码，只改 recipe 配置。** `alto/models/gpt_oss/configs/lpt_recipe.yaml`（当前为 mxfp4）改成最小 e4m3 V1 需动 3 行（其余保持）：
+
+| 字段 | 当前(mxfp4) | mxfp8 V1 | 原因 |
+|---|---|---|---|
+| `scheme` | `mxfp4` | `mxfp8_e4m3` | 切格式 |
+| `use_hadamard` | `true` | `false` | mxfp8 grouped 不支持（dispatch 断言拒绝） |
+| `use_sr_grad` | `true` | `false` | V1 grouped sr 路径未验证 |
+| `use_2dblock_x` | `false` | `false` | 支持，保持 |
+| `use_2dblock_w` | `true` | `true` | 支持，保持 |
+| `use_dge` / `clip_mode` / `two_level_scaling` | `false`/`none`/`none` | 同 | 已关，保持 |
+| `targets` / `ignore` | 不变 | 不变 | `["Linear","GptOssGroupedExperts"]` 均支持 |
+
+**接入后仍需注意**（非本次范围，属真训验证）：① 本机为 CDNA3 dequant fallback，GPT-OSS 真训若在 CDNA4 走默认 `tl.dot_scaled`，算子单测已在 m355 过（§6），但**整网真训未跑**；② §0 风险——全 e4m3 多层+几千步可能发散（toy 仅 100 步单层），能跑 ≠ 能收敛，真训需盯 loss，必要时回到「e5m2 混合格式」open item。
 
 ---
 
diff --git a/alto/kernels/mxfp8/mxfp8_grouped_gemm/__init__.py b/alto/kernels/mxfp8/mxfp8_grouped_gemm/__init__.py
index b2bb4e6..4bc9915 100644
--- a/alto/kernels/mxfp8/mxfp8_grouped_gemm/__init__.py
+++ b/alto/kernels/mxfp8/mxfp8_grouped_gemm/__init__.py
@@ -1,6 +1,9 @@
 # Copyright (c) 2026 Advanced Micro Devices, Inc.
 # SPDX-License-Identifier: MIT
 
-from alto.kernels.mxfp8.mxfp8_grouped_gemm.functional import mxfp8_grouped_gemm
+from alto.kernels.mxfp8.mxfp8_grouped_gemm.functional import (
+    mxfp8_grouped_gemm,
+    _quantize_then_mxfp8_scaled_grouped_mm,
+)
 
-__all__ = ["mxfp8_grouped_gemm"]
+__all__ = ["mxfp8_grouped_gemm", "_quantize_then_mxfp8_scaled_grouped_mm"]
diff --git a/alto/kernels/mxfp8/mxfp8_grouped_gemm/cg_backward.py b/alto/kernels/mxfp8/mxfp8_grouped_gemm/cg_backward.py
index 4daf682..c0a6c6a 100644
--- a/alto/kernels/mxfp8/mxfp8_grouped_gemm/cg_backward.py
+++ b/alto/kernels/mxfp8/mxfp8_grouped_gemm/cg_backward.py
@@ -332,12 +332,15 @@ def mxfp8_grouped_gemm_backward_inputs(
     if use_dot_scaled is None:
         use_dot_scaled = is_cdna4()
 
-    M_total, N = grad_output.shape
+    M_bufferlen, N = grad_output.shape
+    M_total = expert_indices.numel()
     torch._check(M_total > 0)
+    assert M_bufferlen >= M_total, \
+        f"M_bufferlen ({M_bufferlen}) must be >= M_total ({M_total})"
+    assert M_bufferlen % BLOCK_SIZE_DEFAULT == 0, \
+        f"M_bufferlen ({M_bufferlen}) must be a multiple of block_size ({BLOCK_SIZE_DEFAULT})"
     assert M_total % ALIGN_SIZE_M == 0, \
         f"M_total ({M_total}) must be a multiple of group_size_m ({ALIGN_SIZE_M})"
-    assert expert_indices.numel() == M_total, \
-        f"expert_indices length ({expert_indices.numel()}) must match M_total ({M_total})"
     if N % BLOCK_SIZE_DEFAULT != 0:
         raise ValueError(f"N ({N}) must be divisible by block_size ({BLOCK_SIZE_DEFAULT})")
 
@@ -368,16 +371,16 @@ def mxfp8_grouped_gemm_backward_inputs(
         raise ValueError(f"K ({K}) must be divisible by block_size ({BLOCK_SIZE_DEFAULT})")
 
     expected_go_scales = (
-        (M_total // BLOCK_SIZE_DEFAULT, N // BLOCK_SIZE_DEFAULT)
+        (M_bufferlen // BLOCK_SIZE_DEFAULT, N // BLOCK_SIZE_DEFAULT)
         if use_2dblock_x else
-        (M_total, N // BLOCK_SIZE_DEFAULT)
+        (M_bufferlen, N // BLOCK_SIZE_DEFAULT)
     )
     assert go_scales.shape == torch.Size(expected_go_scales), \
         f"go_scales shape {go_scales.shape} must be {expected_go_scales}"
     assert expert_weight_scales.shape == torch.Size(expected_weight_scales), \
         f"expert_weight_scales shape {expert_weight_scales.shape} must be {expected_weight_scales}"
 
-    grad_inputs = torch.zeros((M_total, K), device=grad_output.device, dtype=output_dtype)
+    grad_inputs = torch.zeros((M_bufferlen, K), device=grad_output.device, dtype=output_dtype)
     stride_gom, stride_gon = grad_output.stride()
     stride_gosm, stride_gosn = go_scales.stride()
     stride_gim, stride_gik = grad_inputs.stride()
@@ -433,14 +436,17 @@ def mxfp8_grouped_gemm_backward_weights(
     if use_dot_scaled is None:
         use_dot_scaled = is_cdna4()
 
-    M_total, N = grad_output.shape
+    M_bufferlen, N = grad_output.shape
     M_inputs, K = inputs.shape
-    assert M_inputs == M_total, f"inputs M ({M_inputs}) must match grad_output M ({M_total})"
+    assert M_inputs == M_bufferlen, f"inputs M ({M_inputs}) must match grad_output M ({M_bufferlen})"
+    M_total = expert_indices.numel()
     torch._check(M_total > 0)
+    assert M_bufferlen >= M_total, \
+        f"M_bufferlen ({M_bufferlen}) must be >= M_total ({M_total})"
+    assert M_bufferlen % BLOCK_SIZE_DEFAULT == 0, \
+        f"M_bufferlen ({M_bufferlen}) must be a multiple of block_size ({BLOCK_SIZE_DEFAULT})"
     assert M_total % ALIGN_SIZE_M == 0, \
         f"M_total ({M_total}) must be a multiple of group_size_m ({ALIGN_SIZE_M})"
-    assert expert_indices.numel() == M_total, \
-        f"expert_indices length ({expert_indices.numel()}) must match M_total ({M_total})"
     if N % BLOCK_SIZE_DEFAULT != 0:
         raise ValueError(f"N ({N}) must be divisible by block_size ({BLOCK_SIZE_DEFAULT})")
     if K % BLOCK_SIZE_DEFAULT != 0:
@@ -450,14 +456,14 @@ def mxfp8_grouped_gemm_backward_weights(
         expert_indices = expert_indices.to(torch.int32)
 
     expected_go_scales = (
-        (M_total // BLOCK_SIZE_DEFAULT, N // BLOCK_SIZE_DEFAULT)
+        (M_bufferlen // BLOCK_SIZE_DEFAULT, N // BLOCK_SIZE_DEFAULT)
         if use_2dblock_go else
-        (M_total // BLOCK_SIZE_DEFAULT, N)
+        (M_bufferlen // BLOCK_SIZE_DEFAULT, N)
     )
     expected_input_scales = (
-        (M_total // BLOCK_SIZE_DEFAULT, K // BLOCK_SIZE_DEFAULT)
+        (M_bufferlen // BLOCK_SIZE_DEFAULT, K // BLOCK_SIZE_DEFAULT)
         if use_2dblock_x else
-        (M_total // BLOCK_SIZE_DEFAULT, K)
+        (M_bufferlen // BLOCK_SIZE_DEFAULT, K)
     )
     assert go_scales.shape == torch.Size(expected_go_scales), \
         f"go_scales shape {go_scales.shape} must be {expected_go_scales}"
diff --git a/alto/kernels/mxfp8/mxfp8_grouped_gemm/cg_forward.py b/alto/kernels/mxfp8/mxfp8_grouped_gemm/cg_forward.py
index fa6b143..0016efd 100644
--- a/alto/kernels/mxfp8/mxfp8_grouped_gemm/cg_forward.py
+++ b/alto/kernels/mxfp8/mxfp8_grouped_gemm/cg_forward.py
@@ -219,7 +219,7 @@ def _kernel_mxfp8_grouped_gemm_forward(
 
 @triton_op("alto::mxfp8_grouped_gemm_forward", mutates_args={})
 def mxfp8_grouped_gemm_forward(
-    inputs: torch.Tensor,            # [M_total, K], fp8
+    inputs: torch.Tensor,            # [M_bufferlen, K], fp8 (M_bufferlen may be padded >= M_total)
     expert_weights: torch.Tensor,    # [num_experts, N, K] (trans) or [num_experts, K, N], fp8
     expert_indices: torch.Tensor,    # [M_total], int32
     input_scales: torch.Tensor,
@@ -242,12 +242,15 @@ def mxfp8_grouped_gemm_forward(
     if use_dot_scaled is None:
         use_dot_scaled = is_cdna4()
 
-    M_total, K = inputs.shape
+    M_bufferlen, K = inputs.shape
+    M_total = expert_indices.numel()
     torch._check(M_total > 0)
+    assert M_bufferlen >= M_total, \
+        f"M_bufferlen ({M_bufferlen}) must be >= M_total ({M_total})"
+    assert M_bufferlen % BLOCK_SIZE_DEFAULT == 0, \
+        f"M_bufferlen ({M_bufferlen}) must be a multiple of block_size ({BLOCK_SIZE_DEFAULT})"
     assert M_total % ALIGN_SIZE_M == 0, \
         f"M_total ({M_total}) must be a multiple of group_size_m ({ALIGN_SIZE_M})"
-    assert expert_indices.numel() == M_total, \
-        f"expert_indices length ({expert_indices.numel()}) must match M_total ({M_total})"
     if K % BLOCK_SIZE_DEFAULT != 0:
         raise ValueError(f"K ({K}) must be divisible by block_size ({BLOCK_SIZE_DEFAULT})")
 
@@ -268,9 +271,9 @@ def mxfp8_grouped_gemm_forward(
         raise ValueError(f"N ({N}) must be divisible by block_size ({BLOCK_SIZE_DEFAULT}) for 2D weight scales")
 
     expected_input_scales = (
-        (M_total // BLOCK_SIZE_DEFAULT, K // BLOCK_SIZE_DEFAULT)
+        (M_bufferlen // BLOCK_SIZE_DEFAULT, K // BLOCK_SIZE_DEFAULT)
         if use_2dblock_x else
-        (M_total, K // BLOCK_SIZE_DEFAULT)
+        (M_bufferlen, K // BLOCK_SIZE_DEFAULT)
     )
     if trans_weights:
         expected_weight_scales = (
@@ -289,7 +292,7 @@ def mxfp8_grouped_gemm_forward(
     assert weight_scales.shape == torch.Size(expected_weight_scales), \
         f"weight_scales shape {weight_scales.shape} must be {expected_weight_scales}"
 
-    output = torch.zeros((M_total, N), device=inputs.device, dtype=output_dtype)
+    output = torch.zeros((M_bufferlen, N), device=inputs.device, dtype=output_dtype)
 
     stride_am, stride_ak = inputs.stride()
     stride_asm, stride_ask = input_scales.stride()
diff --git a/alto/kernels/mxfp8/mxfp8_grouped_gemm/functional.py b/alto/kernels/mxfp8/mxfp8_grouped_gemm/functional.py
index 06b4802..9194365 100644
--- a/alto/kernels/mxfp8/mxfp8_grouped_gemm/functional.py
+++ b/alto/kernels/mxfp8/mxfp8_grouped_gemm/functional.py
@@ -4,6 +4,7 @@
 
 import torch
 
+from alto.kernels.dsgemm_utils import create_indices_from_offsets_nosync
 from alto.kernels.mxfp8.mxfp8_grouped_gemm.cg_backward import MXFP8GroupedGEMM
 
 
@@ -40,3 +41,39 @@ def mxfp8_grouped_gemm(
         fwd_format,
         bwd_grad_format,
     )
+
+
+def _quantize_then_mxfp8_scaled_grouped_mm(
+    A: torch.Tensor,       # [M_bufferlen, K], bf16/fp32; may be padded (M_bufferlen >= M_total)
+    B: torch.Tensor,       # [num_experts, K, N], dispatch convention (trans_weights=False)
+    offs: torch.Tensor,    # [num_experts], int32 cumulative offsets; offs[-1] == M_total (routed)
+    *,
+    use_2dblock_x: bool = False,
+    use_2dblock_w: bool = True,
+    use_sr_grad: bool = False,
+    fwd_format: str = "e4m3",
+    bwd_grad_format: str = "e4m3",
+) -> torch.Tensor:
+    """Dispatch-layer entry for mxfp8 grouped GEMM (GPT-OSS-style MoE call).
+
+    Mirrors the mxfp4 ``_quantize_then_mxfp_scaled_grouped_mm`` contract: ``offs``
+    is the 1-D cumulative offset tensor from the MoE routing layer (e.g.
+    [128, 256, 384, 512]), so ``offs[-1] == M_total`` (routed token count). ``A``
+    may have ``shape[0] > offs[-1]`` (padded activation buffer); trailing rows
+    produce zero output and zero gradient.
+
+    ``B`` is kept in dispatch convention ``[E, K, N]`` and passed straight through
+    with ``trans_weights=False`` (no transpose copy).
+    """
+    m_indices = create_indices_from_offsets_nosync(offs)
+    return mxfp8_grouped_gemm(
+        A,
+        B,
+        m_indices,
+        trans_weights=False,
+        use_2dblock_x=use_2dblock_x,
+        use_2dblock_w=use_2dblock_w,
+        use_sr_grad=use_sr_grad,
+        fwd_format=fwd_format,
+        bwd_grad_format=bwd_grad_format,
+    )
diff --git a/tests/unittest/mxfp8/test_mxfp8_grouped_gemm_backward.py b/tests/unittest/mxfp8/test_mxfp8_grouped_gemm_backward.py
index 37ab9f1..7018aa6 100644
--- a/tests/unittest/mxfp8/test_mxfp8_grouped_gemm_backward.py
+++ b/tests/unittest/mxfp8/test_mxfp8_grouped_gemm_backward.py
@@ -5,7 +5,10 @@
 import pytest
 import torch
 
-from alto.kernels.mxfp8.mxfp8_grouped_gemm import mxfp8_grouped_gemm
+from alto.kernels.mxfp8.mxfp8_grouped_gemm import (
+    mxfp8_grouped_gemm,
+    _quantize_then_mxfp8_scaled_grouped_mm,
+)
 from alto.kernels.mxfp8.mxfp8_grouped_gemm.autotune import ALIGN_SIZE_M
 from alto.kernels.mxfp8.mxfp8_grouped_gemm.cg_backward import (
     mxfp8_grouped_gemm_backward_inputs,
@@ -279,3 +282,82 @@ def test_autograd_many_experts_with_empty_expert():
     # Empty experts must get exactly zero weight gradient.
     assert torch.count_nonzero(expert_weights.grad[2:]) == 0, "unused experts must have zero dW"
     assert torch.count_nonzero(expert_weights.grad[:2]) > 0, "routed experts must have nonzero dW"
+
+
+# =============== offsets dispatch entry + padded buffer (PLAN.md §8.1) ===============
+
+def test_mxfp8_grouped_gemm_accepts_padded_buffer():
+    """Padded activation buffer (M_bufferlen > routed M_total) must not disturb
+    routed rows, and padding rows must stay zero in both output and gradients.
+
+    Padded-vs-unpadded self-comparison through the offsets dispatch entry (which
+    uses trans_weights=False, also boosting that path's coverage). use_sr_grad is
+    forced False so quantization is deterministic and torch.equal holds bitwise.
+    """
+    device = torch.device("cuda")
+    dtype = torch.bfloat16
+    M_routed, M_pad = 4 * ALIGN_SIZE_M, 2 * ALIGN_SIZE_M  # 512 routed, 256 padding
+    M_bufferlen = M_routed + M_pad
+    num_experts, K, N = 4, 256, 256
+
+    routed_rows = prepare_data((M_routed, K), dtype)
+    weights = prepare_data((num_experts, K, N), dtype)  # dispatch convention [E, K, N]
+    # Each expert owns one ALIGN_SIZE_M group of routed tokens -> cumulative offs.
+    offs = torch.tensor(
+        [(i + 1) * (M_routed // num_experts) for i in range(num_experts)],
+        dtype=torch.int32, device=device,
+    )
+
+    inputs_pad = torch.zeros(M_bufferlen, K, dtype=dtype, device=device)
+    inputs_pad[:M_routed] = routed_rows
+    inputs_pad.requires_grad_(True)
+    weights_pad = weights.clone().requires_grad_(True)
+    y_pad = _quantize_then_mxfp8_scaled_grouped_mm(
+        inputs_pad, weights_pad, offs,
+        use_2dblock_x=False, use_2dblock_w=False, use_sr_grad=False)
+
+    inputs_ref = routed_rows.clone().requires_grad_(True)
+    weights_ref = weights.clone().requires_grad_(True)
+    y_ref = _quantize_then_mxfp8_scaled_grouped_mm(
+        inputs_ref, weights_ref, offs,
+        use_2dblock_x=False, use_2dblock_w=False, use_sr_grad=False)
+
+    assert y_pad.shape == (M_bufferlen, N)
+    assert torch.equal(y_pad[M_routed:], torch.zeros_like(y_pad[M_routed:])), \
+        "padding output rows must be zero"
+    assert torch.equal(y_pad[:M_routed], y_ref), \
+        "routed output rows must be unaffected by buffer padding"
+
+    y_pad.sum().backward()
+    y_ref.sum().backward()
+    assert inputs_pad.grad.shape == (M_bufferlen, K)
+    assert torch.equal(inputs_pad.grad[M_routed:], torch.zeros_like(inputs_pad.grad[M_routed:])), \
+        "padding rows must receive zero input gradient"
+    assert torch.equal(inputs_pad.grad[:M_routed], inputs_ref.grad), \
+        "routed-row input gradients must be unaffected by buffer padding"
+    assert torch.equal(weights_pad.grad, weights_ref.grad), \
+        "weight gradients must be unaffected by buffer padding"
+
+
+def test_mxfp8_dispatch_entry_offsets_matches_indices():
+    """The offsets entry must equal the indices path: create_indices_from_offsets
+    round-trip is correct. No padding here (M_bufferlen == M_total)."""
+    device = torch.device("cuda")
+    dtype = torch.bfloat16
+    num_groups, num_experts, K, N = 4, 4, 256, 256
+    M_total = num_groups * ALIGN_SIZE_M
+
+    inputs = prepare_data((M_total, K), dtype)
+    weights = prepare_data((num_experts, K, N), dtype)  # [E, K, N] dispatch convention
+    # Contiguous round-robin routing: group g -> expert (g % num_experts).
+    indices = _make_indices(num_groups, num_experts, device)
+    # Matching cumulative offsets (one group per expert here, all sizes ALIGN_SIZE_M).
+    offs = torch.arange(1, num_groups + 1, dtype=torch.int32, device=device) * ALIGN_SIZE_M
+
+    y_indices = mxfp8_grouped_gemm(
+        inputs, weights, indices, trans_weights=False, use_sr_grad=False)
+    y_offsets = _quantize_then_mxfp8_scaled_grouped_mm(
+        inputs, weights, offs, use_sr_grad=False)
+
+    snr = calc_snr(y_indices.float(), y_offsets.float())
+    assert snr > 30, f"offsets entry vs indices path SNR too low: {snr:.1f}dB"
diff --git a/tests/unittest/mxfp8/test_mxfp8_grouped_gemm_forward.py b/tests/unittest/mxfp8/test_mxfp8_grouped_gemm_forward.py
index 5bd3b3b..a0dbd3d 100644
--- a/tests/unittest/mxfp8/test_mxfp8_grouped_gemm_forward.py
+++ b/tests/unittest/mxfp8/test_mxfp8_grouped_gemm_forward.py
@@ -162,13 +162,15 @@ def test_forward_rejects_indices_length_mismatch():
     data_type = torch.bfloat16
     inputs = prepare_data((M_total, K), data_type)
     expert_weights = prepare_data((num_experts, N, K), data_type)
+    # A routed count below the buffer is now the padded-buffer case, not an error;
+    # but a non-128-aligned routed count (127) must still be rejected.
     indices = torch.zeros(M_total - 1, dtype=torch.int32, device="cuda")
     x_lp, x_s = torch.ops.alto.convert_to_mxfp8(
         inputs, block_size=BLOCK_SIZE_DEFAULT, mxfp_format="e4m3", axis=-1, is_2d_block=False)
     w_lp, w_s = torch.ops.alto.convert_to_mxfp8(
         expert_weights, block_size=BLOCK_SIZE_DEFAULT, mxfp_format="e4m3", axis=-1, is_2d_block=False)
 
-    with pytest.raises(AssertionError, match="expert_indices length"):
+    with pytest.raises(AssertionError, match="multiple of group_size_m"):
         mxfp8_grouped_gemm_forward(x_lp, w_lp, indices, x_s, w_s)
 
 

From 54687cbbdee53ffc65df89b6e459255d89e08932 Mon Sep 17 00:00:00 2001
From: Yue Sun <Yue.Sun2@amd.com>
Date: Tue, 16 Jun 2026 09:00:48 +0000
Subject: [PATCH 12/12] refactor: merge and simplify mxfp8 grouped gemm related
 tests, update design and feature doc

---
 README.md                                     |   1 +
 alto/kernels/mxfp8/MXFP8_GROUPED_GEMM_PLAN.md |  61 +--
 .../mxfp8_grouped_gemm/tests/__init__.py      |   0
 tests/unittest/mxfp8/test_e2e_moe.py          |  12 +-
 .../unittest/mxfp8/test_mxfp8_grouped_gemm.py | 305 +++++++++++++++
 .../mxfp8/test_mxfp8_grouped_gemm_backward.py | 363 ------------------
 .../mxfp8/test_mxfp8_grouped_gemm_forward.py  | 204 ----------
 tests/unittest/mxfp8/utils.py                 |  20 +
 8 files changed, 362 insertions(+), 604 deletions(-)
 delete mode 100644 alto/kernels/mxfp8/mxfp8_grouped_gemm/tests/__init__.py
 create mode 100644 tests/unittest/mxfp8/test_mxfp8_grouped_gemm.py
 delete mode 100644 tests/unittest/mxfp8/test_mxfp8_grouped_gemm_backward.py
 delete mode 100644 tests/unittest/mxfp8/test_mxfp8_grouped_gemm_forward.py

diff --git a/README.md b/README.md
index bb4a5e2..2594283 100644
--- a/README.md
+++ b/README.md
@@ -20,6 +20,7 @@ Training-oriented kernels and schemes include:
 
 - **[Blockwise FP8](alto/kernels/blockwise_fp8)** — linear, grouped GEMM, and FlashAttention.
 - **[MXFP4](alto/kernels/fp4/mxfp4)** — linear, grouped GEMM, and FlashAttention.
+- **[MXFP8](alto/kernels/mxfp8)** — linear and grouped GEMM (block-scaled E4M3, with E5M2 reserved for gradients).
 
 Techniques used to narrow the gap versus BF16 include:
 
diff --git a/alto/kernels/mxfp8/MXFP8_GROUPED_GEMM_PLAN.md b/alto/kernels/mxfp8/MXFP8_GROUPED_GEMM_PLAN.md
index 47b6f77..d75033b 100644
--- a/alto/kernels/mxfp8/MXFP8_GROUPED_GEMM_PLAN.md
+++ b/alto/kernels/mxfp8/MXFP8_GROUPED_GEMM_PLAN.md
@@ -143,13 +143,11 @@ functional.py        # 暴露顶层入口
 **完成情况**：
 - `cg_forward.py` kernel body + wrapper 已实现。fp8 load 用 `other=0.0`；CDNA3 dequant 路径 `_dequantize_fp8` 一律传 `IS_2D_BLOCK=False`（scale 偏移已在 kernel 内展开为逐行 `[BLOCK, n_rep_k]`，与参考 `blockwise_mxfp8_gemm_kernel` 一致）。
 - wrapper 已补最小输入契约检查：`inputs`/`expert_weights` 维度、`expert_indices.numel() == M_total`、`M_total` 按 `ALIGN_SIZE_M` 对齐、`K % 32 == 0`、2D weight scale 的 `N % 32 == 0`、以及 input/weight scale shape 精确匹配。V1 仍不支持 padded buffer；如需支持，应像 mxfp4/nvfp4 一样显式区分 `M_bufferlen` 与 `M_total`。
-- wrapper 暴露了 `use_dot_scaled: Optional[bool] = None` 开关：`None` 时按设备自动选择（CDNA4→`tl.dot_scaled`，否则 dequant fallback），显式传 `False` 可在任意设备上强制走 CDNA3 dequant 路径用于测试。
-- 测试 `tests/unittest/mxfp8/test_mxfp8_grouped_gemm_forward.py` 共 **21 个**，主校验全部用 **dequant-then-matmul reference**（隔离 kernel 移植正确性 vs mxfp8 量化误差），cos-sim > 0.999 且 **SNR > 40 dB**（SNR 抓 cos-sim 看不到的幅度错误，如漏 scale / 错累加器），另加 bf16 宽松 sanity（> 0.99）：
-  - `test_forward`：2 shapes × 2D-block-x × 2D-block-w × `trans_weights` = 16 个正例。
-  - `test_forward_dequant_fallback_matches_dot_scaled`：强制 `use_dot_scaled=False`，无论运行设备都覆盖 `_dequantize_fp8 → tl.dot` 分支（真实 MI300 ground-truth 仍待 Step 7）。
-  - `test_forward_single_expert_matches_mxfp8_linear`：全部 token 路由到单 expert，与 `mxfp8_linear._to_mxfp8_then_scaled_mm` 交叉校验（SNR > 30 dB）。这是独立交叉验证——linear 路径用自己的 autograd Function 量化，能抓到 dequant-matmul reference 抓不到的量化 bug（后者与测试共用同一份 `convert_to_mxfp8` 输出，量化 bug 会两边同时错而仍通过）。
-  - 3 个负例：`expert_indices` 长度不匹配、`M_total` 未对齐 `ALIGN_SIZE_M`、`weight_scales` shape 错误。
-- 2026-06-02 在 `friendly_elgamal` 容器中验证：`is_cdna4()=True`，forward 默认走 **CDNA4 `tl.dot_scaled`** 路径。CDNA3 dequant 分支已由 `use_dot_scaled=False` 测试在 CI 中强制覆盖；真实 CDNA3/MI300 硬件 ground-truth 已于 2026-06-09 在 MI300X 复验（见 §6 验证记录）。
+- wrapper 暴露了 `use_dot_scaled: Optional[bool] = None` 开关：`None` 时按设备自动选择（CDNA4→`tl.dot_scaled`，否则 dequant fallback），显式传 `False` 可在任意设备上强制走 CDNA3 dequant 路径。
+- 测试见单文件 `tests/unittest/mxfp8/test_mxfp8_grouped_gemm.py` 的 forward section（2026-06-16 重构，详见 §6）。forward 主校验用 **dequant-then-matmul reference**（隔离 kernel 移植正确性 vs mxfp8 量化误差），cos-sim > 0.999 且 **SNR > 40 dB**，另加 bf16 宽松 sanity（> 0.99）：
+  - `test_forward`：2 shapes × 2D-block-x × 2D-block-w × `trans_weights` = 16 个正例。`use_dot_scaled` 不再做测试参数——走哪条路由底层 `is_cdna4()` 自动决定（对齐 linear/quantization 的做法），CDNA3 跑 fallback、CDNA4 跑 native，无需人为强制。
+  - `test_forward_single_expert_matches_mxfp8_linear`：全部 token 路由到单 expert，与 `mxfp8_linear._to_mxfp8_then_scaled_mm` 交叉校验（SNR > 30 dB）。独立交叉验证——linear 路径用自己的 autograd Function 量化，能抓 dequant-matmul reference 抓不到的量化 bug。
+- 2026-06-02 在 `friendly_elgamal` 容器中验证：`is_cdna4()=True`，forward 默认走 **CDNA4 `tl.dot_scaled`** 路径。真实 CDNA3/MI300 硬件 ground-truth 已于 2026-06-09 在 MI300X 复验（见 §6 验证记录）。
 
 ### Step 3 — Backward dgrad kernel ✅ 已完成
 基于 `mxfp4/cg_backward.py` 的 `_kernel_mxfp4_grouped_gemm_backward_dx`：
@@ -190,25 +188,34 @@ functional.py        # 暴露顶层入口
 
 ### Step 6 — 数值正确性测试 ⏳ 部分完成
 测试位置在 `tests/unittest/mxfp8/`：
-1. `test_mxfp8_grouped_gemm_forward.py`（**21 tests**）：见 Step 2 清单。
-2. `test_mxfp8_grouped_gemm_backward.py`（**31 tests**）：用 dequant-then-matmul reference（先 `convert_to_mxfp8`，再用 `convert_from_mxfp8_pytorch` 回 fp32，然后走 PyTorch 原生 GEMM）校验 dgrad/wgrad，并加 cos-sim > 0.999 + SNR > 40 dB 双重门槛；另覆盖 autograd 端到端 forward + backward。详见下方覆盖清单。
-3. `repro_mxfp8_dot_scaled.py` + `repro_mxfp8_dot_scaled.md`（见 §5 风险 2）：独立复现脚本，用一个最小 `tl.dot_scaled` kernel 对比 `convert_from_mxfp8(a) @ convert_from_mxfp8(b)`，验证「单次 `dot_scaled` 跨多个 32-wide scale group 会发散」这一约束。四个 case：`DOT_K=32` baseline、`DOT_K=32` over K=128 safe path、`DOT_K=64`（跨 2 group）与 `DOT_K=128`（跨 4 group）problem path，输入用 outlier-heavy 的 32-wide K block 放大 scale 差异，打印 `max_diff`/`mean_diff`/`relative_max_diff` 与 problem-vs-safe 的 mean_diff 比值。需在 CDNA4 环境手动运行（非 pytest 自动收集）。
-4. `test_e2e_moe.py`：toy MoE layer + optimizer step 仍未实现，设计与跟进见 §7。
-
-**backward 测试覆盖清单**：
-- `test_backward_inputs_matches_dequant_reference`：dgrad，`trans_weights` × 2D GO × 2D W × `use_dot_scaled∈{None, False}` = 16 个正例。
-- `test_backward_weights_matches_dequant_reference`：wgrad，`trans_weights` × 2D X × `use_dot_scaled∈{None, False}` = 8 个正例。
-- `test_mxfp8_grouped_gemm_autograd`：autograd 端到端，`trans_weights=True` 下 2D X/W 四种组合。
-- `test_mxfp8_grouped_gemm_autograd_trans_weights_false`：`trans_weights=False` 的 1D 路径（shape `(384,256,128,2)`）。
-- `test_backward_wrappers_reject_non_aligned_mtotal`：两个 backward wrapper 在 `M_total` 未对齐 `ALIGN_SIZE_M` 时 fail-fast 的负例。
-- `test_autograd_many_experts_with_empty_expert`：experts(8) > groups(2)，部分 expert 收到零 token，校验空 expert 的 `dW` 严格为 0、被路由 expert 的 `dW` 非零、且全程梯度有限，覆盖 wgrad「扫所有 group 判等」调度的空 expert 分支。
-- 其中 `use_dot_scaled=False` 的参数化在 CI 中强制覆盖了 CDNA3 `_dequantize_fp8 → tl.dot` fallback（任意设备可跑）。
+1. `test_mxfp8_grouped_gemm.py`（**52 tests**）：forward + autograd + dispatch 三段合一（2026-06-16 把原 forward/backward 两文件合并，对齐 mxfp4/nvfp4 的单文件结构）。详见下方覆盖清单。
+2. `repro_mxfp8_dot_scaled.py` + `repro_mxfp8_dot_scaled.md`（见 §5 风险 2）：独立复现脚本，用一个最小 `tl.dot_scaled` kernel 对比 `convert_from_mxfp8(a) @ convert_from_mxfp8(b)`，验证「单次 `dot_scaled` 跨多个 32-wide scale group 会发散」这一约束。四个 case：`DOT_K=32` baseline、`DOT_K=32` over K=128 safe path、`DOT_K=64`（跨 2 group）与 `DOT_K=128`（跨 4 group）problem path，输入用 outlier-heavy 的 32-wide K block 放大 scale 差异，打印 `max_diff`/`mean_diff`/`relative_max_diff` 与 problem-vs-safe 的 mean_diff 比值。需在 CDNA4 环境手动运行（非 pytest 自动收集）。
+3. `test_e2e_moe.py`（**2 tests**）：toy MoE 多步训练收敛 sanity（见 §7），独立文件——它测的是「多步训练不发散」，与上面的单步数值正确性正交，故不并入主文件。
+
+**`test_mxfp8_grouped_gemm.py` 覆盖清单**（2026-06-16 重构后）：
+
+forward section（kernel 级，喂预量化 fp8）：
+- `test_forward`：2 shapes × 2D-block-x × 2D-block-w × `trans_weights` = 16 个正例，dequant-then-matmul reference，cos-sim > 0.999 + SNR > 40 dB，另加 bf16 sanity > 0.99。
+- `test_forward_single_expert_matches_mxfp8_linear`：单 expert 与 mxfp8 linear 交叉校验（SNR > 30 dB）。
+
+autograd section（op 级，走用户入口 `mxfp8_grouped_gemm` 的真实 fwd+bwd）：
+- `test_mxfp8_grouped_gemm_autograd`：O/dX/dW 同测，对标 mxfp4/nvfp4 的 autograd 测试。网格 4 shapes（含大 K=2048、N≠K 非方阵）× `trans_weights` × 2D X × 2D W = **32 个正例**。vs BF16 autograd reference，SNR 门 **O>20 / dX>15 / dW>15 dB**（实测最小 O≈24.9、dX/dW≈19.0，留 4–5 dB 裕度），cossim>0.99 做方向兜底。**这一个 autograd 测试取代了原先分开的 dgrad/wgrad kernel-wrapper 单测**——用户入口内部量化，kernel 的 dgrad/wgrad 通过真实 backprop 被覆盖（与 mxfp4/nvfp4 的做法一致，它们也不单测 dgrad/wgrad）。
+- `test_autograd_many_experts_with_empty_expert`：experts(8) > groups(2)，部分 expert 零 token，校验空 expert `dW` 严格为 0、被路由 expert `dW` 非零、全程梯度有限，覆盖 wgrad「扫所有 group 判等」的空 expert 分支。
+
+dispatch section（offsets 入口 + padded buffer，对应 nvfp4 的 Test 6/7）：
+- `test_mxfp8_grouped_gemm_accepts_padded_buffer`：padded-vs-unpadded 自比（见 §8.1）。
+- `test_mxfp8_dispatch_entry_offsets_matches_indices`：offsets 入口与 indices 路径等价（SNR > 30 dB），验 `create_indices_from_offsets_nosync` round-trip。
+
+2026-06-16 重构的其它精简（功能等价，覆盖不降）：
+- `use_dot_scaled` 不再作为测试参数：CDNA3 上 `None` 与 `False` 解析到同一条 fallback，正交参数化只是把网格翻倍而零新增覆盖。改为底层 `is_cdna4()` 自动选路（对齐 linear/quantization）。
+- 删除 kernel 级入口契约负例（非对齐 `M_total`、错误 scale shape 的 reject 测试）：这些是 kernel 入口断言，mxfp4/nvfp4 在 op 级也不单测，正确性主测试已足够。
+- `make_indices`、`calc_snr`/`calc_cossim` 统一到 `mxfp8/utils.py` 共享（对齐 fp4 的随机 routing 约定 + 单一真源 `alto.kernels.fp4.testing_utils`）。
 
 **验证记录**：
-- 2026-06-05 在 `cranky_shockley` 容器（`wanghanthu/torchtitan:ubuntu22.04-pytorch2.12.0dev20260217-rocm7.2-patch`）中验证 backward，彼时 **17 passed**；之后扩充 `use_dot_scaled` 参数化与空 expert / 对齐负例。
-- 2026-06-09 在 MI300X（CDNA3）上重跑 forward + backward：`python -m pytest tests/unittest/mxfp8/test_mxfp8_grouped_gemm_forward.py tests/unittest/mxfp8/test_mxfp8_grouped_gemm_backward.py -q` 结果为 **52 passed（21 forward + 31 backward）, 14 warnings in 20.74s**。
-- 2026-06-10 在 MI355X（CDNA4 / m355）`gracious_lovelace` 容器（`wanghanthu/torchtitan:ubuntu22.04-pytorch2.12.0dev20260217-rocm7.2-patch`）中重跑 forward + backward：`python -m pytest tests/unittest/mxfp8/test_mxfp8_grouped_gemm_forward.py tests/unittest/mxfp8/test_mxfp8_grouped_gemm_backward.py -q` 结果为 **52 passed（21 forward + 31 backward）, 14 warnings in 18.51s**。环境确认：PyTorch `2.12.0a0+git78d5fb4`，`is_cdna4=True`。结论：m355 默认 `tl.dot_scaled` 路径的 grouped GEMM fwd / dgrad / wgrad 数值单测已通过。
-- 仍需补 toy MoE 训练 sanity；`use_dot_scaled=False` fallback 已在 CI 强制覆盖，CDNA4/m355 默认 `tl.dot_scaled` 路径已通过上述 52 个 grouped GEMM 单测。
+- 2026-06-05 在 `cranky_shockley` 容器（`wanghanthu/torchtitan:ubuntu22.04-pytorch2.12.0dev20260217-rocm7.2-patch`）中验证 backward，彼时 **17 passed**。
+- 2026-06-09 在 MI300X（CDNA3）重跑（重构前）：forward + backward **52 passed（21 forward + 31 backward）**。
+- 2026-06-10 在 MI355X（CDNA4 / m355）`gracious_lovelace` 容器重跑（重构前）：forward + backward **52 passed**。环境：PyTorch `2.12.0a0+git78d5fb4`，`is_cdna4=True`。结论：m355 默认 `tl.dot_scaled` 路径的 fwd / dgrad / wgrad 数值单测通过。
+- 2026-06-16 在 MI300X（CDNA3）上完成测试重构（合并两文件 + 上述精简）并复跑：`python -m pytest tests/unittest/mxfp8/test_mxfp8_grouped_gemm.py tests/unittest/mxfp8/test_e2e_moe.py -q` → **54 passed**（52 grouped GEMM + 2 e2e）。CDNA4 默认 `tl.dot_scaled` 路径的数值正确性已由 2026-06-10 记录覆盖；本次重构 device-agnostic（仅测试组织变化，kernel 零改动）。
 
 ### Step 7 — MI300 fallback 验证 ✅ 已完成
 仅切 `USE_DOT_SCALED=False` 路径重跑 Step 6，确保 CDNA3 上数值与 CDNA4 一致（dequant + fp32 dot 是 ground truth）。
@@ -260,11 +267,11 @@ V1「最小可用 kernel」在**算子数值正确性**这层基本达标（三
 
 4. **`MXFP8GroupedGEMM` autograd**：结构无需改（已用 bufferlen 张量 + ctx 透传 indices）；仅需确认 M 轴量化 `convert_to_mxfp8(..., axis=0)` 在 bufferlen buffer 上成立（要求 `M_bufferlen % 32 == 0`，由新断言保证）。
 
-5. **测试**（`test_mxfp8_grouped_gemm_backward.py`）：新增 `test_mxfp8_grouped_gemm_accepts_padded_buffer`，采 **padded-vs-unpadded 自比**（最强校验，闭环证明 padding 零干扰）。**关键：固定 `use_sr_grad=False`** 使量化确定性，否则随机舍入令 `torch.equal` 偶发失败；routed 行两次跑应逐位相等。断言：`y_pad.shape==(M_bufferlen,N)`、`y_pad[M_routed:]` 全 0、`y_pad[:M_routed]==y_ref`、`inputs_pad.grad[M_routed:]` 全 0、`inputs_pad.grad[:M_routed]==inputs_ref.grad`、`w_pad.grad==w_ref.grad`；另加 offsets 入口 smoke test。
+5. **测试**（当时在 `test_mxfp8_grouped_gemm_backward.py`，2026-06-16 已合并入 `test_mxfp8_grouped_gemm.py`）：新增 `test_mxfp8_grouped_gemm_accepts_padded_buffer`，采 **padded-vs-unpadded 自比**（最强校验，闭环证明 padding 零干扰）。**关键：固定 `use_sr_grad=False`** 使量化确定性，否则随机舍入令 `torch.equal` 偶发失败；routed 行两次跑应逐位相等。断言：`y_pad.shape==(M_bufferlen,N)`、`y_pad[M_routed:]` 全 0、`y_pad[:M_routed]==y_ref`、`inputs_pad.grad[M_routed:]` 全 0、`inputs_pad.grad[:M_routed]==inputs_ref.grad`、`w_pad.grad==w_ref.grad`；另加 offsets 入口 smoke test。
 
 5b. **smoke test**：另加 `test_mxfp8_dispatch_entry_offsets_matches_indices`——同一路由下 offsets 入口与 indices 路径输出 SNR > 30 dB，验证 `create_indices_from_offsets_nosync` round-trip 正确。
 
-**验证记录**（2026-06-10，MI300X / CDNA3）：`python -m pytest tests/unittest/mxfp8/test_mxfp8_grouped_gemm_forward.py tests/unittest/mxfp8/test_mxfp8_grouped_gemm_backward.py -q` → **54 passed**（52 原有不回归 + 2 新增：padded-buffer 自比、offsets smoke）。padded-buffer 测试的 `y_pad.shape==(M_bufferlen,N)` 断言本身即负控——若 wrapper 仍按 `[M_total,N]` 分配则该断言失败，证明确实触发 padding 路径。本改动 device-agnostic（只动 wrapper/入口，Triton kernel 零改动）；CDNA4/m355 路试已于 2026-06-10 通过（见 §6 验证记录）。
+**验证记录**（2026-06-10，MI300X / CDNA3，文件合并前）：`python -m pytest tests/unittest/mxfp8/test_mxfp8_grouped_gemm_forward.py tests/unittest/mxfp8/test_mxfp8_grouped_gemm_backward.py -q` → **54 passed**（52 原有不回归 + 2 新增：padded-buffer 自比、offsets smoke）。padded-buffer 测试的 `y_pad.shape==(M_bufferlen,N)` 断言本身即负控——若 wrapper 仍按 `[M_total,N]` 分配则该断言失败，证明确实触发 padding 路径。本改动 device-agnostic（只动 wrapper/入口，Triton kernel 零改动）；CDNA4/m355 路试已于 2026-06-10 通过（见 §6 验证记录）。
 
 **后续（已于同日完成）**：dispatch `__torch_function__` subclass wiring（`alto/kernels/dispatch/tensor.py`，即 §8「接入形态对齐」）当时列为后续 PR，已于 2026-06-10 一并实施，见 §8.2。
 
@@ -279,7 +286,7 @@ V1「最小可用 kernel」在**算子数值正确性**这层基本达标（三
 **验证记录**（2026-06-10，MI300X / CDNA3）：
 - 端到端 dispatch smoke：包一个 `MXFP8TrainingWeightWrapperTensor` 权重 `[E,K,N]`，`torch._grouped_mm(A, B, offs=[128,256,384,512])` → forward `(512,256)` finite、`.sum().backward()` 后 `A.grad` finite。
 - 边界：`mxfp8_e5m2` 与 `use_hadamard=True` 均被正确 `AssertionError` 拒绝。
-- 回归：`test_mxfp8_grouped_gemm_forward.py` + `_backward.py` **54 passed** 无回归。
+- 回归：`test_mxfp8_grouped_gemm_forward.py` + `_backward.py` **54 passed** 无回归（两文件 2026-06-16 已合并为 `test_mxfp8_grouped_gemm.py`，见 §6）。
 
 **接 GPT-OSS：无需再写代码，只改 recipe 配置。** `alto/models/gpt_oss/configs/lpt_recipe.yaml`（当前为 mxfp4）改成最小 e4m3 V1 需动 3 行（其余保持）：
 
diff --git a/alto/kernels/mxfp8/mxfp8_grouped_gemm/tests/__init__.py b/alto/kernels/mxfp8/mxfp8_grouped_gemm/tests/__init__.py
deleted file mode 100644
index e69de29..0000000
diff --git a/tests/unittest/mxfp8/test_e2e_moe.py b/tests/unittest/mxfp8/test_e2e_moe.py
index b3e102e..9e94c08 100644
--- a/tests/unittest/mxfp8/test_e2e_moe.py
+++ b/tests/unittest/mxfp8/test_e2e_moe.py
@@ -16,15 +16,7 @@
 from alto.kernels.mxfp8.mxfp8_grouped_gemm import mxfp8_grouped_gemm
 from alto.kernels.mxfp8.mxfp8_grouped_gemm.autotune import ALIGN_SIZE_M
 
-from .utils import prepare_data
-
-
-def _make_indices(num_groups, num_experts, device):
-    """Contiguous routing: group g -> expert (g % num_experts), aligned to ALIGN_SIZE_M."""
-    indices = torch.empty(num_groups * ALIGN_SIZE_M, dtype=torch.int32, device=device)
-    for g in range(num_groups):
-        indices[g * ALIGN_SIZE_M:(g + 1) * ALIGN_SIZE_M] = g % num_experts
-    return indices
+from .utils import prepare_data, make_indices
 
 
 def _bf16_grouped_matmul(inputs, expert_weights, indices):
@@ -68,7 +60,7 @@ def test_toy_moe_trains_and_tracks_bf16(num_experts):
     inputs = prepare_data((m_total, k_dim), dtype)
     target = prepare_data((m_total, n_dim), dtype)
     w_init = prepare_data((num_experts, n_dim, k_dim), dtype) * 0.1
-    indices = _make_indices(num_groups, num_experts, device)
+    indices = make_indices(num_groups, num_experts, device)
 
     mxfp8_losses = _train(
         lambda x, w, idx: mxfp8_grouped_gemm(x, w, idx, trans_weights=True),
diff --git a/tests/unittest/mxfp8/test_mxfp8_grouped_gemm.py b/tests/unittest/mxfp8/test_mxfp8_grouped_gemm.py
new file mode 100644
index 0000000..2691185
--- /dev/null
+++ b/tests/unittest/mxfp8/test_mxfp8_grouped_gemm.py
@@ -0,0 +1,305 @@
+# Copyright (c) 2026 Advanced Micro Devices, Inc.
+#
+# SPDX-License-Identifier: MIT
+"""Op-level precision tests for MXFP8 contiguous grouped GEMM.
+
+Forward (kernel-level correctness), full autograd (O/dX/dW), and the offsets
+dispatch entry all live here, mirroring the single-file mxfp4/nvfp4 grouped-GEMM
+test layout.
+"""
+
+import pytest
+from tabulate import tabulate
+import torch
+
+from alto.kernels.mxfp8.mxfp8_quantization import BLOCK_SIZE_DEFAULT
+from alto.kernels.mxfp8.mxfp8_grouped_gemm import (
+    mxfp8_grouped_gemm,
+    _quantize_then_mxfp8_scaled_grouped_mm,
+)
+from alto.kernels.mxfp8.mxfp8_grouped_gemm.autotune import ALIGN_SIZE_M
+from alto.kernels.mxfp8.mxfp8_grouped_gemm.cg_forward import mxfp8_grouped_gemm_forward
+
+from .utils import prepare_data, convert_from_mxfp8_pytorch, calc_snr, calc_cossim, make_indices
+
+
+def _reference(inputs, expert_weights, indices, trans_weights):
+    """BF16/fp32 grouped matmul reference: Y[g] = X[g] @ W[expert(g)]^T per group."""
+    m_total = inputs.shape[0]
+    n_dim = expert_weights.shape[1] if trans_weights else expert_weights.shape[2]
+    out = torch.zeros((m_total, n_dim), dtype=inputs.dtype, device=inputs.device)
+    for start in range(0, m_total, ALIGN_SIZE_M):
+        expert_idx = indices[start].item()
+        weight = expert_weights[expert_idx].t() if trans_weights else expert_weights[expert_idx]
+        out[start:start + ALIGN_SIZE_M] = inputs[start:start + ALIGN_SIZE_M] @ weight
+    return out
+
+
+# ---------------------------------------------------------------------------
+# Forward – kernel-level correctness on pre-quantized operands
+# ---------------------------------------------------------------------------
+
+@pytest.mark.parametrize("shape", [(256, 128, 128, 1), (512, 256, 256, 4)])
+@pytest.mark.parametrize("use_2dblock_x", [False, True])
+@pytest.mark.parametrize("use_2dblock_w", [False, True])
+@pytest.mark.parametrize("trans_weights", [True, False])
+def test_forward(shape, use_2dblock_x, use_2dblock_w, trans_weights):
+    M_total, N, K, num_experts = shape
+    M_total = (M_total // ALIGN_SIZE_M) * ALIGN_SIZE_M
+    num_groups = M_total // ALIGN_SIZE_M
+    device = torch.device("cuda")
+    data_type = torch.bfloat16
+
+    inputs = prepare_data((M_total, K), data_type)
+    weight_shape = (num_experts, N, K) if trans_weights else (num_experts, K, N)
+    weight_axis = -1 if trans_weights else -2
+    expert_weights = prepare_data(weight_shape, data_type)
+    indices = make_indices(num_groups, num_experts, device)
+
+    x_lp, x_s = torch.ops.alto.convert_to_mxfp8(
+        inputs, block_size=BLOCK_SIZE_DEFAULT, mxfp_format="e4m3", axis=-1, is_2d_block=use_2dblock_x)
+    w_lp, w_s = torch.ops.alto.convert_to_mxfp8(
+        expert_weights, block_size=BLOCK_SIZE_DEFAULT, mxfp_format="e4m3", axis=weight_axis, is_2d_block=use_2dblock_w)
+
+    out = mxfp8_grouped_gemm_forward(
+        x_lp, w_lp, indices, x_s, w_s,
+        trans_weights=trans_weights,
+        use_2dblock_x=use_2dblock_x,
+        use_2dblock_w=use_2dblock_w,
+        lhs_format_id=0,
+        rhs_format_id=0,
+        output_dtype=data_type,
+    )
+
+    tag = f"shape={shape}, 2dx={use_2dblock_x}, 2dw={use_2dblock_w}"
+
+    # Primary correctness gate: dequantize the exact fp8 operands the kernel
+    # consumed, matmul in PyTorch. This isolates kernel-port correctness from
+    # mxfp8-vs-bf16 quantization error.
+    x_dq = convert_from_mxfp8_pytorch(x_lp, x_s, torch.float32, BLOCK_SIZE_DEFAULT, -1, use_2dblock_x)
+    w_dq = convert_from_mxfp8_pytorch(w_lp, w_s, torch.float32, BLOCK_SIZE_DEFAULT, weight_axis, use_2dblock_w)
+    ref_dq = _reference(x_dq, w_dq, indices, trans_weights)
+    cos_dq = calc_cossim(out, ref_dq)
+    assert cos_dq > 0.999, f"kernel vs dequant-matmul cos-sim too low: {cos_dq} ({tag})"
+    # SNR catches magnitude errors (missed scale, wrong accumulator) that cossim is blind to.
+    snr_dq = calc_snr(ref_dq, out.float())
+    assert snr_dq > 40, f"kernel vs dequant-matmul SNR too low: {snr_dq:.1f}dB ({tag})"
+
+    # Looser sanity check against the full-precision bf16 path.
+    ref_bf16 = _reference(inputs, expert_weights, indices, trans_weights)
+    cos_bf16 = calc_cossim(out, ref_bf16)
+    assert cos_bf16 > 0.99, f"kernel vs bf16 cos-sim too low: {cos_bf16} ({tag})"
+
+
+def test_forward_single_expert_matches_mxfp8_linear():
+    """All tokens to one expert => grouped GEMM must match mxfp8 linear (x @ w^T).
+
+    Independent cross-check: the linear path quantizes via its own autograd
+    Function, so this catches bugs the dequant-matmul reference can't — that
+    reference shares this test's convert_to_mxfp8 output, so a quant bug would
+    corrupt both sides equally and still pass.
+    """
+    from alto.kernels.mxfp8.mxfp8_linear import _to_mxfp8_then_scaled_mm
+
+    M, N, K = ALIGN_SIZE_M, 256, 256
+    data_type = torch.bfloat16
+    x = prepare_data((M, K), data_type)
+    w = prepare_data((N, K), data_type)
+    indices = torch.zeros(M, dtype=torch.int32, device="cuda")
+
+    y_linear = _to_mxfp8_then_scaled_mm(x, w, use_2dblock_x=False, use_2dblock_w=False)
+
+    x_lp, x_s = torch.ops.alto.convert_to_mxfp8(
+        x, block_size=BLOCK_SIZE_DEFAULT, mxfp_format="e4m3", axis=-1, is_2d_block=False)
+    w_lp, w_s = torch.ops.alto.convert_to_mxfp8(
+        w.unsqueeze(0), block_size=BLOCK_SIZE_DEFAULT, mxfp_format="e4m3", axis=-1, is_2d_block=False)
+    y_gg = mxfp8_grouped_gemm_forward(
+        x_lp, w_lp, indices, x_s, w_s, trans_weights=True,
+        use_2dblock_x=False, use_2dblock_w=False, output_dtype=data_type)
+
+    snr = calc_snr(y_linear.float(), y_gg.float())
+    assert snr > 30, f"single-expert grouped GEMM vs mxfp8 linear SNR too low: {snr:.1f}dB"
+
+
+# ---------------------------------------------------------------------------
+# Autograd – forward+backward (O, dX, dW) vs a BF16 autograd reference
+# ---------------------------------------------------------------------------
+
+# Mirrors the mxfp4/nvfp4 grouped-GEMM autograd tests. The user-facing path
+# quantizes internally, so the kernel's dgrad and wgrad are exercised here
+# through real backprop (no separate dgrad/wgrad kernel-wrapper tests needed).
+#
+# Floors are set from the measured min/max across this full 32-case grid (vs the
+# BF16 autograd reference, so the spread is e4m3 quantization error, not a bug):
+#     O  SNR 24.9 .. 28.8 dB   cossim 0.999 .. 0.999
+#     dX SNR 19.0 .. 26.5 dB   cossim 0.996 .. 0.999
+#     dW SNR 19.0 .. 27.0 dB   cossim 0.996 .. 0.999
+# SNR is the real precision gate (far stricter than fp4's 10/9 dB bounds, with
+# ~4-5 dB headroom below the observed minima). cossim is only a coarse
+# direction-sanity backstop, kept loose at 0.99 against the 0.996 worst case.
+@pytest.mark.parametrize("shape", [
+    (384, 128, 128, 2),
+    (512, 256, 256, 4),
+    (512, 512, 2048, 4),   # large-K stress
+    (384, 256, 128, 2),    # N != K, exercises non-square weight layout / strides
+])
+@pytest.mark.parametrize("trans_weights", [True, False])
+@pytest.mark.parametrize("use_2dblock_x", [False, True])
+@pytest.mark.parametrize("use_2dblock_w", [False, True])
+def test_mxfp8_grouped_gemm_autograd(shape, trans_weights, use_2dblock_x, use_2dblock_w):
+    m_total, n_dim, k_dim, num_experts = shape
+    device = torch.device("cuda")
+    dtype = torch.bfloat16
+
+    inputs = prepare_data((m_total, k_dim), dtype).requires_grad_(True)
+    weight_shape = (num_experts, n_dim, k_dim) if trans_weights else (num_experts, k_dim, n_dim)
+    expert_weights = prepare_data(weight_shape, dtype).requires_grad_(True)
+    expert_indices = make_indices(m_total // ALIGN_SIZE_M, num_experts, device)
+    target = prepare_data((m_total, n_dim), dtype)
+
+    outputs_ref = _reference(inputs, expert_weights, expert_indices, trans_weights)
+    torch.nn.functional.mse_loss(outputs_ref, target).backward()
+    grad_inputs_ref = inputs.grad.detach().clone()
+    grad_weights_ref = expert_weights.grad.detach().clone()
+    inputs.grad.zero_()
+    expert_weights.grad.zero_()
+
+    outputs = mxfp8_grouped_gemm(
+        inputs,
+        expert_weights,
+        expert_indices,
+        trans_weights=trans_weights,
+        use_2dblock_x=use_2dblock_x,
+        use_2dblock_w=use_2dblock_w,
+        use_sr_grad=False,
+    )
+    torch.nn.functional.mse_loss(outputs, target).backward()
+
+    o_snr = calc_snr(outputs_ref, outputs.detach())
+    dx_snr = calc_snr(grad_inputs_ref, inputs.grad)
+    dw_snr = calc_snr(grad_weights_ref, expert_weights.grad)
+    o_sim = calc_cossim(outputs_ref, outputs.detach())
+    dx_sim = calc_cossim(grad_inputs_ref, inputs.grad)
+    dw_sim = calc_cossim(grad_weights_ref, expert_weights.grad)
+
+    print()
+    print(tabulate(
+        [["O", f"{o_snr:.2f}", f"{o_sim:.6f}"],
+         ["dX", f"{dx_snr:.2f}", f"{dx_sim:.6f}"],
+         ["dW", f"{dw_snr:.2f}", f"{dw_sim:.6f}"]],
+        headers=["Tensor", "SNR(dB)", "CosSim"], tablefmt="github"))
+
+    tag = f"shape={shape} tr={trans_weights} x2={use_2dblock_x} w2={use_2dblock_w}"
+    assert o_snr > 20, f"O SNR too low: {o_snr:.2f} ({tag})"
+    assert dx_snr > 15, f"dX SNR too low: {dx_snr:.2f} ({tag})"
+    assert dw_snr > 15, f"dW SNR too low: {dw_snr:.2f} ({tag})"
+    assert o_sim > 0.99 and dx_sim > 0.99 and dw_sim > 0.99, \
+        f"cossim too low: O={o_sim:.4f} dX={dx_sim:.4f} dW={dw_sim:.4f} ({tag})"
+
+
+def test_autograd_many_experts_with_empty_expert():
+    """experts > groups => some experts receive zero tokens (dW row must be 0),
+    and the wgrad scan-all-groups path is exercised with finite gradients."""
+    device = torch.device("cuda")
+    dtype = torch.bfloat16
+    m_total, n_dim, k_dim, num_experts = ALIGN_SIZE_M * 2, 128, 128, 8  # 2 groups, 8 experts
+
+    inputs = prepare_data((m_total, k_dim), dtype).requires_grad_(True)
+    expert_weights = prepare_data((num_experts, n_dim, k_dim), dtype).requires_grad_(True)
+    # Route both groups to experts 0 and 1; experts 2..7 stay empty.
+    expert_indices = torch.zeros(m_total, dtype=torch.int32, device=device)
+    expert_indices[ALIGN_SIZE_M:] = 1
+    target = prepare_data((m_total, n_dim), dtype)
+
+    outputs = mxfp8_grouped_gemm(inputs, expert_weights, expert_indices, trans_weights=True)
+    torch.nn.functional.mse_loss(outputs, target).backward()
+
+    assert torch.isfinite(outputs).all()
+    assert torch.isfinite(inputs.grad).all()
+    assert torch.isfinite(expert_weights.grad).all()
+    # Empty experts must get exactly zero weight gradient.
+    assert torch.count_nonzero(expert_weights.grad[2:]) == 0, "unused experts must have zero dW"
+    assert torch.count_nonzero(expert_weights.grad[:2]) > 0, "routed experts must have nonzero dW"
+
+
+# ---------------------------------------------------------------------------
+# Offsets dispatch entry + padded buffer (PLAN.md §8.1)
+# ---------------------------------------------------------------------------
+
+def test_mxfp8_grouped_gemm_accepts_padded_buffer():
+    """Padded activation buffer (M_bufferlen > routed M_total) must not disturb
+    routed rows, and padding rows must stay zero in both output and gradients.
+
+    Padded-vs-unpadded self-comparison through the offsets dispatch entry (which
+    uses trans_weights=False, also boosting that path's coverage). use_sr_grad is
+    forced False so quantization is deterministic and torch.equal holds bitwise.
+    """
+    device = torch.device("cuda")
+    dtype = torch.bfloat16
+    M_routed, M_pad = 4 * ALIGN_SIZE_M, 2 * ALIGN_SIZE_M  # 512 routed, 256 padding
+    M_bufferlen = M_routed + M_pad
+    num_experts, K, N = 4, 256, 256
+
+    routed_rows = prepare_data((M_routed, K), dtype)
+    weights = prepare_data((num_experts, K, N), dtype)  # dispatch convention [E, K, N]
+    # Each expert owns one ALIGN_SIZE_M group of routed tokens -> cumulative offs.
+    offs = torch.tensor(
+        [(i + 1) * (M_routed // num_experts) for i in range(num_experts)],
+        dtype=torch.int32, device=device,
+    )
+
+    inputs_pad = torch.zeros(M_bufferlen, K, dtype=dtype, device=device)
+    inputs_pad[:M_routed] = routed_rows
+    inputs_pad.requires_grad_(True)
+    weights_pad = weights.clone().requires_grad_(True)
+    y_pad = _quantize_then_mxfp8_scaled_grouped_mm(
+        inputs_pad, weights_pad, offs,
+        use_2dblock_x=False, use_2dblock_w=False, use_sr_grad=False)
+
+    inputs_ref = routed_rows.clone().requires_grad_(True)
+    weights_ref = weights.clone().requires_grad_(True)
+    y_ref = _quantize_then_mxfp8_scaled_grouped_mm(
+        inputs_ref, weights_ref, offs,
+        use_2dblock_x=False, use_2dblock_w=False, use_sr_grad=False)
+
+    assert y_pad.shape == (M_bufferlen, N)
+    assert torch.equal(y_pad[M_routed:], torch.zeros_like(y_pad[M_routed:])), \
+        "padding output rows must be zero"
+    assert torch.equal(y_pad[:M_routed], y_ref), \
+        "routed output rows must be unaffected by buffer padding"
+
+    y_pad.sum().backward()
+    y_ref.sum().backward()
+    assert inputs_pad.grad.shape == (M_bufferlen, K)
+    assert torch.equal(inputs_pad.grad[M_routed:], torch.zeros_like(inputs_pad.grad[M_routed:])), \
+        "padding rows must receive zero input gradient"
+    assert torch.equal(inputs_pad.grad[:M_routed], inputs_ref.grad), \
+        "routed-row input gradients must be unaffected by buffer padding"
+    assert torch.equal(weights_pad.grad, weights_ref.grad), \
+        "weight gradients must be unaffected by buffer padding"
+
+
+def test_mxfp8_dispatch_entry_offsets_matches_indices():
+    """The offsets entry must equal the indices path: create_indices_from_offsets
+    round-trip is correct. No padding here (M_bufferlen == M_total)."""
+    device = torch.device("cuda")
+    dtype = torch.bfloat16
+    num_groups, num_experts, K, N = 4, 4, 256, 256
+    M_total = num_groups * ALIGN_SIZE_M
+
+    inputs = prepare_data((M_total, K), dtype)
+    weights = prepare_data((num_experts, K, N), dtype)  # [E, K, N] dispatch convention
+    # Sorted one-group-per-expert routing (group g -> expert g) so it matches the
+    # cumulative offs below; the random make_indices helper would not line up.
+    indices = torch.zeros(M_total, dtype=torch.int32, device=device)
+    for g in range(num_groups):
+        indices[g * ALIGN_SIZE_M:(g + 1) * ALIGN_SIZE_M] = g
+    offs = torch.arange(1, num_groups + 1, dtype=torch.int32, device=device) * ALIGN_SIZE_M
+
+    y_indices = mxfp8_grouped_gemm(
+        inputs, weights, indices, trans_weights=False, use_sr_grad=False)
+    y_offsets = _quantize_then_mxfp8_scaled_grouped_mm(
+        inputs, weights, offs, use_sr_grad=False)
+
+    snr = calc_snr(y_indices.float(), y_offsets.float())
+    assert snr > 30, f"offsets entry vs indices path SNR too low: {snr:.1f}dB"
diff --git a/tests/unittest/mxfp8/test_mxfp8_grouped_gemm_backward.py b/tests/unittest/mxfp8/test_mxfp8_grouped_gemm_backward.py
deleted file mode 100644
index 7018aa6..0000000
--- a/tests/unittest/mxfp8/test_mxfp8_grouped_gemm_backward.py
+++ /dev/null
@@ -1,363 +0,0 @@
-# Copyright (c) 2026 Advanced Micro Devices, Inc.
-#
-# SPDX-License-Identifier: MIT
-
-import pytest
-import torch
-
-from alto.kernels.mxfp8.mxfp8_grouped_gemm import (
-    mxfp8_grouped_gemm,
-    _quantize_then_mxfp8_scaled_grouped_mm,
-)
-from alto.kernels.mxfp8.mxfp8_grouped_gemm.autotune import ALIGN_SIZE_M
-from alto.kernels.mxfp8.mxfp8_grouped_gemm.cg_backward import (
-    mxfp8_grouped_gemm_backward_inputs,
-    mxfp8_grouped_gemm_backward_weights,
-)
-from alto.kernels.mxfp8.mxfp8_quantization import BLOCK_SIZE_DEFAULT
-from alto.kernels.fp4.testing_utils import calc_snr
-
-from .utils import prepare_data, convert_from_mxfp8_pytorch
-
-
-def _cossim(x, y):
-    return torch.nn.functional.cosine_similarity(x.flatten().float(), y.flatten().float(), dim=0).item()
-
-
-def _make_indices(num_groups, num_experts, device):
-    indices = torch.empty(num_groups * ALIGN_SIZE_M, dtype=torch.int32, device=device)
-    for g in range(num_groups):
-        indices[g * ALIGN_SIZE_M:(g + 1) * ALIGN_SIZE_M] = g % num_experts
-    return indices
-
-
-def _reference(inputs, expert_weights, indices, trans_weights):
-    m_total = inputs.shape[0]
-    n_dim = expert_weights.shape[1] if trans_weights else expert_weights.shape[2]
-    out = torch.zeros((m_total, n_dim), dtype=inputs.dtype, device=inputs.device)
-    for start in range(0, m_total, ALIGN_SIZE_M):
-        expert_idx = indices[start].item()
-        weight = expert_weights[expert_idx].t() if trans_weights else expert_weights[expert_idx]
-        out[start:start + ALIGN_SIZE_M] = inputs[start:start + ALIGN_SIZE_M] @ weight
-    return out
-
-
-def _reference_dgrad(grad_output, expert_weights, indices, trans_weights):
-    m_total = grad_output.shape[0]
-    k_dim = expert_weights.shape[2] if trans_weights else expert_weights.shape[1]
-    grad_inputs = torch.zeros((m_total, k_dim), dtype=grad_output.dtype, device=grad_output.device)
-    for start in range(0, m_total, ALIGN_SIZE_M):
-        end = start + ALIGN_SIZE_M
-        expert_idx = indices[start].item()
-        weight = expert_weights[expert_idx] if trans_weights else expert_weights[expert_idx].t()
-        grad_inputs[start:end] = grad_output[start:end] @ weight
-    return grad_inputs
-
-
-def _reference_wgrad(grad_output, inputs, indices, num_experts, trans_weights):
-    n_dim = grad_output.shape[1]
-    k_dim = inputs.shape[1]
-    grad_weights = torch.zeros(
-        (num_experts, n_dim, k_dim) if trans_weights else (num_experts, k_dim, n_dim),
-        dtype=grad_output.dtype,
-        device=grad_output.device,
-    )
-    for start in range(0, inputs.shape[0], ALIGN_SIZE_M):
-        end = start + ALIGN_SIZE_M
-        expert_idx = indices[start].item()
-        if trans_weights:
-            grad_weights[expert_idx] += grad_output[start:end].t() @ inputs[start:end]
-        else:
-            grad_weights[expert_idx] += inputs[start:end].t() @ grad_output[start:end]
-    return grad_weights
-
-
-@pytest.mark.parametrize("trans_weights", [True, False])
-@pytest.mark.parametrize("use_2dblock_go", [False, True])
-@pytest.mark.parametrize("use_2dblock_w", [False, True])
-@pytest.mark.parametrize("use_dot_scaled", [None, False])
-def test_backward_inputs_matches_dequant_reference(trans_weights, use_2dblock_go, use_2dblock_w, use_dot_scaled):
-    m_total, n_dim, k_dim, num_experts = 384, 128, 128, 2
-    dtype = torch.bfloat16
-
-    grad_output = prepare_data((m_total, n_dim), dtype)
-    weight_shape = (num_experts, n_dim, k_dim) if trans_weights else (num_experts, k_dim, n_dim)
-    expert_weights = prepare_data(weight_shape, dtype)
-    expert_indices = _make_indices(m_total // ALIGN_SIZE_M, num_experts, torch.device("cuda"))
-
-    weight_axis = (-1 if trans_weights else -2) if use_2dblock_w else (-2 if trans_weights else -1)
-    grad_output_lp, grad_output_scales = torch.ops.alto.convert_to_mxfp8(
-        grad_output,
-        block_size=BLOCK_SIZE_DEFAULT,
-        mxfp_format="e4m3",
-        axis=-1,
-        is_2d_block=use_2dblock_go,
-    )
-    expert_weights_lp, expert_weight_scales = torch.ops.alto.convert_to_mxfp8(
-        expert_weights,
-        block_size=BLOCK_SIZE_DEFAULT,
-        mxfp_format="e4m3",
-        axis=weight_axis,
-        is_2d_block=use_2dblock_w,
-    )
-
-    grad_inputs = mxfp8_grouped_gemm_backward_inputs(
-        grad_output_lp,
-        expert_weights_lp,
-        expert_indices,
-        grad_output_scales,
-        expert_weight_scales,
-        trans_weights=trans_weights,
-        use_2dblock_x=use_2dblock_go,
-        use_2dblock_w=use_2dblock_w,
-        output_dtype=dtype,
-        use_dot_scaled=use_dot_scaled,
-    )
-
-    grad_output_dq = convert_from_mxfp8_pytorch(
-        grad_output_lp, grad_output_scales, torch.float32, BLOCK_SIZE_DEFAULT, -1, use_2dblock_go)
-    expert_weights_dq = convert_from_mxfp8_pytorch(
-        expert_weights_lp, expert_weight_scales, torch.float32, BLOCK_SIZE_DEFAULT, weight_axis, use_2dblock_w)
-    grad_inputs_ref = _reference_dgrad(grad_output_dq, expert_weights_dq, expert_indices, trans_weights)
-
-    cos = _cossim(grad_inputs, grad_inputs_ref)
-    assert cos > 0.999, \
-        f"dgrad kernel vs dequant-matmul cos-sim too low: {cos} (2d_go={use_2dblock_go}, 2d_w={use_2dblock_w})"
-    snr = calc_snr(grad_inputs_ref, grad_inputs.float())
-    assert snr > 40, \
-        f"dgrad kernel vs dequant-matmul SNR too low: {snr:.1f}dB (2d_go={use_2dblock_go}, 2d_w={use_2dblock_w})"
-
-
-@pytest.mark.parametrize("trans_weights", [True, False])
-@pytest.mark.parametrize("use_2dblock_x", [False, True])
-@pytest.mark.parametrize("use_dot_scaled", [None, False])
-def test_backward_weights_matches_dequant_reference(trans_weights, use_2dblock_x, use_dot_scaled):
-    m_total, n_dim, k_dim, num_experts = 384, 128, 128, 2
-    dtype = torch.bfloat16
-
-    inputs = prepare_data((m_total, k_dim), dtype)
-    grad_output = prepare_data((m_total, n_dim), dtype)
-    expert_indices = _make_indices(m_total // ALIGN_SIZE_M, num_experts, torch.device("cuda"))
-
-    grad_axis = -1 if use_2dblock_x else 0
-    input_axis = -1 if use_2dblock_x else 0
-    grad_output_lp, grad_output_scales = torch.ops.alto.convert_to_mxfp8(
-        grad_output,
-        block_size=BLOCK_SIZE_DEFAULT,
-        mxfp_format="e4m3",
-        axis=grad_axis,
-        is_2d_block=use_2dblock_x,
-    )
-    inputs_lp, input_scales = torch.ops.alto.convert_to_mxfp8(
-        inputs,
-        block_size=BLOCK_SIZE_DEFAULT,
-        mxfp_format="e4m3",
-        axis=input_axis,
-        is_2d_block=use_2dblock_x,
-    )
-
-    grad_weights = mxfp8_grouped_gemm_backward_weights(
-        grad_output_lp,
-        inputs_lp,
-        expert_indices,
-        num_experts,
-        grad_output_scales,
-        input_scales,
-        trans_weights=trans_weights,
-        use_2dblock_go=use_2dblock_x,
-        use_2dblock_x=use_2dblock_x,
-        output_dtype=dtype,
-        use_dot_scaled=use_dot_scaled,
-    )
-
-    grad_output_dq = convert_from_mxfp8_pytorch(
-        grad_output_lp, grad_output_scales, torch.float32, BLOCK_SIZE_DEFAULT, grad_axis, use_2dblock_x)
-    inputs_dq = convert_from_mxfp8_pytorch(
-        inputs_lp, input_scales, torch.float32, BLOCK_SIZE_DEFAULT, input_axis, use_2dblock_x)
-    grad_weights_ref = _reference_wgrad(grad_output_dq, inputs_dq, expert_indices, num_experts, trans_weights)
-
-    cos = _cossim(grad_weights, grad_weights_ref)
-    assert cos > 0.999, f"wgrad kernel vs dequant-matmul cos-sim too low: {cos} (2d_x={use_2dblock_x})"
-    snr = calc_snr(grad_weights_ref, grad_weights.float())
-    assert snr > 40, f"wgrad kernel vs dequant-matmul SNR too low: {snr:.1f}dB (2d_x={use_2dblock_x})"
-
-
-def _run_autograd_case(trans_weights, use_2dblock_x, use_2dblock_w, shape=(384, 128, 128, 2)):
-    m_total, n_dim, k_dim, num_experts = shape
-    device = torch.device("cuda")
-    dtype = torch.bfloat16
-
-    inputs = prepare_data((m_total, k_dim), dtype).requires_grad_(True)
-    weight_shape = (num_experts, n_dim, k_dim) if trans_weights else (num_experts, k_dim, n_dim)
-    expert_weights = prepare_data(weight_shape, dtype).requires_grad_(True)
-    expert_indices = _make_indices(m_total // ALIGN_SIZE_M, num_experts, device)
-    target = prepare_data((m_total, n_dim), dtype)
-
-    outputs_ref = _reference(inputs, expert_weights, expert_indices, trans_weights)
-    torch.nn.functional.mse_loss(outputs_ref, target).backward()
-    grad_inputs_ref = inputs.grad.detach().clone()
-    grad_weights_ref = expert_weights.grad.detach().clone()
-    inputs.grad.zero_()
-    expert_weights.grad.zero_()
-
-    outputs = mxfp8_grouped_gemm(
-        inputs,
-        expert_weights,
-        expert_indices,
-        trans_weights=trans_weights,
-        use_2dblock_x=use_2dblock_x,
-        use_2dblock_w=use_2dblock_w,
-        use_sr_grad=False,
-    )
-    torch.nn.functional.mse_loss(outputs, target).backward()
-
-    assert _cossim(outputs, outputs_ref) > 0.99
-    assert _cossim(inputs.grad, grad_inputs_ref) > 0.99
-    assert _cossim(expert_weights.grad, grad_weights_ref) > 0.99
-
-
-@pytest.mark.parametrize("use_2dblock_x", [False, True])
-@pytest.mark.parametrize("use_2dblock_w", [False, True])
-def test_mxfp8_grouped_gemm_autograd(use_2dblock_x, use_2dblock_w):
-    _run_autograd_case(
-        trans_weights=True,
-        use_2dblock_x=use_2dblock_x,
-        use_2dblock_w=use_2dblock_w,
-    )
-
-
-def test_mxfp8_grouped_gemm_autograd_trans_weights_false():
-    _run_autograd_case(
-        trans_weights=False,
-        use_2dblock_x=False,
-        use_2dblock_w=False,
-        shape=(384, 256, 128, 2),
-    )
-
-
-def test_backward_wrappers_reject_non_aligned_mtotal():
-    """Both backward wrappers must fail fast on M_total not aligned to ALIGN_SIZE_M."""
-    m_total, n_dim, k_dim, num_experts = 64, 128, 128, 1  # 64 not a multiple of 128
-    dtype = torch.bfloat16
-    grad_output = prepare_data((m_total, n_dim), dtype)
-    inputs = prepare_data((m_total, k_dim), dtype)
-    expert_weights = prepare_data((num_experts, n_dim, k_dim), dtype)
-    indices = torch.zeros(m_total, dtype=torch.int32, device="cuda")
-
-    go_lp, go_s = torch.ops.alto.convert_to_mxfp8(
-        grad_output, block_size=BLOCK_SIZE_DEFAULT, mxfp_format="e4m3", axis=-1, is_2d_block=False)
-    w_lp, w_s = torch.ops.alto.convert_to_mxfp8(
-        expert_weights, block_size=BLOCK_SIZE_DEFAULT, mxfp_format="e4m3", axis=-2, is_2d_block=False)
-    go_m_lp, go_m_s = torch.ops.alto.convert_to_mxfp8(
-        grad_output, block_size=BLOCK_SIZE_DEFAULT, mxfp_format="e4m3", axis=0, is_2d_block=False)
-    x_m_lp, x_m_s = torch.ops.alto.convert_to_mxfp8(
-        inputs, block_size=BLOCK_SIZE_DEFAULT, mxfp_format="e4m3", axis=0, is_2d_block=False)
-
-    with pytest.raises(AssertionError, match="multiple of group_size_m"):
-        mxfp8_grouped_gemm_backward_inputs(go_lp, w_lp, indices, go_s, w_s)
-    with pytest.raises(AssertionError, match="multiple of group_size_m"):
-        mxfp8_grouped_gemm_backward_weights(go_m_lp, x_m_lp, indices, num_experts, go_m_s, x_m_s)
-
-
-def test_autograd_many_experts_with_empty_expert():
-    """experts > groups => some experts receive zero tokens (dW row must be 0),
-    and the wgrad scan-all-groups path is exercised with finite gradients."""
-    device = torch.device("cuda")
-    dtype = torch.bfloat16
-    m_total, n_dim, k_dim, num_experts = ALIGN_SIZE_M * 2, 128, 128, 8  # 2 groups, 8 experts
-
-    inputs = prepare_data((m_total, k_dim), dtype).requires_grad_(True)
-    expert_weights = prepare_data((num_experts, n_dim, k_dim), dtype).requires_grad_(True)
-    # Route both groups to experts 0 and 1; experts 2..7 stay empty.
-    expert_indices = torch.zeros(m_total, dtype=torch.int32, device=device)
-    expert_indices[ALIGN_SIZE_M:] = 1
-    target = prepare_data((m_total, n_dim), dtype)
-
-    outputs = mxfp8_grouped_gemm(inputs, expert_weights, expert_indices, trans_weights=True)
-    torch.nn.functional.mse_loss(outputs, target).backward()
-
-    assert torch.isfinite(outputs).all()
-    assert torch.isfinite(inputs.grad).all()
-    assert torch.isfinite(expert_weights.grad).all()
-    # Empty experts must get exactly zero weight gradient.
-    assert torch.count_nonzero(expert_weights.grad[2:]) == 0, "unused experts must have zero dW"
-    assert torch.count_nonzero(expert_weights.grad[:2]) > 0, "routed experts must have nonzero dW"
-
-
-# =============== offsets dispatch entry + padded buffer (PLAN.md §8.1) ===============
-
-def test_mxfp8_grouped_gemm_accepts_padded_buffer():
-    """Padded activation buffer (M_bufferlen > routed M_total) must not disturb
-    routed rows, and padding rows must stay zero in both output and gradients.
-
-    Padded-vs-unpadded self-comparison through the offsets dispatch entry (which
-    uses trans_weights=False, also boosting that path's coverage). use_sr_grad is
-    forced False so quantization is deterministic and torch.equal holds bitwise.
-    """
-    device = torch.device("cuda")
-    dtype = torch.bfloat16
-    M_routed, M_pad = 4 * ALIGN_SIZE_M, 2 * ALIGN_SIZE_M  # 512 routed, 256 padding
-    M_bufferlen = M_routed + M_pad
-    num_experts, K, N = 4, 256, 256
-
-    routed_rows = prepare_data((M_routed, K), dtype)
-    weights = prepare_data((num_experts, K, N), dtype)  # dispatch convention [E, K, N]
-    # Each expert owns one ALIGN_SIZE_M group of routed tokens -> cumulative offs.
-    offs = torch.tensor(
-        [(i + 1) * (M_routed // num_experts) for i in range(num_experts)],
-        dtype=torch.int32, device=device,
-    )
-
-    inputs_pad = torch.zeros(M_bufferlen, K, dtype=dtype, device=device)
-    inputs_pad[:M_routed] = routed_rows
-    inputs_pad.requires_grad_(True)
-    weights_pad = weights.clone().requires_grad_(True)
-    y_pad = _quantize_then_mxfp8_scaled_grouped_mm(
-        inputs_pad, weights_pad, offs,
-        use_2dblock_x=False, use_2dblock_w=False, use_sr_grad=False)
-
-    inputs_ref = routed_rows.clone().requires_grad_(True)
-    weights_ref = weights.clone().requires_grad_(True)
-    y_ref = _quantize_then_mxfp8_scaled_grouped_mm(
-        inputs_ref, weights_ref, offs,
-        use_2dblock_x=False, use_2dblock_w=False, use_sr_grad=False)
-
-    assert y_pad.shape == (M_bufferlen, N)
-    assert torch.equal(y_pad[M_routed:], torch.zeros_like(y_pad[M_routed:])), \
-        "padding output rows must be zero"
-    assert torch.equal(y_pad[:M_routed], y_ref), \
-        "routed output rows must be unaffected by buffer padding"
-
-    y_pad.sum().backward()
-    y_ref.sum().backward()
-    assert inputs_pad.grad.shape == (M_bufferlen, K)
-    assert torch.equal(inputs_pad.grad[M_routed:], torch.zeros_like(inputs_pad.grad[M_routed:])), \
-        "padding rows must receive zero input gradient"
-    assert torch.equal(inputs_pad.grad[:M_routed], inputs_ref.grad), \
-        "routed-row input gradients must be unaffected by buffer padding"
-    assert torch.equal(weights_pad.grad, weights_ref.grad), \
-        "weight gradients must be unaffected by buffer padding"
-
-
-def test_mxfp8_dispatch_entry_offsets_matches_indices():
-    """The offsets entry must equal the indices path: create_indices_from_offsets
-    round-trip is correct. No padding here (M_bufferlen == M_total)."""
-    device = torch.device("cuda")
-    dtype = torch.bfloat16
-    num_groups, num_experts, K, N = 4, 4, 256, 256
-    M_total = num_groups * ALIGN_SIZE_M
-
-    inputs = prepare_data((M_total, K), dtype)
-    weights = prepare_data((num_experts, K, N), dtype)  # [E, K, N] dispatch convention
-    # Contiguous round-robin routing: group g -> expert (g % num_experts).
-    indices = _make_indices(num_groups, num_experts, device)
-    # Matching cumulative offsets (one group per expert here, all sizes ALIGN_SIZE_M).
-    offs = torch.arange(1, num_groups + 1, dtype=torch.int32, device=device) * ALIGN_SIZE_M
-
-    y_indices = mxfp8_grouped_gemm(
-        inputs, weights, indices, trans_weights=False, use_sr_grad=False)
-    y_offsets = _quantize_then_mxfp8_scaled_grouped_mm(
-        inputs, weights, offs, use_sr_grad=False)
-
-    snr = calc_snr(y_indices.float(), y_offsets.float())
-    assert snr > 30, f"offsets entry vs indices path SNR too low: {snr:.1f}dB"
diff --git a/tests/unittest/mxfp8/test_mxfp8_grouped_gemm_forward.py b/tests/unittest/mxfp8/test_mxfp8_grouped_gemm_forward.py
deleted file mode 100644
index a0dbd3d..0000000
--- a/tests/unittest/mxfp8/test_mxfp8_grouped_gemm_forward.py
+++ /dev/null
@@ -1,204 +0,0 @@
-# Copyright (c) 2026 Advanced Micro Devices, Inc.
-#
-# SPDX-License-Identifier: MIT
-
-import pytest
-import torch
-
-from alto.kernels.mxfp8.mxfp8_quantization import BLOCK_SIZE_DEFAULT
-from alto.kernels.mxfp8.mxfp8_grouped_gemm.autotune import ALIGN_SIZE_M
-from alto.kernels.mxfp8.mxfp8_grouped_gemm.cg_forward import mxfp8_grouped_gemm_forward
-from alto.kernels.fp4.testing_utils import calc_snr
-
-from .utils import prepare_data, convert_from_mxfp8_pytorch
-
-
-def _cossim(x, y):
-    x = x.flatten().to(torch.float32)
-    y = y.flatten().to(torch.float32)
-    return torch.nn.functional.cosine_similarity(x, y, dim=0).item()
-
-
-def _make_indices(num_groups, num_experts, device):
-    indices = torch.zeros(num_groups * ALIGN_SIZE_M, dtype=torch.int32, device=device)
-    for g in range(num_groups):
-        e = torch.randint(0, num_experts, (1,), device=device, dtype=torch.int32).item()
-        indices[g * ALIGN_SIZE_M:(g + 1) * ALIGN_SIZE_M] = e
-    return indices
-
-
-def _reference(inputs, expert_weights, indices, num_groups, trans_weights):
-    M_total, N = inputs.shape[0], (expert_weights.shape[1] if trans_weights else expert_weights.shape[2])
-    out = torch.zeros((M_total, N), dtype=inputs.dtype, device=inputs.device)
-    for g in range(num_groups):
-        s, e = g * ALIGN_SIZE_M, (g + 1) * ALIGN_SIZE_M
-        w = expert_weights[indices[s].item()]
-        w = w.t() if trans_weights else w
-        out[s:e] = inputs[s:e] @ w
-    return out
-
-
-@pytest.mark.parametrize("shape", [(256, 128, 128, 1), (512, 256, 256, 4)])
-@pytest.mark.parametrize("use_2dblock_x", [False, True])
-@pytest.mark.parametrize("use_2dblock_w", [False, True])
-@pytest.mark.parametrize("trans_weights", [True, False])
-def test_forward(shape, use_2dblock_x, use_2dblock_w, trans_weights):
-    M_total, N, K, num_experts = shape
-    M_total = (M_total // ALIGN_SIZE_M) * ALIGN_SIZE_M
-    num_groups = M_total // ALIGN_SIZE_M
-    device = torch.device("cuda")
-    data_type = torch.bfloat16
-
-    inputs = prepare_data((M_total, K), data_type)
-    weight_shape = (num_experts, N, K) if trans_weights else (num_experts, K, N)
-    weight_axis = -1 if trans_weights else -2
-    expert_weights = prepare_data(weight_shape, data_type)
-    indices = _make_indices(num_groups, num_experts, device)
-
-    x_lp, x_s = torch.ops.alto.convert_to_mxfp8(
-        inputs, block_size=BLOCK_SIZE_DEFAULT, mxfp_format="e4m3", axis=-1, is_2d_block=use_2dblock_x)
-    w_lp, w_s = torch.ops.alto.convert_to_mxfp8(
-        expert_weights, block_size=BLOCK_SIZE_DEFAULT, mxfp_format="e4m3", axis=weight_axis, is_2d_block=use_2dblock_w)
-
-    out = mxfp8_grouped_gemm_forward(
-        x_lp, w_lp, indices, x_s, w_s,
-        trans_weights=trans_weights,
-        use_2dblock_x=use_2dblock_x,
-        use_2dblock_w=use_2dblock_w,
-        lhs_format_id=0,
-        rhs_format_id=0,
-        output_dtype=data_type,
-    )
-
-    # Primary correctness gate: dequantize the exact fp8 operands the kernel
-    # consumed, matmul in PyTorch. This isolates kernel-port correctness from
-    # mxfp8-vs-bf16 quantization error.
-    x_dq = convert_from_mxfp8_pytorch(x_lp, x_s, torch.float32, BLOCK_SIZE_DEFAULT, -1, use_2dblock_x)
-    w_dq = convert_from_mxfp8_pytorch(w_lp, w_s, torch.float32, BLOCK_SIZE_DEFAULT, weight_axis, use_2dblock_w)
-    ref_dq = _reference(x_dq, w_dq, indices, num_groups, trans_weights)
-    cos_dq = _cossim(out, ref_dq)
-    assert cos_dq > 0.999, \
-        f"kernel vs dequant-matmul cos-sim too low: {cos_dq} (shape={shape}, 2dx={use_2dblock_x}, 2dw={use_2dblock_w})"
-    # SNR catches magnitude errors (missed scale, wrong accumulator) that cossim is blind to.
-    snr_dq = calc_snr(ref_dq, out.float())
-    assert snr_dq > 40, \
-        f"kernel vs dequant-matmul SNR too low: {snr_dq:.1f}dB (shape={shape}, 2dx={use_2dblock_x}, 2dw={use_2dblock_w})"
-
-    # Looser sanity check against the full-precision bf16 path.
-    ref_bf16 = _reference(inputs, expert_weights, indices, num_groups, trans_weights)
-    cos_bf16 = _cossim(out, ref_bf16)
-    assert cos_bf16 > 0.99, f"kernel vs bf16 cos-sim too low: {cos_bf16}"
-
-
-def test_forward_dequant_fallback_matches_dot_scaled():
-    """CDNA3 path (use_dot_scaled=False) must match the dequant-matmul reference.
-
-    Forced on regardless of the running device so the _dequantize_fp8 -> tl.dot
-    branch is covered; real MI300 ground-truth still needs Step 7 on CDNA3.
-    """
-    M_total, N, K, num_experts = 256, 128, 128, 2
-    M_total = (M_total // ALIGN_SIZE_M) * ALIGN_SIZE_M
-    num_groups = M_total // ALIGN_SIZE_M
-    device = torch.device("cuda")
-    data_type = torch.bfloat16
-
-    inputs = prepare_data((M_total, K), data_type)
-    expert_weights = prepare_data((num_experts, N, K), data_type)
-    indices = _make_indices(num_groups, num_experts, device)
-
-    x_lp, x_s = torch.ops.alto.convert_to_mxfp8(
-        inputs, block_size=BLOCK_SIZE_DEFAULT, mxfp_format="e4m3", axis=-1, is_2d_block=False)
-    w_lp, w_s = torch.ops.alto.convert_to_mxfp8(
-        expert_weights, block_size=BLOCK_SIZE_DEFAULT, mxfp_format="e4m3", axis=-1, is_2d_block=True)
-
-    out = mxfp8_grouped_gemm_forward(
-        x_lp, w_lp, indices, x_s, w_s,
-        trans_weights=True,
-        use_2dblock_x=False,
-        use_2dblock_w=True,
-        output_dtype=data_type,
-        use_dot_scaled=False,
-    )
-
-    x_dq = convert_from_mxfp8_pytorch(x_lp, x_s, torch.float32, BLOCK_SIZE_DEFAULT, -1, False)
-    w_dq = convert_from_mxfp8_pytorch(w_lp, w_s, torch.float32, BLOCK_SIZE_DEFAULT, -1, True)
-    ref_dq = _reference(x_dq, w_dq, indices, num_groups, trans_weights=True)
-    cos = _cossim(out, ref_dq)
-    assert cos > 0.999, f"fallback forward vs dequant-matmul cos-sim too low: {cos}"
-
-
-def test_forward_single_expert_matches_mxfp8_linear():
-    """All tokens to one expert => grouped GEMM must match mxfp8 linear (x @ w^T).
-
-    Independent cross-check: the linear path quantizes via its own autograd
-    Function, so this catches bugs the dequant-matmul reference can't — that
-    reference shares this test's convert_to_mxfp8 output, so a quant bug would
-    corrupt both sides equally and still pass.
-    """
-    from alto.kernels.mxfp8.mxfp8_linear import _to_mxfp8_then_scaled_mm
-
-    M, N, K = ALIGN_SIZE_M, 256, 256
-    data_type = torch.bfloat16
-    x = prepare_data((M, K), data_type)
-    w = prepare_data((N, K), data_type)
-    indices = torch.zeros(M, dtype=torch.int32, device="cuda")
-
-    y_linear = _to_mxfp8_then_scaled_mm(x, w, use_2dblock_x=False, use_2dblock_w=False)
-
-    x_lp, x_s = torch.ops.alto.convert_to_mxfp8(
-        x, block_size=BLOCK_SIZE_DEFAULT, mxfp_format="e4m3", axis=-1, is_2d_block=False)
-    w_lp, w_s = torch.ops.alto.convert_to_mxfp8(
-        w.unsqueeze(0), block_size=BLOCK_SIZE_DEFAULT, mxfp_format="e4m3", axis=-1, is_2d_block=False)
-    y_gg = mxfp8_grouped_gemm_forward(
-        x_lp, w_lp, indices, x_s, w_s, trans_weights=True,
-        use_2dblock_x=False, use_2dblock_w=False, output_dtype=data_type)
-
-    snr = calc_snr(y_linear.float(), y_gg.float())
-    assert snr > 30, f"single-expert grouped GEMM vs mxfp8 linear SNR too low: {snr:.1f}dB"
-
-
-def test_forward_rejects_indices_length_mismatch():
-    M_total, N, K, num_experts = ALIGN_SIZE_M, 128, 128, 1
-    data_type = torch.bfloat16
-    inputs = prepare_data((M_total, K), data_type)
-    expert_weights = prepare_data((num_experts, N, K), data_type)
-    # A routed count below the buffer is now the padded-buffer case, not an error;
-    # but a non-128-aligned routed count (127) must still be rejected.
-    indices = torch.zeros(M_total - 1, dtype=torch.int32, device="cuda")
-    x_lp, x_s = torch.ops.alto.convert_to_mxfp8(
-        inputs, block_size=BLOCK_SIZE_DEFAULT, mxfp_format="e4m3", axis=-1, is_2d_block=False)
-    w_lp, w_s = torch.ops.alto.convert_to_mxfp8(
-        expert_weights, block_size=BLOCK_SIZE_DEFAULT, mxfp_format="e4m3", axis=-1, is_2d_block=False)
-
-    with pytest.raises(AssertionError, match="multiple of group_size_m"):
-        mxfp8_grouped_gemm_forward(x_lp, w_lp, indices, x_s, w_s)
-
-
-def test_forward_rejects_non_aligned_mtotal():
-    M_total, N, K = 64, 128, 128  # 64 not a multiple of ALIGN_SIZE_M (128)
-    data_type = torch.bfloat16
-    inputs = prepare_data((M_total, K), data_type)
-    expert_weights = prepare_data((1, N, K), data_type)
-    indices = torch.zeros(M_total, dtype=torch.int32, device="cuda")
-    x_lp, x_s = torch.ops.alto.convert_to_mxfp8(
-        inputs, block_size=BLOCK_SIZE_DEFAULT, mxfp_format="e4m3", axis=-1, is_2d_block=False)
-    w_lp, w_s = torch.ops.alto.convert_to_mxfp8(
-        expert_weights, block_size=BLOCK_SIZE_DEFAULT, mxfp_format="e4m3", axis=-1, is_2d_block=False)
-
-    with pytest.raises(AssertionError, match="multiple of group_size_m"):
-        mxfp8_grouped_gemm_forward(x_lp, w_lp, indices, x_s, w_s)
-
-
-def test_forward_rejects_wrong_scale_shape():
-    M_total, N, K, num_experts = ALIGN_SIZE_M, 128, 128, 1
-    data_type = torch.bfloat16
-    inputs = prepare_data((M_total, K), data_type)
-    expert_weights = prepare_data((num_experts, N, K), data_type)
-    indices = torch.zeros(M_total, dtype=torch.int32, device="cuda")
-    x_lp, x_s = torch.ops.alto.convert_to_mxfp8(
-        inputs, block_size=BLOCK_SIZE_DEFAULT, mxfp_format="e4m3", axis=-1, is_2d_block=False)
-    w_lp, w_s = torch.ops.alto.convert_to_mxfp8(
-        expert_weights, block_size=BLOCK_SIZE_DEFAULT, mxfp_format="e4m3", axis=-1, is_2d_block=False)
-
-    with pytest.raises(AssertionError, match="weight_scales shape"):
-        mxfp8_grouped_gemm_forward(x_lp, w_lp, indices, x_s, w_s[:, :, :-1])
diff --git a/tests/unittest/mxfp8/utils.py b/tests/unittest/mxfp8/utils.py
index 6a29ece..0cc5039 100644
--- a/tests/unittest/mxfp8/utils.py
+++ b/tests/unittest/mxfp8/utils.py
@@ -10,6 +10,26 @@
     FORMAT_TO_TARGET_MAX,
     FORMAT_TO_MBITS,
 )
+from alto.kernels.mxfp8.mxfp8_grouped_gemm.autotune import ALIGN_SIZE_M
+
+# Re-exported so test call-sites can ``from .utils import calc_snr, calc_cossim``
+# (mirrors mxfp4/nvfp4 utils); the single source of truth lives in
+# ``alto.kernels.fp4.testing_utils``.
+from alto.kernels.fp4.testing_utils import calc_snr, calc_cossim  # noqa: F401
+
+
+def make_indices(num_groups, num_experts, device):
+    """Contiguous routing: every ALIGN_SIZE_M-token group shares one expert.
+
+    Matches the mxfp4/nvfp4 convention (random per-group expert id). Calls to
+    ``prepare_data`` reset the global seed, so the ``randint`` draws that follow
+    them are deterministic across runs without an explicit seed here.
+    """
+    indices = torch.zeros(num_groups * ALIGN_SIZE_M, dtype=torch.int32, device=device)
+    for g in range(num_groups):
+        e = torch.randint(0, num_experts, (1,), device=device, dtype=torch.int32).item()
+        indices[g * ALIGN_SIZE_M:(g + 1) * ALIGN_SIZE_M] = e
+    return indices
 
 
 def prepare_data(tensor_shape, data_type, pattern="random"):