pytorch · Gasoonjia · Apr 28, 2026 · Apr 28, 2026 · Apr 28, 2026 · Apr 28, 2026
diff --git a/backends/cuda/tests/test_int4_matmul.py b/backends/cuda/tests/test_int4_matmul.py
@@ -19,7 +19,6 @@
 import unittest
 
 import torch
-import torch.nn as nn
 
 from executorch.backends.cuda.triton.kernels.int4_matmul import (
     dequant_w4_to_bf16,

diff --git a/backends/cuda/triton/kernels/__init__.py b/backends/cuda/triton/kernels/__init__.py
@@ -8,6 +8,7 @@
     fused_moe,
     fused_moe_batched,
     fused_moe_batched_gemm,
+    fused_moe_batched_gemm_int8,
     moe_align_block_size,
 )
 
@@ -23,6 +24,8 @@
     "fused_moe",
     "fused_moe_batched",
     "fused_moe_batched_gemm",
+    "fused_moe_batched_gemm_int8",
+    "int4_matvec",
     "moe_align_block_size",
     "sdpa",
     "sdpa_decode_splitk",