InfiniTensor · spike-zhu · May 21, 2026 · wooway777 · May 26, 2026
diff --git a/.gitmodules b/.gitmodules
@@ -5,3 +5,9 @@
 	path = third_party/nlohmann_json
 	url = https://github.com/nlohmann/json.git
 	branch = master
+[submodule "third_party/mate"]
+	path = third_party/mate
+	url = https://github.com/MooreThreads/mate
+	branch = v0.1.3
+	ignore = untracked
+	update = none
diff --git a/README.md b/README.md
@@ -155,7 +155,7 @@ python scripts/install.py [XMAKE_CONFIG_FLAGS]
      xmake f --ascend-npu=true -cv
      ```
 
-##### 试验功能 -- 使用flash attention库中的算子
+##### 试验功能 -- 使用英伟达平台 flash attention 库中的算子
 
   ```shell
 
@@ -176,6 +176,17 @@ python scripts/install.py [XMAKE_CONFIG_FLAGS]
 
   ```
 
+##### 试验功能 -- 使用摩尔线程开源 mate 提供的 flash attention 能力
+  ```shell
+  #该功能依赖摩尔线程开源项目 mate（https://github.com/MooreThreads/mate）的 v0.1.3 版本；该子模块默认不随仓库递归拉取。
+
+  #若需启用 Moore MATE FlashAttention，请手动初始化对应子模块：
+  git -c submodule.third_party/mate.update=checkout submodule update --init --recursive third_party/mate
+
+  #随后参考 mate v0.1.3 README 完成编译；之后在 xmake 配置环节额外打开 --aten 开关，以使用 mate 提供的 flash attention 能力，可参考：
+  xmake f --moore-gpu=y --ccl=y --aten=y -cv
+  ```
+
 2. 编译安装
 
    默认安装路径为 `$HOME/.infini`。

diff --git a/include/infinicore/adaptor/aten_adaptor.hpp b/include/infinicore/adaptor/aten_adaptor.hpp
@@ -11,6 +11,12 @@
 #include <c10/cuda/CUDAStream.h>
 #endif
 
+#if defined(ENABLE_MOORE_API)
+#include <c10/macros/Macros.h>
+#include <c10/musa/MUSAMacros.h>
+#include <c10/musa/MUSAStream.h>
+#endif
+
 namespace infinicore::adaptor {
 inline at::ScalarType to_at_dtype(DataType dtype) {
     switch (dtype) {
@@ -36,7 +42,13 @@ inline at::Device to_at_device(const Device &device) {
         return at::Device(at::kCUDA, device.getIndex());
     } else if (device.getType() == Device::Type::CPU) {
         return at::Device(at::kCPU);
-    } else {
+    }
+#if defined(ENABLE_MOORE_API)
+    else if (device.getType() == Device::Type::MOORE) {
+        return at::Device(at::DeviceType::PrivateUse1, device.getIndex());
+    }
+#endif
+    else {
         throw std::runtime_error("Unsupported device type for ATen");
     }
 }
@@ -46,6 +58,11 @@ at::Tensor to_aten_tensor(const infinicore::Tensor &t);
 #if defined(ENABLE_NVIDIA_API) || defined(ENABLE_METAX_API) || defined(ENABLE_QY_API)
 c10::cuda::CUDAStream get_cuda_stream();
 #endif
+
+#if defined(ENABLE_MOORE_API)
+c10::musa::MUSAStream get_musa_stream();
+#endif
+
 } // namespace infinicore::adaptor
 
 #endif // ENABLE_ATEN
diff --git a/python/infinicore/__init__.py b/python/infinicore/__init__.py
@@ -106,6 +106,10 @@
 from infinicore.ops.matmul import matmul
 from infinicore.ops.mha_kvcache import mha_kvcache
 from infinicore.ops.mha_varlen import mha_varlen
+from infinicore.ops.moore_mate_flash_attn import (
+    moore_mate_flash_attn_decode,
+    moore_mate_flash_attn_prefill,
+)
 from infinicore.ops.mul import mul
 from infinicore.ops.narrow import narrow
 from infinicore.ops.nrm2 import nrm2
@@ -276,6 +280,8 @@
     "zeros",
     "sum",
     "var_mean",
+    "moore_mate_flash_attn_prefill",
+    "moore_mate_flash_attn_decode",
     "var",
     "topk",
     "all",