From ce5d99515c28e90129b48979fc021cc5a4edebca Mon Sep 17 00:00:00 2001 From: zhangstevenunity <128771452+zhangstevenunity@users.noreply.github.com> Date: Thu, 28 May 2026 10:26:43 +0800 Subject: [PATCH] fix(emitc): drain MTE before pto.comm.tnotify lowering TNOTIFY_IMPL writes the signal on the scalar pipe and only issues pipe_barrier(PIPE_ALL) *after* the store. Prior pto.tload / pto.tstore ops (local or peer-addressed) can still be in flight on MTE2/MTE3 when the signal lands, breaking the notify/wait handshake -- the receiver's TWAIT returns before the data is visible. Emit pipe_barrier(PIPE_ALL) right before the pto::comm::TNOTIFY call in PTOToEmitC so the lowering itself honors the contract, with no caller-side workaround required. - lib/PTO/Transforms/PTOToEmitC.cpp: add emitTNotifyMteDrain helper and call it from PTOSignalCommToEmitC for the TNotifyOp branch only (TWAIT/TTEST do not need the extra drain). - test/lit/pto/issue711_tnotify_mte_drain.pto: regression covering tstore->tnotify, tload->tnotify, and confirming twait does not get the new drain. - docs/PTO_IR_manual.md: document the lowering ordering guarantee. Fixes #711 Co-Authored-By: Claude Opus 4.7 --- docs/PTO_IR_manual.md | 10 ++ lib/PTO/Transforms/PTOToEmitC.cpp | 17 +++ test/lit/pto/issue711_tnotify_mte_drain.pto | 129 ++++++++++++++++++++ 3 files changed, 156 insertions(+) create mode 100644 test/lit/pto/issue711_tnotify_mte_drain.pto diff --git a/docs/PTO_IR_manual.md b/docs/PTO_IR_manual.md index a32de72f7..14dbdc92a 100644 --- a/docs/PTO_IR_manual.md +++ b/docs/PTO_IR_manual.md @@ -9371,6 +9371,16 @@ pto.comm.tget(%dst, %src, buf(%ping, %pong) : !pto.partition_tensor_view<128xf32 - `signal` must be a GM-shaped value with element type `i32`. - `value` / `cmpValue` must be signless integer scalars. +**Lowering ordering guarantee:** + +- `pto.comm.tnotify` is lowered with a `pipe_barrier(PIPE_ALL)` emitted + immediately before the `pto::comm::TNOTIFY(...)` call. `TNOTIFY_IMPL` writes + the signal on the scalar pipe and only issues its trailing barrier *after* + the store, so this preceding drain is what makes the + `peer_TWAIT_returns ⇒ everything I issued before my TNOTIFY is visible` + contract hold across `pto.tload` / `pto.tstore` (local or peer-addressed). + Callers do not need to insert manual sync. + **Examples:** ```mlir diff --git a/lib/PTO/Transforms/PTOToEmitC.cpp b/lib/PTO/Transforms/PTOToEmitC.cpp index b3f0c6bbd..65dedca95 100644 --- a/lib/PTO/Transforms/PTOToEmitC.cpp +++ b/lib/PTO/Transforms/PTOToEmitC.cpp @@ -6387,6 +6387,20 @@ static std::string notifyOpTok(pto::NotifyOp op) { return "pto::comm::NotifyOp::Set"; } +// Issue #711: TNOTIFY writes its signal on the scalar pipe, and +// TNOTIFY_IMPL's trailing pipe_barrier(PIPE_ALL) runs *after* that store. +// If any prior pto.tload / pto.tstore (local or peer) is still in flight on +// an MTE pipe when the signal lands, the receiver's matching TWAIT can +// return before the data is visible. The lowering of pto.comm.tnotify must +// drain MTE-side pipes itself; callers cannot be required to insert sync. +static void emitTNotifyMteDrain(ConversionPatternRewriter &rewriter, + Location loc) { + auto *ctx = rewriter.getContext(); + auto args = rewriter.getArrayAttr({emitc::OpaqueAttr::get(ctx, "PIPE_ALL")}); + rewriter.create(loc, TypeRange{}, "pipe_barrier", args, + ArrayAttr{}, ValueRange{}); +} + static std::string waitCmpTok(pto::WaitCmp cmp) { switch (cmp) { case pto::WaitCmp::EQ: @@ -6641,6 +6655,9 @@ struct PTOSignalCommToEmitC : public OpConversionPattern { rewriter, op.getLoc(), notifyTy, notifyOpTok(op.getNotifyOp())); SmallVector operands{*signalGT, peelUnrealized(adaptor.getValue()), notifyOp}; + // See emitTNotifyMteDrain comment: drain in-flight MTE work before the + // scalar-pipe signal store so the notify/wait handshake is honored. + emitTNotifyMteDrain(rewriter, op.getLoc()); rewriter.create(op.getLoc(), TypeRange{}, callee, ArrayAttr{}, ArrayAttr{}, operands); rewriter.eraseOp(op); diff --git a/test/lit/pto/issue711_tnotify_mte_drain.pto b/test/lit/pto/issue711_tnotify_mte_drain.pto new file mode 100644 index 000000000..b8ce7eec2 --- /dev/null +++ b/test/lit/pto/issue711_tnotify_mte_drain.pto @@ -0,0 +1,129 @@ +// Copyright (c) 2026 Huawei Technologies Co., Ltd. +// This program is free software, you can redistribute it and/or modify it under the terms and conditions of +// CANN Open Software License Agreement Version 2.0 (the "License"). +// Please refer to the License for details. You may not use this file except in compliance with the License. +// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +// See LICENSE in the root of the software repository for the full text of the License. + +// Regression for issue #711: pto.comm.tnotify lowering must drain MTE-side +// pipes before emitting pto::comm::TNOTIFY(...). TNOTIFY_IMPL writes the +// signal on the scalar pipe and only does pipe_barrier(PIPE_ALL) *after* the +// store, so without an MTE drain the signal can overtake an in-flight +// pto.tload / pto.tstore (local or peer) and the receiver's matching TWAIT +// returns before the data is visible. + +// RUN: ptoas --pto-arch=a3 %s -o - 2>&1 | FileCheck %s + +module { + // tstore -> tnotify: the data-producing case (signal must follow the store). + func.func @tnotify_drain_after_tstore( + %src_ptr: !pto.ptr, + %dst_ptr: !pto.ptr, + %signal_ptr: !pto.ptr) + attributes {pto.kernel_kind = #pto.kernel_kind} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c32 = arith.constant 32 : index + %v_i32 = arith.constant 1 : i32 + + %tile = pto.alloc_tile : + !pto.tile_buf + + %src_view = pto.make_tensor_view %src_ptr, + shape = [%c1, %c32], strides = [%c32, %c1] : !pto.tensor_view + %src = pto.partition_view %src_view, + offsets = [%c0, %c0], sizes = [%c1, %c32] + : !pto.tensor_view -> !pto.partition_tensor_view<1x32xf32> + + %dst_view = pto.make_tensor_view %dst_ptr, + shape = [%c1, %c32], strides = [%c32, %c1] : !pto.tensor_view + %dst = pto.partition_view %dst_view, + offsets = [%c0, %c0], sizes = [%c1, %c32] + : !pto.tensor_view -> !pto.partition_tensor_view<1x32xf32> + + pto.tload ins(%src : !pto.partition_tensor_view<1x32xf32>) + outs(%tile : !pto.tile_buf) + pto.tstore ins(%tile : !pto.tile_buf) + outs(%dst : !pto.partition_tensor_view<1x32xf32>) + + %sig_view = pto.make_tensor_view %signal_ptr, + shape = [%c1], strides = [%c1] : !pto.tensor_view<1xi32> + %sig = pto.partition_view %sig_view, + offsets = [%c0], sizes = [%c1] + : !pto.tensor_view<1xi32> -> !pto.partition_tensor_view<1xi32> + pto.comm.tnotify(%sig, %v_i32 : !pto.partition_tensor_view<1xi32>, i32) + {notifyOp = #pto} + return + } + + // tload -> tnotify: the input-consumed case (notify must follow the load + // so the producer can reuse the source buffer once TWAIT returns). + func.func @tnotify_drain_after_tload( + %src_ptr: !pto.ptr, + %signal_ptr: !pto.ptr) + attributes {pto.kernel_kind = #pto.kernel_kind} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c32 = arith.constant 32 : index + %v_i32 = arith.constant 1 : i32 + + %tile = pto.alloc_tile : + !pto.tile_buf + + %src_view = pto.make_tensor_view %src_ptr, + shape = [%c1, %c32], strides = [%c32, %c1] : !pto.tensor_view + %src = pto.partition_view %src_view, + offsets = [%c0, %c0], sizes = [%c1, %c32] + : !pto.tensor_view -> !pto.partition_tensor_view<1x32xf32> + + pto.tload ins(%src : !pto.partition_tensor_view<1x32xf32>) + outs(%tile : !pto.tile_buf) + + %sig_view = pto.make_tensor_view %signal_ptr, + shape = [%c1], strides = [%c1] : !pto.tensor_view<1xi32> + %sig = pto.partition_view %sig_view, + offsets = [%c0], sizes = [%c1] + : !pto.tensor_view<1xi32> -> !pto.partition_tensor_view<1xi32> + pto.comm.tnotify(%sig, %v_i32 : !pto.partition_tensor_view<1xi32>, i32) + {notifyOp = #pto} + return + } + + // twait must NOT receive the new MTE drain -- the receiver-side wait does + // not write any peer-visible data, so adding a barrier there would only be + // a needless stall. Pair this with --implicit-check-not to lock that in. + func.func @twait_no_extra_drain( + %signal_ptr: !pto.ptr) + attributes {pto.kernel_kind = #pto.kernel_kind} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %v_i32 = arith.constant 1 : i32 + + %sig_view = pto.make_tensor_view %signal_ptr, + shape = [%c1], strides = [%c1] : !pto.tensor_view<1xi32> + %sig = pto.partition_view %sig_view, + offsets = [%c0], sizes = [%c1] + : !pto.tensor_view<1xi32> -> !pto.partition_tensor_view<1xi32> + pto.comm.twait(%sig, %v_i32 : !pto.partition_tensor_view<1xi32>, i32) + {cmp = #pto} + return + } +} + +// CHECK-LABEL: AICORE void tnotify_drain_after_tstore( +// CHECK: pto::comm::NotifyOp{{.*}}= pto::comm::NotifyOp::Set; +// CHECK: TLOAD( +// CHECK: TSTORE( +// CHECK: pipe_barrier(PIPE_ALL); +// CHECK-NEXT: pto::comm::TNOTIFY( + +// CHECK-LABEL: AICORE void tnotify_drain_after_tload( +// CHECK: pto::comm::NotifyOp{{.*}}= pto::comm::NotifyOp::AtomicAdd; +// CHECK: TLOAD( +// CHECK: pipe_barrier(PIPE_ALL); +// CHECK-NEXT: pto::comm::TNOTIFY( + +// CHECK-LABEL: AICORE void twait_no_extra_drain( +// CHECK-NOT: pipe_barrier(PIPE_ALL); +// CHECK: pto::comm::TWAIT(