diff --git a/.agents/skills/python-kwargs-setattr-security/SKILL.md b/.agents/skills/python-kwargs-setattr-security/SKILL.md new file mode 100644 index 0000000000000..d31d9d9cac3fa --- /dev/null +++ b/.agents/skills/python-kwargs-setattr-security/SKILL.md @@ -0,0 +1,59 @@ +--- +name: python-kwargs-setattr-security +description: When reviewing or fixing Python code that uses setattr() with user-controlled kwargs to configure C++ extension objects (SessionOptions, RunOptions, etc.) in ONNX Runtime. Use this to apply the allowlist pattern that prevents arbitrary file writes and other attacks via reflected property access. +--- + +## Problem Pattern + +Using `hasattr(obj, k) / setattr(obj, k, v)` with user-controlled kwargs is insecure. The `hasattr` check is NOT a security guard — it returns True for ALL exposed properties including dangerous ones. + +```python +# INSECURE — do not use +for k, v in kwargs.items(): + if hasattr(options, k): + setattr(options, k, v) +``` + +## Fix: Explicit Allowlist + +Define a module-level frozenset of safe attribute names. Raise RuntimeError for known-but-blocked attrs; silently ignore unknown keys. + +```python +# Define at module level, before the class +_ALLOWED_SESSION_OPTIONS = frozenset({ + "enable_cpu_mem_arena", + "enable_mem_pattern", + # ... only explicitly reviewed safe attrs +}) + +# In the method +for k, v in kwargs.items(): + if k in _ALLOWED_SESSION_OPTIONS: + setattr(options, k, v) + elif hasattr(options, k): # reuse the existing instance, don't create new + raise RuntimeError( + f"SessionOptions attribute '{k}' is not permitted via the backend API. " + f"Allowed attributes: {', '.join(sorted(_ALLOWED_SESSION_OPTIONS))}" + ) + # else: silently ignore (may be kwargs for a different config object) +``` + +## Key Rules + +1. **Use the existing object** in `hasattr(options, k)` — never `hasattr(ClassName(), k)` (creates throwaway C++ objects per iteration) +2. **RuntimeError** is the ORT convention for API misuse errors (not ValueError) +3. **Silent ignore for one path is OK when kwargs are forwarded to both paths**: `run_model()` passes the same kwargs dict to both `prepare()` (validates SessionOptions) and `rep.run()` (validates RunOptions). A RunOptions kwarg unknown to SessionOptions is silently ignored by `prepare()` — this is correct because `rep.run()` will validate it. Only raise RuntimeError when the attr exists on the target object but is blocked. +4. **Frozenset constant naming**: `_ALLOWED_` — ALL_CAPS, Google Style +5. 
**No type annotations** on module-level constants (ORT Python convention) + +## Dangerous SessionOptions Properties (never allowlist) + +- `optimized_model_filepath` — triggers Model::Save(), overwrites arbitrary files +- `profile_file_prefix` + `enable_profiling` — writes profiling JSON to arbitrary path +- `register_custom_ops_library` — loads arbitrary shared libraries (method, not property) + +## Files in ONNX Runtime + +- `onnxruntime/python/backend/backend.py` — `_ALLOWED_SESSION_OPTIONS` +- `onnxruntime/python/backend/backend_rep.py` — `_ALLOWED_RUN_OPTIONS` +- Tests: `onnxruntime/test/python/onnxruntime_test_python_backend.py` — `TestBackendKwargsAllowlist` diff --git a/.github/workflows/windows_openvino.yml b/.github/workflows/windows_openvino.yml index 8ff7a7071a755..52581c7d0a5f5 100644 --- a/.github/workflows/windows_openvino.yml +++ b/.github/workflows/windows_openvino.yml @@ -51,12 +51,12 @@ jobs: with: architecture: x64 - - name: Download OpenVINO Toolkit v2025.4.1 + - name: Download OpenVINO Toolkit v2026.1.0 env: - OpenVINOVersion: 2025.4.1 + OpenVINOVersion: 2026.1.0 shell: pwsh run: | - $Url ="https://storage.openvinotoolkit.org/repositories/openvino/packages/2025.4.1/windows/openvino_toolkit_windows_2025.4.1.20426.82bbf0292c5_x86_64.zip" + $Url ="https://storage.openvinotoolkit.org/repositories/openvino/packages/2026.1/windows_vc_mt/openvino_toolkit_windows_vc_mt_2026.1.0.21367.63e31528c62_x86_64.zip" $OutputPath = "$env:RUNNER_TEMP\openvino.zip" $ExtractPath = "$env:RUNNER_TEMP\openvino-v$env:OpenVINOVersion" $TempExtractPath = "$env:RUNNER_TEMP\openvino_temp" @@ -99,7 +99,7 @@ jobs: shell: pwsh # Use $GITHUB_ENV to set the variable for subsequent steps run: | - $openVinoRootDir = Join-Path $env:RUNNER_TEMP "openvino-v2025.4.1" + $openVinoRootDir = Join-Path $env:RUNNER_TEMP "openvino-v2026.1.0" echo "OpenVINORootDir=$openVinoRootDir" >> $env:GITHUB_ENV - name: Print OpenVINORootDir after downloading OpenVINO diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt index 8af161b524bee..83d1751e55543 100644 --- a/cmake/CMakeLists.txt +++ b/cmake/CMakeLists.txt @@ -89,6 +89,7 @@ option(onnxruntime_USE_RKNPU "Build with RKNPU support" OFF) option(onnxruntime_USE_DNNL "Build with DNNL support" OFF) option(onnxruntime_USE_JSEP "Build with JavaScript implemented kernels support" OFF) option(onnxruntime_USE_SVE "Build with SVE support in MLAS" OFF) +option(onnxruntime_USE_RVV "Build with RISC-V Vector support in MLAS" OFF) option(onnxruntime_USE_ARM_NEON_NCHWC "Build with ARM Neon NCHWc kernels in MLAS" OFF) option(onnxruntime_USE_KLEIDIAI "Build with KleidiAI integration in MLAS" OFF) diff --git a/cmake/deps.txt b/cmake/deps.txt index 448e6fcb23f2f..fa37238bbb82e 100644 --- a/cmake/deps.txt +++ b/cmake/deps.txt @@ -51,7 +51,7 @@ pytorch_cpuinfo;https://github.com/pytorch/cpuinfo/archive/403d652dca4c1046e8145 re2;https://github.com/google/re2/archive/refs/tags/2024-07-02.zip;646e1728269cde7fcef990bf4a8e87b047882e88 safeint;https://github.com/dcleblanc/SafeInt/archive/refs/tags/3.0.28.zip;23f252040ff6cb9f1fd18575b32fa8fb5928daac tensorboard;https://github.com/tensorflow/tensorboard/archive/373eb09e4c5d2b3cc2493f0949dc4be6b6a45e81.zip;67b833913605a4f3f499894ab11528a702c2b381 -cutlass;https://github.com/NVIDIA/cutlass/archive/refs/tags/v4.2.1.zip;5d2b21b10478556c5e209dd7229e298a5c9f0b02 +cutlass;https://github.com/NVIDIA/cutlass/archive/refs/tags/v4.4.2.zip;4b0bae4428b84370407c0a71778b13dc2eee5be1 
extensions;https://github.com/microsoft/onnxruntime-extensions/archive/c24b7bab0c12f53da76d0c31b03b9f0f8ec8f3b4.zip;239063aee4946a9af147b473a4c3da78ba7413b4 directx_headers;https://github.com/microsoft/DirectX-Headers/archive/refs/tags/v1.613.1.zip;47653509a3371eabb156360f42faf582f314bf2e cudnn_frontend;https://github.com/NVIDIA/cudnn-frontend/archive/refs/tags/v1.12.0.zip;7e733cfdc410d777b76122d64232499205589a96 diff --git a/cmake/external/cutlass.cmake b/cmake/external/cutlass.cmake index 83d8a156b630f..62187fd0ca63f 100644 --- a/cmake/external/cutlass.cmake +++ b/cmake/external/cutlass.cmake @@ -4,7 +4,7 @@ onnxruntime_fetchcontent_declare( URL ${DEP_URL_cutlass} URL_HASH SHA1=${DEP_SHA1_cutlass} EXCLUDE_FROM_ALL - PATCH_COMMAND ${Patch_EXECUTABLE} --ignore-whitespace -p1 < ${PROJECT_SOURCE_DIR}/patches/cutlass/cutlass_4.2.1.patch + PATCH_COMMAND ${Patch_EXECUTABLE} --ignore-whitespace -p1 < ${PROJECT_SOURCE_DIR}/patches/cutlass/cutlass_4.4.2.patch ) FetchContent_GetProperties(cutlass) diff --git a/cmake/external/onnxruntime_external_deps.cmake b/cmake/external/onnxruntime_external_deps.cmake index 4afa074a0b254..be0abc980bda6 100644 --- a/cmake/external/onnxruntime_external_deps.cmake +++ b/cmake/external/onnxruntime_external_deps.cmake @@ -376,6 +376,18 @@ if (CPUINFO_SUPPORTED) ${Patch_EXECUTABLE} -p1 < ${PROJECT_SOURCE_DIR}/patches/cpuinfo/win_arm_fp16_detection_fallback.patch FIND_PACKAGE_ARGS NAMES cpuinfo ) + elseif(CMAKE_SYSTEM_NAME STREQUAL "Linux") + message(STATUS "Applying sysfs fallback patch for cpuinfo on Linux") + onnxruntime_fetchcontent_declare( + pytorch_cpuinfo + URL ${DEP_URL_pytorch_cpuinfo} + URL_HASH SHA1=${DEP_SHA1_pytorch_cpuinfo} + EXCLUDE_FROM_ALL + PATCH_COMMAND + # https://github.com/microsoft/onnxruntime/issues/10038 + ${Patch_EXECUTABLE} -p1 < ${PROJECT_SOURCE_DIR}/patches/cpuinfo/fix_missing_sysfs_fallback.patch + FIND_PACKAGE_ARGS NAMES cpuinfo + ) else() onnxruntime_fetchcontent_declare( pytorch_cpuinfo diff --git a/cmake/onnxruntime_mlas.cmake b/cmake/onnxruntime_mlas.cmake index bde73252449dc..0233254ad50ad 100644 --- a/cmake/onnxruntime_mlas.cmake +++ b/cmake/onnxruntime_mlas.cmake @@ -435,6 +435,8 @@ else() set(X86 TRUE) elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(x86_64|amd64)$") set(X86_64 TRUE) + elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^riscv64.*") + set(RISCV64 TRUE) elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^loongarch64.*") set(LOONGARCH64 TRUE) elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^s390x$") @@ -903,6 +905,48 @@ endif() set(MLAS_SOURCE_IS_NOT_SET 0) endif() endif() + if(RISCV64 AND MLAS_SOURCE_IS_NOT_SET) + file(GLOB_RECURSE mlas_platform_srcs CONFIGURE_DEPENDS + "${MLAS_SRC_DIR}/scalar/*.cpp") + + if(onnxruntime_USE_RVV) + set(OLD_CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS}") + set(CMAKE_REQUIRED_FLAGS "${OLD_CMAKE_REQUIRED_FLAGS} -march=rv64gcv -mabi=lp64d") + check_cxx_source_compiles(" + #include + #include + int main() { + size_t vl = __riscv_vsetvl_e32m1(4); + return static_cast(vl == 0); + }" + HAS_RISCV64_RVV + ) + set(CMAKE_REQUIRED_FLAGS "${OLD_CMAKE_REQUIRED_FLAGS}") + unset(OLD_CMAKE_REQUIRED_FLAGS) + + if(HAS_RISCV64_RVV) + list(APPEND mlas_platform_srcs + ${MLAS_SRC_DIR}/riscv64/sgemm_pack_b_rvv.cpp + ${MLAS_SRC_DIR}/riscv64/sgemm_kernel_rvv.cpp + ${MLAS_SRC_DIR}/riscv64/softmax_kernel_rvv.cpp + ) + set_source_files_properties( + ${MLAS_SRC_DIR}/riscv64/sgemm_pack_b_rvv.cpp + ${MLAS_SRC_DIR}/riscv64/sgemm_kernel_rvv.cpp + ${MLAS_SRC_DIR}/riscv64/softmax_kernel_rvv.cpp + PROPERTIES COMPILE_FLAGS "-march=rv64gcv -mabi=lp64d") + 
list(APPEND mlas_private_compile_definitions MLAS_USE_RVV=1) + else() + message( + WARNING + "onnxruntime_USE_RVV was requested, but the compiler does not support rv64gcv RVV intrinsics. Falling back to scalar MLAS kernels.") + endif() + endif() + + if(NOT ONNXRUNTIME_MLAS_MULTI_ARCH) + set(MLAS_SOURCE_IS_NOT_SET 0) + endif() + endif() if(NOT ONNXRUNTIME_MLAS_MULTI_ARCH AND MLAS_SOURCE_IS_NOT_SET) file(GLOB_RECURSE mlas_platform_srcs "${MLAS_SRC_DIR}/scalar/*.cpp") @@ -997,4 +1041,4 @@ if (NOT onnxruntime_ORT_MINIMAL_BUILD) endif() endif() -endif() \ No newline at end of file +endif() diff --git a/cmake/onnxruntime_unittests.cmake b/cmake/onnxruntime_unittests.cmake index 4e5636572b94a..bd12b50b7af43 100644 --- a/cmake/onnxruntime_unittests.cmake +++ b/cmake/onnxruntime_unittests.cmake @@ -1400,6 +1400,7 @@ if (NOT onnxruntime_ENABLE_TRAINING_TORCH_INTEROP) SET(MLAS_BENCH_DIR ${TEST_SRC_DIR}/mlas/bench) file(GLOB_RECURSE MLAS_BENCH_SOURCE_FILES "${MLAS_BENCH_DIR}/*.cpp" "${MLAS_BENCH_DIR}/*.h") + list(FILTER MLAS_BENCH_SOURCE_FILES EXCLUDE REGEX "${MLAS_BENCH_DIR}/riscv64/.*") onnxruntime_add_executable(onnxruntime_mlas_benchmark ${MLAS_BENCH_SOURCE_FILES} ${ONNXRUNTIME_ROOT}/core/framework/error_code.cc) target_include_directories(onnxruntime_mlas_benchmark PRIVATE ${ONNXRUNTIME_ROOT}/core/mlas/inc) target_link_libraries(onnxruntime_mlas_benchmark PRIVATE benchmark::benchmark onnxruntime_util ${ONNXRUNTIME_MLAS_LIBS} onnxruntime_common ${CMAKE_DL_LIBS}) @@ -1418,6 +1419,33 @@ if (NOT onnxruntime_ENABLE_TRAINING_TORCH_INTEROP) target_link_libraries(onnxruntime_mlas_benchmark PRIVATE cpuinfo) endif() set_target_properties(onnxruntime_mlas_benchmark PROPERTIES FOLDER "ONNXRuntimeTest") + + endif() + + if(CMAKE_SYSTEM_PROCESSOR MATCHES "^riscv64.*") + set(MLAS_RISCV64_BENCH_DIR ${TEST_SRC_DIR}/mlas/bench/riscv64) + + onnxruntime_add_executable( + onnxruntime_mlas_sgemm_riscv_bench + ${MLAS_RISCV64_BENCH_DIR}/sgemm_riscv_bench.cpp) + target_include_directories(onnxruntime_mlas_sgemm_riscv_bench PRIVATE ${ONNXRUNTIME_ROOT}/core/mlas/inc) + target_link_libraries( + onnxruntime_mlas_sgemm_riscv_bench + PRIVATE ${ONNXRUNTIME_MLAS_LIBS} onnxruntime_common ${CMAKE_DL_LIBS}) + target_compile_definitions(onnxruntime_mlas_sgemm_riscv_bench PRIVATE ${mlas_private_compile_definitions}) + set_target_properties(onnxruntime_mlas_sgemm_riscv_bench PROPERTIES FOLDER "ONNXRuntimeTest") + + onnxruntime_add_executable( + onnxruntime_mlas_softmax_riscv_compare + ${MLAS_RISCV64_BENCH_DIR}/softmax_rvv_compare.cpp) + target_include_directories( + onnxruntime_mlas_softmax_riscv_compare + PRIVATE ${ONNXRUNTIME_ROOT} ${ONNXRUNTIME_ROOT}/core/mlas/inc) + target_link_libraries( + onnxruntime_mlas_softmax_riscv_compare + PRIVATE ${ONNXRUNTIME_MLAS_LIBS} onnxruntime_common ${CMAKE_DL_LIBS}) + target_compile_definitions(onnxruntime_mlas_softmax_riscv_compare PRIVATE ${mlas_private_compile_definitions}) + set_target_properties(onnxruntime_mlas_softmax_riscv_compare PROPERTIES FOLDER "ONNXRuntimeTest") endif() if(WIN32) diff --git a/cmake/patches/cpuinfo/fix_missing_sysfs_fallback.patch b/cmake/patches/cpuinfo/fix_missing_sysfs_fallback.patch new file mode 100644 index 0000000000000..005cd458fdd2b --- /dev/null +++ b/cmake/patches/cpuinfo/fix_missing_sysfs_fallback.patch @@ -0,0 +1,83 @@ +diff --git a/src/linux/processors.c b/src/linux/processors.c +index 47bee76..d0c5569 100644 +--- a/src/linux/processors.c ++++ b/src/linux/processors.c +@@ -2,0 +3 @@ ++#include +@@ -291,0 +293,22 @@ ++static uint32_t 
cpuinfo_linux_get_max_processor_from_sysconf( ++ uint32_t max_processors_count, ++ const char* processor_list_name) { ++ const long nproc = sysconf(_SC_NPROCESSORS_ONLN); ++ if (nproc <= 0) { ++ cpuinfo_log_warning( ++ "failed to query online processors from sysconf(_SC_NPROCESSORS_ONLN) for %s", ++ processor_list_name); ++ return UINT32_MAX; ++ } ++ ++ uint32_t max_processor = (uint32_t)(nproc - 1); ++ if ((uint64_t)nproc > (uint64_t)max_processors_count) { ++ cpuinfo_log_warning( ++ "online processors count %ld exceeds system limit %" PRIu32 ": truncating to the latter", ++ nproc, ++ max_processors_count); ++ max_processor = max_processors_count - 1; ++ } ++ return max_processor; ++} ++ +@@ -301 +324 @@ +- return UINT32_MAX; ++ return cpuinfo_linux_get_max_processor_from_sysconf(max_processors_count, POSSIBLE_CPULIST_FILENAME); +@@ -323 +346 @@ +- return UINT32_MAX; ++ return cpuinfo_linux_get_max_processor_from_sysconf(max_processors_count, PRESENT_CPULIST_FILENAME); +@@ -357,0 +381,31 @@ ++static bool cpuinfo_linux_detect_processors_from_sysconf( ++ uint32_t max_processors_count, ++ uint32_t* processor0_flags, ++ uint32_t processor_struct_size, ++ uint32_t detected_flag, ++ const char* processor_list_name) { ++ const long nproc = sysconf(_SC_NPROCESSORS_ONLN); ++ if (nproc <= 0) { ++ cpuinfo_log_warning( ++ "failed to query online processors from sysconf(_SC_NPROCESSORS_ONLN) for %s", ++ processor_list_name); ++ return false; ++ } ++ ++ uint32_t processors_count = (uint32_t)nproc; ++ if ((uint64_t)nproc > (uint64_t)max_processors_count) { ++ cpuinfo_log_warning( ++ "online processors count %ld exceeds system limit %" PRIu32 ": truncating to the latter", ++ nproc, ++ max_processors_count); ++ processors_count = max_processors_count; ++ } ++ ++ for (uint32_t processor = 0; processor < processors_count; processor++) { ++ *((uint32_t*)((uintptr_t)processor0_flags + processor_struct_size * processor)) |= detected_flag; ++ } ++ cpuinfo_log_warning( ++ "falling back to sysconf(_SC_NPROCESSORS_ONLN) = %ld for %s", nproc, processor_list_name); ++ return true; ++} ++ +@@ -373 +427,6 @@ +- return false; ++ return cpuinfo_linux_detect_processors_from_sysconf( ++ max_processors_count, ++ processor0_flags, ++ processor_struct_size, ++ possible_flag, ++ POSSIBLE_CPULIST_FILENAME); +@@ -392 +451,6 @@ +- return false; ++ return cpuinfo_linux_detect_processors_from_sysconf( ++ max_processors_count, ++ processor0_flags, ++ processor_struct_size, ++ present_flag, ++ PRESENT_CPULIST_FILENAME); diff --git a/cmake/patches/cutlass/cutlass_4.2.1.patch b/cmake/patches/cutlass/cutlass_4.4.2.patch similarity index 100% rename from cmake/patches/cutlass/cutlass_4.2.1.patch rename to cmake/patches/cutlass/cutlass_4.4.2.patch diff --git a/include/onnxruntime/ep/adapter/allocator.h b/include/onnxruntime/ep/adapter/allocator.h index 4e601bb22252b..1798be23e4ed0 100644 --- a/include/onnxruntime/ep/adapter/allocator.h +++ b/include/onnxruntime/ep/adapter/allocator.h @@ -8,6 +8,7 @@ #endif #include +#include #include "core/framework/allocator.h" @@ -15,6 +16,59 @@ namespace onnxruntime { namespace ep { namespace adapter { +// Wraps an OrtAllocator* exposed by the C API as an IAllocator. +// Takes ownership of the wrapped Ort::Allocator and releases it on destruction. 
+class IAllocatorWrappingOrtAllocator final : public IAllocator { + public: + explicit IAllocatorWrappingOrtAllocator(Ort::Allocator ort_allocator) + : IAllocator(*(EnsureOrtAllocatorHasValue(ort_allocator).GetInfo())), + ort_allocator_(std::move(ort_allocator)) { + } + + void* Alloc(size_t size) override { + return ort_allocator_.Alloc(size); + } + + void Free(void* p) override { + ort_allocator_.Free(p); + } + + void* Reserve(size_t size) override { + return ort_allocator_.Reserve(size); + } + + bool IsStreamAware() const override { + return false; + + // TODO: Enable once AllocOnStream() is implemented. + // static constexpr uint32_t kOrtAllocatorAllocOnStreamMinVersion = 23; + // const OrtAllocator* raw = ort_allocator_; + // return raw->version >= kOrtAllocatorAllocOnStreamMinVersion && raw->AllocOnStream != nullptr; + } + + void* AllocOnStream(size_t /*size*/, Stream* /*stream*/) override { + // TODO: Implement AllocOnStream(). + // The internal `onnxruntime::IAllocator::AllocOnStream` signature takes an internal `onnxruntime::Stream*` + // argument, while the public `::OrtAllocator::AllocOnStream` signature takes an `::OrtSyncStream*` argument. + // We need to properly map from one to the other. + // `::OrtSyncStream*` should be treated as an opaque type from the plugin EP's perspective. + ORT_NOT_IMPLEMENTED("IAllocatorWrappingOrtAllocator::AllocOnStream is not implemented yet."); + } + + private: + static const Ort::Allocator& EnsureOrtAllocatorHasValue(const Ort::Allocator& ort_allocator) { + ORT_ENFORCE(ort_allocator != nullptr, "Ort::Allocator must contain a non-nullptr OrtAllocator."); + return ort_allocator; + } + + // TODO: Consider adding GetStats() override. Requires parsing OrtKeyValuePairs from the C API + // into AllocatorStats; see GetStatsFromOrtAllocator() in allocator_adapters.cc for reference. + + Ort::Allocator ort_allocator_; + + ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(IAllocatorWrappingOrtAllocator); +}; + /// /// A bridge class between the EP API OrtAllocator and an IAllocator implementation. /// diff --git a/include/onnxruntime/ep/adapter/op_kernel_info.h b/include/onnxruntime/ep/adapter/op_kernel_info.h index 7e61385f3686c..417ebd4adf7a2 100644 --- a/include/onnxruntime/ep/adapter/op_kernel_info.h +++ b/include/onnxruntime/ep/adapter/op_kernel_info.h @@ -8,14 +8,16 @@ #endif #include +#include +#include "core/common/inlined_containers.h" #include "core/common/narrow.h" #include "core/common/status.h" #include "core/framework/config_options.h" -#include "core/framework/op_kernel_info.h" #include "core/framework/tensor_shape.h" #include "core/framework/tensor.h" +#include "allocator.h" #include "node.h" #include "kernel_def.h" #include "tensor_helper.h" @@ -43,12 +45,11 @@ struct OpKernelInfo { // to manage the lifetime of the cached data. struct KernelInfoCache { explicit KernelInfoCache(const OrtKernelInfo* kernel_info) : kernel_info_(kernel_info) { - const auto* core_kernel_info = reinterpret_cast(kernel_info); - execution_provider_ = core_kernel_info->GetExecutionProvider(); - ort_ep_ = execution_provider_ != nullptr ? execution_provider_->GetOrtEp() : nullptr; - ep_impl_ = ort_ep_ != nullptr ? 
(static_cast(ort_ep_))->EpImpl() : execution_provider_; - Ort::ConstKernelInfo info{kernel_info}; + ort_ep_ = info.GetEp(); + ORT_ENFORCE(ort_ep_ != nullptr, "Plugin EP adapter requires a non-null OrtEp"); + ep_impl_ = static_cast(ort_ep_)->EpImpl(); + const size_t input_count = info.GetInputCount(); constant_input_tensors.resize(input_count); for (size_t i = 0; i < input_count; ++i) { @@ -60,10 +61,13 @@ struct OpKernelInfo { } } const OrtKernelInfo* kernel_info_; - const ::onnxruntime::IExecutionProvider* execution_provider_{}; const OrtEp* ort_ep_{}; const ::onnxruntime::IExecutionProvider* ep_impl_{}; std::vector constant_input_tensors; + + mutable std::shared_mutex allocator_cache_mutex_; + mutable InlinedHashMap allocator_cache_; + ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(KernelInfoCache); }; @@ -74,11 +78,34 @@ struct OpKernelInfo { return (static_cast(cache_->ort_ep_))->GetDataTransferManager(); } - // Delegates to the core OpKernelInfo::GetAllocator so the adapter returns - // exactly the same allocator the framework would provide for each OrtMemType. AllocatorPtr GetAllocator(OrtMemType mem_type) const { - const auto* core_kernel_info = reinterpret_cast(cache_->kernel_info_); - return core_kernel_info->GetAllocator(mem_type); + { + std::shared_lock lock(cache_->allocator_cache_mutex_); + auto it = cache_->allocator_cache_.find(mem_type); + if (it != cache_->allocator_cache_.end()) { + return it->second; + } + } + + std::unique_lock lock(cache_->allocator_cache_mutex_); + // Double-check after acquiring exclusive lock + auto it = cache_->allocator_cache_.find(mem_type); + if (it != cache_->allocator_cache_.end()) { + return it->second; + } + + OrtAllocator* ort_allocator_raw = nullptr; + Ort::Status status(Ort::GetApi().KernelInfoGetAllocator(cache_->kernel_info_, mem_type, &ort_allocator_raw)); + + if (!status.IsOK() || ort_allocator_raw == nullptr) { + cache_->allocator_cache_.emplace(mem_type, nullptr); + return nullptr; + } + + Ort::Allocator ort_allocator{ort_allocator_raw}; + auto allocator = std::make_shared(std::move(ort_allocator)); + cache_->allocator_cache_.emplace(mem_type, allocator); + return allocator; } Node node() const noexcept { diff --git a/js/react_native/README.md b/js/react_native/README.md index f7b118e81573d..d57dbad2b37f8 100644 --- a/js/react_native/README.md +++ b/js/react_native/README.md @@ -16,6 +16,18 @@ With ONNX Runtime React Native, React Native developers can score pre-trained ON npm install onnxruntime-react-native ``` +React Native's autolinking registers the native Android and iOS modules automatically. No manual changes to `settings.gradle`, `build.gradle`, or `MainApplication` are required for bare React Native projects. + +For Expo managed/prebuild workflows, add the config plugin to your `app.json`/`app.config.js`: + +```json +{ + "plugins": ["onnxruntime-react-native"] +} +``` + +Then run `npx expo prebuild` to apply the native changes. 
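+
+If your project uses `app.config.js` instead of `app.json`, the equivalent entry is the same `plugins` array. A minimal sketch (merge it into your existing config export; field names other than `plugins` are placeholders):
+
+```js
+// app.config.js: minimal sketch, keep the rest of your existing Expo config
+module.exports = {
+  // ...name, slug, and other existing fields
+  plugins: ['onnxruntime-react-native'],
+};
+```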
+ ### Usage ```js diff --git a/js/react_native/app.plugin.js b/js/react_native/app.plugin.js index 2fa117b1a14e5..7f6bd8b55dae3 100644 --- a/js/react_native/app.plugin.js +++ b/js/react_native/app.plugin.js @@ -23,6 +23,56 @@ const withOrt = (config) => { return config; }); + // Register OnnxruntimePackage in MainApplication for New Architecture / Expo prebuild + config = configPlugin.withMainApplication(config, (config) => { + const lang = config.modResults.language; + if (lang === 'kt') { + config.modResults.contents = generateCode.mergeContents({ + src: config.modResults.contents, + newSrc: 'import ai.onnxruntime.reactnative.OnnxruntimePackage', + tag: 'onnxruntime-react-native-import', + anchor: /^import /m, + offset: 0, + comment: '//', + }).contents; + config.modResults.contents = generateCode.mergeContents({ + src: config.modResults.contents, + newSrc: ' add(OnnxruntimePackage())', + tag: 'onnxruntime-react-native-package', + anchor: /override fun getPackages\(\)/, + offset: 2, + comment: '//', + }).contents; + } else if (lang === 'java') { + config.modResults.contents = generateCode.mergeContents({ + src: config.modResults.contents, + newSrc: 'import ai.onnxruntime.reactnative.OnnxruntimePackage;', + tag: 'onnxruntime-react-native-import', + anchor: /^import /m, + offset: 0, + comment: '//', + }).contents; + if (!config.modResults.contents.includes('packages.add(new OnnxruntimePackage())')) { + if (/return\s+new PackageList\(this\)\.getPackages\(\);/.test(config.modResults.contents)) { + config.modResults.contents = config.modResults.contents.replace( + /(\s*)return\s+new PackageList\(this\)\.getPackages\(\);/, + '$1List packages = new PackageList(this).getPackages();\n$1packages.add(new OnnxruntimePackage());\n$1return packages;', + ); + } else { + config.modResults.contents = generateCode.mergeContents({ + src: config.modResults.contents, + newSrc: ' packages.add(new OnnxruntimePackage());', + tag: 'onnxruntime-react-native-package', + anchor: /^\s*List\s+packages\s*=\s*new PackageList\(this\)\.getPackages\(\);\s*$/m, + offset: 1, + comment: '//', + }).contents; + } + } + } + return config; + }); + // Add build dependency to pod file config = configPlugin.withDangerousMod(config, [ 'ios', diff --git a/js/react_native/package.json b/js/react_native/package.json index 854e66c6f7239..b518adf14b327 100644 --- a/js/react_native/package.json +++ b/js/react_native/package.json @@ -49,6 +49,7 @@ "ios/*.mm", "onnxruntime-react-native.podspec", "app.plugin.js", + "react-native.config.js", "unimodule.json", "!dist/commonjs/*.js.map", "!dist/module/*.js.map", diff --git a/js/react_native/react-native.config.js b/js/react_native/react-native.config.js new file mode 100644 index 0000000000000..87759a6f45ad9 --- /dev/null +++ b/js/react_native/react-native.config.js @@ -0,0 +1,11 @@ +module.exports = { + dependency: { + platforms: { + android: { + packageImportPath: 'import ai.onnxruntime.reactnative.OnnxruntimePackage;', + packageInstance: 'new OnnxruntimePackage()', + }, + ios: {}, + }, + }, +}; diff --git a/js/web/lib/wasm/jsep/webgpu/ops/rotary-embedding.ts b/js/web/lib/wasm/jsep/webgpu/ops/rotary-embedding.ts index fe2567e71d49a..9bbad9839d616 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/rotary-embedding.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/rotary-embedding.ts @@ -62,6 +62,14 @@ const validateInputs = (inputs: readonly TensorView[], attributes: RotaryEmbeddi } } + if (sequenceLength > maxSequenceLength) { + throw new Error('Updating cos_cache and sin_cache in RotaryEmbedding is not 
currently supported'); + } + + // Note: position_ids value validation is handled by shader-side bounds checks (defense-in-depth). + // We cannot validate position_ids values here because the tensor is GPU-resident — its data field + // is a GPU buffer ID, not a WASM heap pointer, so getBigInt64Array() would read garbage. + if (headSize / 2 !== cosCache.dims[1] && rotaryEmbeddingDim / 2 !== cosCache.dims[1]) { throw new Error( `Input 'cos_cache' dimension 1 should be same as head_size / 2 or rotary_embedding_dim / 2, got ${ @@ -69,10 +77,6 @@ const validateInputs = (inputs: readonly TensorView[], attributes: RotaryEmbeddi }`, ); } - - if (sequenceLength > maxSequenceLength) { - throw new Error('Updating cos_cache and sin_cache in RotaryEmbedding is not currently supported'); - } }; export const createRotaryEmbeddingProgramInfo = ( diff --git a/onnxruntime/contrib_ops/cuda/bert/attention_data.h b/onnxruntime/contrib_ops/cuda/bert/attention_data.h index c54a1fea9ad3a..98f92b79e6ec6 100644 --- a/onnxruntime/contrib_ops/cuda/bert/attention_data.h +++ b/onnxruntime/contrib_ops/cuda/bert/attention_data.h @@ -237,11 +237,27 @@ struct PagedAttentionData { // Fused op buffers T* workspace_buffer = nullptr; + // Memory-efficient attention (CUTLASS fMHA) buffers for the unfused fallback path + // taken when FlashAttention is unavailable (SM<80 or ORT_DISABLE_FLASH_ATTENTION). + T* gathered_key = nullptr; // [total_kv_tokens, num_heads, head_size], packed varlen (GQA-expanded) + T* gathered_value = nullptr; // [total_kv_tokens, num_heads, head_size], packed varlen (GQA-expanded) + T* fmha_buffer = nullptr; // CUTLASS fMHA output-accumulator workspace + // Populated by the caller after a D->H sync on cumulative_seqlens_kv[batch_size]. + int total_kv_tokens = 0; + + // Actual max of per-batch new-query lengths (cumulative_seqlens_q[i+1] - cumulative_seqlens_q[i]). + // Populated by the caller via the same D->H sync so the MEA path's rotary grid and MEA's + // grid_x (ceil_div(sequence_length, kQueriesPerBlock)) cover every query token. The previous + // heuristic `token_count - batch_size + 1` underestimates when any batch has 0 new tokens, + // producing silent per-token dropout in MEA and rotary. 
+ int max_query_len = 0; + // Output Tensors T* output = nullptr; // Kernel Flags bool use_flash_attention = false; + bool use_memory_efficient_attention = false; }; } // namespace cuda diff --git a/onnxruntime/contrib_ops/cuda/bert/paged_attention.cc b/onnxruntime/contrib_ops/cuda/bert/paged_attention.cc index 5df2c8b438771..7fba61270e280 100644 --- a/onnxruntime/contrib_ops/cuda/bert/paged_attention.cc +++ b/onnxruntime/contrib_ops/cuda/bert/paged_attention.cc @@ -9,6 +9,7 @@ #include "contrib_ops/cuda/bert/paged_attention.h" #include "contrib_ops/cuda/bert/paged_attention_helper.h" #include "contrib_ops/cuda/bert/flash_attention/flash_api.h" +#include "contrib_ops/cuda/bert/cutlass_fmha/memory_efficient_attention.h" using namespace onnxruntime::cuda; using namespace ::onnxruntime::common; @@ -50,6 +51,7 @@ PagedAttention::PagedAttention(const OpKernelInfo& info) kernel_options_ = this->GetAttentionKernelOptions(); disable_flash_attention_ = sizeof(T) != 2 || !kernel_options_->UseFlashAttention(); + disable_memory_efficient_attention_ = sizeof(T) != 2 || !kernel_options_->UseEfficientAttention(); } template @@ -141,31 +143,57 @@ Status PagedAttention::ComputeInternal(OpKernelContext* context) const { "value_cache and value_cache_out must be the same buffer"); } - // Check flash kernel availability and allocate buffers + // Empty query input: output is already shaped [0, hidden_size], and the cache outputs + // alias the input caches (verified above), so no backend kernel or cache update is needed. + if (parameters.token_count == 0) { + return Status::OK(); + } + + // Kernel backend selection — FlashAttention preferred, fall back to MemoryEfficientAttention. #if USE_FLASH_ATTENTION bool use_flash_attention = !disable_flash_attention_ && onnxruntime::flash::is_supported(device_prop, parameters.head_size, parameters.num_heads, parameters.kv_num_heads); - size_t softmax_lse_bytes = 0; - if (use_flash_attention) { - softmax_lse_bytes = onnxruntime::flash::get_softmax_lse_size(parameters.token_count, - parameters.num_heads); - } - auto softmax_lse_buffer = GetScratchBuffer(softmax_lse_bytes, GetComputeStream(context)); #else constexpr bool use_flash_attention = false; - auto softmax_lse_buffer = GetScratchBuffer(0, GetComputeStream(context)); // nullptr #endif - if (!use_flash_attention) { +#if USE_MEMORY_EFFICIENT_ATTENTION + const int sm = device_prop.major * 10 + device_prop.minor; + const bool is_half = std::is_same::value; + const bool is_bf16 = std::is_same::value; + bool use_memory_efficient_attention = + !use_flash_attention && + !disable_memory_efficient_attention_ && + has_memory_efficient_attention(sm, is_half, is_bf16, + parameters.head_size, parameters.head_size); +#else + constexpr bool use_memory_efficient_attention = false; +#endif + + if (!use_flash_attention && !use_memory_efficient_attention) { return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, - "Currently PagedAttention is only supported through the FlashAttention kernel."); + "PagedAttention requires FlashAttention (sm>=80, fp16/bf16) or " + "MemoryEfficientAttention (fp16 sm>=53, bf16 sm>=80, head_size<=1024 and %8==0) " + "to be available. Check ORT_DISABLE_FLASH_ATTENTION / " + "ORT_DISABLE_MEMORY_EFFICIENT_ATTENTION env vars and dtype/head_size."); } + // Scratch buffers common to both backends. 
+ size_t softmax_lse_bytes = 0; +#if USE_FLASH_ATTENTION + if (use_flash_attention) { + softmax_lse_bytes = onnxruntime::flash::get_softmax_lse_size(parameters.token_count, + parameters.num_heads); + } +#endif + auto softmax_lse_buffer = GetScratchBuffer(softmax_lse_bytes, GetComputeStream(context)); + size_t cumulative_seqlens_kv_bytes = sizeof(int) * (parameters.batch_size + 1); auto cumulative_seqlens_kv_buffer = GetScratchBuffer(cumulative_seqlens_kv_bytes, GetComputeStream(context)); + int* cumulative_seqlens_kv_ptr = reinterpret_cast(cumulative_seqlens_kv_buffer.get()); size_t workspace_buffer_bytes = 0; if (do_rotary_) { @@ -175,10 +203,91 @@ Status PagedAttention::ComputeInternal(OpKernelContext* context) const { } auto workspace_buffer = GetScratchBuffer(workspace_buffer_bytes, GetComputeStream(context)); + // Populate cumulative_seqlens_kv for both backends. The MEA path additionally needs + // the last element on the host to size the tight gather buffers, so we D->H sync below. + // + // LaunchGetCumulativeSeqlensKV uses a per-block cub::BlockScan with a block size of 256 + // and launches (batch_size + 255) / 256 blocks, so blocks scan independently. Enforce + // batch_size <= 256 so the cumulative sum is correct; a larger batch would silently + // produce wrong KV offsets. (A future grid-wide scan could lift this limit.) + constexpr int kMaxBatchSizeForCumulativeSeqlensKV = 256; + if (parameters.batch_size > kMaxBatchSizeForCumulativeSeqlensKV) { + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, + "PagedAttention currently supports batch_size <= ", + kMaxBatchSizeForCumulativeSeqlensKV, + " (LaunchGetCumulativeSeqlensKV limitation); got batch_size=", + parameters.batch_size, "."); + } + + cudaStream_t cuda_stream = static_cast(ort_stream.get()->GetHandle()); + ORT_RETURN_IF_ERROR(LaunchGetCumulativeSeqlensKV( + cumulative_seqlens_kv_ptr, + reinterpret_cast(cumulative_seqlens_q->Data()), + reinterpret_cast(past_seqlens->Data()), + parameters.batch_size, cuda_stream)); + + int total_kv_tokens = 0; + int max_query_len = 0; + IAllocatorUniquePtr gathered_key_buffer; + IAllocatorUniquePtr gathered_value_buffer; + IAllocatorUniquePtr fmha_buffer; + +#if USE_MEMORY_EFFICIENT_ATTENTION + if (use_memory_efficient_attention) { + // MEA needs two host-side quantities: + // - total_kv_tokens (= cumulative_seqlens_kv[batch_size]) to size tight gather buffers. + // - max_query_len (= max per-batch new-query length) to size the rotary and MEA grids + // correctly. The heuristic `token_count - batch_size + 1` underestimates when any + // batch has 0 new tokens (valid input), silently dropping query-tokens from those + // larger-than-average batches. + // Both come from cumulative_seqlens_q / cumulative_seqlens_kv, which are tiny (batch+1 + // ints each), so one D->H copy of the full arrays is cheaper than issuing an extra + // reduction kernel and avoids a second sync. 
+ const int kCumulativeCount = parameters.batch_size + 1; + auto cum_q_pinned = this->AllocateBufferOnCPUPinned(kCumulativeCount); + auto cum_kv_pinned = this->AllocateBufferOnCPUPinned(kCumulativeCount); + CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(cum_q_pinned.get(), + reinterpret_cast(cumulative_seqlens_q->Data()), + sizeof(int) * kCumulativeCount, cudaMemcpyDeviceToHost, cuda_stream)); + CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(cum_kv_pinned.get(), cumulative_seqlens_kv_ptr, + sizeof(int) * kCumulativeCount, cudaMemcpyDeviceToHost, cuda_stream)); + CUDA_RETURN_IF_ERROR(cudaStreamSynchronize(cuda_stream)); + total_kv_tokens = cum_kv_pinned.get()[parameters.batch_size]; + for (int i = 0; i < parameters.batch_size; ++i) { + const int q_len_i = cum_q_pinned.get()[i + 1] - cum_q_pinned.get()[i]; + if (q_len_i > max_query_len) { + max_query_len = q_len_i; + } + } + if (total_kv_tokens == 0) { + return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, + "PagedAttention MEA fallback: total_kv_tokens is zero for non-empty input."); + } + if (total_kv_tokens < 0) { + return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, + "PagedAttention MEA fallback: total_kv_tokens is negative (", total_kv_tokens, ")."); + } + + const size_t gather_elems = static_cast(total_kv_tokens) * + parameters.num_heads * parameters.head_size; + gathered_key_buffer = GetScratchBuffer(sizeof(T) * gather_elems, GetComputeStream(context)); + gathered_value_buffer = GetScratchBuffer(sizeof(T) * gather_elems, GetComputeStream(context)); + + if (MemoryEfficientAttentionParams::need_workspace(parameters.head_size, sizeof(T) == sizeof(float))) { + // MEA output accumulator is float32 regardless of input dtype (see GQA pattern at + // group_query_attention.cc:482); use sizeof(float), not sizeof(T). + const size_t fmha_elems = static_cast(parameters.token_count) * + parameters.num_heads * parameters.head_size; + fmha_buffer = GetScratchBuffer(sizeof(float) * fmha_elems, GetComputeStream(context)); + } + } +#endif + // Print debug info if (kernel_options_->AllowDebugInfo()) { AttentionKernelDebugInfo debug_info; debug_info.use_flash_attention = use_flash_attention; + debug_info.use_efficient_attention = use_memory_efficient_attention; debug_info.Print("PagedAttention", this->Node().Name(), @@ -194,10 +303,11 @@ Status PagedAttention::ComputeInternal(OpKernelContext* context) const { data.value_cache = reinterpret_cast(const_cast(value_cache->Data())); data.cumulative_seqlens_q = reinterpret_cast(cumulative_seqlens_q->Data()); data.past_seqlens = reinterpret_cast(past_seqlens->Data()); - data.cumulative_seqlens_kv = reinterpret_cast(cumulative_seqlens_kv_buffer.get()); + data.cumulative_seqlens_kv = cumulative_seqlens_kv_ptr; data.block_table = reinterpret_cast(block_table->Data()); data.output = reinterpret_cast(output->MutableData()); data.use_flash_attention = use_flash_attention; + data.use_memory_efficient_attention = use_memory_efficient_attention; if (softmax_lse_buffer != nullptr) { data.softmax_lse = reinterpret_cast(softmax_lse_buffer.get()); } @@ -208,6 +318,15 @@ Status PagedAttention::ComputeInternal(OpKernelContext* context) const { data.cos_cache = reinterpret_cast(cos_cache->Data()); data.sin_cache = reinterpret_cast(sin_cache->Data()); } + if (use_memory_efficient_attention) { + data.gathered_key = reinterpret_cast(gathered_key_buffer.get()); + data.gathered_value = reinterpret_cast(gathered_value_buffer.get()); + if (fmha_buffer != nullptr) { + data.fmha_buffer = reinterpret_cast(fmha_buffer.get()); + } + data.total_kv_tokens = total_kv_tokens; + 
data.max_query_len = max_query_len; + } cublasHandle_t cublas = GetCublasHandle(context); diff --git a/onnxruntime/contrib_ops/cuda/bert/paged_attention.h b/onnxruntime/contrib_ops/cuda/bert/paged_attention.h index a3df144745f61..027141f02b9ae 100644 --- a/onnxruntime/contrib_ops/cuda/bert/paged_attention.h +++ b/onnxruntime/contrib_ops/cuda/bert/paged_attention.h @@ -29,6 +29,7 @@ class PagedAttention final : public CudaKernel { float scale_; float softcap_; bool disable_flash_attention_; + bool disable_memory_efficient_attention_; const AttentionKernelOptions* kernel_options_; }; diff --git a/onnxruntime/contrib_ops/cuda/bert/paged_attention_impl.cu b/onnxruntime/contrib_ops/cuda/bert/paged_attention_impl.cu index 06608ebed44cc..2241fa232a2c0 100644 --- a/onnxruntime/contrib_ops/cuda/bert/paged_attention_impl.cu +++ b/onnxruntime/contrib_ops/cuda/bert/paged_attention_impl.cu @@ -9,6 +9,7 @@ #include "contrib_ops/cuda/bert/attention_softmax.h" #include "contrib_ops/cuda/utils/dump_cuda_tensor.h" #include "contrib_ops/cuda/bert/flash_attention/flash_api.h" +#include "contrib_ops/cuda/bert/cutlass_fmha/memory_efficient_attention.h" #include "contrib_ops/cuda/bert/paged_attention_impl.h" #include "core/providers/cuda/shared_inc/cuda_call.h" #include "contrib_ops/cuda/bert/rotary_embedding_impl.h" @@ -237,6 +238,101 @@ Status LaunchReshapeAndCache(const T* key, const T* value, T* key_cache, T* valu return CUDA_CALL(cudaGetLastError()); } +// Gather paged KV into packed-varlen [total_kv_tokens, num_heads, head_size], expanding GQA heads. +// total_elems = total_kv_tokens * num_heads * head_size can exceed INT32_MAX for realistic +// large-context GQA configs (e.g., 2M tokens * 64 * 128 = 16.4B), so the linear index is int64_t +// and the kernel uses a grid-stride loop instead of a single (tid >= total_elems) early-exit. +template +__global__ void GatherAndExpandPagedKVCache(const T* __restrict__ key_cache, + const T* __restrict__ value_cache, + T* __restrict__ gathered_key, + T* __restrict__ gathered_value, + const int* __restrict__ block_table, + const int* __restrict__ cumulative_seqlens_kv, + const int batch_size, + const int num_heads, + const int kv_num_heads, + const int head_size, + const int block_size, + const int max_num_blocks_per_seq, + const int64_t total_elems) { + const int64_t stride = static_cast(gridDim.x) * blockDim.x; + const int64_t num_heads_times_head = static_cast(num_heads) * head_size; + const int q_kv_head_ratio = num_heads / kv_num_heads; + const int64_t page_stride = static_cast(block_size) * kv_num_heads * head_size; + + for (int64_t tid = threadIdx.x + static_cast(blockIdx.x) * blockDim.x; + tid < total_elems; + tid += stride) { + const int h = static_cast(tid % head_size); + const int head_id = static_cast((tid / head_size) % num_heads); + const int token_id = static_cast(tid / num_heads_times_head); + + // cumulative_seqlens_kv is a prefix sum of non-negative per-batch KV lengths + // (past_seqlens[i] + new_tokens[i]), so it is monotonically non-decreasing for + // any valid op input — the same assumption the previous linear scan made. + // Binary-search for the batch this token belongs to: log2(batch_size) is strictly + // better than the linear scan, which ran once per (token, head, h) element and + // multiplied its cost by num_heads * head_size. 
+ int left = 0; + int right = batch_size; + while (left < right) { + const int mid = left + (right - left) / 2; + if (token_id < cumulative_seqlens_kv[mid + 1]) { + right = mid; + } else { + left = mid + 1; + } + } + const int batch_id = left; + + const int pos = token_id - cumulative_seqlens_kv[batch_id]; + const int block_idx_in_seq = pos / block_size; + const int block_offset = pos % block_size; + const int block_id = block_table[batch_id * max_num_blocks_per_seq + block_idx_in_seq]; + + // GQA expansion: each output head maps to kv_head_id = head_id / (num_heads / kv_num_heads). + // For MHA (num_heads == kv_num_heads) this is the identity. + const int kv_head_id = head_id / q_kv_head_ratio; + + const int64_t paged_idx = static_cast(block_id) * page_stride + + static_cast(block_offset) * kv_num_heads * head_size + + kv_head_id * head_size + + h; + + gathered_key[tid] = key_cache[paged_idx]; + gathered_value[tid] = value_cache[paged_idx]; + } +} + +template +Status LaunchGatherAndExpandPagedKVCache(const T* key_cache, const T* value_cache, + T* gathered_key, T* gathered_value, + const int* block_table, const int* cumulative_seqlens_kv, + const int batch_size, const int num_heads, + const int kv_num_heads, const int head_size, + const int block_size, const int max_num_blocks_per_seq, + const int total_kv_tokens, cudaStream_t stream, + const int max_threads_per_block) { + const int64_t total_elems = static_cast(total_kv_tokens) * num_heads * head_size; + if (total_elems == 0) { + return Status::OK(); + } + // With the op's batch_size <= 256 precondition (paged_attention.cc) and MEA's + // head_size <= 1024 cap, blocks_needed = ceil(total_elems / threads) stays comfortably + // within int range for any realistic input, so no explicit clamp is needed. The kernel + // uses a grid-stride loop so launching fewer blocks than total_elems / threads would + // also be correct — we don't need an artificial "keep SMs busy" cap. + const int threads = static_cast(std::min(max_threads_per_block, total_elems)); + const int blocks = static_cast((total_elems + threads - 1) / threads); + GatherAndExpandPagedKVCache<<>>( + key_cache, value_cache, gathered_key, gathered_value, + block_table, cumulative_seqlens_kv, + batch_size, num_heads, kv_num_heads, head_size, + block_size, max_num_blocks_per_seq, total_elems); + return CUDA_CALL(cudaGetLastError()); +} + ////////// Launch Kernels #if USE_FLASH_ATTENTION @@ -276,12 +372,11 @@ Status FlashAttention( value = reinterpret_cast(key) + static_cast(kv_num_heads * head_size); } - // Calculate cumulative present sequence length in cumulative_seqlens_kv + // cumulative_seqlens_kv is populated by the caller (paged_attention.cc) before QkvToContext; + // shared across FA and MEA dispatch paths so the host can also read total_kv_tokens. int* cumulative_seqlens_q = const_cast(data.cumulative_seqlens_q); int* past_seqlens = const_cast(data.past_seqlens); int* cumulative_seqlens_kv = data.cumulative_seqlens_kv; - ORT_RETURN_IF_ERROR(LaunchGetCumulativeSeqlensKV(cumulative_seqlens_kv, cumulative_seqlens_q, past_seqlens, - batch_size, stream)); if (parameters.do_rotary) { // Will unpack Q and K in case of packed_qkv @@ -335,6 +430,127 @@ Status FlashAttention( } #endif +#if USE_MEMORY_EFFICIENT_ATTENTION +// Fallback when FlashAttention is unavailable (SM<80 or ORT_DISABLE_FLASH_ATTENTION=1). 
+// Mirrors the FlashAttention preprocessing (rotary, unpack, ReshapeAndCache), then gathers +// the paged KV cache into a packed-varlen [total_kv_tokens, num_heads, head_size] buffer and +// dispatches to CUTLASS memory-efficient attention via its seqstart_q / seqstart_k varlen ABI. +// Caller must populate data.gathered_key / data.gathered_value / data.total_kv_tokens. +template +Status EfficientAttention( + const cudaDeviceProp& device_prop, + cudaStream_t stream, + contrib::PagedAttentionParameters& parameters, + PagedAttentionData& data, + float scale) { + const int max_threads_per_block = device_prop.maxThreadsPerBlock; + const int batch_size = parameters.batch_size; + const int token_count = parameters.token_count; + const int q_hidden_size = parameters.hidden_size; + const int kv_hidden_size = parameters.kv_hidden_size; + const int num_heads = parameters.num_heads; + const int kv_num_heads = parameters.kv_num_heads; + const int head_size = parameters.head_size; + const int block_size = parameters.block_size; + const int max_num_blocks_per_seq = parameters.max_num_blocks_per_seq; + const int local_window_size = parameters.local_window_size; + const int total_kv_tokens = data.total_kv_tokens; + // Use the caller-computed actual max of per-batch new-query lengths, not the + // `token_count - batch_size + 1` heuristic: the heuristic assumes >=1 new token per batch + // and underestimates otherwise, which would silently drop query tokens from the + // rotary grid and from MEA's `grid_x = ceil_div(sequence_length, kQueriesPerBlock)`. + const int max_query_len = data.max_query_len; + + T* query = const_cast(data.query); + T* key; + T* value; + if (!parameters.is_packed_qkv) { + key = const_cast(data.key); + value = const_cast(data.value); + } else { + key = reinterpret_cast(query) + static_cast(num_heads * head_size); + value = reinterpret_cast(key) + static_cast(kv_num_heads * head_size); + } + + // cumulative_seqlens_kv is populated by the caller (paged_attention.cc) before QkvToContext; + // shared across FA and MEA dispatch paths. + int* cumulative_seqlens_q = const_cast(data.cumulative_seqlens_q); + int* past_seqlens = const_cast(data.past_seqlens); + int* cumulative_seqlens_kv = data.cumulative_seqlens_kv; + + if (parameters.do_rotary) { + auto q_buffer = data.workspace_buffer; + auto k_buffer = data.workspace_buffer + token_count * num_heads * head_size; + const int packed_seq_stride = parameters.is_packed_qkv ? 
(num_heads + 2 * kv_num_heads) * head_size : -1; + ORT_RETURN_IF_ERROR(LaunchRotaryEmbeddingKernel( + stream, q_buffer, query, past_seqlens, cumulative_seqlens_q, data.cos_cache, data.sin_cache, batch_size, + max_query_len, num_heads, head_size, parameters.rotary_dim, parameters.rotary_interleaved, packed_seq_stride, + max_threads_per_block)); + ORT_RETURN_IF_ERROR(LaunchRotaryEmbeddingKernel( + stream, k_buffer, key, past_seqlens, cumulative_seqlens_q, data.cos_cache, data.sin_cache, batch_size, + max_query_len, kv_num_heads, head_size, parameters.rotary_dim, parameters.rotary_interleaved, packed_seq_stride, + max_threads_per_block)); + query = q_buffer; + key = k_buffer; + } else if (parameters.is_packed_qkv) { + auto q_buffer = data.workspace_buffer; + const int packed_seq_stride = q_hidden_size + 2 * kv_hidden_size; + ORT_RETURN_IF_ERROR(LaunchUnpackCumulative( + query, q_buffer, token_count, q_hidden_size, packed_seq_stride, stream, max_threads_per_block)); + query = q_buffer; + } + + int* block_table = const_cast(data.block_table); + const int key_stride = parameters.is_packed_qkv && !parameters.do_rotary ? q_hidden_size + 2 * kv_hidden_size : kv_hidden_size; + const int value_stride = parameters.is_packed_qkv ? q_hidden_size + 2 * kv_hidden_size : kv_hidden_size; + ORT_RETURN_IF_ERROR(LaunchReshapeAndCache(key, value, data.key_cache, data.value_cache, block_table, past_seqlens, + cumulative_seqlens_q, batch_size, max_num_blocks_per_seq, token_count, + kv_hidden_size, block_size, key_stride, value_stride, stream, + max_threads_per_block)); + + ORT_RETURN_IF_ERROR(LaunchGatherAndExpandPagedKVCache( + data.key_cache, data.value_cache, data.gathered_key, data.gathered_value, + block_table, cumulative_seqlens_kv, batch_size, num_heads, kv_num_heads, + head_size, block_size, max_num_blocks_per_seq, total_kv_tokens, stream, max_threads_per_block)); + + MemoryEfficientAttentionParams p; + p.sm = device_prop.major * 10 + device_prop.minor; + p.is_bf16 = std::is_same::value; + p.is_half = !p.is_bf16 && (sizeof(T) == 2); + p.batch_size = batch_size; + p.num_heads = num_heads; + p.sequence_length = max_query_len; + p.kv_sequence_length = total_kv_tokens; + p.max_sequence_length = total_kv_tokens; + p.qk_head_size = head_size; + p.v_head_size = head_size; + p.causal = true; + p.scale = scale; + p.softcap = parameters.softcap; + p.local_window_size = local_window_size; + p.seqstart_q_ptr = cumulative_seqlens_q; + p.seqstart_k_ptr = cumulative_seqlens_kv; + p.seqlen_k_ptr = nullptr; + p.query = query; + p.key = data.gathered_key; + p.value = data.gathered_value; + p.attn_bias = nullptr; + p.is_kv_bsnh = true; + p.has_custom_right_padding = false; + p.output = data.output; + p.workspace = MemoryEfficientAttentionParams::need_workspace(head_size, sizeof(T) == sizeof(float)) + ? 
data.fmha_buffer + : nullptr; + p.stream = stream; + run_memory_efficient_attention(p); + + DUMP_TENSOR_INIT(); + DUMP_TENSOR("mea paged attention output", data.output, token_count, num_heads, head_size); + + return Status::OK(); +} +#endif + ////////// API Functions template @@ -353,7 +569,13 @@ Status QkvToContext( } #endif - return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Unfused Paged Attention not implemented."); +#if USE_MEMORY_EFFICIENT_ATTENTION + if (data.use_memory_efficient_attention) { + return EfficientAttention(device_prop, stream, parameters, data, scale); + } +#endif + + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "No PagedAttention kernel available for the current configuration."); } template struct PagedAttentionData; diff --git a/onnxruntime/contrib_ops/cuda/bert/paged_attention_impl.h b/onnxruntime/contrib_ops/cuda/bert/paged_attention_impl.h index 7e27556a5c63f..22f9793be0af6 100644 --- a/onnxruntime/contrib_ops/cuda/bert/paged_attention_impl.h +++ b/onnxruntime/contrib_ops/cuda/bert/paged_attention_impl.h @@ -27,6 +27,11 @@ Status LaunchUnpackQKVCumulative(const T* packed_qkv, T* unpacked_q, T* unpacked const int kv_num_heads, const int head_size, const int token_count, cudaStream_t stream, const int max_threads_per_block); +// Exposed so paged_attention.cc can populate cumulative_seqlens_kv on both the FA and MEA +// dispatch paths (producer hoisted out of FlashAttention/UnfusedAttention in impl.cu). +Status LaunchGetCumulativeSeqlensKV(int32_t* cumulative_seqlens_kv, const int32_t* cumulative_seqlens_q, + const int32_t* past_seqlens, const int batch_size, cudaStream_t stream); + } // namespace cuda } // namespace contrib } // namespace onnxruntime diff --git a/onnxruntime/contrib_ops/webgpu/bert/rotary_embedding.cc b/onnxruntime/contrib_ops/webgpu/bert/rotary_embedding.cc index 9f81e490971cd..69d2db391ce3c 100644 --- a/onnxruntime/contrib_ops/webgpu/bert/rotary_embedding.cc +++ b/onnxruntime/contrib_ops/webgpu/bert/rotary_embedding.cc @@ -35,13 +35,28 @@ Status RotaryEmbeddingProgram::GenerateShaderCode(ShaderHelper& shader) const { " if (global_idx >= size) { return; }\n" " if (bsnh[3] < half_rotary_emb_dim) {\n" << " let position_ids_idx = " << position_ids.BroadcastedIndicesToOffset("bsnh.xy", output_indices) << ";\n" - << " let position_id = u32(" << position_ids.GetByOffset("position_ids_idx") << ") + select(0, bsnh[1], position_ids_idx == 0);\n" + << " let raw_pos = " << position_ids.GetByOffset("position_ids_idx") << ";\n" << " let i = dot(bsnh, uniforms.input_output_stride) + select(0, bsnh[3], " << interleaved_str << ");\n" << " let j = i + select(half_rotary_emb_dim, 1, " << interleaved_str << ");\n" - << " let re = " << input.GetByOffset("i") << " * " << cos_cache.GetByIndices("vec2(position_id, bsnh[3])") << " - " << input.GetByOffset("j") << " * " << sin_cache.GetByIndices("vec2(position_id, bsnh[3])") << ";\n" - << " " << output.SetByOffset("i", "re") << "\n" - << " let im = " << input.GetByOffset("i") << " * " << sin_cache.GetByIndices("vec2(position_id, bsnh[3])") << " + " << input.GetByOffset("j") << " * " << cos_cache.GetByIndices("vec2(position_id, bsnh[3])") << ";\n" - << " " << output.SetByOffset("j", "im") << "\n" + " let max_position = uniforms.cos_cache_shape[0];\n" + // Bounds check: raw_pos < 0 catches negative position_ids (i32 from truncated int64). + // After u32 conversion + offset, check >= max_position catches too-large values. + // On OOB, pass through input unchanged (same as CUDA kernel behavior). 
+ " if (raw_pos < 0) {\n" + << " " << output.SetByOffset("i", input.GetByOffset("i")) << "\n" + << " " << output.SetByOffset("j", input.GetByOffset("j")) << "\n" + " } else {\n" + " let position_id = u32(raw_pos) + select(0, bsnh[1], position_ids_idx == 0);\n" + " if (position_id >= max_position) {\n" + << " " << output.SetByOffset("i", input.GetByOffset("i")) << "\n" + << " " << output.SetByOffset("j", input.GetByOffset("j")) << "\n" + " } else {\n" + << " let re = " << input.GetByOffset("i") << " * " << cos_cache.GetByIndices("vec2(position_id, bsnh[3])") << " - " << input.GetByOffset("j") << " * " << sin_cache.GetByIndices("vec2(position_id, bsnh[3])") << ";\n" + << " " << output.SetByOffset("i", "re") << "\n" + << " let im = " << input.GetByOffset("i") << " * " << sin_cache.GetByIndices("vec2(position_id, bsnh[3])") << " + " << input.GetByOffset("j") << " * " << cos_cache.GetByIndices("vec2(position_id, bsnh[3])") << ";\n" + << " " << output.SetByOffset("j", "im") << "\n" + " }\n" + " }\n" << " } else { \n" " let k = dot(bsnh, uniforms.input_output_stride) + half_rotary_emb_dim;\n" << " " << output.SetByOffset("k", input.GetByOffset("k")) << "\n" @@ -74,24 +89,39 @@ Status FusedQKRotaryEmbeddingProgram::GenerateShaderCode(ShaderHelper& shader) c << " let seqlen = u32(seqlen_i);\n" << " let total_seqlen = seqlen + 1u;\n" << " let past_seqlen = total_seqlen - uniforms.q_global_shape[1];\n" + // position_id is derived from past_seqlen + sequence_idx (always non-negative). << " let position_id = past_seqlen + sequence_idx;\n" - << " let cos_v = " << cos_cache.GetByIndices("vec2(position_id, bsnh[3])") << ";\n" - << " let sin_v = " << sin_cache.GetByIndices("vec2(position_id, bsnh[3])") << ";\n" << " let qi = dot(bsnh, uniforms.q_input_output_stride) + select(0u, bsnh[3], " << interleaved_str << ");\n" << " let qj = qi + select(half_rotary_dim, 1u, " << interleaved_str << ");\n" - << " let q_re = " << q_input.GetByOffset("qi") << " * cos_v - " << q_input.GetByOffset("qj") << " * sin_v;\n" - << " " << q_output.SetByOffset("qi", "q_re") << "\n" - << " let q_im = " << q_input.GetByOffset("qi") << " * sin_v + " << q_input.GetByOffset("qj") << " * cos_v;\n" - << " " << q_output.SetByOffset("qj", "q_im") << "\n" + // Bounds check: position_id must be within cos/sin cache range. + // On OOB, pass through input unchanged (same as CUDA kernel behavior). 
+ " let max_position = uniforms.cos_cache_shape[0];\n" + " if (position_id >= max_position) {\n" + << " " << q_output.SetByOffset("qi", q_input.GetByOffset("qi")) << "\n" + << " " << q_output.SetByOffset("qj", q_input.GetByOffset("qj")) << "\n" + << " if (bsnh[2] < uniforms.k_global_shape[2]) {\n" + << " let ki = dot(bsnh, uniforms.k_input_output_stride) + select(0u, bsnh[3], " << interleaved_str << ");\n" + << " let kj = ki + select(half_rotary_dim, 1u, " << interleaved_str << ");\n" + << " " << k_output.SetByOffset("ki", k_input.GetByOffset("ki")) << "\n" + << " " << k_output.SetByOffset("kj", k_input.GetByOffset("kj")) << "\n" + " }\n" + " } else {\n" + << " let cos_v = " << cos_cache.GetByIndices("vec2(position_id, bsnh[3])") << ";\n" + << " let sin_v = " << sin_cache.GetByIndices("vec2(position_id, bsnh[3])") << ";\n" + << " let q_re = " << q_input.GetByOffset("qi") << " * cos_v - " << q_input.GetByOffset("qj") << " * sin_v;\n" + << " " << q_output.SetByOffset("qi", "q_re") << "\n" + << " let q_im = " << q_input.GetByOffset("qi") << " * sin_v + " << q_input.GetByOffset("qj") << " * cos_v;\n" + << " " << q_output.SetByOffset("qj", "q_im") << "\n" // Conditionally process Key (only for heads that exist in K domain) - << " if (bsnh[2] < uniforms.k_global_shape[2]) {\n" - << " let ki = dot(bsnh, uniforms.k_input_output_stride) + select(0u, bsnh[3], " << interleaved_str << ");\n" - << " let kj = ki + select(half_rotary_dim, 1u, " << interleaved_str << ");\n" - << " let k_re = " << k_input.GetByOffset("ki") << " * cos_v - " << k_input.GetByOffset("kj") << " * sin_v;\n" - << " " << k_output.SetByOffset("ki", "k_re") << "\n" - << " let k_im = " << k_input.GetByOffset("ki") << " * sin_v + " << k_input.GetByOffset("kj") << " * cos_v;\n" - << " " << k_output.SetByOffset("kj", "k_im") << "\n" - << " }\n" + << " if (bsnh[2] < uniforms.k_global_shape[2]) {\n" + << " let ki = dot(bsnh, uniforms.k_input_output_stride) + select(0u, bsnh[3], " << interleaved_str << ");\n" + << " let kj = ki + select(half_rotary_dim, 1u, " << interleaved_str << ");\n" + << " let k_re = " << k_input.GetByOffset("ki") << " * cos_v - " << k_input.GetByOffset("kj") << " * sin_v;\n" + << " " << k_output.SetByOffset("ki", "k_re") << "\n" + << " let k_im = " << k_input.GetByOffset("ki") << " * sin_v + " << k_input.GetByOffset("kj") << " * cos_v;\n" + << " " << k_output.SetByOffset("kj", "k_im") << "\n" + " }\n" + " }\n" << " } else {\n" << " let qk = dot(bsnh, uniforms.q_input_output_stride) + half_rotary_dim;\n" << " " << q_output.SetByOffset("qk", q_input.GetByOffset("qk")) << "\n" @@ -127,6 +157,11 @@ Status RotaryEmbedding::ComputeInternal(onnxruntime::webgpu::ComputeContext& con const auto half_rotary_embedding_dim = onnxruntime::narrow(cos_cache->Shape()[1]); const auto head_size = rotary_embedding_dim_ == 0 ? half_rotary_embedding_dim * 2 : hidden_size / num_heads_; + // position_ids bounds validation is handled by shader-side defense-in-depth checks + // (OOB position_ids → pass-through input unchanged). Host-side value scanning is not possible + // because WebGPU program inputs must be GPU buffers (InputMemoryType(OrtMemTypeCPUInput) is + // incompatible with AddInputs). + // Rotary embeddings will be calculated in a pair-wise fashion. In accordance, use the shape // [batch size, sequence length, num of heads, num of pairs to rotate + num of dims to copy] // to unfold the global index in shader. 
diff --git a/onnxruntime/core/framework/sparse_utils.cc b/onnxruntime/core/framework/sparse_utils.cc index c42f6d190512c..48b28db9f9028 100644 --- a/onnxruntime/core/framework/sparse_utils.cc +++ b/onnxruntime/core/framework/sparse_utils.cc @@ -7,6 +7,7 @@ #include "core/common/span_utils.h" #include "core/common/status.h" +#include "core/common/safeint.h" #include "core/framework/tensor.h" #include "core/framework/data_types_internal.h" #include "core/framework/data_transfer_manager.h" @@ -256,16 +257,36 @@ Status SparseCsrToDenseTensor(const DataTransferManager& data_manager, const Spa } void* output = cpu_result.MutableDataRaw(); + const auto dense_size = cpu_result.Shape().Size(); + const auto outer_size = outer_span.size(); + const auto inner_size = static_cast(inner_span.size()); + + // Validate CSR structural invariants (O(1) checks). + if (outer_size > 0) { + ORT_RETURN_IF_NOT(outer_span[0] == 0, + "CSR outer index must start at 0, got: ", outer_span[0]); + ORT_RETURN_IF_NOT(outer_span[outer_size - 1] == inner_size, + "CSR outer index last element must equal inner index count (", + inner_size, "), got: ", outer_span[outer_size - 1]); + } - size_t src_idx = 0; size_t inner_idx = 0; - for (size_t out_i = 1; out_i < outer_span.size(); ++out_i) { + for (size_t out_i = 1; out_i < outer_size; ++out_i) { + ORT_RETURN_IF_NOT(outer_span[out_i] >= outer_span[out_i - 1], + "CSR outer index not non-decreasing at position ", out_i, + ": ", outer_span[out_i]); auto row_size = outer_span[out_i] - outer_span[out_i - 1]; for (int64_t cnt = 0; cnt < row_size; ++cnt, ++inner_idx) { - assert(inner_idx < inner_span.size()); + ORT_RETURN_IF_NOT(inner_idx < inner_span.size(), + "CSR inner index out of range: inner_idx=", inner_idx, + " >= inner_span.size()=", inner_span.size()); auto col = inner_span[inner_idx]; - auto dst_idx = (out_i - 1) * cols + col; - copy_func(output, values, dst_idx, src_idx); + ORT_RETURN_IF_NOT(col >= 0 && col < cols, "Invalid CSR column index: ", col); + // Use SafeInt to prevent overflow during index calculation. + int64_t dst_idx = SafeInt(out_i - 1) * cols + col; + ORT_RETURN_IF_NOT(dst_idx >= 0 && dst_idx < dense_size, + "Invalid CSR computed index: ", dst_idx); + copy_func(output, values, dst_idx, inner_idx); } } } @@ -356,15 +377,22 @@ Status SparseCooToDenseTensor(const DataTransferManager& data_manager, const Spa if (num_indices == num_values) { for (int64_t src_idx = 0; src_idx < num_values; ++src_idx) { auto dst_idx = indices[src_idx]; - ORT_RETURN_IF_NOT(dst_idx < dense_size, "Invalid index: ", dst_idx, " > dense_size: ", dense_size); + ORT_RETURN_IF_NOT(dst_idx >= 0 && dst_idx < dense_size, + "Invalid COO index: ", dst_idx); copy_func(output, values, dst_idx, src_idx); } } else { + const auto rows = src_dims[0]; const auto cols = src_dims[1]; for (int64_t src_idx = 0; src_idx < num_values; ++src_idx) { auto tuple_idx = src_idx * 2; - auto dst_idx = indices[tuple_idx] * cols + indices[tuple_idx + 1]; - ORT_RETURN_IF_NOT(dst_idx < dense_size, "Invalid index: ", dst_idx, " > dense_size: ", dense_size); + auto r = indices[tuple_idx]; + auto c = indices[tuple_idx + 1]; + ORT_RETURN_IF_NOT(r >= 0 && r < rows && c >= 0 && c < cols, + "Invalid COO 2D index: (", r, ", ", c, + ") must be in [0, ", rows, ") x [0, ", cols, ")"); + // Use SafeInt to prevent overflow during index calculation. 
+ int64_t dst_idx = SafeInt(r) * cols + c; copy_func(output, values, dst_idx, src_idx); } } diff --git a/onnxruntime/core/framework/tensorprotoutils.cc b/onnxruntime/core/framework/tensorprotoutils.cc index 59bde3742b288..3e928afcf6c80 100644 --- a/onnxruntime/core/framework/tensorprotoutils.cc +++ b/onnxruntime/core/framework/tensorprotoutils.cc @@ -2074,7 +2074,7 @@ static Status CopySparseData(const std::string& name, switch (indices.data_type()) { case ONNX_NAMESPACE::TensorProto_DataType_INT64: if (needs_unpack) { - ORT_RETURN_IF_NOT(indices.raw_data().size() == (narrow(indices_elements) * sizeof(int64_t)), + ORT_RETURN_IF_NOT(indices.raw_data().size() == SafeInt(indices_elements) * sizeof(int64_t), "Sparse tensor: ", name, " indices raw data size does not match expected: ", indices_elements * sizeof(int64_t)); ORT_RETURN_IF_ERROR(UnpackInitializerData(indices, model_path, unpack_buffer)); @@ -2088,7 +2088,7 @@ static Status CopySparseData(const std::string& name, break; case ONNX_NAMESPACE::TensorProto_DataType_INT32: { if (needs_unpack) { - ORT_RETURN_IF_NOT(indices.raw_data().size() == (narrow(indices_elements) * sizeof(int32_t)), + ORT_RETURN_IF_NOT(indices.raw_data().size() == SafeInt(indices_elements) * sizeof(int32_t), "Sparse tensor: ", name, " indices raw data size does not match expected: ", indices_elements * sizeof(int32_t)); ORT_RETURN_IF_ERROR(UnpackInitializerData(indices, model_path, unpack_buffer)); @@ -2107,7 +2107,7 @@ static Status CopySparseData(const std::string& name, } case ONNX_NAMESPACE::TensorProto_DataType_INT16: { if (needs_unpack) { - ORT_RETURN_IF_NOT(indices.raw_data().size() == (narrow(indices_elements) * sizeof(int16_t)), + ORT_RETURN_IF_NOT(indices.raw_data().size() == SafeInt(indices_elements) * sizeof(int16_t), "Sparse tensor: ", name, " indices raw data size does not match expected: ", indices_elements * sizeof(int16_t)); ORT_RETURN_IF_ERROR(UnpackInitializerData(indices, model_path, unpack_buffer)); @@ -2288,14 +2288,14 @@ common::Status SparseTensorProtoToDenseTensorProto(const ONNX_NAMESPACE::SparseT // by putting the data into a std::string we can avoid a copy as set_raw_data can do a std::move // into the TensorProto. 
- std::string dense_data_storage(narrow(dense_elements) * element_size, 0); + std::string dense_data_storage(SafeInt(dense_elements) * element_size, 0); if (nnz_elements > 0) { // need to read in sparse data first as it could be in a type specific field, in raw data, or in external data std::vector values_data; ORT_RETURN_IF_ERROR(UnpackInitializerData(sparse_values, model_path, values_data)); - ORT_RETURN_IF_NOT(values_data.size() == static_cast(nnz_elements) * element_size, + ORT_RETURN_IF_NOT(values_data.size() == SafeInt(nnz_elements) * element_size, "Sparse tensor: ", name, " values data size does not match expected: ", - static_cast(nnz_elements) * element_size); + static_cast(SafeInt(nnz_elements) * element_size)); void* sparse_data = values_data.data(); void* dense_data = dense_data_storage.data(); diff --git a/onnxruntime/core/mlas/inc/mlas.h b/onnxruntime/core/mlas/inc/mlas.h index f7c2908d0ab8b..04e99d206bd06 100644 --- a/onnxruntime/core/mlas/inc/mlas.h +++ b/onnxruntime/core/mlas/inc/mlas.h @@ -60,6 +60,9 @@ Module Name: #if defined(__s390x__) #define MLAS_TARGET_S390X #endif +#if defined(__riscv) && defined(__riscv_xlen) && (__riscv_xlen == 64) +#define MLAS_TARGET_RISCV64 +#endif #if defined(__VSX__) #define MLAS_TARGET_POWER diff --git a/onnxruntime/core/mlas/lib/compute.cpp b/onnxruntime/core/mlas/lib/compute.cpp index 4916062f2b4f9..a677ee5087672 100644 --- a/onnxruntime/core/mlas/lib/compute.cpp +++ b/onnxruntime/core/mlas/lib/compute.cpp @@ -876,7 +876,7 @@ Return Value: // float Maximum; -#if defined(MLAS_TARGET_AMD64) || defined(MLAS_TARGET_LARCH64) || defined(MLAS_USE_SVE) +#if defined(MLAS_TARGET_AMD64) || defined(MLAS_TARGET_LARCH64) || defined(MLAS_USE_SVE) || defined(MLAS_TARGET_RISCV64) Maximum = GetMlasPlatform().ReduceMaximumF32Kernel(Input, D); #else Maximum = MlasReduceMaximumF32Kernel(Input, D); @@ -894,7 +894,7 @@ Return Value: float* Temp = LogSoftmax ? 
nullptr : Output; float Accumulation; -#if defined(MLAS_TARGET_AMD64) || defined(MLAS_USE_SVE) +#if defined(MLAS_TARGET_AMD64) || defined(MLAS_USE_SVE) || defined(MLAS_TARGET_RISCV64) Accumulation = GetMlasPlatform().ComputeSumExpF32Kernel(Input, Temp, D, &NegativeMaximum); #else Accumulation = MlasComputeSumExpF32Kernel(Input, Temp, D, &NegativeMaximum); @@ -910,7 +910,7 @@ Return Value: // float Parameters[] = {NegativeMaximum, std::log(Accumulation)}; -#if defined(MLAS_TARGET_AMD64) || defined(MLAS_TARGET_LARCH64) || defined(MLAS_USE_SVE) +#if defined(MLAS_TARGET_AMD64) || defined(MLAS_TARGET_LARCH64) || defined(MLAS_USE_SVE) || defined(MLAS_TARGET_RISCV64) GetMlasPlatform().ComputeLogSoftmaxOutputF32Kernel(Input, Output, D, Parameters); #else @@ -922,7 +922,7 @@ Return Value: // float Parameters[] = {1.0f / Accumulation}; -#if defined(MLAS_TARGET_AMD64) || defined(MLAS_TARGET_LARCH64) || defined(MLAS_USE_SVE) +#if defined(MLAS_TARGET_AMD64) || defined(MLAS_TARGET_LARCH64) || defined(MLAS_USE_SVE) || defined(MLAS_TARGET_RISCV64) GetMlasPlatform().ComputeSoftmaxOutputF32Kernel(Output, D, Parameters); #else MlasComputeSoftmaxOutputF32Kernel(Output, D, Parameters); diff --git a/onnxruntime/core/mlas/lib/mlasi.h b/onnxruntime/core/mlas/lib/mlasi.h index 662e757a47998..1fa4c90913b24 100644 --- a/onnxruntime/core/mlas/lib/mlasi.h +++ b/onnxruntime/core/mlas/lib/mlasi.h @@ -352,7 +352,8 @@ static_assert(sizeof(MLAS_FP16) == FP16_SIZE); // #if defined(MLAS_TARGET_AMD64_IX86) || defined(MLAS_TARGET_POWER) || \ - defined(MLAS_TARGET_LARCH64) || defined(MLAS_TARGET_S390X) + defined(MLAS_TARGET_LARCH64) || defined(MLAS_TARGET_S390X) || \ + defined(MLAS_TARGET_RISCV64) typedef size_t @@ -1018,6 +1019,36 @@ extern "C" { MLAS_REDUCE_MAXIMUM_FLOAT_KERNEL MlasReduceMaximumF32KernelLasx; MLAS_COMPUTE_SOFTMAX_OUTPUT_FLOAT_KERNEL MlasComputeSoftmaxOutputF32KernelLasx; MLAS_COMPUTE_LOGSOFTMAX_OUTPUT_FLOAT_KERNEL MlasComputeLogSoftmaxOutputF32KernelLasx; +#elif defined(MLAS_TARGET_RISCV64) +#if defined(MLAS_USE_RVV) + MLAS_GEMM_FLOAT_KERNEL MlasGemmFloatKernelRvv; + void MlasSgemmCopyPackBRvv( + float* D, + const float* B, + size_t ldb, + size_t CountX, + size_t CountY); +#endif + size_t MLASCALL MlasSgemmKernelZero( + const float* A, + const float* B, + float* C, + size_t CountK, + size_t CountM, + size_t CountN, + size_t lda, + size_t ldc, + float alpha); + size_t MLASCALL MlasSgemmKernelAdd( + const float* A, + const float* B, + float* C, + size_t CountK, + size_t CountM, + size_t CountN, + size_t lda, + size_t ldc, + float alpha); #else MLAS_GEMM_FLOAT_KERNEL MlasSgemmKernelZero; MLAS_GEMM_FLOAT_KERNEL MlasSgemmKernelAdd; @@ -1167,6 +1198,12 @@ extern "C" { MLAS_REDUCE_MAXIMUM_FLOAT_KERNEL MlasReduceMaximumF32Kernel; MLAS_REDUCE_MINIMUM_MAXIMUM_FLOAT_KERNEL MlasReduceMinimumMaximumF32Kernel; +#if defined(MLAS_TARGET_RISCV64) && defined(MLAS_USE_RVV) + MLAS_COMPUTE_SUMEXP_FLOAT_KERNEL MlasComputeSumExpF32KernelRvv; + MLAS_REDUCE_MAXIMUM_FLOAT_KERNEL MlasReduceMaximumF32KernelRvv; + MLAS_COMPUTE_SOFTMAX_OUTPUT_FLOAT_KERNEL MlasComputeSoftmaxOutputF32KernelRvv; + MLAS_COMPUTE_LOGSOFTMAX_OUTPUT_FLOAT_KERNEL MlasComputeLogSoftmaxOutputF32KernelRvv; +#endif #if defined(MLAS_TARGET_AMD64) MLAS_REDUCE_MAXIMUM_FLOAT_KERNEL MlasReduceMaximumF32KernelAvx; MLAS_REDUCE_MAXIMUM_FLOAT_KERNEL MlasReduceMaximumF32KernelAvx512F; @@ -1442,7 +1479,7 @@ struct MLAS_PLATFORM { #endif -#if defined(MLAS_TARGET_AMD64_IX86) || defined(MLAS_TARGET_POWER) || defined(MLAS_TARGET_S390X) +#if defined(MLAS_TARGET_AMD64_IX86) || 
defined(MLAS_TARGET_POWER) || defined(MLAS_TARGET_S390X) || defined(MLAS_TARGET_RISCV64) MLAS_GEMM_FLOAT_KERNEL* GemmFloatKernel; #endif #if defined(MLAS_TARGET_LARCH64) @@ -1507,7 +1544,7 @@ struct MLAS_PLATFORM { MLAS_QUANTIZE_LINEAR_U4_KERNEL* QuantizeLinearU4Kernel; #endif -#if defined(MLAS_USE_SVE) || defined(MLAS_TARGET_AMD64) +#if defined(MLAS_USE_SVE) || defined(MLAS_TARGET_AMD64) || defined(MLAS_TARGET_RISCV64) MLAS_COMPUTE_UNARY_FLOAT_KERNEL* ErfKernelRoutine; MLAS_COMPUTE_UNARY_FLOAT_KERNEL* LogisticKernelRoutine; MLAS_REDUCE_MAXIMUM_FLOAT_KERNEL* ReduceMaximumF32Kernel; diff --git a/onnxruntime/core/mlas/lib/platform.cpp b/onnxruntime/core/mlas/lib/platform.cpp index e9f140a2ee0f7..191ee1ab2f2f8 100644 --- a/onnxruntime/core/mlas/lib/platform.cpp +++ b/onnxruntime/core/mlas/lib/platform.cpp @@ -27,8 +27,10 @@ Module Name: #include "kleidiai/mlasi_kleidiai.h" #endif -#include +#include +#include #include +#include #if defined(MLAS_TARGET_POWER) #if defined(__linux__) @@ -49,6 +51,54 @@ Module Name: #include #endif +#if defined(MLAS_TARGET_RISCV64) && defined(MLAS_USE_RVV) && defined(__linux__) +#include +#include +#ifndef COMPAT_HWCAP_ISA_V +#define COMPAT_HWCAP_ISA_V (1UL << ('V' - 'A')) +#endif +#endif + +#if defined(MLAS_TARGET_RISCV64) && defined(MLAS_USE_RVV) +namespace { + +bool +MlasStringEqualsIgnoreCase( + const char* value, + const char* expected + ) +{ + while (*value != '\0' && *expected != '\0') { + const auto lhs = static_cast(*value); + const auto rhs = static_cast(*expected); + if (std::tolower(lhs) != std::tolower(rhs)) { + return false; + } + ++value; + ++expected; + } + + return *value == '\0' && *expected == '\0'; +} + +bool +MlasShouldForceScalarRiscv( + const char* value + ) +{ + if (value == nullptr || value[0] == '\0') { + return false; + } + + return MlasStringEqualsIgnoreCase(value, "1") || + MlasStringEqualsIgnoreCase(value, "true") || + MlasStringEqualsIgnoreCase(value, "on") || + MlasStringEqualsIgnoreCase(value, "yes"); +} + +} // namespace +#endif + #if defined(MLAS_TARGET_ARM64) #if defined(_WIN32) @@ -265,6 +315,33 @@ Return Value: this->CastF16ToF32Kernel = nullptr; this->CastF32ToF16Kernel = nullptr; +#if defined(MLAS_TARGET_RISCV64) + this->GemmFloatKernel = nullptr; + this->ErfKernelRoutine = MlasErfKernel; + this->LogisticKernelRoutine = MlasLogisticKernel; + this->ReduceMaximumF32Kernel = MlasReduceMaximumF32Kernel; + this->ComputeSumExpF32Kernel = MlasComputeSumExpF32Kernel; + this->ComputeSoftmaxOutputF32Kernel = MlasComputeSoftmaxOutputF32Kernel; + this->ComputeLogSoftmaxOutputF32Kernel = MlasComputeLogSoftmaxOutputF32Kernel; + +#if defined(MLAS_USE_RVV) + bool has_rvv = true; +#if defined(__linux__) + has_rvv = (getauxval(AT_HWCAP) & COMPAT_HWCAP_ISA_V) != 0; +#endif + if (MlasShouldForceScalarRiscv(std::getenv("ORT_MLAS_RISCV_FORCE_SCALAR"))) { + has_rvv = false; + } + if (has_rvv) { + this->GemmFloatKernel = MlasGemmFloatKernelRvv; + this->ReduceMaximumF32Kernel = MlasReduceMaximumF32KernelRvv; + this->ComputeSumExpF32Kernel = MlasComputeSumExpF32KernelRvv; + this->ComputeSoftmaxOutputF32Kernel = MlasComputeSoftmaxOutputF32KernelRvv; + this->ComputeLogSoftmaxOutputF32Kernel = MlasComputeLogSoftmaxOutputF32KernelRvv; + } +#endif +#endif + #if defined(MLAS_TARGET_AMD64_IX86) // diff --git a/onnxruntime/core/mlas/lib/riscv64/sgemm_kernel_rvv.cpp b/onnxruntime/core/mlas/lib/riscv64/sgemm_kernel_rvv.cpp new file mode 100644 index 0000000000000..c6e43e2c8bcd4 --- /dev/null +++ b/onnxruntime/core/mlas/lib/riscv64/sgemm_kernel_rvv.cpp @@ -0,0 
+1,275 @@ +/*++ + +Copyright (c) Microsoft Corporation. All rights reserved. + +Licensed under the MIT License. + +Module Name: + + sgemm_kernel_rvv.cpp + +Abstract: + + This module implements an RVV kernel for the single precision matrix/matrix + multiply operation (SGEMM) on riscv64. + +--*/ + +#include "mlasi.h" + +#if defined(MLAS_USE_RVV) + +#include + +namespace { + +// The packed B layout stays 16 columns wide to match MLAS, but each tile is +// consumed in runtime-sized RVV chunks so the kernel is not tied to a fixed +// VLEN such as 128 or 256 bits. +constexpr size_t kPackedCountN = 16; + +template +MLAS_FORCEINLINE +void +MlasStoreAccumulatorRvv( + float* C, + vfloat32m4_t Accumulator, + size_t vl, + float alpha + ) +{ +#if defined(_WIN32) + + if constexpr (AlphaIsOne) { + UNREFERENCED_PARAMETER(alpha); + } + +#endif + + if constexpr (!AlphaIsOne) { + Accumulator = __riscv_vfmul_vf_f32m4(Accumulator, alpha, vl); + } + + if constexpr (!ZeroMode) { + Accumulator = __riscv_vfadd_vv_f32m4(Accumulator, __riscv_vle32_v_f32m4(C, vl), vl); + } + + __riscv_vse32_v_f32m4(C, Accumulator, vl); +} + +template +MLAS_FORCEINLINE +size_t +MlasSgemmKernelRvv( + const float* A, + const float* B, + float* C, + size_t CountK, + size_t CountN, + size_t lda, + size_t ldc, + float alpha + ) +{ + static_assert(Rows >= 1 && Rows <= 4, "unsupported RVV SGEMM tile height"); + +#if defined(_WIN32) + + if constexpr (Rows == 1) { + UNREFERENCED_PARAMETER(lda); + UNREFERENCED_PARAMETER(ldc); + } + + if constexpr (AlphaIsOne) { + UNREFERENCED_PARAMETER(alpha); + } + +#endif + + const float* packed_b_block = B; + float* c_block = C; + size_t remaining_n_total = CountN; + + do { + const size_t count_n_block = remaining_n_total >= kPackedCountN ? kPackedCountN : remaining_n_total; + size_t remaining_n_block = count_n_block; + size_t column_offset = 0; + float* c = c_block; + + while (remaining_n_block > 0) { + // Split a packed 16-column tile into however many lanes the current + // machine exposes for e32,m4. This keeps the kernel VLEN-agnostic. 
+ const size_t vl = __riscv_vsetvl_e32m4(remaining_n_block); + vfloat32m4_t row0_block = __riscv_vfmv_v_f_f32m4(0.0f, vl); + vfloat32m4_t row1_block; + vfloat32m4_t row2_block; + vfloat32m4_t row3_block; + + if constexpr (Rows >= 2) { + row1_block = __riscv_vfmv_v_f_f32m4(0.0f, vl); + } + if constexpr (Rows >= 3) { + row2_block = __riscv_vfmv_v_f_f32m4(0.0f, vl); + } + if constexpr (Rows >= 4) { + row3_block = __riscv_vfmv_v_f_f32m4(0.0f, vl); + } + + const float* a = A; + const float* b = packed_b_block + column_offset; + size_t k = CountK; + + while (k >= 2) { + const float row0_a0 = a[0]; + const float row0_a1 = a[1]; + vfloat32m4_t b_elements = __riscv_vle32_v_f32m4(b, vl); + row0_block = __riscv_vfmacc_vf_f32m4(row0_block, row0_a0, b_elements, vl); + + if constexpr (Rows >= 2) { + row1_block = __riscv_vfmacc_vf_f32m4(row1_block, a[lda], b_elements, vl); + } + if constexpr (Rows >= 3) { + row2_block = __riscv_vfmacc_vf_f32m4(row2_block, a[lda * 2], b_elements, vl); + } + if constexpr (Rows >= 4) { + row3_block = __riscv_vfmacc_vf_f32m4(row3_block, a[lda * 3], b_elements, vl); + } + + b_elements = __riscv_vle32_v_f32m4(b + kPackedCountN, vl); + row0_block = __riscv_vfmacc_vf_f32m4(row0_block, row0_a1, b_elements, vl); + + if constexpr (Rows >= 2) { + row1_block = __riscv_vfmacc_vf_f32m4(row1_block, a[lda + 1], b_elements, vl); + } + if constexpr (Rows >= 3) { + row2_block = __riscv_vfmacc_vf_f32m4(row2_block, a[lda * 2 + 1], b_elements, vl); + } + if constexpr (Rows >= 4) { + row3_block = __riscv_vfmacc_vf_f32m4(row3_block, a[lda * 3 + 1], b_elements, vl); + } + + a += 2; + b += kPackedCountN * 2; + k -= 2; + } + + if (k > 0) { + vfloat32m4_t b_elements = __riscv_vle32_v_f32m4(b, vl); + row0_block = __riscv_vfmacc_vf_f32m4(row0_block, a[0], b_elements, vl); + + if constexpr (Rows >= 2) { + row1_block = __riscv_vfmacc_vf_f32m4(row1_block, a[lda], b_elements, vl); + } + if constexpr (Rows >= 3) { + row2_block = __riscv_vfmacc_vf_f32m4(row2_block, a[lda * 2], b_elements, vl); + } + if constexpr (Rows >= 4) { + row3_block = __riscv_vfmacc_vf_f32m4(row3_block, a[lda * 3], b_elements, vl); + } + } + + MlasStoreAccumulatorRvv(c, row0_block, vl, alpha); + + if constexpr (Rows >= 2) { + MlasStoreAccumulatorRvv(c + ldc, row1_block, vl, alpha); + } + if constexpr (Rows >= 3) { + MlasStoreAccumulatorRvv(c + ldc * 2, row2_block, vl, alpha); + } + if constexpr (Rows >= 4) { + MlasStoreAccumulatorRvv(c + ldc * 3, row3_block, vl, alpha); + } + + c += vl; + column_offset += vl; + remaining_n_block -= vl; + } + + c_block += count_n_block; + packed_b_block += CountK * kPackedCountN; + remaining_n_total -= count_n_block; + + } while (remaining_n_total > 0); + + return Rows; +} + +template +MLAS_FORCEINLINE +size_t +MlasGemmFloatKernelRvvDispatchRows( + const float* A, + const float* B, + float* C, + size_t CountK, + size_t CountM, + size_t CountN, + size_t lda, + size_t ldc, + float alpha + ) +{ + if (CountM >= 4) { + return MlasSgemmKernelRvv(A, B, C, CountK, CountN, lda, ldc, alpha); + } + + if (CountM == 3) { + return MlasSgemmKernelRvv(A, B, C, CountK, CountN, lda, ldc, alpha); + } + + if (CountM >= 2) { + return MlasSgemmKernelRvv(A, B, C, CountK, CountN, lda, ldc, alpha); + } + + return MlasSgemmKernelRvv(A, B, C, CountK, CountN, lda, ldc, alpha); +} + +template +MLAS_FORCEINLINE +size_t +MlasGemmFloatKernelRvvDispatch( + const float* A, + const float* B, + float* C, + size_t CountK, + size_t CountM, + size_t CountN, + size_t lda, + size_t ldc, + float alpha + ) +{ + if (alpha == 1.0f) { + return 
MlasGemmFloatKernelRvvDispatchRows( + A, B, C, CountK, CountM, CountN, lda, ldc, alpha); + } + + return MlasGemmFloatKernelRvvDispatchRows( + A, B, C, CountK, CountM, CountN, lda, ldc, alpha); +} + +} // namespace + +size_t +MLASCALL +MlasGemmFloatKernelRvv( + const float* A, + const float* B, + float* C, + size_t CountK, + size_t CountM, + size_t CountN, + size_t lda, + size_t ldc, + float alpha, + bool ZeroMode + ) +{ + if (ZeroMode) { + return MlasGemmFloatKernelRvvDispatch(A, B, C, CountK, CountM, CountN, lda, ldc, alpha); + } + + return MlasGemmFloatKernelRvvDispatch(A, B, C, CountK, CountM, CountN, lda, ldc, alpha); +} + +#endif // defined(MLAS_USE_RVV) diff --git a/onnxruntime/core/mlas/lib/riscv64/sgemm_pack_b_rvv.cpp b/onnxruntime/core/mlas/lib/riscv64/sgemm_pack_b_rvv.cpp new file mode 100644 index 0000000000000..b2ec24e3fbfdc --- /dev/null +++ b/onnxruntime/core/mlas/lib/riscv64/sgemm_pack_b_rvv.cpp @@ -0,0 +1,115 @@ +/*++ + +Copyright (c) Microsoft Corporation. All rights reserved. + +Licensed under the MIT License. + +Module Name: + + sgemm_pack_b_rvv.cpp + +Abstract: + + This module implements an RVV packing helper for the single precision + matrix/matrix multiply operation (SGEMM) on riscv64. + +--*/ + +#include "mlasi.h" + +#if defined(MLAS_USE_RVV) + +#include + +namespace { + +// Keep MLAS packing in 16-column tiles, but let RVV decide the actual chunk +// size at runtime via vsetvl so the same code works across different VLENs. +constexpr size_t kPackedCountN = 16; + +MLAS_FORCEINLINE +void +MlasStoreZeroPaddedBlock( + float* D, + const float* B, + size_t CountX + ) +{ + size_t remaining = kPackedCountN; + size_t offset = 0; + + while (remaining > 0) { + const size_t vl = __riscv_vsetvl_e32m4(remaining); + __riscv_vse32_v_f32m4(D + offset, __riscv_vfmv_v_f_f32m4(0.0f, vl), vl); + offset += vl; + remaining -= vl; + } + + remaining = CountX; + offset = 0; + + while (remaining > 0) { + const size_t vl = __riscv_vsetvl_e32m4(remaining); + __riscv_vse32_v_f32m4(D + offset, __riscv_vle32_v_f32m4(B + offset, vl), vl); + offset += vl; + remaining -= vl; + } +} + +MLAS_FORCEINLINE +void +MlasStoreFullBlock( + float* D, + const float* B + ) +{ + size_t remaining = kPackedCountN; + size_t offset = 0; + + while (remaining > 0) { + const size_t vl = __riscv_vsetvl_e32m4(remaining); + __riscv_vse32_v_f32m4(D + offset, __riscv_vle32_v_f32m4(B + offset, vl), vl); + offset += vl; + remaining -= vl; + } +} + +} // namespace + +void +MlasSgemmCopyPackBRvv( + float* D, + const float* B, + size_t ldb, + size_t CountX, + size_t CountY + ) +{ + while (CountX >= kPackedCountN) { + const float* b = B; + size_t y = CountY; + + do { + MlasStoreFullBlock(D, b); + D += kPackedCountN; + b += ldb; + y--; + } while (y > 0); + + B += kPackedCountN; + CountX -= kPackedCountN; + } + + if (CountX > 0) { + size_t y = CountY; + + do { + MlasStoreZeroPaddedBlock(D, B, CountX); + D += kPackedCountN; + B += ldb; + y--; + } while (y > 0); + } +} + +#endif // defined(MLAS_USE_RVV) diff --git a/onnxruntime/core/mlas/lib/riscv64/softmax_kernel_rvv.cpp b/onnxruntime/core/mlas/lib/riscv64/softmax_kernel_rvv.cpp new file mode 100644 index 0000000000000..dc548b56d676e --- /dev/null +++ b/onnxruntime/core/mlas/lib/riscv64/softmax_kernel_rvv.cpp @@ -0,0 +1,207 @@ +/*++ + +Copyright (c) Microsoft Corporation. All rights reserved. + +Licensed under the MIT License. + +Module Name: + + softmax_kernel_rvv.cpp + +Abstract: + + This module implements RVV kernels for the softmax critical path on + riscv64. 
The implementation keeps the scope intentionally small and + focuses on the float32 primitives used by Softmax and LogSoftmax: + reduction, sum-exp, normalization, and log-softmax output. + +--*/ + +#include "mlasi.h" + +#if defined(MLAS_USE_RVV) + +#include + +namespace { + +constexpr float kExpLowerRangeSumExp = -88.3762626647949f; +constexpr float kRoundingBias = MLAS_ROUNDING_BIAS_MAGIC; +constexpr float kLog2Reciprocal = 1.44269504088896341f; +constexpr float kLog2High = -6.93145752e-1f; +constexpr float kLog2Low = -1.42860677e-6f; +constexpr float kPoly0 = 0x1.694000p-10f; +constexpr float kPoly1 = 0x1.125edcp-7f; +constexpr float kPoly2 = 0x1.555b5ap-5f; +constexpr float kPoly3 = 0x1.555450p-3f; +constexpr float kPoly4 = 0x1.fffff6p-2f; +constexpr float kPoly56 = 0x1.000000p+0f; +constexpr int32_t kMaximumExponentBits = 0x3F800000; + +MLAS_FORCEINLINE +vfloat32m1_t +MlasComputeExpVectorRvv( + vfloat32m1_t value, + size_t vl + ) +{ + value = __riscv_vfmax_vf_f32m1(value, kExpLowerRangeSumExp, vl); + + vfloat32m1_t scaled = __riscv_vfmul_vf_f32m1(value, kLog2Reciprocal, vl); + vfloat32m1_t biased = __riscv_vfadd_vf_f32m1(scaled, kRoundingBias, vl); + vfloat32m1_t reduced_m = __riscv_vfsub_vf_f32m1(biased, kRoundingBias, vl); + vfloat32m1_t reduced = __riscv_vfadd_vv_f32m1( + __riscv_vfmul_vf_f32m1(reduced_m, kLog2High, vl), value, vl); + reduced = __riscv_vfadd_vv_f32m1( + __riscv_vfmul_vf_f32m1(reduced_m, kLog2Low, vl), reduced, vl); + + vfloat32m1_t poly = __riscv_vfmv_v_f_f32m1(kPoly0, vl); + poly = __riscv_vfadd_vf_f32m1( + __riscv_vfmul_vv_f32m1(poly, reduced, vl), kPoly1, vl); + poly = __riscv_vfadd_vf_f32m1( + __riscv_vfmul_vv_f32m1(poly, reduced, vl), kPoly2, vl); + poly = __riscv_vfadd_vf_f32m1( + __riscv_vfmul_vv_f32m1(poly, reduced, vl), kPoly3, vl); + poly = __riscv_vfadd_vf_f32m1( + __riscv_vfmul_vv_f32m1(poly, reduced, vl), kPoly4, vl); + poly = __riscv_vfadd_vf_f32m1( + __riscv_vfmul_vv_f32m1(poly, reduced, vl), kPoly56, vl); + poly = __riscv_vfadd_vf_f32m1( + __riscv_vfmul_vv_f32m1(poly, reduced, vl), kPoly56, vl); + + vint32m1_t exponent_bits = __riscv_vreinterpret_v_f32m1_i32m1(biased); + exponent_bits = __riscv_vsll_vx_i32m1(exponent_bits, 23, vl); + exponent_bits = __riscv_vadd_vx_i32m1(exponent_bits, kMaximumExponentBits, vl); + vfloat32m1_t scale = __riscv_vreinterpret_v_i32m1_f32m1(exponent_bits); + + return __riscv_vfmul_vv_f32m1(poly, scale, vl); +} + +MLAS_FORCEINLINE +float +MlasReduceSumRvv( + vfloat32m1_t value, + size_t vl + ) +{ + vfloat32m1_t accumulator = __riscv_vfmv_s_f_f32m1(0.0f, 1); + accumulator = __riscv_vfredusum_vs_f32m1_f32m1(value, accumulator, vl); + return __riscv_vfmv_f_s_f32m1_f32(accumulator); +} + +MLAS_FORCEINLINE +float +MlasReduceMaxRvv( + vfloat32m1_t value, + size_t vl + ) +{ + vfloat32m1_t accumulator = + __riscv_vfmv_s_f_f32m1(std::numeric_limits::lowest(), 1); + accumulator = __riscv_vfredmax_vs_f32m1_f32m1(value, accumulator, vl); + return __riscv_vfmv_f_s_f32m1_f32(accumulator); +} + +} // namespace + +float +MLASCALL +MlasReduceMaximumF32KernelRvv( + const float* Input, + size_t N + ) +{ + float maximum = std::numeric_limits::lowest(); + + while (N > 0) { + size_t vl = __riscv_vsetvl_e32m1(N); + vfloat32m1_t input = __riscv_vle32_v_f32m1(Input, vl); + input = __riscv_vfmax_vf_f32m1(input, maximum, vl); + maximum = MlasReduceMaxRvv(input, vl); + + Input += vl; + N -= vl; + } + + return maximum; +} + +float +MLASCALL +MlasComputeSumExpF32KernelRvv( + const float* Input, + float* Output, + size_t N, + const float* 
NegativeMaximum + ) +{ + const float negative_maximum = *NegativeMaximum; + float accumulation = 0.0f; + + while (N > 0) { + size_t vl = __riscv_vsetvl_e32m1(N); + vfloat32m1_t input = __riscv_vle32_v_f32m1(Input, vl); + vfloat32m1_t shifted = __riscv_vfadd_vf_f32m1(input, negative_maximum, vl); + vfloat32m1_t exp_value = MlasComputeExpVectorRvv(shifted, vl); + + if (Output != nullptr) { + __riscv_vse32_v_f32m1(Output, exp_value, vl); + Output += vl; + } + + accumulation += MlasReduceSumRvv(exp_value, vl); + + Input += vl; + N -= vl; + } + + return accumulation; +} + +void +MLASCALL +MlasComputeSoftmaxOutputF32KernelRvv( + float* Output, + size_t N, + const float* Parameters + ) +{ + const float scale = Parameters[0]; + + while (N > 0) { + size_t vl = __riscv_vsetvl_e32m1(N); + vfloat32m1_t output = __riscv_vle32_v_f32m1(Output, vl); + output = __riscv_vfmul_vf_f32m1(output, scale, vl); + __riscv_vse32_v_f32m1(Output, output, vl); + + Output += vl; + N -= vl; + } +} + +void +MLASCALL +MlasComputeLogSoftmaxOutputF32KernelRvv( + const float* Input, + float* Output, + size_t N, + const float* Parameters + ) +{ + const float negative_maximum = Parameters[0]; + const float logarithm = Parameters[1]; + + while (N > 0) { + size_t vl = __riscv_vsetvl_e32m1(N); + vfloat32m1_t input = __riscv_vle32_v_f32m1(Input, vl); + input = __riscv_vfadd_vf_f32m1(input, negative_maximum, vl); + input = __riscv_vfsub_vf_f32m1(input, logarithm, vl); + __riscv_vse32_v_f32m1(Output, input, vl); + + Input += vl; + Output += vl; + N -= vl; + } +} + +#endif // defined(MLAS_USE_RVV) diff --git a/onnxruntime/core/mlas/lib/sgemm.cpp b/onnxruntime/core/mlas/lib/sgemm.cpp index 7836b1f89b0c4..88d0308bfa21e 100644 --- a/onnxruntime/core/mlas/lib/sgemm.cpp +++ b/onnxruntime/core/mlas/lib/sgemm.cpp @@ -247,6 +247,13 @@ Return Value: --*/ { +#if defined(MLAS_TARGET_RISCV64) && defined(MLAS_USE_RVV) && !defined(FORCE_GENERIC_ALGORITHMS) + if (GetMlasPlatform().GemmFloatKernel != nullptr) { + MlasSgemmCopyPackBRvv(D, B, ldb, CountX, CountY); + return; + } +#endif + // // Copy data from matrix B into the destination buffer 16 columns at a // time. @@ -1004,6 +1011,14 @@ Return Value: #if (defined(MLAS_TARGET_AMD64_IX86) || defined(MLAS_TARGET_POWER) || defined(MLAS_TARGET_S390X) || defined(MLAS_TARGET_LARCH64)) && !defined(FORCE_GENERIC_ALGORITHMS) RowsHandled = GetMlasPlatform().GemmFloatKernel(A, B, C, CountK, CountM, CountN, lda, ldc, alpha, ZeroMode); +#elif defined(MLAS_TARGET_RISCV64) && !defined(FORCE_GENERIC_ALGORITHMS) + if (GetMlasPlatform().GemmFloatKernel != nullptr) { + RowsHandled = GetMlasPlatform().GemmFloatKernel(A, B, C, CountK, CountM, CountN, lda, ldc, alpha, ZeroMode); + } else if (ZeroMode) { + RowsHandled = MlasSgemmKernelZero(A, B, C, CountK, CountM, CountN, lda, ldc, alpha); + } else { + RowsHandled = MlasSgemmKernelAdd(A, B, C, CountK, CountM, CountN, lda, ldc, alpha); + } #else if (ZeroMode) { RowsHandled = MlasSgemmKernelZero(A, B, C, CountK, CountM, CountN, lda, ldc, alpha); diff --git a/onnxruntime/core/platform/posix/env.cc b/onnxruntime/core/platform/posix/env.cc index aeddef0c5188f..28d6332f6282c 100644 --- a/onnxruntime/core/platform/posix/env.cc +++ b/onnxruntime/core/platform/posix/env.cc @@ -618,7 +618,17 @@ class PosixEnv : public Env { PosixEnv() { cpuinfo_available_ = cpuinfo_initialize(); if (!cpuinfo_available_) { - LOGS_DEFAULT(INFO) << "cpuinfo_initialize failed"; + // PosixEnv may be constructed before the logging system is initialized + // (e.g. 
via a static Env::Default() reference in the Python bindings). + // Using LOGS_DEFAULT here would crash with "Attempt to use DefaultLogger + // but none has been registered". Fall back to stderr when no logger exists. + if (logging::LoggingManager::HasDefaultLogger()) { + LOGS_DEFAULT(WARNING) << "cpuinfo_initialize failed. " + "May cause CPU EP performance degradation due to undetected CPU features."; + } else { + std::cerr << "onnxruntime warning: cpuinfo_initialize failed. " + "May cause CPU EP performance degradation due to undetected CPU features.\n"; + } } } bool cpuinfo_available_{false}; diff --git a/onnxruntime/core/providers/coreml/builders/impl/quick_gelu_op_builder.cc b/onnxruntime/core/providers/coreml/builders/impl/quick_gelu_op_builder.cc new file mode 100644 index 0000000000000..2aa5d82d3f198 --- /dev/null +++ b/onnxruntime/core/providers/coreml/builders/impl/quick_gelu_op_builder.cc @@ -0,0 +1,128 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#include + +#include "core/providers/common.h" +#include "core/providers/coreml/builders/helper.h" +#include "core/providers/coreml/builders/impl/base_op_builder.h" +#include "core/providers/coreml/builders/impl/builder_utils.h" +#include "core/providers/coreml/builders/model_builder.h" +#include "core/providers/coreml/builders/op_builder_factory.h" +#include "core/providers/coreml/shape_utils.h" +#include "core/providers/shared/utils/utils.h" + +namespace onnxruntime { +namespace coreml { + +// com.microsoft:QuickGelu is produced by ORT's QuickGeluFusion pass +// (onnxruntime/core/optimizer/quick_gelu_fusion.cc) at optimization level +// ORT_ENABLE_EXTENDED and above. The schema in contrib_defs.cc defines it as +// Y = X * Sigmoid(alpha * X) default alpha = 1.702 +// CoreML has no native equivalent, so we decompose to three MIL ops — all +// primitives are already CoreML-supported. Same approach the QNN EP uses +// in qnn/builder/opbuilder/quick_gelu_op_builder.cc. +class QuickGeluOpBuilder : public BaseOpBuilder { + Status AddToModelBuilderImpl(ModelBuilder& model_builder, const Node& node, + const logging::Logger& logger) const override; + + bool IsOpSupportedImpl(const Node& node, const OpBuilderInputParams& input_params, + const logging::Logger& logger) const override; + + bool SupportsMLProgram() const override { return true; } +}; + +Status QuickGeluOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, + const Node& node, + const logging::Logger& logger) const { + // IsOpSupportedImpl gates this, but fail fast rather than silently produce an + // invalid model if the path is ever reached without MLProgram. + ORT_RETURN_IF_NOT(model_builder.CreateMLProgram(), + "QuickGelu is only supported by the CoreML EP in MLProgram format"); + + NodeAttrHelper helper(node); + const float alpha = helper.Get("alpha", 1.702f); + + const auto input_dtype = node.InputDefs()[0]->TypeAsProto()->tensor_type().elem_type(); + const int32_t elem_type = static_cast(input_dtype); + const std::string& x_name = node.InputDefs()[0]->Name(); + + std::vector x_shape; + ORT_RETURN_IF_NOT(GetShape(*node.InputDefs()[0], x_shape, logger), "Failed to get QuickGelu input shape"); + + { + using namespace CoreML::Specification::MILSpec; + + // When alpha ≈ 1.0 (e.g. CLIP's approximate GELU, `x * sigmoid(x)`), skip + // the leading mul and feed x straight into sigmoid. Saves one op and + // avoids the rounding it would introduce. 
Mirrors QNN's builder at + // qnn/builder/opbuilder/quick_gelu_op_builder.cc:42-49. + constexpr float kAlphaEpsilon = 1e-6f; + const bool skip_alpha_mul = std::abs(alpha - 1.0f) < kAlphaEpsilon; + + std::string sigmoid_input_name = x_name; + std::unique_ptr mul_alpha; + if (!skip_alpha_mul) { + // alpha_x = mul(x, alpha) + mul_alpha = model_builder.CreateOperation(node, "mul", "alpha"); + AddOperationInput(*mul_alpha, "x", x_name); + if (input_dtype == ONNX_NAMESPACE::TensorProto_DataType_FLOAT) { + AddOperationInput(*mul_alpha, "y", model_builder.AddScalarConstant(mul_alpha->type(), "alpha", alpha)); + } else { + AddOperationInput(*mul_alpha, "y", + model_builder.AddScalarConstant(mul_alpha->type(), "alpha", MLFloat16(alpha))); + } + sigmoid_input_name = model_builder.GetUniqueName(node, "quick_gelu_alpha_x"); + AddIntermediateOperationOutput(*mul_alpha, sigmoid_input_name, elem_type, x_shape); + } + + // sig = sigmoid(sigmoid_input) + auto sig = model_builder.CreateOperation(node, "sigmoid"); + AddOperationInput(*sig, "x", sigmoid_input_name); + const std::string& sig_name = model_builder.GetUniqueName(node, "quick_gelu_sigmoid"); + AddIntermediateOperationOutput(*sig, sig_name, elem_type, x_shape); + + // y = mul(x, sig) + auto mul_final = model_builder.CreateOperation(node, "mul", "final"); + AddOperationInput(*mul_final, "x", x_name); + AddOperationInput(*mul_final, "y", sig_name); + AddOperationOutput(*mul_final, *node.OutputDefs()[0]); + + if (mul_alpha) { + model_builder.AddOperation(std::move(mul_alpha)); + } + model_builder.AddOperation(std::move(sig)); + model_builder.AddOperation(std::move(mul_final)); + } + + return Status::OK(); +} + +bool QuickGeluOpBuilder::IsOpSupportedImpl(const Node& node, const OpBuilderInputParams& input_params, + const logging::Logger& logger) const { + // Only the MLProgram path is implemented. NeuralNetwork format is deprecated + // on Apple Silicon and not worth carrying a second implementation for. + if (!input_params.create_mlprogram) { + LOGS(logger, VERBOSE) << "QuickGelu: only MLProgram format is supported by the CoreML EP"; + return false; + } + + // AddToModelBuilderImpl requires the input shape to size intermediate MIL + // outputs, so check here and fall back to CPU if shape inference was + // incomplete — don't claim the node and then fail at model-build time. 
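As a sanity check on the three-op decomposition described above, a small scalar reference of QuickGelu (illustrative only, not ORT or CoreML code), including the alpha ≈ 1 shortcut that skips the leading multiply:

```cpp
#include <cmath>

// Scalar reference for Y = X * Sigmoid(alpha * X); 1.702 is the schema default.
// kAlphaEpsilon mirrors the tolerance used above to decide whether the leading
// multiply can be skipped.
float QuickGeluReference(float x, float alpha = 1.702f) {
  constexpr float kAlphaEpsilon = 1e-6f;
  const float sigmoid_input = (std::abs(alpha - 1.0f) < kAlphaEpsilon) ? x : alpha * x;
  const float sigmoid = 1.0f / (1.0f + std::exp(-sigmoid_input));
  return x * sigmoid;
}
```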
+ std::vector x_shape; + if (!GetShape(*node.InputDefs()[0], x_shape, logger)) { + LOGS(logger, VERBOSE) << "QuickGelu: failed to get input shape"; + return false; + } + + return true; +} + +void CreateQuickGeluOpBuilder(const std::string& op_type, OpBuilderRegistrations& op_registrations) { + op_registrations.builders.push_back(std::make_unique()); + op_registrations.op_builder_map.emplace(op_type, op_registrations.builders.back().get()); +} + +} // namespace coreml +} // namespace onnxruntime diff --git a/onnxruntime/core/providers/coreml/builders/op_builder_factory.cc b/onnxruntime/core/providers/coreml/builders/op_builder_factory.cc index cc301aceae466..d4f14273eeef5 100644 --- a/onnxruntime/core/providers/coreml/builders/op_builder_factory.cc +++ b/onnxruntime/core/providers/coreml/builders/op_builder_factory.cc @@ -26,6 +26,9 @@ static OpBuilderRegistrations CreateOpBuilderRegistrations() { CreateActivationOpBuilder("Elu", op_registrations); CreateActivationOpBuilder("HardSigmoid", op_registrations); + // Microsoft-domain ops produced by ORT's own optimizer passes + CreateQuickGeluOpBuilder("QuickGelu", op_registrations); + // Unary ops CreateUnaryOpBuilder("Erf", op_registrations); CreateUnaryOpBuilder("Reciprocal", op_registrations); diff --git a/onnxruntime/core/providers/coreml/builders/op_builder_factory.h b/onnxruntime/core/providers/coreml/builders/op_builder_factory.h index 9b51b53d73e9e..f6304848274de 100644 --- a/onnxruntime/core/providers/coreml/builders/op_builder_factory.h +++ b/onnxruntime/core/providers/coreml/builders/op_builder_factory.h @@ -44,6 +44,7 @@ void CreateSplitOpBuilder(const std::string& op_type, OpBuilderRegistrations& op void CreateSqueezeOpBuilder(const std::string& op_type, OpBuilderRegistrations& op_registrations); void CreateTransposeOpBuilder(const std::string& op_type, OpBuilderRegistrations& op_registrations); void CreateUnaryOpBuilder(const std::string& op_type, OpBuilderRegistrations& op_registrations); +void CreateQuickGeluOpBuilder(const std::string& op_type, OpBuilderRegistrations& op_registrations); } // namespace coreml } // namespace onnxruntime diff --git a/onnxruntime/core/providers/cpu/ml/linearclassifier.cc b/onnxruntime/core/providers/cpu/ml/linearclassifier.cc index 45c0a2fadc2ba..1a35c24c69676 100644 --- a/onnxruntime/core/providers/cpu/ml/linearclassifier.cc +++ b/onnxruntime/core/providers/cpu/ml/linearclassifier.cc @@ -39,9 +39,7 @@ LinearClassifier::LinearClassifier(const OpKernelInfo& info) class_count_ = static_cast(intercepts_.size()); ORT_ENFORCE(class_count_ > 0, "LinearClassifier: intercepts must not be empty."); - ORT_ENFORCE(coefficients_.size() % static_cast(class_count_) == 0, - "LinearClassifier: coefficients size (", coefficients_.size(), - ") must be a multiple of the number of classes (", class_count_, ")."); + ORT_ENFORCE(!coefficients_.empty(), "LinearClassifier: coefficients must not be empty."); SetupMlasBackendKernelSelectorFromConfigOptions(mlas_backend_kernel_selector_config_, info.GetConfigOptions()); } @@ -156,12 +154,12 @@ Status LinearClassifier::Compute(OpKernelContext* ctx) const { if (!SafeMultiply(static_cast(class_count_), static_cast(num_features), expected_coefficients_size)) { return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, - "class_count (", class_count_, ") * num_features (", num_features, - ") overflows size_t"); + "LinearClassifier: class_count (", class_count_, + ") * num_features (", num_features, ") overflows size_t"); } ORT_RETURN_IF_NOT(coefficients_.size() >= 
expected_coefficients_size, - "coefficients size (", coefficients_.size(), ") is less than class_count (", class_count_, - ") * num_features (", num_features, ")"); + "LinearClassifier: coefficients size (", coefficients_.size(), + ") is less than class_count (", class_count_, ") * num_features (", num_features, ")."); Tensor* Y = ctx->Output(0, {num_batches}); diff --git a/onnxruntime/core/providers/cpu/ml/svmclassifier.cc b/onnxruntime/core/providers/cpu/ml/svmclassifier.cc index 65bcad2be8a24..9d9808ef248f6 100644 --- a/onnxruntime/core/providers/cpu/ml/svmclassifier.cc +++ b/onnxruntime/core/providers/cpu/ml/svmclassifier.cc @@ -46,36 +46,44 @@ SVMClassifier::SVMClassifier(const OpKernelInfo& info) feature_count_ = 0; class_count_ = 0; for (size_t i = 0; i < vectors_per_class_.size(); i++) { + ORT_ENFORCE(vectors_per_class_[i] >= 0, + "vectors_per_class[", i, "] must be non-negative. Got ", vectors_per_class_[i]); starting_vector_.push_back(vector_count_); - vector_count_ += narrow(vectors_per_class_[i]); + vector_count_ += onnxruntime::narrow(vectors_per_class_[i]); } + ORT_ENFORCE(classlabels_strings_.size() > 0 || classlabels_ints_.size() > 0, "One of classlabels_strings, classlabels_ints is required."); + using_strings_ = false; if (classlabels_strings_.size() > 0) { using_strings_ = true; class_count_ = classlabels_strings_.size(); - } else if (classlabels_ints_.size() > 0) { - class_count_ = classlabels_ints_.size(); } else { - class_count_ = 1; + class_count_ = classlabels_ints_.size(); } + ORT_ENFORCE(class_count_ < 65536, "The number of classes ", class_count_, " is beyond what this kernel supports (65535)."); + ORT_ENFORCE(proba_.size() == probb_.size(), "proba and probb must have the same size."); + ORT_ENFORCE(coefficients_.size() > 0, "coefficients are empty."); + if (vector_count_ > 0) { feature_count_ = support_vectors_.size() / vector_count_; // length of each support vector mode_ = SVM_TYPE::SVM_SVC; + ORT_ENFORCE(vectors_per_class_.size() == class_count_, "Mismatch between classlabels_ints/classlabels_strings and vectors_per_class dimensions."); } else { feature_count_ = coefficients_.size() / class_count_; // liblinear mode mode_ = SVM_TYPE::SVM_LINEAR; set_kernel_type(KERNEL::LINEAR); } - ORT_ENFORCE(classlabels_strings_.size() > 0 || classlabels_ints_.size() > 0); - ORT_ENFORCE(proba_.size() == probb_.size()); - ORT_ENFORCE(coefficients_.size() > 0); - // Validate attribute array sizes against the declared dimensions to prevent // out-of-bounds reads from crafted models. 
if (mode_ == SVM_TYPE::SVM_SVC) { + ORT_ENFORCE(vectors_per_class_.size() == static_cast(class_count_), + "vectors_per_class attribute size (", vectors_per_class_.size(), + ") must match class_count (", class_count_, ")."); + ORT_ENFORCE(vector_count_ > 0, "vector_count must be greater than 0 in SVC mode."); + // SVC mode: coefficients layout is [class_count - 1, vector_count] size_t expected_coefficients = 0; if (!SafeMultiply(static_cast(class_count_ - 1), static_cast(vector_count_), @@ -87,6 +95,9 @@ SVMClassifier::SVMClassifier(const OpKernelInfo& info) "coefficients attribute size (", coefficients_.size(), ") is smaller than expected (", expected_coefficients, ") for the given class_count and vector_count."); + ORT_ENFORCE(support_vectors_.size() % static_cast(vector_count_) == 0, + "support_vectors attribute size (", support_vectors_.size(), + ") must be divisible by vector_count (", vector_count_, ")."); // rho needs one entry per classifier pair: class_count * (class_count - 1) / 2 size_t num_classifiers = 0; @@ -121,7 +132,7 @@ SVMClassifier::SVMClassifier(const OpKernelInfo& info) } template -static void ChooseClass(Tensor& output, const int64_t output_idx, float max_weight, const int64_t maxclass, +static void ChooseClass(Tensor& output, const int64_t output_idx, float max_weight, const size_t maxclass, bool have_proba, bool weights_are_all_positive, const std::vector& classlabels, const LabelType& posclass, const LabelType& negclass) { @@ -134,9 +145,9 @@ static void ChooseClass(Tensor& output, const int64_t output_idx, float max_weig else if (max_weight > 0 && !weights_are_all_positive) output_data = classlabels[1]; else - output_data = classlabels[onnxruntime::narrow(maxclass)]; + output_data = classlabels[maxclass]; } else { - output_data = classlabels[onnxruntime::narrow(maxclass)]; + output_data = classlabels[maxclass]; } } else if (max_weight > 0) { output_data = posclass; @@ -209,7 +220,7 @@ Status SVMClassifier::ComputeImpl(OpKernelContext& ctx, const ptrdiff_t num_batches = SafeInt(input_rank == 1 ? 1 : x_shape[0]); const ptrdiff_t num_features = input_rank == 1 ? narrow(x_shape[0]) : narrow(x_shape[1]); - ORT_RETURN_IF_NOT(num_features == feature_count_ && num_features >= 0 && num_batches >= 0, + ORT_RETURN_IF_NOT(num_features == static_cast(feature_count_) && num_features >= 0 && num_batches >= 0, "Invalid input for SVMClassifier: expected feature_count=", feature_count_, ", actual num_features=", num_features, ", input_rank=", input_rank, @@ -241,11 +252,11 @@ Status SVMClassifier::ComputeImpl(OpKernelContext& ctx, // Total number of classifiers comparing pairs between the classes // e.g. if you have A, B C and D classes, the number of classifiers to compare between each pair is 6 // with AB, AC, AD, BC, BD and CD - const int64_t num_classifiers = class_count_ * (class_count_ - 1) / 2; // == (class_count_-1)! - const int64_t class_count_squared = class_count_ * class_count_; + const size_t num_classifiers = class_count_ * (class_count_ - 1) / 2; // == (class_count_-1)! 
+ const size_t class_count_squared = class_count_ * class_count_; const bool have_proba = proba_.size() > 0; - int64_t final_scores_per_batch = class_count_; + size_t final_scores_per_batch = class_count_; if (mode_ == SVM_TYPE::SVM_SVC && !have_proba) { if (class_count_ > 2) final_scores_per_batch = num_classifiers; @@ -261,7 +272,7 @@ Status SVMClassifier::ComputeImpl(OpKernelContext& ctx, // both outputs are required so can't be nullptr Tensor& Y = *ctx.Output(0, {num_batches}); - Tensor& Z = *ctx.Output(1, {num_batches, final_scores_per_batch}); + Tensor& Z = *ctx.Output(1, {num_batches, static_cast(final_scores_per_batch)}); auto final_scores = Z.MutableDataAsSpan(); @@ -276,7 +287,7 @@ Status SVMClassifier::ComputeImpl(OpKernelContext& ctx, } int write_additional_scores = -1; - int64_t num_scores_per_batch = class_count_; + size_t num_scores_per_batch = class_count_; if (mode_ == SVM_TYPE::SVM_SVC && !have_proba) { num_scores_per_batch = num_classifiers; @@ -346,39 +357,39 @@ Status SVMClassifier::ComputeImpl(OpKernelContext& ctx, // e.g. AB combines with BA. // If A has 3 support vectors and B has 2, there's a 3x2 block for AB and a 2x3 block for BA to combine - auto cur_kernels = kernels_span.subspan(n * SafeInt(vector_count_), onnxruntime::narrow(vector_count_)); - auto cur_scores = classifier_scores.subspan(n * SafeInt(num_slots_per_iteration), onnxruntime::narrow(num_classifiers)); - auto cur_votes = votes_span.subspan(n * SafeInt(class_count_), onnxruntime::narrow(class_count_)); + auto cur_kernels = kernels_span.subspan(n * SafeInt(vector_count_), vector_count_); + auto cur_scores = classifier_scores.subspan(n * SafeInt(num_slots_per_iteration), num_classifiers); + auto cur_votes = votes_span.subspan(n * SafeInt(class_count_), class_count_); auto scores_iter = cur_scores.begin(); size_t classifier_idx = 0; - for (int64_t i = 0; i < class_count_ - 1; i++) { - int64_t start_index_i = starting_vector_[onnxruntime::narrow(i)]; // start of support vectors for class i - int64_t class_i_support_count = vectors_per_class_[onnxruntime::narrow(i)]; - int64_t i_coeff_row_offset = vector_count_ * i; + for (size_t i = 0; i < class_count_ - 1; i++) { + size_t start_index_i = starting_vector_[i]; // start of support vectors for class i + size_t class_i_support_count = onnxruntime::narrow(vectors_per_class_[i]); + size_t i_coeff_row_offset = vector_count_ * i; - for (int64_t j = i + 1; j < class_count_; j++) { - int64_t start_index_j = starting_vector_[onnxruntime::narrow(j)]; // start of support vectors for class j - int64_t class_j_support_count = vectors_per_class_[onnxruntime::narrow(j)]; - int64_t j_coeff_row_offset = vector_count_ * (j - 1); + for (size_t j = i + 1; j < class_count_; j++) { + size_t start_index_j = starting_vector_[j]; // start of support vectors for class j + size_t class_j_support_count = onnxruntime::narrow(vectors_per_class_[j]); + size_t j_coeff_row_offset = vector_count_ * (j - 1); double sum = 0; - const float* val1 = &(coefficients_[j_coeff_row_offset + SafeInt(start_index_i)]); - const float* val2 = &(cur_kernels[onnxruntime::narrow(start_index_i)]); - for (int64_t m = 0; m < class_i_support_count; ++m, ++val1, ++val2) + const float* val1 = coefficients_.data() + (j_coeff_row_offset + start_index_i); + const float* val2 = cur_kernels.data() + start_index_i; + for (size_t m = 0; m < class_i_support_count; ++m, ++val1, ++val2) sum += *val1 * *val2; - val1 = &(coefficients_[i_coeff_row_offset + SafeInt(start_index_j)]); - val2 = 
&(cur_kernels[onnxruntime::narrow(start_index_j)]); + val1 = coefficients_.data() + (i_coeff_row_offset + start_index_j); + val2 = cur_kernels.data() + start_index_j; - for (int64_t m = 0; m < class_j_support_count; ++m, ++val1, ++val2) + for (size_t m = 0; m < class_j_support_count; ++m, ++val1, ++val2) sum += *val1 * *val2; sum += rho_[classifier_idx++]; *scores_iter++ = static_cast(sum); - ++(cur_votes[onnxruntime::narrow(sum > 0 ? i : j)]); + ++(cur_votes[sum > 0 ? i : j]); } } } @@ -389,23 +400,23 @@ Status SVMClassifier::ComputeImpl(OpKernelContext& ctx, &classifier_scores_data, num_classifiers, &votes_data, &Y, num_scores_per_batch, write_additional_scores](ptrdiff_t idx) { int n = SafeInt(idx); // convert to a usable sized type - auto cur_scores = final_scores.subspan(n * SafeInt(final_scores_per_batch), onnxruntime::narrow(final_scores_per_batch)); + auto cur_scores = final_scores.subspan(n * SafeInt(final_scores_per_batch), final_scores_per_batch); if (mode_ == SVM_TYPE::SVM_SVC && have_proba) { - auto probsp2 = gsl::make_span(probsp2_data.data() + (n * class_count_squared), onnxruntime::narrow(class_count_squared)); + auto probsp2 = gsl::make_span(probsp2_data.data() + (n * class_count_squared), class_count_squared); float* classifier_scores = classifier_scores_data.data() + (n * num_classifiers); size_t index = 0; - for (int64_t i = 0; i < class_count_ - 1; ++i) { - int64_t p1 = i * class_count_ + i + 1; - int64_t p2 = (i + 1) * class_count_ + i; - for (int64_t j = i + 1; j < class_count_; ++j, ++index) { + for (size_t i = 0; i < class_count_ - 1; ++i) { + size_t p1 = i * class_count_ + i + 1; + size_t p2 = (i + 1) * class_count_ + i; + for (size_t j = i + 1; j < class_count_; ++j, ++index) { float val1 = sigmoid_probability(classifier_scores[index], proba_[index], probb_[index]); float val2 = std::max(val1, 1.0e-7f); val2 = std::min(val2, 1 - 1.0e-7f); - probsp2[onnxruntime::narrow(p1)] = val2; - probsp2[onnxruntime::narrow(p2)] = 1 - val2; + probsp2[p1] = val2; + probsp2[p2] = 1 - val2; ++p1; p2 += class_count_; } @@ -431,10 +442,10 @@ Status SVMClassifier::ComputeImpl(OpKernelContext& ctx, // onnx specs expects one column per class. 
if (num_classifiers == 1) { // binary case if (using_strings_) { - ChooseClass(Y, n, max_weight, maxclass, have_proba, weights_are_all_positive_, + ChooseClass(Y, n, max_weight, onnxruntime::narrow(maxclass), have_proba, weights_are_all_positive_, classlabels_strings_, "1", "0"); } else { - ChooseClass(Y, n, max_weight, maxclass, have_proba, weights_are_all_positive_, + ChooseClass(Y, n, max_weight, onnxruntime::narrow(maxclass), have_proba, weights_are_all_positive_, classlabels_ints_, 1, 0); } } else { // multiclass diff --git a/onnxruntime/core/providers/cpu/ml/svmclassifier.h b/onnxruntime/core/providers/cpu/ml/svmclassifier.h index e392d0915db68..4d7ed089089f2 100644 --- a/onnxruntime/core/providers/cpu/ml/svmclassifier.h +++ b/onnxruntime/core/providers/cpu/ml/svmclassifier.h @@ -121,12 +121,12 @@ class SVMClassifier final : public OpKernel, private SVMCommon { Status ComputeImpl(OpKernelContext& ctx, gsl::span x_data, const TensorShape& x_shape) const; bool weights_are_all_positive_; - ptrdiff_t feature_count_; - ptrdiff_t class_count_; - ptrdiff_t vector_count_; + size_t feature_count_; + size_t class_count_; + size_t vector_count_; bool using_strings_; std::vector vectors_per_class_; - std::vector starting_vector_; + std::vector starting_vector_; std::vector rho_; std::vector proba_; std::vector probb_; diff --git a/onnxruntime/core/providers/openvino/ov_versions/data_ops.cc b/onnxruntime/core/providers/openvino/ov_versions/data_ops.cc index 28aedc0faae61..37306d97b06ab 100644 --- a/onnxruntime/core/providers/openvino/ov_versions/data_ops.cc +++ b/onnxruntime/core/providers/openvino/ov_versions/data_ops.cc @@ -491,7 +491,8 @@ void DataOps::populate_op_mode_supported() { } { UnsupportedOpMode obj = {{V_2023_1, V_2023_2, V_2023_3, V_2024_0, V_2024_1, V_2024_2, - V_2024_3, V_2024_4, V_2024_5, V_2024_6, V_2025_0, V_2025_1, V_2025_2, V_2025_3, V_2025_4}, + V_2024_3, V_2024_4, V_2024_5, V_2024_6, V_2025_0, V_2025_1, + V_2025_2, V_2025_3, V_2025_4, V_2026_0, V_2026_1}, [this](const Node* node, const InitializedTensorSet&) { auto& attributes = node->GetAttributes(); if (attributes.count("coordinate_transformation_mode") > 0) { diff --git a/onnxruntime/core/providers/webgpu/llm/rotary_embedding.cc b/onnxruntime/core/providers/webgpu/llm/rotary_embedding.cc index ee46c76f1ea54..234b1d54e69c5 100644 --- a/onnxruntime/core/providers/webgpu/llm/rotary_embedding.cc +++ b/onnxruntime/core/providers/webgpu/llm/rotary_embedding.cc @@ -83,6 +83,13 @@ Status RotaryEmbedding::ComputeInternal(ComputeContext& context) const { if (position_ids != nullptr) { // position_ids provided: cos/sin cache is 2D (max_pos, D/2) + // position_ids bounds validation is handled by shader-side defense-in-depth checks + // (OOB position_ids → pass-through input unchanged). Host-side value scanning is not possible + // because WebGPU program inputs must be GPU buffers (InputMemoryType(OrtMemTypeCPUInput) is + // incompatible with AddInputs). + // Note: ONNX RotaryEmbedding has no base-offset mode (format 0) — position_ids is always + // a 2D tensor (batch_size, sequence_length) when provided. 
+ contrib::webgpu::RotaryEmbeddingProgram program{interleaved_}; program .CacheHint(interleaved_) diff --git a/onnxruntime/core/providers/webgpu/math/gemm_packed.cc b/onnxruntime/core/providers/webgpu/math/gemm_packed.cc index 79a4f1f73902b..96fe712a41b40 100644 --- a/onnxruntime/core/providers/webgpu/math/gemm_packed.cc +++ b/onnxruntime/core/providers/webgpu/math/gemm_packed.cc @@ -34,9 +34,9 @@ Status GemmProgram::GenerateShaderCode(ShaderHelper& shader) const { MatMulReadFnSource(shader, a, b, nullptr, transA_, transB_); } if (is_vec4_) { - ORT_RETURN_IF_ERROR(MakeMatMulPackedVec4Source(shader, elements_per_thread, WorkgroupSizeX(), WorkgroupSizeY(), data_type, nullptr, transA_, transB_, alpha_, need_handle_matmul_, output_components_, /*tile_inner*/ 32, need_split_k, split_dim_inner_)); + ORT_RETURN_IF_ERROR(MakeMatMulPackedVec4Source(shader, elements_per_thread, WorkgroupSizeX(), WorkgroupSizeY(), data_type, /* batch_dims = */ nullptr, transA_, transB_, alpha_, need_handle_matmul_, output_components_, /*tile_inner*/ 32, need_split_k, split_dim_inner_)); } else { - ORT_RETURN_IF_ERROR(MakeMatMulPackedSource(shader, elements_per_thread, WorkgroupSizeX(), WorkgroupSizeY(), data_type, nullptr, transA_, transB_, alpha_, need_handle_matmul_)); + ORT_RETURN_IF_ERROR(MakeMatMulPackedSource(shader, elements_per_thread, WorkgroupSizeX(), WorkgroupSizeY(), data_type, /* batch_dims = */ nullptr, transA_, transB_, alpha_, need_handle_matmul_)); } const ShaderVariableHelper* c = nullptr; diff --git a/onnxruntime/core/providers/webgpu/math/gemm_utils.cc b/onnxruntime/core/providers/webgpu/math/gemm_utils.cc index 573d7b016310f..b762c383a7c3f 100644 --- a/onnxruntime/core/providers/webgpu/math/gemm_utils.cc +++ b/onnxruntime/core/providers/webgpu/math/gemm_utils.cc @@ -309,13 +309,27 @@ Status MakeMatMulPackedVec4Source(ShaderHelper& shader, // atomic built-in functions in `HandleMatMulWithSplitK()`. shader.MainFunctionBody() << "const kSplitK = " << split_dim_inner << ";\n" - << " let num_tiles = (kSplitK - 1) / tileInner + 1;\n" - << " var kStart = kSplitK * i32(logical_global_id.z);\n" - - // When Split-K is used, `batch` should always be 0 and `logical_global_id.z` is used to indicate - // the index of split-k instead of batch. - << " let batch = 0;\n" - << " let batchIndices = 0u;\n"; + << " let num_tiles = (kSplitK - 1) / tileInner + 1;\n"; + if (nullptr != batch_dims) { + // With Split-K and batch (in MatMul and Conv2D|MatMul), `dispatch_z` is + // `splits_per_batch * batch_size`, and `logical_global_id.z` encodes both the + // batch index and the Split-K index within that range. + // We decompose it as: + // split_index = logical_global_id.z % splits_per_batch + // batch = logical_global_id.z / splits_per_batch + shader.MainFunctionBody() + << " let splits_per_batch = uniforms.splits_per_batch;\n" + << " let split_index = i32(logical_global_id.z) % i32(splits_per_batch);\n" + << " var kStart = kSplitK * split_index;\n" + << " let batch = i32(logical_global_id.z) / i32(splits_per_batch);\n" + << " let batchIndices = " << batch_dims->OffsetToIndices("u32(batch)") << ";\n"; + } else { + // With Split-K without batch (in Gemm), `logical_global_id.z` is exactly the Split-K index. 
+ shader.MainFunctionBody() + << " var kStart = kSplitK * i32(logical_global_id.z);\n" + << " let batch = 0;\n" + << " let batchIndices = 0u;\n"; + } } else { shader.MainFunctionBody() << " let num_tiles = (uniforms.dim_inner - 1) / tileInner + 1;\n" diff --git a/onnxruntime/core/providers/webgpu/math/matmul.cc b/onnxruntime/core/providers/webgpu/math/matmul.cc index af488f2c23a30..512a3d05c09eb 100644 --- a/onnxruntime/core/providers/webgpu/math/matmul.cc +++ b/onnxruntime/core/providers/webgpu/math/matmul.cc @@ -2,6 +2,9 @@ // Licensed under the MIT License. #include "core/providers/webgpu/math/matmul.h" + +#include + #include "core/common/inlined_containers.h" #include "core/providers/cpu/tensor/utils.h" #include "core/providers/webgpu/shader_helper.h" @@ -188,18 +191,18 @@ Status ComputeMatMul(ComputeContext* context, TensorShape output_shape = helper.OutputShape(); - const int64_t dim_output_outer = output_shape[output_shape.NumDimensions() - 2]; - // check if A is batch of vector (bach is not 1, M is 1) and B is a matrix (batch is 1) - if (batchA != 1 && dim_output_outer == 1 && batchB == 1) { - // optimization for batched vector matrix multiplication - // dimensions of A: [1,`batchA`,K] - TensorShapeVector dims_a = {1, batchA, helper.K()}; + // When B is a matrix (batch is 1), we fold batchA into the M dimension for better + // performance (e.g., [2,3,5] → [1,6,5]). + if (batchA != 1 && batchB == 1) { + // dimensions of A: [1,`batchA`, M, K] + int64_t batchAndM = a_shape.SizeToDimension(a_shape.NumDimensions() - 1); + TensorShapeVector dims_a = {1, batchAndM, helper.K()}; // dimensions of B: [1,K,N] TensorShapeVector dims_b = {1, helper.K(), helper.N()}; a_shape = TensorShape(dims_a); b_shape = TensorShape(dims_b); - output_shape = {1, batchA, helper.N()}; + output_shape = {1, batchAndM, helper.N()}; } // helpful dimension variables @@ -244,13 +247,13 @@ Status ComputeMatMul(ComputeContext* context, const Tensor* bias = has_bias ? inputs[2] : nullptr; bool use_bias_in_matmul = has_bias; uint32_t split_dim_inner = 1; + uint32_t splits_per_batch = 1; // Current Split-K implementation relies on atomic operations, which are not deterministic. if (!context->KernelContext().GetUseDeterministicCompute()) { const SplitKConfig& split_k_config = context->GetSplitKConfig(); const bool need_split_k = split_k_config.UseSplitK(is_vec4, activation.activation_kind_, batch_size, dim_a_outer, dim_b_outer, dim_inner, is_channels_last); if (need_split_k) { - ORT_ENFORCE(batch_size == 1, "Split-K MatMul only supports batch_size == 1."); ORT_ENFORCE(is_vec4, "Split-K MatMul requires vec4 packing."); if (has_bias) { @@ -258,17 +261,21 @@ Status ComputeMatMul(ComputeContext* context, } // Initialize `output_tensor` with 0 or bias before MatMulProgram with Split-K enabled. - const auto fill_bias_program = CreateMatMulFillBiasOrZeroBeforeSplitKProgram(bias, output_tensor, /*is_gemm*/ false, /*beta*/ 1.0f, /*bias_components*/ 4, output_shape_temp); + const auto fill_bias_program = CreateMatMulFillBiasOrZeroBeforeSplitKProgram(bias, output_tensor, /*is_gemm*/ false, /*beta*/ 1.0f, /*bias_components*/ 4, output_shape_temp, narrow(batch_size)); ORT_RETURN_IF_ERROR(context->RunProgram(fill_bias_program)); // `bias` has been handled in the execution of `fill_bias_program` so we don't need to set // `bias` again in `MatMulProgram`. use_bias_in_matmul = false; - // With Split-K, `dim_inner` will be split into multiple parts and `dispatch_z` will be the - // number of splits along `dim_inner`. 
+ // With Split-K, `dim_inner` will be split into multiple parts. `dispatch_z` encodes + // both the split-k index and the batch index: dispatch_z = splits_per_batch * batch_size. split_dim_inner = split_k_config.GetSplitDimInner(); - dispatch_z = (dim_inner + split_dim_inner - 1) / split_dim_inner; + splits_per_batch = (dim_inner + split_dim_inner - 1) / split_dim_inner; + const uint64_t dispatch_z_u64 = static_cast(batch_size) * static_cast(splits_per_batch); + ORT_ENFORCE(dispatch_z_u64 <= static_cast(std::numeric_limits::max()), + "dispatch_z exceeds uint32_t range: ", dispatch_z_u64); + dispatch_z = narrow(dispatch_z_u64); // The output should be declared in atomic types in `MatMulProgram` for the use of atomic // built-in functions. @@ -281,7 +288,7 @@ Status ComputeMatMul(ComputeContext* context, .CacheHint(activation.ToString(), absl::StrJoin(elements_per_thread, "-"), std::to_string(is_vec4), components, is_channels_last, split_dim_inner) .AddInputs({{a, ProgramTensorMetadataDependency::TypeAndRank, a_shape_temp, components}, {b, ProgramTensorMetadataDependency::TypeAndRank, b_shape_temp, components}}) - .AddUniformVariables({{dim_a_outer}, {dim_b_outer}, {dim_inner}, {dispatch_x}, {dispatch_y}, {dispatch_z}}) + .AddUniformVariables({{dim_a_outer}, {dim_b_outer}, {dim_inner}, {dispatch_x}, {dispatch_y}, {dispatch_z}, {splits_per_batch}}) .AddIndices(outer_dims) .SetDispatchGroupSize(dispatch_x, dispatch_y, dispatch_z) .SetWorkgroupSize(MatMul::MATMUL_PACKED_WORKGROUP_SIZE_X, MatMul::MATMUL_PACKED_WORKGROUP_SIZE_Y, MatMul::MATMUL_PACKED_WORKGROUP_SIZE_Z) @@ -302,31 +309,32 @@ MatMulFillBiasOrZeroBeforeSplitKProgram CreateMatMulFillBiasOrZeroBeforeSplitKPr bool is_gemm, float beta, uint32_t output_components, - const TensorShape& output_shape) { + const TensorShape& output_shape, + uint32_t batch_size) { const bool has_bias = bias != nullptr; const bool bias_is_scalar = has_bias ? bias->Shape().Size() == 1 : false; - // Currently we only support GEMM and channels last format for MatMul with Split-K. MatMulFillBiasOrZeroBeforeSplitKProgram program(is_gemm, has_bias, output_components, bias_is_scalar); const uint32_t dim_a_outer = narrow(output_shape[output_shape.NumDimensions() - 2]); const uint32_t dim_b_outer = narrow(output_shape[output_shape.NumDimensions() - 1]); - // Fill one value per invocation. Now we use default workgroup size (64) for this program. - const uint32_t total_outputs = dim_a_outer * dim_b_outer; - const uint32_t dispatch_x = (total_outputs + WORKGROUP_SIZE - 1) / WORKGROUP_SIZE; + // Fill one value per invocation across all batches. + const uint64_t total_outputs = static_cast(batch_size) * + static_cast(dim_a_outer) * + static_cast(dim_b_outer); + const uint64_t dispatch_x_u64 = CeilDiv(total_outputs, static_cast(WORKGROUP_SIZE)); + ORT_ENFORCE(dispatch_x_u64 <= static_cast(std::numeric_limits::max()), + "dispatch_x exceeds uint32_t range: ", dispatch_x_u64); + const uint32_t dispatch_x = narrow(dispatch_x_u64); - // To reuse `MatMulWriteFnSourceForGemm()` or `MatMulWriteFnSourceForMatMul()` we need to set - // `dim_b_outer` in components when `output_shape` is in `vec4`, while use `output_shape` directly - // as the output shape. 
const uint32_t dim_b_outer_components = narrow(dim_b_outer * output_components); program.CacheHint(is_gemm, has_bias, output_components, bias_is_scalar) .AddOutput({output, ProgramTensorMetadataDependency::TypeAndRank, output_shape, static_cast(output_components)}) - .AddUniformVariables({{dim_a_outer}, {dim_b_outer_components}, {beta}}) + .AddUniformVariables({{dim_a_outer}, {dim_b_outer_components}, {beta}, {batch_size}}) .SetDispatchGroupSize(dispatch_x); if (has_bias) { - // We always use `c_components` as `output_components` in GEMM, and 4 in MatMul. const TensorShape reduced_bias_shape = ReduceShapeByComponents(bias->Shape(), output_components); program.AddInput({bias, ProgramTensorMetadataDependency::TypeAndRank, reduced_bias_shape, static_cast(output_components)}); } diff --git a/onnxruntime/core/providers/webgpu/math/matmul.h b/onnxruntime/core/providers/webgpu/math/matmul.h index d15e36ffa3d85..89101c60a1b6c 100644 --- a/onnxruntime/core/providers/webgpu/math/matmul.h +++ b/onnxruntime/core/providers/webgpu/math/matmul.h @@ -24,7 +24,8 @@ MatMulFillBiasOrZeroBeforeSplitKProgram CreateMatMulFillBiasOrZeroBeforeSplitKPr bool is_gemm, float beta, uint32_t output_components, - const TensorShape& output_shape); + const TensorShape& output_shape, + uint32_t batch_size = 1); class MatMul final : public WebGpuKernel { public: diff --git a/onnxruntime/core/providers/webgpu/math/matmul_packed.cc b/onnxruntime/core/providers/webgpu/math/matmul_packed.cc index 0883c8ddb95b5..0d2a1962dd2a0 100644 --- a/onnxruntime/core/providers/webgpu/math/matmul_packed.cc +++ b/onnxruntime/core/providers/webgpu/math/matmul_packed.cc @@ -65,7 +65,6 @@ Status MatMulFillBiasOrZeroBeforeSplitKProgram::GenerateShaderCode(ShaderHelper& } // Handle bias with `MatMulWriteFnSourceForGemm() or MatMulWriteFnSourceForMatMul()`. 
- // const uint32_t bias_components = output_components_; if (is_gemm_) { MatMulWriteFnSourceForGemm(shader, output, bias, bias_is_scalar_); } else { @@ -77,15 +76,18 @@ Status MatMulFillBiasOrZeroBeforeSplitKProgram::GenerateShaderCode(ShaderHelper& shader.MainFunctionBody() << R"( let output_id = i32(global_idx); + let batch_size = i32(uniforms.batch_size); let dim_a_outer = i32(uniforms.dim_a_outer); let dim_b_outer = i32(uniforms.dim_b_outer) / output_components; - if (output_id >= dim_a_outer * dim_b_outer) { + let elements_per_batch = dim_a_outer * dim_b_outer; + if (output_id >= batch_size * elements_per_batch) { return; } - let output_row = output_id / dim_b_outer; - let output_col = output_id % dim_b_outer; - let output_batch = 0; + let output_batch = output_id / elements_per_batch; + let remaining = output_id % elements_per_batch; + let output_row = remaining / dim_b_outer; + let output_col = remaining % dim_b_outer; let output_value = output_value_t(); mm_write(output_batch, output_row, output_col, output_value); )"; diff --git a/onnxruntime/core/providers/webgpu/math/matmul_packed.h b/onnxruntime/core/providers/webgpu/math/matmul_packed.h index 618fc97d72fe0..eceb79f3c6a98 100644 --- a/onnxruntime/core/providers/webgpu/math/matmul_packed.h +++ b/onnxruntime/core/providers/webgpu/math/matmul_packed.h @@ -27,7 +27,8 @@ class MatMulProgram final : public Program { {"dim_inner", ProgramUniformVariableDataType::Uint32}, {"logical_dispatch_x", ProgramUniformVariableDataType::Uint32}, {"logical_dispatch_y", ProgramUniformVariableDataType::Uint32}, - {"logical_dispatch_z", ProgramUniformVariableDataType::Uint32}); + {"logical_dispatch_z", ProgramUniformVariableDataType::Uint32}, + {"splits_per_batch", ProgramUniformVariableDataType::Uint32}); bool NeedSplitK() const; @@ -58,7 +59,8 @@ class MatMulFillBiasOrZeroBeforeSplitKProgram final : public Program(oss, ","), [](uint32_t i) { return std::to_string(i); }); return oss.str(); }; - program.CacheHint(activation.ToString(), is_channels_last, stringify({inner_element_size, static_cast(is_vec4 ? 1 : 0), fit_a_outer, fit_b_outer, fit_inner, tile_a_outer, tile_a_outer, tile_inner, static_cast(components)})) + + program.CacheHint(activation.ToString(), is_channels_last, stringify({inner_element_size, static_cast(is_vec4 ? 1 : 0), fit_a_outer, fit_b_outer, fit_inner, tile_a_outer, tile_b_outer, tile_inner, static_cast(components)})) .AddOutput({output, ProgramTensorMetadataDependency::TypeAndRank, reduced_output_shape, components}) .SetDispatchGroupSize(dispatch[0], dispatch[1], dispatch[2]) .SetWorkgroupSize(workgroup_size[0], workgroup_size[1], workgroup_size[2]) diff --git a/onnxruntime/core/providers/webgpu/tensor/grid_sample.cc b/onnxruntime/core/providers/webgpu/tensor/grid_sample.cc new file mode 100644 index 0000000000000..abf7df6f4b8a2 --- /dev/null +++ b/onnxruntime/core/providers/webgpu/tensor/grid_sample.cc @@ -0,0 +1,253 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
+ +#include "core/providers/webgpu/tensor/grid_sample.h" + +#include "core/providers/webgpu/shader_helper.h" +#include "core/providers/webgpu/webgpu_supported_types.h" + +namespace onnxruntime { +namespace webgpu { + +Status GridSampleProgram::GenerateShaderCode(ShaderHelper& shader) const { + const auto& x = shader.AddInput("x", ShaderUsage::UseUniform | ShaderUsage::UseValueTypeAlias); + const auto& grid = shader.AddInput("grid", ShaderUsage::UseUniform); + const auto& y = shader.AddOutput("y", ShaderUsage::UseUniform); + + // gs_denormalize: specialized per align_corners + if (align_corners_) { + shader.AdditionalImplementation() + << "fn gs_denormalize(n: f32, length: u32) -> f32 {\n" + << " return (n + 1.0) * 0.5 * f32(length - 1u);\n" + << "}\n"; + } else { + shader.AdditionalImplementation() + << "fn gs_denormalize(n: f32, length: u32) -> f32 {\n" + << " return ((n + 1.0) * f32(length) - 1.0) * 0.5;\n" + << "}\n"; + } + + // gs_reflect: only needed for reflection padding mode + if (padding_mode_ == 2) { + shader.AdditionalImplementation() + << "fn gs_reflect(v: f32, v_min: f32, v_max: f32) -> f32 {\n" + << " var fv = v;\n" + << " let range = v_max - v_min;\n" + << " if (fv < v_min) {\n" + << " let dv = v_min - fv;\n" + << " let n = i32(dv / range);\n" + << " let r = dv - f32(n) * range;\n" + << " fv = select(v_max - r, v_min + r, n % 2 == 0);\n" + << " } else if (fv > v_max) {\n" + << " let dv = fv - v_max;\n" + << " let n = i32(dv / range);\n" + << " let r = dv - f32(n) * range;\n" + << " fv = select(v_min + r, v_max - r, n % 2 == 0);\n" + << " }\n" + << " return fv;\n" + << "}\n"; + } + + // gs_cubic_coeffs: only needed for bicubic mode + if (mode_ == 2) { + shader.AdditionalImplementation() + << "fn gs_cubic_coeffs(t: f32) -> vec4 {\n" + << " let ax = abs(t);\n" + << " let a = -0.75f;\n" + << " let c0 = ((a * (ax + 1.0) - 5.0 * a) * (ax + 1.0) + 8.0 * a) * (ax + 1.0) - 4.0 * a;\n" + << " let c1 = ((a + 2.0) * ax - (a + 3.0)) * ax * ax + 1.0;\n" + << " let c2 = ((a + 2.0) * (1.0 - ax) - (a + 3.0)) * (1.0 - ax) * (1.0 - ax) + 1.0;\n" + << " let c3 = ((a * (2.0 - ax) - 5.0 * a) * (2.0 - ax) + 8.0 * a) * (2.0 - ax) - 4.0 * a;\n" + << " return vec4(c0, c1, c2, c3);\n" + << "}\n"; + } + + // gs_pixel: pixel fetch helper, specialized per padding_mode (and align_corners for reflection) + // Returns f32 always; caller casts to output type. 
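+  // Behaviour sketch for an out-of-range request such as gs_pixel(base, -1, 2) (illustrative,
+  // not exhaustive): padding_mode "zeros" returns 0.0, "border" clamps the row to 0, and
+  // "reflection" mirrors the coordinate back into range before clamping and sampling.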
+ shader.AdditionalImplementation() + << "fn gs_pixel(img_base: u32, r: i32, col: i32) -> f32 {\n"; + + if (padding_mode_ == 0) { + // zeros: out-of-bounds -> 0 + shader.AdditionalImplementation() + << " if (r < 0 || r >= i32(uniforms.H_in) || col < 0 || col >= i32(uniforms.W_in)) {\n" + << " return 0.0;\n" + << " }\n" + << " return f32(" << x.GetByOffset("img_base + u32(r) * uniforms.W_in + u32(col)") << ");\n"; + } else if (padding_mode_ == 1) { + // border: clamp to nearest edge + shader.AdditionalImplementation() + << " let cr = u32(clamp(r, 0, i32(uniforms.H_in) - 1));\n" + << " let cc = u32(clamp(col, 0, i32(uniforms.W_in) - 1));\n" + << " return f32(" << x.GetByOffset("img_base + cr * uniforms.W_in + cc") << ");\n"; + } else { + // reflection: oscillating reflect, bounds depend on align_corners + if (align_corners_) { + // reflect within [0, length-1] + shader.AdditionalImplementation() + << " let rr = i32(gs_reflect(f32(r), 0.0, f32(uniforms.H_in) - 1.0));\n" + << " let cc = i32(gs_reflect(f32(col), 0.0, f32(uniforms.W_in) - 1.0));\n"; + } else { + // reflect within [-0.5, length-0.5] + shader.AdditionalImplementation() + << " let rr = i32(gs_reflect(f32(r), -0.5, f32(uniforms.H_in) - 0.5));\n" + << " let cc = i32(gs_reflect(f32(col), -0.5, f32(uniforms.W_in) - 0.5));\n"; + } + shader.AdditionalImplementation() + << " let ur = u32(clamp(rr, 0, i32(uniforms.H_in) - 1));\n" + << " let uc = u32(clamp(cc, 0, i32(uniforms.W_in) - 1));\n" + << " return f32(" << x.GetByOffset("img_base + ur * uniforms.W_in + uc") << ");\n"; + } + shader.AdditionalImplementation() << "}\n"; + + // Main function body + auto& body = shader.MainFunctionBody(); + body << shader.GuardAgainstOutOfBoundsWorkgroupSizes("uniforms.output_size") + // Decode global_idx -> (n, c, h_out, w_out) + << " let HW_out = uniforms.H_out * uniforms.W_out;\n" + << " let CHW_out = uniforms.C * HW_out;\n" + << " let n = global_idx / CHW_out;\n" + << " let rem = global_idx % CHW_out;\n" + << " let c = rem / HW_out;\n" + << " let hw = rem % HW_out;\n" + << " let h_out = hw / uniforms.W_out;\n" + << " let w_out = hw % uniforms.W_out;\n" + // Read normalized grid coords: grid is [N, H_out, W_out, 2], gx=x-coord (W), gy=y-coord (H) + << " let grid_base = ((n * uniforms.H_out + h_out) * uniforms.W_out + w_out) * 2u;\n" + << " let gx = f32(" << grid.GetByOffset("grid_base") << ");\n" + << " let gy = f32(" << grid.GetByOffset("grid_base + 1u") << ");\n" + // Denormalize to image-space coordinates + << " let px = gs_denormalize(gx, uniforms.W_in);\n" + << " let py = gs_denormalize(gy, uniforms.H_in);\n" + // Base flat offset for this (n, c) plane of X: [N, C, H_in, W_in] + << " let img_base = (n * uniforms.C + c) * uniforms.H_in * uniforms.W_in;\n"; + + if (mode_ == 1) { + // nearest: round to nearest integer + body << " let rx = i32(round(px));\n" + << " let ry = i32(round(py));\n" + << " let result = gs_pixel(img_base, ry, rx);\n"; + } else if (mode_ == 0) { + // bilinear: 4-neighbor weighted interpolation + body << " let x1 = i32(floor(px));\n" + << " let y1 = i32(floor(py));\n" + << " let x2 = x1 + 1;\n" + << " let y2 = y1 + 1;\n" + << " let dx1 = px - f32(x1);\n" + << " let dx2 = 1.0 - dx1;\n" + << " let dy1 = py - f32(y1);\n" + << " let dy2 = 1.0 - dy1;\n" + << " let p11 = gs_pixel(img_base, y1, x1);\n" + << " let p12 = gs_pixel(img_base, y1, x2);\n" + << " let p21 = gs_pixel(img_base, y2, x1);\n" + << " let p22 = gs_pixel(img_base, y2, x2);\n" + << " let result = dy2 * (dx2 * p11 + dx1 * p12) + dy1 * (dx2 * p21 + dx1 * p22);\n"; + } 
else { + // bicubic: 4x4 neighborhood with Robert Keys coefficients (alpha=-0.75) + body << " let x0 = i32(floor(px)) - 1;\n" + << " let y0 = i32(floor(py)) - 1;\n" + << " let dx = px - f32(x0 + 1);\n" + << " let dy = py - f32(y0 + 1);\n" + << " let cx = gs_cubic_coeffs(dx);\n" + << " let cy = gs_cubic_coeffs(dy);\n" + << " var rows: vec4;\n" + << " for (var i = 0i; i < 4i; i++) {\n" + << " let row = y0 + i;\n" + << " rows[i] = cx[0] * gs_pixel(img_base, row, x0 )\n" + << " + cx[1] * gs_pixel(img_base, row, x0 + 1)\n" + << " + cx[2] * gs_pixel(img_base, row, x0 + 2)\n" + << " + cx[3] * gs_pixel(img_base, row, x0 + 3);\n" + << " }\n" + << " let result = dot(cy, rows);\n"; + } + + body << " " << y.SetByOffset("global_idx", "x_value_t(result)") << "\n"; + + return Status::OK(); +} + +GridSample::GridSample(const OpKernelInfo& info) : WebGpuKernel(info) { + // Accept both opset-16 names ("bilinear"/"bicubic") and opset-20+ names ("linear"/"cubic") + std::string mode_str = info.GetAttrOrDefault("mode", "bilinear"); + if (mode_str == "bilinear" || mode_str == "linear") { + mode_ = 0; + } else if (mode_str == "nearest") { + mode_ = 1; + } else if (mode_str == "bicubic" || mode_str == "cubic") { + mode_ = 2; + } else { + ORT_THROW("GridSample: unsupported mode \"", mode_str, "\""); + } + + std::string padding_mode_str = info.GetAttrOrDefault("padding_mode", "zeros"); + if (padding_mode_str == "zeros") { + padding_mode_ = 0; + } else if (padding_mode_str == "border") { + padding_mode_ = 1; + } else if (padding_mode_str == "reflection") { + padding_mode_ = 2; + } else { + ORT_THROW("GridSample: unsupported padding_mode \"", padding_mode_str, "\""); + } + + align_corners_ = static_cast(info.GetAttrOrDefault("align_corners", 0)); +} + +Status GridSample::ComputeInternal(ComputeContext& context) const { + const auto* X = context.Input(0); + const auto* grid = context.Input(1); + + const auto& X_shape = X->Shape(); + const auto& grid_shape = grid->Shape(); + + ORT_RETURN_IF_NOT(X_shape.NumDimensions() == 4, "X must be 4-D for opset 16"); + ORT_RETURN_IF_NOT(grid_shape.NumDimensions() == 4, "grid must be 4-D"); + ORT_RETURN_IF_NOT(grid_shape[3] == 2, "grid last dimension must be 2"); + + const int64_t N = X_shape[0]; + const int64_t C = X_shape[1]; + const int64_t H_in = X_shape[2]; + const int64_t W_in = X_shape[3]; + + ORT_RETURN_IF_NOT(grid_shape[0] == N, "grid batch size must match X batch size"); + + const int64_t H_out = grid_shape[1]; + const int64_t W_out = grid_shape[2]; + + TensorShape Y_shape{N, C, H_out, W_out}; + auto* Y = context.Output(0, Y_shape); + + const uint32_t output_size = onnxruntime::narrow(Y_shape.Size()); + if (output_size == 0) { + return Status::OK(); + } + + GridSampleProgram program{mode_, padding_mode_, align_corners_}; + program + .AddInputs({{X, ProgramTensorMetadataDependency::TypeAndRank}, + {grid, ProgramTensorMetadataDependency::TypeAndRank}}) + .AddOutput({Y, ProgramTensorMetadataDependency::Rank}) + .SetDispatchGroupSize((output_size + WORKGROUP_SIZE - 1) / WORKGROUP_SIZE) + .CacheHint(mode_, padding_mode_, static_cast(align_corners_)) + .AddUniformVariables({{output_size}, + {static_cast(C)}, + {static_cast(H_in)}, + {static_cast(W_in)}, + {static_cast(H_out)}, + {static_cast(W_out)}}); + + return context.RunProgram(program); +} + +ONNX_OPERATOR_VERSIONED_KERNEL_EX( + GridSample, + kOnnxDomain, + 16, 19, + kWebGpuExecutionProvider, + (*KernelDefBuilder::Create()) + .TypeConstraint("T1", WebGpuSupportedFloatTypes()) + .TypeConstraint("T2", 
WebGpuSupportedFloatTypes()), + GridSample); + +} // namespace webgpu +} // namespace onnxruntime diff --git a/onnxruntime/core/providers/webgpu/tensor/grid_sample.h b/onnxruntime/core/providers/webgpu/tensor/grid_sample.h new file mode 100644 index 0000000000000..acc100c725009 --- /dev/null +++ b/onnxruntime/core/providers/webgpu/tensor/grid_sample.h @@ -0,0 +1,51 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#pragma once + +#include "core/providers/webgpu/program.h" +#include "core/providers/webgpu/webgpu_kernel.h" + +namespace onnxruntime { +namespace webgpu { + +// mode: 0=bilinear(linear), 1=nearest, 2=bicubic(cubic) +// padding_mode: 0=zeros, 1=border, 2=reflection + +class GridSampleProgram final : public Program { + public: + GridSampleProgram(int mode, int padding_mode, bool align_corners) + : Program{"GridSample"}, + mode_{mode}, + padding_mode_{padding_mode}, + align_corners_{align_corners} {} + + Status GenerateShaderCode(ShaderHelper& sh) const override; + + WEBGPU_PROGRAM_DEFINE_UNIFORM_VARIABLES( + {"output_size", ProgramUniformVariableDataType::Uint32}, + {"C", ProgramUniformVariableDataType::Uint32}, + {"H_in", ProgramUniformVariableDataType::Uint32}, + {"W_in", ProgramUniformVariableDataType::Uint32}, + {"H_out", ProgramUniformVariableDataType::Uint32}, + {"W_out", ProgramUniformVariableDataType::Uint32}); + + private: + int mode_; + int padding_mode_; + bool align_corners_; +}; + +class GridSample final : public WebGpuKernel { + public: + explicit GridSample(const OpKernelInfo& info); + Status ComputeInternal(ComputeContext& context) const override; + + private: + int mode_{0}; + int padding_mode_{0}; + bool align_corners_{false}; +}; + +} // namespace webgpu +} // namespace onnxruntime diff --git a/onnxruntime/core/providers/webgpu/vendor/intel/math/matmul.cc b/onnxruntime/core/providers/webgpu/vendor/intel/math/matmul.cc index 0362deb0fbd6a..b6ec2e0c2b10b 100644 --- a/onnxruntime/core/providers/webgpu/vendor/intel/math/matmul.cc +++ b/onnxruntime/core/providers/webgpu/vendor/intel/math/matmul.cc @@ -55,18 +55,18 @@ Status ApplyMatMulIntel(ComputeContext& context, TensorShape output_shape = helper.OutputShape(); - const int64_t dim_output_outer = output_shape[output_shape.NumDimensions() - 2]; - // check if A is batch of vector (bach is not 1, M is 1) and B is a matrix (batch is 1) - if (batchA != 1 && dim_output_outer == 1 && batchB == 1) { - // optimization for batched vector matrix multiplication - // dimensions of A: [1,`batchA`,K] - TensorShapeVector dims_a = {1, batchA, helper.K()}; + // When B is a matrix (batch is 1), we fold batchA into the M dimension for better + // performance (e.g., [2,3,5] → [1,6,5]). 
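+  // Restating the shapes involved: batchAndM below is the product of all dims of A except the
+  // last (K), i.e. batchA * M, so the folded problem is a single [batchA * M, K] x [K, N] matmul.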
+ if (batchA != 1 && batchB == 1) { + // dimensions of A: [1,`batchA`, M, K] + int64_t batchAndM = a_shape.SizeToDimension(a_shape.NumDimensions() - 1); + TensorShapeVector dims_a = {1, batchAndM, helper.K()}; // dimensions of B: [1,K,N] TensorShapeVector dims_b = {1, helper.K(), helper.N()}; a_shape = TensorShape(dims_a); b_shape = TensorShape(dims_b); - output_shape = {1, batchA, helper.N()}; + output_shape = {1, batchAndM, helper.N()}; } // helpful dimension variables diff --git a/onnxruntime/core/providers/webgpu/webgpu_execution_provider.cc b/onnxruntime/core/providers/webgpu/webgpu_execution_provider.cc index d85f5011ea043..d1cde04277938 100644 --- a/onnxruntime/core/providers/webgpu/webgpu_execution_provider.cc +++ b/onnxruntime/core/providers/webgpu/webgpu_execution_provider.cc @@ -31,6 +31,7 @@ #include "core/providers/webgpu/webgpu_profiler.h" #include "core/providers/webgpu/tensor/cast.h" #include "core/providers/webgpu/tensor/expand.h" +#include "core/providers/webgpu/tensor/grid_sample.h" #include "core/providers/webgpu/generator/range.h" #include "core/providers/webgpu/tensor/unsqueeze.h" @@ -448,6 +449,8 @@ static const BuildKernelCreateInfoFn build_kernel_create_info_function_table[] = BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, + + BuildKernelCreateInfo, }; std::unique_ptr RegisterKernels(bool enable_graph_capture, bool enable_int64) { @@ -716,6 +719,11 @@ std::optional WebGpuExecutionProvider::ShouldConvertDataLayoutForOp(std::s return target_data_layout != DataLayout::NHWC; } + // GridSample is NCHW-only (opset 16 spec requires NCHW input) + if (node_domain == kOnnxDomain && node_op_type == "GridSample") { + return target_data_layout != DataLayout::NHWC; + } + // WebGPU perfer NCHW for InstanceNormalization due to a better performance if (node_domain == kOnnxDomain && node_op_type == "InstanceNormalization") { return target_data_layout != DataLayout::NHWC; diff --git a/onnxruntime/core/providers/webgpu/webgpu_utils.cc b/onnxruntime/core/providers/webgpu/webgpu_utils.cc index 5127801ca8451..ec0664c5fdb6a 100644 --- a/onnxruntime/core/providers/webgpu/webgpu_utils.cc +++ b/onnxruntime/core/providers/webgpu/webgpu_utils.cc @@ -36,34 +36,36 @@ SplitKConfig::SplitKConfig(const wgpu::AdapterInfo& adapter_info) { } else if (adapter_info.architecture == std::string_view{"xe-2lpg"} || adapter_info.architecture == std::string_view{"xe-2hpg"} || adapter_info.architecture == std::string_view{"gen-12hp"}) { - // Below thresholds are only verified on Intel discreate GPUs and Lunar Lake iGPUs. + // Below thresholds are only verified on Intel discrete GPUs and Lunar Lake iGPUs. enable_split_k_ = true; + max_batch_size_ = 8; split_dim_inner_ = 256; min_dim_inner_with_split_k_ = split_dim_inner_ * 2; - configs_per_dim_inner_range_.emplace_back(768, 52.0f); - configs_per_dim_inner_range_.emplace_back(2304, 35.0f); - configs_per_dim_inner_range_.emplace_back(3072, 21.5f); - configs_per_dim_inner_range_.emplace_back(4096, 16.0f); + configs_per_dim_inner_range_.emplace_back(768, 52.0); + configs_per_dim_inner_range_.emplace_back(2304, 35.0); + configs_per_dim_inner_range_.emplace_back(3072, 21.5); + configs_per_dim_inner_range_.emplace_back(4096, 16.0); } else { // Below are the default thresholds on newer Intel GPUs. These values are chosen on // Intel "gen-12lp" GPU with 32EUs. 
enable_split_k_ = true; + max_batch_size_ = 8; split_dim_inner_ = 256; min_dim_inner_with_split_k_ = split_dim_inner_ * 2; - configs_per_dim_inner_range_.emplace_back(768, 20.0f); - configs_per_dim_inner_range_.emplace_back(1792, 13.0f); - configs_per_dim_inner_range_.emplace_back(3072, 8.0f); - configs_per_dim_inner_range_.emplace_back(4096, 6.0f); + configs_per_dim_inner_range_.emplace_back(768, 20.0); + configs_per_dim_inner_range_.emplace_back(1792, 13.0); + configs_per_dim_inner_range_.emplace_back(3072, 8.0); + configs_per_dim_inner_range_.emplace_back(4096, 6.0); } } } -SplitKConfig::ConfigAtRange::ConfigAtRange(uint32_t max_dim_inner, float rate) - : max_dim_inner_with_rate(max_dim_inner), max_dim_a_outer_multiplies_dim_b_outer_divides_dim_inner(rate) {} +SplitKConfig::ConfigAtRange::ConfigAtRange(uint32_t max_dim_inner, double rate) + : max_dim_inner_with_rate(max_dim_inner), max_dim_a_outer_x_dim_b_outer_x_batch_size_divides_dim_inner(rate) {} uint32_t SplitKConfig::GetMaxDimInnerWithSplitK() const { assert(!configs_per_dim_inner_range_.empty()); @@ -87,7 +89,10 @@ bool SplitKConfig::UseSplitK( // TODO: support the cases below. use_split_k &= activation_kind == ActivationKind::None; use_split_k &= is_vec4; - use_split_k &= batch_size == 1; + + // Larger batches increase parallelism on their own, so we temporarily set a batch size threshold + // for using Split-K. + use_split_k &= batch_size <= max_batch_size_; // `is_channels_last` should only affect Split-K gating when bias is applied in the non-GEMM // MatMul/Conv|MatMul path. For GEMM and for MatMul or Conv|MatMul without bias, we need to @@ -97,8 +102,8 @@ bool SplitKConfig::UseSplitK( use_split_k &= is_channels_last; // Split-K works best when `dim_inner` is relatively large compared with `dim_a_outer` and - // `dim_b_outer`. Currently we use the factor between `(dim_a_outer * dim_b_outer)` and - // `dim_inner)` as the metric to decide whether to use Split-K or not. + // `dim_b_outer`. Currently we use the factor between `(dim_a_outer * dim_b_outer * batch_size)` + // and `dim_inner` as the metric to decide whether to use Split-K or not. 
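+  // Worked example (illustrative numbers, using the default thresholds set in the constructor
+  // above): dim_a_outer = 64, dim_b_outer = 64, batch_size = 2, dim_inner = 1024 gives
+  // rate = 64 * 64 * 2 / 1024 = 8.0, which is within the 13.0 limit of the dim_inner <= 1792
+  // range, so Split-K is used.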
use_split_k &= dim_inner >= min_dim_inner_with_split_k_; use_split_k &= dim_inner <= GetMaxDimInnerWithSplitK(); @@ -106,10 +111,10 @@ bool SplitKConfig::UseSplitK( return false; } - const float rate = dim_a_outer * dim_b_outer * 1.0f / dim_inner; + const double rate = static_cast(dim_a_outer) * static_cast(dim_b_outer) * static_cast(batch_size) / static_cast(dim_inner); for (const auto& config_at_range : configs_per_dim_inner_range_) { if (dim_inner <= config_at_range.max_dim_inner_with_rate) { - return rate <= config_at_range.max_dim_a_outer_multiplies_dim_b_outer_divides_dim_inner; + return rate <= config_at_range.max_dim_a_outer_x_dim_b_outer_x_batch_size_divides_dim_inner; } } return false; diff --git a/onnxruntime/core/providers/webgpu/webgpu_utils.h b/onnxruntime/core/providers/webgpu/webgpu_utils.h index cbceaf2be120d..d4bb245e3e9e8 100644 --- a/onnxruntime/core/providers/webgpu/webgpu_utils.h +++ b/onnxruntime/core/providers/webgpu/webgpu_utils.h @@ -115,13 +115,14 @@ class SplitKConfig { bool enable_split_k_ = false; uint32_t split_dim_inner_ = 0; uint32_t min_dim_inner_with_split_k_ = 0; + uint32_t max_batch_size_ = 0; uint32_t GetMaxDimInnerWithSplitK() const; struct ConfigAtRange { - ConfigAtRange(uint32_t max_dim_inner, float rate); + ConfigAtRange(uint32_t max_dim_inner, double rate); uint32_t max_dim_inner_with_rate = 0; - float max_dim_a_outer_multiplies_dim_b_outer_divides_dim_inner = 0.0f; + double max_dim_a_outer_x_dim_b_outer_x_batch_size_divides_dim_inner = 0.0; }; std::vector configs_per_dim_inner_range_; }; diff --git a/onnxruntime/core/providers/webnn/builders/impl/pool_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/pool_op_builder.cc index 727531f6a42d5..37b3c8eae7ebd 100644 --- a/onnxruntime/core/providers/webnn/builders/impl/pool_op_builder.cc +++ b/onnxruntime/core/providers/webnn/builders/impl/pool_op_builder.cc @@ -95,8 +95,10 @@ Status PoolOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, options.set("padding", emscripten::val::array(padding)); const auto ceil_mode = helper.Get("ceil_mode", 0); - options.set("roundingType", ceil_mode == 0 ? emscripten::val("floor") - : emscripten::val("ceil")); + emscripten::val output_shape_rounding = ceil_mode == 0 ? emscripten::val("floor") : emscripten::val("ceil"); + // WebNN renamed roundingType to outputShapeRounding, but set older name too for compatibility. + options.set("roundingType", output_shape_rounding); + options.set("outputShapeRounding", output_shape_rounding); // WebNN doesn't support AveragePool with count_include_pad == 1, emulate it by pad + averagePool2d. if (op_type == "AveragePool" && helper.Get("count_include_pad", 0) == 1) { diff --git a/onnxruntime/core/session/onnxruntime_c_api.cc b/onnxruntime/core/session/onnxruntime_c_api.cc index ff4c08c9b14c0..3f28529e7a847 100644 --- a/onnxruntime/core/session/onnxruntime_c_api.cc +++ b/onnxruntime/core/session/onnxruntime_c_api.cc @@ -432,6 +432,112 @@ union PtrConvert { const char** strings; }; +// Shared validation for COO indices used by both FillSparseTensorCoo and UseCooIndices. +// Returns nullptr on success, OrtStatus* on validation failure. 
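+// Example of inputs that pass the checks below (illustrative only): for a 3x4 dense shape with
+// two values, linear indices {1, 7} (indices_num == values_size) are accepted, as are 2-D index
+// pairs {0, 1, 1, 3} (indices_num == 2 * values_size); an index of 12 or a negative index would
+// be rejected.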
+OrtStatus* ValidateCooIndices(const int64_t* indices_data, size_t indices_num, + size_t values_size, const TensorShape& dense_shape) { + if (indices_num > 0 && indices_data == nullptr) { + return OrtApis::CreateStatus(ORT_INVALID_ARGUMENT, + "indices_data must not be null when indices_num > 0."); + } + if ((values_size == 0) != (indices_num == 0)) { + return OrtApis::CreateStatus( + ORT_INVALID_ARGUMENT, + values_size == 0 + ? "COO indices must be empty when the sparse tensor has no values." + : "COO indices must be provided when the sparse tensor has values."); + } + if (values_size > 0 && indices_num > 0) { + if (indices_num == values_size) { + const auto dense_size = dense_shape.Size(); + for (size_t i = 0; i < indices_num; ++i) { + if (indices_data[i] < 0 || indices_data[i] >= dense_size) { + return OrtApis::CreateStatus( + ORT_INVALID_ARGUMENT, + MakeString("COO linear index out of bounds: ", indices_data[i], + " must be in [0, ", dense_size, ")") + .c_str()); + } + } + } else if (indices_num / 2 == values_size && indices_num % 2 == 0) { + if (dense_shape.NumDimensions() != 2) { + return OrtApis::CreateStatus(ORT_INVALID_ARGUMENT, + "COO 2D indices require dense shape of 2 dimensions"); + } + const auto rows = dense_shape.GetDims()[0]; + const auto cols = dense_shape.GetDims()[1]; + size_t tuple_idx = 0; + for (size_t i = 0; i < values_size; ++i, tuple_idx += 2) { + auto r = indices_data[tuple_idx]; + auto c = indices_data[tuple_idx + 1]; + if (r < 0 || r >= rows || c < 0 || c >= cols) { + return OrtApis::CreateStatus( + ORT_INVALID_ARGUMENT, + MakeString("COO 2D index out of bounds: (", r, ", ", c, + ") must be in [0, ", rows, ") x [0, ", cols, ")") + .c_str()); + } + } + } else { + return OrtApis::CreateStatus(ORT_INVALID_ARGUMENT, + "COO indices count must be equal to or twice the values count."); + } + } + return nullptr; +} + +// Shared validation for CSR indices used by both FillSparseTensorCsr and UseCsrIndices. +// Returns nullptr on success, OrtStatus* on validation failure. 
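+// Example of inputs that pass the checks below (illustrative only): for a 3x4 dense shape with
+// three values, inner (column) indices {0, 1, 2} and outer (row offset) indices {0, 1, 2, 3}
+// are accepted: outer starts at 0, never decreases, and its last element equals the inner count.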
+OrtStatus* ValidateCsrIndices(const int64_t* inner_data, size_t inner_num, + const int64_t* outer_data, size_t outer_num, + const TensorShape& dense_shape) { + if (inner_num > 0 && inner_data == nullptr) { + return OrtApis::CreateStatus(ORT_INVALID_ARGUMENT, + "inner index data must not be null when inner index count > 0."); + } + if (outer_num > 0 && outer_data == nullptr) { + return OrtApis::CreateStatus(ORT_INVALID_ARGUMENT, + "outer index data must not be null when outer index count > 0."); + } + if (dense_shape.NumDimensions() == 2 && inner_num > 0) { + const auto cols = dense_shape.GetDims()[1]; + for (size_t i = 0; i < inner_num; ++i) { + if (inner_data[i] < 0 || inner_data[i] >= cols) { + return OrtApis::CreateStatus( + ORT_INVALID_ARGUMENT, + MakeString("CSR inner index out of bounds: ", inner_data[i], + " must be in [0, ", cols, ")") + .c_str()); + } + } + } + if (outer_num > 0) { + if (outer_data[0] != 0) { + return OrtApis::CreateStatus( + ORT_INVALID_ARGUMENT, + MakeString("CSR outer index must start at 0, got: ", outer_data[0]).c_str()); + } + if (outer_data[outer_num - 1] != static_cast(inner_num)) { + return OrtApis::CreateStatus( + ORT_INVALID_ARGUMENT, + MakeString("CSR outer index last element must equal inner index count (", + inner_num, "), got: ", outer_data[outer_num - 1]) + .c_str()); + } + int64_t prev = 0; + for (size_t i = 0; i < outer_num; ++i) { + auto val = outer_data[i]; + if (val < prev || val > static_cast(inner_num)) { + return OrtApis::CreateStatus( + ORT_INVALID_ARGUMENT, + MakeString("CSR outer index out of bounds or not monotonically non-decreasing: ", val).c_str()); + } + prev = val; + } + } + return nullptr; +} + #endif // !defined(DISABLE_SPARSE_TENSORS) } // namespace @@ -446,6 +552,11 @@ ORT_API_STATUS_IMPL(OrtApis::FillSparseTensorCoo, _Inout_ OrtValue* ort_value, _ auto values_size = narrow(values_t_shape.Size()); auto indices_span = gsl::make_span(indices_data, indices_num); + if (auto* status = ValidateCooIndices(indices_data, indices_num, values_size, + sparse_tensor.DenseShape())) { + return status; + } + if (sparse_tensor.IsDataTypeString()) { PtrConvert conv(values); ORT_THROW_IF_ERROR(sparse_tensor.MakeCooStrings(values_size, conv.strings, indices_span)); @@ -481,6 +592,13 @@ ORT_API_STATUS_IMPL(OrtApis::FillSparseTensorCsr, _Inout_ OrtValue* ort_value, _ auto inner_indices_span = gsl::make_span(inner_indices_data, inner_indices_num); auto outer_indices_span = gsl::make_span(outer_indices_data, outer_indices_num); + + if (auto* status = ValidateCsrIndices(inner_indices_data, inner_indices_num, + outer_indices_data, outer_indices_num, + sparse_tensor.DenseShape())) { + return status; + } + if (sparse_tensor.IsDataTypeString()) { PtrConvert conv(values); ORT_THROW_IF_ERROR(sparse_tensor.MakeCsrStrings(values_size, conv.strings, inner_indices_span, outer_indices_span)); @@ -592,6 +710,12 @@ ORT_API_STATUS_IMPL(OrtApis::UseCooIndices, _Inout_ OrtValue* ort_value, _Inout_ ? gsl::span() : gsl::make_span(indices_data, indices_num); + if (auto* status = ValidateCooIndices(indices_data, indices_num, + sparse_tensor.NumValues(), + sparse_tensor.DenseShape())) { + return status; + } + ORT_THROW_IF_ERROR(sparse_tensor.UseCooIndices(indices_span)); return nullptr; #else @@ -616,6 +740,12 @@ ORT_API_STATUS_IMPL(OrtApis::UseCsrIndices, _Inout_ OrtValue* ort_value, auto outer_span = (outer_num == 0 || outer_data == nullptr) ? 
gsl::span() : gsl::make_span(outer_data, outer_num); + + if (auto* status = ValidateCsrIndices(inner_data, inner_num, outer_data, outer_num, + sparse_tensor.DenseShape())) { + return status; + } + ORT_THROW_IF_ERROR(sparse_tensor.UseCsrIndices(inner_span, outer_span)); return nullptr; #else diff --git a/onnxruntime/core/session/ort_version_check.h b/onnxruntime/core/session/ort_version_check.h index 82fd757e3ce9f..f8fab0367b17d 100644 --- a/onnxruntime/core/session/ort_version_check.h +++ b/onnxruntime/core/session/ort_version_check.h @@ -10,21 +10,20 @@ namespace onnxruntime::version_check { -// A simple consteval-friendly result type for ParseUint. -// std::optional triggers an internal compiler error in MSVC 14.44 when used with consteval. +// A simple constexpr-friendly result type for ParseUint. struct ParseUintResult { uint32_t value; bool has_value; - consteval bool operator==(uint32_t other) const { return has_value && value == other; } - consteval bool operator!=(uint32_t other) const { return !(*this == other); } + constexpr bool operator==(uint32_t other) const { return has_value && value == other; } + constexpr bool operator!=(uint32_t other) const { return !(*this == other); } }; -inline consteval ParseUintResult ParseUintNone() { return {0, false}; } +inline constexpr ParseUintResult ParseUintNone() { return {0, false}; } // Parse a non-negative integer from a string_view without leading zeros. // Returns a result with has_value == false on failure (empty, leading zero, non-digit, or overflow). -consteval ParseUintResult ParseUint(std::string_view str) { +constexpr ParseUintResult ParseUint(std::string_view str) { if (str.empty()) return ParseUintNone(); // Leading zeros are not allowed (except "0" itself). if (str.size() > 1 && str[0] == '0') return ParseUintNone(); @@ -42,7 +41,7 @@ consteval ParseUintResult ParseUint(std::string_view str) { // - Major version is 1 // - Y and Z are non-negative integers without leading zeros // - Y (minor version) must equal expected_api_version (defaults to ORT_API_VERSION) -consteval bool IsOrtVersionValid(std::string_view version, uint32_t expected_api_version = ORT_API_VERSION) { +constexpr bool IsOrtVersionValid(std::string_view version, uint32_t expected_api_version = ORT_API_VERSION) { size_t first_dot = version.find('.'); if (first_dot == std::string_view::npos) return false; size_t second_dot = version.find('.', first_dot + 1); diff --git a/onnxruntime/python/backend/backend.py b/onnxruntime/python/backend/backend.py index 19f46189e2933..69be7a7657adf 100644 --- a/onnxruntime/python/backend/backend.py +++ b/onnxruntime/python/backend/backend.py @@ -17,6 +17,29 @@ from onnxruntime import InferenceSession, SessionOptions, get_available_providers, get_device from onnxruntime.backend.backend_rep import OnnxRuntimeBackendRep +# Allowlist of SessionOptions attributes that are safe to set via the backend API. 
+# Dangerous attributes intentionally excluded: +# optimized_model_filepath — triggers Model::Save(), overwrites arbitrary files +# profile_file_prefix — writes profiling JSON to arbitrary path +# enable_profiling — causes uncontrolled file writes to cwd +_ALLOWED_SESSION_OPTIONS = frozenset( + { + "enable_cpu_mem_arena", + "enable_mem_pattern", + "enable_mem_reuse", + "execution_mode", + "execution_order", + "graph_optimization_level", + "inter_op_num_threads", + "intra_op_num_threads", + "log_severity_level", + "log_verbosity_level", + "logid", + "use_deterministic_compute", + "use_per_session_threads", + } +) + class OnnxRuntimeBackend(Backend): """ @@ -93,16 +116,18 @@ def supports_device(cls, device): @classmethod def prepare(cls, model, device=None, **kwargs): """ - Load the model and creates a :class:`onnxruntime.InferenceSession` + Load the model and creates an :class:`onnxruntime.backend.backend_rep.OnnxRuntimeBackendRep` ready to be used as a backend. - :param model: ModelProto (returned by `onnx.load`), - string for a filename or bytes for a serialized model + :param model: the model to prepare — accepts a file path (str), serialized + model (bytes), :class:`onnx.ModelProto`, :class:`onnxruntime.InferenceSession`, + or :class:`onnxruntime.backend.backend_rep.OnnxRuntimeBackendRep` (returned as-is) :param device: requested device for the computation, None means the default one which depends on the compilation settings - :param kwargs: see :class:`onnxruntime.SessionOptions` - :return: :class:`onnxruntime.InferenceSession` + :param kwargs: only a safe subset of :class:`onnxruntime.SessionOptions` attributes are + accepted; see ``_ALLOWED_SESSION_OPTIONS`` for the list + :return: :class:`onnxruntime.backend.backend_rep.OnnxRuntimeBackendRep` """ if isinstance(model, OnnxRuntimeBackendRep): return model @@ -111,8 +136,14 @@ def prepare(cls, model, device=None, **kwargs): elif isinstance(model, (str, bytes)): options = SessionOptions() for k, v in kwargs.items(): - if hasattr(options, k): + if k in _ALLOWED_SESSION_OPTIONS: setattr(options, k, v) + elif hasattr(options, k): + raise RuntimeError( + f"SessionOptions attribute '{k}' is not permitted via the backend API. " + f"Allowed attributes: {', '.join(sorted(_ALLOWED_SESSION_OPTIONS))}" + ) + # else: silently ignore unknown keys excluded_providers = os.getenv("ORT_ONNX_BACKEND_EXCLUDE_PROVIDERS", default="").split(",") providers = [x for x in get_available_providers() if (x not in excluded_providers)] @@ -148,13 +179,21 @@ def run_model(cls, model, inputs, device=None, **kwargs): """ Compute the prediction. - :param model: :class:`onnxruntime.InferenceSession` returned - by function *prepare* + :param model: the model to run — accepts a file path (str), serialized + model (bytes), :class:`onnx.ModelProto`, :class:`onnxruntime.InferenceSession`, + or :class:`onnxruntime.backend.backend_rep.OnnxRuntimeBackendRep` :param inputs: inputs :param device: requested device for the computation, None means the default one which depends on the compilation settings - :param kwargs: see :class:`onnxruntime.RunOptions` + :param kwargs: ``run_model()`` forwards kwargs to both ``prepare()`` and ``rep.run()``. + ``prepare()`` validates and applies ``_ALLOWED_SESSION_OPTIONS`` only when creating + a new session from a model path or bytes; if ``model`` is already an + ``InferenceSession`` or ``OnnxRuntimeBackendRep``, session-option kwargs are + silently ignored. 
``rep.run()`` always validates against ``_ALLOWED_RUN_OPTIONS`` + and raises ``RuntimeError`` for known-but-blocked run attributes. + Logging-related kwargs (``log_severity_level``, ``log_verbosity_level``, ``logid``) + appear in both allowlists. :return: predictions """ rep = cls.prepare(model, device, **kwargs) diff --git a/onnxruntime/python/backend/backend_rep.py b/onnxruntime/python/backend/backend_rep.py index a30569d004d34..950ce417c6c2d 100644 --- a/onnxruntime/python/backend/backend_rep.py +++ b/onnxruntime/python/backend/backend_rep.py @@ -10,11 +10,23 @@ from onnxruntime import RunOptions +# Allowlist of RunOptions attributes that are safe to set via the backend API. +# 'terminate' excluded: setting it True would deny the current inference call. +# 'training_mode' excluded: silently switches inference behavior in training builds. +_ALLOWED_RUN_OPTIONS = frozenset( + { + "log_severity_level", + "log_verbosity_level", + "logid", + "only_execute_path_to_fetches", + } +) + class OnnxRuntimeBackendRep(BackendRep): """ - Computes the prediction for a pipeline converted into - an :class:`onnxruntime.InferenceSession` node. + Wraps an :class:`onnxruntime.InferenceSession` to implement ONNX's + :class:`onnx.backend.base.BackendRep` interface for running predictions. """ def __init__(self, session): @@ -27,12 +39,24 @@ def run(self, inputs, **kwargs): # type: (Any, **Any) -> Tuple[Any, ...] """ Computes the prediction. See :meth:`onnxruntime.InferenceSession.run`. + + :param inputs: a list of input arrays (one per model input) or a single + array when the model has exactly one input + :param kwargs: only a safe subset of :class:`onnxruntime.RunOptions` attributes are + accepted; see ``_ALLOWED_RUN_OPTIONS`` for the list + :return: list of output arrays """ options = RunOptions() for k, v in kwargs.items(): - if hasattr(options, k): + if k in _ALLOWED_RUN_OPTIONS: setattr(options, k, v) + elif hasattr(options, k): + raise RuntimeError( + f"RunOptions attribute '{k}' is not permitted via the backend API. " + f"Allowed attributes: {', '.join(sorted(_ALLOWED_RUN_OPTIONS))}" + ) + # else: silently ignore unknown keys if isinstance(inputs, list): inps = {} diff --git a/onnxruntime/python/onnxruntime_inference_collection.py b/onnxruntime/python/onnxruntime_inference_collection.py index def2240358c10..e35e3c5753d36 100644 --- a/onnxruntime/python/onnxruntime_inference_collection.py +++ b/onnxruntime/python/onnxruntime_inference_collection.py @@ -13,10 +13,11 @@ from enum import IntEnum from typing import Any +import numpy as np + from onnxruntime.capi import _pybind_state as C if typing.TYPE_CHECKING: - import numpy as np import numpy.typing as npt import onnxruntime @@ -1212,8 +1213,6 @@ def __array__(self, dtype=None, copy=None) -> np.ndarray: If ``None`` (default), a copy will be made only if needed. :return: A numpy array with the same data as the OrtValue. """ - import numpy as np # noqa: PLC0415 - arr = self.numpy() if copy is not None: @@ -1302,15 +1301,25 @@ def from_dlpack(cls, data, /) -> OrtValue: return cls(C.OrtValue.from_dlpack(capsule, is_bool)) - def update_inplace(self, np_arr) -> None: + def update_inplace(self, data) -> None: """ - Update the OrtValue in place with a new Numpy array. The numpy contents - are copied over to the device memory backing the OrtValue. It can be used - to update the input valuess for an InferenceSession with CUDA graph - enabled or other scenarios where the OrtValue needs to be updated while - the memory address can not be changed. 
+ Update the OrtValue in place. The source data is copied over to the device + memory backing the OrtValue. It can be used to update the input values for + an InferenceSession with CUDA graph enabled or other scenarios where the + OrtValue needs to be updated while the memory address can not be changed. + + :param data: The source data, which can be a Numpy array or another OrtValue. + When an OrtValue is provided, data can be copied between devices (e.g., + GPU to GPU) without going through the CPU. """ - self._ortvalue.update_inplace(np_arr) + if isinstance(data, OrtValue): + self._ortvalue.update_inplace(data._ortvalue) + return + + if not isinstance(data, np.ndarray): + raise TypeError("data must be a numpy.ndarray or an OrtValue.") + + self._ortvalue.update_inplace(data) def copy_tensors(src: Sequence[OrtValue], dst: Sequence[OrtValue], stream=None) -> None: diff --git a/onnxruntime/python/onnxruntime_pybind_mlvalue.cc b/onnxruntime/python/onnxruntime_pybind_mlvalue.cc index 89651c2d955de..fa609fe6ea83d 100644 --- a/onnxruntime/python/onnxruntime_pybind_mlvalue.cc +++ b/onnxruntime/python/onnxruntime_pybind_mlvalue.cc @@ -1071,5 +1071,116 @@ void CreateGenericMLValue(const onnxruntime::InputDefList* input_def_list, const } } +void UpdateOrtValueInplace(OrtValue& dst, const OrtValue& src) { + if (!dst.IsTensor()) { + throw std::runtime_error("Inplace update of OrtValues is only supported for Tensors"); + } + if (!src.IsTensor()) { + throw std::runtime_error("The source OrtValue must contain a Tensor"); + } + + const auto& dst_tensor = dst.Get(); + const auto& src_tensor = src.Get(); + + if (dst_tensor.DataType() != src_tensor.DataType()) { + throw std::runtime_error("The source and destination OrtValues must have the same data type"); + } + + if (dst_tensor.Shape().Size() != src_tensor.Shape().Size()) { + throw std::runtime_error("The source and destination OrtValues must have the same size"); + } + + if (dst_tensor.IsDataTypeString()) { + throw std::runtime_error("Inplace update of string tensors is not supported"); + } + + size_t bytes = 0; + auto status = Tensor::CalculateTensorStorageSize(dst_tensor.DataType(), dst_tensor.Shape(), 0, bytes); + if (!status.IsOK()) { + throw std::runtime_error(status.ErrorMessage()); + } + + const auto src_device = src_tensor.Location().device; + const auto dst_device = dst_tensor.Location().device; + + void* dst_ptr = dst.GetMutable()->MutableDataRaw(); + const void* src_ptr = src_tensor.DataRaw(); + + if (src_device.UsesCpuMemory() && dst_device.UsesCpuMemory()) { + memcpy(dst_ptr, src_ptr, bytes); + } else { + auto copy_fn = CreateDataTransferMemCpy(src_device, dst_device); + if (!copy_fn) { + // Fall back to built-in EP copy functions. + // Gate each path on (Type, VendorId) so that builds with multiple GPU EPs + // (e.g. CUDA + DML) route through the correct backend. 
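+      // For instance, a GPU device reporting VendorIds::NVIDIA only matches the CUDA branch
+      // below, while the DML branch additionally accepts OrtDevice::DML, so a CUDA+DML build
+      // copies each tensor through the backend that actually owns its memory.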
+#ifdef USE_CUDA + const auto is_cuda_device = [](const OrtDevice& device) { + return device.Type() == OrtDevice::GPU && device.Vendor() == OrtDevice::VendorIds::NVIDIA; + }; + + if (is_cuda_device(src_device) && is_cuda_device(dst_device)) { + auto data_transfer = GetGPUDataTransfer(); + ORT_THROW_IF_ERROR(data_transfer->CopyTensor(src_tensor, *dst.GetMutable())); + return; + } + if (src_device.UsesCpuMemory() && is_cuda_device(dst_device)) { + CpuToCudaMemCpy(dst_ptr, src_ptr, bytes); + return; + } + if (is_cuda_device(src_device) && dst_device.UsesCpuMemory()) { + CudaToCpuMemCpy(dst_ptr, src_ptr, bytes); + return; + } +#endif +#if USE_MIGRAPHX + const auto is_migraphx_device = [](const OrtDevice& device) { + return device.Type() == OrtDevice::GPU && device.Vendor() == OrtDevice::VendorIds::AMD; + }; + + if (src_device.UsesCpuMemory() && is_migraphx_device(dst_device)) { + CpuToMIGraphXMemCpy(dst_ptr, src_ptr, bytes); + return; + } + if (is_migraphx_device(src_device) && dst_device.UsesCpuMemory()) { + MIGraphXToCpuMemCpy(dst_ptr, src_ptr, bytes); + return; + } +#endif +#if USE_DML + const auto is_dml_device = [](const OrtDevice& device) { + return (device.Type() == OrtDevice::GPU && device.Vendor() == OrtDevice::VendorIds::MICROSOFT) || + device.Type() == OrtDevice::DML; + }; + + if (src_device.UsesCpuMemory() && is_dml_device(dst_device)) { + CpuToDmlMemCpy(dst_ptr, src_ptr, bytes); + return; + } + if (is_dml_device(src_device) && dst_device.UsesCpuMemory()) { + DmlToCpuMemCpy(dst_ptr, src_ptr, bytes); + return; + } +#endif +#ifdef USE_CANN + const auto is_cann_device = [](const OrtDevice& device) { + return device.Type() == OrtDevice::NPU && device.Vendor() == OrtDevice::VendorIds::HUAWEI; + }; + + if (src_device.UsesCpuMemory() && is_cann_device(dst_device)) { + CpuToCannMemCpy(dst_ptr, src_ptr, bytes); + return; + } + if (is_cann_device(src_device) && dst_device.UsesCpuMemory()) { + CannToCpuMemCpy(dst_ptr, src_ptr, bytes); + return; + } +#endif + throw std::runtime_error("Unable to copy data between the source and destination devices"); + } + copy_fn(dst_ptr, src_ptr, bytes); + } +} + } // namespace python } // namespace onnxruntime diff --git a/onnxruntime/python/onnxruntime_pybind_mlvalue.h b/onnxruntime/python/onnxruntime_pybind_mlvalue.h index 144b3edcad404..097c5b4d20d65 100644 --- a/onnxruntime/python/onnxruntime_pybind_mlvalue.h +++ b/onnxruntime/python/onnxruntime_pybind_mlvalue.h @@ -138,6 +138,11 @@ pybind11::object GetPyObjFromTensor(const OrtValue& rtensor, const std::unordered_map* mem_cpy_to_host_functions = nullptr, bool zero_copy_non_owning = false); +// Update the tensor data in an OrtValue in-place from another OrtValue. +// Both OrtValues must contain tensors of the same data type and size. +// This function supports various device-to-device transfers. 
+void UpdateOrtValueInplace(OrtValue& dst, const OrtValue& src); + // The below two functions are used to convert OrtValue to numpy arrays /// diff --git a/onnxruntime/python/onnxruntime_pybind_ortvalue.cc b/onnxruntime/python/onnxruntime_pybind_ortvalue.cc index eb966ac5fc314..168d57fc0827b 100644 --- a/onnxruntime/python/onnxruntime_pybind_ortvalue.cc +++ b/onnxruntime/python/onnxruntime_pybind_ortvalue.cc @@ -237,6 +237,9 @@ void addOrtValueMethods(pybind11::module& m) { throw std::runtime_error("Unsupported device: Cannot update the OrtValue on this device"); } }) + .def("update_inplace", [](OrtValue* ml_value, const OrtValue& source) { + python::UpdateOrtValueInplace(*ml_value, source); + }) // Create an ortvalue value on top of the numpy array, but interpret the data // as a different type with the same element size. .def_static("ortvalue_from_numpy_with_onnx_type", [](py::array& data, int32_t onnx_element_type) -> std::unique_ptr { diff --git a/onnxruntime/python/onnxruntime_pybind_sparse_tensor.cc b/onnxruntime/python/onnxruntime_pybind_sparse_tensor.cc index 1154f3b9f88b8..c30501c431a6c 100644 --- a/onnxruntime/python/onnxruntime_pybind_sparse_tensor.cc +++ b/onnxruntime/python/onnxruntime_pybind_sparse_tensor.cc @@ -95,25 +95,29 @@ void addSparseTensorMethods(pybind11::module& m) { py::class_(m, "SparseCooView") // Returns a numpy array of COO indices backed by Sparse Tensor memory // be aware that indices may reside on GPU if Sparse Tensor is on GPU - .def("indices", [](const PySparseCooView* view) -> py::array { + .def("indices", [](py::object self) -> py::array { + auto* view = self.cast(); const auto& indices = view->Indices(); - return MakeNumpyArrayFromIndices(indices, py::cast(*view)); + return MakeNumpyArrayFromIndices(indices, self); }); py::class_(m, "SparseCsrView") - .def("inner", [](const PySparseCsrView* view) -> py::array { + .def("inner", [](py::object self) -> py::array { + auto* view = self.cast(); const auto& indices = view->Inner(); - return MakeNumpyArrayFromIndices(indices, py::cast(*view)); + return MakeNumpyArrayFromIndices(indices, self); }) - .def("outer", [](const PySparseCsrView* view) -> py::array { + .def("outer", [](py::object self) -> py::array { + auto* view = self.cast(); const auto& indices = view->Outer(); - return MakeNumpyArrayFromIndices(indices, py::cast(*view)); + return MakeNumpyArrayFromIndices(indices, self); }); py::class_(m, "SparseBlockSparseView") - .def("indices", [](const PySparseBlockSparseView* view) -> py::array { + .def("indices", [](py::object self) -> py::array { + auto* view = self.cast(); const auto& indices = view->Indices(); - return MakeNumpyArrayFromIndices(indices, py::cast(*view)); + return MakeNumpyArrayFromIndices(indices, self); }); py::class_ sparse_bind(m, "SparseTensor"); @@ -296,7 +300,8 @@ void addSparseTensorMethods(pybind11::module& m) { }) // Returns a numpy array that is backed by SparseTensor values memory // be aware that it may be on GPU - .def("values", [](const PySparseTensor* py_tensor) -> py::array { + .def("values", [](py::object self) -> py::array { + auto* py_tensor = self.cast(); const SparseTensor& sparse_tensor = py_tensor->Instance(); if (sparse_tensor.Format() == SparseFormat::kUndefined) { ORT_THROW("This sparse tensor instance does not contain data"); @@ -311,7 +316,7 @@ void addSparseTensorMethods(pybind11::module& m) { auto dtype = t_disp.InvokeRet(); const auto& values = sparse_tensor.Values(); // See https://github.com/pybind/pybind11/issues/2271 - py::array result(dtype, 
values.Shape().GetDims(), values.DataRaw(), py::cast(*py_tensor)); + py::array result(dtype, values.Shape().GetDims(), values.DataRaw(), self); assert(!result.owndata()); // Set a read-only flag PyArray_CLEARFLAGS(reinterpret_cast(result.ptr()), NPY_ARRAY_WRITEABLE); diff --git a/onnxruntime/test/common/test_cpuinfo_sysfs_fallback.py b/onnxruntime/test/common/test_cpuinfo_sysfs_fallback.py new file mode 100644 index 0000000000000..12511512314a5 --- /dev/null +++ b/onnxruntime/test/common/test_cpuinfo_sysfs_fallback.py @@ -0,0 +1,563 @@ +#!/usr/bin/env python3 +""" +Simulation test for the cpuinfo sysfs fallback fix. + +This test verifies two fixes for https://github.com/microsoft/onnxruntime/issues/10038: + +1. Safe logging in env.cc - PosixEnv constructor no longer crashes when the + logging system is not yet initialized and cpuinfo_initialize() fails. + +2. cpuinfo sysfs fallback - The patched cpuinfo library falls back to + sysconf(_SC_NPROCESSORS_ONLN) for both processor counts and per-CPU + present/possible flags when /sys/devices/system/cpu/{possible,present} + files are missing. + +Testing approach: +- Test 1: Compile a small C++ program that calls the safe logging pattern + without a registered logger. Verify it doesn't crash. +- Test 2: Compile a small C program that validates the sysconf fallback + arithmetic and verifies that the fallback marks each online CPU with both + PRESENT and POSSIBLE flags. This catches the incomplete count-only fallback. +- Test 3: Use an LD_PRELOAD shim (like the lambda-arm64-onnx workaround) + to simulate missing sysfs files and verify ORT loads without crash. + +Note: Tests 2 and 3 require a build of ORT with the patches applied. +Test 1 can run standalone. +""" + +import os +import shutil +import subprocess +import sys +import tempfile +import textwrap +import unittest + + +def _require_linux(): + if sys.platform != "linux": + raise unittest.SkipTest("Test requires Linux") + + +def _require_gcc(): + if not shutil.which("gcc"): + raise unittest.SkipTest("gcc not found") + + +def _require_gpp(): + if not shutil.which("g++"): + raise unittest.SkipTest("g++ not found") + + +class TestCpuinfoSysfsFallback(unittest.TestCase): + def test_safe_logging_pattern(self): + """Verify the safe logging pattern doesn't crash when no logger exists. + + This simulates the fix in env.cc where we check HasDefaultLogger() before + calling LOGS_DEFAULT(). We compile a minimal C++ program that: + - Does NOT register a default logger + - Calls the safe logging pattern + - Verifies it writes to stderr instead of crashing + """ + _require_linux() + _require_gpp() + + source = textwrap.dedent(r""" + #include + #include + + // Minimal simulation of ORT's logging check pattern + namespace logging { + class LoggingManager { + public: + // Simulate: no default logger registered + static bool HasDefaultLogger() { return false; } + }; + } // namespace logging + + void LogEarlyWarning(std::string_view message) { + if (logging::LoggingManager::HasDefaultLogger()) { + // Would call LOGS_DEFAULT(WARNING) here - but logger doesn't exist + // This path should NOT be taken + std::cerr << "BUG: should not reach here\n"; + return; + } + // Safe fallback to stderr + std::cerr << "onnxruntime warning: " << message << "\n"; + } + + int main() { + // This simulates what PosixEnv() does when cpuinfo_initialize() fails + bool cpuinfo_available = false; // Simulating failure + if (!cpuinfo_available) { + LogEarlyWarning("cpuinfo_initialize failed. 
" + "May cause CPU EP performance degradation due to undetected CPU features."); + } + std::cout << "PASS: Safe logging pattern works without crash\n"; + return 0; + } + """) + + with tempfile.NamedTemporaryFile(suffix=".cc", mode="w", delete=False) as f: + f.write(source) + src_path = f.name + + try: + exe_path = src_path.replace(".cc", "") + result = subprocess.run( + ["g++", "-std=c++17", "-o", exe_path, src_path], check=False, capture_output=True, text=True + ) + self.assertEqual(result.returncode, 0, f"Compilation failed: {result.stderr}") + + result = subprocess.run([exe_path], check=False, capture_output=True, text=True, timeout=10) + self.assertEqual( + result.returncode, 0, f"Program crashed with exit code {result.returncode}: {result.stderr}" + ) + self.assertIn("PASS", result.stdout) + finally: + os.unlink(src_path) + if os.path.exists(src_path.replace(".cc", "")): + os.unlink(src_path.replace(".cc", "")) + + def test_sysconf_fallback(self): + """Verify sysconf(_SC_NPROCESSORS_ONLN) works as a complete fallback. + + This doesn't test the actual cpuinfo patch (that requires building cpuinfo) + but verifies the fallback mechanism produces correct counts and marks + present/possible flags for each online CPU. + """ + _require_linux() + _require_gcc() + + source = textwrap.dedent(r""" + #include + #include + #include + + #define CPUINFO_LINUX_FLAG_PRESENT 0x1 + #define CPUINFO_LINUX_FLAG_POSSIBLE 0x2 + + int main() { + long nproc = sysconf(_SC_NPROCESSORS_ONLN); + if (nproc <= 0) { + printf("FAIL: sysconf(_SC_NPROCESSORS_ONLN) returned %ld\n", nproc); + return 1; + } + // Simulate what the patched cpuinfo max-count helpers return: + // max_processor = nproc - 1 (0-indexed). Then arm_linux_init does: + // 1 + max_processor = nproc. + unsigned int max_processor = (unsigned int)(nproc - 1); + unsigned int arm_linux_processors_count = 1 + max_processor; + + uint32_t processor_flags[1024] = {0}; + unsigned int processors_count = arm_linux_processors_count; + if (processors_count > 1024) { + processors_count = 1024; + } + + // Simulate cpuinfo_linux_detect_possible_processors() and + // cpuinfo_linux_detect_present_processors() fallback helpers. 
+ for (unsigned int processor = 0; processor < processors_count; ++processor) { + processor_flags[processor] |= CPUINFO_LINUX_FLAG_PRESENT; + processor_flags[processor] |= CPUINFO_LINUX_FLAG_POSSIBLE; + } + + unsigned int valid_processors = 0; + const uint32_t valid_processor_mask = CPUINFO_LINUX_FLAG_PRESENT | CPUINFO_LINUX_FLAG_POSSIBLE; + for (unsigned int processor = 0; processor < processors_count; ++processor) { + if ((processor_flags[processor] & valid_processor_mask) == valid_processor_mask) { + ++valid_processors; + } + } + + printf("sysconf(_SC_NPROCESSORS_ONLN) = %ld\n", nproc); + printf("Simulated max_processor = %u\n", max_processor); + printf("Simulated arm_linux_processors_count = %u\n", arm_linux_processors_count); + printf("Simulated valid_processors = %u\n", valid_processors); + + if (arm_linux_processors_count == (unsigned int)nproc && valid_processors == processors_count) { + printf("PASS: Fallback produces correct processor count and flags\n"); + return 0; + } + printf("FAIL: Processor count or flags mismatch\n"); + return 1; + } + """) + + with tempfile.NamedTemporaryFile(suffix=".c", mode="w", delete=False) as f: + f.write(source) + src_path = f.name + + try: + exe_path = src_path.replace(".c", "") + result = subprocess.run(["gcc", "-o", exe_path, src_path], check=False, capture_output=True, text=True) + self.assertEqual(result.returncode, 0, f"Compilation failed: {result.stderr}") + + result = subprocess.run([exe_path], check=False, capture_output=True, text=True, timeout=10) + self.assertEqual(result.returncode, 0, f"exit code {result.returncode}: {result.stdout}") + self.assertIn("PASS", result.stdout) + finally: + os.unlink(src_path) + if os.path.exists(src_path.replace(".c", "")): + os.unlink(src_path.replace(".c", "")) + + def test_sysfs_hide_with_ld_preload(self): + """Verify LD_PRELOAD shim can hide sysfs files. + + This compiles a small shim that intercepts open-family calls to return + ENOENT for /sys/devices/system/cpu/{possible,present}, then runs a test + program that opens those files. + """ + _require_linux() + _require_gcc() + + shim_source = textwrap.dedent(r""" + #define _GNU_SOURCE + #include + #include + #include + #include + #include + #include + #include + +#ifndef O_TMPFILE +#define O_TMPFILE 0 +#endif + + static const char *CPU_POSSIBLE = "/sys/devices/system/cpu/possible"; + static const char *CPU_PRESENT = "/sys/devices/system/cpu/present"; + + static int is_blocked(const char *path) { + return (strcmp(path, CPU_POSSIBLE) == 0 || strcmp(path, CPU_PRESENT) == 0); + } + + static mode_t get_mode_if_needed(int flags, va_list args) { + return ((flags & O_CREAT) || ((flags & O_TMPFILE) == O_TMPFILE)) ? va_arg(args, mode_t) : 0; + } + + int open(const char *path, int flags, ...) { + static int (*real_open)(const char *, int, ...) = NULL; + va_list args; + mode_t mode = 0; + + if (!real_open) real_open = dlsym(RTLD_NEXT, "open"); + if (is_blocked(path)) { + errno = ENOENT; + return -1; + } + + va_start(args, flags); + mode = get_mode_if_needed(flags, args); + va_end(args); + return ((flags & O_CREAT) || ((flags & O_TMPFILE) == O_TMPFILE)) + ? real_open(path, flags, mode) + : real_open(path, flags); + } + + int open64(const char *path, int flags, ...) { + static int (*real_open64)(const char *, int, ...) 
= NULL; + va_list args; + mode_t mode = 0; + + if (!real_open64) real_open64 = dlsym(RTLD_NEXT, "open64"); + if (is_blocked(path)) { + errno = ENOENT; + return -1; + } + + va_start(args, flags); + mode = get_mode_if_needed(flags, args); + va_end(args); + return ((flags & O_CREAT) || ((flags & O_TMPFILE) == O_TMPFILE)) + ? real_open64(path, flags, mode) + : real_open64(path, flags); + } + + int openat(int dirfd, const char *path, int flags, ...) { + static int (*real_openat)(int, const char *, int, ...) = NULL; + va_list args; + mode_t mode = 0; + + if (!real_openat) real_openat = dlsym(RTLD_NEXT, "openat"); + if (path && is_blocked(path)) { + errno = ENOENT; + return -1; + } + + va_start(args, flags); + mode = get_mode_if_needed(flags, args); + va_end(args); + return ((flags & O_CREAT) || ((flags & O_TMPFILE) == O_TMPFILE)) + ? real_openat(dirfd, path, flags, mode) + : real_openat(dirfd, path, flags); + } + + int openat64(int dirfd, const char *path, int flags, ...) { + static int (*real_openat64)(int, const char *, int, ...) = NULL; + va_list args; + mode_t mode = 0; + + if (!real_openat64) real_openat64 = dlsym(RTLD_NEXT, "openat64"); + if (path && is_blocked(path)) { + errno = ENOENT; + return -1; + } + + va_start(args, flags); + mode = get_mode_if_needed(flags, args); + va_end(args); + return ((flags & O_CREAT) || ((flags & O_TMPFILE) == O_TMPFILE)) + ? real_openat64(dirfd, path, flags, mode) + : real_openat64(dirfd, path, flags); + } + + FILE *fopen(const char *restrict path, const char *restrict mode) { + static FILE *(*real_fopen)(const char *, const char *) = NULL; + if (!real_fopen) real_fopen = dlsym(RTLD_NEXT, "fopen"); + + if (is_blocked(path)) { + errno = ENOENT; + return NULL; + } + return real_fopen(path, mode); + } + """) + + test_source = textwrap.dedent(r""" + #include + #include + #include + #include + #include + + static int try_open(const char *path) { + int fd = open(path, O_RDONLY); + if (fd >= 0) { + close(fd); + } + return fd; + } + + int main() { + int fd; + int pass = 1; + + fd = try_open("/sys/devices/system/cpu/possible"); + if (fd >= 0) { + printf("FAIL: /sys/devices/system/cpu/possible should be blocked\n"); + pass = 0; + } else { + printf("OK: /sys/devices/system/cpu/possible blocked (errno=%d: %s)\n", + errno, strerror(errno)); + } + + fd = try_open("/sys/devices/system/cpu/present"); + if (fd >= 0) { + printf("FAIL: /sys/devices/system/cpu/present should be blocked\n"); + pass = 0; + } else { + printf("OK: /sys/devices/system/cpu/present blocked (errno=%d: %s)\n", + errno, strerror(errno)); + } + + // Verify other files still work + fd = try_open("/proc/cpuinfo"); + if (fd < 0) { + printf("WARN: /proc/cpuinfo not accessible (may be OK in some envs)\n"); + } else { + printf("OK: /proc/cpuinfo still accessible\n"); + } + + if (pass) { + printf("PASS: LD_PRELOAD sysfs-hiding shim works correctly\n"); + } + return pass ? 
0 : 1; + } + """) + + with tempfile.TemporaryDirectory() as tmpdir: + shim_path = os.path.join(tmpdir, "hide_sysfs.c") + shim_so = os.path.join(tmpdir, "hide_sysfs.so") + test_path = os.path.join(tmpdir, "test_sysfs.c") + test_exe = os.path.join(tmpdir, "test_sysfs") + + with open(shim_path, "w") as f: + f.write(shim_source) + with open(test_path, "w") as f: + f.write(test_source) + + # Compile shim + result = subprocess.run( + ["gcc", "-shared", "-fPIC", "-o", shim_so, shim_path, "-ldl"], + check=False, + capture_output=True, + text=True, + ) + self.assertEqual(result.returncode, 0, f"Shim compilation failed: {result.stderr}") + + # Compile test + result = subprocess.run(["gcc", "-o", test_exe, test_path], check=False, capture_output=True, text=True) + self.assertEqual(result.returncode, 0, f"Test compilation failed: {result.stderr}") + + # Run with LD_PRELOAD + env = os.environ.copy() + env["LD_PRELOAD"] = shim_so + result = subprocess.run([test_exe], check=False, capture_output=True, text=True, timeout=10, env=env) + self.assertEqual(result.returncode, 0, f"exit code {result.returncode}: {result.stdout}") + self.assertIn("PASS", result.stdout) + + def test_ort_import_with_hidden_sysfs(self): + """Integration test - import onnxruntime with hidden sysfs files. + + This uses the LD_PRELOAD shim to hide /sys/devices/system/cpu/{possible,present} + and then imports onnxruntime. This is the actual end-to-end test that + verifies both fixes work together. + + NOTE: This requires onnxruntime to be built with the patches applied. + """ + _require_linux() + _require_gcc() + + # Check if onnxruntime is importable + result = subprocess.run( + [sys.executable, "-c", "import onnxruntime"], check=False, capture_output=True, text=True, timeout=30 + ) + if result.returncode != 0: + self.skipTest("onnxruntime not installed/importable") + + shim_source = textwrap.dedent(r""" + #define _GNU_SOURCE + #include + #include + #include + #include + #include + #include + #include + +#ifndef O_TMPFILE +#define O_TMPFILE 0 +#endif + + static const char *CPU_POSSIBLE = "/sys/devices/system/cpu/possible"; + static const char *CPU_PRESENT = "/sys/devices/system/cpu/present"; + + static int is_blocked(const char *path) { + return (strcmp(path, CPU_POSSIBLE) == 0 || strcmp(path, CPU_PRESENT) == 0); + } + + static mode_t get_mode_if_needed(int flags, va_list args) { + return ((flags & O_CREAT) || ((flags & O_TMPFILE) == O_TMPFILE)) ? va_arg(args, mode_t) : 0; + } + + int open(const char *path, int flags, ...) { + static int (*real_open)(const char *, int, ...) = NULL; + va_list args; + mode_t mode = 0; + + if (!real_open) real_open = dlsym(RTLD_NEXT, "open"); + if (is_blocked(path)) { errno = ENOENT; return -1; } + + va_start(args, flags); + mode = get_mode_if_needed(flags, args); + va_end(args); + return ((flags & O_CREAT) || ((flags & O_TMPFILE) == O_TMPFILE)) + ? real_open(path, flags, mode) + : real_open(path, flags); + } + + int open64(const char *path, int flags, ...) { + static int (*real_open64)(const char *, int, ...) = NULL; + va_list args; + mode_t mode = 0; + + if (!real_open64) real_open64 = dlsym(RTLD_NEXT, "open64"); + if (is_blocked(path)) { errno = ENOENT; return -1; } + + va_start(args, flags); + mode = get_mode_if_needed(flags, args); + va_end(args); + return ((flags & O_CREAT) || ((flags & O_TMPFILE) == O_TMPFILE)) + ? real_open64(path, flags, mode) + : real_open64(path, flags); + } + + int openat(int dirfd, const char *path, int flags, ...) { + static int (*real_openat)(int, const char *, int, ...) 
= NULL; + va_list args; + mode_t mode = 0; + + if (!real_openat) real_openat = dlsym(RTLD_NEXT, "openat"); + if (path && is_blocked(path)) { errno = ENOENT; return -1; } + + va_start(args, flags); + mode = get_mode_if_needed(flags, args); + va_end(args); + return ((flags & O_CREAT) || ((flags & O_TMPFILE) == O_TMPFILE)) + ? real_openat(dirfd, path, flags, mode) + : real_openat(dirfd, path, flags); + } + + int openat64(int dirfd, const char *path, int flags, ...) { + static int (*real_openat64)(int, const char *, int, ...) = NULL; + va_list args; + mode_t mode = 0; + + if (!real_openat64) real_openat64 = dlsym(RTLD_NEXT, "openat64"); + if (path && is_blocked(path)) { errno = ENOENT; return -1; } + + va_start(args, flags); + mode = get_mode_if_needed(flags, args); + va_end(args); + return ((flags & O_CREAT) || ((flags & O_TMPFILE) == O_TMPFILE)) + ? real_openat64(dirfd, path, flags, mode) + : real_openat64(dirfd, path, flags); + } + + FILE *fopen(const char *restrict path, const char *restrict mode) { + static FILE *(*real_fopen)(const char *, const char *) = NULL; + if (!real_fopen) real_fopen = dlsym(RTLD_NEXT, "fopen"); + if (is_blocked(path)) { errno = ENOENT; return NULL; } + return real_fopen(path, mode); + } + """) + + with tempfile.TemporaryDirectory() as tmpdir: + shim_path = os.path.join(tmpdir, "hide_sysfs.c") + shim_so = os.path.join(tmpdir, "hide_sysfs.so") + + with open(shim_path, "w") as f: + f.write(shim_source) + + result = subprocess.run( + ["gcc", "-shared", "-fPIC", "-o", shim_so, shim_path, "-ldl"], + check=False, + capture_output=True, + text=True, + ) + self.assertEqual(result.returncode, 0, f"Shim compilation failed: {result.stderr}") + + env = os.environ.copy() + env["LD_PRELOAD"] = shim_so + + # Try importing onnxruntime with hidden sysfs + ort_script = ( + "import onnxruntime; print('PASS: onnxruntime imported successfully'); " + "print(f'Version: {onnxruntime.__version__}'); " + "print(f'Providers: {onnxruntime.get_available_providers()}')" + ) + result = subprocess.run( + [sys.executable, "-c", ort_script], + check=False, + capture_output=True, + text=True, + timeout=60, + env=env, + ) + self.assertEqual(result.returncode, 0, f"exit code {result.returncode}: {result.stderr}") + self.assertIn("PASS", result.stdout) + + +if __name__ == "__main__": + unittest.main() diff --git a/onnxruntime/test/contrib_ops/rotary_embedding_op_test.cc b/onnxruntime/test/contrib_ops/rotary_embedding_op_test.cc index 1fc410c37da14..880c10137f3fe 100644 --- a/onnxruntime/test/contrib_ops/rotary_embedding_op_test.cc +++ b/onnxruntime/test/contrib_ops/rotary_embedding_op_test.cc @@ -937,10 +937,11 @@ TEST(RotaryEmbeddingTest, ContribRotaryEmbedding_PositionIds_OOB_CUDA_Passthroug test.AddInput("input", {batch_size, sequence_length, hidden_size}, input_data); // position_id = 2048 exceeds max_sequence_length = 8 — CUDA should pass through input unchanged. test.AddInput("position_ids", {batch_size, sequence_length}, {2048}); + // Non-trivial cache values ensure pass-through (output=input) differs from valid rotary output. test.AddInput("cos_cache", {max_sequence_length, head_size / 2}, - std::vector(max_sequence_length * head_size / 2, 1.0f)); + std::vector(max_sequence_length * head_size / 2, 0.5f)); test.AddInput("sin_cache", {max_sequence_length, head_size / 2}, - std::vector(max_sequence_length * head_size / 2, 0.0f)); + std::vector(max_sequence_length * head_size / 2, 0.866f)); // Output should equal input when position_id is OOB (pass-through). 
test.AddOutput("output", {batch_size, sequence_length, hidden_size}, input_data); @@ -1054,5 +1055,122 @@ TEST(RotaryEmbeddingTest, ContribRotaryEmbedding_RejectsRank4MalformedCacheWidth {}, nullptr, &execution_providers); } +// Test that OOB position_ids on WebGPU (format 1) pass through input unchanged (shader-side defense). +TEST(RotaryEmbeddingTest, ContribRotaryEmbedding_PositionIds_OOB_WebGPU_Passthrough) { + if (nullptr == DefaultWebGpuExecutionProvider().get()) { + GTEST_SKIP() << "WebGPU execution provider is not available."; + } + + int batch_size = 1; + int sequence_length = 2; + int num_heads = 2; + int head_size = 4; + int max_sequence_length = 8; + int hidden_size = num_heads * head_size; + + OpTester test("RotaryEmbedding", 1, onnxruntime::kMSDomain); + test.AddAttribute("interleaved", static_cast(0)); + + std::vector input_data(batch_size * sequence_length * hidden_size); + for (size_t i = 0; i < input_data.size(); ++i) { + input_data[i] = static_cast(i + 1); + } + + test.AddInput("input", {batch_size, sequence_length, hidden_size}, input_data); + // Both position_ids exceed max_sequence_length = 8 — shader passes through input unchanged. + test.AddInput("position_ids", {batch_size, sequence_length}, {999, 999}); + // Non-trivial cache values ensure pass-through (output=input) differs from valid rotary output. + test.AddInput("cos_cache", {max_sequence_length, head_size / 2}, + std::vector(max_sequence_length * head_size / 2, 0.5f)); + test.AddInput("sin_cache", {max_sequence_length, head_size / 2}, + std::vector(max_sequence_length * head_size / 2, 0.866f)); + + // Output should equal input when position_id is OOB (pass-through). + test.AddOutput("output", {batch_size, sequence_length, hidden_size}, input_data); + test.SetOutputAbsErr("output", 0.0f); + + std::vector> execution_providers; + execution_providers.push_back(DefaultWebGpuExecutionProvider()); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers); +} + +// Test that format-0 OOB position_ids base offset passes through on WebGPU (shader-side defense). +TEST(RotaryEmbeddingTest, ContribRotaryEmbedding_PositionIds_Format0_OOB_WebGPU_Passthrough) { + if (nullptr == DefaultWebGpuExecutionProvider().get()) { + GTEST_SKIP() << "WebGPU execution provider is not available."; + } + + int batch_size = 1; + int sequence_length = 2; + int num_heads = 2; + int head_size = 4; + int max_sequence_length = 8; + int hidden_size = num_heads * head_size; + + OpTester test("RotaryEmbedding", 1, onnxruntime::kMSDomain); + test.AddAttribute("interleaved", static_cast(0)); + + std::vector input_data(batch_size * sequence_length * hidden_size); + for (size_t i = 0; i < input_data.size(); ++i) { + input_data[i] = static_cast(i + 1); + } + + test.AddInput("input", {batch_size, sequence_length, hidden_size}, input_data); + // Format 0: base offset 8, effective positions = [8, 9] — both OOB for max_sequence_length = 8. + test.AddInput("position_ids", {1}, {8}); + // Non-trivial cache values ensure pass-through (output=input) differs from valid rotary output. + test.AddInput("cos_cache", {max_sequence_length, head_size / 2}, + std::vector(max_sequence_length * head_size / 2, 0.5f)); + test.AddInput("sin_cache", {max_sequence_length, head_size / 2}, + std::vector(max_sequence_length * head_size / 2, 0.866f)); + + // Output should equal input when all positions are OOB (pass-through). 
+ test.AddOutput("output", {batch_size, sequence_length, hidden_size}, input_data); + test.SetOutputAbsErr("output", 0.0f); + + std::vector> execution_providers; + execution_providers.push_back(DefaultWebGpuExecutionProvider()); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers); +} + +// Test that negative position_ids pass through on WebGPU (shader-side defense catches raw_pos < 0). +TEST(RotaryEmbeddingTest, ContribRotaryEmbedding_PositionIds_Negative_WebGPU_Passthrough) { + if (nullptr == DefaultWebGpuExecutionProvider().get()) { + GTEST_SKIP() << "WebGPU execution provider is not available."; + } + + int batch_size = 1; + int sequence_length = 1; + int num_heads = 2; + int head_size = 4; + int max_sequence_length = 8; + int hidden_size = num_heads * head_size; + + OpTester test("RotaryEmbedding", 1, onnxruntime::kMSDomain); + test.AddAttribute("interleaved", static_cast(0)); + + std::vector input_data(hidden_size); + for (int i = 0; i < hidden_size; ++i) { + input_data[i] = static_cast(i + 1); + } + + test.AddInput("input", {batch_size, sequence_length, hidden_size}, input_data); + // Negative position_id — shader checks raw_pos < 0 and passes through. + test.AddInput("position_ids", {batch_size, sequence_length}, {-5}); + // Non-trivial cache values ensure pass-through (output=input) differs from valid rotary output. + test.AddInput("cos_cache", {max_sequence_length, head_size / 2}, + std::vector(max_sequence_length * head_size / 2, 0.5f)); + test.AddInput("sin_cache", {max_sequence_length, head_size / 2}, + std::vector(max_sequence_length * head_size / 2, 0.866f)); + + // Output should equal input when position_id is negative (pass-through). + test.AddOutput("output", {batch_size, sequence_length, hidden_size}, input_data); + test.SetOutputAbsErr("output", 0.0f); + + std::vector> execution_providers; + execution_providers.push_back(DefaultWebGpuExecutionProvider()); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers); +} + } // namespace test } // namespace onnxruntime diff --git a/onnxruntime/test/framework/sparse_kernels_test.cc b/onnxruntime/test/framework/sparse_kernels_test.cc index f97fefb085d84..59ec8f51b4f4e 100644 --- a/onnxruntime/test/framework/sparse_kernels_test.cc +++ b/onnxruntime/test/framework/sparse_kernels_test.cc @@ -2294,6 +2294,251 @@ TEST(SparseTensorConversionTests, SparseTensorProtoToDense_ValuesSizeMismatch_Ra EXPECT_THAT(status.ErrorMessage(), testing::HasSubstr("values data size does not match expected")); } +// Tests for SparseTensorProtoToDenseTensorProto with negative indices (model-loading path) +TEST(SparseTensorConversionTests, SparseTensorProtoToDense_NegativeIndex_Rank1) { + // Dense size 4 + // Index -1 -> negative, out of bounds + ONNX_NAMESPACE::SparseTensorProto sparse; + sparse.mutable_values()->set_name("test_neg_idx"); + sparse.add_dims(4); + + auto* val = sparse.mutable_values(); + val->add_dims(1); + val->set_data_type(ONNX_NAMESPACE::TensorProto_DataType_FLOAT); + val->add_float_data(1.0f); + + auto* ind = sparse.mutable_indices(); + ind->add_dims(1); + ind->set_data_type(ONNX_NAMESPACE::TensorProto_DataType_INT64); + ind->add_int64_data(-1); + + ONNX_NAMESPACE::TensorProto dense; + auto status = utils::SparseTensorProtoToDenseTensorProto(sparse, {}, dense); + EXPECT_FALSE(status.IsOK()); + EXPECT_THAT(status.ErrorMessage(), testing::HasSubstr("index is out of bounds")); +} + +TEST(SparseTensorConversionTests, SparseTensorProtoToDense_NegativeIndex_Rank2) { + // 
Dense Shape [3, 3]
+  // Index [-1, 0] -> negative row, out of bounds
+  ONNX_NAMESPACE::SparseTensorProto sparse;
+  sparse.mutable_values()->set_name("test_neg_idx_2d");
+  sparse.add_dims(3);
+  sparse.add_dims(3);
+
+  auto* val = sparse.mutable_values();
+  val->add_dims(1);
+  val->set_data_type(ONNX_NAMESPACE::TensorProto_DataType_FLOAT);
+  val->add_float_data(1.0f);
+
+  auto* ind = sparse.mutable_indices();
+  ind->add_dims(1);
+  ind->add_dims(2);
+  ind->set_data_type(ONNX_NAMESPACE::TensorProto_DataType_INT64);
+  ind->add_int64_data(-1);
+  ind->add_int64_data(0);
+
+  ONNX_NAMESPACE::TensorProto dense;
+  auto status = utils::SparseTensorProtoToDenseTensorProto(sparse, {}, dense);
+  EXPECT_FALSE(status.IsOK());
+  EXPECT_THAT(status.ErrorMessage(), testing::HasSubstr("index is out of bounds"));
+}
+
+// Tests for SparseCooToDenseTensor and SparseCsrToDenseTensor (sparse_utils.cc paths)
+TEST(SparseTensorConversionTests, SparseCooToDense_NegativeLinearIndex) {
+  auto* cpu_provider = TestCPUExecutionProvider();
+  auto cpu_allocator = cpu_provider->CreatePreferredAllocators()[0];
+
+  DataTransferManager dtm;
+  ASSERT_STATUS_OK(dtm.RegisterDataTransfer(cpu_provider->GetDataTransfer()));
+
+  // Create a SparseTensor with COO format and a negative linear index
+  SparseTensor src(DataTypeImpl::GetType<float>(), TensorShape{3, 3}, cpu_allocator);
+  std::vector<float> values = {1, 2, 3};
+  std::vector<int64_t> bad_indices = {-1, 3, 5};  // -1 is invalid
+
+  ASSERT_STATUS_OK(src.MakeCooData(*cpu_provider->GetDataTransfer(), cpu_allocator->Info(),
+                                   values.size(), values.data(), gsl::make_span(bad_indices)));
+
+  Tensor dense_dst;
+  auto status = sparse_utils::SparseCooToDenseTensor(dtm, src, cpu_allocator, cpu_allocator, dense_dst);
+  EXPECT_FALSE(status.IsOK());
+  EXPECT_THAT(status.ErrorMessage(), testing::HasSubstr("Invalid COO index"));
+}
+
+TEST(SparseTensorConversionTests, SparseCooToDense_UpperBoundLinearIndex) {
+  auto* cpu_provider = TestCPUExecutionProvider();
+  auto cpu_allocator = cpu_provider->CreatePreferredAllocators()[0];
+
+  DataTransferManager dtm;
+  ASSERT_STATUS_OK(dtm.RegisterDataTransfer(cpu_provider->GetDataTransfer()));
+
+  // Dense 3x3 = 9 elements.
Index 9 is out of bounds (valid: 0-8) + SparseTensor src(DataTypeImpl::GetType(), TensorShape{3, 3}, cpu_allocator); + std::vector values = {1, 2, 3}; + std::vector bad_indices = {0, 3, 9}; + + ASSERT_STATUS_OK(src.MakeCooData(*cpu_provider->GetDataTransfer(), cpu_allocator->Info(), + values.size(), values.data(), gsl::make_span(bad_indices))); + + Tensor dense_dst; + auto status = sparse_utils::SparseCooToDenseTensor(dtm, src, cpu_allocator, cpu_allocator, dense_dst); + EXPECT_FALSE(status.IsOK()); + EXPECT_THAT(status.ErrorMessage(), testing::HasSubstr("Invalid COO index")); +} + +TEST(SparseTensorConversionTests, SparseCooToDense_Negative2DIndex) { + auto* cpu_provider = TestCPUExecutionProvider(); + auto cpu_allocator = cpu_provider->CreatePreferredAllocators()[0]; + + DataTransferManager dtm; + ASSERT_STATUS_OK(dtm.RegisterDataTransfer(cpu_provider->GetDataTransfer())); + + // 2D indices: (-1, 0) is invalid + SparseTensor src(DataTypeImpl::GetType(), TensorShape{3, 3}, cpu_allocator); + std::vector values = {1, 2}; + std::vector bad_indices = {-1, 0, 1, 1}; // 2D, first entry has negative row + + ASSERT_STATUS_OK(src.MakeCooData(*cpu_provider->GetDataTransfer(), cpu_allocator->Info(), + values.size(), values.data(), gsl::make_span(bad_indices))); + + Tensor dense_dst; + auto status = sparse_utils::SparseCooToDenseTensor(dtm, src, cpu_allocator, cpu_allocator, dense_dst); + EXPECT_FALSE(status.IsOK()); + EXPECT_THAT(status.ErrorMessage(), testing::HasSubstr("Invalid COO 2D index")); +} + +TEST(SparseTensorConversionTests, SparseCsrToDense_NegativeColumnIndex) { + auto* cpu_provider = TestCPUExecutionProvider(); + auto cpu_allocator = cpu_provider->CreatePreferredAllocators()[0]; + + DataTransferManager dtm; + ASSERT_STATUS_OK(dtm.RegisterDataTransfer(cpu_provider->GetDataTransfer())); + + // 3x3 dense, CSR with a negative column index + SparseTensor src(DataTypeImpl::GetType(), TensorShape{3, 3}, cpu_allocator); + std::vector values = {1, 2, 3}; + std::vector inner = {-1, 0, 2}; // -1 is invalid column + std::vector outer = {0, 1, 2, 3}; + + ASSERT_STATUS_OK(src.MakeCsrData(*cpu_provider->GetDataTransfer(), cpu_allocator->Info(), + values.size(), values.data(), + gsl::make_span(inner), gsl::make_span(outer))); + + Tensor dense_dst; + auto status = sparse_utils::SparseCsrToDenseTensor(dtm, src, cpu_allocator, cpu_allocator, dense_dst); + EXPECT_FALSE(status.IsOK()); + EXPECT_THAT(status.ErrorMessage(), testing::HasSubstr("Invalid CSR column index")); +} + +TEST(SparseTensorConversionTests, SparseCsrToDense_ColumnIndexOutOfBounds) { + auto* cpu_provider = TestCPUExecutionProvider(); + auto cpu_allocator = cpu_provider->CreatePreferredAllocators()[0]; + + DataTransferManager dtm; + ASSERT_STATUS_OK(dtm.RegisterDataTransfer(cpu_provider->GetDataTransfer())); + + // 3x3 dense, CSR with column index 3 (valid: 0-2) + SparseTensor src(DataTypeImpl::GetType(), TensorShape{3, 3}, cpu_allocator); + std::vector values = {1, 2, 3}; + std::vector inner = {1, 3, 1}; // 3 is out of bounds for 3 columns + std::vector outer = {0, 1, 2, 3}; + + ASSERT_STATUS_OK(src.MakeCsrData(*cpu_provider->GetDataTransfer(), cpu_allocator->Info(), + values.size(), values.data(), + gsl::make_span(inner), gsl::make_span(outer))); + + Tensor dense_dst; + auto status = sparse_utils::SparseCsrToDenseTensor(dtm, src, cpu_allocator, cpu_allocator, dense_dst); + EXPECT_FALSE(status.IsOK()); + EXPECT_THAT(status.ErrorMessage(), testing::HasSubstr("Invalid CSR column index")); +} + +// Regression test: SparseCsrToDenseTensor 
must use correct source index for each +// non-zero value. Previously src_idx was never incremented, so all entries got values[0]. +// Using distinct values exposes this bug. +TEST(SparseTensorConversionTests, SparseCsrToDense_DistinctValuesRoundtrip) { + auto* cpu_provider = TestCPUExecutionProvider(); + auto cpu_allocator = cpu_provider->CreatePreferredAllocators()[0]; + + DataTransferManager dtm; + ASSERT_STATUS_OK(dtm.RegisterDataTransfer(cpu_provider->GetDataTransfer())); + + // 3x3 dense matrix: + // 0 0 10 + // 20 0 30 + // 0 0 0 + // CSR: values={10, 20, 30}, inner(col)={2, 0, 2}, outer={0, 1, 3, 3} + SparseTensor src(DataTypeImpl::GetType(), TensorShape{3, 3}, cpu_allocator); + std::vector values = {10, 20, 30}; + std::vector inner = {2, 0, 2}; + std::vector outer = {0, 1, 3, 3}; + + ASSERT_STATUS_OK(src.MakeCsrData(*cpu_provider->GetDataTransfer(), cpu_allocator->Info(), + values.size(), values.data(), + gsl::make_span(inner), gsl::make_span(outer))); + + Tensor dense_dst; + ASSERT_STATUS_OK(sparse_utils::SparseCsrToDenseTensor(dtm, src, cpu_allocator, cpu_allocator, dense_dst)); + + std::vector expected_dense = { + 0, 0, 10, + 20, 0, 30, + 0, 0, 0}; + + auto dense_span = dense_dst.DataAsSpan(); + ASSERT_EQ(dense_span.size(), expected_dense.size()); + for (size_t i = 0; i < expected_dense.size(); ++i) { + EXPECT_EQ(dense_span[i], expected_dense[i]) << "Mismatch at index " << i; + } +} + +// Test that COO 2D validation catches out-of-range column even when +// the linearized index would be in bounds. E.g., for a 3x3 matrix, +// (row=0, col=4) gives linear index 4 which is in [0,9), but col=4 >= cols=3. +TEST(SparseTensorConversionTests, SparseCooToDense_2DColumnOutOfRange) { + auto* cpu_provider = TestCPUExecutionProvider(); + auto cpu_allocator = cpu_provider->CreatePreferredAllocators()[0]; + + DataTransferManager dtm; + ASSERT_STATUS_OK(dtm.RegisterDataTransfer(cpu_provider->GetDataTransfer())); + + SparseTensor src(DataTypeImpl::GetType(), TensorShape{3, 3}, cpu_allocator); + std::vector values = {1}; + // (row=0, col=4): linear index = 0*3+4 = 4, valid linear but col >= cols + std::vector bad_indices = {0, 4}; + + ASSERT_STATUS_OK(src.MakeCooData(*cpu_provider->GetDataTransfer(), cpu_allocator->Info(), + values.size(), values.data(), gsl::make_span(bad_indices))); + + Tensor dense_dst; + auto status = sparse_utils::SparseCooToDenseTensor(dtm, src, cpu_allocator, cpu_allocator, dense_dst); + EXPECT_FALSE(status.IsOK()); + EXPECT_THAT(status.ErrorMessage(), testing::HasSubstr("Invalid COO 2D index")); +} + +// Test that COO 2D validation catches out-of-range row. 
+TEST(SparseTensorConversionTests, SparseCooToDense_2DRowOutOfRange) { + auto* cpu_provider = TestCPUExecutionProvider(); + auto cpu_allocator = cpu_provider->CreatePreferredAllocators()[0]; + + DataTransferManager dtm; + ASSERT_STATUS_OK(dtm.RegisterDataTransfer(cpu_provider->GetDataTransfer())); + + SparseTensor src(DataTypeImpl::GetType(), TensorShape{3, 3}, cpu_allocator); + std::vector values = {1}; + // (row=3, col=0): row >= rows + std::vector bad_indices = {3, 0}; + + ASSERT_STATUS_OK(src.MakeCooData(*cpu_provider->GetDataTransfer(), cpu_allocator->Info(), + values.size(), values.data(), gsl::make_span(bad_indices))); + + Tensor dense_dst; + auto status = sparse_utils::SparseCooToDenseTensor(dtm, src, cpu_allocator, cpu_allocator, dense_dst); + EXPECT_FALSE(status.IsOK()); + EXPECT_THAT(status.ErrorMessage(), testing::HasSubstr("Invalid COO 2D index")); +} + #endif // !defined(DISABLE_SPARSE_TENSORS) } // namespace test } // namespace onnxruntime diff --git a/onnxruntime/test/mlas/bench/riscv64/README.md b/onnxruntime/test/mlas/bench/riscv64/README.md new file mode 100644 index 0000000000000..136c40d39430f --- /dev/null +++ b/onnxruntime/test/mlas/bench/riscv64/README.md @@ -0,0 +1,77 @@ +# RISC-V MLAS Benchmarks + +This directory stores the standalone benchmarks and compare tools used while +bringing up and tuning the RVV path in MLAS. + +Files: + +- `sgemm_riscv_bench.cpp`: standalone SGEMM timing harness with checksum + output. Useful for RVV versus scalar comparisons. +- `softmax_rvv_compare.cpp`: scalar versus RVV validation and timing tool for + the Softmax critical path. + +These tools are intentionally kept separate from `onnxruntime_mlas_benchmark`. +Each source file has its own `main()` and is built as an independent target. + +## Build + +On a riscv64 RVV build, first regenerate the build tree: + +```bash +python3 tools/ci_build/build.py \ + --config Release \ + --build_dir build/k1_rvv_resync \ + --update \ + --skip_tests \ + --skip_pip_install \ + --skip_submodule_sync \ + --no_sve \ + --enable_rvv +``` + +Then build both standalone tools directly with CMake: + +```bash +cmake --build build/k1_rvv_resync/Release \ + --config Release \ + --target onnxruntime_mlas_sgemm_riscv_bench onnxruntime_mlas_softmax_riscv_compare \ + -- -j8 +``` + +The resulting binaries are typically placed under: + +```bash +build/k1_rvv_resync/Release/onnxruntime_mlas_sgemm_riscv_bench +build/k1_rvv_resync/Release/onnxruntime_mlas_softmax_riscv_compare +``` + +## SGEMM examples + +RVV, packed-B: + +```bash +taskset -c 0 build/k1_rvv_resync/Release/onnxruntime_mlas_sgemm_riscv_bench \ + --m=128 --n=3072 --k=768 --iters=10 --warmup=3 --pack_b=1 --trans_a=0 --trans_b=0 +``` + +Scalar baseline on the same binary: + +```bash +ORT_MLAS_RISCV_FORCE_SCALAR=1 taskset -c 0 \ + build/k1_rvv_resync/Release/onnxruntime_mlas_sgemm_riscv_bench \ + --m=128 --n=3072 --k=768 --iters=10 --warmup=3 --pack_b=1 --trans_a=0 --trans_b=0 +``` + +## Softmax examples + +```bash +taskset -c 0 build/k1_rvv_resync/Release/onnxruntime_mlas_softmax_riscv_compare +``` + +## Notes + +- The RVV SGEMM path is written to be VLEN-agnostic. The MLAS packing format + remains 16 columns wide, but each tile is consumed using runtime `vsetvl` + chunking so the same binary works across different VLENs such as 128 and 256. +- `ORT_MLAS_RISCV_FORCE_SCALAR=1` disables the RVV dispatch at runtime and is + the preferred way to gather scalar baselines from the same build. 
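+
+## Comparing RVV against the scalar baseline
+
+The snippet below is a small helper sketch, not part of the build, that runs the
+SGEMM bench twice: once with the RVV dispatch and once with
+`ORT_MLAS_RISCV_FORCE_SCALAR=1`, then compares the `gflops=` and `checksum=`
+lines printed by the tool. The binary path, shape flags, and the 1e-4 checksum
+tolerance are placeholders; adjust them to your build tree and workload.
+Checksums are compared with a tolerance because RVV changes the floating-point
+accumulation order.
+
+```python
+#!/usr/bin/env python3
+"""Sketch: run sgemm_riscv_bench with and without RVV and compare the results."""
+import os
+import re
+import subprocess
+import sys
+
+# Placeholder path and flags; adjust to your build directory and problem size.
+BENCH = "build/k1_rvv_resync/Release/onnxruntime_mlas_sgemm_riscv_bench"
+ARGS = ["--m=128", "--n=3072", "--k=768", "--iters=10", "--warmup=3", "--pack_b=1"]
+
+
+def run(force_scalar: bool) -> dict:
+    env = os.environ.copy()
+    if force_scalar:
+        env["ORT_MLAS_RISCV_FORCE_SCALAR"] = "1"
+    out = subprocess.run([BENCH, *ARGS], env=env, capture_output=True, text=True, check=True).stdout
+    # Pull the "gflops=..." and "checksum=..." values out of the bench output.
+    return {k: float(v) for k, v in re.findall(r"(gflops|checksum)=([-+0-9.eE]+)", out)}
+
+
+def main() -> int:
+    rvv = run(force_scalar=False)
+    scalar = run(force_scalar=True)
+    speedup = rvv["gflops"] / scalar["gflops"]
+    # RVV reorders FP accumulation, so allow a small relative difference in the checksum.
+    rel_diff = abs(rvv["checksum"] - scalar["checksum"]) / max(abs(scalar["checksum"]), 1e-12)
+    print(f"scalar={scalar['gflops']:.3f} GFLOPS  rvv={rvv['gflops']:.3f} GFLOPS  speedup={speedup:.2f}x")
+    print(f"checksum rel diff = {rel_diff:.3e}")
+    return 0 if rel_diff < 1e-4 else 1
+
+
+if __name__ == "__main__":
+    sys.exit(main())
+```
+
+Run it from the repository root on the target board after building the bench
+target described above.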
diff --git a/onnxruntime/test/mlas/bench/riscv64/sgemm_riscv_bench.cpp b/onnxruntime/test/mlas/bench/riscv64/sgemm_riscv_bench.cpp new file mode 100644 index 0000000000000..d94840ffec518 --- /dev/null +++ b/onnxruntime/test/mlas/bench/riscv64/sgemm_riscv_bench.cpp @@ -0,0 +1,240 @@ +/*++ + +Copyright (c) Microsoft Corporation. All rights reserved. + +Licensed under the MIT License. + +Module Name: + + sgemm_riscv_bench.cpp + +Abstract: + + This module implements a standalone SGEMM benchmark used while tuning the + RISC-V MLAS path. It is intentionally separate from the Google Benchmark + suite so it can print pack time, compute time, checksum, and compare RVV + against scalar execution via ORT_MLAS_RISCV_FORCE_SCALAR. + +--*/ + +#include "mlas.h" + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace { + +struct Options { + size_t m = 128; + size_t n = 3072; + size_t k = 768; + size_t iters = 20; + size_t warmup = 3; + bool pack_b = false; + bool trans_a = false; + bool trans_b = false; + float alpha = 1.0f; + float beta = 0.0f; +}; + +void PrintUsage(const char* argv0) { + std::cout + << "Usage: " << argv0 << " [--m=N] [--n=N] [--k=N] [--iters=N] [--warmup=N]\n" + << " [--pack_b=0|1] [--trans_a=0|1] [--trans_b=0|1]\n" + << " [--alpha=F] [--beta=F]\n"; +} + +bool ParseBool(std::string_view value) { + return value == "1" || value == "true" || value == "on" || value == "yes"; +} + +float MakeValue(size_t index) { + uint32_t x = static_cast(index * 747796405u + 2891336453u); + x ^= x >> 16; + x *= 2246822519u; + x ^= x >> 13; + const uint32_t bucket = x % 2048u; + return (static_cast(bucket) / 1024.0f) - 1.0f; +} + +Options ParseArgs(int argc, char** argv) { + Options options; + + for (int i = 1; i < argc; ++i) { + std::string_view arg(argv[i]); + if (arg == "--help" || arg == "-h") { + PrintUsage(argv[0]); + std::exit(0); + } + + const auto split = arg.find('='); + if (split == std::string_view::npos || split == 0 || split + 1 >= arg.size()) { + continue; + } + + const std::string_view key = arg.substr(0, split); + const std::string_view value = arg.substr(split + 1); + + if (key == "--m") { + options.m = std::strtoull(value.data(), nullptr, 10); + } else if (key == "--n") { + options.n = std::strtoull(value.data(), nullptr, 10); + } else if (key == "--k") { + options.k = std::strtoull(value.data(), nullptr, 10); + } else if (key == "--iters") { + options.iters = std::strtoull(value.data(), nullptr, 10); + } else if (key == "--warmup") { + options.warmup = std::strtoull(value.data(), nullptr, 10); + } else if (key == "--pack_b") { + options.pack_b = ParseBool(value); + } else if (key == "--trans_a") { + options.trans_a = ParseBool(value); + } else if (key == "--trans_b") { + options.trans_b = ParseBool(value); + } else if (key == "--alpha") { + options.alpha = std::strtof(value.data(), nullptr); + } else if (key == "--beta") { + options.beta = std::strtof(value.data(), nullptr); + } + } + + return options; +} + +template +double TimeLoop(size_t iterations, Fn&& fn) { + const auto begin = std::chrono::steady_clock::now(); + for (size_t i = 0; i < iterations; ++i) { + fn(); + } + const auto end = std::chrono::steady_clock::now(); + return std::chrono::duration(end - begin).count(); +} + +} // namespace + +int main(int argc, char** argv) { + const Options options = ParseArgs(argc, argv); + + if (options.m == 0 || options.n == 0 || options.k == 0 || options.iters == 0) { + std::cerr << "m, n, k, and iters must be > 0" << std::endl; + return 1; + } + + 
const size_t a_size = options.m * options.k; + const size_t b_size = options.n * options.k; + const size_t c_size = options.m * options.n; + + std::vector a(a_size); + std::vector b(b_size); + std::vector c(c_size, 0.0f); + + for (size_t i = 0; i < a.size(); ++i) { + a[i] = MakeValue(i); + } + for (size_t i = 0; i < b.size(); ++i) { + b[i] = MakeValue(i + a.size()); + } + + const CBLAS_TRANSPOSE trans_a = options.trans_a ? CblasTrans : CblasNoTrans; + const CBLAS_TRANSPOSE trans_b = options.trans_b ? CblasTrans : CblasNoTrans; + const size_t lda = options.trans_a ? options.m : options.k; + const size_t ldb = options.trans_b ? options.k : options.n; + const size_t ldc = options.n; + + std::vector packed_b; + double pack_ms = 0.0; + + if (options.pack_b) { + const size_t packed_b_size = MlasGemmPackBSize(trans_a, trans_b, options.n, options.k, nullptr); + if (packed_b_size == 0) { + std::cerr << "packing is not supported for this configuration" << std::endl; + return 2; + } + + packed_b.resize(packed_b_size); + + pack_ms = TimeLoop(options.iters, [&]() { + MlasGemmPackB(trans_a, trans_b, options.n, options.k, b.data(), ldb, packed_b.data(), nullptr); + }); + + MlasGemmPackB(trans_a, trans_b, options.n, options.k, b.data(), ldb, packed_b.data(), nullptr); + } + + auto run_once = [&]() { + if (options.beta == 0.0f) { + std::fill(c.begin(), c.end(), 0.0f); + } + + if (options.pack_b) { + MlasGemm( + trans_a, + options.m, + options.n, + options.k, + options.alpha, + a.data(), + lda, + packed_b.data(), + options.beta, + c.data(), + ldc, + nullptr, + nullptr); + } else { + MlasGemm( + trans_a, + trans_b, + options.m, + options.n, + options.k, + options.alpha, + a.data(), + lda, + b.data(), + ldb, + options.beta, + c.data(), + ldc, + nullptr, + nullptr); + } + }; + + for (size_t i = 0; i < options.warmup; ++i) { + run_once(); + } + + const double compute_ms = TimeLoop(options.iters, run_once); + const double avg_compute_ms = compute_ms / static_cast(options.iters); + const double avg_pack_ms = pack_ms / static_cast(options.iters); + const double flops = 2.0 * static_cast(options.m) * static_cast(options.n) * + static_cast(options.k); + const double gflops = flops / (avg_compute_ms * 1.0e6); + const double checksum = std::accumulate(c.begin(), c.end(), 0.0); + + std::cout << std::fixed << std::setprecision(4); + std::cout << "M=" << options.m + << " N=" << options.n + << " K=" << options.k + << " pack_b=" << (options.pack_b ? 1 : 0) + << " trans_a=" << (options.trans_a ? 1 : 0) + << " trans_b=" << (options.trans_b ? 1 : 0) + << " iters=" << options.iters + << " warmup=" << options.warmup << '\n'; + if (options.pack_b) { + std::cout << "pack_total_ms=" << pack_ms << " pack_avg_ms=" << avg_pack_ms << '\n'; + } + std::cout << "compute_total_ms=" << compute_ms + << " compute_avg_ms=" << avg_compute_ms + << " gflops=" << gflops << '\n'; + std::cout << "checksum=" << checksum << std::endl; + + return 0; +} diff --git a/onnxruntime/test/mlas/bench/riscv64/softmax_rvv_compare.cpp b/onnxruntime/test/mlas/bench/riscv64/softmax_rvv_compare.cpp new file mode 100644 index 0000000000000..e4411d3920408 --- /dev/null +++ b/onnxruntime/test/mlas/bench/riscv64/softmax_rvv_compare.cpp @@ -0,0 +1,241 @@ +/*++ + +Copyright (c) Microsoft Corporation. All rights reserved. + +Licensed under the MIT License. + +Module Name: + + softmax_rvv_compare.cpp + +Abstract: + + This module implements a standalone RVV versus scalar validation and + timing tool for the Softmax critical path on riscv64. 
+ +--*/ + +#include "mlas.h" + +#include + +#if !defined(MLAS_TARGET_RISCV64) + +int main() { + std::cout << "softmax_rvv_compare is only supported on riscv64." << std::endl; + return 0; +} + +#elif !defined(MLAS_USE_RVV) + +int main() { + std::cout << "softmax_rvv_compare requires an RVV-enabled MLAS build." << std::endl; + return 0; +} + +#else + +#include "core/mlas/lib/mlasi.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace { + +struct CompareStats { + float max_abs_diff = 0.0f; + float max_rel_diff = 0.0f; + double checksum_scalar = 0.0; + double checksum_rvv = 0.0; +}; + +struct TimingStats { + double scalar_ms = 0.0; + double rvv_ms = 0.0; +}; + +void ScalarSoftmaxRow(const float* input, float* output, size_t d, bool log_softmax, bool smooth_softmax) { + float maximum = MlasReduceMaximumF32Kernel(input, d); + if (smooth_softmax && maximum < 0.0f) { + maximum = 0.0f; + } + + const float negative_maximum = -maximum; + + if (log_softmax) { + float accumulation = MlasComputeSumExpF32Kernel(input, nullptr, d, &negative_maximum); + if (smooth_softmax) { + accumulation += std::exp(-maximum); + } + + const float parameters[2] = {negative_maximum, std::log(accumulation)}; + MlasComputeLogSoftmaxOutputF32Kernel(input, output, d, parameters); + return; + } + + float accumulation = MlasComputeSumExpF32Kernel(input, output, d, &negative_maximum); + if (smooth_softmax) { + accumulation += std::exp(-maximum); + } + + const float parameters[1] = {1.0f / accumulation}; + MlasComputeSoftmaxOutputF32Kernel(output, d, parameters); +} + +void RvvSoftmaxRow(const float* input, float* output, size_t d, bool log_softmax, bool smooth_softmax) { + auto& platform = GetMlasPlatform(); + + float maximum = platform.ReduceMaximumF32Kernel(input, d); + if (smooth_softmax && maximum < 0.0f) { + maximum = 0.0f; + } + + const float negative_maximum = -maximum; + + if (log_softmax) { + float accumulation = platform.ComputeSumExpF32Kernel(input, nullptr, d, &negative_maximum); + if (smooth_softmax) { + accumulation += std::exp(-maximum); + } + + const float parameters[2] = {negative_maximum, std::log(accumulation)}; + platform.ComputeLogSoftmaxOutputF32Kernel(input, output, d, parameters); + return; + } + + float accumulation = platform.ComputeSumExpF32Kernel(input, output, d, &negative_maximum); + if (smooth_softmax) { + accumulation += std::exp(-maximum); + } + + const float parameters[1] = {1.0f / accumulation}; + platform.ComputeSoftmaxOutputF32Kernel(output, d, parameters); +} + +CompareStats CompareCase(size_t rows, size_t d, bool log_softmax, bool smooth_softmax) { + std::vector input(rows * d); + std::vector scalar_output(rows * d); + std::vector rvv_output(rows * d); + + std::mt19937 rng( + static_cast(rows * 131 + d * 17 + (log_softmax ? 7 : 0) + (smooth_softmax ? 
19 : 0))); + std::uniform_real_distribution dist(-150.0f, 190.0f); + + for (float& value : input) { + value = dist(rng); + } + + for (size_t row = 0; row < rows; ++row) { + const float* row_input = input.data() + row * d; + ScalarSoftmaxRow(row_input, scalar_output.data() + row * d, d, log_softmax, smooth_softmax); + RvvSoftmaxRow(row_input, rvv_output.data() + row * d, d, log_softmax, smooth_softmax); + } + + CompareStats stats; + for (size_t i = 0; i < rows * d; ++i) { + const float scalar = scalar_output[i]; + const float rvv = rvv_output[i]; + const float abs_diff = std::fabs(scalar - rvv); + const float rel_diff = abs_diff / std::max(std::fabs(scalar), 1.0e-12f); + stats.max_abs_diff = std::max(stats.max_abs_diff, abs_diff); + stats.max_rel_diff = std::max(stats.max_rel_diff, rel_diff); + stats.checksum_scalar += scalar; + stats.checksum_rvv += rvv; + } + + return stats; +} + +TimingStats TimeCase(size_t rows, size_t d, size_t repeats, bool log_softmax, bool smooth_softmax) { + std::vector input(rows * d); + std::vector scalar_output(rows * d); + std::vector rvv_output(rows * d); + + std::mt19937 rng(static_cast(rows * 97 + d * 29 + repeats)); + std::uniform_real_distribution dist(-10.0f, 10.0f); + + for (float& value : input) { + value = dist(rng); + } + + const auto scalar_begin = std::chrono::steady_clock::now(); + for (size_t repeat = 0; repeat < repeats; ++repeat) { + for (size_t row = 0; row < rows; ++row) { + ScalarSoftmaxRow(input.data() + row * d, scalar_output.data() + row * d, d, log_softmax, smooth_softmax); + } + } + const auto scalar_end = std::chrono::steady_clock::now(); + + const auto rvv_begin = std::chrono::steady_clock::now(); + for (size_t repeat = 0; repeat < repeats; ++repeat) { + for (size_t row = 0; row < rows; ++row) { + RvvSoftmaxRow(input.data() + row * d, rvv_output.data() + row * d, d, log_softmax, smooth_softmax); + } + } + const auto rvv_end = std::chrono::steady_clock::now(); + + TimingStats stats; + stats.scalar_ms = + std::chrono::duration_cast >(scalar_end - scalar_begin).count(); + stats.rvv_ms = + std::chrono::duration_cast >(rvv_end - rvv_begin).count(); + return stats; +} + +void PrintCompareCase(const std::string& name, size_t rows, size_t d, bool log_softmax, bool smooth_softmax) { + const auto stats = CompareCase(rows, d, log_softmax, smooth_softmax); + std::cout << name << " rows=" << rows << " d=" << d << " log_softmax=" << log_softmax + << " smooth=" << smooth_softmax << '\n'; + std::cout << " max_abs_diff=" << std::setprecision(9) << stats.max_abs_diff + << " max_rel_diff=" << stats.max_rel_diff << '\n'; + std::cout << " checksum_scalar=" << std::setprecision(12) << stats.checksum_scalar + << " checksum_rvv=" << stats.checksum_rvv << '\n'; +} + +void PrintTimingCase( + const std::string& name, size_t rows, size_t d, size_t repeats, bool log_softmax, bool smooth_softmax) { + const auto stats = TimeCase(rows, d, repeats, log_softmax, smooth_softmax); + const double speedup = stats.rvv_ms > 0.0 ? 
stats.scalar_ms / stats.rvv_ms : 0.0; + std::cout << name << " rows=" << rows << " d=" << d << " repeats=" << repeats + << " log_softmax=" << log_softmax << " smooth=" << smooth_softmax << '\n'; + std::cout << " scalar_ms=" << std::fixed << std::setprecision(3) << stats.scalar_ms + << " rvv_ms=" << stats.rvv_ms << " speedup=" << speedup << "x\n"; +} + +} // namespace + +int main() { + auto& platform = GetMlasPlatform(); + + std::cout << std::boolalpha; + std::cout << "dispatch_is_rvv_reduce=" + << (platform.ReduceMaximumF32Kernel == MlasReduceMaximumF32KernelRvv) << '\n'; + std::cout << "dispatch_is_rvv_sumexp=" + << (platform.ComputeSumExpF32Kernel == MlasComputeSumExpF32KernelRvv) << '\n'; + std::cout << "dispatch_is_rvv_softmax=" + << (platform.ComputeSoftmaxOutputF32Kernel == MlasComputeSoftmaxOutputF32KernelRvv) << '\n'; + std::cout << "dispatch_is_rvv_logsoftmax=" + << (platform.ComputeLogSoftmaxOutputF32Kernel == MlasComputeLogSoftmaxOutputF32KernelRvv) << '\n'; + std::cout << '\n'; + + PrintCompareCase("regression_case_3x128_softmax", 3, 128, false, true); + PrintCompareCase("regression_case_3x128_logsoftmax", 3, 128, true, true); + PrintCompareCase("regression_case_63x95_softmax", 63, 95, false, true); + PrintCompareCase("regression_case_16x211_softmax", 16, 211, false, true); + std::cout << '\n'; + + PrintTimingCase("perf_case_attention_like", 4096, 128, 100, false, true); + PrintTimingCase("perf_case_long_seq", 1024, 1024, 20, false, true); + + return 0; +} + +#endif diff --git a/onnxruntime/test/providers/coreml/coreml_basic_test.cc b/onnxruntime/test/providers/coreml/coreml_basic_test.cc index b60615e0c967f..f56c81d2e89de 100644 --- a/onnxruntime/test/providers/coreml/coreml_basic_test.cc +++ b/onnxruntime/test/providers/coreml/coreml_basic_test.cc @@ -946,6 +946,224 @@ TEST(CoreMLExecutionProviderTest, HardSigmoidTest) { TestModelLoad(model_span, MakeCoreMLExecutionProvider("MLProgram"), ExpectedEPNodeAssignment::All); #endif } + +TEST(CoreMLExecutionProviderTest, QuickGeluTest) { + // Single com.microsoft:QuickGelu node (produced by ORT's QuickGeluFusion pass + // from the pattern x * sigmoid(alpha * x)). Verify the CoreML MLProgram path + // claims the whole graph and produces the same output as CPU. 
+ ONNX_NAMESPACE::ModelProto model_proto; + model_proto.set_ir_version(ONNX_NAMESPACE::IR_VERSION); + auto* onnx_opset = model_proto.add_opset_import(); + onnx_opset->set_domain(""); + onnx_opset->set_version(13); + auto* ms_opset = model_proto.add_opset_import(); + ms_opset->set_domain("com.microsoft"); + ms_opset->set_version(1); + + auto* graph_proto = model_proto.mutable_graph(); + graph_proto->set_name("quick_gelu_test"); + + auto* input = graph_proto->add_input(); + input->set_name("X"); + auto* input_shape = input->mutable_type()->mutable_tensor_type()->mutable_shape(); + input->mutable_type()->mutable_tensor_type()->set_elem_type(ONNX_NAMESPACE::TensorProto_DataType_FLOAT); + input_shape->add_dim()->set_dim_value(1); + input_shape->add_dim()->set_dim_value(3); + input_shape->add_dim()->set_dim_value(2); + input_shape->add_dim()->set_dim_value(4); + + auto* output = graph_proto->add_output(); + output->set_name("Y"); + auto* output_shape = output->mutable_type()->mutable_tensor_type()->mutable_shape(); + output->mutable_type()->mutable_tensor_type()->set_elem_type(ONNX_NAMESPACE::TensorProto_DataType_FLOAT); + output_shape->add_dim()->set_dim_value(1); + output_shape->add_dim()->set_dim_value(3); + output_shape->add_dim()->set_dim_value(2); + output_shape->add_dim()->set_dim_value(4); + + auto* node = graph_proto->add_node(); + node->set_op_type("QuickGelu"); + node->set_domain("com.microsoft"); + node->add_input("X"); + node->add_output("Y"); + // Use a non-default alpha so the test catches any attribute-wiring bug. + auto* alpha_attr = node->add_attribute(); + alpha_attr->set_name("alpha"); + alpha_attr->set_type(ONNX_NAMESPACE::AttributeProto_AttributeType_FLOAT); + alpha_attr->set_f(1.5f); + + std::string model_data; + ASSERT_TRUE(model_proto.SerializeToString(&model_data)); + gsl::span model_span{reinterpret_cast(model_data.data()), model_data.size()}; + +#if defined(__APPLE__) + std::vector dims = {1, 3, 2, 4}; + std::vector input_data = {-10.0f, -3.0f, -1.0f, -0.5f, 0.0f, 0.5f, 1.0f, 3.0f, + 10.0f, -5.0f, 5.0f, 2.0f, -2.0f, 4.0f, -4.0f, 0.25f, + -0.25f, 7.0f, -7.0f, 1.5f, -1.5f, 0.1f, -0.1f, 20.0f}; + OrtValue ml_value_x; + AllocatorPtr allocator = CPUAllocator::DefaultInstance(); + CreateMLValue(allocator, dims, input_data, &ml_value_x); + + NameMLValMap feeds; + feeds.insert(std::make_pair("X", ml_value_x)); + + RunAndVerifyOutputsWithEP(model_span, "QuickGeluTest_MLProgram", + MakeCoreMLExecutionProvider("MLProgram"), + feeds, + EPVerificationParams{ExpectedEPNodeAssignment::All}); +#else + TestModelLoad(model_span, MakeCoreMLExecutionProvider("MLProgram"), ExpectedEPNodeAssignment::All); +#endif +} + +TEST(CoreMLExecutionProviderTest, QuickGeluTestAlphaOne) { + // alpha == 1.0 triggers the "skip leading mul" optimization in the op + // builder. Verify correctness on that branch — the emitted MIL graph is + // sigmoid(x) -> mul(x, sigmoid(x)) instead of the 3-op decomposition. 
+ ONNX_NAMESPACE::ModelProto model_proto; + model_proto.set_ir_version(ONNX_NAMESPACE::IR_VERSION); + auto* onnx_opset = model_proto.add_opset_import(); + onnx_opset->set_domain(""); + onnx_opset->set_version(13); + auto* ms_opset = model_proto.add_opset_import(); + ms_opset->set_domain("com.microsoft"); + ms_opset->set_version(1); + + auto* graph_proto = model_proto.mutable_graph(); + graph_proto->set_name("quick_gelu_alpha_one_test"); + + auto* input = graph_proto->add_input(); + input->set_name("X"); + auto* input_shape = input->mutable_type()->mutable_tensor_type()->mutable_shape(); + input->mutable_type()->mutable_tensor_type()->set_elem_type(ONNX_NAMESPACE::TensorProto_DataType_FLOAT); + input_shape->add_dim()->set_dim_value(1); + input_shape->add_dim()->set_dim_value(3); + input_shape->add_dim()->set_dim_value(2); + input_shape->add_dim()->set_dim_value(4); + + auto* output = graph_proto->add_output(); + output->set_name("Y"); + auto* output_shape = output->mutable_type()->mutable_tensor_type()->mutable_shape(); + output->mutable_type()->mutable_tensor_type()->set_elem_type(ONNX_NAMESPACE::TensorProto_DataType_FLOAT); + output_shape->add_dim()->set_dim_value(1); + output_shape->add_dim()->set_dim_value(3); + output_shape->add_dim()->set_dim_value(2); + output_shape->add_dim()->set_dim_value(4); + + auto* node = graph_proto->add_node(); + node->set_op_type("QuickGelu"); + node->set_domain("com.microsoft"); + node->add_input("X"); + node->add_output("Y"); + auto* alpha_attr = node->add_attribute(); + alpha_attr->set_name("alpha"); + alpha_attr->set_type(ONNX_NAMESPACE::AttributeProto_AttributeType_FLOAT); + alpha_attr->set_f(1.0f); + + std::string model_data; + ASSERT_TRUE(model_proto.SerializeToString(&model_data)); + gsl::span model_span{reinterpret_cast(model_data.data()), model_data.size()}; + +#if defined(__APPLE__) + std::vector dims = {1, 3, 2, 4}; + std::vector input_data = {-10.0f, -3.0f, -1.0f, -0.5f, 0.0f, 0.5f, 1.0f, 3.0f, + 10.0f, -5.0f, 5.0f, 2.0f, -2.0f, 4.0f, -4.0f, 0.25f, + -0.25f, 7.0f, -7.0f, 1.5f, -1.5f, 0.1f, -0.1f, 20.0f}; + OrtValue ml_value_x; + AllocatorPtr allocator = CPUAllocator::DefaultInstance(); + CreateMLValue(allocator, dims, input_data, &ml_value_x); + + NameMLValMap feeds; + feeds.insert(std::make_pair("X", ml_value_x)); + + RunAndVerifyOutputsWithEP(model_span, "QuickGeluTestAlphaOne_MLProgram", + MakeCoreMLExecutionProvider("MLProgram"), + feeds, + EPVerificationParams{ExpectedEPNodeAssignment::All}); +#else + TestModelLoad(model_span, MakeCoreMLExecutionProvider("MLProgram"), ExpectedEPNodeAssignment::All); +#endif +} + +TEST(CoreMLExecutionProviderTest, QuickGeluTestFp16) { + // FLOAT16 variant of QuickGeluTest. Exercises the MLFloat16 branch of the + // alpha-scalar wiring in QuickGeluOpBuilder::AddToModelBuilderImpl. 
+ ONNX_NAMESPACE::ModelProto model_proto; + model_proto.set_ir_version(ONNX_NAMESPACE::IR_VERSION); + auto* onnx_opset = model_proto.add_opset_import(); + onnx_opset->set_domain(""); + onnx_opset->set_version(13); + auto* ms_opset = model_proto.add_opset_import(); + ms_opset->set_domain("com.microsoft"); + ms_opset->set_version(1); + + auto* graph_proto = model_proto.mutable_graph(); + graph_proto->set_name("quick_gelu_fp16_test"); + + auto* input = graph_proto->add_input(); + input->set_name("X"); + auto* input_shape = input->mutable_type()->mutable_tensor_type()->mutable_shape(); + input->mutable_type()->mutable_tensor_type()->set_elem_type(ONNX_NAMESPACE::TensorProto_DataType_FLOAT16); + input_shape->add_dim()->set_dim_value(1); + input_shape->add_dim()->set_dim_value(3); + input_shape->add_dim()->set_dim_value(2); + input_shape->add_dim()->set_dim_value(4); + + auto* output = graph_proto->add_output(); + output->set_name("Y"); + auto* output_shape = output->mutable_type()->mutable_tensor_type()->mutable_shape(); + output->mutable_type()->mutable_tensor_type()->set_elem_type(ONNX_NAMESPACE::TensorProto_DataType_FLOAT16); + output_shape->add_dim()->set_dim_value(1); + output_shape->add_dim()->set_dim_value(3); + output_shape->add_dim()->set_dim_value(2); + output_shape->add_dim()->set_dim_value(4); + + auto* node = graph_proto->add_node(); + node->set_op_type("QuickGelu"); + node->set_domain("com.microsoft"); + node->add_input("X"); + node->add_output("Y"); + auto* alpha_attr = node->add_attribute(); + alpha_attr->set_name("alpha"); + alpha_attr->set_type(ONNX_NAMESPACE::AttributeProto_AttributeType_FLOAT); + alpha_attr->set_f(1.5f); + + std::string model_data; + ASSERT_TRUE(model_proto.SerializeToString(&model_data)); + gsl::span model_span{reinterpret_cast(model_data.data()), model_data.size()}; + +#if defined(__APPLE__) + std::vector dims = {1, 3, 2, 4}; + const std::vector input_floats = {-10.0f, -3.0f, -1.0f, -0.5f, 0.0f, 0.5f, 1.0f, 3.0f, + 10.0f, -5.0f, 5.0f, 2.0f, -2.0f, 4.0f, -4.0f, 0.25f, + -0.25f, 7.0f, -7.0f, 1.5f, -1.5f, 0.1f, -0.1f, 20.0f}; + std::vector input_data; + input_data.reserve(input_floats.size()); + for (float f : input_floats) input_data.emplace_back(f); + + OrtValue ml_value_x; + AllocatorPtr allocator = CPUAllocator::DefaultInstance(); + CreateMLValue(allocator, dims, input_data, &ml_value_x); + + NameMLValMap feeds; + feeds.insert(std::make_pair("X", ml_value_x)); + + EPVerificationParams params{}; + params.ep_node_assignment = ExpectedEPNodeAssignment::All; + // fp16 accumulates larger absolute error than fp32 across the three-op + // decomposition (mul, sigmoid, mul). Outputs are bounded by |x|, max ~20 in + // this test; fp16 ulp at that magnitude is ~0.01, so 2e-2 leaves headroom. 
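+ // (Concretely, the fp16 spacing for values in [16, 32) is 2^-6 ≈ 0.0156.)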
+ params.fp32_abs_err = 2e-2f; + + RunAndVerifyOutputsWithEP(model_span, "QuickGeluTestFp16_MLProgram", + MakeCoreMLExecutionProvider("MLProgram"), + feeds, params); +#else + TestModelLoad(model_span, MakeCoreMLExecutionProvider("MLProgram"), ExpectedEPNodeAssignment::All); +#endif +} + #endif // !(ORT_MINIMAL_BUILD) } // namespace test } // namespace onnxruntime diff --git a/onnxruntime/test/providers/cpu/llm/rotary_embedding_op_test.cc b/onnxruntime/test/providers/cpu/llm/rotary_embedding_op_test.cc index 6a3b0d8160d53..2f51b8a7a5690 100644 --- a/onnxruntime/test/providers/cpu/llm/rotary_embedding_op_test.cc +++ b/onnxruntime/test/providers/cpu/llm/rotary_embedding_op_test.cc @@ -1208,10 +1208,11 @@ TEST(RotaryEmbeddingTest, RotaryEmbedding_PositionIds_OOB_CUDA_Passthrough) { } test.AddInput("input", {batch_size, sequence_length, hidden_size}, input_data); + // Non-trivial cache values ensure pass-through (output=input) differs from valid rotary output. test.AddInput("cos_cache", {max_sequence_length, head_size / 2}, - std::vector(max_sequence_length * head_size / 2, 1.0f)); + std::vector(max_sequence_length * head_size / 2, 0.5f)); test.AddInput("sin_cache", {max_sequence_length, head_size / 2}, - std::vector(max_sequence_length * head_size / 2, 0.0f)); + std::vector(max_sequence_length * head_size / 2, 0.866f)); // position_id = 2048 exceeds max_sequence_length = 8 — CUDA should pass through input unchanged. test.AddInput("position_ids", {batch_size, sequence_length}, {2048}); @@ -1291,5 +1292,125 @@ TEST(RotaryEmbeddingTest, RotaryEmbedding_RejectsRank3HiddenSizeNotDivisibleByNu "hidden_size=5 must be divisible by num_heads=2 for rank-3 input", {}, nullptr, &execution_providers); } +// Test that OOB position_ids on WebGPU pass through input unchanged (shader-side defense). +TEST(RotaryEmbeddingTest, RotaryEmbedding_PositionIds_OOB_WebGPU_Passthrough) { + if (nullptr == DefaultWebGpuExecutionProvider().get()) { + GTEST_SKIP() << "WebGPU execution provider is not available."; + } + + int batch_size = 1; + int sequence_length = 1; + int num_heads = 2; + int head_size = 4; + int max_sequence_length = 8; + int hidden_size = num_heads * head_size; + + OpTester test("RotaryEmbedding", 23, onnxruntime::kOnnxDomain); + test.AddAttribute("interleaved", static_cast(0)); + test.AddAttribute("num_heads", static_cast(num_heads)); + + std::vector input_data(hidden_size); + for (int i = 0; i < hidden_size; ++i) { + input_data[i] = static_cast(i + 1); + } + + test.AddInput("input", {batch_size, sequence_length, hidden_size}, input_data); + // Non-trivial cache values ensure pass-through (output=input) differs from valid rotary output. + test.AddInput("cos_cache", {max_sequence_length, head_size / 2}, + std::vector(max_sequence_length * head_size / 2, 0.5f)); + test.AddInput("sin_cache", {max_sequence_length, head_size / 2}, + std::vector(max_sequence_length * head_size / 2, 0.866f)); + // position_id = 2048 exceeds max_sequence_length = 8 — shader passes through input unchanged. + test.AddInput("position_ids", {batch_size, sequence_length}, {2048}); + + // Output should equal input when position_id is OOB (pass-through). 
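+ // SetOutputAbsErr(0.0f) below requires an exact match, so any rotation applied despite the OOB id would fail the test.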
+ test.AddOutput<float>("output", {batch_size, sequence_length, hidden_size}, input_data);
+ test.SetOutputAbsErr("output", 0.0f);
+
+ std::vector<std::unique_ptr<IExecutionProvider>> execution_providers;
+ execution_providers.push_back(DefaultWebGpuExecutionProvider());
+ test.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers);
+}
+
+// Test that negative position_ids pass through on WebGPU (shader-side defense catches raw_pos < 0).
+TEST(RotaryEmbeddingTest, RotaryEmbedding_PositionIds_Negative_WebGPU_Passthrough) {
+ if (nullptr == DefaultWebGpuExecutionProvider().get()) {
+ GTEST_SKIP() << "WebGPU execution provider is not available.";
+ }
+
+ int batch_size = 1;
+ int sequence_length = 1;
+ int num_heads = 2;
+ int head_size = 4;
+ int max_sequence_length = 8;
+ int hidden_size = num_heads * head_size;
+
+ OpTester test("RotaryEmbedding", 23, onnxruntime::kOnnxDomain);
+ test.AddAttribute("interleaved", static_cast<int64_t>(0));
+ test.AddAttribute("num_heads", static_cast<int64_t>(num_heads));
+
+ std::vector<float> input_data(hidden_size);
+ for (int i = 0; i < hidden_size; ++i) {
+ input_data[i] = static_cast<float>(i + 1);
+ }
+
+ test.AddInput<float>("input", {batch_size, sequence_length, hidden_size}, input_data);
+ // Non-trivial cache values ensure pass-through (output=input) differs from valid rotary output.
+ test.AddInput<float>("cos_cache", {max_sequence_length, head_size / 2},
+ std::vector<float>(max_sequence_length * head_size / 2, 0.5f));
+ test.AddInput<float>("sin_cache", {max_sequence_length, head_size / 2},
+ std::vector<float>(max_sequence_length * head_size / 2, 0.866f));
+ // Negative position_id — shader checks raw_pos < 0 and passes through.
+ test.AddInput<int64_t>("position_ids", {batch_size, sequence_length}, {-1});
+
+ // Output should equal input when position_id is negative (pass-through).
+ test.AddOutput<float>("output", {batch_size, sequence_length, hidden_size}, input_data);
+ test.SetOutputAbsErr("output", 0.0f);
+
+ std::vector<std::unique_ptr<IExecutionProvider>> execution_providers;
+ execution_providers.push_back(DefaultWebGpuExecutionProvider());
+ test.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers);
+}
+
+// Test that OOB position_ids in a batch pass through on WebGPU (shader-side defense).
+TEST(RotaryEmbeddingTest, RotaryEmbedding_PositionIds_OOB_InBatch_WebGPU_Passthrough) {
+ if (nullptr == DefaultWebGpuExecutionProvider().get()) {
+ GTEST_SKIP() << "WebGPU execution provider is not available.";
+ }
+
+ int batch_size = 2;
+ int sequence_length = 2;
+ int num_heads = 2;
+ int head_size = 4;
+ int max_sequence_length = 8;
+ int hidden_size = num_heads * head_size;
+
+ OpTester test("RotaryEmbedding", 23, onnxruntime::kOnnxDomain);
+ test.AddAttribute("interleaved", static_cast<int64_t>(0));
+ test.AddAttribute("num_heads", static_cast<int64_t>(num_heads));
+
+ std::vector<float> input_data(batch_size * sequence_length * hidden_size);
+ for (size_t i = 0; i < input_data.size(); ++i) {
+ input_data[i] = static_cast<float>(i + 1);
+ }
+
+ test.AddInput<float>("input", {batch_size, sequence_length, hidden_size}, input_data);
+ // Non-trivial cache values ensure pass-through (output=input) differs from valid rotary output.
+ test.AddInput<float>("cos_cache", {max_sequence_length, head_size / 2},
+ std::vector<float>(max_sequence_length * head_size / 2, 0.5f));
+ test.AddInput<float>("sin_cache", {max_sequence_length, head_size / 2},
+ std::vector<float>(max_sequence_length * head_size / 2, 0.866f));
+ // All OOB position_ids — shader passes through input unchanged.
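+ // (max_sequence_length is 8, so every id below is out of range.)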
+ test.AddInput("position_ids", {batch_size, sequence_length}, {100, 200, 300, 400}); + + // Output should equal input when all position_ids are OOB (pass-through). + test.AddOutput("output", {batch_size, sequence_length, hidden_size}, input_data); + test.SetOutputAbsErr("output", 0.0f); + + std::vector> execution_providers; + execution_providers.push_back(DefaultWebGpuExecutionProvider()); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers); +} + } // namespace test } // namespace onnxruntime diff --git a/onnxruntime/test/providers/cpu/math/matmul_test.cc b/onnxruntime/test/providers/cpu/math/matmul_test.cc index 45b961ee21849..f624ecf57d05e 100644 --- a/onnxruntime/test/providers/cpu/math/matmul_test.cc +++ b/onnxruntime/test/providers/cpu/math/matmul_test.cc @@ -181,6 +181,38 @@ std::vector> GenerateTestCases() { // clang-format on })}); + test_cases.push_back( + {"test 3D tensors with batchA = 3, M = 2, N = 3", + {3, 2, 8}, + {1, 8, 3}, + {3, 2, 3}, + real_expected_vals({ + // clang-format off + 420, 448, 476, + 1092, 1184, 1276, + 1764, 1920, 2076, + 2436, 2656, 2876, + 3108, 3392, 3676, + 3780, 4128, 4476, + // clang-format on + })}); + + test_cases.push_back( + {"test 3D tensors with batchA = 3, M = 2, N = 4", + {3, 2, 8}, + {1, 8, 4}, + {3, 2, 4}, + real_expected_vals({ + // clang-format off + 560, 588, 616, 644, + 1456, 1548, 1640, 1732, + 2352, 2508, 2664, 2820, + 3248, 3468, 3688, 3908, + 4144, 4428, 4712, 4996, + 5040, 5388, 5736, 6084, + // clang-format on + })}); + test_cases.push_back( {"test 4D tensors with M = 1", {2, 3, 1, 8}, @@ -598,6 +630,63 @@ TEST(MathOpTest, MatMulSharedPrepackedWeights) { } } +// Test MatMul with batch_size > 1 that exercises the Split-K path. +// Split-K is triggered when dim_inner is large relative to dim_a_outer * dim_b_outer, +// is_vec4 is true, and the GPU supports it. This test validates correctness when +// batch_size > 1 with dimensions that would trigger Split-K on supported hardware. +TEST(MathOpTest, MatMulBatchedSplitK) { + // Dimensions chosen so dim_inner is large (triggers Split-K) and vec4-compatible. + // batch=2, M=4, K=768, N=64 + constexpr int64_t batch = 2; + constexpr int64_t M = 4; + constexpr int64_t K = 768; + constexpr int64_t N = 64; + + std::vector A_shape = {batch, M, K}; + std::vector B_shape = {batch, K, N}; + std::vector Y_shape = {batch, M, N}; + + // Generate sequential data so the expected output is deterministic. + int64_t a_size = batch * M * K; + int64_t b_size = batch * K * N; + std::vector A_data(a_size); + std::vector B_data(b_size); + + // Use small values to avoid fp32 overflow. + for (int64_t i = 0; i < a_size; ++i) { + A_data[i] = static_cast((i % 11) - 5) * 0.01f; + } + for (int64_t i = 0; i < b_size; ++i) { + B_data[i] = static_cast((i % 13) - 6) * 0.01f; + } + + // Compute expected output on CPU. + std::vector expected(batch * M * N, 0.0f); + for (int64_t b_idx = 0; b_idx < batch; ++b_idx) { + for (int64_t m = 0; m < M; ++m) { + for (int64_t n = 0; n < N; ++n) { + float sum = 0.0f; + for (int64_t k = 0; k < K; ++k) { + float a_val = A_data[b_idx * M * K + m * K + k]; + float b_val = B_data[b_idx * K * N + k * N + n]; + sum += a_val * b_val; + } + expected[b_idx * M * N + m * N + n] = sum; + } + } + } + + OpTester test("MatMul", 13); + test.AddInput("A", A_shape, A_data); + test.AddInput("B", B_shape, B_data); + test.AddOutput("Y", Y_shape, expected); + + // Exclude providers that don't support this configuration. 
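+ // The remaining EPs must reproduce the CPU reference computed above within the default tolerance.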
+ test.ConfigExcludeEps({kTensorrtExecutionProvider, kOpenVINOExecutionProvider, kQnnExecutionProvider}) + .Config(run_with_tunable_op) + .RunWithConfig(); +} + #endif } // namespace test diff --git a/onnxruntime/test/providers/cpu/ml/linearclassifer_test.cc b/onnxruntime/test/providers/cpu/ml/linearclassifer_test.cc index 6f80b6f1dfb7a..8083874213b1f 100644 --- a/onnxruntime/test/providers/cpu/ml/linearclassifer_test.cc +++ b/onnxruntime/test/providers/cpu/ml/linearclassifer_test.cc @@ -129,7 +129,7 @@ TEST(MLOpTest, LinearClassifierBinaryWithLabels) { TEST(MLOpTest, LinearClassifierInvalidCoefficientsSize) { OpTester test("LinearClassifier", 1, onnxruntime::kMLDomain); - test.AddAttribute("coefficients", std::vector{1.f, 2.f}); + test.AddAttribute("coefficients", std::vector{1.f, 2.f, 3.f}); test.AddAttribute("intercepts", std::vector{0.f, 0.f}); test.AddAttribute("classlabels_ints", std::vector{0, 1}); @@ -202,6 +202,26 @@ TEST(MLOpTest, LinearClassifierInvalidCoefficientsSizeFails) { "coefficients size (3) is less than class_count (3) * num_features (2)"); } +TEST(MLOpTest, LinearClassifierExtraCoefficientsAreIgnored) { + OpTester test("LinearClassifier", 1, onnxruntime::kMLDomain); + + std::vector coefficients = {-0.22562418f, 0.34188559f, 0.68346153f, + -0.68051993f, -0.1975279f, 0.03748541f, + 101.f, 102.f, 103.f}; + std::vector classes = {1, 2, 3}; + std::vector intercepts = {-3.91601811f, 0.42575697f, 0.13731251f}; + + test.AddAttribute("coefficients", coefficients); + test.AddAttribute("intercepts", intercepts); + test.AddAttribute("classlabels_ints", classes); + + test.AddInput("X", {1, 2}, {1.f, 0.f}); + test.AddOutput("Y", {1}, {2LL}); + test.AddOutput("Z", {1, 3}, {-4.14164229f, 1.1092185f, -0.06021539f}); + + test.Run(); +} + // Regression test: coefficients not divisible by class_count. 
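+// The expected failure message below reflects the updated size check wording ("is less than class_count * num_features").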
TEST(MLOpTest, LinearClassifierCoefficientsSizeNotDivisibleByClassCountFails) { OpTester test("LinearClassifier", 1, onnxruntime::kMLDomain); @@ -220,7 +240,27 @@ TEST(MLOpTest, LinearClassifierCoefficientsSizeNotDivisibleByClassCountFails) { test.AddOutput("Z", {1, 3}, {0.f, 0.f, 0.f}); test.Run(OpTester::ExpectResult::kExpectFailure, - "coefficients size (5) must be a multiple of the number of classes (3)"); + "coefficients size (5) is less than class_count (3) * num_features (2)"); +} + +TEST(MLOpTest, LinearClassifierInputFeatureCountMismatchFails) { + OpTester test("LinearClassifier", 1, onnxruntime::kMLDomain); + + std::vector coefficients = {-0.22562418f, 0.34188559f, 0.68346153f, + -0.68051993f, -0.1975279f, 0.03748541f}; + std::vector classes = {1, 2, 3}; + std::vector intercepts = {-3.91601811f, 0.42575697f, 0.13731251f}; + + test.AddAttribute("coefficients", coefficients); + test.AddAttribute("intercepts", intercepts); + test.AddAttribute("classlabels_ints", classes); + + test.AddInput("X", {1, 3}, {1.f, 0.f, 0.f}); + test.AddOutput("Y", {1}, {0LL}); + test.AddOutput("Z", {1, 3}, {0.f, 0.f, 0.f}); + + test.Run(OpTester::ExpectResult::kExpectFailure, + "coefficients size (6) is less than class_count (3) * num_features (3)"); } } // namespace test } // namespace onnxruntime diff --git a/onnxruntime/test/providers/cpu/ml/svmclassifier_test.cc b/onnxruntime/test/providers/cpu/ml/svmclassifier_test.cc index 3c5b71b90b4b8..2c89c03b6791b 100644 --- a/onnxruntime/test/providers/cpu/ml/svmclassifier_test.cc +++ b/onnxruntime/test/providers/cpu/ml/svmclassifier_test.cc @@ -297,6 +297,31 @@ TEST(MLOpTest, SVMClassifierUndersizedCoefficients) { test.Run(OpTester::ExpectResult::kExpectFailure, "coefficients attribute size"); } +TEST(MLOpTest, SVMClassifierVectorsPerClassSizeMismatch) { + OpTester test("SVMClassifier", 1, onnxruntime::kMLDomain); + + std::vector coefficients = {1.f, 1.f, 1.f, 1.f}; + std::vector support_vectors = {0.1f, 0.1f, 0.1f, 0.1f, 0.1f, 0.1f, 0.1f, 0.1f}; + std::vector rho = {0.1f, 0.1f, 0.1f}; + std::vector kernel_params = {0.01f, 0.f, 3.f}; + std::vector classes = {0, 1, 2}; + std::vector vectors_per_class = {1, 1}; // needs one entry per class + + test.AddAttribute("kernel_type", std::string("RBF")); + test.AddAttribute("coefficients", coefficients); + test.AddAttribute("support_vectors", support_vectors); + test.AddAttribute("vectors_per_class", vectors_per_class); + test.AddAttribute("rho", rho); + test.AddAttribute("kernel_params", kernel_params); + test.AddAttribute("classlabels_ints", classes); + + test.AddInput("X", {1, 4}, {0.f, 0.f, 0.f, 0.f}); + test.AddOutput("Y", {1}, {1}); + test.AddOutput("Z", {1, 3}, {0.f, 0.f, 0.f}); + + test.Run(OpTester::ExpectResult::kExpectFailure, "vectors_per_class"); +} + TEST(MLOpTest, SVMClassifierInvalidInputFeatureCount) { OpTester test("SVMClassifier", 1, onnxruntime::kMLDomain); @@ -438,5 +463,47 @@ TEST(MLOpTest, SVMClassifierDifferentSizeKernelParameters) { test.Run(OpTester::ExpectResult::kExpectFailure, "kernel_params must be empty or have 3 values"); } +TEST(MLOpTest, SVMClassifierSVCLinearUndersizedVectorPerClass) { + OpTester test("SVMClassifier", 1, onnxruntime::kMLDomain); + + std::vector coefficients = {0.766398549079895f, 0.0871576070785522f, 0.110420741140842f, + -0.963976919651031f}; + std::vector support_vectors = {4.80000019073486f, 3.40000009536743f, 1.89999997615814f, + 5.f, 3.f, 1.60000002384186f, + 4.5f, 2.29999995231628f, 1.29999995231628f, + 5.09999990463257f, 2.5f, 3.f}; + std::vector rho = 
{2.23510527610779f}; + std::vector kernel_params = {0.122462183237076f, 0.f, 3.f}; // gamma, coef0, degree + std::vector classes = {0, 1}; + std::vector vectors_per_class = {3}; // undersized: 2 classes but only 1 entry + + std::vector X = {5.1f, 3.5f, 1.4f, + 4.9f, 3.f, 1.4f, + 4.7f, 3.2f, 1.3f, + 4.6f, 3.1f, 1.5f, + 5.f, 3.6f, 1.4f}; + std::vector scores_predictions = {-1.5556798f, 1.5556798f, + -1.2610321f, 1.2610321f, + -1.5795376f, 1.5795376f, + -1.3083477f, 1.3083477f, + -1.6572928f, 1.6572928f}; + + std::vector class_predictions = {0, 0, 0, 0, 0}; + + test.AddAttribute("kernel_type", std::string("LINEAR")); + test.AddAttribute("coefficients", coefficients); + test.AddAttribute("support_vectors", support_vectors); + test.AddAttribute("vectors_per_class", vectors_per_class); + test.AddAttribute("rho", rho); + test.AddAttribute("kernel_params", kernel_params); + test.AddAttribute("classlabels_ints", classes); + + test.AddInput("X", {5, 3}, X); + test.AddOutput("Y", {5}, class_predictions); + test.AddOutput("Z", {5, 2}, scores_predictions); + + test.Run(OpTester::ExpectResult::kExpectFailure, "Mismatch between classlabels_ints/classlabels_strings and vectors_per_class dimensions."); +} + } // namespace test } // namespace onnxruntime diff --git a/onnxruntime/test/providers/cpu/nn/conv_op_test.cc b/onnxruntime/test/providers/cpu/nn/conv_op_test.cc index f3e233fd69a64..25d37846a2028 100644 --- a/onnxruntime/test/providers/cpu/nn/conv_op_test.cc +++ b/onnxruntime/test/providers/cpu/nn/conv_op_test.cc @@ -888,6 +888,97 @@ TEST(ConvTest, Conv2D_MatMul_SplitK_With_Bias) { TestConvOp(attrs, {X, W, B}, {X_shape, W_shape, B_shape}, expected_vals, Y_shape, true); } +TEST(ConvTest, Conv2D_MatMul_Batched_No_Bias) { + ConvOpAndTestAttributes attrs = { + "", // auto_pad + std::vector{1, 1}, // dilations + 1, // group + std::vector{1, 1}, // kernel_shape + std::vector{0, 0, 0, 0}, // pads + std::vector{1, 1}, // strides + {} // excluded EPs + }; + + constexpr int64_t batch = 2; // batch > 1 + constexpr int64_t M = 16; + constexpr int64_t K = 768; + constexpr int64_t N = 64; + + std::vector X_shape = {batch, K, M, 1}; + std::vector W_shape = {N, K, 1, 1}; + std::vector Y_shape = {batch, N, M, 1}; + + RandomValueGenerator random{5678}; + std::vector X(random.Gaussian(AsSpan(X_shape), 0.0f, 0.025f)); + std::vector W(random.Gaussian(AsSpan(W_shape), 0.0f, 0.025f)); + + std::vector expected_vals(batch * N * M, 0.0f); + for (int64_t b = 0; b < batch; ++b) { + for (int64_t m = 0; m < M; ++m) { + for (int64_t n = 0; n < N; ++n) { + float sum = 0.0f; + for (int64_t k = 0; k < K; ++k) { + int x_index = static_cast(b * K * M + k * M + m); + int w_index = static_cast(n * K + k); + sum += X[x_index] * W[w_index]; + } + int y_index = static_cast(b * N * M + n * M + m); + expected_vals[y_index] = sum; + } + } + } + + TestConvOp(attrs, {X, W}, {X_shape, W_shape}, expected_vals, Y_shape); + TestConvOp(attrs, {X, W}, {X_shape, W_shape}, expected_vals, Y_shape, true); +} + +TEST(ConvTest, Conv2D_MatMul_Batched_With_Bias) { + ConvOpAndTestAttributes attrs = { + "", // auto_pad + std::vector{1, 1}, // dilations + 1, // group + std::vector{1, 1}, // kernel_shape + std::vector{0, 0, 0, 0}, // pads + std::vector{1, 1}, // strides + {} // excluded EPs + }; + + constexpr int64_t batch = 2; + constexpr int64_t M = 16; + constexpr int64_t K = 768; + constexpr int64_t N = 64; + + std::vector X_shape = {batch, K, M, 1}; + std::vector W_shape = {N, K, 1, 1}; + std::vector Y_shape = {batch, N, M, 1}; + std::vector B_shape = {N}; + + 
RandomValueGenerator random{5678}; + std::vector X(random.Gaussian(AsSpan(X_shape), 0.0f, 0.025f)); + std::vector W(random.Gaussian(AsSpan(W_shape), 0.0f, 0.025f)); + std::vector B(random.Gaussian(AsSpan(B_shape), 0.0f, 0.25f)); + + std::vector expected_vals(batch * N * M, 0.0f); + for (int64_t b = 0; b < batch; ++b) { + for (int64_t m = 0; m < M; ++m) { + for (int64_t n = 0; n < N; ++n) { + float sum = 0.0f; + for (int64_t k = 0; k < K; ++k) { + int x_index = static_cast(b * K * M + k * M + m); + int w_index = static_cast(n * K + k); + sum += X[x_index] * W[w_index]; + } + sum += B[static_cast(n)]; + int y_index = static_cast(b * N * M + n * M + m); + expected_vals[y_index] = sum; + } + } + } + + TestConvOp(attrs, {X, W, B}, {X_shape, W_shape, B_shape}, expected_vals, Y_shape); + TestConvOp(attrs, {X, W, B}, {X_shape, W_shape, B_shape}, expected_vals, Y_shape, true); +} + // Conv10 TEST(ConvTest, Conv3D_1) { ConvOpAndTestAttributes attrs = { diff --git a/onnxruntime/test/providers/cpu/nn/conv_transpose_op_test.cc b/onnxruntime/test/providers/cpu/nn/conv_transpose_op_test.cc index 6004ae8e18c05..86a58c6e890f0 100644 --- a/onnxruntime/test/providers/cpu/nn/conv_transpose_op_test.cc +++ b/onnxruntime/test/providers/cpu/nn/conv_transpose_op_test.cc @@ -525,7 +525,7 @@ TEST(ConvTransposeTest, ConvTranspose_InvalidKernelShape) { // so drop the part that differs from the expected string "kernel_shape num_dims is not compatible with W num_dims. kernel_shape: {1,1,1,5} W: {1,1,", {kTensorrtExecutionProvider, kQnnExecutionProvider, - kDmlExecutionProvider}); // TODO: Unskip when fixed #41968513 + kDmlExecutionProvider, kOpenVINOExecutionProvider}); // TODO: Unskip when fixed #41968513 } TEST(ConvTransposeTest, ConvTranspose_InvalidBiasShape_1) { diff --git a/onnxruntime/test/providers/cpu/tensor/grid_sample_test.cc b/onnxruntime/test/providers/cpu/tensor/grid_sample_test.cc index ba3bf869b7f0a..f10aa5a49c120 100755 --- a/onnxruntime/test/providers/cpu/tensor/grid_sample_test.cc +++ b/onnxruntime/test/providers/cpu/tensor/grid_sample_test.cc @@ -23,6 +23,10 @@ std::vector> GetExecutionProviders() { execution_providers.push_back(DefaultCoreMLExecutionProvider(/*use_mlprogram*/ true)); #endif +#ifdef USE_WEBGPU + execution_providers.push_back(DefaultWebGpuExecutionProvider()); +#endif + return execution_providers; } diff --git a/onnxruntime/test/providers/cpu/tensor/quantize_linear_test.cc b/onnxruntime/test/providers/cpu/tensor/quantize_linear_test.cc index 393adbede82fb..79b4156f5d6c0 100644 --- a/onnxruntime/test/providers/cpu/tensor/quantize_linear_test.cc +++ b/onnxruntime/test/providers/cpu/tensor/quantize_linear_test.cc @@ -1454,6 +1454,9 @@ void QuantizeLinearOp19Test(bool saturate) { } TEST(QuantizeLinearOpTest, Float8) { +#ifdef USE_OPENVINO + GTEST_SKIP() << "Skipping Float8 QuantizeLinear test for OpenVINO EP"; +#endif constexpr int min_cuda_architecture = 11080; bool enable_cuda = (nullptr != DefaultCpuExecutionProvider().get()) && HasCudaEnvironment(min_cuda_architecture); bool enable_cpu = (nullptr != DefaultCpuExecutionProvider().get()); @@ -1496,6 +1499,9 @@ void QuantizeLinearOp19F16Test(bool saturate) { } TEST(QuantizeLinearOpMLFloat16Test, Float8) { +#ifdef USE_OPENVINO + GTEST_SKIP() << "Skipping Float8 QuantizeLinear test for OpenVINO EP"; +#endif constexpr int min_cuda_architecture = 11080; bool enable_cuda = (nullptr != DefaultCpuExecutionProvider().get()) && HasCudaEnvironment(min_cuda_architecture); bool enable_cpu = (nullptr != DefaultCpuExecutionProvider().get()); diff 
--git a/onnxruntime/test/python/onnxruntime_test_python_backend.py b/onnxruntime/test/python/onnxruntime_test_python_backend.py index 416d9b6edecd1..bb83f6d36011f 100644 --- a/onnxruntime/test/python/onnxruntime_test_python_backend.py +++ b/onnxruntime/test/python/onnxruntime_test_python_backend.py @@ -2,6 +2,8 @@ # Licensed under the MIT License. # -*- coding: UTF-8 -*- +import os +import tempfile import unittest import numpy as np @@ -64,5 +66,128 @@ def test_allocation_plan_works_with_only_execute_path_to_fetches_option(self): assert_allclose(session_run_results[0], -(inp0 - inp1)) +class TestBackendKwargsAllowlist(unittest.TestCase): + """Tests that the SessionOptions/RunOptions kwargs allowlist correctly blocks + dangerous attributes and allows safe ones, preventing arbitrary file writes + through user-controlled kwargs.""" + + def test_blocked_session_option_optimized_model_filepath_raises(self): + """optimized_model_filepath is a known SessionOptions attr but is not in the allowlist. + It must raise RuntimeError to prevent arbitrary file overwrites.""" + name = get_name("mul_1.onnx") + with tempfile.NamedTemporaryFile(suffix=".bin") as tmp: + with self.assertRaises(RuntimeError) as ctx: + backend.prepare(name, optimized_model_filepath=tmp.name) + self.assertIn("not permitted", str(ctx.exception)) + + def test_blocked_session_option_profile_file_prefix_raises(self): + """profile_file_prefix is a known SessionOptions attr but is not in the allowlist. + It must raise RuntimeError to prevent arbitrary file writes via profiling output.""" + name = get_name("mul_1.onnx") + with tempfile.TemporaryDirectory() as tmpdir: + prefix = os.path.join(tmpdir, "profile") + with self.assertRaises(RuntimeError) as ctx: + backend.prepare(name, profile_file_prefix=prefix) + self.assertIn("not permitted", str(ctx.exception)) + + def test_blocked_session_option_enable_profiling_raises(self): + """enable_profiling is excluded from the allowlist because it causes uncontrolled + file writes (profiling JSON) to the current working directory.""" + name = get_name("mul_1.onnx") + with self.assertRaises(RuntimeError) as ctx: + backend.prepare(name, enable_profiling=True) + self.assertIn("not permitted", str(ctx.exception)) + + def test_unknown_kwarg_is_silently_ignored(self): + """A kwarg that is not a SessionOptions attribute at all must be silently ignored. 
+ This preserves backward compatibility for callers who pass extra kwargs.""" + name = get_name("mul_1.onnx") + rep = backend.prepare(name, totally_unknown_kwarg="foo") + self.assertIsNotNone(rep) + x = np.array([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]], dtype=np.float32) + res = rep.run(x) + output_expected = np.array([[1.0, 4.0], [9.0, 16.0], [25.0, 36.0]], dtype=np.float32) + np.testing.assert_allclose(res[0], output_expected, rtol=1e-05, atol=1e-08) + + def test_safe_session_option_graph_optimization_level_is_accepted(self): + """graph_optimization_level is in the allowlist and must be accepted without error.""" + name = get_name("mul_1.onnx") + rep = backend.prepare(name, graph_optimization_level=onnxrt.GraphOptimizationLevel.ORT_DISABLE_ALL) + self.assertIsNotNone(rep) + x = np.array([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]], dtype=np.float32) + res = rep.run(x) + output_expected = np.array([[1.0, 4.0], [9.0, 16.0], [25.0, 36.0]], dtype=np.float32) + np.testing.assert_allclose(res[0], output_expected, rtol=1e-05, atol=1e-08) + + def test_safe_session_option_intra_op_num_threads_is_accepted(self): + """intra_op_num_threads is in the allowlist and must be accepted without error.""" + name = get_name("mul_1.onnx") + rep = backend.prepare(name, intra_op_num_threads=1) + self.assertIsNotNone(rep) + x = np.array([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]], dtype=np.float32) + res = rep.run(x) + output_expected = np.array([[1.0, 4.0], [9.0, 16.0], [25.0, 36.0]], dtype=np.float32) + np.testing.assert_allclose(res[0], output_expected, rtol=1e-05, atol=1e-08) + + def test_blocked_run_option_terminate_raises(self): + """terminate is a known RunOptions attr excluded from the allowlist; BackendRep.run() must raise RuntimeError when it is passed.""" + name = get_name("mul_1.onnx") + rep = backend.prepare(name) + x = np.array([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]], dtype=np.float32) + with self.assertRaises(RuntimeError) as ctx: + rep.run(x, terminate=True) + self.assertIn("not permitted", str(ctx.exception)) + + def test_run_model_with_safe_session_option(self): + """run_model() must accept safe SessionOptions kwargs and produce correct output.""" + name = get_name("mul_1.onnx") + x = np.array([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]], dtype=np.float32) + res = backend.run(name, [x], graph_optimization_level=onnxrt.GraphOptimizationLevel.ORT_DISABLE_ALL) + output_expected = np.array([[1.0, 4.0], [9.0, 16.0], [25.0, 36.0]], dtype=np.float32) + np.testing.assert_allclose(res[0], output_expected, rtol=1e-05, atol=1e-08) + + def test_run_model_with_safe_run_option(self): + """run_model() must accept safe RunOptions kwargs and produce correct output.""" + name = get_name("mul_1.onnx") + x = np.array([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]], dtype=np.float32) + res = backend.run(name, [x], only_execute_path_to_fetches=True) + output_expected = np.array([[1.0, 4.0], [9.0, 16.0], [25.0, 36.0]], dtype=np.float32) + np.testing.assert_allclose(res[0], output_expected, rtol=1e-05, atol=1e-08) + + def test_run_model_with_blocked_run_option_raises(self): + """run_model() must raise RuntimeError when given a blocked RunOptions attribute.""" + name = get_name("mul_1.onnx") + x = np.array([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]], dtype=np.float32) + with self.assertRaises(RuntimeError) as ctx: + backend.run(name, [x], terminate=True) + self.assertIn("not permitted", str(ctx.exception)) + + def test_unknown_kwarg_is_silently_ignored_in_run(self): + """A kwarg unknown to RunOptions must be silently ignored by rep.run().""" + name = 
get_name("mul_1.onnx") + rep = backend.prepare(name) + x = np.array([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]], dtype=np.float32) + res = rep.run(x, completely_unknown_key="bar") + output_expected = np.array([[1.0, 4.0], [9.0, 16.0], [25.0, 36.0]], dtype=np.float32) + np.testing.assert_allclose(res[0], output_expected, rtol=1e-05, atol=1e-08) + + def test_unknown_kwarg_is_silently_ignored_in_run_model(self): + """An unknown kwarg must be silently ignored by both prepare() and rep.run() in run_model().""" + name = get_name("mul_1.onnx") + x = np.array([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]], dtype=np.float32) + res = backend.run(name, [x], completely_unknown_key="baz") + output_expected = np.array([[1.0, 4.0], [9.0, 16.0], [25.0, 36.0]], dtype=np.float32) + np.testing.assert_allclose(res[0], output_expected, rtol=1e-05, atol=1e-08) + + def test_run_model_with_blocked_session_option_raises(self): + """run_model() must raise RuntimeError when given a blocked SessionOptions attribute.""" + name = get_name("mul_1.onnx") + x = np.array([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]], dtype=np.float32) + with tempfile.NamedTemporaryFile(suffix=".bin") as tmp: + with self.assertRaises(RuntimeError) as ctx: + backend.run(name, [x], optimized_model_filepath=tmp.name) + self.assertIn("not permitted", str(ctx.exception)) + + if __name__ == "__main__": unittest.main(module=__name__, buffer=True) diff --git a/onnxruntime/test/python/onnxruntime_test_python_cudagraph.py b/onnxruntime/test/python/onnxruntime_test_python_cudagraph.py index d6c1dd9cff3f3..987efd5af5e8e 100644 --- a/onnxruntime/test/python/onnxruntime_test_python_cudagraph.py +++ b/onnxruntime/test/python/onnxruntime_test_python_cudagraph.py @@ -76,6 +76,35 @@ def test_ort_value_update_in_place(self): ortvalue_gpu.update_inplace(x1) np.testing.assert_allclose(ortvalue_gpu.numpy(), x1) + def test_ort_value_update_in_place_from_ortvalue(self): + # Test CPU to CPU copy via OrtValue + x0 = np.array([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]], dtype=np.float32) + x1 = np.array([[10.0, 20.0], [30.0, 40.0], [50.0, 60.0]], dtype=np.float32) + + ortvalue_dst = onnxrt.OrtValue.ortvalue_from_numpy(x0) + ortvalue_src = onnxrt.OrtValue.ortvalue_from_numpy(x1) + ortvalue_dst.update_inplace(ortvalue_src) + np.testing.assert_allclose(ortvalue_dst.numpy(), x1) + + if "CUDAExecutionProvider" in onnxrt.get_available_providers(): + # Test GPU to GPU copy via OrtValue + ortvalue_gpu_dst = onnxrt.OrtValue.ortvalue_from_numpy(x0, "cuda", 0) + ortvalue_gpu_src = onnxrt.OrtValue.ortvalue_from_numpy(x1, "cuda", 0) + ortvalue_gpu_dst.update_inplace(ortvalue_gpu_src) + np.testing.assert_allclose(ortvalue_gpu_dst.numpy(), x1) + + # Test CPU OrtValue to GPU OrtValue copy + ortvalue_gpu_dst2 = onnxrt.OrtValue.ortvalue_from_numpy(x0, "cuda", 0) + ortvalue_cpu_src = onnxrt.OrtValue.ortvalue_from_numpy(x1) + ortvalue_gpu_dst2.update_inplace(ortvalue_cpu_src) + np.testing.assert_allclose(ortvalue_gpu_dst2.numpy(), x1) + + # Test GPU OrtValue to CPU OrtValue copy + ortvalue_cpu_dst = onnxrt.OrtValue.ortvalue_from_numpy(x0) + ortvalue_gpu_src2 = onnxrt.OrtValue.ortvalue_from_numpy(x1, "cuda", 0) + ortvalue_cpu_dst.update_inplace(ortvalue_gpu_src2) + np.testing.assert_allclose(ortvalue_cpu_dst.numpy(), x1) + def test_select_ep_to_run_cuda_graph(self): if "TensorrtExecutionProvider" in onnxrt.get_available_providers(): providers = [("TensorrtExecutionProvider", {"trt_cuda_graph_enable": True})] diff --git a/onnxruntime/test/python/transformers/test_paged_attention_cuda.py 
b/onnxruntime/test/python/transformers/test_paged_attention_cuda.py index 66eb4a885620b..fda861c8125ff 100644 --- a/onnxruntime/test/python/transformers/test_paged_attention_cuda.py +++ b/onnxruntime/test/python/transformers/test_paged_attention_cuda.py @@ -262,6 +262,7 @@ def paged_attention_func( cos=None, sin=None, window_size=-1, + sdpa_kernel=0, ): num_tokens = cumulative_sequence_length[-1].item() num_blocks = key_cache.shape[0] @@ -282,7 +283,11 @@ def paged_attention_func( "block_table": block_table.detach().cpu().numpy(), } sess_options = SessionOptions() - ort_session = InferenceSession(onnx_model_str, sess_options, providers=[config.ep]) + if sdpa_kernel != 0 and config.ep == "CUDAExecutionProvider": + providers = [(config.ep, {"sdpa_kernel": str(sdpa_kernel)})] + else: + providers = [config.ep] + ort_session = InferenceSession(onnx_model_str, sess_options, providers=providers) io_binding = ort_session.io_binding() if key is not None and value is not None: ort_inputs["key"] = key.detach().cpu().numpy() @@ -490,6 +495,7 @@ def parity_check_paged_attention( config: Config, rtol=1e-3, atol=1e-3, + sdpa_kernel=0, ): # Generate padded inputs q = torch.randn( @@ -620,6 +626,7 @@ def parity_check_paged_attention( cos, sin, left_window_size, + sdpa_kernel=sdpa_kernel, ) num_tokens = q_unpad.shape[0] out = torch.reshape(out, (num_tokens, config.num_heads, config.head_size)) @@ -672,6 +679,25 @@ def has_flash_attention(): ) +def has_memory_efficient_attention(): + # CUTLASS fMHA (MemoryEfficientAttention) gate — these tests are fp16-only, + # so sm>=53 is sufficient. bf16 MEA would require sm>=80 but is not covered here. + if not torch.cuda.is_available(): + return False + if "CUDAExecutionProvider" not in get_available_providers(): + return False + major, minor = torch.cuda.get_device_capability() + return (major * 10 + minor) >= 53 + + +# Bit value matching AttentionBackend::EFFICIENT_ATTENTION in +# onnxruntime/contrib_ops/cpu/bert/attention_common.h. Passing this as the +# CUDA provider option `sdpa_kernel` forces the PagedAttention kernel to +# select the MemoryEfficientAttention (CUTLASS fMHA) fallback even on SM>=80 +# where FlashAttention would otherwise be preferred. +SDPA_KERNEL_EFFICIENT_ATTENTION = 2 + + def paged_attention_test_cases(): batches = [4] if pipeline_mode else [1, 3, 5] seqs = ( @@ -732,5 +758,25 @@ def test_paged_attention(self, _, config): parity_check_paged_attention(config, rtol=5e-3, atol=5e-3) +@unittest.skipIf( + not has_memory_efficient_attention(), + reason="MemoryEfficientAttention (fp16) requires sm>=53; skipping.", +) +class TestPagedAttentionMEA(unittest.TestCase): + """Runs the same parity matrix as TestPagedAttention but forces the CUTLASS + memory-efficient attention fallback via the `sdpa_kernel` CUDA provider option. 
+ This is the only coverage for the SM<80 fallback path introduced for PagedAttention;
+ on SM>=80 the class still runs to exercise the MEA dispatch end-to-end."""
+
+ @parameterized.expand(paged_attention_test_cases())
+ def test_paged_attention_mea(self, _, config):
+ parity_check_paged_attention(
+ config,
+ rtol=5e-3,
+ atol=5e-3,
+ sdpa_kernel=SDPA_KERNEL_EFFICIENT_ATTENTION,
+ )
+
+
 if __name__ == "__main__":
 unittest.main(verbosity=2)
diff --git a/onnxruntime/test/shared_lib/test_nontensor_types.cc b/onnxruntime/test/shared_lib/test_nontensor_types.cc
index ba16bd6c9888f..497298474b36a 100644
--- a/onnxruntime/test/shared_lib/test_nontensor_types.cc
+++ b/onnxruntime/test/shared_lib/test_nontensor_types.cc
@@ -1256,4 +1256,100 @@ TEST(CApiTest, SparseTensorFillSparseFormatStringsAPI) {
 }
 }
 }
+
+#if !defined(ORT_NO_EXCEPTIONS)
+TEST(CApiTest, SparseTensorInvalidIndicesValidation) {
+ auto allocator = Ort::AllocatorWithDefaultOptions();
+ Ort::MemoryInfo info("Cpu", OrtDeviceAllocator, 0, OrtMemTypeDefault);
+
+ // Common dense shape and values
+ const std::vector<int64_t> dense_shape{3, 3};
+ Ort::Value::Shape ort_dense_shape{dense_shape.data(), dense_shape.size()};
+ std::vector<float> values = {1, 1, 1};
+ constexpr int64_t values_len = 3;
+
+ //
+ // COO Negative linear index
+ //
+ {
+ auto coo_st = Ort::Value::CreateSparseTensor<float>(allocator, ort_dense_shape);
+ std::vector<int64_t> linear_indices = {-1, 3, 5};
+ ASSERT_THROW(
+ coo_st.FillSparseTensorCoo(info, {&values_len, 1U, {values.data()}},
+ linear_indices.data(), linear_indices.size()),
+ Ort::Exception);
+ }
+
+ //
+ // COO Linear index out of upper bounds
+ //
+ {
+ auto coo_st = Ort::Value::CreateSparseTensor<float>(allocator, ort_dense_shape);
+ std::vector<int64_t> linear_indices = {0, 3, 9}; // 9 is out of bounds for 3x3=9 (0-8)
+ ASSERT_THROW(
+ coo_st.FillSparseTensorCoo(info, {&values_len, 1U, {values.data()}},
+ linear_indices.data(), linear_indices.size()),
+ Ort::Exception);
+ }
+
+ //
+ // COO 2D indices out of row bounds
+ //
+ {
+ auto coo_st = Ort::Value::CreateSparseTensor<float>(allocator, ort_dense_shape);
+ std::vector<int64_t> dim_indices = {
+ 0, 1, // Valid
+ 3, 0, // Invalid row 3
+ 2, 2 // Valid
+ };
+ ASSERT_THROW(
+ coo_st.FillSparseTensorCoo(info, {&values_len, 1U, {values.data()}},
+ dim_indices.data(), dim_indices.size()),
+ Ort::Exception);
+ }
+
+ //
+ // CSR inner index out of column bounds
+ //
+ {
+ auto csr_st = Ort::Value::CreateSparseTensor<float>(allocator, ort_dense_shape);
+ std::vector<int64_t> inner_indices = {1, 3, 1}; // 3 is out of bounds for 3 cols (0-2)
+ std::vector<int64_t> outer_indices = {0, 1, 2, 3};
+ ASSERT_THROW(
+ csr_st.FillSparseTensorCsr(info, {&values_len, 1U, {values.data()}},
+ inner_indices.data(), inner_indices.size(),
+ outer_indices.data(), outer_indices.size()),
+ Ort::Exception);
+ }
+
+ //
+ // CSR outer index not monotonically non-decreasing
+ //
+ {
+ auto csr_st = Ort::Value::CreateSparseTensor<float>(allocator, ort_dense_shape);
+ std::vector<int64_t> inner_indices = {0, 1, 2};
+ std::vector<int64_t> outer_indices = {0, 2, 1, 3}; // Drops from 2 to 1
+ ASSERT_THROW(
+ csr_st.FillSparseTensorCsr(info, {&values_len, 1U, {values.data()}},
+ inner_indices.data(), inner_indices.size(),
+ outer_indices.data(), outer_indices.size()),
+ Ort::Exception);
+ }
+
+ //
+ // CSR outer index out of upper bounds (greater than inner_indices.size())
+ //
+ {
+ auto csr_st = Ort::Value::CreateSparseTensor<float>(allocator, ort_dense_shape);
+ std::vector<int64_t> inner_indices = {0, 1, 2};
+ std::vector<int64_t> outer_indices = {0, 1, 2, 4}; // 4 is > inner_indices.size() (3)
+ ASSERT_THROW(
+ 
csr_st.FillSparseTensorCsr(info, {&values_len, 1U, {values.data()}}, + inner_indices.data(), inner_indices.size(), + outer_indices.data(), outer_indices.size()), + Ort::Exception); + } +} +#endif // !defined(ORT_NO_EXCEPTIONS) + #endif // !defined(DISABLE_SPARSE_TENSORS) diff --git a/tools/ci_build/build.py b/tools/ci_build/build.py index 4b231011832e0..f42617ba1b04c 100644 --- a/tools/ci_build/build.py +++ b/tools/ci_build/build.py @@ -888,6 +888,9 @@ def generate_build_tree( if args.enable_arm_neon_nchwc: cmake_args += ["-Donnxruntime_USE_ARM_NEON_NCHWC=ON"] + if args.enable_rvv: + cmake_args += ["-Donnxruntime_USE_RVV=ON"] + if not args.no_sve: cmake_args += ["-Donnxruntime_USE_SVE=ON"] diff --git a/tools/ci_build/build_args.py b/tools/ci_build/build_args.py index e30c5f8979183..b40bf4c2b25c6 100644 --- a/tools/ci_build/build_args.py +++ b/tools/ci_build/build_args.py @@ -673,6 +673,11 @@ def add_execution_provider_args(parser: argparse.ArgumentParser) -> None: cpu_group.add_argument( "--enable_arm_neon_nchwc", action="store_true", help="Enables building with NCHWc ARM kernels." ) + cpu_group.add_argument( + "--enable_rvv", + action="store_true", + help="Enable riscv64 MLAS kernels that use the RISC-V Vector extension.", + ) # --- DNNL (formerly MKL-DNN / oneDNN) --- dnnl_group = parser.add_argument_group("DNNL Execution Provider") diff --git a/tools/ci_build/github/apple/coreml_supported_mlprogram_ops.md b/tools/ci_build/github/apple/coreml_supported_mlprogram_ops.md index 3b28f80f8ec1c..5bcdcc2e1ecee 100644 --- a/tools/ci_build/github/apple/coreml_supported_mlprogram_ops.md +++ b/tools/ci_build/github/apple/coreml_supported_mlprogram_ops.md @@ -52,3 +52,4 @@ Keep in sync with doco generated from /docs/execution-providers/CoreML-Execution |ai.onnx:Tanh|| |ai.onnx:Transpose|| |ai.onnx:Unsqueeze|| +|com.microsoft:QuickGelu|Produced by ORT's `QuickGeluFusion` optimizer pass. 
Decomposed into `mul` / `sigmoid` / `mul`.| diff --git a/tools/ci_build/github/azure-pipelines/py-cuda13-packaging-pipeline.yml b/tools/ci_build/github/azure-pipelines/py-cuda13-packaging-pipeline.yml index 1d432b662034b..f816c915031a9 100644 --- a/tools/ci_build/github/azure-pipelines/py-cuda13-packaging-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/py-cuda13-packaging-pipeline.yml @@ -66,3 +66,17 @@ extends: cudnn_folder: '9.14.0.64_cuda13' cmake_cuda_archs: '75-real;80-real;86-real;89-real;90-real;100-real;120-real;120-virtual' docker_base_image: 'onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda13_x64_almalinux8_gcc14:20251107.1' + docker_base_image_aarch64: 'onnxruntimebuildcache.azurecr.io/public/azureml/onnxruntime_build_cuda13_aarch64_almalinux9_gcc14:20260323.1' + AArch64LinuxPythonConfigurations: + - python_version: '3.11' + docker_python_exe_path: '/opt/python/cp311-cp311/bin/python3.11' + - python_version: '3.12' + docker_python_exe_path: '/opt/python/cp312-cp312/bin/python3.12' + - python_version: '3.13' + docker_python_exe_path: '/opt/python/cp313-cp313/bin/python3.13' + - python_version: '3.13t' + docker_python_exe_path: '/opt/python/cp313-cp313t/bin/python3.13' + - python_version: '3.14' + docker_python_exe_path: '/opt/python/cp314-cp314/bin/python3.14' + - python_version: '3.14t' + docker_python_exe_path: '/opt/python/cp314-cp314t/bin/python3.14' diff --git a/tools/ci_build/github/azure-pipelines/stages/py-gpu-packaging-stage.yml b/tools/ci_build/github/azure-pipelines/stages/py-gpu-packaging-stage.yml index f22b59218db1e..c6ad801fe4aa4 100644 --- a/tools/ci_build/github/azure-pipelines/stages/py-gpu-packaging-stage.yml +++ b/tools/ci_build/github/azure-pipelines/stages/py-gpu-packaging-stage.yml @@ -60,7 +60,17 @@ parameters: - name: docker_base_image type: string - displayName: 'Linux docker base image' + displayName: 'Linux x86_64 docker base image' + +- name: docker_base_image_aarch64 + type: string + displayName: 'Linux aarch64 docker base image' + default: '' + +- name: AArch64LinuxPythonConfigurations + type: object + displayName: 'aarch64 Linux Python build configurations' + default: [] stages: # Use separated cudnn folder for CUDA 13.0 on Windows. 
@@ -102,11 +112,30 @@ stages: ${{ if eq(config.python_version, '3.12') }}: build_intermediates_artifact_name: linux_gpu_wheel_x86_64 + # Linux aarch64: one parallel stage per Python version + - ${{ each config in parameters.AArch64LinuxPythonConfigurations }}: + - template: py-linux-gpu-stage.yml + parameters: + stage_name: Linux_py_GPU_Wheels_aarch64_${{ replace(config.python_version, '.', '_') }} + arch: 'aarch64' + machine_pool: 'onnxruntime-linux-ARM64-CPU-2019' + extra_build_arg: ${{ parameters.build_py_parameters }} + cmake_build_type: ${{ parameters.cmake_build_type }} + cuda_version: ${{ parameters.cuda_version }} + docker_base_image: ${{ parameters.docker_base_image_aarch64 }} + python_version: ${{ config.python_version }} + docker_python_exe_path: ${{ config.docker_python_exe_path }} + wheel_artifact_name: onnxruntime_gpu_aarch64_${{ replace(config.python_version, '.', '_') }} + ${{ if eq(config.python_version, '3.12') }}: + build_intermediates_artifact_name: linux_gpu_wheel_aarch64 + # Merge per-version Linux wheel artifacts into a single combined artifact for downstream consumers - stage: Linux_py_GPU_Wheels_Merge_Artifacts dependsOn: - ${{ each config in parameters.LinuxPythonConfigurations }}: - Linux_py_GPU_Wheels_x86_64_${{ replace(config.python_version, '.', '_') }} + - ${{ each config in parameters.AArch64LinuxPythonConfigurations }}: + - Linux_py_GPU_Wheels_aarch64_${{ replace(config.python_version, '.', '_') }} jobs: - job: Linux_py_GPU_Wheels_Merge_Artifacts workspace: @@ -130,3 +159,10 @@ stages: inputs: artifact: onnxruntime_gpu_${{ replace(config.python_version, '.', '_') }} targetPath: $(Build.ArtifactStagingDirectory)/onnxruntime_gpu + + - ${{ each config in parameters.AArch64LinuxPythonConfigurations }}: + - task: DownloadPipelineArtifact@2 + displayName: 'Download wheel - aarch64 Python ${{ config.python_version }}' + inputs: + artifact: onnxruntime_gpu_aarch64_${{ replace(config.python_version, '.', '_') }} + targetPath: $(Build.ArtifactStagingDirectory)/onnxruntime_gpu diff --git a/tools/ci_build/github/azure-pipelines/stages/py-linux-gpu-stage.yml b/tools/ci_build/github/azure-pipelines/stages/py-linux-gpu-stage.yml index d8793c147477d..47ccd4cd2fe73 100644 --- a/tools/ci_build/github/azure-pipelines/stages/py-linux-gpu-stage.yml +++ b/tools/ci_build/github/azure-pipelines/stages/py-linux-gpu-stage.yml @@ -63,6 +63,8 @@ stages: pool: name: ${{ parameters.machine_pool }} os: linux + ${{ if eq(parameters.arch, 'aarch64') }}: + hostArchitecture: Arm64 templateContext: outputs: - output: pipelineArtifact @@ -80,10 +82,17 @@ stages: value: '' - template: ../templates/common-variables.yml - name: trt_version - ${{ if eq(parameters.cuda_version, '13.0') }}: + ${{ if eq(parameters.arch, 'aarch64') }}: + value: ${{ variables.aarch64_trt_version }} + ${{ if and(ne(parameters.arch, 'aarch64'), eq(parameters.cuda_version, '13.0')) }}: value: ${{ variables.linux_trt_version_cuda13 }} - ${{ if eq(parameters.cuda_version, '12.8') }}: + ${{ if and(ne(parameters.arch, 'aarch64'), eq(parameters.cuda_version, '12.8')) }}: value: ${{ variables.linux_trt_version_cuda12 }} + - name: trt_download_url + ${{ if and(eq(parameters.arch, 'aarch64'), eq(parameters.cuda_version, '13.0')) }}: + value: ${{ variables.aarch64_trt_download_url_cuda13 }} + ${{ else }}: + value: '' steps: - checkout: self clean: true @@ -99,7 +108,7 @@ stages: parameters: Dockerfile: tools/ci_build/github/linux/docker/inference/${{ parameters.arch }}/python/cuda/Dockerfile Context: 
tools/ci_build/github/linux/docker/inference/${{ parameters.arch }}/python/cuda - DockerBuildArgs: "--build-arg BASEIMAGE=${{ parameters.docker_base_image }} --build-arg TRT_VERSION=${{ variables.trt_version }} --build-arg BUILD_UID=$( id -u )" + DockerBuildArgs: "--build-arg BASEIMAGE=${{ parameters.docker_base_image }} --build-arg TRT_VERSION=${{ variables.trt_version }} --build-arg TRT_DOWNLOAD_URL=${{ variables.trt_download_url }} --build-arg BUILD_UID=$( id -u )" Repository: onnxruntimecuda${{ replace(parameters.cuda_version, '.', '') }}xtrt86build${{ parameters.arch }} diff --git a/tools/ci_build/github/azure-pipelines/templates/common-variables.yml b/tools/ci_build/github/azure-pipelines/templates/common-variables.yml index 1191e9e98eef1..8c8dae9820810 100644 --- a/tools/ci_build/github/azure-pipelines/templates/common-variables.yml +++ b/tools/ci_build/github/azure-pipelines/templates/common-variables.yml @@ -1,8 +1,11 @@ variables: cuda12_trt_version: '10.14.1.48' cuda13_trt_version: '10.14.1.48' + aarch64_trt_version: '10.15.1.29' # As for Debian installation, replace '-1.' by '-1+' when assigning trt version below linux_trt_version_cuda13: ${{ variables.cuda13_trt_version }}-1.cuda13.0 linux_trt_version_cuda12: ${{ variables.cuda12_trt_version }}-1.cuda12.9 + # aarch64 TRT tar download (no RPMs available for aarch64) + aarch64_trt_download_url_cuda13: https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.15.1/tars/TensorRT-${{ variables.aarch64_trt_version }}.Linux.aarch64-gnu.cuda-13.1.tar.gz win_trt_folder_cuda13: TensorRT-${{ variables.cuda13_trt_version }}.Windows.win10.cuda-13.0 win_trt_folder_cuda12: TensorRT-${{ variables.cuda12_trt_version }}.Windows.win10.cuda-12.9 diff --git a/tools/ci_build/github/azure-pipelines/templates/py-packaging-linux-test-cuda.yml b/tools/ci_build/github/azure-pipelines/templates/py-packaging-linux-test-cuda.yml index 1dde96e21a636..afcba73456558 100644 --- a/tools/ci_build/github/azure-pipelines/templates/py-packaging-linux-test-cuda.yml +++ b/tools/ci_build/github/azure-pipelines/templates/py-packaging-linux-test-cuda.yml @@ -45,10 +45,17 @@ jobs: - name: skipComponentGovernanceDetection value: true - name: trt_version - ${{ if eq(parameters.cuda_version, '13.0') }}: + ${{ if eq(parameters.arch, 'aarch64') }}: + value: ${{ variables.aarch64_trt_version }} + ${{ if and(ne(parameters.arch, 'aarch64'), eq(parameters.cuda_version, '13.0')) }}: value: ${{ variables.linux_trt_version_cuda13 }} - ${{ if eq(parameters.cuda_version, '12.8') }}: + ${{ if and(ne(parameters.arch, 'aarch64'), eq(parameters.cuda_version, '12.8')) }}: value: ${{ variables.linux_trt_version_cuda12 }} + - name: trt_download_url + ${{ if and(eq(parameters.arch, 'aarch64'), eq(parameters.cuda_version, '13.0')) }}: + value: ${{ variables.aarch64_trt_download_url_cuda13 }} + ${{ else }}: + value: '' workspace: clean: all pool: ${{ parameters.machine_pool }} @@ -77,7 +84,7 @@ jobs: parameters: Dockerfile: tools/ci_build/github/linux/docker/inference/${{ parameters.arch }}/python/cuda/Dockerfile Context: tools/ci_build/github/linux/docker/inference/${{ parameters.arch }}/python/cuda - DockerBuildArgs: "--build-arg BASEIMAGE=${{ parameters.docker_base_image }} --build-arg TRT_VERSION=${{ variables.trt_version }} --build-arg BUILD_UID=$( id -u )" + DockerBuildArgs: "--build-arg BASEIMAGE=${{ parameters.docker_base_image }} --build-arg TRT_VERSION=${{ variables.trt_version }} --build-arg TRT_DOWNLOAD_URL=${{ variables.trt_download_url }} --build-arg BUILD_UID=$( 
id -u )" Repository: onnxruntimecuda${{ replace(parameters.cuda_version, '.', '') }}xtrt86build${{ parameters.arch }} - task: Bash@3 diff --git a/tools/ci_build/github/azure-pipelines/templates/set-plugin-build-variables-step.yml b/tools/ci_build/github/azure-pipelines/templates/set-plugin-build-variables-step.yml index 212eca44ae3ec..e92eb0dafadcb 100644 --- a/tools/ci_build/github/azure-pipelines/templates/set-plugin-build-variables-step.yml +++ b/tools/ci_build/github/azure-pipelines/templates/set-plugin-build-variables-step.yml @@ -59,7 +59,10 @@ steps: print("##vso[task.logissue type=error]Failed to get git info: {}".format(e)) sys.exit(1) version_string = "{}-dev.{}+{}".format(original_ver, date_str, commit_sha) - universal_version = "{}-dev.{}.{}".format(original_ver, date_str, commit_sha) + # Prefix the SHA with "commit-" so the pre-release identifier always contains a + # non-digit. Otherwise, an all-numeric short SHA with a leading zero (e.g. "01234567") + # would violate SemVer 2.0.0's rule against leading zeros in numeric identifiers. + universal_version = "{}-dev.{}.commit-{}".format(original_ver, date_str, commit_sha) else: print("##vso[task.logissue type=error]Unknown package_version '{}'. Must be 'release', 'RC', or 'dev'.".format(package_version)) @@ -74,10 +77,10 @@ steps: print("##vso[task.logissue type=error]Version string '{}' is not valid semver 2.0.0.".format(version_string)) sys.exit(1) - # Validate universal version (SemVer 1.0.0 - no build metadata) + # Validate universal version (SemVer 2.0.0, without build metadata) universal_semver_pattern = r"^(0|[1-9]\d*)\.(0|[1-9]\d*)\.(0|[1-9]\d*)(?:-((?:0|[1-9]\d*|\d*[a-zA-Z-][0-9a-zA-Z-]*)(?:\.(?:0|[1-9]\d*|\d*[a-zA-Z-][0-9a-zA-Z-]*))*))?$" if not re.match(universal_semver_pattern, universal_version): - print("##vso[task.logissue type=error]Universal version string '{}' is not valid semver 1.0.0.".format(universal_version)) + print("##vso[task.logissue type=error]Universal version string '{}' is not valid semver 2.0.0 (without build metadata).".format(universal_version)) sys.exit(1) print("##vso[task.setvariable variable=PluginPackageVersion]{}".format(version_string)) diff --git a/tools/ci_build/github/js/validate-npm-packages.py b/tools/ci_build/github/js/validate-npm-packages.py index 73f76ea8bbb5b..2917331b84c07 100644 --- a/tools/ci_build/github/js/validate-npm-packages.py +++ b/tools/ci_build/github/js/validate-npm-packages.py @@ -113,8 +113,13 @@ print(f"##vso[task.setvariable variable=ORT_COMMON_FROM]{ort_common_from}") if tag == "latest" or tag == "" or tag == "rc": - if not RELEASE_NODE or not RELEASE_WEB or not RELEASE_REACT_NATIVE: - raise Exception("@latest or @rc build must release all packages (node, web, react-native)") + # FUTURE WORK: We will either punt `react-native` out of the core package set, or fix it and re-incorporate it. + # Which one is TBD, but for now we are not requiring `react-native` for @latest or @rc builds. + if not RELEASE_NODE or not RELEASE_WEB: + raise Exception("@latest or @rc build must release the following packages: node, web") + if not RELEASE_REACT_NATIVE: + print("WARNING - @latest or @rc build should release `react-native` package. 
This is temporarily not required.") + if count_ort_node_common_tgz != 1: raise Exception("expect one package file for onnxruntime-common for release build") diff --git a/tools/ci_build/github/linux/build_linux_python_package.sh b/tools/ci_build/github/linux/build_linux_python_package.sh index 27bf6f9b9e1d1..7ba5406e00ec0 100755 --- a/tools/ci_build/github/linux/build_linux_python_package.sh +++ b/tools/ci_build/github/linux/build_linux_python_package.sh @@ -80,8 +80,20 @@ if [ "$BUILD_DEVICE" == "GPU" ]; then fi SHORT_CUDA_VERSION=$(echo "$CUDA_VERSION" | sed 's/\([[:digit:]]\+\.[[:digit:]]\+\)\.[[:digit:]]\+/\1/') - #Enable CUDA and TRT EPs. - BUILD_ARGS+=("--use_cuda" "--use_tensorrt" "--cuda_version=$SHORT_CUDA_VERSION" "--tensorrt_home=/usr" "--cuda_home=/usr/local/cuda-$SHORT_CUDA_VERSION" "--cudnn_home=/usr/local/cuda-$SHORT_CUDA_VERSION" "--nvcc_threads=1" "--cmake_extra_defines" "CMAKE_CUDA_ARCHITECTURES=${CUDA_ARCHS}" "onnxruntime_USE_FPA_INTB_GEMM=OFF") + CUDA_HOME=/usr/local/cuda-$SHORT_CUDA_VERSION + if [ ! -d "$CUDA_HOME" ] && [ -d /usr/local/cuda ]; then + # Allow the cu13 packaging flow to run on images that expose a newer CUDA minor version via /usr/local/cuda. + CUDA_HOME=/usr/local/cuda + fi + #Enable CUDA EP. + BUILD_ARGS+=("--use_cuda" "--cuda_version=$SHORT_CUDA_VERSION" "--cuda_home=$CUDA_HOME" "--cudnn_home=$CUDA_HOME" "--nvcc_threads=1" "--cmake_extra_defines" "CMAKE_CUDA_ARCHITECTURES=${CUDA_ARCHS}" "onnxruntime_USE_FPA_INTB_GEMM=OFF") + # Enable TRT EP only if TensorRT is installed. + if [ -f /usr/include/NvInfer.h ]; then + BUILD_ARGS+=("--use_tensorrt" "--tensorrt_home=/usr") + elif [ "$ARCH" != "aarch64" ] && [ -f /opt/tensorrt/include/NvInfer.h ]; then + # The aarch64 TensorRT tarball is not compatible with the packaging image's glibc baseline. + BUILD_ARGS+=("--use_tensorrt" "--tensorrt_home=/opt/tensorrt") + fi fi if [ "$BUILD_DEVICE" == "WEBGPU" ]; then BUILD_ARGS+=("--use_webgpu") diff --git a/tools/ci_build/github/linux/docker/inference/aarch64/python/cuda/Dockerfile b/tools/ci_build/github/linux/docker/inference/aarch64/python/cuda/Dockerfile new file mode 100644 index 0000000000000..b960961a20336 --- /dev/null +++ b/tools/ci_build/github/linux/docker/inference/aarch64/python/cuda/Dockerfile @@ -0,0 +1,48 @@ +# The default ARGs are for cuda 12.8 with cudnn9, TensorRT is optional +# Please overwrite BASEIMAGE, TRT_VERSION and other arguments with +# --docker-build-args ' --build-arg BASEIMAGE=other_base_image --build-arg TRT_VERSION=other_trt_version etc...' +# for other cuda version and TRT version +ARG BASEIMAGE=nvidia/cuda:12.8.1-cudnn-devel-ubi8 + +FROM $BASEIMAGE +ARG TRT_VERSION +# For aarch64 tar-based TensorRT install +ARG TRT_DOWNLOAD_URL="" +ARG TENSORRT_ROOT=/opt/tensorrt + +# Install TensorRT: use tar download for aarch64 since RPMs are not available +RUN set -eux; \ + if [ -z "${TRT_VERSION}" ]; then \ + echo "TRT_VERSION is empty; skipping TensorRT installation"; \ + elif [ -n "${TRT_DOWNLOAD_URL}" ]; then \ + echo "Installing TensorRT ${TRT_VERSION} from tar"; \ + mkdir -p /tmp/trt "${TENSORRT_ROOT}"; \ + curl -fsSL "${TRT_DOWNLOAD_URL}" -o /tmp/trt/tensorrt.tar.gz; \ + tar -xzf /tmp/trt/tensorrt.tar.gz -C /tmp/trt; \ + extracted_dir="$(find /tmp/trt -mindepth 1 -maxdepth 1 -type d | head -n 1)"; \ + cp -a "${extracted_dir}/." "${TENSORRT_ROOT}/"; \ + rm -rf /tmp/trt; \ + if [ -d "${TENSORRT_ROOT}/targets/sbsa-linux-gnu/lib" ] && [ ! 
-e "${TENSORRT_ROOT}/lib" ]; then \ + ln -s "${TENSORRT_ROOT}/targets/sbsa-linux-gnu/lib" "${TENSORRT_ROOT}/lib"; \ + fi; \ + if [ -d "${TENSORRT_ROOT}/targets/sbsa-linux-gnu/include" ] && [ ! -e "${TENSORRT_ROOT}/include" ]; then \ + ln -s "${TENSORRT_ROOT}/targets/sbsa-linux-gnu/include" "${TENSORRT_ROOT}/include"; \ + fi; \ + else \ + echo "TRT_VERSION is ${TRT_VERSION} but no TRT_DOWNLOAD_URL provided; skipping"; \ + fi + +ENV TENSORRT_ROOT=${TENSORRT_ROOT} +ENV PATH=${TENSORRT_ROOT}/bin:/usr/local/cuda/bin:${PATH} +ENV LD_LIBRARY_PATH=${TENSORRT_ROOT}/lib:${LD_LIBRARY_PATH} +ENV CPATH=${TENSORRT_ROOT}/include:${CPATH} +ENV CUDA_MODULE_LOADING="LAZY" + +ADD scripts /tmp/scripts +RUN cd /tmp/scripts && /tmp/scripts/install_centos.sh && rm -rf /tmp/scripts + +ARG BUILD_UID=1001 +ARG BUILD_USER=onnxruntimedev +RUN adduser --uid $BUILD_UID $BUILD_USER +WORKDIR /home/$BUILD_USER +USER $BUILD_USER diff --git a/tools/ci_build/github/linux/docker/inference/aarch64/python/cuda/scripts/install_centos.sh b/tools/ci_build/github/linux/docker/inference/aarch64/python/cuda/scripts/install_centos.sh new file mode 100755 index 0000000000000..d90683c468627 --- /dev/null +++ b/tools/ci_build/github/linux/docker/inference/aarch64/python/cuda/scripts/install_centos.sh @@ -0,0 +1,11 @@ +#!/bin/bash +set -e + +os_major_version=$(tr -dc '0-9.' < /etc/redhat-release |cut -d \. -f1) + +echo "installing for os major version : $os_major_version" +if [ "$os_major_version" -ge 9 ]; then + dnf install -y glibc-langpack-\* which expat-devel tar unzip zlib-devel make bzip2 bzip2-devel perl-IPC-Cmd openssl-devel wget +else + dnf install -y glibc-langpack-\* which redhat-lsb-core expat-devel tar unzip zlib-devel make bzip2 bzip2-devel perl-IPC-Cmd openssl-devel wget +fi diff --git a/tools/ci_build/github/linux/docker/inference/x86_64/python/openvino/Dockerfile b/tools/ci_build/github/linux/docker/inference/x86_64/python/openvino/Dockerfile index 7a29fd7fc728c..03f351d942e70 100644 --- a/tools/ci_build/github/linux/docker/inference/x86_64/python/openvino/Dockerfile +++ b/tools/ci_build/github/linux/docker/inference/x86_64/python/openvino/Dockerfile @@ -22,8 +22,8 @@ RUN dnf install -y --nodocs \ && dnf clean all \ && rm -rf /var/cache/dnf -ENV INTEL_OPENVINO_DIR=/opt/intel/openvino_2025.4.1 -ARG OPENVINO_PACKAGE_URL=https://storage.openvinotoolkit.org/repositories/openvino/packages/2025.4.1/linux/openvino_toolkit_rhel8_2025.4.1.20426.82bbf0292c5_x86_64.tgz +ENV INTEL_OPENVINO_DIR=/opt/intel/openvino_2026.1.0 +ARG OPENVINO_PACKAGE_URL=https://storage.openvinotoolkit.org/repositories/openvino/packages/2026.1/linux/openvino_toolkit_rhel8_2026.1.0.21367.63e31528c62_x86_64.tgz ARG TEMP_DIR=/tmp/openvino_installer RUN mkdir -p ${TEMP_DIR} && \ diff --git a/tools/ci_build/github/linux/run_python_tests.sh b/tools/ci_build/github/linux/run_python_tests.sh index 246bc076fd5b3..e1856e51e3c9c 100755 --- a/tools/ci_build/github/linux/run_python_tests.sh +++ b/tools/ci_build/github/linux/run_python_tests.sh @@ -9,7 +9,7 @@ BUILD_CONFIG="Release" while getopts "d:c:" parameter_Option do case "${parameter_Option}" in -#GPU or CPU. +#GPU or CPU. d) BUILD_DEVICE=${OPTARG};; c) BUILD_CONFIG=${OPTARG};; esac @@ -38,8 +38,20 @@ if [ $ARCH == "x86_64" ]; then fi if [ $BUILD_DEVICE == "GPU" ]; then SHORT_CUDA_VERSION=$(echo $CUDA_VERSION | sed 's/\([[:digit:]]\+\.[[:digit:]]\+\)\.[[:digit:]]\+/\1/') + CUDA_HOME=/usr/local/cuda-$SHORT_CUDA_VERSION + if [ ! 
-d "$CUDA_HOME" ] && [ -d /usr/local/cuda ]; then + # Allow the cu13 packaging flow to run on images that expose a newer CUDA minor version via /usr/local/cuda. + CUDA_HOME=/usr/local/cuda + fi - BUILD_ARGS="$BUILD_ARGS --use_cuda --use_tensorrt --cuda_version=$SHORT_CUDA_VERSION --tensorrt_home=/usr --cuda_home=/usr/local/cuda-$SHORT_CUDA_VERSION --cudnn_home=/usr/local/cuda-$SHORT_CUDA_VERSION" + BUILD_ARGS="$BUILD_ARGS --use_cuda --cuda_version=$SHORT_CUDA_VERSION --cuda_home=$CUDA_HOME --cudnn_home=$CUDA_HOME" + # Enable TRT EP only if TensorRT is installed. + if [ -f /usr/include/NvInfer.h ]; then + BUILD_ARGS="$BUILD_ARGS --use_tensorrt --tensorrt_home=/usr" + elif [ "$ARCH" != "aarch64" ] && [ -f /opt/tensorrt/include/NvInfer.h ]; then + # The aarch64 TensorRT tarball is not compatible with the packaging image's glibc baseline. + BUILD_ARGS="$BUILD_ARGS --use_tensorrt --tensorrt_home=/opt/tensorrt" + fi fi python3 -m pip install --upgrade pip @@ -47,7 +59,7 @@ python3 -m pip install --upgrade pip python3 -m pip install -r /build/$BUILD_CONFIG/requirements.txt # Install the packages that are needed for running test scripts python3 -m pip install -r /onnxruntime_src/tools/ci_build/github/linux/python/requirements.txt -# The "--no-index" flag is crucial. The local whl folder is just an additional source. Pypi's doc says "there is no +# The "--no-index" flag is crucial. The local whl folder is just an additional source. Pypi's doc says "there is no # ordering in the locations that are searched" if we don't disable the default one with "--no-index" python3 -m pip install --no-index --find-links /build/whl $PYTHON_PACKAGE_NAME cd /build/$BUILD_CONFIG